[{"data":1,"prerenderedAt":186224},["ShallowReactive",2],{"page-\u002Fautomating-document-data-pipelines\u002Fscheduling-and-logging-automation-jobs":3,"all-pages":6992},{"id":4,"title":5,"body":6,"breadcrumbTitle":6976,"canonical":6977,"date":6978,"description":6979,"draft":6980,"extension":6981,"image":6977,"meta":6982,"navigation":91,"path":6983,"robots":6977,"seo":6984,"seoTitle":6985,"stem":6986,"tags":6987,"updatedAt":6978,"__hash__":6991},"content\u002Fautomating-document-data-pipelines\u002Fscheduling-and-logging-automation-jobs\u002Findex.md","Scheduling and Logging Automation Jobs",{"type":7,"value":8,"toc":6956},"minimark",[9,13,17,22,110,113,117,124,416,419,423,428,431,936,950,954,965,1336,1356,1360,1370,1964,1968,1971,1977,1995,2003,2020,2025,2052,2055,2060,2319,2327,2536,2539,2704,2706,2710,2714,2717,3026,3036,3040,3580,3587,3591,3594,3936,3940,3943,4198,4206,4210,4268,4272,4399,4403,6915,6919,6945,6952],[10,11,5],"h1",{"id":12},"scheduling-and-logging-automation-jobs",[14,15,16],"p",{},"A script that runs only when you remember to start it is not automation — it is manual work with extra steps. The gap between a working script and a reliable unattended job comes down to three concerns: triggering it on a schedule, recording what happened, and recovering (or alerting) when something breaks. 
This guide covers all three for document and data pipelines.",[18,19,21],"h2",{"id":20},"prerequisites","Prerequisites",[23,24,29],"pre",{"className":25,"code":26,"language":27,"meta":28,"style":28},"language-bash shiki shiki-themes github-light","# pip install schedule tenacity\npython -m venv .venv\nsource .venv\u002Fbin\u002Factivate          # Windows: .venv\\Scripts\\activate\npip install schedule tenacity\n\n# Verify\npython -c \"import schedule, tenacity; print('OK')\"\n","bash","",[30,31,32,41,59,71,86,93,99],"code",{"__ignoreMap":28},[33,34,37],"span",{"class":35,"line":36},"line",1,[33,38,40],{"class":39},"sAwPA","# pip install schedule tenacity\n",[33,42,44,48,52,56],{"class":35,"line":43},2,[33,45,47],{"class":46},"s7eDp","python",[33,49,51],{"class":50},"sYu0t"," -m",[33,53,55],{"class":54},"sYBdl"," venv",[33,57,58],{"class":54}," .venv\n",[33,60,62,65,68],{"class":35,"line":61},3,[33,63,64],{"class":50},"source",[33,66,67],{"class":54}," .venv\u002Fbin\u002Factivate",[33,69,70],{"class":39},"          # Windows: .venv\\Scripts\\activate\n",[33,72,74,77,80,83],{"class":35,"line":73},4,[33,75,76],{"class":46},"pip",[33,78,79],{"class":54}," install",[33,81,82],{"class":54}," schedule",[33,84,85],{"class":54}," tenacity\n",[33,87,89],{"class":35,"line":88},5,[33,90,92],{"emptyLinePlaceholder":91},true,"\n",[33,94,96],{"class":35,"line":95},6,[33,97,98],{"class":39},"# Verify\n",[33,100,102,104,107],{"class":35,"line":101},7,[33,103,47],{"class":46},[33,105,106],{"class":50}," -c",[33,108,109],{"class":54}," \"import schedule, tenacity; print('OK')\"\n",[14,111,112],{},"For email alerts you also need access to an SMTP server (or a free transactional relay such as SendGrid). 
For webhook alerts, any HTTP endpoint works — Slack incoming webhooks are the most common choice.",[18,114,116],{"id":115},"diagnostic-confirm-your-runtime-environment","Diagnostic: Confirm Your Runtime Environment",[14,118,119,120,123],{},"Before scheduling anything, confirm which Python interpreter and file paths the scheduler will actually see. Cron and Task Scheduler both run with a minimal environment; your interactive shell's ",[30,121,122],{},"PATH"," and activated virtualenv are absent.",[23,125,128],{"className":126,"code":127,"language":47,"meta":28,"style":28},"language-python shiki shiki-themes github-light","# pip install (none — stdlib only)\n\"\"\"\nRun this script from the scheduler (not your terminal) and check the log file.\nIt captures the runtime environment so you can diagnose PATH\u002Fvenv issues before\nadding real pipeline logic.\n\"\"\"\nimport sys\nimport os\nimport logging\nfrom pathlib import Path\n\nLOG_PATH = Path(\"\u002Ftmp\u002Fenv_check.log\")\n\nlogging.basicConfig(\n    filename=LOG_PATH,\n    level=logging.DEBUG,\n    format=\"%(asctime)s %(levelname)s %(message)s\",\n)\n\nlogging.info(\"Python: %s\", sys.executable)\nlogging.info(\"Version: %s\", sys.version)\nlogging.info(\"CWD: %s\", Path.cwd())\nlogging.info(\"PATH: %s\", os.environ.get(\"PATH\", \"(not set)\"))\nlogging.info(\"VIRTUAL_ENV: %s\", os.environ.get(\"VIRTUAL_ENV\", \"(not set)\"))\nlogging.info(\"Script location: %s\", Path(__file__).resolve())\n",[30,129,130,135,140,145,150,155,159,169,177,185,199,204,222,227,233,248,264,288,293,298,315,330,345,372,395],{"__ignoreMap":28},[33,131,132],{"class":35,"line":36},[33,133,134],{"class":39},"# pip install (none — stdlib only)\n",[33,136,137],{"class":35,"line":43},[33,138,139],{"class":54},"\"\"\"\n",[33,141,142],{"class":35,"line":61},[33,143,144],{"class":54},"Run this script from the scheduler (not your terminal) and check the log file.\n",[33,146,147],{"class":35,"line":73},[33,148,149],{"class":54},"It captures the runtime 
environment so you can diagnose PATH\u002Fvenv issues before\n",[33,151,152],{"class":35,"line":88},[33,153,154],{"class":54},"adding real pipeline logic.\n",[33,156,157],{"class":35,"line":95},[33,158,139],{"class":54},[33,160,161,165],{"class":35,"line":101},[33,162,164],{"class":163},"sD7c4","import",[33,166,168],{"class":167},"sgsFI"," sys\n",[33,170,172,174],{"class":35,"line":171},8,[33,173,164],{"class":163},[33,175,176],{"class":167}," os\n",[33,178,180,182],{"class":35,"line":179},9,[33,181,164],{"class":163},[33,183,184],{"class":167}," logging\n",[33,186,188,191,194,196],{"class":35,"line":187},10,[33,189,190],{"class":163},"from",[33,192,193],{"class":167}," pathlib ",[33,195,164],{"class":163},[33,197,198],{"class":167}," Path\n",[33,200,202],{"class":35,"line":201},11,[33,203,92],{"emptyLinePlaceholder":91},[33,205,207,210,213,216,219],{"class":35,"line":206},12,[33,208,209],{"class":50},"LOG_PATH",[33,211,212],{"class":163}," =",[33,214,215],{"class":167}," Path(",[33,217,218],{"class":54},"\"\u002Ftmp\u002Fenv_check.log\"",[33,220,221],{"class":167},")\n",[33,223,225],{"class":35,"line":224},13,[33,226,92],{"emptyLinePlaceholder":91},[33,228,230],{"class":35,"line":229},14,[33,231,232],{"class":167},"logging.basicConfig(\n",[33,234,236,240,243,245],{"class":35,"line":235},15,[33,237,239],{"class":238},"sqxcx","    filename",[33,241,242],{"class":163},"=",[33,244,209],{"class":50},[33,246,247],{"class":167},",\n",[33,249,251,254,256,259,262],{"class":35,"line":250},16,[33,252,253],{"class":238},"    level",[33,255,242],{"class":163},[33,257,258],{"class":167},"logging.",[33,260,261],{"class":50},"DEBUG",[33,263,247],{"class":167},[33,265,267,270,272,275,278,281,284,286],{"class":35,"line":266},17,[33,268,269],{"class":238},"    format",[33,271,242],{"class":163},[33,273,274],{"class":54},"\"",[33,276,277],{"class":50},"%(asctime)s",[33,279,280],{"class":50}," %(levelname)s",[33,282,283],{"class":50}," 
%(message)s",[33,285,274],{"class":54},[33,287,247],{"class":167},[33,289,291],{"class":35,"line":290},18,[33,292,221],{"class":167},[33,294,296],{"class":35,"line":295},19,[33,297,92],{"emptyLinePlaceholder":91},[33,299,301,304,307,310,312],{"class":35,"line":300},20,[33,302,303],{"class":167},"logging.info(",[33,305,306],{"class":54},"\"Python: ",[33,308,309],{"class":50},"%s",[33,311,274],{"class":54},[33,313,314],{"class":167},", sys.executable)\n",[33,316,318,320,323,325,327],{"class":35,"line":317},21,[33,319,303],{"class":167},[33,321,322],{"class":54},"\"Version: ",[33,324,309],{"class":50},[33,326,274],{"class":54},[33,328,329],{"class":167},", sys.version)\n",[33,331,333,335,338,340,342],{"class":35,"line":332},22,[33,334,303],{"class":167},[33,336,337],{"class":54},"\"CWD: ",[33,339,309],{"class":50},[33,341,274],{"class":54},[33,343,344],{"class":167},", Path.cwd())\n",[33,346,348,350,353,355,357,360,363,366,369],{"class":35,"line":347},23,[33,349,303],{"class":167},[33,351,352],{"class":54},"\"PATH: ",[33,354,309],{"class":50},[33,356,274],{"class":54},[33,358,359],{"class":167},", os.environ.get(",[33,361,362],{"class":54},"\"PATH\"",[33,364,365],{"class":167},", ",[33,367,368],{"class":54},"\"(not set)\"",[33,370,371],{"class":167},"))\n",[33,373,375,377,380,382,384,386,389,391,393],{"class":35,"line":374},24,[33,376,303],{"class":167},[33,378,379],{"class":54},"\"VIRTUAL_ENV: ",[33,381,309],{"class":50},[33,383,274],{"class":54},[33,385,359],{"class":167},[33,387,388],{"class":54},"\"VIRTUAL_ENV\"",[33,390,365],{"class":167},[33,392,368],{"class":54},[33,394,371],{"class":167},[33,396,398,400,403,405,407,410,413],{"class":35,"line":397},25,[33,399,303],{"class":167},[33,401,402],{"class":54},"\"Script location: ",[33,404,309],{"class":50},[33,406,274],{"class":54},[33,408,409],{"class":167},", Path(",[33,411,412],{"class":50},"__file__",[33,414,415],{"class":167},").resolve())\n",[14,417,418],{},"Schedule this script first. 
If the log file shows the wrong Python executable, fix your scheduler invocation (absolute path to the venv interpreter) before proceeding.",[18,420,422],{"id":421},"core-implementation","Core Implementation",[424,425,427],"h3",{"id":426},"step-1-wrap-your-job-in-a-callable","Step 1 — Wrap Your Job in a Callable",[14,429,430],{},"Every scheduler expects a zero-argument callable. Wrap your pipeline logic in a function that accepts no positional arguments and returns on success or raises on failure.",[23,432,434],{"className":126,"code":433,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas openpyxl\n\"\"\"\nStep 1: Wrap the pipeline as a zero-argument callable.\nPairs with the extracting-PDF-data step that feeds data into pandas.\n\"\"\"\nimport logging\nfrom pathlib import Path\n\nimport pdfplumber\nimport pandas as pd\n\nINPUT_DIR = Path(\"\u002Fdata\u002Fincoming\")\nOUTPUT_PATH = Path(\"\u002Fdata\u002Freports\u002Fdaily_summary.xlsx\")\n\nlogger = logging.getLogger(__name__)\n\n\ndef run_daily_pipeline() -> None:\n    \"\"\"Extract tables from every PDF in INPUT_DIR and write a combined Excel report.\"\"\"\n    frames = []\n    pdf_files = list(INPUT_DIR.glob(\"*.pdf\"))\n    if not pdf_files:\n        logger.warning(\"No PDFs found in %s\", INPUT_DIR)\n        return\n\n    for pdf_path in pdf_files:\n        try:\n            with pdfplumber.open(pdf_path) as pdf:\n                for page in pdf.pages:\n                    table = page.extract_table()\n                    if table:\n                        df = pd.DataFrame(table[1:], columns=table[0])\n                        df[\"_source\"] = pdf_path.name\n                        frames.append(df)\n        except Exception:\n            logger.exception(\"Failed to parse %s\", pdf_path.name)\n\n    if not frames:\n        logger.warning(\"No tables extracted — nothing to write\")\n        return\n\n    combined = pd.concat(frames, ignore_index=True)\n    
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)\n    combined.to_excel(OUTPUT_PATH, index=False)\n    logger.info(\"Wrote %d rows to %s\", len(combined), OUTPUT_PATH)\n",[30,435,436,441,445,450,455,459,465,475,479,486,499,503,517,531,535,550,554,558,575,580,590,613,624,642,647,651,665,673,687,701,712,721,752,769,775,786,802,807,817,827,832,837,858,885,905],{"__ignoreMap":28},[33,437,438],{"class":35,"line":36},[33,439,440],{"class":39},"# pip install pdfplumber pandas openpyxl\n",[33,442,443],{"class":35,"line":43},[33,444,139],{"class":54},[33,446,447],{"class":35,"line":61},[33,448,449],{"class":54},"Step 1: Wrap the pipeline as a zero-argument callable.\n",[33,451,452],{"class":35,"line":73},[33,453,454],{"class":54},"Pairs with the extracting-PDF-data step that feeds data into pandas.\n",[33,456,457],{"class":35,"line":88},[33,458,139],{"class":54},[33,460,461,463],{"class":35,"line":95},[33,462,164],{"class":163},[33,464,184],{"class":167},[33,466,467,469,471,473],{"class":35,"line":101},[33,468,190],{"class":163},[33,470,193],{"class":167},[33,472,164],{"class":163},[33,474,198],{"class":167},[33,476,477],{"class":35,"line":171},[33,478,92],{"emptyLinePlaceholder":91},[33,480,481,483],{"class":35,"line":179},[33,482,164],{"class":163},[33,484,485],{"class":167}," pdfplumber\n",[33,487,488,490,493,496],{"class":35,"line":187},[33,489,164],{"class":163},[33,491,492],{"class":167}," pandas ",[33,494,495],{"class":163},"as",[33,497,498],{"class":167}," 
pd\n",[33,500,501],{"class":35,"line":201},[33,502,92],{"emptyLinePlaceholder":91},[33,504,505,508,510,512,515],{"class":35,"line":206},[33,506,507],{"class":50},"INPUT_DIR",[33,509,212],{"class":163},[33,511,215],{"class":167},[33,513,514],{"class":54},"\"\u002Fdata\u002Fincoming\"",[33,516,221],{"class":167},[33,518,519,522,524,526,529],{"class":35,"line":224},[33,520,521],{"class":50},"OUTPUT_PATH",[33,523,212],{"class":163},[33,525,215],{"class":167},[33,527,528],{"class":54},"\"\u002Fdata\u002Freports\u002Fdaily_summary.xlsx\"",[33,530,221],{"class":167},[33,532,533],{"class":35,"line":229},[33,534,92],{"emptyLinePlaceholder":91},[33,536,537,540,542,545,548],{"class":35,"line":235},[33,538,539],{"class":167},"logger ",[33,541,242],{"class":163},[33,543,544],{"class":167}," logging.getLogger(",[33,546,547],{"class":50},"__name__",[33,549,221],{"class":167},[33,551,552],{"class":35,"line":250},[33,553,92],{"emptyLinePlaceholder":91},[33,555,556],{"class":35,"line":266},[33,557,92],{"emptyLinePlaceholder":91},[33,559,560,563,566,569,572],{"class":35,"line":290},[33,561,562],{"class":163},"def",[33,564,565],{"class":46}," run_daily_pipeline",[33,567,568],{"class":167},"() -> ",[33,570,571],{"class":50},"None",[33,573,574],{"class":167},":\n",[33,576,577],{"class":35,"line":295},[33,578,579],{"class":54},"    \"\"\"Extract tables from every PDF in INPUT_DIR and write a combined Excel report.\"\"\"\n",[33,581,582,585,587],{"class":35,"line":300},[33,583,584],{"class":167},"    frames ",[33,586,242],{"class":163},[33,588,589],{"class":167}," []\n",[33,591,592,595,597,600,603,605,608,611],{"class":35,"line":317},[33,593,594],{"class":167},"    pdf_files ",[33,596,242],{"class":163},[33,598,599],{"class":50}," list",[33,601,602],{"class":167},"(",[33,604,507],{"class":50},[33,606,607],{"class":167},".glob(",[33,609,610],{"class":54},"\"*.pdf\"",[33,612,371],{"class":167},[33,614,615,618,621],{"class":35,"line":332},[33,616,617],{"class":163},"    
if",[33,619,620],{"class":163}," not",[33,622,623],{"class":167}," pdf_files:\n",[33,625,626,629,632,634,636,638,640],{"class":35,"line":347},[33,627,628],{"class":167},"        logger.warning(",[33,630,631],{"class":54},"\"No PDFs found in ",[33,633,309],{"class":50},[33,635,274],{"class":54},[33,637,365],{"class":167},[33,639,507],{"class":50},[33,641,221],{"class":167},[33,643,644],{"class":35,"line":374},[33,645,646],{"class":163},"        return\n",[33,648,649],{"class":35,"line":397},[33,650,92],{"emptyLinePlaceholder":91},[33,652,654,657,660,663],{"class":35,"line":653},26,[33,655,656],{"class":163},"    for",[33,658,659],{"class":167}," pdf_path ",[33,661,662],{"class":163},"in",[33,664,623],{"class":167},[33,666,668,671],{"class":35,"line":667},27,[33,669,670],{"class":163},"        try",[33,672,574],{"class":167},[33,674,676,679,682,684],{"class":35,"line":675},28,[33,677,678],{"class":163},"            with",[33,680,681],{"class":167}," pdfplumber.open(pdf_path) ",[33,683,495],{"class":163},[33,685,686],{"class":167}," pdf:\n",[33,688,690,693,696,698],{"class":35,"line":689},29,[33,691,692],{"class":163},"                for",[33,694,695],{"class":167}," page ",[33,697,662],{"class":163},[33,699,700],{"class":167}," pdf.pages:\n",[33,702,704,707,709],{"class":35,"line":703},30,[33,705,706],{"class":167},"                    table ",[33,708,242],{"class":163},[33,710,711],{"class":167}," page.extract_table()\n",[33,713,715,718],{"class":35,"line":714},31,[33,716,717],{"class":163},"                    if",[33,719,720],{"class":167}," table:\n",[33,722,724,727,729,732,735,738,741,743,746,749],{"class":35,"line":723},32,[33,725,726],{"class":167},"                        df ",[33,728,242],{"class":163},[33,730,731],{"class":167}," pd.DataFrame(table[",[33,733,734],{"class":50},"1",[33,736,737],{"class":167},":], 
",[33,739,740],{"class":238},"columns",[33,742,242],{"class":163},[33,744,745],{"class":167},"table[",[33,747,748],{"class":50},"0",[33,750,751],{"class":167},"])\n",[33,753,755,758,761,764,766],{"class":35,"line":754},33,[33,756,757],{"class":167},"                        df[",[33,759,760],{"class":54},"\"_source\"",[33,762,763],{"class":167},"] ",[33,765,242],{"class":163},[33,767,768],{"class":167}," pdf_path.name\n",[33,770,772],{"class":35,"line":771},34,[33,773,774],{"class":167},"                        frames.append(df)\n",[33,776,778,781,784],{"class":35,"line":777},35,[33,779,780],{"class":163},"        except",[33,782,783],{"class":50}," Exception",[33,785,574],{"class":167},[33,787,789,792,795,797,799],{"class":35,"line":788},36,[33,790,791],{"class":167},"            logger.exception(",[33,793,794],{"class":54},"\"Failed to parse ",[33,796,309],{"class":50},[33,798,274],{"class":54},[33,800,801],{"class":167},", pdf_path.name)\n",[33,803,805],{"class":35,"line":804},37,[33,806,92],{"emptyLinePlaceholder":91},[33,808,810,812,814],{"class":35,"line":809},38,[33,811,617],{"class":163},[33,813,620],{"class":163},[33,815,816],{"class":167}," frames:\n",[33,818,820,822,825],{"class":35,"line":819},39,[33,821,628],{"class":167},[33,823,824],{"class":54},"\"No tables extracted — nothing to write\"",[33,826,221],{"class":167},[33,828,830],{"class":35,"line":829},40,[33,831,646],{"class":163},[33,833,835],{"class":35,"line":834},41,[33,836,92],{"emptyLinePlaceholder":91},[33,838,840,843,845,848,851,853,856],{"class":35,"line":839},42,[33,841,842],{"class":167},"    combined ",[33,844,242],{"class":163},[33,846,847],{"class":167}," pd.concat(frames, ",[33,849,850],{"class":238},"ignore_index",[33,852,242],{"class":163},[33,854,855],{"class":50},"True",[33,857,221],{"class":167},[33,859,861,864,867,870,872,874,876,879,881,883],{"class":35,"line":860},43,[33,862,863],{"class":50},"    
OUTPUT_PATH",[33,865,866],{"class":167},".parent.mkdir(",[33,868,869],{"class":238},"parents",[33,871,242],{"class":163},[33,873,855],{"class":50},[33,875,365],{"class":167},[33,877,878],{"class":238},"exist_ok",[33,880,242],{"class":163},[33,882,855],{"class":50},[33,884,221],{"class":167},[33,886,888,891,893,895,898,900,903],{"class":35,"line":887},44,[33,889,890],{"class":167},"    combined.to_excel(",[33,892,521],{"class":50},[33,894,365],{"class":167},[33,896,897],{"class":238},"index",[33,899,242],{"class":163},[33,901,902],{"class":50},"False",[33,904,221],{"class":167},[33,906,908,911,914,917,920,922,924,926,929,932,934],{"class":35,"line":907},45,[33,909,910],{"class":167},"    logger.info(",[33,912,913],{"class":54},"\"Wrote ",[33,915,916],{"class":50},"%d",[33,918,919],{"class":54}," rows to ",[33,921,309],{"class":50},[33,923,274],{"class":54},[33,925,365],{"class":167},[33,927,928],{"class":50},"len",[33,930,931],{"class":167},"(combined), ",[33,933,521],{"class":50},[33,935,221],{"class":167},[14,937,938,939,944,945,949],{},"The same wrapping pattern applies whether your pipeline uses ",[940,941,943],"a",{"href":942},"\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002F","pdfplumber"," to pull raw tables or goes through the full ",[940,946,948],{"href":947},"\u002Fautomating-document-data-pipelines\u002Fextracting-pdf-data-into-pandas\u002F","Extracting PDF Data into pandas"," workflow first.",[424,951,953],{"id":952},"step-2-add-structured-logging","Step 2 — Add Structured Logging",[14,955,956,957,960,961,964],{},"The stdlib ",[30,958,959],{},"logging"," module is sufficient for most pipelines. 
Use a ",[30,962,963],{},"RotatingFileHandler"," so logs do not grow unbounded, and emit at the right level so production logs stay readable.",[23,966,968],{"className":126,"code":967,"language":47,"meta":28,"style":28},"# pip install (none — stdlib only)\n\"\"\"\nStep 2: Configure structured, rotating file logging.\nCall configure_logging() once at the top of your entry-point script.\n\"\"\"\nimport logging\nimport logging.handlers\nfrom pathlib import Path\n\nLOG_DIR = Path(\"\u002Fvar\u002Flog\u002Fdoc-pipeline\")\n\n\ndef configure_logging(name: str, level: int = logging.INFO) -> logging.Logger:\n    LOG_DIR.mkdir(parents=True, exist_ok=True)\n    log_file = LOG_DIR \u002F f\"{name}.log\"\n\n    handler = logging.handlers.RotatingFileHandler(\n        log_file,\n        maxBytes=5 * 1024 * 1024,   # 5 MB per file\n        backupCount=7,               # keep the 7 most recent size-rotated backups\n        encoding=\"utf-8\",\n    )\n    formatter = logging.Formatter(\n        fmt=\"%(asctime)s %(name)s %(levelname)s %(message)s\",\n        datefmt=\"%Y-%m-%dT%H:%M:%S\",\n    )\n    handler.setFormatter(formatter)\n\n    root = logging.getLogger()\n    root.setLevel(level)\n    root.addHandler(handler)\n\n    # Also write WARNING+ to stderr so cron can capture it in its mail\n    stderr_handler = logging.StreamHandler()\n    stderr_handler.setLevel(logging.WARNING)\n    stderr_handler.setFormatter(formatter)\n    root.addHandler(stderr_handler)\n\n    return logging.getLogger(name)\n",[30,969,970,974,978,983,988,992,998,1005,1015,1019,1033,1037,1041,1071,1095,1125,1129,1139,1144,1170,1186,1198,1203,1213,1235,1252,1256,1261,1265,1275,1280,1285,1289,1294,1304,1314,1319,1324,1328],{"__ignoreMap":28},[33,971,972],{"class":35,"line":36},[33,973,134],{"class":39},[33,975,976],{"class":35,"line":43},[33,977,139],{"class":54},[33,979,980],{"class":35,"line":61},[33,981,982],{"class":54},"Step 2: Configure structured, rotating file 
logging.\n",[33,984,985],{"class":35,"line":73},[33,986,987],{"class":54},"Call configure_logging() once at the top of your entry-point script.\n",[33,989,990],{"class":35,"line":88},[33,991,139],{"class":54},[33,993,994,996],{"class":35,"line":95},[33,995,164],{"class":163},[33,997,184],{"class":167},[33,999,1000,1002],{"class":35,"line":101},[33,1001,164],{"class":163},[33,1003,1004],{"class":167}," logging.handlers\n",[33,1006,1007,1009,1011,1013],{"class":35,"line":171},[33,1008,190],{"class":163},[33,1010,193],{"class":167},[33,1012,164],{"class":163},[33,1014,198],{"class":167},[33,1016,1017],{"class":35,"line":179},[33,1018,92],{"emptyLinePlaceholder":91},[33,1020,1021,1024,1026,1028,1031],{"class":35,"line":187},[33,1022,1023],{"class":50},"LOG_DIR",[33,1025,212],{"class":163},[33,1027,215],{"class":167},[33,1029,1030],{"class":54},"\"\u002Fvar\u002Flog\u002Fdoc-pipeline\"",[33,1032,221],{"class":167},[33,1034,1035],{"class":35,"line":201},[33,1036,92],{"emptyLinePlaceholder":91},[33,1038,1039],{"class":35,"line":206},[33,1040,92],{"emptyLinePlaceholder":91},[33,1042,1043,1045,1048,1051,1054,1057,1060,1062,1065,1068],{"class":35,"line":224},[33,1044,562],{"class":163},[33,1046,1047],{"class":46}," configure_logging",[33,1049,1050],{"class":167},"(name: ",[33,1052,1053],{"class":50},"str",[33,1055,1056],{"class":167},", level: ",[33,1058,1059],{"class":50},"int",[33,1061,212],{"class":163},[33,1063,1064],{"class":167}," logging.",[33,1066,1067],{"class":50},"INFO",[33,1069,1070],{"class":167},") -> logging.Logger:\n",[33,1072,1073,1076,1079,1081,1083,1085,1087,1089,1091,1093],{"class":35,"line":229},[33,1074,1075],{"class":50},"    
LOG_DIR",[33,1077,1078],{"class":167},".mkdir(",[33,1080,869],{"class":238},[33,1082,242],{"class":163},[33,1084,855],{"class":50},[33,1086,365],{"class":167},[33,1088,878],{"class":238},[33,1090,242],{"class":163},[33,1092,855],{"class":50},[33,1094,221],{"class":167},[33,1096,1097,1100,1102,1105,1108,1111,1113,1116,1119,1122],{"class":35,"line":235},[33,1098,1099],{"class":167},"    log_file ",[33,1101,242],{"class":163},[33,1103,1104],{"class":50}," LOG_DIR",[33,1106,1107],{"class":163}," \u002F",[33,1109,1110],{"class":163}," f",[33,1112,274],{"class":54},[33,1114,1115],{"class":50},"{",[33,1117,1118],{"class":167},"name",[33,1120,1121],{"class":50},"}",[33,1123,1124],{"class":54},".log\"\n",[33,1126,1127],{"class":35,"line":250},[33,1128,92],{"emptyLinePlaceholder":91},[33,1130,1131,1134,1136],{"class":35,"line":266},[33,1132,1133],{"class":167},"    handler ",[33,1135,242],{"class":163},[33,1137,1138],{"class":167}," logging.handlers.RotatingFileHandler(\n",[33,1140,1141],{"class":35,"line":290},[33,1142,1143],{"class":167},"        log_file,\n",[33,1145,1146,1149,1151,1154,1157,1160,1162,1164,1167],{"class":35,"line":295},[33,1147,1148],{"class":238},"        maxBytes",[33,1150,242],{"class":163},[33,1152,1153],{"class":50},"5",[33,1155,1156],{"class":163}," *",[33,1158,1159],{"class":50}," 1024",[33,1161,1156],{"class":163},[33,1163,1159],{"class":50},[33,1165,1166],{"class":167},",   ",[33,1168,1169],{"class":39},"# 5 MB per file\n",[33,1171,1172,1175,1177,1180,1183],{"class":35,"line":300},[33,1173,1174],{"class":238},"        backupCount",[33,1176,242],{"class":163},[33,1178,1179],{"class":50},"7",[33,1181,1182],{"class":167},",               ",[33,1184,1185],{"class":39},"# keep the 7 most recent size-rotated backups\n",[33,1187,1188,1191,1193,1196],{"class":35,"line":317},[33,1189,1190],{"class":238},"        
encoding",[33,1192,242],{"class":163},[33,1194,1195],{"class":54},"\"utf-8\"",[33,1197,247],{"class":167},[33,1199,1200],{"class":35,"line":332},[33,1201,1202],{"class":167},"    )\n",[33,1204,1205,1208,1210],{"class":35,"line":347},[33,1206,1207],{"class":167},"    formatter ",[33,1209,242],{"class":163},[33,1211,1212],{"class":167}," logging.Formatter(\n",[33,1214,1215,1218,1220,1222,1224,1227,1229,1231,1233],{"class":35,"line":374},[33,1216,1217],{"class":238},"        fmt",[33,1219,242],{"class":163},[33,1221,274],{"class":54},[33,1223,277],{"class":50},[33,1225,1226],{"class":50}," %(name)s",[33,1228,280],{"class":50},[33,1230,283],{"class":50},[33,1232,274],{"class":54},[33,1234,247],{"class":167},[33,1236,1237,1240,1242,1245,1247,1250],{"class":35,"line":397},[33,1238,1239],{"class":238},"        datefmt",[33,1241,242],{"class":163},[33,1243,1244],{"class":54},"\"%Y-%m-",[33,1246,916],{"class":50},[33,1248,1249],{"class":54},"T%H:%M:%S\"",[33,1251,247],{"class":167},[33,1253,1254],{"class":35,"line":653},[33,1255,1202],{"class":167},[33,1257,1258],{"class":35,"line":667},[33,1259,1260],{"class":167},"    handler.setFormatter(formatter)\n",[33,1262,1263],{"class":35,"line":675},[33,1264,92],{"emptyLinePlaceholder":91},[33,1266,1267,1270,1272],{"class":35,"line":689},[33,1268,1269],{"class":167},"    root ",[33,1271,242],{"class":163},[33,1273,1274],{"class":167}," logging.getLogger()\n",[33,1276,1277],{"class":35,"line":703},[33,1278,1279],{"class":167},"    root.setLevel(level)\n",[33,1281,1282],{"class":35,"line":714},[33,1283,1284],{"class":167},"    root.addHandler(handler)\n",[33,1286,1287],{"class":35,"line":723},[33,1288,92],{"emptyLinePlaceholder":91},[33,1290,1291],{"class":35,"line":754},[33,1292,1293],{"class":39},"    # Also write WARNING+ to stderr so cron can capture it in its mail\n",[33,1295,1296,1299,1301],{"class":35,"line":771},[33,1297,1298],{"class":167},"    stderr_handler ",[33,1300,242],{"class":163},[33,1302,1303],{"class":167}," 
logging.StreamHandler()\n",[33,1305,1306,1309,1312],{"class":35,"line":777},[33,1307,1308],{"class":167},"    stderr_handler.setLevel(logging.",[33,1310,1311],{"class":50},"WARNING",[33,1313,221],{"class":167},[33,1315,1316],{"class":35,"line":788},[33,1317,1318],{"class":167},"    stderr_handler.setFormatter(formatter)\n",[33,1320,1321],{"class":35,"line":804},[33,1322,1323],{"class":167},"    root.addHandler(stderr_handler)\n",[33,1325,1326],{"class":35,"line":809},[33,1327,92],{"emptyLinePlaceholder":91},[33,1329,1330,1333],{"class":35,"line":819},[33,1331,1332],{"class":163},"    return",[33,1334,1335],{"class":167}," logging.getLogger(name)\n",[14,1337,1338,1339,1341,1342,1344,1345,1347,1348,1351,1352,1355],{},"Log at ",[30,1340,261],{}," inside inner loops, ",[30,1343,1067],{}," at job boundaries (started \u002F finished N rows), ",[30,1346,1311],{}," for recoverable anomalies (empty input, skipped file), and ",[30,1349,1350],{},"ERROR","\u002F",[30,1353,1354],{},"CRITICAL"," for failures that need attention.",[424,1357,1359],{"id":1358},"step-3-add-retry-and-backoff","Step 3 — Add Retry and Backoff",[14,1361,1362,1363,1369],{},"Transient failures — network timeouts, locked files, momentarily unavailable APIs — should not abort the whole job. 
Use ",[940,1364,1368],{"href":1365,"rel":1366},"https:\u002F\u002Ftenacity.readthedocs.io\u002F",[1367],"nofollow","tenacity"," for declarative retry logic, or a hand-rolled decorator when you want zero extra dependencies.",[23,1371,1373],{"className":126,"code":1372,"language":47,"meta":28,"style":28},"# pip install tenacity\n\"\"\"\nStep 3: Retry with exponential backoff using tenacity.\nApply @retry_transient to any function that calls a network or file-system resource.\n\"\"\"\nimport logging\nfrom tenacity import (\n    retry,\n    stop_after_attempt,\n    wait_exponential,\n    retry_if_exception_type,\n    before_sleep_log,\n)\n\nlogger = logging.getLogger(__name__)\n\n# Tenacity decorator — 4 attempts, 2 s → 4 s → 8 s backoff\nretry_transient = retry(\n    reraise=True,\n    stop=stop_after_attempt(4),\n    wait=wait_exponential(multiplier=1, min=2, max=30),\n    retry=retry_if_exception_type((IOError, TimeoutError, ConnectionError)),\n    before_sleep=before_sleep_log(logger, logging.WARNING),\n)\n\n\n@retry_transient\ndef fetch_remote_report(url: str) -> bytes:\n    import urllib.request\n    with urllib.request.urlopen(url, timeout=15) as resp:\n        return resp.read()\n\n\n# Hand-rolled alternative (zero deps)\nimport functools\nimport time\n\n\ndef retry(attempts: int = 3, delay: float = 2.0, backoff: float = 2.0):\n    def decorator(fn):\n        @functools.wraps(fn)\n        def wrapper(*args, **kwargs):\n            wait = delay\n            for attempt in range(1, attempts + 1):\n                try:\n                    return fn(*args, **kwargs)\n                except Exception as exc:\n                    if attempt == attempts:\n                        raise\n                    logger.warning(\n                        \"%s attempt %d\u002F%d failed: %s — retrying in %.1fs\",\n                        fn.__name__, attempt, attempts, exc, wait,\n                    )\n                    time.sleep(wait)\n                    wait *= backoff\n 
       return wrapper\n    return decorator\n",[30,1374,1375,1380,1384,1389,1394,1398,1404,1416,1421,1426,1431,1436,1441,1445,1449,1461,1465,1470,1480,1491,1507,1546,1572,1586,1590,1594,1598,1603,1623,1631,1655,1663,1667,1671,1676,1683,1690,1694,1698,1738,1749,1757,1779,1789,1817,1824,1842,1856,1869,1875,1881,1913,1924,1930,1936,1948,1956],{"__ignoreMap":28},[33,1376,1377],{"class":35,"line":36},[33,1378,1379],{"class":39},"# pip install tenacity\n",[33,1381,1382],{"class":35,"line":43},[33,1383,139],{"class":54},[33,1385,1386],{"class":35,"line":61},[33,1387,1388],{"class":54},"Step 3: Retry with exponential backoff using tenacity.\n",[33,1390,1391],{"class":35,"line":73},[33,1392,1393],{"class":54},"Apply @retry_transient to any function that calls a network or file-system resource.\n",[33,1395,1396],{"class":35,"line":88},[33,1397,139],{"class":54},[33,1399,1400,1402],{"class":35,"line":95},[33,1401,164],{"class":163},[33,1403,184],{"class":167},[33,1405,1406,1408,1411,1413],{"class":35,"line":101},[33,1407,190],{"class":163},[33,1409,1410],{"class":167}," tenacity ",[33,1412,164],{"class":163},[33,1414,1415],{"class":167}," (\n",[33,1417,1418],{"class":35,"line":171},[33,1419,1420],{"class":167},"    retry,\n",[33,1422,1423],{"class":35,"line":179},[33,1424,1425],{"class":167},"    stop_after_attempt,\n",[33,1427,1428],{"class":35,"line":187},[33,1429,1430],{"class":167},"    wait_exponential,\n",[33,1432,1433],{"class":35,"line":201},[33,1434,1435],{"class":167},"    retry_if_exception_type,\n",[33,1437,1438],{"class":35,"line":206},[33,1439,1440],{"class":167},"    
before_sleep_log,\n",[33,1442,1443],{"class":35,"line":224},[33,1444,221],{"class":167},[33,1446,1447],{"class":35,"line":229},[33,1448,92],{"emptyLinePlaceholder":91},[33,1450,1451,1453,1455,1457,1459],{"class":35,"line":235},[33,1452,539],{"class":167},[33,1454,242],{"class":163},[33,1456,544],{"class":167},[33,1458,547],{"class":50},[33,1460,221],{"class":167},[33,1462,1463],{"class":35,"line":250},[33,1464,92],{"emptyLinePlaceholder":91},[33,1466,1467],{"class":35,"line":266},[33,1468,1469],{"class":39},"# Tenacity decorator — 4 attempts, 2 s → 4 s → 8 s backoff\n",[33,1471,1472,1475,1477],{"class":35,"line":290},[33,1473,1474],{"class":167},"retry_transient ",[33,1476,242],{"class":163},[33,1478,1479],{"class":167}," retry(\n",[33,1481,1482,1485,1487,1489],{"class":35,"line":295},[33,1483,1484],{"class":238},"    reraise",[33,1486,242],{"class":163},[33,1488,855],{"class":50},[33,1490,247],{"class":167},[33,1492,1493,1496,1498,1501,1504],{"class":35,"line":300},[33,1494,1495],{"class":238},"    stop",[33,1497,242],{"class":163},[33,1499,1500],{"class":167},"stop_after_attempt(",[33,1502,1503],{"class":50},"4",[33,1505,1506],{"class":167},"),\n",[33,1508,1509,1512,1514,1517,1520,1522,1524,1526,1529,1531,1534,1536,1539,1541,1544],{"class":35,"line":317},[33,1510,1511],{"class":238},"    wait",[33,1513,242],{"class":163},[33,1515,1516],{"class":167},"wait_exponential(",[33,1518,1519],{"class":238},"multiplier",[33,1521,242],{"class":163},[33,1523,734],{"class":50},[33,1525,365],{"class":167},[33,1527,1528],{"class":238},"min",[33,1530,242],{"class":163},[33,1532,1533],{"class":50},"2",[33,1535,365],{"class":167},[33,1537,1538],{"class":238},"max",[33,1540,242],{"class":163},[33,1542,1543],{"class":50},"30",[33,1545,1506],{"class":167},[33,1547,1548,1551,1553,1556,1559,1561,1564,1566,1569],{"class":35,"line":332},[33,1549,1550],{"class":238},"    
retry",[33,1552,242],{"class":163},[33,1554,1555],{"class":167},"retry_if_exception_type((",[33,1557,1558],{"class":50},"IOError",[33,1560,365],{"class":167},[33,1562,1563],{"class":50},"TimeoutError",[33,1565,365],{"class":167},[33,1567,1568],{"class":50},"ConnectionError",[33,1570,1571],{"class":167},")),\n",[33,1573,1574,1577,1579,1582,1584],{"class":35,"line":347},[33,1575,1576],{"class":238},"    before_sleep",[33,1578,242],{"class":163},[33,1580,1581],{"class":167},"before_sleep_log(logger, logging.",[33,1583,1311],{"class":50},[33,1585,1506],{"class":167},[33,1587,1588],{"class":35,"line":374},[33,1589,221],{"class":167},[33,1591,1592],{"class":35,"line":397},[33,1593,92],{"emptyLinePlaceholder":91},[33,1595,1596],{"class":35,"line":653},[33,1597,92],{"emptyLinePlaceholder":91},[33,1599,1600],{"class":35,"line":667},[33,1601,1602],{"class":46},"@retry_transient\n",[33,1604,1605,1607,1610,1613,1615,1618,1621],{"class":35,"line":675},[33,1606,562],{"class":163},[33,1608,1609],{"class":46}," fetch_remote_report",[33,1611,1612],{"class":167},"(url: ",[33,1614,1053],{"class":50},[33,1616,1617],{"class":167},") -> ",[33,1619,1620],{"class":50},"bytes",[33,1622,574],{"class":167},[33,1624,1625,1628],{"class":35,"line":689},[33,1626,1627],{"class":163},"    import",[33,1629,1630],{"class":167}," urllib.request\n",[33,1632,1633,1636,1639,1642,1644,1647,1650,1652],{"class":35,"line":703},[33,1634,1635],{"class":163},"    with",[33,1637,1638],{"class":167}," urllib.request.urlopen(url, ",[33,1640,1641],{"class":238},"timeout",[33,1643,242],{"class":163},[33,1645,1646],{"class":50},"15",[33,1648,1649],{"class":167},") ",[33,1651,495],{"class":163},[33,1653,1654],{"class":167}," resp:\n",[33,1656,1657,1660],{"class":35,"line":714},[33,1658,1659],{"class":163},"        return",[33,1661,1662],{"class":167}," 
resp.read()\n",[33,1664,1665],{"class":35,"line":723},[33,1666,92],{"emptyLinePlaceholder":91},[33,1668,1669],{"class":35,"line":754},[33,1670,92],{"emptyLinePlaceholder":91},[33,1672,1673],{"class":35,"line":771},[33,1674,1675],{"class":39},"# Hand-rolled alternative (zero deps)\n",[33,1677,1678,1680],{"class":35,"line":777},[33,1679,164],{"class":163},[33,1681,1682],{"class":167}," functools\n",[33,1684,1685,1687],{"class":35,"line":788},[33,1686,164],{"class":163},[33,1688,1689],{"class":167}," time\n",[33,1691,1692],{"class":35,"line":804},[33,1693,92],{"emptyLinePlaceholder":91},[33,1695,1696],{"class":35,"line":809},[33,1697,92],{"emptyLinePlaceholder":91},[33,1699,1700,1702,1705,1708,1710,1712,1715,1718,1721,1723,1726,1729,1731,1733,1735],{"class":35,"line":819},[33,1701,562],{"class":163},[33,1703,1704],{"class":46}," retry",[33,1706,1707],{"class":167},"(attempts: ",[33,1709,1059],{"class":50},[33,1711,212],{"class":163},[33,1713,1714],{"class":50}," 3",[33,1716,1717],{"class":167},", delay: ",[33,1719,1720],{"class":50},"float",[33,1722,212],{"class":163},[33,1724,1725],{"class":50}," 2.0",[33,1727,1728],{"class":167},", backoff: ",[33,1730,1720],{"class":50},[33,1732,212],{"class":163},[33,1734,1725],{"class":50},[33,1736,1737],{"class":167},"):\n",[33,1739,1740,1743,1746],{"class":35,"line":829},[33,1741,1742],{"class":163},"    def",[33,1744,1745],{"class":46}," decorator",[33,1747,1748],{"class":167},"(fn):\n",[33,1750,1751,1754],{"class":35,"line":834},[33,1752,1753],{"class":46},"        @functools.wraps",[33,1755,1756],{"class":167},"(fn)\n",[33,1758,1759,1762,1765,1767,1770,1773,1776],{"class":35,"line":839},[33,1760,1761],{"class":163},"        def",[33,1763,1764],{"class":46}," wrapper",[33,1766,602],{"class":167},[33,1768,1769],{"class":163},"*",[33,1771,1772],{"class":167},"args, ",[33,1774,1775],{"class":163},"**",[33,1777,1778],{"class":167},"kwargs):\n",[33,1780,1781,1784,1786],{"class":35,"line":860},[33,1782,1783],{"class":167},"          
  wait ",[33,1785,242],{"class":163},[33,1787,1788],{"class":167}," delay\n",[33,1790,1791,1794,1797,1799,1802,1804,1806,1809,1812,1815],{"class":35,"line":887},[33,1792,1793],{"class":163},"            for",[33,1795,1796],{"class":167}," attempt ",[33,1798,662],{"class":163},[33,1800,1801],{"class":50}," range",[33,1803,602],{"class":167},[33,1805,734],{"class":50},[33,1807,1808],{"class":167},", attempts ",[33,1810,1811],{"class":163},"+",[33,1813,1814],{"class":50}," 1",[33,1816,1737],{"class":167},[33,1818,1819,1822],{"class":35,"line":907},[33,1820,1821],{"class":163},"                try",[33,1823,574],{"class":167},[33,1825,1827,1830,1833,1835,1837,1839],{"class":35,"line":1826},46,[33,1828,1829],{"class":163},"                    return",[33,1831,1832],{"class":167}," fn(",[33,1834,1769],{"class":163},[33,1836,1772],{"class":167},[33,1838,1775],{"class":163},[33,1840,1841],{"class":167},"kwargs)\n",[33,1843,1845,1848,1850,1853],{"class":35,"line":1844},47,[33,1846,1847],{"class":163},"                except",[33,1849,783],{"class":50},[33,1851,1852],{"class":163}," as",[33,1854,1855],{"class":167}," exc:\n",[33,1857,1859,1861,1863,1866],{"class":35,"line":1858},48,[33,1860,717],{"class":163},[33,1862,1796],{"class":167},[33,1864,1865],{"class":163},"==",[33,1867,1868],{"class":167}," attempts:\n",[33,1870,1872],{"class":35,"line":1871},49,[33,1873,1874],{"class":163},"                        raise\n",[33,1876,1878],{"class":35,"line":1877},50,[33,1879,1880],{"class":167},"                    logger.warning(\n",[33,1882,1884,1887,1889,1891,1893,1895,1897,1900,1902,1905,1908,1911],{"class":35,"line":1883},51,[33,1885,1886],{"class":54},"                        \"",[33,1888,309],{"class":50},[33,1890,1796],{"class":54},[33,1892,916],{"class":50},[33,1894,1351],{"class":54},[33,1896,916],{"class":50},[33,1898,1899],{"class":54}," failed: ",[33,1901,309],{"class":50},[33,1903,1904],{"class":54}," — retrying in 
",[33,1906,1907],{"class":50},"%.1f",[33,1909,1910],{"class":54},"s\"",[33,1912,247],{"class":167},[33,1914,1916,1919,1921],{"class":35,"line":1915},52,[33,1917,1918],{"class":167},"                        fn.",[33,1920,547],{"class":50},[33,1922,1923],{"class":167},", attempt, attempts, exc, wait,\n",[33,1925,1927],{"class":35,"line":1926},53,[33,1928,1929],{"class":167},"                    )\n",[33,1931,1933],{"class":35,"line":1932},54,[33,1934,1935],{"class":167},"                    time.sleep(wait)\n",[33,1937,1939,1942,1945],{"class":35,"line":1938},55,[33,1940,1941],{"class":167},"                    wait ",[33,1943,1944],{"class":163},"*=",[33,1946,1947],{"class":167}," backoff\n",[33,1949,1951,1953],{"class":35,"line":1950},56,[33,1952,1659],{"class":163},[33,1954,1955],{"class":167}," wrapper\n",[33,1957,1959,1961],{"class":35,"line":1958},57,[33,1960,1332],{"class":163},[33,1962,1963],{"class":167}," decorator\n",[424,1965,1967],{"id":1966},"step-4-schedule-the-job","Step 4 — Schedule the Job",[14,1969,1970],{},"Pick the scheduler that matches your environment.",[14,1972,1973],{},[1974,1975,1976],"strong",{},"cron (Linux\u002FmacOS)",[23,1978,1980],{"className":25,"code":1979,"language":27,"meta":28,"style":28},"# Run daily_pipeline.py every weekday at 06:30 using the venv interpreter\ncrontab -e\n",[30,1981,1982,1987],{"__ignoreMap":28},[33,1983,1984],{"class":35,"line":36},[33,1985,1986],{"class":39},"# Run daily_pipeline.py every weekday at 06:30 using the venv interpreter\n",[33,1988,1989,1992],{"class":35,"line":43},[33,1990,1991],{"class":46},"crontab",[33,1993,1994],{"class":50}," -e\n",[23,1996,2001],{"className":1997,"code":1999,"language":2000},[1998],"language-text","30 6 * * 1-5 \u002Fdata\u002F.venv\u002Fbin\u002Fpython \u002Fdata\u002Fscripts\u002Fdaily_pipeline.py >> \u002Fvar\u002Flog\u002Fdoc-pipeline\u002Fcron.log 2>&1\n","text",[30,2002,1999],{"__ignoreMap":28},[14,2004,2005,2006,2008,2009,2012,2013,2015,2016,2019],{},"Always use the 
absolute path to the venv's ",[30,2007,47],{},", not ",[30,2010,2011],{},"python3"," or ",[30,2014,47],{},". The ",[30,2017,2018],{},">> ... 2>&1"," captures any stderr (uncaught exceptions) alongside cron's own output mail.",[14,2021,2022],{},[1974,2023,2024],{},"Windows Task Scheduler",[23,2026,2030],{"className":2027,"code":2028,"language":2029,"meta":28,"style":28},"language-batch shiki shiki-themes github-light","rem Trigger: Daily at 06:30\nrem Program\u002Fscript: C:\\data\\.venv\\Scripts\\python.exe\nrem Arguments:    C:\\data\\scripts\\daily_pipeline.py\nrem Start in:     C:\\data\\scripts\n","batch",[30,2031,2032,2037,2042,2047],{"__ignoreMap":28},[33,2033,2034],{"class":35,"line":36},[33,2035,2036],{},"rem Trigger: Daily at 06:30\n",[33,2038,2039],{"class":35,"line":43},[33,2040,2041],{},"rem Program\u002Fscript: C:\\data\\.venv\\Scripts\\python.exe\n",[33,2043,2044],{"class":35,"line":61},[33,2045,2046],{},"rem Arguments:    C:\\data\\scripts\\daily_pipeline.py\n",[33,2048,2049],{"class":35,"line":73},[33,2050,2051],{},"rem Start in:     C:\\data\\scripts\n",[14,2053,2054],{},"Set \"Run whether user is logged on or not\" and \"Run with highest privileges\" if the script writes to protected paths.",[14,2056,2057],{},[1974,2058,2059],{},"GitHub Actions (CI\u002Fcloud)",[23,2061,2065],{"className":2062,"code":2063,"language":2064,"meta":28,"style":28},"language-yaml shiki shiki-themes github-light","# .github\u002Fworkflows\u002Fdaily_pipeline.yml\nname: Daily Document Pipeline\n\non:\n  schedule:\n    - cron: \"30 6 * * 1-5\"   # UTC\n  workflow_dispatch:           # allow manual trigger\n\njobs:\n  run-pipeline:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions\u002Fcheckout@v4\n      - uses: actions\u002Fsetup-python@v5\n        with:\n          python-version: \"3.12\"\n      - name: Install deps\n        run: pip install -r requirements.txt\n      - name: Run pipeline\n        env:\n          SMTP_HOST: ${{ secrets.SMTP_HOST }}\n          
SMTP_PASS: ${{ secrets.SMTP_PASS }}\n        run: python scripts\u002Fdaily_pipeline.py --output \u002Ftmp\u002Freport.xlsx\n      - name: Upload report\n        uses: actions\u002Fupload-artifact@v4\n        with:\n          name: daily-report\n          path: \u002Ftmp\u002Freport.xlsx\n","yaml",[30,2066,2067,2072,2083,2087,2094,2101,2117,2128,2132,2139,2146,2156,2163,2176,2187,2194,2204,2215,2225,2236,2243,2253,2263,2272,2283,2293,2299,2309],{"__ignoreMap":28},[33,2068,2069],{"class":35,"line":36},[33,2070,2071],{"class":39},"# .github\u002Fworkflows\u002Fdaily_pipeline.yml\n",[33,2073,2074,2077,2080],{"class":35,"line":43},[33,2075,1118],{"class":2076},"shJU0",[33,2078,2079],{"class":167},": ",[33,2081,2082],{"class":54},"Daily Document Pipeline\n",[33,2084,2085],{"class":35,"line":61},[33,2086,92],{"emptyLinePlaceholder":91},[33,2088,2089,2092],{"class":35,"line":73},[33,2090,2091],{"class":50},"on",[33,2093,574],{"class":167},[33,2095,2096,2099],{"class":35,"line":88},[33,2097,2098],{"class":2076},"  schedule",[33,2100,574],{"class":167},[33,2102,2103,2106,2109,2111,2114],{"class":35,"line":95},[33,2104,2105],{"class":167},"    - ",[33,2107,2108],{"class":2076},"cron",[33,2110,2079],{"class":167},[33,2112,2113],{"class":54},"\"30 6 * * 1-5\"",[33,2115,2116],{"class":39},"   # UTC\n",[33,2118,2119,2122,2125],{"class":35,"line":101},[33,2120,2121],{"class":2076},"  workflow_dispatch",[33,2123,2124],{"class":167},":           ",[33,2126,2127],{"class":39},"# allow manual trigger\n",[33,2129,2130],{"class":35,"line":171},[33,2131,92],{"emptyLinePlaceholder":91},[33,2133,2134,2137],{"class":35,"line":179},[33,2135,2136],{"class":2076},"jobs",[33,2138,574],{"class":167},[33,2140,2141,2144],{"class":35,"line":187},[33,2142,2143],{"class":2076},"  run-pipeline",[33,2145,574],{"class":167},[33,2147,2148,2151,2153],{"class":35,"line":201},[33,2149,2150],{"class":2076},"    
runs-on",[33,2152,2079],{"class":167},[33,2154,2155],{"class":54},"ubuntu-latest\n",[33,2157,2158,2161],{"class":35,"line":206},[33,2159,2160],{"class":2076},"    steps",[33,2162,574],{"class":167},[33,2164,2165,2168,2171,2173],{"class":35,"line":224},[33,2166,2167],{"class":167},"      - ",[33,2169,2170],{"class":2076},"uses",[33,2172,2079],{"class":167},[33,2174,2175],{"class":54},"actions\u002Fcheckout@v4\n",[33,2177,2178,2180,2182,2184],{"class":35,"line":229},[33,2179,2167],{"class":167},[33,2181,2170],{"class":2076},[33,2183,2079],{"class":167},[33,2185,2186],{"class":54},"actions\u002Fsetup-python@v5\n",[33,2188,2189,2192],{"class":35,"line":235},[33,2190,2191],{"class":2076},"        with",[33,2193,574],{"class":167},[33,2195,2196,2199,2201],{"class":35,"line":250},[33,2197,2198],{"class":2076},"          python-version",[33,2200,2079],{"class":167},[33,2202,2203],{"class":54},"\"3.12\"\n",[33,2205,2206,2208,2210,2212],{"class":35,"line":266},[33,2207,2167],{"class":167},[33,2209,1118],{"class":2076},[33,2211,2079],{"class":167},[33,2213,2214],{"class":54},"Install deps\n",[33,2216,2217,2220,2222],{"class":35,"line":290},[33,2218,2219],{"class":2076},"        run",[33,2221,2079],{"class":167},[33,2223,2224],{"class":54},"pip install -r requirements.txt\n",[33,2226,2227,2229,2231,2233],{"class":35,"line":295},[33,2228,2167],{"class":167},[33,2230,1118],{"class":2076},[33,2232,2079],{"class":167},[33,2234,2235],{"class":54},"Run pipeline\n",[33,2237,2238,2241],{"class":35,"line":300},[33,2239,2240],{"class":2076},"        env",[33,2242,574],{"class":167},[33,2244,2245,2248,2250],{"class":35,"line":317},[33,2246,2247],{"class":2076},"          SMTP_HOST",[33,2249,2079],{"class":167},[33,2251,2252],{"class":54},"${{ secrets.SMTP_HOST }}\n",[33,2254,2255,2258,2260],{"class":35,"line":332},[33,2256,2257],{"class":2076},"          SMTP_PASS",[33,2259,2079],{"class":167},[33,2261,2262],{"class":54},"${{ secrets.SMTP_PASS 
}}\n",[33,2264,2265,2267,2269],{"class":35,"line":347},[33,2266,2219],{"class":2076},[33,2268,2079],{"class":167},[33,2270,2271],{"class":54},"python scripts\u002Fdaily_pipeline.py --output \u002Ftmp\u002Freport.xlsx\n",[33,2273,2274,2276,2278,2280],{"class":35,"line":374},[33,2275,2167],{"class":167},[33,2277,1118],{"class":2076},[33,2279,2079],{"class":167},[33,2281,2282],{"class":54},"Upload report\n",[33,2284,2285,2288,2290],{"class":35,"line":397},[33,2286,2287],{"class":2076},"        uses",[33,2289,2079],{"class":167},[33,2291,2292],{"class":54},"actions\u002Fupload-artifact@v4\n",[33,2294,2295,2297],{"class":35,"line":653},[33,2296,2191],{"class":2076},[33,2298,574],{"class":167},[33,2300,2301,2304,2306],{"class":35,"line":667},[33,2302,2303],{"class":2076},"          name",[33,2305,2079],{"class":167},[33,2307,2308],{"class":54},"daily-report\n",[33,2310,2311,2314,2316],{"class":35,"line":675},[33,2312,2313],{"class":2076},"          path",[33,2315,2079],{"class":167},[33,2317,2318],{"class":54},"\u002Ftmp\u002Freport.xlsx\n",[14,2320,2321],{},[1974,2322,2323,2326],{},[30,2324,2325],{},"schedule"," library (long-running process)",[23,2328,2330],{"className":126,"code":2329,"language":47,"meta":28,"style":28},"# pip install schedule\n\"\"\"\nUse the schedule library when your process runs continuously (e.g., inside a container\nor a systemd service) rather than being launched fresh by cron.\n\"\"\"\nimport time\nimport logging\nimport schedule\n\nlogger = logging.getLogger(__name__)\n\n\ndef job_with_guard() -> None:\n    logger.info(\"Job started\")\n    try:\n        run_daily_pipeline()          # from Step 1\n        logger.info(\"Job finished OK\")\n    except Exception:\n        logger.exception(\"Job failed\")\n\n\nschedule.every().day.at(\"06:30\").do(job_with_guard)\n\nif __name__ == \"__main__\":\n    logger.info(\"Scheduler started — waiting for next run\")\n    while True:\n        schedule.run_pending()\n        
time.sleep(30)\n",[30,2331,2332,2337,2341,2346,2351,2355,2361,2367,2374,2378,2390,2394,2398,2411,2420,2427,2435,2445,2454,2464,2468,2472,2483,2487,2503,2512,2522,2527],{"__ignoreMap":28},[33,2333,2334],{"class":35,"line":36},[33,2335,2336],{"class":39},"# pip install schedule\n",[33,2338,2339],{"class":35,"line":43},[33,2340,139],{"class":54},[33,2342,2343],{"class":35,"line":61},[33,2344,2345],{"class":54},"Use the schedule library when your process runs continuously (e.g., inside a container\n",[33,2347,2348],{"class":35,"line":73},[33,2349,2350],{"class":54},"or a systemd service) rather than being launched fresh by cron.\n",[33,2352,2353],{"class":35,"line":88},[33,2354,139],{"class":54},[33,2356,2357,2359],{"class":35,"line":95},[33,2358,164],{"class":163},[33,2360,1689],{"class":167},[33,2362,2363,2365],{"class":35,"line":101},[33,2364,164],{"class":163},[33,2366,184],{"class":167},[33,2368,2369,2371],{"class":35,"line":171},[33,2370,164],{"class":163},[33,2372,2373],{"class":167}," schedule\n",[33,2375,2376],{"class":35,"line":179},[33,2377,92],{"emptyLinePlaceholder":91},[33,2379,2380,2382,2384,2386,2388],{"class":35,"line":187},[33,2381,539],{"class":167},[33,2383,242],{"class":163},[33,2385,544],{"class":167},[33,2387,547],{"class":50},[33,2389,221],{"class":167},[33,2391,2392],{"class":35,"line":201},[33,2393,92],{"emptyLinePlaceholder":91},[33,2395,2396],{"class":35,"line":206},[33,2397,92],{"emptyLinePlaceholder":91},[33,2399,2400,2402,2405,2407,2409],{"class":35,"line":224},[33,2401,562],{"class":163},[33,2403,2404],{"class":46}," job_with_guard",[33,2406,568],{"class":167},[33,2408,571],{"class":50},[33,2410,574],{"class":167},[33,2412,2413,2415,2418],{"class":35,"line":229},[33,2414,910],{"class":167},[33,2416,2417],{"class":54},"\"Job started\"",[33,2419,221],{"class":167},[33,2421,2422,2425],{"class":35,"line":235},[33,2423,2424],{"class":163},"    
try",[33,2426,574],{"class":167},[33,2428,2429,2432],{"class":35,"line":250},[33,2430,2431],{"class":167},"        run_daily_pipeline()          ",[33,2433,2434],{"class":39},"# from Step 1\n",[33,2436,2437,2440,2443],{"class":35,"line":266},[33,2438,2439],{"class":167},"        logger.info(",[33,2441,2442],{"class":54},"\"Job finished OK\"",[33,2444,221],{"class":167},[33,2446,2447,2450,2452],{"class":35,"line":290},[33,2448,2449],{"class":163},"    except",[33,2451,783],{"class":50},[33,2453,574],{"class":167},[33,2455,2456,2459,2462],{"class":35,"line":295},[33,2457,2458],{"class":167},"        logger.exception(",[33,2460,2461],{"class":54},"\"Job failed\"",[33,2463,221],{"class":167},[33,2465,2466],{"class":35,"line":300},[33,2467,92],{"emptyLinePlaceholder":91},[33,2469,2470],{"class":35,"line":317},[33,2471,92],{"emptyLinePlaceholder":91},[33,2473,2474,2477,2480],{"class":35,"line":332},[33,2475,2476],{"class":167},"schedule.every().day.at(",[33,2478,2479],{"class":54},"\"06:30\"",[33,2481,2482],{"class":167},").do(job_with_guard)\n",[33,2484,2485],{"class":35,"line":347},[33,2486,92],{"emptyLinePlaceholder":91},[33,2488,2489,2492,2495,2498,2501],{"class":35,"line":374},[33,2490,2491],{"class":163},"if",[33,2493,2494],{"class":50}," __name__",[33,2496,2497],{"class":163}," ==",[33,2499,2500],{"class":54}," \"__main__\"",[33,2502,574],{"class":167},[33,2504,2505,2507,2510],{"class":35,"line":397},[33,2506,910],{"class":167},[33,2508,2509],{"class":54},"\"Scheduler started — waiting for next run\"",[33,2511,221],{"class":167},[33,2513,2514,2517,2520],{"class":35,"line":653},[33,2515,2516],{"class":163},"    while",[33,2518,2519],{"class":50}," True",[33,2521,574],{"class":167},[33,2523,2524],{"class":35,"line":667},[33,2525,2526],{"class":167},"        schedule.run_pending()\n",[33,2528,2529,2532,2534],{"class":35,"line":675},[33,2530,2531],{"class":167},"        
time.sleep(",[33,2533,1543],{"class":50},[33,2535,221],{"class":167},[2537,2538],"hr",{},[2540,2541,2547,2548,2547,2552,2547,2556,2547,2547,2584,2547,2595,2547,2602,2547,2547,2607,2547,2547,2613,2547,2616,2547,2620,2547,2547,2623,2547,2547,2627,2547,2632,2547,2637,2547,2641,2547,2547,2645,2547,2547,2651,2547,2654,2547,2658,2547,2547,2662,2547,2547,2666,2547,2668,2547,2671,2547,2547,2675,2547,2682,2547,2687,2547,2547,2691,2547,2547,2695,2547,2699],"svg",{"viewBox":2542,"role":2543,"ariaLabel":2544,"xmlns":2545,"style":2546},"0 0 760 340","img","Flow diagram: scheduler triggers a job run, which goes through logging and retry stages, then reaches success or sends a failure alert","http:\u002F\u002Fwww.w3.org\u002F2000\u002Fsvg","width:100%;max-width:760px;height:auto;font-family:Inter,sans-serif","\n  ",[2549,2550,2551],"title",{},"Scheduling and logging job flow",[2553,2554,2555],"desc",{},"A scheduler trigger fires a job run. The job writes to a log. On failure it retries with backoff; on persistent failure it sends an alert. 
On success it writes output and exits 0.",[2557,2558,2559,2560,2559,2572,2547],"defs",{},"\n    ",[2561,2562,2564,2565,2564,2569,2559],"linearGradient",{"id":2563,"x1":748,"y1":748,"x2":734,"y2":748},"schedule-grad","\n      ",[2566,2567],"stop",{"offset":748,"style":2568},"stop-color:#2563eb",[2566,2570],{"offset":734,"style":2571},"stop-color:#dbeafe",[2573,2574,2564,2579,2559],"marker",{"id":2575,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"schedule-arrow","0 0 10 10","9","auto-start-reverse",[2580,2581],"path",{"d":2582,"fill":2583},"M0 0 L10 5 L0 10 z","#475569",[2585,2586],"rect",{"x":2587,"y":2588,"width":2589,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},"20","130","120","60","8","#ffffff","#e2e8f0","stroke-width:1.5",[2000,2596,2601],{"x":2597,"y":2598,"fill":2599,"style":2600},"80","155","#0f172a","text-anchor:middle;font-size:13px","Scheduler",[2000,2603,2606],{"x":2597,"y":2604,"fill":2583,"style":2605},"172","text-anchor:middle;font-size:11px","cron \u002F Actions",[35,2608],{"x1":2609,"y1":2610,"x2":2611,"y2":2610,"stroke":2583,"markerEnd":2612,"style":2594},"140","160","200","url(#schedule-arrow)",[2585,2614],{"x":2611,"y":2588,"width":2589,"height":2590,"rx":2591,"fill":2615,"stroke":2593,"style":2594},"#f6f8fb",[2000,2617,2619],{"x":2618,"y":2598,"fill":2599,"style":2600},"260","Lock check",[2000,2621,2622],{"x":2618,"y":2604,"fill":2583,"style":2605},"skip if running",[35,2624],{"x1":2625,"y1":2610,"x2":2626,"y2":2610,"stroke":2583,"markerEnd":2612,"style":2594},"320","380",[2585,2628],{"x":2626,"y":2629,"width":2588,"height":2630,"rx":2591,"fill":2631,"stroke":2593,"style":2594},"115","90","url(#schedule-grad)",[2000,2633,2636],{"x":2634,"y":2635,"fill":2599,"style":2600},"445","150","Job run",[2000,2638,2640],{"x":2634,"y":2639,"fill":2599,"style":2605},"168","logging 
+",[2000,2642,2644],{"x":2634,"y":2643,"fill":2599,"style":2605},"184","retry\u002Fbackoff",[35,2646],{"x1":2647,"y1":2648,"x2":2649,"y2":2650,"stroke":2583,"markerEnd":2612,"style":2594},"510","145","590","100",[2585,2652],{"x":2649,"y":2653,"width":2609,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},"65",[2000,2655,2657],{"x":2656,"y":2630,"fill":2599,"style":2600},"660","Success",[2000,2659,2661],{"x":2656,"y":2660,"fill":2583,"style":2605},"107","exit 0, write output",[35,2663],{"x1":2647,"y1":2664,"x2":2649,"y2":2665,"stroke":2583,"markerEnd":2612,"style":2594},"175","225",[2585,2667],{"x":2649,"y":2611,"width":2609,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,2669,2670],{"x":2656,"y":2665,"fill":2599,"style":2600},"Failure alert",[2000,2672,2674],{"x":2656,"y":2673,"fill":2583,"style":2605},"242","email \u002F webhook",[2585,2676],{"x":2677,"y":2678,"width":2679,"height":2680,"rx":2681,"fill":2615,"stroke":2593,"style":2594},"390","255","110","50","6",[2000,2683,2686],{"x":2634,"y":2684,"fill":2599,"style":2685},"278","text-anchor:middle;font-size:12px","Rotating log",[2000,2688,2690],{"x":2634,"y":2689,"fill":2583,"style":2605},"294",".log + backups",[35,2692],{"x1":2634,"y1":2693,"x2":2634,"y2":2678,"stroke":2583,"markerEnd":2612,"style":2694},"205","stroke-width:1.5;stroke-dasharray:4 3",[35,2696],{"x1":2618,"y1":2697,"x2":2618,"y2":2698,"stroke":2593,"style":2594},"190","310",[2000,2700,2703],{"x":2701,"y":2702,"fill":2583,"style":2605},"220","325","already running → skip",[2537,2705],{},[18,2707,2709],{"id":2708},"edge-cases-and-variants","Edge Cases and Variants",[424,2711,2713],{"id":2712},"overlapping-runs-and-lock-files","Overlapping Runs and Lock Files",[14,2715,2716],{},"If your job takes longer than its scheduling interval, two instances can run simultaneously and corrupt shared output files. 
A lock file prevents this.",[23,2718,2720],{"className":126,"code":2719,"language":47,"meta":28,"style":28},"# pip install (none — stdlib only)\n\"\"\"\nLock file guard — place at the start of your entry-point script.\nUses O_CREAT | O_EXCL so that creation is atomic on POSIX systems.\n\"\"\"\nimport os\nimport sys\nimport logging\nfrom pathlib import Path\n\nLOCK_PATH = Path(\"\u002Ftmp\u002Fdoc_pipeline.lock\")\nlogger = logging.getLogger(__name__)\n\n\ndef acquire_lock() -> bool:\n    try:\n        fd = os.open(LOCK_PATH, os.O_CREAT | os.O_EXCL | os.O_WRONLY)\n        os.write(fd, str(os.getpid()).encode())\n        os.close(fd)\n        return True\n    except FileExistsError:\n        return False\n\n\ndef release_lock() -> None:\n    try:\n        LOCK_PATH.unlink()\n    except FileNotFoundError:\n        pass\n\n\nif __name__ == \"__main__\":\n    if not acquire_lock():\n        logger.warning(\"Another instance is running — exiting\")\n        sys.exit(0)\n    try:\n        run_daily_pipeline()   # your job from Step 1\n    finally:\n        release_lock()\n",[30,2721,2722,2726,2730,2735,2740,2744,2750,2756,2762,2772,2776,2790,2802,2806,2810,2824,2830,2866,2876,2881,2888,2897,2904,2908,2912,2925,2931,2939,2948,2953,2957,2961,2973,2982,2991,3000,3006,3014,3021],{"__ignoreMap":28},[33,2723,2724],{"class":35,"line":36},[33,2725,134],{"class":39},[33,2727,2728],{"class":35,"line":43},[33,2729,139],{"class":54},[33,2731,2732],{"class":35,"line":61},[33,2733,2734],{"class":54},"Lock file guard — place at the start of your entry-point script.\n",[33,2736,2737],{"class":35,"line":73},[33,2738,2739],{"class":54},"Uses O_CREAT | O_EXCL so that creation is atomic on POSIX 
systems.\n",[33,2741,2742],{"class":35,"line":88},[33,2743,139],{"class":54},[33,2745,2746,2748],{"class":35,"line":95},[33,2747,164],{"class":163},[33,2749,176],{"class":167},[33,2751,2752,2754],{"class":35,"line":101},[33,2753,164],{"class":163},[33,2755,168],{"class":167},[33,2757,2758,2760],{"class":35,"line":171},[33,2759,164],{"class":163},[33,2761,184],{"class":167},[33,2763,2764,2766,2768,2770],{"class":35,"line":179},[33,2765,190],{"class":163},[33,2767,193],{"class":167},[33,2769,164],{"class":163},[33,2771,198],{"class":167},[33,2773,2774],{"class":35,"line":187},[33,2775,92],{"emptyLinePlaceholder":91},[33,2777,2778,2781,2783,2785,2788],{"class":35,"line":201},[33,2779,2780],{"class":50},"LOCK_PATH",[33,2782,212],{"class":163},[33,2784,215],{"class":167},[33,2786,2787],{"class":54},"\"\u002Ftmp\u002Fdoc_pipeline.lock\"",[33,2789,221],{"class":167},[33,2791,2792,2794,2796,2798,2800],{"class":35,"line":206},[33,2793,539],{"class":167},[33,2795,242],{"class":163},[33,2797,544],{"class":167},[33,2799,547],{"class":50},[33,2801,221],{"class":167},[33,2803,2804],{"class":35,"line":224},[33,2805,92],{"emptyLinePlaceholder":91},[33,2807,2808],{"class":35,"line":229},[33,2809,92],{"emptyLinePlaceholder":91},[33,2811,2812,2814,2817,2819,2822],{"class":35,"line":235},[33,2813,562],{"class":163},[33,2815,2816],{"class":46}," acquire_lock",[33,2818,568],{"class":167},[33,2820,2821],{"class":50},"bool",[33,2823,574],{"class":167},[33,2825,2826,2828],{"class":35,"line":250},[33,2827,2424],{"class":163},[33,2829,574],{"class":167},[33,2831,2832,2835,2837,2840,2842,2845,2848,2851,2854,2857,2859,2861,2864],{"class":35,"line":266},[33,2833,2834],{"class":167},"        fd ",[33,2836,242],{"class":163},[33,2838,2839],{"class":167}," os.open(",[33,2841,2780],{"class":50},[33,2843,2844],{"class":167},", os.",[33,2846,2847],{"class":50},"O_CREAT",[33,2849,2850],{"class":163}," |",[33,2852,2853],{"class":167}," 
os.",[33,2855,2856],{"class":50},"O_EXCL",[33,2858,2850],{"class":163},[33,2860,2853],{"class":167},[33,2862,2863],{"class":50},"O_WRONLY",[33,2865,221],{"class":167},[33,2867,2868,2871,2873],{"class":35,"line":290},[33,2869,2870],{"class":167},"        os.write(fd, ",[33,2872,1053],{"class":50},[33,2874,2875],{"class":167},"(os.getpid()).encode())\n",[33,2877,2878],{"class":35,"line":295},[33,2879,2880],{"class":167},"        os.close(fd)\n",[33,2882,2883,2885],{"class":35,"line":300},[33,2884,1659],{"class":163},[33,2886,2887],{"class":50}," True\n",[33,2889,2890,2892,2895],{"class":35,"line":317},[33,2891,2449],{"class":163},[33,2893,2894],{"class":50}," FileExistsError",[33,2896,574],{"class":167},[33,2898,2899,2901],{"class":35,"line":332},[33,2900,1659],{"class":163},[33,2902,2903],{"class":50}," False\n",[33,2905,2906],{"class":35,"line":347},[33,2907,92],{"emptyLinePlaceholder":91},[33,2909,2910],{"class":35,"line":374},[33,2911,92],{"emptyLinePlaceholder":91},[33,2913,2914,2916,2919,2921,2923],{"class":35,"line":397},[33,2915,562],{"class":163},[33,2917,2918],{"class":46}," release_lock",[33,2920,568],{"class":167},[33,2922,571],{"class":50},[33,2924,574],{"class":167},[33,2926,2927,2929],{"class":35,"line":653},[33,2928,2424],{"class":163},[33,2930,574],{"class":167},[33,2932,2933,2936],{"class":35,"line":667},[33,2934,2935],{"class":50},"        LOCK_PATH",[33,2937,2938],{"class":167},".unlink()\n",[33,2940,2941,2943,2946],{"class":35,"line":675},[33,2942,2449],{"class":163},[33,2944,2945],{"class":50}," FileNotFoundError",[33,2947,574],{"class":167},[33,2949,2950],{"class":35,"line":689},[33,2951,2952],{"class":163},"        
pass\n",[33,2954,2955],{"class":35,"line":703},[33,2956,92],{"emptyLinePlaceholder":91},[33,2958,2959],{"class":35,"line":714},[33,2960,92],{"emptyLinePlaceholder":91},[33,2962,2963,2965,2967,2969,2971],{"class":35,"line":723},[33,2964,2491],{"class":163},[33,2966,2494],{"class":50},[33,2968,2497],{"class":163},[33,2970,2500],{"class":54},[33,2972,574],{"class":167},[33,2974,2975,2977,2979],{"class":35,"line":754},[33,2976,617],{"class":163},[33,2978,620],{"class":163},[33,2980,2981],{"class":167}," acquire_lock():\n",[33,2983,2984,2986,2989],{"class":35,"line":771},[33,2985,628],{"class":167},[33,2987,2988],{"class":54},"\"Another instance is running — exiting\"",[33,2990,221],{"class":167},[33,2992,2993,2996,2998],{"class":35,"line":777},[33,2994,2995],{"class":167},"        sys.exit(",[33,2997,748],{"class":50},[33,2999,221],{"class":167},[33,3001,3002,3004],{"class":35,"line":788},[33,3003,2424],{"class":163},[33,3005,574],{"class":167},[33,3007,3008,3011],{"class":35,"line":804},[33,3009,3010],{"class":167},"        run_daily_pipeline()   ",[33,3012,3013],{"class":39},"# your job from Step 1\n",[33,3015,3016,3019],{"class":35,"line":809},[33,3017,3018],{"class":163},"    finally",[33,3020,574],{"class":167},[33,3022,3023],{"class":35,"line":819},[33,3024,3025],{"class":167},"        release_lock()\n",[14,3027,3028,3029,3032,3033,3035],{},"On Windows use ",[30,3030,3031],{},"msvcrt.locking"," or a named mutex instead of ",[30,3034,2856],{},".",[424,3037,3039],{"id":3038},"failure-alerting-email-and-webhook","Failure Alerting (Email and Webhook)",[23,3041,3043],{"className":126,"code":3042,"language":47,"meta":28,"style":28},"# pip install (none — stdlib only; requests for webhook)\n\"\"\"\nSend an alert on job failure. 
Call send_email_alert() or send_webhook_alert()\ninside your except block.\n\"\"\"\nimport smtplib\nimport json\nimport urllib.request\nimport os\nfrom email.message import EmailMessage\nfrom pathlib import Path\n\nSMTP_HOST = os.environ.get(\"SMTP_HOST\", \"smtp.example.com\")\nSMTP_PORT = int(os.environ.get(\"SMTP_PORT\", \"587\"))\nSMTP_USER = os.environ.get(\"SMTP_USER\", \"pipeline@example.com\")\nSMTP_PASS = os.environ.get(\"SMTP_PASS\", \"\")\nALERT_TO = os.environ.get(\"ALERT_TO\", \"ops@example.com\")\nWEBHOOK_URL = os.environ.get(\"SLACK_WEBHOOK_URL\", \"\")\n\n\ndef send_email_alert(subject: str, body: str) -> None:\n    msg = EmailMessage()\n    msg[\"Subject\"] = subject\n    msg[\"From\"] = SMTP_USER\n    msg[\"To\"] = ALERT_TO\n    msg.set_content(body)\n    try:\n        with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as s:\n            s.starttls()\n            s.login(SMTP_USER, SMTP_PASS)\n            s.send_message(msg)\n    except Exception:\n        import logging\n        logging.getLogger(__name__).exception(\"Failed to send alert email\")\n\n\ndef send_webhook_alert(text: str) -> None:\n    if not WEBHOOK_URL:\n        return\n    payload = json.dumps({\"text\": text}).encode()\n    req = urllib.request.Request(\n        WEBHOOK_URL,\n        data=payload,\n        headers={\"Content-Type\": \"application\u002Fjson\"},\n        method=\"POST\",\n    )\n    try:\n        with urllib.request.urlopen(req, timeout=10):\n            pass\n    except Exception:\n        import logging\n        logging.getLogger(__name__).exception(\"Failed to send webhook alert\")\n",[30,3044,3045,3050,3054,3059,3064,3068,3075,3082,3088,3094,3106,3116,3120,3140,3163,3182,3201,3220,3238,3242,3246,3269,3279,3294,3308,3322,3327,3333,3353,3358,3371,3376,3384,3391,3406,3410,3414,3432,3443,3447,3463,3473,3480,3490,3510,3522,3526,3532,3548,3553,3561,3567],{"__ignoreMap":28},[33,3046,3047],{"class":35,"line":36},[33,3048,3049],{"class":39},"# pip install (none — stdlib only; 
requests for webhook)\n",[33,3051,3052],{"class":35,"line":43},[33,3053,139],{"class":54},[33,3055,3056],{"class":35,"line":61},[33,3057,3058],{"class":54},"Send an alert on job failure. Call send_email_alert() or send_webhook_alert()\n",[33,3060,3061],{"class":35,"line":73},[33,3062,3063],{"class":54},"inside your except block.\n",[33,3065,3066],{"class":35,"line":88},[33,3067,139],{"class":54},[33,3069,3070,3072],{"class":35,"line":95},[33,3071,164],{"class":163},[33,3073,3074],{"class":167}," smtplib\n",[33,3076,3077,3079],{"class":35,"line":101},[33,3078,164],{"class":163},[33,3080,3081],{"class":167}," json\n",[33,3083,3084,3086],{"class":35,"line":171},[33,3085,164],{"class":163},[33,3087,1630],{"class":167},[33,3089,3090,3092],{"class":35,"line":179},[33,3091,164],{"class":163},[33,3093,176],{"class":167},[33,3095,3096,3098,3101,3103],{"class":35,"line":187},[33,3097,190],{"class":163},[33,3099,3100],{"class":167}," email.message ",[33,3102,164],{"class":163},[33,3104,3105],{"class":167}," EmailMessage\n",[33,3107,3108,3110,3112,3114],{"class":35,"line":201},[33,3109,190],{"class":163},[33,3111,193],{"class":167},[33,3113,164],{"class":163},[33,3115,198],{"class":167},[33,3117,3118],{"class":35,"line":206},[33,3119,92],{"emptyLinePlaceholder":91},[33,3121,3122,3125,3127,3130,3133,3135,3138],{"class":35,"line":224},[33,3123,3124],{"class":50},"SMTP_HOST",[33,3126,212],{"class":163},[33,3128,3129],{"class":167}," os.environ.get(",[33,3131,3132],{"class":54},"\"SMTP_HOST\"",[33,3134,365],{"class":167},[33,3136,3137],{"class":54},"\"smtp.example.com\"",[33,3139,221],{"class":167},[33,3141,3142,3145,3147,3150,3153,3156,3158,3161],{"class":35,"line":229},[33,3143,3144],{"class":50},"SMTP_PORT",[33,3146,212],{"class":163},[33,3148,3149],{"class":50}," 
int",[33,3151,3152],{"class":167},"(os.environ.get(",[33,3154,3155],{"class":54},"\"SMTP_PORT\"",[33,3157,365],{"class":167},[33,3159,3160],{"class":54},"\"587\"",[33,3162,371],{"class":167},[33,3164,3165,3168,3170,3172,3175,3177,3180],{"class":35,"line":235},[33,3166,3167],{"class":50},"SMTP_USER",[33,3169,212],{"class":163},[33,3171,3129],{"class":167},[33,3173,3174],{"class":54},"\"SMTP_USER\"",[33,3176,365],{"class":167},[33,3178,3179],{"class":54},"\"pipeline@example.com\"",[33,3181,221],{"class":167},[33,3183,3184,3187,3189,3191,3194,3196,3199],{"class":35,"line":250},[33,3185,3186],{"class":50},"SMTP_PASS",[33,3188,212],{"class":163},[33,3190,3129],{"class":167},[33,3192,3193],{"class":54},"\"SMTP_PASS\"",[33,3195,365],{"class":167},[33,3197,3198],{"class":54},"\"\"",[33,3200,221],{"class":167},[33,3202,3203,3206,3208,3210,3213,3215,3218],{"class":35,"line":266},[33,3204,3205],{"class":50},"ALERT_TO",[33,3207,212],{"class":163},[33,3209,3129],{"class":167},[33,3211,3212],{"class":54},"\"ALERT_TO\"",[33,3214,365],{"class":167},[33,3216,3217],{"class":54},"\"ops@example.com\"",[33,3219,221],{"class":167},[33,3221,3222,3225,3227,3229,3232,3234,3236],{"class":35,"line":290},[33,3223,3224],{"class":50},"WEBHOOK_URL",[33,3226,212],{"class":163},[33,3228,3129],{"class":167},[33,3230,3231],{"class":54},"\"SLACK_WEBHOOK_URL\"",[33,3233,365],{"class":167},[33,3235,3198],{"class":54},[33,3237,221],{"class":167},[33,3239,3240],{"class":35,"line":295},[33,3241,92],{"emptyLinePlaceholder":91},[33,3243,3244],{"class":35,"line":300},[33,3245,92],{"emptyLinePlaceholder":91},[33,3247,3248,3250,3253,3256,3258,3261,3263,3265,3267],{"class":35,"line":317},[33,3249,562],{"class":163},[33,3251,3252],{"class":46}," send_email_alert",[33,3254,3255],{"class":167},"(subject: ",[33,3257,1053],{"class":50},[33,3259,3260],{"class":167},", body: 
",[33,3262,1053],{"class":50},[33,3264,1617],{"class":167},[33,3266,571],{"class":50},[33,3268,574],{"class":167},[33,3270,3271,3274,3276],{"class":35,"line":332},[33,3272,3273],{"class":167},"    msg ",[33,3275,242],{"class":163},[33,3277,3278],{"class":167}," EmailMessage()\n",[33,3280,3281,3284,3287,3289,3291],{"class":35,"line":347},[33,3282,3283],{"class":167},"    msg[",[33,3285,3286],{"class":54},"\"Subject\"",[33,3288,763],{"class":167},[33,3290,242],{"class":163},[33,3292,3293],{"class":167}," subject\n",[33,3295,3296,3298,3301,3303,3305],{"class":35,"line":374},[33,3297,3283],{"class":167},[33,3299,3300],{"class":54},"\"From\"",[33,3302,763],{"class":167},[33,3304,242],{"class":163},[33,3306,3307],{"class":50}," SMTP_USER\n",[33,3309,3310,3312,3315,3317,3319],{"class":35,"line":397},[33,3311,3283],{"class":167},[33,3313,3314],{"class":54},"\"To\"",[33,3316,763],{"class":167},[33,3318,242],{"class":163},[33,3320,3321],{"class":50}," ALERT_TO\n",[33,3323,3324],{"class":35,"line":653},[33,3325,3326],{"class":167},"    msg.set_content(body)\n",[33,3328,3329,3331],{"class":35,"line":667},[33,3330,2424],{"class":163},[33,3332,574],{"class":167},[33,3334,3335,3337,3340,3342,3344,3346,3348,3350],{"class":35,"line":675},[33,3336,2191],{"class":163},[33,3338,3339],{"class":167}," smtplib.SMTP(",[33,3341,3124],{"class":50},[33,3343,365],{"class":167},[33,3345,3144],{"class":50},[33,3347,1649],{"class":167},[33,3349,495],{"class":163},[33,3351,3352],{"class":167}," s:\n",[33,3354,3355],{"class":35,"line":689},[33,3356,3357],{"class":167},"            s.starttls()\n",[33,3359,3360,3363,3365,3367,3369],{"class":35,"line":703},[33,3361,3362],{"class":167},"            s.login(",[33,3364,3167],{"class":50},[33,3366,365],{"class":167},[33,3368,3186],{"class":50},[33,3370,221],{"class":167},[33,3372,3373],{"class":35,"line":714},[33,3374,3375],{"class":167},"            
s.send_message(msg)\n",[33,3377,3378,3380,3382],{"class":35,"line":723},[33,3379,2449],{"class":163},[33,3381,783],{"class":50},[33,3383,574],{"class":167},[33,3385,3386,3389],{"class":35,"line":754},[33,3387,3388],{"class":163},"        import",[33,3390,184],{"class":167},[33,3392,3393,3396,3398,3401,3404],{"class":35,"line":771},[33,3394,3395],{"class":167},"        logging.getLogger(",[33,3397,547],{"class":50},[33,3399,3400],{"class":167},").exception(",[33,3402,3403],{"class":54},"\"Failed to send alert email\"",[33,3405,221],{"class":167},[33,3407,3408],{"class":35,"line":777},[33,3409,92],{"emptyLinePlaceholder":91},[33,3411,3412],{"class":35,"line":788},[33,3413,92],{"emptyLinePlaceholder":91},[33,3415,3416,3418,3421,3424,3426,3428,3430],{"class":35,"line":804},[33,3417,562],{"class":163},[33,3419,3420],{"class":46}," send_webhook_alert",[33,3422,3423],{"class":167},"(text: ",[33,3425,1053],{"class":50},[33,3427,1617],{"class":167},[33,3429,571],{"class":50},[33,3431,574],{"class":167},[33,3433,3434,3436,3438,3441],{"class":35,"line":809},[33,3435,617],{"class":163},[33,3437,620],{"class":163},[33,3439,3440],{"class":50}," WEBHOOK_URL",[33,3442,574],{"class":167},[33,3444,3445],{"class":35,"line":819},[33,3446,646],{"class":163},[33,3448,3449,3452,3454,3457,3460],{"class":35,"line":829},[33,3450,3451],{"class":167},"    payload ",[33,3453,242],{"class":163},[33,3455,3456],{"class":167}," json.dumps({",[33,3458,3459],{"class":54},"\"text\"",[33,3461,3462],{"class":167},": text}).encode()\n",[33,3464,3465,3468,3470],{"class":35,"line":834},[33,3466,3467],{"class":167},"    req ",[33,3469,242],{"class":163},[33,3471,3472],{"class":167}," urllib.request.Request(\n",[33,3474,3475,3478],{"class":35,"line":839},[33,3476,3477],{"class":50},"        WEBHOOK_URL",[33,3479,247],{"class":167},[33,3481,3482,3485,3487],{"class":35,"line":860},[33,3483,3484],{"class":238},"        
data",[33,3486,242],{"class":163},[33,3488,3489],{"class":167},"payload,\n",[33,3491,3492,3495,3497,3499,3502,3504,3507],{"class":35,"line":887},[33,3493,3494],{"class":238},"        headers",[33,3496,242],{"class":163},[33,3498,1115],{"class":167},[33,3500,3501],{"class":54},"\"Content-Type\"",[33,3503,2079],{"class":167},[33,3505,3506],{"class":54},"\"application\u002Fjson\"",[33,3508,3509],{"class":167},"},\n",[33,3511,3512,3515,3517,3520],{"class":35,"line":907},[33,3513,3514],{"class":238},"        method",[33,3516,242],{"class":163},[33,3518,3519],{"class":54},"\"POST\"",[33,3521,247],{"class":167},[33,3523,3524],{"class":35,"line":1826},[33,3525,1202],{"class":167},[33,3527,3528,3530],{"class":35,"line":1844},[33,3529,2424],{"class":163},[33,3531,574],{"class":167},[33,3533,3534,3536,3539,3541,3543,3546],{"class":35,"line":1858},[33,3535,2191],{"class":163},[33,3537,3538],{"class":167}," urllib.request.urlopen(req, ",[33,3540,1641],{"class":238},[33,3542,242],{"class":163},[33,3544,3545],{"class":50},"10",[33,3547,1737],{"class":167},[33,3549,3550],{"class":35,"line":1871},[33,3551,3552],{"class":163},"            pass\n",[33,3554,3555,3557,3559],{"class":35,"line":1877},[33,3556,2449],{"class":163},[33,3558,783],{"class":50},[33,3560,574],{"class":167},[33,3562,3563,3565],{"class":35,"line":1883},[33,3564,3388],{"class":163},[33,3566,184],{"class":167},[33,3568,3569,3571,3573,3575,3578],{"class":35,"line":1915},[33,3570,3395],{"class":167},[33,3572,547],{"class":50},[33,3574,3400],{"class":167},[33,3576,3577],{"class":54},"\"Failed to send webhook alert\"",[33,3579,221],{"class":167},[14,3581,3582,3583,3586],{},"Store credentials in environment variables, never in source code. 
On GitHub Actions they go in repository Secrets; on a server use a ",[30,3584,3585],{},".env"," file owned by the service account.",[424,3588,3590],{"id":3589},"idempotency","Idempotency",[14,3592,3593],{},"A job that can safely re-run without duplicating output is far easier to operate. Track processed files in a small SQLite ledger.",[23,3595,3597],{"className":126,"code":3596,"language":47,"meta":28,"style":28},"# pip install (none — stdlib only)\n\"\"\"\nIdempotency ledger: record each processed file by content hash.\nRe-running the job skips files already in the ledger.\n\"\"\"\nimport sqlite3\nimport hashlib\nfrom pathlib import Path\n\nLEDGER_PATH = Path(\"\u002Fdata\u002Fpipeline_ledger.db\")\n\n\ndef _get_db() -> sqlite3.Connection:\n    conn = sqlite3.connect(LEDGER_PATH)\n    conn.execute(\n        \"CREATE TABLE IF NOT EXISTS processed \"\n        \"(file_hash TEXT PRIMARY KEY, path TEXT, processed_at TEXT)\"\n    )\n    conn.commit()\n    return conn\n\n\ndef file_hash(path: Path) -> str:\n    h = hashlib.sha256()\n    h.update(path.read_bytes())\n    return h.hexdigest()\n\n\ndef already_processed(path: Path) -> bool:\n    fh = file_hash(path)\n    conn = _get_db()\n    row = conn.execute(\n        \"SELECT 1 FROM processed WHERE file_hash = ?\", (fh,)\n    ).fetchone()\n    conn.close()\n    return row is not None\n\n\ndef mark_processed(path: Path) -> None:\n    from datetime import datetime, timezone\n    fh = file_hash(path)\n    conn = _get_db()\n    conn.execute(\n        \"INSERT OR IGNORE INTO processed (file_hash, path, processed_at) VALUES (?, ?, ?)\",\n        (fh, str(path), datetime.now(timezone.utc).isoformat()),\n    )\n    conn.commit()\n    
conn.close()\n",[30,3598,3599,3603,3607,3612,3617,3621,3628,3635,3645,3649,3663,3667,3671,3681,3695,3700,3705,3710,3714,3719,3726,3730,3734,3748,3758,3763,3770,3774,3778,3791,3801,3810,3820,3828,3833,3838,3853,3857,3861,3874,3887,3895,3903,3907,3914,3924,3928,3932],{"__ignoreMap":28},[33,3600,3601],{"class":35,"line":36},[33,3602,134],{"class":39},[33,3604,3605],{"class":35,"line":43},[33,3606,139],{"class":54},[33,3608,3609],{"class":35,"line":61},[33,3610,3611],{"class":54},"Idempotency ledger: record each processed file by content hash.\n",[33,3613,3614],{"class":35,"line":73},[33,3615,3616],{"class":54},"Re-running the job skips files already in the ledger.\n",[33,3618,3619],{"class":35,"line":88},[33,3620,139],{"class":54},[33,3622,3623,3625],{"class":35,"line":95},[33,3624,164],{"class":163},[33,3626,3627],{"class":167}," sqlite3\n",[33,3629,3630,3632],{"class":35,"line":101},[33,3631,164],{"class":163},[33,3633,3634],{"class":167}," hashlib\n",[33,3636,3637,3639,3641,3643],{"class":35,"line":171},[33,3638,190],{"class":163},[33,3640,193],{"class":167},[33,3642,164],{"class":163},[33,3644,198],{"class":167},[33,3646,3647],{"class":35,"line":179},[33,3648,92],{"emptyLinePlaceholder":91},[33,3650,3651,3654,3656,3658,3661],{"class":35,"line":187},[33,3652,3653],{"class":50},"LEDGER_PATH",[33,3655,212],{"class":163},[33,3657,215],{"class":167},[33,3659,3660],{"class":54},"\"\u002Fdata\u002Fpipeline_ledger.db\"",[33,3662,221],{"class":167},[33,3664,3665],{"class":35,"line":201},[33,3666,92],{"emptyLinePlaceholder":91},[33,3668,3669],{"class":35,"line":206},[33,3670,92],{"emptyLinePlaceholder":91},[33,3672,3673,3675,3678],{"class":35,"line":224},[33,3674,562],{"class":163},[33,3676,3677],{"class":46}," _get_db",[33,3679,3680],{"class":167},"() -> sqlite3.Connection:\n",[33,3682,3683,3686,3688,3691,3693],{"class":35,"line":229},[33,3684,3685],{"class":167},"    conn ",[33,3687,242],{"class":163},[33,3689,3690],{"class":167}," 
sqlite3.connect(",[33,3692,3653],{"class":50},[33,3694,221],{"class":167},[33,3696,3697],{"class":35,"line":235},[33,3698,3699],{"class":167},"    conn.execute(\n",[33,3701,3702],{"class":35,"line":250},[33,3703,3704],{"class":54},"        \"CREATE TABLE IF NOT EXISTS processed \"\n",[33,3706,3707],{"class":35,"line":266},[33,3708,3709],{"class":54},"        \"(file_hash TEXT PRIMARY KEY, path TEXT, processed_at TEXT)\"\n",[33,3711,3712],{"class":35,"line":290},[33,3713,1202],{"class":167},[33,3715,3716],{"class":35,"line":295},[33,3717,3718],{"class":167},"    conn.commit()\n",[33,3720,3721,3723],{"class":35,"line":300},[33,3722,1332],{"class":163},[33,3724,3725],{"class":167}," conn\n",[33,3727,3728],{"class":35,"line":317},[33,3729,92],{"emptyLinePlaceholder":91},[33,3731,3732],{"class":35,"line":332},[33,3733,92],{"emptyLinePlaceholder":91},[33,3735,3736,3738,3741,3744,3746],{"class":35,"line":347},[33,3737,562],{"class":163},[33,3739,3740],{"class":46}," file_hash",[33,3742,3743],{"class":167},"(path: Path) -> ",[33,3745,1053],{"class":50},[33,3747,574],{"class":167},[33,3749,3750,3753,3755],{"class":35,"line":374},[33,3751,3752],{"class":167},"    h ",[33,3754,242],{"class":163},[33,3756,3757],{"class":167}," hashlib.sha256()\n",[33,3759,3760],{"class":35,"line":397},[33,3761,3762],{"class":167},"    h.update(path.read_bytes())\n",[33,3764,3765,3767],{"class":35,"line":653},[33,3766,1332],{"class":163},[33,3768,3769],{"class":167}," h.hexdigest()\n",[33,3771,3772],{"class":35,"line":667},[33,3773,92],{"emptyLinePlaceholder":91},[33,3775,3776],{"class":35,"line":675},[33,3777,92],{"emptyLinePlaceholder":91},[33,3779,3780,3782,3785,3787,3789],{"class":35,"line":689},[33,3781,562],{"class":163},[33,3783,3784],{"class":46}," already_processed",[33,3786,3743],{"class":167},[33,3788,2821],{"class":50},[33,3790,574],{"class":167},[33,3792,3793,3796,3798],{"class":35,"line":703},[33,3794,3795],{"class":167},"    fh 
",[33,3797,242],{"class":163},[33,3799,3800],{"class":167}," file_hash(path)\n",[33,3802,3803,3805,3807],{"class":35,"line":714},[33,3804,3685],{"class":167},[33,3806,242],{"class":163},[33,3808,3809],{"class":167}," _get_db()\n",[33,3811,3812,3815,3817],{"class":35,"line":723},[33,3813,3814],{"class":167},"    row ",[33,3816,242],{"class":163},[33,3818,3819],{"class":167}," conn.execute(\n",[33,3821,3822,3825],{"class":35,"line":754},[33,3823,3824],{"class":54},"        \"SELECT 1 FROM processed WHERE file_hash = ?\"",[33,3826,3827],{"class":167},", (fh,)\n",[33,3829,3830],{"class":35,"line":771},[33,3831,3832],{"class":167},"    ).fetchone()\n",[33,3834,3835],{"class":35,"line":777},[33,3836,3837],{"class":167},"    conn.close()\n",[33,3839,3840,3842,3845,3848,3850],{"class":35,"line":788},[33,3841,1332],{"class":163},[33,3843,3844],{"class":167}," row ",[33,3846,3847],{"class":163},"is",[33,3849,620],{"class":163},[33,3851,3852],{"class":50}," None\n",[33,3854,3855],{"class":35,"line":804},[33,3856,92],{"emptyLinePlaceholder":91},[33,3858,3859],{"class":35,"line":809},[33,3860,92],{"emptyLinePlaceholder":91},[33,3862,3863,3865,3868,3870,3872],{"class":35,"line":819},[33,3864,562],{"class":163},[33,3866,3867],{"class":46}," mark_processed",[33,3869,3743],{"class":167},[33,3871,571],{"class":50},[33,3873,574],{"class":167},[33,3875,3876,3879,3882,3884],{"class":35,"line":829},[33,3877,3878],{"class":163},"    from",[33,3880,3881],{"class":167}," datetime ",[33,3883,164],{"class":163},[33,3885,3886],{"class":167}," datetime, timezone\n",[33,3888,3889,3891,3893],{"class":35,"line":834},[33,3890,3795],{"class":167},[33,3892,242],{"class":163},[33,3894,3800],{"class":167},[33,3896,3897,3899,3901],{"class":35,"line":839},[33,3898,3685],{"class":167},[33,3900,242],{"class":163},[33,3902,3809],{"class":167},[33,3904,3905],{"class":35,"line":860},[33,3906,3699],{"class":167},[33,3908,3909,3912],{"class":35,"line":887},[33,3910,3911],{"class":54},"        \"INSERT OR 
IGNORE INTO processed (file_hash, path, processed_at) VALUES (?, ?, ?)\"",[33,3913,247],{"class":167},[33,3915,3916,3919,3921],{"class":35,"line":907},[33,3917,3918],{"class":167},"        (fh, ",[33,3920,1053],{"class":50},[33,3922,3923],{"class":167},"(path), datetime.now(timezone.utc).isoformat()),\n",[33,3925,3926],{"class":35,"line":1826},[33,3927,1202],{"class":167},[33,3929,3930],{"class":35,"line":1844},[33,3931,3718],{"class":167},[33,3933,3934],{"class":35,"line":1858},[33,3935,3837],{"class":167},[18,3937,3939],{"id":3938},"validation-and-health-check","Validation and Health Check",[14,3941,3942],{},"After the job runs, assert the output is sane before writing it to its final destination.",[23,3944,3946],{"className":126,"code":3945,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\n\"\"\"\nValidation step: check row count, required columns, and absence of all-null rows.\nRaises ValueError on any anomaly so the caller can catch and alert.\n\"\"\"\nimport pandas as pd\nfrom pathlib import Path\n\n\ndef validate_output(path: Path, min_rows: int = 1) -> None:\n    df = pd.read_excel(path)\n    if len(df) \u003C min_rows:\n        raise ValueError(f\"Output has {len(df)} rows — expected at least {min_rows}\")\n\n    required_cols = {\"date\", \"amount\", \"_source\"}\n    missing = required_cols - set(df.columns)\n    if missing:\n        raise ValueError(f\"Missing columns: {missing}\")\n\n    if df.isnull().all(axis=1).any():\n        raise ValueError(\"Output contains fully null rows\")\n",[30,3947,3948,3953,3957,3962,3967,3971,3981,3991,3995,3999,4021,4031,4047,4085,4089,4114,4133,4140,4164,4168,4185],{"__ignoreMap":28},[33,3949,3950],{"class":35,"line":36},[33,3951,3952],{"class":39},"# pip install pandas openpyxl\n",[33,3954,3955],{"class":35,"line":43},[33,3956,139],{"class":54},[33,3958,3959],{"class":35,"line":61},[33,3960,3961],{"class":54},"Validation step: check row count, required columns, and absence of all-null 
rows.\n",[33,3963,3964],{"class":35,"line":73},[33,3965,3966],{"class":54},"Raises ValueError on any anomaly so the caller can catch and alert.\n",[33,3968,3969],{"class":35,"line":88},[33,3970,139],{"class":54},[33,3972,3973,3975,3977,3979],{"class":35,"line":95},[33,3974,164],{"class":163},[33,3976,492],{"class":167},[33,3978,495],{"class":163},[33,3980,498],{"class":167},[33,3982,3983,3985,3987,3989],{"class":35,"line":101},[33,3984,190],{"class":163},[33,3986,193],{"class":167},[33,3988,164],{"class":163},[33,3990,198],{"class":167},[33,3992,3993],{"class":35,"line":171},[33,3994,92],{"emptyLinePlaceholder":91},[33,3996,3997],{"class":35,"line":179},[33,3998,92],{"emptyLinePlaceholder":91},[33,4000,4001,4003,4006,4009,4011,4013,4015,4017,4019],{"class":35,"line":187},[33,4002,562],{"class":163},[33,4004,4005],{"class":46}," validate_output",[33,4007,4008],{"class":167},"(path: Path, min_rows: ",[33,4010,1059],{"class":50},[33,4012,212],{"class":163},[33,4014,1814],{"class":50},[33,4016,1617],{"class":167},[33,4018,571],{"class":50},[33,4020,574],{"class":167},[33,4022,4023,4026,4028],{"class":35,"line":201},[33,4024,4025],{"class":167},"    df ",[33,4027,242],{"class":163},[33,4029,4030],{"class":167}," pd.read_excel(path)\n",[33,4032,4033,4035,4038,4041,4044],{"class":35,"line":206},[33,4034,617],{"class":163},[33,4036,4037],{"class":50}," len",[33,4039,4040],{"class":167},"(df) ",[33,4042,4043],{"class":163},"\u003C",[33,4045,4046],{"class":167}," min_rows:\n",[33,4048,4049,4052,4055,4057,4060,4063,4066,4069,4071,4074,4076,4079,4081,4083],{"class":35,"line":224},[33,4050,4051],{"class":163},"        raise",[33,4053,4054],{"class":50}," ValueError",[33,4056,602],{"class":167},[33,4058,4059],{"class":163},"f",[33,4061,4062],{"class":54},"\"Output has ",[33,4064,4065],{"class":50},"{len",[33,4067,4068],{"class":167},"(df)",[33,4070,1121],{"class":50},[33,4072,4073],{"class":54}," rows — expected at least 
",[33,4075,1115],{"class":50},[33,4077,4078],{"class":167},"min_rows",[33,4080,1121],{"class":50},[33,4082,274],{"class":54},[33,4084,221],{"class":167},[33,4086,4087],{"class":35,"line":229},[33,4088,92],{"emptyLinePlaceholder":91},[33,4090,4091,4094,4096,4099,4102,4104,4107,4109,4111],{"class":35,"line":235},[33,4092,4093],{"class":167},"    required_cols ",[33,4095,242],{"class":163},[33,4097,4098],{"class":167}," {",[33,4100,4101],{"class":54},"\"date\"",[33,4103,365],{"class":167},[33,4105,4106],{"class":54},"\"amount\"",[33,4108,365],{"class":167},[33,4110,760],{"class":54},[33,4112,4113],{"class":167},"}\n",[33,4115,4116,4119,4121,4124,4127,4130],{"class":35,"line":250},[33,4117,4118],{"class":167},"    missing ",[33,4120,242],{"class":163},[33,4122,4123],{"class":167}," required_cols ",[33,4125,4126],{"class":163},"-",[33,4128,4129],{"class":50}," set",[33,4131,4132],{"class":167},"(df.columns)\n",[33,4134,4135,4137],{"class":35,"line":266},[33,4136,617],{"class":163},[33,4138,4139],{"class":167}," missing:\n",[33,4141,4142,4144,4146,4148,4150,4153,4155,4158,4160,4162],{"class":35,"line":290},[33,4143,4051],{"class":163},[33,4145,4054],{"class":50},[33,4147,602],{"class":167},[33,4149,4059],{"class":163},[33,4151,4152],{"class":54},"\"Missing columns: ",[33,4154,1115],{"class":50},[33,4156,4157],{"class":167},"missing",[33,4159,1121],{"class":50},[33,4161,274],{"class":54},[33,4163,221],{"class":167},[33,4165,4166],{"class":35,"line":295},[33,4167,92],{"emptyLinePlaceholder":91},[33,4169,4170,4172,4175,4178,4180,4182],{"class":35,"line":300},[33,4171,617],{"class":163},[33,4173,4174],{"class":167}," df.isnull().all(",[33,4176,4177],{"class":238},"axis",[33,4179,242],{"class":163},[33,4181,734],{"class":50},[33,4183,4184],{"class":167},").any():\n",[33,4186,4187,4189,4191,4193,4196],{"class":35,"line":317},[33,4188,4051],{"class":163},[33,4190,4054],{"class":50},[33,4192,602],{"class":167},[33,4194,4195],{"class":54},"\"Output contains fully null 
rows\"",[33,4197,221],{"class":167},[14,4199,4200,4201,4205],{},"The same validation pattern applies when ",[940,4202,4204],{"href":4203},"\u002Fautomating-document-data-pipelines\u002Fgenerating-reports-from-pipeline-data\u002F","Generating Reports from Pipeline Data"," — check the report's row count and spot-sample a few cells before emailing it.",[18,4207,4209],{"id":4208},"performance-and-scale-notes","Performance and Scale Notes",[4211,4212,4213,4223,4233,4246,4259],"ul",{},[4214,4215,4216,4219,4220,4222],"li",{},[1974,4217,4218],{},"Memory",": if ",[30,4221,507],{}," holds hundreds of large PDFs, do not load all frames into a list. Write each parsed DataFrame to a staging Parquet file, then concatenate from disk at the end.",[4214,4224,4225,4228,4229,4232],{},[1974,4226,4227],{},"Chunking",": for large CSVs fed into the pipeline, use ",[30,4230,4231],{},"pd.read_csv(..., chunksize=10_000)"," and process each chunk inside the retry-wrapped function.",[4214,4234,4235,2079,4238,4241,4242,4245],{},[1974,4236,4237],{},"Parallelism",[30,4239,4240],{},"concurrent.futures.ProcessPoolExecutor"," works well for CPU-bound OCR batches; keep pool size to ",[30,4243,4244],{},"os.cpu_count() - 1"," to leave headroom for the OS.",[4214,4247,4248,4251,4252,4254,4255,4258],{},[1974,4249,4250],{},"GitHub Actions minutes",": free tier gives 2,000 minutes\u002Fmonth. A 5-minute daily pipeline costs ~150 minutes\u002Fmonth — well within budget. Cache the ",[30,4253,76],{}," install step with ",[30,4256,4257],{},"actions\u002Fcache"," to cut run time by ~80%.",[4214,4260,4261,4264,4265,4267],{},[1974,4262,4263],{},"schedule library drift",": the ",[30,4266,2325],{}," library accumulates small timing drift over days. 
For production use with strict timing, prefer cron or GitHub Actions, which are backed by system clocks.",[18,4269,4271],{"id":4270},"troubleshooting","Troubleshooting",[4273,4274,4275,4291],"table",{},[4276,4277,4278],"thead",{},[4279,4280,4281,4285,4288],"tr",{},[4282,4283,4284],"th",{},"Symptom",[4282,4286,4287],{},"Root cause",[4282,4289,4290],{},"Fix",[4292,4293,4294,4317,4335,4366,4384],"tbody",{},[4279,4295,4296,4303,4310],{},[4297,4298,4299,4302],"td",{},[30,4300,4301],{},"ModuleNotFoundError"," in cron but not in terminal",[4297,4304,4305,4306,4309],{},"Cron uses ",[30,4307,4308],{},"\u002Fusr\u002Fbin\u002Fpython3",", not your venv",[4297,4311,4312,4313,4316],{},"Use the absolute path to ",[30,4314,4315],{},".venv\u002Fbin\u002Fpython"," in the crontab line",[4279,4318,4319,4322,4328],{},[4297,4320,4321],{},"Script runs in terminal but not in Task Scheduler",[4297,4323,4324,4325],{},"\"Start in\" directory not set; relative paths resolve against ",[30,4326,4327],{},"C:\\Windows\\System32",[4297,4329,4330,4331,4334],{},"Set \"Start in\" to the script's directory; use ",[30,4332,4333],{},"Path(__file__).parent"," for relative paths",[4279,4336,4337,4343,4356],{},[4297,4338,4339,4342],{},[30,4340,4341],{},"FileNotFoundError"," for a dependency binary (e.g., Tesseract, Java)",[4297,4344,4345,4346,4348,4349,4352,4353],{},"Cron's ",[30,4347,122],{}," is ",[30,4350,4351],{},"\u002Fusr\u002Fbin:\u002Fbin"," — system-installed tools may be in ",[30,4354,4355],{},"\u002Fusr\u002Flocal\u002Fbin",[4297,4357,4358,4359,4362,4363],{},"Add ",[30,4360,4361],{},"PATH=\u002Fusr\u002Flocal\u002Fbin:\u002Fusr\u002Fbin:\u002Fbin"," at the top of the crontab, or use absolute paths in ",[30,4364,4365],{},"subprocess.run()",[4279,4367,4368,4371,4377],{},[4297,4369,4370],{},"Lock file left behind after a crash",[4297,4372,4373,4374],{},"Previous run died before calling ",[30,4375,4376],{},"release_lock()",[4297,4378,4379,4380,4383],{},"Add a staleness check: if the PID in the lock 
file is not in ",[30,4381,4382],{},"psutil.pids()",", delete it and proceed",[4279,4385,4386,4389,4392],{},[4297,4387,4388],{},"GitHub Actions job silently skips",[4297,4390,4391],{},"Cron schedule is parsed in UTC; your intended local time differs",[4297,4393,4394,4395,4398],{},"Convert your local time to UTC explicitly; add ",[30,4396,4397],{},"workflow_dispatch:"," for manual testing",[18,4400,4402],{"id":4401},"complete-working-script","Complete Working Script",[23,4404,4406],{"className":126,"code":4405,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas openpyxl tenacity schedule\n\"\"\"\ndaily_pipeline.py — self-contained document pipeline with scheduling, logging,\nlock-file guard, retry, idempotency, validation, and failure alerting.\n\nUsage:\n  python daily_pipeline.py                 # run once and exit\n  python daily_pipeline.py --daemon        # run on internal schedule (06:30 daily)\n  python daily_pipeline.py --input \u002Fpath   # override input directory\n\"\"\"\nimport argparse\nimport hashlib\nimport logging\nimport logging.handlers\nimport os\nimport sqlite3\nimport sys\nimport time\nfrom datetime import datetime, timezone\nfrom pathlib import Path\n\nimport pdfplumber\nimport pandas as pd\nimport schedule\nfrom tenacity import (\n    retry,\n    stop_after_attempt,\n    wait_exponential,\n    retry_if_exception_type,\n    before_sleep_log,\n)\n\n# ── Config (override via CLI or environment) ──────────────────────────────────\nINPUT_DIR = Path(os.environ.get(\"PIPELINE_INPUT_DIR\", \"\u002Fdata\u002Fincoming\"))\nOUTPUT_DIR = Path(os.environ.get(\"PIPELINE_OUTPUT_DIR\", \"\u002Fdata\u002Freports\"))\nLOG_DIR = Path(os.environ.get(\"PIPELINE_LOG_DIR\", \"\u002Fvar\u002Flog\u002Fdoc-pipeline\"))\nLEDGER_PATH = Path(os.environ.get(\"PIPELINE_LEDGER\", \"\u002Fdata\u002Fpipeline_ledger.db\"))\nLOCK_PATH = Path(\"\u002Ftmp\u002Fdoc_pipeline.lock\")\nSMTP_HOST = os.environ.get(\"SMTP_HOST\", \"\")\nSMTP_USER = 
os.environ.get(\"SMTP_USER\", \"\")\nSMTP_PASS = os.environ.get(\"SMTP_PASS\", \"\")\nALERT_TO = os.environ.get(\"ALERT_TO\", \"\")\nWEBHOOK_URL = os.environ.get(\"SLACK_WEBHOOK_URL\", \"\")\n\n# ── Logging setup ─────────────────────────────────────────────────────────────\n\ndef configure_logging() -> logging.Logger:\n    LOG_DIR.mkdir(parents=True, exist_ok=True)\n    handler = logging.handlers.RotatingFileHandler(\n        LOG_DIR \u002F \"pipeline.log\",\n        maxBytes=5 * 1024 * 1024,\n        backupCount=7,\n        encoding=\"utf-8\",\n    )\n    fmt = logging.Formatter(\n        \"%(asctime)s %(name)s %(levelname)s %(message)s\",\n        datefmt=\"%Y-%m-%dT%H:%M:%S\",\n    )\n    handler.setFormatter(fmt)\n    stderr = logging.StreamHandler()\n    stderr.setLevel(logging.WARNING)\n    stderr.setFormatter(fmt)\n    root = logging.getLogger()\n    root.setLevel(logging.INFO)\n    root.addHandler(handler)\n    root.addHandler(stderr)\n    return logging.getLogger(\"pipeline\")\n\n\nlogger = configure_logging()\n\n# ── Lock file ─────────────────────────────────────────────────────────────────\n\ndef acquire_lock() -> bool:\n    try:\n        fd = os.open(LOCK_PATH, os.O_CREAT | os.O_EXCL | os.O_WRONLY)\n        os.write(fd, str(os.getpid()).encode())\n        os.close(fd)\n        return True\n    except FileExistsError:\n        return False\n\n\ndef release_lock() -> None:\n    try:\n        LOCK_PATH.unlink()\n    except FileNotFoundError:\n        pass\n\n# ── Idempotency ledger ────────────────────────────────────────────────────────\n\ndef _db() -> sqlite3.Connection:\n    conn = sqlite3.connect(LEDGER_PATH)\n    conn.execute(\n        \"CREATE TABLE IF NOT EXISTS processed \"\n        \"(file_hash TEXT PRIMARY KEY, path TEXT, processed_at TEXT)\"\n    )\n    conn.commit()\n    return conn\n\n\ndef _hash(path: Path) -> str:\n    return hashlib.sha256(path.read_bytes()).hexdigest()\n\n\ndef already_done(path: Path) -> bool:\n    fh = _hash(path)\n    
conn = _db()\n    found = conn.execute(\n        \"SELECT 1 FROM processed WHERE file_hash = ?\", (fh,)\n    ).fetchone()\n    conn.close()\n    return found is not None\n\n\ndef mark_done(path: Path) -> None:\n    conn = _db()\n    conn.execute(\n        \"INSERT OR IGNORE INTO processed (file_hash, path, processed_at) VALUES (?, ?, ?)\",\n        (_hash(path), str(path), datetime.now(timezone.utc).isoformat()),\n    )\n    conn.commit()\n    conn.close()\n\n# ── Retry decorator ───────────────────────────────────────────────────────────\n\nretry_io = retry(\n    reraise=True,\n    stop=stop_after_attempt(4),\n    wait=wait_exponential(multiplier=1, min=2, max=30),\n    retry=retry_if_exception_type((IOError, TimeoutError)),\n    before_sleep=before_sleep_log(logger, logging.WARNING),\n)\n\n# ── Alert helpers ─────────────────────────────────────────────────────────────\n\ndef _alert(subject: str, body: str) -> None:\n    import json\n    import smtplib\n    import urllib.request\n    from email.message import EmailMessage\n\n    if SMTP_HOST and ALERT_TO:\n        try:\n            msg = EmailMessage()\n            msg[\"Subject\"] = subject\n            msg[\"From\"] = SMTP_USER\n            msg[\"To\"] = ALERT_TO\n            msg.set_content(body)\n            with smtplib.SMTP(SMTP_HOST, 587) as s:\n                s.starttls()\n                s.login(SMTP_USER, SMTP_PASS)\n                s.send_message(msg)\n        except Exception:\n            logger.exception(\"Email alert failed\")\n\n    if WEBHOOK_URL:\n        payload = json.dumps({\"text\": f\"*{subject}*\\n{body}\"}).encode()\n        req = urllib.request.Request(\n            WEBHOOK_URL,\n            data=payload,\n            headers={\"Content-Type\": \"application\u002Fjson\"},\n            method=\"POST\",\n        )\n        try:\n            with urllib.request.urlopen(req, timeout=10):\n                pass\n        except Exception:\n            logger.exception(\"Webhook alert 
failed\")\n\n# ── Core extraction ───────────────────────────────────────────────────────────\n\n@retry_io\ndef _extract_pdf(pdf_path: Path) -> list[pd.DataFrame]:\n    frames = []\n    with pdfplumber.open(pdf_path) as pdf:\n        for page in pdf.pages:\n            table = page.extract_table()\n            if table and len(table) > 1:\n                df = pd.DataFrame(table[1:], columns=table[0])\n                df[\"_source\"] = pdf_path.name\n                frames.append(df)\n    return frames\n\n# ── Main job ──────────────────────────────────────────────────────────────────\n\ndef run_job(input_dir: Path, output_dir: Path) -> None:\n    logger.info(\"Job started — scanning %s\", input_dir)\n    pdf_files = [p for p in input_dir.glob(\"*.pdf\") if not already_done(p)]\n\n    if not pdf_files:\n        logger.info(\"No new PDFs to process\")\n        return\n\n    frames: list[pd.DataFrame] = []\n    for pdf_path in pdf_files:\n        try:\n            extracted = _extract_pdf(pdf_path)\n            frames.extend(extracted)\n            mark_done(pdf_path)\n            logger.info(\"Parsed %s — %d table(s)\", pdf_path.name, len(extracted))\n        except Exception:\n            logger.exception(\"Skipping %s after retries exhausted\", pdf_path.name)\n\n    if not frames:\n        logger.warning(\"No tables extracted from %d file(s)\", len(pdf_files))\n        return\n\n    combined = pd.concat(frames, ignore_index=True)\n    output_dir.mkdir(parents=True, exist_ok=True)\n    today = datetime.now(timezone.utc).strftime(\"%Y%m%d\")\n    out_path = output_dir \u002F f\"pipeline_{today}.xlsx\"\n    combined.to_excel(out_path, index=False)\n    logger.info(\"Wrote %d rows to %s\", len(combined), out_path)\n\n\ndef run_with_guard(input_dir: Path, output_dir: Path) -> None:\n    if not acquire_lock():\n        logger.warning(\"Another instance is running — skipping this run\")\n        return\n    try:\n        run_job(input_dir, output_dir)\n    except 
Exception as exc:\n        logger.exception(\"Job failed with unhandled exception\")\n        _alert(\n            subject=\"[doc-pipeline] Job FAILED\",\n            body=f\"Exception: {exc}\\nSee {LOG_DIR \u002F 'pipeline.log'} for details.\",\n        )\n        sys.exit(1)\n    finally:\n        release_lock()\n\n# ── Entry point ───────────────────────────────────────────────────────────────\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Document extraction pipeline\")\n    parser.add_argument(\"--input\", type=Path, default=INPUT_DIR)\n    parser.add_argument(\"--output\", type=Path, default=OUTPUT_DIR)\n    parser.add_argument(\n        \"--daemon\",\n        action=\"store_true\",\n        help=\"Run on internal schedule (06:30 daily) instead of once\",\n    )\n    args = parser.parse_args()\n\n    if args.daemon:\n        logger.info(\"Daemon mode — scheduling daily at 06:30\")\n        schedule.every().day.at(\"06:30\").do(\n            run_with_guard, input_dir=args.input, output_dir=args.output\n        )\n        while True:\n            schedule.run_pending()\n            time.sleep(30)\n    else:\n        run_with_guard(input_dir=args.input, output_dir=args.output)\n\n\nif __name__ == \"__main__\":\n    
main()\n",[30,4407,4408,4413,4417,4422,4427,4431,4436,4441,4446,4451,4455,4462,4468,4474,4480,4486,4492,4498,4504,4514,4524,4528,4534,4544,4550,4560,4564,4568,4572,4576,4580,4584,4588,4593,4611,4630,4647,4664,4676,4692,4708,4724,4740,4756,4760,4765,4769,4778,4800,4808,4820,4838,4848,4858,4862,4871,4888,4902,4907,4913,4923,4933,4939,4948,4958,4963,4969,4981,4986,4991,5001,5006,5012,5017,5030,5037,5066,5075,5080,5087,5096,5103,5108,5113,5126,5133,5140,5149,5154,5159,5165,5170,5180,5193,5198,5203,5208,5213,5218,5225,5230,5235,5249,5257,5262,5267,5281,5291,5301,5311,5318,5323,5328,5342,5347,5352,5366,5375,5380,5387,5397,5402,5407,5412,5417,5423,5428,5438,5449,5462,5495,5512,5525,5530,5535,5541,5546,5568,5575,5582,5589,5600,5605,5621,5628,5638,5652,5665,5678,5684,5704,5710,5724,5730,5739,5749,5754,5763,5804,5814,5822,5832,5850,5862,5868,5875,5890,5896,5905,5915,5920,5926,5931,5937,5948,5957,5968,5980,5990,6014,6038,6052,6058,6066,6071,6077,6082,6097,6112,6144,6149,6158,6168,6173,6178,6188,6199,6206,6217,6223,6229,6256,6265,6280,6285,6294,6314,6319,6324,6341,6363,6383,6411,6425,6447,6452,6457,6471,6480,6490,6495,6502,6508,6519,6529,6535,6548,6587,6592,6601,6608,6613,6618,6624,6629,6643,6664,6692,6716,6722,6730,6743,6756,6761,6772,6777,6785,6795,6806,6828,6833,6843,6849,6859,6867,6886,6891,6896,6909],{"__ignoreMap":28},[33,4409,4410],{"class":35,"line":36},[33,4411,4412],{"class":39},"# pip install pdfplumber pandas openpyxl tenacity schedule\n",[33,4414,4415],{"class":35,"line":43},[33,4416,139],{"class":54},[33,4418,4419],{"class":35,"line":61},[33,4420,4421],{"class":54},"daily_pipeline.py — self-contained document pipeline with scheduling, logging,\n",[33,4423,4424],{"class":35,"line":73},[33,4425,4426],{"class":54},"lock-file guard, retry, idempotency, validation, and failure 
alerting.\n",[33,4428,4429],{"class":35,"line":88},[33,4430,92],{"emptyLinePlaceholder":91},[33,4432,4433],{"class":35,"line":95},[33,4434,4435],{"class":54},"Usage:\n",[33,4437,4438],{"class":35,"line":101},[33,4439,4440],{"class":54},"  python daily_pipeline.py                 # run once and exit\n",[33,4442,4443],{"class":35,"line":171},[33,4444,4445],{"class":54},"  python daily_pipeline.py --daemon        # run on internal schedule (06:30 daily)\n",[33,4447,4448],{"class":35,"line":179},[33,4449,4450],{"class":54},"  python daily_pipeline.py --input \u002Fpath   # override input directory\n",[33,4452,4453],{"class":35,"line":187},[33,4454,139],{"class":54},[33,4456,4457,4459],{"class":35,"line":201},[33,4458,164],{"class":163},[33,4460,4461],{"class":167}," argparse\n",[33,4463,4464,4466],{"class":35,"line":206},[33,4465,164],{"class":163},[33,4467,3634],{"class":167},[33,4469,4470,4472],{"class":35,"line":224},[33,4471,164],{"class":163},[33,4473,184],{"class":167},[33,4475,4476,4478],{"class":35,"line":229},[33,4477,164],{"class":163},[33,4479,1004],{"class":167},[33,4481,4482,4484],{"class":35,"line":235},[33,4483,164],{"class":163},[33,4485,176],{"class":167},[33,4487,4488,4490],{"class":35,"line":250},[33,4489,164],{"class":163},[33,4491,3627],{"class":167},[33,4493,4494,4496],{"class":35,"line":266},[33,4495,164],{"class":163},[33,4497,168],{"class":167},[33,4499,4500,4502],{"class":35,"line":290},[33,4501,164],{"class":163},[33,4503,1689],{"class":167},[33,4505,4506,4508,4510,4512],{"class":35,"line":295},[33,4507,190],{"class":163},[33,4509,3881],{"class":167},[33,4511,164],{"class":163},[33,4513,3886],{"class":167},[33,4515,4516,4518,4520,4522],{"class":35,"line":300},[33,4517,190],{"class":163},[33,4519,193],{"class":167},[33,4521,164],{"class":163},[33,4523,198],{"class":167},[33,4525,4526],{"class":35,"line":317},[33,4527,92],{"emptyLinePlaceholder":91},[33,4529,4530,4532],{"class":35,"line":332},[33,4531,164],{"class":163},[33,4533,485],{"class":16
7},[33,4535,4536,4538,4540,4542],{"class":35,"line":347},[33,4537,164],{"class":163},[33,4539,492],{"class":167},[33,4541,495],{"class":163},[33,4543,498],{"class":167},[33,4545,4546,4548],{"class":35,"line":374},[33,4547,164],{"class":163},[33,4549,2373],{"class":167},[33,4551,4552,4554,4556,4558],{"class":35,"line":397},[33,4553,190],{"class":163},[33,4555,1410],{"class":167},[33,4557,164],{"class":163},[33,4559,1415],{"class":167},[33,4561,4562],{"class":35,"line":653},[33,4563,1420],{"class":167},[33,4565,4566],{"class":35,"line":667},[33,4567,1425],{"class":167},[33,4569,4570],{"class":35,"line":675},[33,4571,1430],{"class":167},[33,4573,4574],{"class":35,"line":689},[33,4575,1435],{"class":167},[33,4577,4578],{"class":35,"line":703},[33,4579,1440],{"class":167},[33,4581,4582],{"class":35,"line":714},[33,4583,221],{"class":167},[33,4585,4586],{"class":35,"line":723},[33,4587,92],{"emptyLinePlaceholder":91},[33,4589,4590],{"class":35,"line":754},[33,4591,4592],{"class":39},"# ── Config (override via CLI or environment) ──────────────────────────────────\n",[33,4594,4595,4597,4599,4602,4605,4607,4609],{"class":35,"line":771},[33,4596,507],{"class":50},[33,4598,212],{"class":163},[33,4600,4601],{"class":167}," 
Path(os.environ.get(",[33,4603,4604],{"class":54},"\"PIPELINE_INPUT_DIR\"",[33,4606,365],{"class":167},[33,4608,514],{"class":54},[33,4610,371],{"class":167},[33,4612,4613,4616,4618,4620,4623,4625,4628],{"class":35,"line":777},[33,4614,4615],{"class":50},"OUTPUT_DIR",[33,4617,212],{"class":163},[33,4619,4601],{"class":167},[33,4621,4622],{"class":54},"\"PIPELINE_OUTPUT_DIR\"",[33,4624,365],{"class":167},[33,4626,4627],{"class":54},"\"\u002Fdata\u002Freports\"",[33,4629,371],{"class":167},[33,4631,4632,4634,4636,4638,4641,4643,4645],{"class":35,"line":788},[33,4633,1023],{"class":50},[33,4635,212],{"class":163},[33,4637,4601],{"class":167},[33,4639,4640],{"class":54},"\"PIPELINE_LOG_DIR\"",[33,4642,365],{"class":167},[33,4644,1030],{"class":54},[33,4646,371],{"class":167},[33,4648,4649,4651,4653,4655,4658,4660,4662],{"class":35,"line":804},[33,4650,3653],{"class":50},[33,4652,212],{"class":163},[33,4654,4601],{"class":167},[33,4656,4657],{"class":54},"\"PIPELINE_LEDGER\"",[33,4659,365],{"class":167},[33,4661,3660],{"class":54},[33,4663,371],{"class":167},[33,4665,4666,4668,4670,4672,4674],{"class":35,"line":809},[33,4667,2780],{"class":50},[33,4669,212],{"class":163},[33,4671,215],{"class":167},[33,4673,2787],{"class":54},[33,4675,221],{"class":167},[33,4677,4678,4680,4682,4684,4686,4688,4690],{"class":35,"line":819},[33,4679,3124],{"class":50},[33,4681,212],{"class":163},[33,4683,3129],{"class":167},[33,4685,3132],{"class":54},[33,4687,365],{"class":167},[33,4689,3198],{"class":54},[33,4691,221],{"class":167},[33,4693,4694,4696,4698,4700,4702,4704,4706],{"class":35,"line":829},[33,4695,3167],{"class":50},[33,4697,212],{"class":163},[33,4699,3129],{"class":167},[33,4701,3174],{"class":54},[33,4703,365],{"class":167},[33,4705,3198],{"class":54},[33,4707,221],{"class":167},[33,4709,4710,4712,4714,4716,4718,4720,4722],{"class":35,"line":834},[33,4711,3186],{"class":50},[33,4713,212],{"class":163},[33,4715,3129],{"class":167},[33,4717,3193],{"class":54},[33,4719,365],{"c
lass":167},[33,4721,3198],{"class":54},[33,4723,221],{"class":167},[33,4725,4726,4728,4730,4732,4734,4736,4738],{"class":35,"line":839},[33,4727,3205],{"class":50},[33,4729,212],{"class":163},[33,4731,3129],{"class":167},[33,4733,3212],{"class":54},[33,4735,365],{"class":167},[33,4737,3198],{"class":54},[33,4739,221],{"class":167},[33,4741,4742,4744,4746,4748,4750,4752,4754],{"class":35,"line":860},[33,4743,3224],{"class":50},[33,4745,212],{"class":163},[33,4747,3129],{"class":167},[33,4749,3231],{"class":54},[33,4751,365],{"class":167},[33,4753,3198],{"class":54},[33,4755,221],{"class":167},[33,4757,4758],{"class":35,"line":887},[33,4759,92],{"emptyLinePlaceholder":91},[33,4761,4762],{"class":35,"line":907},[33,4763,4764],{"class":39},"# ── Logging setup ─────────────────────────────────────────────────────────────\n",[33,4766,4767],{"class":35,"line":1826},[33,4768,92],{"emptyLinePlaceholder":91},[33,4770,4771,4773,4775],{"class":35,"line":1844},[33,4772,562],{"class":163},[33,4774,1047],{"class":46},[33,4776,4777],{"class":167},"() -> logging.Logger:\n",[33,4779,4780,4782,4784,4786,4788,4790,4792,4794,4796,4798],{"class":35,"line":1858},[33,4781,1075],{"class":50},[33,4783,1078],{"class":167},[33,4785,869],{"class":238},[33,4787,242],{"class":163},[33,4789,855],{"class":50},[33,4791,365],{"class":167},[33,4793,878],{"class":238},[33,4795,242],{"class":163},[33,4797,855],{"class":50},[33,4799,221],{"class":167},[33,4801,4802,4804,4806],{"class":35,"line":1871},[33,4803,1133],{"class":167},[33,4805,242],{"class":163},[33,4807,1138],{"class":167},[33,4809,4810,4813,4815,4818],{"class":35,"line":1877},[33,4811,4812],{"class":50},"        LOG_DIR",[33,4814,1107],{"class":163},[33,4816,4817],{"class":54}," 
\"pipeline.log\"",[33,4819,247],{"class":167},[33,4821,4822,4824,4826,4828,4830,4832,4834,4836],{"class":35,"line":1883},[33,4823,1148],{"class":238},[33,4825,242],{"class":163},[33,4827,1153],{"class":50},[33,4829,1156],{"class":163},[33,4831,1159],{"class":50},[33,4833,1156],{"class":163},[33,4835,1159],{"class":50},[33,4837,247],{"class":167},[33,4839,4840,4842,4844,4846],{"class":35,"line":1915},[33,4841,1174],{"class":238},[33,4843,242],{"class":163},[33,4845,1179],{"class":50},[33,4847,247],{"class":167},[33,4849,4850,4852,4854,4856],{"class":35,"line":1926},[33,4851,1190],{"class":238},[33,4853,242],{"class":163},[33,4855,1195],{"class":54},[33,4857,247],{"class":167},[33,4859,4860],{"class":35,"line":1932},[33,4861,1202],{"class":167},[33,4863,4864,4867,4869],{"class":35,"line":1938},[33,4865,4866],{"class":167},"    fmt ",[33,4868,242],{"class":163},[33,4870,1212],{"class":167},[33,4872,4873,4876,4878,4880,4882,4884,4886],{"class":35,"line":1950},[33,4874,4875],{"class":54},"        \"",[33,4877,277],{"class":50},[33,4879,1226],{"class":50},[33,4881,280],{"class":50},[33,4883,283],{"class":50},[33,4885,274],{"class":54},[33,4887,247],{"class":167},[33,4889,4890,4892,4894,4896,4898,4900],{"class":35,"line":1958},[33,4891,1239],{"class":238},[33,4893,242],{"class":163},[33,4895,1244],{"class":54},[33,4897,916],{"class":50},[33,4899,1249],{"class":54},[33,4901,247],{"class":167},[33,4903,4905],{"class":35,"line":4904},58,[33,4906,1202],{"class":167},[33,4908,4910],{"class":35,"line":4909},59,[33,4911,4912],{"class":167},"    handler.setFormatter(fmt)\n",[33,4914,4916,4919,4921],{"class":35,"line":4915},60,[33,4917,4918],{"class":167},"    stderr ",[33,4920,242],{"class":163},[33,4922,1303],{"class":167},[33,4924,4926,4929,4931],{"class":35,"line":4925},61,[33,4927,4928],{"class":167},"    stderr.setLevel(logging.",[33,4930,1311],{"class":50},[33,4932,221],{"class":167},[33,4934,4936],{"class":35,"line":4935},62,[33,4937,4938],{"class":167},"    
stderr.setFormatter(fmt)\n",[33,4940,4942,4944,4946],{"class":35,"line":4941},63,[33,4943,1269],{"class":167},[33,4945,242],{"class":163},[33,4947,1274],{"class":167},[33,4949,4951,4954,4956],{"class":35,"line":4950},64,[33,4952,4953],{"class":167},"    root.setLevel(logging.",[33,4955,1067],{"class":50},[33,4957,221],{"class":167},[33,4959,4961],{"class":35,"line":4960},65,[33,4962,1284],{"class":167},[33,4964,4966],{"class":35,"line":4965},66,[33,4967,4968],{"class":167},"    root.addHandler(stderr)\n",[33,4970,4972,4974,4976,4979],{"class":35,"line":4971},67,[33,4973,1332],{"class":163},[33,4975,544],{"class":167},[33,4977,4978],{"class":54},"\"pipeline\"",[33,4980,221],{"class":167},[33,4982,4984],{"class":35,"line":4983},68,[33,4985,92],{"emptyLinePlaceholder":91},[33,4987,4989],{"class":35,"line":4988},69,[33,4990,92],{"emptyLinePlaceholder":91},[33,4992,4994,4996,4998],{"class":35,"line":4993},70,[33,4995,539],{"class":167},[33,4997,242],{"class":163},[33,4999,5000],{"class":167}," configure_logging()\n",[33,5002,5004],{"class":35,"line":5003},71,[33,5005,92],{"emptyLinePlaceholder":91},[33,5007,5009],{"class":35,"line":5008},72,[33,5010,5011],{"class":39},"# ── Lock file 
─────────────────────────────────────────────────────────────────\n",[33,5013,5015],{"class":35,"line":5014},73,[33,5016,92],{"emptyLinePlaceholder":91},[33,5018,5020,5022,5024,5026,5028],{"class":35,"line":5019},74,[33,5021,562],{"class":163},[33,5023,2816],{"class":46},[33,5025,568],{"class":167},[33,5027,2821],{"class":50},[33,5029,574],{"class":167},[33,5031,5033,5035],{"class":35,"line":5032},75,[33,5034,2424],{"class":163},[33,5036,574],{"class":167},[33,5038,5040,5042,5044,5046,5048,5050,5052,5054,5056,5058,5060,5062,5064],{"class":35,"line":5039},76,[33,5041,2834],{"class":167},[33,5043,242],{"class":163},[33,5045,2839],{"class":167},[33,5047,2780],{"class":50},[33,5049,2844],{"class":167},[33,5051,2847],{"class":50},[33,5053,2850],{"class":163},[33,5055,2853],{"class":167},[33,5057,2856],{"class":50},[33,5059,2850],{"class":163},[33,5061,2853],{"class":167},[33,5063,2863],{"class":50},[33,5065,221],{"class":167},[33,5067,5069,5071,5073],{"class":35,"line":5068},77,[33,5070,2870],{"class":167},[33,5072,1053],{"class":50},[33,5074,2875],{"class":167},[33,5076,5078],{"class":35,"line":5077},78,[33,5079,2880],{"class":167},[33,5081,5083,5085],{"class":35,"line":5082},79,[33,5084,1659],{"class":163},[33,5086,2887],{"class":50},[33,5088,5090,5092,5094],{"class":35,"line":5089},80,[33,5091,2449],{"class":163},[33,5093,2894],{"class":50},[33,5095,574],{"class":167},[33,5097,5099,5101],{"class":35,"line":5098},81,[33,5100,1659],{"class":163},[33,5102,2903],{"class":50},[33,5104,5106],{"class":35,"line":5105},82,[33,5107,92],{"emptyLinePlaceholder":91},[33,5109,5111],{"class":35,"line":5110},83,[33,5112,92],{"emptyLinePlaceholder":91},[33,5114,5116,5118,5120,5122,5124],{"class":35,"line":5115},84,[33,5117,562],{"class":163},[33,5119,2918],{"class":46},[33,5121,568],{"class":167},[33,5123,571],{"class":50},[33,5125,574],{"class":167},[33,5127,5129,5131],{"class":35,"line":5128},85,[33,5130,2424],{"class":163},[33,5132,574],{"class":167},[33,5134,5136,5138],{"class":35
,"line":5135},86,[33,5137,2935],{"class":50},[33,5139,2938],{"class":167},[33,5141,5143,5145,5147],{"class":35,"line":5142},87,[33,5144,2449],{"class":163},[33,5146,2945],{"class":50},[33,5148,574],{"class":167},[33,5150,5152],{"class":35,"line":5151},88,[33,5153,2952],{"class":163},[33,5155,5157],{"class":35,"line":5156},89,[33,5158,92],{"emptyLinePlaceholder":91},[33,5160,5162],{"class":35,"line":5161},90,[33,5163,5164],{"class":39},"# ── Idempotency ledger ────────────────────────────────────────────────────────\n",[33,5166,5168],{"class":35,"line":5167},91,[33,5169,92],{"emptyLinePlaceholder":91},[33,5171,5173,5175,5178],{"class":35,"line":5172},92,[33,5174,562],{"class":163},[33,5176,5177],{"class":46}," _db",[33,5179,3680],{"class":167},[33,5181,5183,5185,5187,5189,5191],{"class":35,"line":5182},93,[33,5184,3685],{"class":167},[33,5186,242],{"class":163},[33,5188,3690],{"class":167},[33,5190,3653],{"class":50},[33,5192,221],{"class":167},[33,5194,5196],{"class":35,"line":5195},94,[33,5197,3699],{"class":167},[33,5199,5201],{"class":35,"line":5200},95,[33,5202,3704],{"class":54},[33,5204,5206],{"class":35,"line":5205},96,[33,5207,3709],{"class":54},[33,5209,5211],{"class":35,"line":5210},97,[33,5212,1202],{"class":167},[33,5214,5216],{"class":35,"line":5215},98,[33,5217,3718],{"class":167},[33,5219,5221,5223],{"class":35,"line":5220},99,[33,5222,1332],{"class":163},[33,5224,3725],{"class":167},[33,5226,5228],{"class":35,"line":5227},100,[33,5229,92],{"emptyLinePlaceholder":91},[33,5231,5233],{"class":35,"line":5232},101,[33,5234,92],{"emptyLinePlaceholder":91},[33,5236,5238,5240,5243,5245,5247],{"class":35,"line":5237},102,[33,5239,562],{"class":163},[33,5241,5242],{"class":46}," _hash",[33,5244,3743],{"class":167},[33,5246,1053],{"class":50},[33,5248,574],{"class":167},[33,5250,5252,5254],{"class":35,"line":5251},103,[33,5253,1332],{"class":163},[33,5255,5256],{"class":167}," 
hashlib.sha256(path.read_bytes()).hexdigest()\n",[33,5258,5260],{"class":35,"line":5259},104,[33,5261,92],{"emptyLinePlaceholder":91},[33,5263,5265],{"class":35,"line":5264},105,[33,5266,92],{"emptyLinePlaceholder":91},[33,5268,5270,5272,5275,5277,5279],{"class":35,"line":5269},106,[33,5271,562],{"class":163},[33,5273,5274],{"class":46}," already_done",[33,5276,3743],{"class":167},[33,5278,2821],{"class":50},[33,5280,574],{"class":167},[33,5282,5284,5286,5288],{"class":35,"line":5283},107,[33,5285,3795],{"class":167},[33,5287,242],{"class":163},[33,5289,5290],{"class":167}," _hash(path)\n",[33,5292,5294,5296,5298],{"class":35,"line":5293},108,[33,5295,3685],{"class":167},[33,5297,242],{"class":163},[33,5299,5300],{"class":167}," _db()\n",[33,5302,5304,5307,5309],{"class":35,"line":5303},109,[33,5305,5306],{"class":167},"    found ",[33,5308,242],{"class":163},[33,5310,3819],{"class":167},[33,5312,5314,5316],{"class":35,"line":5313},110,[33,5315,3824],{"class":54},[33,5317,3827],{"class":167},[33,5319,5321],{"class":35,"line":5320},111,[33,5322,3832],{"class":167},[33,5324,5326],{"class":35,"line":5325},112,[33,5327,3837],{"class":167},[33,5329,5331,5333,5336,5338,5340],{"class":35,"line":5330},113,[33,5332,1332],{"class":163},[33,5334,5335],{"class":167}," found ",[33,5337,3847],{"class":163},[33,5339,620],{"class":163},[33,5341,3852],{"class":50},[33,5343,5345],{"class":35,"line":5344},114,[33,5346,92],{"emptyLinePlaceholder":91},[33,5348,5350],{"class":35,"line":5349},115,[33,5351,92],{"emptyLinePlaceholder":91},[33,5353,5355,5357,5360,5362,5364],{"class":35,"line":5354},116,[33,5356,562],{"class":163},[33,5358,5359],{"class":46}," 
mark_done",[33,5361,3743],{"class":167},[33,5363,571],{"class":50},[33,5365,574],{"class":167},[33,5367,5369,5371,5373],{"class":35,"line":5368},117,[33,5370,3685],{"class":167},[33,5372,242],{"class":163},[33,5374,5300],{"class":167},[33,5376,5378],{"class":35,"line":5377},118,[33,5379,3699],{"class":167},[33,5381,5383,5385],{"class":35,"line":5382},119,[33,5384,3911],{"class":54},[33,5386,247],{"class":167},[33,5388,5390,5393,5395],{"class":35,"line":5389},120,[33,5391,5392],{"class":167},"        (_hash(path), ",[33,5394,1053],{"class":50},[33,5396,3923],{"class":167},[33,5398,5400],{"class":35,"line":5399},121,[33,5401,1202],{"class":167},[33,5403,5405],{"class":35,"line":5404},122,[33,5406,3718],{"class":167},[33,5408,5410],{"class":35,"line":5409},123,[33,5411,3837],{"class":167},[33,5413,5415],{"class":35,"line":5414},124,[33,5416,92],{"emptyLinePlaceholder":91},[33,5418,5420],{"class":35,"line":5419},125,[33,5421,5422],{"class":39},"# ── Retry decorator ───────────────────────────────────────────────────────────\n",[33,5424,5426],{"class":35,"line":5425},126,[33,5427,92],{"emptyLinePlaceholder":91},[33,5429,5431,5434,5436],{"class":35,"line":5430},127,[33,5432,5433],{"class":167},"retry_io 
",[33,5435,242],{"class":163},[33,5437,1479],{"class":167},[33,5439,5441,5443,5445,5447],{"class":35,"line":5440},128,[33,5442,1484],{"class":238},[33,5444,242],{"class":163},[33,5446,855],{"class":50},[33,5448,247],{"class":167},[33,5450,5452,5454,5456,5458,5460],{"class":35,"line":5451},129,[33,5453,1495],{"class":238},[33,5455,242],{"class":163},[33,5457,1500],{"class":167},[33,5459,1503],{"class":50},[33,5461,1506],{"class":167},[33,5463,5465,5467,5469,5471,5473,5475,5477,5479,5481,5483,5485,5487,5489,5491,5493],{"class":35,"line":5464},130,[33,5466,1511],{"class":238},[33,5468,242],{"class":163},[33,5470,1516],{"class":167},[33,5472,1519],{"class":238},[33,5474,242],{"class":163},[33,5476,734],{"class":50},[33,5478,365],{"class":167},[33,5480,1528],{"class":238},[33,5482,242],{"class":163},[33,5484,1533],{"class":50},[33,5486,365],{"class":167},[33,5488,1538],{"class":238},[33,5490,242],{"class":163},[33,5492,1543],{"class":50},[33,5494,1506],{"class":167},[33,5496,5498,5500,5502,5504,5506,5508,5510],{"class":35,"line":5497},131,[33,5499,1550],{"class":238},[33,5501,242],{"class":163},[33,5503,1555],{"class":167},[33,5505,1558],{"class":50},[33,5507,365],{"class":167},[33,5509,1563],{"class":50},[33,5511,1571],{"class":167},[33,5513,5515,5517,5519,5521,5523],{"class":35,"line":5514},132,[33,5516,1576],{"class":238},[33,5518,242],{"class":163},[33,5520,1581],{"class":167},[33,5522,1311],{"class":50},[33,5524,1506],{"class":167},[33,5526,5528],{"class":35,"line":5527},133,[33,5529,221],{"class":167},[33,5531,5533],{"class":35,"line":5532},134,[33,5534,92],{"emptyLinePlaceholder":91},[33,5536,5538],{"class":35,"line":5537},135,[33,5539,5540],{"class":39},"# ── Alert helpers ─────────────────────────────────────────────────────────────\n",[33,5542,5544],{"class":35,"line":5543},136,[33,5545,92],{"emptyLinePlaceholder":91},[33,5547,5549,5551,5554,5556,5558,5560,5562,5564,5566],{"class":35,"line":5548},137,[33,5550,562],{"class":163},[33,5552,5553],{"class":46}," 
_alert",[33,5555,3255],{"class":167},[33,5557,1053],{"class":50},[33,5559,3260],{"class":167},[33,5561,1053],{"class":50},[33,5563,1617],{"class":167},[33,5565,571],{"class":50},[33,5567,574],{"class":167},[33,5569,5571,5573],{"class":35,"line":5570},138,[33,5572,1627],{"class":163},[33,5574,3081],{"class":167},[33,5576,5578,5580],{"class":35,"line":5577},139,[33,5579,1627],{"class":163},[33,5581,3074],{"class":167},[33,5583,5585,5587],{"class":35,"line":5584},140,[33,5586,1627],{"class":163},[33,5588,1630],{"class":167},[33,5590,5592,5594,5596,5598],{"class":35,"line":5591},141,[33,5593,3878],{"class":163},[33,5595,3100],{"class":167},[33,5597,164],{"class":163},[33,5599,3105],{"class":167},[33,5601,5603],{"class":35,"line":5602},142,[33,5604,92],{"emptyLinePlaceholder":91},[33,5606,5608,5610,5613,5616,5619],{"class":35,"line":5607},143,[33,5609,617],{"class":163},[33,5611,5612],{"class":50}," SMTP_HOST",[33,5614,5615],{"class":163}," and",[33,5617,5618],{"class":50}," ALERT_TO",[33,5620,574],{"class":167},[33,5622,5624,5626],{"class":35,"line":5623},144,[33,5625,670],{"class":163},[33,5627,574],{"class":167},[33,5629,5631,5634,5636],{"class":35,"line":5630},145,[33,5632,5633],{"class":167},"            msg ",[33,5635,242],{"class":163},[33,5637,3278],{"class":167},[33,5639,5641,5644,5646,5648,5650],{"class":35,"line":5640},146,[33,5642,5643],{"class":167},"            msg[",[33,5645,3286],{"class":54},[33,5647,763],{"class":167},[33,5649,242],{"class":163},[33,5651,3293],{"class":167},[33,5653,5655,5657,5659,5661,5663],{"class":35,"line":5654},147,[33,5656,5643],{"class":167},[33,5658,3300],{"class":54},[33,5660,763],{"class":167},[33,5662,242],{"class":163},[33,5664,3307],{"class":50},[33,5666,5668,5670,5672,5674,5676],{"class":35,"line":5667},148,[33,5669,5643],{"class":167},[33,5671,3314],{"class":54},[33,5673,763],{"class":167},[33,5675,242],{"class":163},[33,5677,3321],{"class":50},[33,5679,5681],{"class":35,"line":5680},149,[33,5682,5683],{"class":167},"    
        msg.set_content(body)\n",[33,5685,5687,5689,5691,5693,5695,5698,5700,5702],{"class":35,"line":5686},150,[33,5688,678],{"class":163},[33,5690,3339],{"class":167},[33,5692,3124],{"class":50},[33,5694,365],{"class":167},[33,5696,5697],{"class":50},"587",[33,5699,1649],{"class":167},[33,5701,495],{"class":163},[33,5703,3352],{"class":167},[33,5705,5707],{"class":35,"line":5706},151,[33,5708,5709],{"class":167},"                s.starttls()\n",[33,5711,5713,5716,5718,5720,5722],{"class":35,"line":5712},152,[33,5714,5715],{"class":167},"                s.login(",[33,5717,3167],{"class":50},[33,5719,365],{"class":167},[33,5721,3186],{"class":50},[33,5723,221],{"class":167},[33,5725,5727],{"class":35,"line":5726},153,[33,5728,5729],{"class":167},"                s.send_message(msg)\n",[33,5731,5733,5735,5737],{"class":35,"line":5732},154,[33,5734,780],{"class":163},[33,5736,783],{"class":50},[33,5738,574],{"class":167},[33,5740,5742,5744,5747],{"class":35,"line":5741},155,[33,5743,791],{"class":167},[33,5745,5746],{"class":54},"\"Email alert failed\"",[33,5748,221],{"class":167},[33,5750,5752],{"class":35,"line":5751},156,[33,5753,92],{"emptyLinePlaceholder":91},[33,5755,5757,5759,5761],{"class":35,"line":5756},157,[33,5758,617],{"class":163},[33,5760,3440],{"class":50},[33,5762,574],{"class":167},[33,5764,5766,5769,5771,5773,5775,5777,5779,5782,5784,5787,5789,5791,5794,5797,5799,5801],{"class":35,"line":5765},158,[33,5767,5768],{"class":167},"        payload 
",[33,5770,242],{"class":163},[33,5772,3456],{"class":167},[33,5774,3459],{"class":54},[33,5776,2079],{"class":167},[33,5778,4059],{"class":163},[33,5780,5781],{"class":54},"\"*",[33,5783,1115],{"class":50},[33,5785,5786],{"class":167},"subject",[33,5788,1121],{"class":50},[33,5790,1769],{"class":54},[33,5792,5793],{"class":50},"\\n{",[33,5795,5796],{"class":167},"body",[33,5798,1121],{"class":50},[33,5800,274],{"class":54},[33,5802,5803],{"class":167},"}).encode()\n",[33,5805,5807,5810,5812],{"class":35,"line":5806},159,[33,5808,5809],{"class":167},"        req ",[33,5811,242],{"class":163},[33,5813,3472],{"class":167},[33,5815,5817,5820],{"class":35,"line":5816},160,[33,5818,5819],{"class":50},"            WEBHOOK_URL",[33,5821,247],{"class":167},[33,5823,5825,5828,5830],{"class":35,"line":5824},161,[33,5826,5827],{"class":238},"            data",[33,5829,242],{"class":163},[33,5831,3489],{"class":167},[33,5833,5835,5838,5840,5842,5844,5846,5848],{"class":35,"line":5834},162,[33,5836,5837],{"class":238},"            headers",[33,5839,242],{"class":163},[33,5841,1115],{"class":167},[33,5843,3501],{"class":54},[33,5845,2079],{"class":167},[33,5847,3506],{"class":54},[33,5849,3509],{"class":167},[33,5851,5853,5856,5858,5860],{"class":35,"line":5852},163,[33,5854,5855],{"class":238},"            method",[33,5857,242],{"class":163},[33,5859,3519],{"class":54},[33,5861,247],{"class":167},[33,5863,5865],{"class":35,"line":5864},164,[33,5866,5867],{"class":167},"        )\n",[33,5869,5871,5873],{"class":35,"line":5870},165,[33,5872,670],{"class":163},[33,5874,574],{"class":167},[33,5876,5878,5880,5882,5884,5886,5888],{"class":35,"line":5877},166,[33,5879,678],{"class":163},[33,5881,3538],{"class":167},[33,5883,1641],{"class":238},[33,5885,242],{"class":163},[33,5887,3545],{"class":50},[33,5889,1737],{"class":167},[33,5891,5893],{"class":35,"line":5892},167,[33,5894,5895],{"class":163},"                
pass\n",[33,5897,5899,5901,5903],{"class":35,"line":5898},168,[33,5900,780],{"class":163},[33,5902,783],{"class":50},[33,5904,574],{"class":167},[33,5906,5908,5910,5913],{"class":35,"line":5907},169,[33,5909,791],{"class":167},[33,5911,5912],{"class":54},"\"Webhook alert failed\"",[33,5914,221],{"class":167},[33,5916,5918],{"class":35,"line":5917},170,[33,5919,92],{"emptyLinePlaceholder":91},[33,5921,5923],{"class":35,"line":5922},171,[33,5924,5925],{"class":39},"# ── Core extraction ───────────────────────────────────────────────────────────\n",[33,5927,5929],{"class":35,"line":5928},172,[33,5930,92],{"emptyLinePlaceholder":91},[33,5932,5934],{"class":35,"line":5933},173,[33,5935,5936],{"class":46},"@retry_io\n",[33,5938,5940,5942,5945],{"class":35,"line":5939},174,[33,5941,562],{"class":163},[33,5943,5944],{"class":46}," _extract_pdf",[33,5946,5947],{"class":167},"(pdf_path: Path) -> list[pd.DataFrame]:\n",[33,5949,5951,5953,5955],{"class":35,"line":5950},175,[33,5952,584],{"class":167},[33,5954,242],{"class":163},[33,5956,589],{"class":167},[33,5958,5960,5962,5964,5966],{"class":35,"line":5959},176,[33,5961,1635],{"class":163},[33,5963,681],{"class":167},[33,5965,495],{"class":163},[33,5967,686],{"class":167},[33,5969,5971,5974,5976,5978],{"class":35,"line":5970},177,[33,5972,5973],{"class":163},"        for",[33,5975,695],{"class":167},[33,5977,662],{"class":163},[33,5979,700],{"class":167},[33,5981,5983,5986,5988],{"class":35,"line":5982},178,[33,5984,5985],{"class":167},"            table ",[33,5987,242],{"class":163},[33,5989,711],{"class":167},[33,5991,5993,5996,5999,6002,6004,6007,6010,6012],{"class":35,"line":5992},179,[33,5994,5995],{"class":163},"            if",[33,5997,5998],{"class":167}," table ",[33,6000,6001],{"class":163},"and",[33,6003,4037],{"class":50},[33,6005,6006],{"class":167},"(table) 
",[33,6008,6009],{"class":163},">",[33,6011,1814],{"class":50},[33,6013,574],{"class":167},[33,6015,6017,6020,6022,6024,6026,6028,6030,6032,6034,6036],{"class":35,"line":6016},180,[33,6018,6019],{"class":167},"                df ",[33,6021,242],{"class":163},[33,6023,731],{"class":167},[33,6025,734],{"class":50},[33,6027,737],{"class":167},[33,6029,740],{"class":238},[33,6031,242],{"class":163},[33,6033,745],{"class":167},[33,6035,748],{"class":50},[33,6037,751],{"class":167},[33,6039,6041,6044,6046,6048,6050],{"class":35,"line":6040},181,[33,6042,6043],{"class":167},"                df[",[33,6045,760],{"class":54},[33,6047,763],{"class":167},[33,6049,242],{"class":163},[33,6051,768],{"class":167},[33,6053,6055],{"class":35,"line":6054},182,[33,6056,6057],{"class":167},"                frames.append(df)\n",[33,6059,6061,6063],{"class":35,"line":6060},183,[33,6062,1332],{"class":163},[33,6064,6065],{"class":167}," frames\n",[33,6067,6069],{"class":35,"line":6068},184,[33,6070,92],{"emptyLinePlaceholder":91},[33,6072,6074],{"class":35,"line":6073},185,[33,6075,6076],{"class":39},"# ── Main job ──────────────────────────────────────────────────────────────────\n",[33,6078,6080],{"class":35,"line":6079},186,[33,6081,92],{"emptyLinePlaceholder":91},[33,6083,6085,6087,6090,6093,6095],{"class":35,"line":6084},187,[33,6086,562],{"class":163},[33,6088,6089],{"class":46}," run_job",[33,6091,6092],{"class":167},"(input_dir: Path, output_dir: Path) -> ",[33,6094,571],{"class":50},[33,6096,574],{"class":167},[33,6098,6100,6102,6105,6107,6109],{"class":35,"line":6099},188,[33,6101,910],{"class":167},[33,6103,6104],{"class":54},"\"Job started — scanning ",[33,6106,309],{"class":50},[33,6108,274],{"class":54},[33,6110,6111],{"class":167},", input_dir)\n",[33,6113,6115,6117,6119,6122,6125,6128,6130,6133,6135,6137,6139,6141],{"class":35,"line":6114},189,[33,6116,594],{"class":167},[33,6118,242],{"class":163},[33,6120,6121],{"class":167}," [p 
",[33,6123,6124],{"class":163},"for",[33,6126,6127],{"class":167}," p ",[33,6129,662],{"class":163},[33,6131,6132],{"class":167}," input_dir.glob(",[33,6134,610],{"class":54},[33,6136,1649],{"class":167},[33,6138,2491],{"class":163},[33,6140,620],{"class":163},[33,6142,6143],{"class":167}," already_done(p)]\n",[33,6145,6147],{"class":35,"line":6146},190,[33,6148,92],{"emptyLinePlaceholder":91},[33,6150,6152,6154,6156],{"class":35,"line":6151},191,[33,6153,617],{"class":163},[33,6155,620],{"class":163},[33,6157,623],{"class":167},[33,6159,6161,6163,6166],{"class":35,"line":6160},192,[33,6162,2439],{"class":167},[33,6164,6165],{"class":54},"\"No new PDFs to process\"",[33,6167,221],{"class":167},[33,6169,6171],{"class":35,"line":6170},193,[33,6172,646],{"class":163},[33,6174,6176],{"class":35,"line":6175},194,[33,6177,92],{"emptyLinePlaceholder":91},[33,6179,6181,6184,6186],{"class":35,"line":6180},195,[33,6182,6183],{"class":167},"    frames: list[pd.DataFrame] ",[33,6185,242],{"class":163},[33,6187,589],{"class":167},[33,6189,6191,6193,6195,6197],{"class":35,"line":6190},196,[33,6192,656],{"class":163},[33,6194,659],{"class":167},[33,6196,662],{"class":163},[33,6198,623],{"class":167},[33,6200,6202,6204],{"class":35,"line":6201},197,[33,6203,670],{"class":163},[33,6205,574],{"class":167},[33,6207,6209,6212,6214],{"class":35,"line":6208},198,[33,6210,6211],{"class":167},"            extracted ",[33,6213,242],{"class":163},[33,6215,6216],{"class":167}," _extract_pdf(pdf_path)\n",[33,6218,6220],{"class":35,"line":6219},199,[33,6221,6222],{"class":167},"            frames.extend(extracted)\n",[33,6224,6226],{"class":35,"line":6225},200,[33,6227,6228],{"class":167},"            mark_done(pdf_path)\n",[33,6230,6232,6235,6238,6240,6243,6245,6248,6251,6253],{"class":35,"line":6231},201,[33,6233,6234],{"class":167},"            logger.info(",[33,6236,6237],{"class":54},"\"Parsed ",[33,6239,309],{"class":50},[33,6241,6242],{"class":54}," — 
",[33,6244,916],{"class":50},[33,6246,6247],{"class":54}," table(s)\"",[33,6249,6250],{"class":167},", pdf_path.name, ",[33,6252,928],{"class":50},[33,6254,6255],{"class":167},"(extracted))\n",[33,6257,6259,6261,6263],{"class":35,"line":6258},202,[33,6260,780],{"class":163},[33,6262,783],{"class":50},[33,6264,574],{"class":167},[33,6266,6268,6270,6273,6275,6278],{"class":35,"line":6267},203,[33,6269,791],{"class":167},[33,6271,6272],{"class":54},"\"Skipping ",[33,6274,309],{"class":50},[33,6276,6277],{"class":54}," after retries exhausted\"",[33,6279,801],{"class":167},[33,6281,6283],{"class":35,"line":6282},204,[33,6284,92],{"emptyLinePlaceholder":91},[33,6286,6288,6290,6292],{"class":35,"line":6287},205,[33,6289,617],{"class":163},[33,6291,620],{"class":163},[33,6293,816],{"class":167},[33,6295,6297,6299,6302,6304,6307,6309,6311],{"class":35,"line":6296},206,[33,6298,628],{"class":167},[33,6300,6301],{"class":54},"\"No tables extracted from ",[33,6303,916],{"class":50},[33,6305,6306],{"class":54}," file(s)\"",[33,6308,365],{"class":167},[33,6310,928],{"class":50},[33,6312,6313],{"class":167},"(pdf_files))\n",[33,6315,6317],{"class":35,"line":6316},207,[33,6318,646],{"class":163},[33,6320,6322],{"class":35,"line":6321},208,[33,6323,92],{"emptyLinePlaceholder":91},[33,6325,6327,6329,6331,6333,6335,6337,6339],{"class":35,"line":6326},209,[33,6328,842],{"class":167},[33,6330,242],{"class":163},[33,6332,847],{"class":167},[33,6334,850],{"class":238},[33,6336,242],{"class":163},[33,6338,855],{"class":50},[33,6340,221],{"class":167},[33,6342,6344,6347,6349,6351,6353,6355,6357,6359,6361],{"class":35,"line":6343},210,[33,6345,6346],{"class":167},"    
output_dir.mkdir(",[33,6348,869],{"class":238},[33,6350,242],{"class":163},[33,6352,855],{"class":50},[33,6354,365],{"class":167},[33,6356,878],{"class":238},[33,6358,242],{"class":163},[33,6360,855],{"class":50},[33,6362,221],{"class":167},[33,6364,6366,6369,6371,6374,6377,6379,6381],{"class":35,"line":6365},211,[33,6367,6368],{"class":167},"    today ",[33,6370,242],{"class":163},[33,6372,6373],{"class":167}," datetime.now(timezone.utc).strftime(",[33,6375,6376],{"class":54},"\"%Y%m",[33,6378,916],{"class":50},[33,6380,274],{"class":54},[33,6382,221],{"class":167},[33,6384,6386,6389,6391,6394,6396,6398,6401,6403,6406,6408],{"class":35,"line":6385},212,[33,6387,6388],{"class":167},"    out_path ",[33,6390,242],{"class":163},[33,6392,6393],{"class":167}," output_dir ",[33,6395,1351],{"class":163},[33,6397,1110],{"class":163},[33,6399,6400],{"class":54},"\"pipeline_",[33,6402,1115],{"class":50},[33,6404,6405],{"class":167},"today",[33,6407,1121],{"class":50},[33,6409,6410],{"class":54},".xlsx\"\n",[33,6412,6414,6417,6419,6421,6423],{"class":35,"line":6413},213,[33,6415,6416],{"class":167},"    combined.to_excel(out_path, ",[33,6418,897],{"class":238},[33,6420,242],{"class":163},[33,6422,902],{"class":50},[33,6424,221],{"class":167},[33,6426,6428,6430,6432,6434,6436,6438,6440,6442,6444],{"class":35,"line":6427},214,[33,6429,910],{"class":167},[33,6431,913],{"class":54},[33,6433,916],{"class":50},[33,6435,919],{"class":54},[33,6437,309],{"class":50},[33,6439,274],{"class":54},[33,6441,365],{"class":167},[33,6443,928],{"class":50},[33,6445,6446],{"class":167},"(combined), out_path)\n",[33,6448,6450],{"class":35,"line":6449},215,[33,6451,92],{"emptyLinePlaceholder":91},[33,6453,6455],{"class":35,"line":6454},216,[33,6456,92],{"emptyLinePlaceholder":91},[33,6458,6460,6462,6465,6467,6469],{"class":35,"line":6459},217,[33,6461,562],{"class":163},[33,6463,6464],{"class":46}," 
run_with_guard",[33,6466,6092],{"class":167},[33,6468,571],{"class":50},[33,6470,574],{"class":167},[33,6472,6474,6476,6478],{"class":35,"line":6473},218,[33,6475,617],{"class":163},[33,6477,620],{"class":163},[33,6479,2981],{"class":167},[33,6481,6483,6485,6488],{"class":35,"line":6482},219,[33,6484,628],{"class":167},[33,6486,6487],{"class":54},"\"Another instance is running — skipping this run\"",[33,6489,221],{"class":167},[33,6491,6493],{"class":35,"line":6492},220,[33,6494,646],{"class":163},[33,6496,6498,6500],{"class":35,"line":6497},221,[33,6499,2424],{"class":163},[33,6501,574],{"class":167},[33,6503,6505],{"class":35,"line":6504},222,[33,6506,6507],{"class":167},"        run_job(input_dir, output_dir)\n",[33,6509,6511,6513,6515,6517],{"class":35,"line":6510},223,[33,6512,2449],{"class":163},[33,6514,783],{"class":50},[33,6516,1852],{"class":163},[33,6518,1855],{"class":167},[33,6520,6522,6524,6527],{"class":35,"line":6521},224,[33,6523,2458],{"class":167},[33,6525,6526],{"class":54},"\"Job failed with unhandled exception\"",[33,6528,221],{"class":167},[33,6530,6532],{"class":35,"line":6531},225,[33,6533,6534],{"class":167},"        _alert(\n",[33,6536,6538,6541,6543,6546],{"class":35,"line":6537},226,[33,6539,6540],{"class":238},"            subject",[33,6542,242],{"class":163},[33,6544,6545],{"class":54},"\"[doc-pipeline] Job FAILED\"",[33,6547,247],{"class":167},[33,6549,6551,6554,6556,6558,6561,6563,6566,6569,6572,6575,6577,6580,6582,6585],{"class":35,"line":6550},227,[33,6552,6553],{"class":238},"            body",[33,6555,242],{"class":163},[33,6557,4059],{"class":163},[33,6559,6560],{"class":54},"\"Exception: ",[33,6562,1115],{"class":50},[33,6564,6565],{"class":167},"exc",[33,6567,6568],{"class":50},"}\\n",[33,6570,6571],{"class":54},"See ",[33,6573,6574],{"class":50},"{LOG_DIR",[33,6576,1107],{"class":163},[33,6578,6579],{"class":54}," 'pipeline.log'",[33,6581,1121],{"class":50},[33,6583,6584],{"class":54}," for 
details.\"",[33,6586,247],{"class":167},[33,6588,6590],{"class":35,"line":6589},228,[33,6591,5867],{"class":167},[33,6593,6595,6597,6599],{"class":35,"line":6594},229,[33,6596,2995],{"class":167},[33,6598,734],{"class":50},[33,6600,221],{"class":167},[33,6602,6604,6606],{"class":35,"line":6603},230,[33,6605,3018],{"class":163},[33,6607,574],{"class":167},[33,6609,6611],{"class":35,"line":6610},231,[33,6612,3025],{"class":167},[33,6614,6616],{"class":35,"line":6615},232,[33,6617,92],{"emptyLinePlaceholder":91},[33,6619,6621],{"class":35,"line":6620},233,[33,6622,6623],{"class":39},"# ── Entry point ───────────────────────────────────────────────────────────────\n",[33,6625,6627],{"class":35,"line":6626},234,[33,6628,92],{"emptyLinePlaceholder":91},[33,6630,6632,6634,6637,6639,6641],{"class":35,"line":6631},235,[33,6633,562],{"class":163},[33,6635,6636],{"class":46}," main",[33,6638,568],{"class":167},[33,6640,571],{"class":50},[33,6642,574],{"class":167},[33,6644,6646,6649,6651,6654,6657,6659,6662],{"class":35,"line":6645},236,[33,6647,6648],{"class":167},"    parser ",[33,6650,242],{"class":163},[33,6652,6653],{"class":167}," argparse.ArgumentParser(",[33,6655,6656],{"class":238},"description",[33,6658,242],{"class":163},[33,6660,6661],{"class":54},"\"Document extraction pipeline\"",[33,6663,221],{"class":167},[33,6665,6667,6670,6673,6675,6678,6680,6683,6686,6688,6690],{"class":35,"line":6666},237,[33,6668,6669],{"class":167},"    parser.add_argument(",[33,6671,6672],{"class":54},"\"--input\"",[33,6674,365],{"class":167},[33,6676,6677],{"class":238},"type",[33,6679,242],{"class":163},[33,6681,6682],{"class":167},"Path, 
",[33,6684,6685],{"class":238},"default",[33,6687,242],{"class":163},[33,6689,507],{"class":50},[33,6691,221],{"class":167},[33,6693,6695,6697,6700,6702,6704,6706,6708,6710,6712,6714],{"class":35,"line":6694},238,[33,6696,6669],{"class":167},[33,6698,6699],{"class":54},"\"--output\"",[33,6701,365],{"class":167},[33,6703,6677],{"class":238},[33,6705,242],{"class":163},[33,6707,6682],{"class":167},[33,6709,6685],{"class":238},[33,6711,242],{"class":163},[33,6713,4615],{"class":50},[33,6715,221],{"class":167},[33,6717,6719],{"class":35,"line":6718},239,[33,6720,6721],{"class":167},"    parser.add_argument(\n",[33,6723,6725,6728],{"class":35,"line":6724},240,[33,6726,6727],{"class":54},"        \"--daemon\"",[33,6729,247],{"class":167},[33,6731,6733,6736,6738,6741],{"class":35,"line":6732},241,[33,6734,6735],{"class":238},"        action",[33,6737,242],{"class":163},[33,6739,6740],{"class":54},"\"store_true\"",[33,6742,247],{"class":167},[33,6744,6746,6749,6751,6754],{"class":35,"line":6745},242,[33,6747,6748],{"class":238},"        help",[33,6750,242],{"class":163},[33,6752,6753],{"class":54},"\"Run on internal schedule (06:30 daily) instead of once\"",[33,6755,247],{"class":167},[33,6757,6759],{"class":35,"line":6758},243,[33,6760,1202],{"class":167},[33,6762,6764,6767,6769],{"class":35,"line":6763},244,[33,6765,6766],{"class":167},"    args ",[33,6768,242],{"class":163},[33,6770,6771],{"class":167}," parser.parse_args()\n",[33,6773,6775],{"class":35,"line":6774},245,[33,6776,92],{"emptyLinePlaceholder":91},[33,6778,6780,6782],{"class":35,"line":6779},246,[33,6781,617],{"class":163},[33,6783,6784],{"class":167}," args.daemon:\n",[33,6786,6788,6790,6793],{"class":35,"line":6787},247,[33,6789,2439],{"class":167},[33,6791,6792],{"class":54},"\"Daemon mode — scheduling daily at 06:30\"",[33,6794,221],{"class":167},[33,6796,6798,6801,6803],{"class":35,"line":6797},248,[33,6799,6800],{"class":167},"        
schedule.every().day.at(",[33,6802,2479],{"class":54},[33,6804,6805],{"class":167},").do(\n",[33,6807,6809,6812,6815,6817,6820,6823,6825],{"class":35,"line":6808},249,[33,6810,6811],{"class":167},"            run_with_guard, ",[33,6813,6814],{"class":238},"input_dir",[33,6816,242],{"class":163},[33,6818,6819],{"class":167},"args.input, ",[33,6821,6822],{"class":238},"output_dir",[33,6824,242],{"class":163},[33,6826,6827],{"class":167},"args.output\n",[33,6829,6831],{"class":35,"line":6830},250,[33,6832,5867],{"class":167},[33,6834,6836,6839,6841],{"class":35,"line":6835},251,[33,6837,6838],{"class":163},"        while",[33,6840,2519],{"class":50},[33,6842,574],{"class":167},[33,6844,6846],{"class":35,"line":6845},252,[33,6847,6848],{"class":167},"            schedule.run_pending()\n",[33,6850,6852,6855,6857],{"class":35,"line":6851},253,[33,6853,6854],{"class":167},"            time.sleep(",[33,6856,1543],{"class":50},[33,6858,221],{"class":167},[33,6860,6862,6865],{"class":35,"line":6861},254,[33,6863,6864],{"class":163},"    else",[33,6866,574],{"class":167},[33,6868,6870,6873,6875,6877,6879,6881,6883],{"class":35,"line":6869},255,[33,6871,6872],{"class":167},"        run_with_guard(",[33,6874,6814],{"class":238},[33,6876,242],{"class":163},[33,6878,6819],{"class":167},[33,6880,6822],{"class":238},[33,6882,242],{"class":163},[33,6884,6885],{"class":167},"args.output)\n",[33,6887,6889],{"class":35,"line":6888},256,[33,6890,92],{"emptyLinePlaceholder":91},[33,6892,6894],{"class":35,"line":6893},257,[33,6895,92],{"emptyLinePlaceholder":91},[33,6897,6899,6901,6903,6905,6907],{"class":35,"line":6898},258,[33,6900,2491],{"class":163},[33,6902,2494],{"class":50},[33,6904,2497],{"class":163},[33,6906,2500],{"class":54},[33,6908,574],{"class":167},[33,6910,6912],{"class":35,"line":6911},259,[33,6913,6914],{"class":167},"    main()\n",[18,6916,6918],{"id":6917},"related","Related",[4211,6920,6921,6926,6931,6938],{},[4214,6922,6923,6925],{},[940,6924,948],{"href":947}," — 
the upstream step that feeds raw tables into the pipeline this guide schedules",[4214,6927,6928,6930],{},[940,6929,4204],{"href":4203}," — downstream step: turn pipeline output into formatted Excel or PDF reports",[4214,6932,6933,6937],{},[940,6934,6936],{"href":6935},"\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002F","Automating Excel Report Generation"," — production scheduling and logging patterns for Excel-specific pipelines",[4214,6939,6940,6944],{},[940,6941,6943],{"href":6942},"\u002Fautomating-pdf-extraction-generation\u002F","Automating PDF Extraction & Generation"," — covers the production-hardening section of the PDF extraction workflow",[14,6946,6947,6948,3035],{},"Part of ",[940,6949,6951],{"href":6950},"\u002Fautomating-document-data-pipelines\u002F","Automating Document & Data Pipelines",[6953,6954,6955],"style",{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .shJU0, html code.shiki 
.shJU0{--shiki-default:#22863A}",{"title":28,"searchDepth":43,"depth":43,"links":6957},[6958,6959,6960,6966,6971,6972,6973,6974,6975],{"id":20,"depth":43,"text":21},{"id":115,"depth":43,"text":116},{"id":421,"depth":43,"text":422,"children":6961},[6962,6963,6964,6965],{"id":426,"depth":61,"text":427},{"id":952,"depth":61,"text":953},{"id":1358,"depth":61,"text":1359},{"id":1966,"depth":61,"text":1967},{"id":2708,"depth":43,"text":2709,"children":6967},[6968,6969,6970],{"id":2712,"depth":61,"text":2713},{"id":3038,"depth":61,"text":3039},{"id":3589,"depth":61,"text":3590},{"id":3938,"depth":43,"text":3939},{"id":4208,"depth":43,"text":4209},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":6917,"depth":43,"text":6918},"Scheduling & Logging",null,"2026-06-18","Run document and data pipelines unattended with cron, Windows Task Scheduler, GitHub Actions, and the schedule library; add structured logging, retries, and failure alerts.",false,"md",{},"\u002Fautomating-document-data-pipelines\u002Fscheduling-and-logging-automation-jobs",{"title":5,"description":6979},"Scheduling and Logging Python Automation 
Jobs","automating-document-data-pipelines\u002Fscheduling-and-logging-automation-jobs\u002Findex",[47,6988,959,6989,6990],"scheduling","automation","devops","SOYX7iHrvUapZh7zjIKP2nP9NAsWe7uMhr74mBa_kjw",[6993,9634,16141,26234,29291,34951,36793,41436,43146,46398,49146,53850,57589,59428,64805,68018,71111,75764,79642,81741,86105,88647,92949,95758,95778,99617,102112,107423,110383,112818,117975,120157,125354,128338,131227,133935,138208,139800,142391,147044,149542,156150,161323,163896,166429,170117,172224,176325,179033,181624],{"id":6994,"title":6995,"body":6996,"breadcrumbTitle":9623,"canonical":6977,"date":6978,"description":9624,"draft":6980,"extension":6981,"image":6977,"meta":9625,"navigation":91,"path":9626,"robots":6977,"seo":9627,"seoTitle":6995,"stem":9628,"tags":9629,"updatedAt":6978,"__hash__":9633},"content\u002Fautomating-document-data-pipelines\u002Fextracting-pdf-data-into-pandas\u002Fhandle-multi-page-pdf-tables-in-pandas\u002Findex.md","Handle Multi-Page PDF Tables in pandas",{"type":7,"value":6997,"toc":9615},[6998,7001,7015,7018,7022,7032,7287,7290,7294,8344,8347,8379,8383,8386,8865,8868,8872,8886,9244,9248,9251,9572,9579,9581,9608,9612],[10,6999,6995],{"id":7000},"handle-multi-page-pdf-tables-in-pandas",[14,7002,7003,7004,7006,7007,7010,7011,7014],{},"A table that spans multiple PDF pages produces a repeated header row at the top of each new page. When you extract per-page DataFrames with ",[940,7005,943],{"href":942}," or camelot and call ",[30,7008,7009],{},"pd.concat()"," without filtering, those header strings appear as data rows scattered through the combined DataFrame. 
Downstream operations — ",[30,7012,7013],{},"pd.to_numeric",", groupby, any dtype coercion — fail or silently produce NaN wherever a header row landed.",[14,7016,7017],{},"The same symptom has two other causes: a page break splits a single row across two pages (the row appears fragmented in both frames), and inconsistent column counts across pages (some pages export N columns, others N+1 due to a merged-cell footnote column). Each variant needs a slightly different fix.",[18,7019,7021],{"id":7020},"root-cause","Root Cause",[14,7023,7024,7025,365,7028,7031],{},"PDF table extractors treat each page independently. They do not know that pages 1 through 8 hold one logical table. Each page's extracted frame has its own header row — usually promoted from the first data row extracted on that page. When those frames are concatenated, the header strings (",[30,7026,7027],{},"\"Date\"",[30,7029,7030],{},"\"Amount\"",", etc.) become ordinary string values in whatever rows they occupy.",[23,7033,7035],{"className":126,"code":7034,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\n\ndef naive_concat(path: Path) -> pd.DataFrame:\n    \"\"\"Demonstrates the duplicated-header problem — do not use as-is.\"\"\"\n    frames = []\n    with pdfplumber.open(path) as pdf:\n        for page in pdf.pages:\n            raw = page.extract_tables()\n            for tbl in (raw or []):\n                if tbl and len(tbl) > 1:\n                    # First raw row becomes columns; already promoted\n                    frames.append(pd.DataFrame(tbl[1:], columns=tbl[0]))\n    return pd.concat(frames, ignore_index=True)\n\n\nif __name__ == \"__main__\":\n    df = naive_concat(PDF_PATH)\n    # Spot the header-as-data problem:\n    print(df[df[\"Amount\"] == \"Amount\"])   # should return empty; if not, headers 
leaked\n",[30,7036,7037,7042,7052,7058,7068,7072,7086,7090,7094,7104,7109,7117,7128,7138,7148,7166,7186,7191,7212,7226,7230,7234,7246,7259,7264],{"__ignoreMap":28},[33,7038,7039],{"class":35,"line":36},[33,7040,7041],{"class":39},"# pip install pdfplumber pandas\n",[33,7043,7044,7046,7048,7050],{"class":35,"line":43},[33,7045,190],{"class":163},[33,7047,193],{"class":167},[33,7049,164],{"class":163},[33,7051,198],{"class":167},[33,7053,7054,7056],{"class":35,"line":61},[33,7055,164],{"class":163},[33,7057,485],{"class":167},[33,7059,7060,7062,7064,7066],{"class":35,"line":73},[33,7061,164],{"class":163},[33,7063,492],{"class":167},[33,7065,495],{"class":163},[33,7067,498],{"class":167},[33,7069,7070],{"class":35,"line":88},[33,7071,92],{"emptyLinePlaceholder":91},[33,7073,7074,7077,7079,7081,7084],{"class":35,"line":95},[33,7075,7076],{"class":50},"PDF_PATH",[33,7078,212],{"class":163},[33,7080,215],{"class":167},[33,7082,7083],{"class":54},"\"data\u002Freport.pdf\"",[33,7085,221],{"class":167},[33,7087,7088],{"class":35,"line":101},[33,7089,92],{"emptyLinePlaceholder":91},[33,7091,7092],{"class":35,"line":171},[33,7093,92],{"emptyLinePlaceholder":91},[33,7095,7096,7098,7101],{"class":35,"line":179},[33,7097,562],{"class":163},[33,7099,7100],{"class":46}," naive_concat",[33,7102,7103],{"class":167},"(path: Path) -> pd.DataFrame:\n",[33,7105,7106],{"class":35,"line":187},[33,7107,7108],{"class":54},"    \"\"\"Demonstrates the duplicated-header problem — do not use as-is.\"\"\"\n",[33,7110,7111,7113,7115],{"class":35,"line":201},[33,7112,584],{"class":167},[33,7114,242],{"class":163},[33,7116,589],{"class":167},[33,7118,7119,7121,7124,7126],{"class":35,"line":206},[33,7120,1635],{"class":163},[33,7122,7123],{"class":167}," pdfplumber.open(path) 
",[33,7125,495],{"class":163},[33,7127,686],{"class":167},[33,7129,7130,7132,7134,7136],{"class":35,"line":224},[33,7131,5973],{"class":163},[33,7133,695],{"class":167},[33,7135,662],{"class":163},[33,7137,700],{"class":167},[33,7139,7140,7143,7145],{"class":35,"line":229},[33,7141,7142],{"class":167},"            raw ",[33,7144,242],{"class":163},[33,7146,7147],{"class":167}," page.extract_tables()\n",[33,7149,7150,7152,7155,7157,7160,7163],{"class":35,"line":235},[33,7151,1793],{"class":163},[33,7153,7154],{"class":167}," tbl ",[33,7156,662],{"class":163},[33,7158,7159],{"class":167}," (raw ",[33,7161,7162],{"class":163},"or",[33,7164,7165],{"class":167}," []):\n",[33,7167,7168,7171,7173,7175,7177,7180,7182,7184],{"class":35,"line":250},[33,7169,7170],{"class":163},"                if",[33,7172,7154],{"class":167},[33,7174,6001],{"class":163},[33,7176,4037],{"class":50},[33,7178,7179],{"class":167},"(tbl) ",[33,7181,6009],{"class":163},[33,7183,1814],{"class":50},[33,7185,574],{"class":167},[33,7187,7188],{"class":35,"line":266},[33,7189,7190],{"class":39},"                    # First raw row becomes columns; already promoted\n",[33,7192,7193,7196,7198,7200,7202,7204,7207,7209],{"class":35,"line":290},[33,7194,7195],{"class":167},"                    
frames.append(pd.DataFrame(tbl[",[33,7197,734],{"class":50},[33,7199,737],{"class":167},[33,7201,740],{"class":238},[33,7203,242],{"class":163},[33,7205,7206],{"class":167},"tbl[",[33,7208,748],{"class":50},[33,7210,7211],{"class":167},"]))\n",[33,7213,7214,7216,7218,7220,7222,7224],{"class":35,"line":295},[33,7215,1332],{"class":163},[33,7217,847],{"class":167},[33,7219,850],{"class":238},[33,7221,242],{"class":163},[33,7223,855],{"class":50},[33,7225,221],{"class":167},[33,7227,7228],{"class":35,"line":300},[33,7229,92],{"emptyLinePlaceholder":91},[33,7231,7232],{"class":35,"line":317},[33,7233,92],{"emptyLinePlaceholder":91},[33,7235,7236,7238,7240,7242,7244],{"class":35,"line":332},[33,7237,2491],{"class":163},[33,7239,2494],{"class":50},[33,7241,2497],{"class":163},[33,7243,2500],{"class":54},[33,7245,574],{"class":167},[33,7247,7248,7250,7252,7255,7257],{"class":35,"line":347},[33,7249,4025],{"class":167},[33,7251,242],{"class":163},[33,7253,7254],{"class":167}," naive_concat(",[33,7256,7076],{"class":50},[33,7258,221],{"class":167},[33,7260,7261],{"class":35,"line":374},[33,7262,7263],{"class":39},"    # Spot the header-as-data problem:\n",[33,7265,7266,7269,7272,7274,7276,7278,7281,7284],{"class":35,"line":397},[33,7267,7268],{"class":50},"    print",[33,7270,7271],{"class":167},"(df[df[",[33,7273,7030],{"class":54},[33,7275,763],{"class":167},[33,7277,1865],{"class":163},[33,7279,7280],{"class":54}," \"Amount\"",[33,7282,7283],{"class":167},"])   ",[33,7285,7286],{"class":39},"# should return empty; if not, headers leaked\n",[14,7288,7289],{},"Run the diagnostic above first. 
If the filter returns rows, repeated headers are present and need to be removed before any further processing.",[18,7291,7293],{"id":7292},"fix-drop-repeated-header-rows-standardize-columns-and-concat","Fix: Drop Repeated Header Rows, Standardize Columns, and Concat",[23,7295,7297],{"className":126,"code":7296,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\n\ndef extract_frames(path: Path) -> list[pd.DataFrame]:\n    \"\"\"Extract one DataFrame per page table. First raw row → column names.\"\"\"\n    frames: list[pd.DataFrame] = []\n    try:\n        with pdfplumber.open(path) as pdf:\n            for page_num, page in enumerate(pdf.pages, start=1):\n                for raw in (page.extract_tables() or []):\n                    if not raw or len(raw) \u003C 2:\n                        continue\n                    # Normalize header: strip whitespace, replace None\n                    header = [\n                        str(c).strip() if c else f\"col_{i}\"\n                        for i, c in enumerate(raw[0])\n                    ]\n                    df = pd.DataFrame(raw[1:], columns=header)\n                    df[\"_page\"] = page_num   # audit column — drop before export\n                    frames.append(df)\n    except Exception as e:\n        raise RuntimeError(f\"Extraction failed: {e}\") from e\n    return frames\n\n\ndef drop_repeated_headers(\n    frames: list[pd.DataFrame],\n    canonical_cols: list[str] | None = None,\n) -> pd.DataFrame:\n    \"\"\"\n    Remove rows where every non-audit cell matches its column name.\n    Standardize all frames to the canonical column set before concat.\n\n    Args:\n        frames:         List of per-page DataFrames from extract_frames().\n        canonical_cols: Expected column names. 
Defaults to the first frame's columns.\n    \"\"\"\n    if not frames:\n        return pd.DataFrame()\n\n    # Determine canonical columns from first frame if not provided\n    non_audit = [c for c in frames[0].columns if not c.startswith(\"_\")]\n    if canonical_cols is None:\n        canonical_cols = non_audit\n\n    cleaned: list[pd.DataFrame] = []\n    for df in frames:\n        # Drop rows where every data cell equals its column name (repeated header)\n        data_cols = [c for c in df.columns if not c.startswith(\"_\")]\n        header_mask = df[data_cols].apply(\n            lambda row: all(             # True if every cell is the column name\n                str(v).strip() == str(col).strip()\n                for v, col in zip(row, data_cols)\n            ),\n            axis=1,\n        )\n        df = df[~header_mask].copy()    # remove header rows\n\n        # Standardize to canonical column set; fill missing columns with NaN\n        audit = [c for c in df.columns if c.startswith(\"_\")]\n        for col in canonical_cols:\n            if col not in df.columns:\n                df[col] = pd.NA         # add missing column as empty\n        df = df[canonical_cols + audit] # reorder to canonical + audit\n        cleaned.append(df)\n\n    # ignore_index=True re-sequences the integer index across all pages\n    combined = pd.concat(cleaned, ignore_index=True)\n\n    # Forward-fill group-label columns (merged cells appear only on first data row)\n    combined = combined.ffill()\n\n    return combined\n\n\nif __name__ == \"__main__\":\n    frames = extract_frames(PDF_PATH)\n    print(f\"Extracted {len(frames)} page frame(s)\")\n\n    combined = drop_repeated_headers(frames)\n    print(f\"Combined shape: {combined.shape}\")\n    print(combined.dtypes)\n\n    # Verify no header strings leaked\n    for col in combined.columns:\n        if not col.startswith(\"_\"):\n            leaks = combined[combined[col].astype(str).str.strip() == col]\n            if not 
leaks.empty:\n                print(f\"Warning: header leak in column '{col}': {len(leaks)} row(s)\")\n\n    combined.drop(columns=[\"_page\"], errors=\"ignore\").to_csv(\n        \"output\u002Fcombined.csv\", index=False\n    )\n",[30,7298,7299,7303,7313,7319,7329,7333,7345,7349,7353,7363,7368,7376,7382,7392,7416,7432,7454,7459,7464,7474,7505,7524,7529,7550,7568,7573,7584,7614,7620,7624,7628,7638,7643,7664,7669,7674,7679,7684,7688,7693,7698,7703,7707,7715,7722,7726,7731,7768,7781,7791,7795,7804,7815,7820,7848,7858,7875,7891,7906,7911,7922,7926,7945,7949,7954,7979,7991,8006,8022,8039,8044,8048,8053,8070,8074,8079,8088,8092,8099,8103,8107,8119,8132,8155,8159,8168,8190,8197,8201,8206,8217,8231,8251,8260,8294,8298,8326,8340],{"__ignoreMap":28},[33,7300,7301],{"class":35,"line":36},[33,7302,7041],{"class":39},[33,7304,7305,7307,7309,7311],{"class":35,"line":43},[33,7306,190],{"class":163},[33,7308,193],{"class":167},[33,7310,164],{"class":163},[33,7312,198],{"class":167},[33,7314,7315,7317],{"class":35,"line":61},[33,7316,164],{"class":163},[33,7318,485],{"class":167},[33,7320,7321,7323,7325,7327],{"class":35,"line":73},[33,7322,164],{"class":163},[33,7324,492],{"class":167},[33,7326,495],{"class":163},[33,7328,498],{"class":167},[33,7330,7331],{"class":35,"line":88},[33,7332,92],{"emptyLinePlaceholder":91},[33,7334,7335,7337,7339,7341,7343],{"class":35,"line":95},[33,7336,7076],{"class":50},[33,7338,212],{"class":163},[33,7340,215],{"class":167},[33,7342,7083],{"class":54},[33,7344,221],{"class":167},[33,7346,7347],{"class":35,"line":101},[33,7348,92],{"emptyLinePlaceholder":91},[33,7350,7351],{"class":35,"line":171},[33,7352,92],{"emptyLinePlaceholder":91},[33,7354,7355,7357,7360],{"class":35,"line":179},[33,7356,562],{"class":163},[33,7358,7359],{"class":46}," extract_frames",[33,7361,7362],{"class":167},"(path: Path) -> list[pd.DataFrame]:\n",[33,7364,7365],{"class":35,"line":187},[33,7366,7367],{"class":54},"    \"\"\"Extract one DataFrame per page table. 
First raw row → column names.\"\"\"\n",[33,7369,7370,7372,7374],{"class":35,"line":201},[33,7371,6183],{"class":167},[33,7373,242],{"class":163},[33,7375,589],{"class":167},[33,7377,7378,7380],{"class":35,"line":206},[33,7379,2424],{"class":163},[33,7381,574],{"class":167},[33,7383,7384,7386,7388,7390],{"class":35,"line":224},[33,7385,2191],{"class":163},[33,7387,7123],{"class":167},[33,7389,495],{"class":163},[33,7391,686],{"class":167},[33,7393,7394,7396,7399,7401,7404,7407,7410,7412,7414],{"class":35,"line":229},[33,7395,1793],{"class":163},[33,7397,7398],{"class":167}," page_num, page ",[33,7400,662],{"class":163},[33,7402,7403],{"class":50}," enumerate",[33,7405,7406],{"class":167},"(pdf.pages, ",[33,7408,7409],{"class":238},"start",[33,7411,242],{"class":163},[33,7413,734],{"class":50},[33,7415,1737],{"class":167},[33,7417,7418,7420,7423,7425,7428,7430],{"class":35,"line":235},[33,7419,692],{"class":163},[33,7421,7422],{"class":167}," raw ",[33,7424,662],{"class":163},[33,7426,7427],{"class":167}," (page.extract_tables() ",[33,7429,7162],{"class":163},[33,7431,7165],{"class":167},[33,7433,7434,7436,7438,7440,7442,7444,7447,7449,7452],{"class":35,"line":250},[33,7435,717],{"class":163},[33,7437,620],{"class":163},[33,7439,7422],{"class":167},[33,7441,7162],{"class":163},[33,7443,4037],{"class":50},[33,7445,7446],{"class":167},"(raw) ",[33,7448,4043],{"class":163},[33,7450,7451],{"class":50}," 2",[33,7453,574],{"class":167},[33,7455,7456],{"class":35,"line":266},[33,7457,7458],{"class":163},"                        continue\n",[33,7460,7461],{"class":35,"line":290},[33,7462,7463],{"class":39},"                    # Normalize header: strip whitespace, replace None\n",[33,7465,7466,7469,7471],{"class":35,"line":295},[33,7467,7468],{"class":167},"                    header ",[33,7470,242],{"class":163},[33,7472,7473],{"class":167}," [\n",[33,7475,7476,7479,7482,7484,7487,7490,7492,7495,7497,7500,7502],{"class":35,"line":300},[33,7477,7478],{"class":50},"           
             str",[33,7480,7481],{"class":167},"(c).strip() ",[33,7483,2491],{"class":163},[33,7485,7486],{"class":167}," c ",[33,7488,7489],{"class":163},"else",[33,7491,1110],{"class":163},[33,7493,7494],{"class":54},"\"col_",[33,7496,1115],{"class":50},[33,7498,7499],{"class":167},"i",[33,7501,1121],{"class":50},[33,7503,7504],{"class":54},"\"\n",[33,7506,7507,7510,7513,7515,7517,7520,7522],{"class":35,"line":317},[33,7508,7509],{"class":163},"                        for",[33,7511,7512],{"class":167}," i, c ",[33,7514,662],{"class":163},[33,7516,7403],{"class":50},[33,7518,7519],{"class":167},"(raw[",[33,7521,748],{"class":50},[33,7523,751],{"class":167},[33,7525,7526],{"class":35,"line":332},[33,7527,7528],{"class":167},"                    ]\n",[33,7530,7531,7534,7536,7539,7541,7543,7545,7547],{"class":35,"line":347},[33,7532,7533],{"class":167},"                    df ",[33,7535,242],{"class":163},[33,7537,7538],{"class":167}," pd.DataFrame(raw[",[33,7540,734],{"class":50},[33,7542,737],{"class":167},[33,7544,740],{"class":238},[33,7546,242],{"class":163},[33,7548,7549],{"class":167},"header)\n",[33,7551,7552,7555,7558,7560,7562,7565],{"class":35,"line":374},[33,7553,7554],{"class":167},"                    df[",[33,7556,7557],{"class":54},"\"_page\"",[33,7559,763],{"class":167},[33,7561,242],{"class":163},[33,7563,7564],{"class":167}," page_num   ",[33,7566,7567],{"class":39},"# audit column — drop before export\n",[33,7569,7570],{"class":35,"line":397},[33,7571,7572],{"class":167},"                    frames.append(df)\n",[33,7574,7575,7577,7579,7581],{"class":35,"line":653},[33,7576,2449],{"class":163},[33,7578,783],{"class":50},[33,7580,1852],{"class":163},[33,7582,7583],{"class":167}," e:\n",[33,7585,7586,7588,7591,7593,7595,7598,7600,7603,7605,7607,7609,7611],{"class":35,"line":667},[33,7587,4051],{"class":163},[33,7589,7590],{"class":50}," RuntimeError",[33,7592,602],{"class":167},[33,7594,4059],{"class":163},[33,7596,7597],{"class":54},"\"Extraction 
failed: ",[33,7599,1115],{"class":50},[33,7601,7602],{"class":167},"e",[33,7604,1121],{"class":50},[33,7606,274],{"class":54},[33,7608,1649],{"class":167},[33,7610,190],{"class":163},[33,7612,7613],{"class":167}," e\n",[33,7615,7616,7618],{"class":35,"line":675},[33,7617,1332],{"class":163},[33,7619,6065],{"class":167},[33,7621,7622],{"class":35,"line":689},[33,7623,92],{"emptyLinePlaceholder":91},[33,7625,7626],{"class":35,"line":703},[33,7627,92],{"emptyLinePlaceholder":91},[33,7629,7630,7632,7635],{"class":35,"line":714},[33,7631,562],{"class":163},[33,7633,7634],{"class":46}," drop_repeated_headers",[33,7636,7637],{"class":167},"(\n",[33,7639,7640],{"class":35,"line":723},[33,7641,7642],{"class":167},"    frames: list[pd.DataFrame],\n",[33,7644,7645,7648,7650,7652,7655,7658,7660,7662],{"class":35,"line":754},[33,7646,7647],{"class":167},"    canonical_cols: list[",[33,7649,1053],{"class":50},[33,7651,763],{"class":167},[33,7653,7654],{"class":163},"|",[33,7656,7657],{"class":50}," None",[33,7659,212],{"class":163},[33,7661,7657],{"class":50},[33,7663,247],{"class":167},[33,7665,7666],{"class":35,"line":771},[33,7667,7668],{"class":167},") -> pd.DataFrame:\n",[33,7670,7671],{"class":35,"line":777},[33,7672,7673],{"class":54},"    \"\"\"\n",[33,7675,7676],{"class":35,"line":788},[33,7677,7678],{"class":54},"    Remove rows where every non-audit cell matches its column name.\n",[33,7680,7681],{"class":35,"line":804},[33,7682,7683],{"class":54},"    Standardize all frames to the canonical column set before concat.\n",[33,7685,7686],{"class":35,"line":809},[33,7687,92],{"emptyLinePlaceholder":91},[33,7689,7690],{"class":35,"line":819},[33,7691,7692],{"class":54},"    Args:\n",[33,7694,7695],{"class":35,"line":829},[33,7696,7697],{"class":54},"        frames:         List of per-page DataFrames from extract_frames().\n",[33,7699,7700],{"class":35,"line":834},[33,7701,7702],{"class":54},"        canonical_cols: Expected column names. 
Defaults to the first frame's columns.\n",[33,7704,7705],{"class":35,"line":839},[33,7706,7673],{"class":54},[33,7708,7709,7711,7713],{"class":35,"line":860},[33,7710,617],{"class":163},[33,7712,620],{"class":163},[33,7714,816],{"class":167},[33,7716,7717,7719],{"class":35,"line":887},[33,7718,1659],{"class":163},[33,7720,7721],{"class":167}," pd.DataFrame()\n",[33,7723,7724],{"class":35,"line":907},[33,7725,92],{"emptyLinePlaceholder":91},[33,7727,7728],{"class":35,"line":1826},[33,7729,7730],{"class":39},"    # Determine canonical columns from first frame if not provided\n",[33,7732,7733,7736,7738,7741,7743,7745,7747,7750,7752,7755,7757,7759,7762,7765],{"class":35,"line":1844},[33,7734,7735],{"class":167},"    non_audit ",[33,7737,242],{"class":163},[33,7739,7740],{"class":167}," [c ",[33,7742,6124],{"class":163},[33,7744,7486],{"class":167},[33,7746,662],{"class":163},[33,7748,7749],{"class":167}," frames[",[33,7751,748],{"class":50},[33,7753,7754],{"class":167},"].columns ",[33,7756,2491],{"class":163},[33,7758,620],{"class":163},[33,7760,7761],{"class":167}," c.startswith(",[33,7763,7764],{"class":54},"\"_\"",[33,7766,7767],{"class":167},")]\n",[33,7769,7770,7772,7775,7777,7779],{"class":35,"line":1858},[33,7771,617],{"class":163},[33,7773,7774],{"class":167}," canonical_cols ",[33,7776,3847],{"class":163},[33,7778,7657],{"class":50},[33,7780,574],{"class":167},[33,7782,7783,7786,7788],{"class":35,"line":1871},[33,7784,7785],{"class":167},"        canonical_cols ",[33,7787,242],{"class":163},[33,7789,7790],{"class":167}," non_audit\n",[33,7792,7793],{"class":35,"line":1877},[33,7794,92],{"emptyLinePlaceholder":91},[33,7796,7797,7800,7802],{"class":35,"line":1883},[33,7798,7799],{"class":167},"    cleaned: list[pd.DataFrame] ",[33,7801,242],{"class":163},[33,7803,589],{"class":167},[33,7805,7806,7808,7811,7813],{"class":35,"line":1915},[33,7807,656],{"class":163},[33,7809,7810],{"class":167}," df 
",[33,7812,662],{"class":163},[33,7814,816],{"class":167},[33,7816,7817],{"class":35,"line":1926},[33,7818,7819],{"class":39},"        # Drop rows where every data cell equals its column name (repeated header)\n",[33,7821,7822,7825,7827,7829,7831,7833,7835,7838,7840,7842,7844,7846],{"class":35,"line":1932},[33,7823,7824],{"class":167},"        data_cols ",[33,7826,242],{"class":163},[33,7828,7740],{"class":167},[33,7830,6124],{"class":163},[33,7832,7486],{"class":167},[33,7834,662],{"class":163},[33,7836,7837],{"class":167}," df.columns ",[33,7839,2491],{"class":163},[33,7841,620],{"class":163},[33,7843,7761],{"class":167},[33,7845,7764],{"class":54},[33,7847,7767],{"class":167},[33,7849,7850,7853,7855],{"class":35,"line":1938},[33,7851,7852],{"class":167},"        header_mask ",[33,7854,242],{"class":163},[33,7856,7857],{"class":167}," df[data_cols].apply(\n",[33,7859,7860,7863,7866,7869,7872],{"class":35,"line":1950},[33,7861,7862],{"class":163},"            lambda",[33,7864,7865],{"class":167}," row: ",[33,7867,7868],{"class":50},"all",[33,7870,7871],{"class":167},"(             ",[33,7873,7874],{"class":39},"# True if every cell is the column name\n",[33,7876,7877,7880,7883,7885,7888],{"class":35,"line":1958},[33,7878,7879],{"class":50},"                str",[33,7881,7882],{"class":167},"(v).strip() ",[33,7884,1865],{"class":163},[33,7886,7887],{"class":50}," str",[33,7889,7890],{"class":167},"(col).strip()\n",[33,7892,7893,7895,7898,7900,7903],{"class":35,"line":4904},[33,7894,692],{"class":163},[33,7896,7897],{"class":167}," v, col ",[33,7899,662],{"class":163},[33,7901,7902],{"class":50}," zip",[33,7904,7905],{"class":167},"(row, data_cols)\n",[33,7907,7908],{"class":35,"line":4909},[33,7909,7910],{"class":167},"            ),\n",[33,7912,7913,7916,7918,7920],{"class":35,"line":4915},[33,7914,7915],{"class":238},"            
axis",[33,7917,242],{"class":163},[33,7919,734],{"class":50},[33,7921,247],{"class":167},[33,7923,7924],{"class":35,"line":4925},[33,7925,5867],{"class":167},[33,7927,7928,7931,7933,7936,7939,7942],{"class":35,"line":4935},[33,7929,7930],{"class":167},"        df ",[33,7932,242],{"class":163},[33,7934,7935],{"class":167}," df[",[33,7937,7938],{"class":163},"~",[33,7940,7941],{"class":167},"header_mask].copy()    ",[33,7943,7944],{"class":39},"# remove header rows\n",[33,7946,7947],{"class":35,"line":4941},[33,7948,92],{"emptyLinePlaceholder":91},[33,7950,7951],{"class":35,"line":4950},[33,7952,7953],{"class":39},"        # Standardize to canonical column set; fill missing columns with NaN\n",[33,7955,7956,7959,7961,7963,7965,7967,7969,7971,7973,7975,7977],{"class":35,"line":4960},[33,7957,7958],{"class":167},"        audit ",[33,7960,242],{"class":163},[33,7962,7740],{"class":167},[33,7964,6124],{"class":163},[33,7966,7486],{"class":167},[33,7968,662],{"class":163},[33,7970,7837],{"class":167},[33,7972,2491],{"class":163},[33,7974,7761],{"class":167},[33,7976,7764],{"class":54},[33,7978,7767],{"class":167},[33,7980,7981,7983,7986,7988],{"class":35,"line":4965},[33,7982,5973],{"class":163},[33,7984,7985],{"class":167}," col ",[33,7987,662],{"class":163},[33,7989,7990],{"class":167}," canonical_cols:\n",[33,7992,7993,7995,7997,8000,8003],{"class":35,"line":4971},[33,7994,5995],{"class":163},[33,7996,7985],{"class":167},[33,7998,7999],{"class":163},"not",[33,8001,8002],{"class":163}," in",[33,8004,8005],{"class":167}," df.columns:\n",[33,8007,8008,8011,8013,8016,8019],{"class":35,"line":4983},[33,8009,8010],{"class":167},"                df[col] ",[33,8012,242],{"class":163},[33,8014,8015],{"class":167}," pd.",[33,8017,8018],{"class":50},"NA",[33,8020,8021],{"class":39},"         # add missing column as empty\n",[33,8023,8024,8026,8028,8031,8033,8036],{"class":35,"line":4988},[33,8025,7930],{"class":167},[33,8027,242],{"class":163},[33,8029,8030],{"class":167}," 
df[canonical_cols ",[33,8032,1811],{"class":163},[33,8034,8035],{"class":167}," audit] ",[33,8037,8038],{"class":39},"# reorder to canonical + audit\n",[33,8040,8041],{"class":35,"line":4993},[33,8042,8043],{"class":167},"        cleaned.append(df)\n",[33,8045,8046],{"class":35,"line":5003},[33,8047,92],{"emptyLinePlaceholder":91},[33,8049,8050],{"class":35,"line":5008},[33,8051,8052],{"class":39},"    # ignore_index=True re-sequences the integer index across all pages\n",[33,8054,8055,8057,8059,8062,8064,8066,8068],{"class":35,"line":5014},[33,8056,842],{"class":167},[33,8058,242],{"class":163},[33,8060,8061],{"class":167}," pd.concat(cleaned, ",[33,8063,850],{"class":238},[33,8065,242],{"class":163},[33,8067,855],{"class":50},[33,8069,221],{"class":167},[33,8071,8072],{"class":35,"line":5019},[33,8073,92],{"emptyLinePlaceholder":91},[33,8075,8076],{"class":35,"line":5032},[33,8077,8078],{"class":39},"    # Forward-fill group-label columns (merged cells appear only on first data row)\n",[33,8080,8081,8083,8085],{"class":35,"line":5039},[33,8082,842],{"class":167},[33,8084,242],{"class":163},[33,8086,8087],{"class":167}," combined.ffill()\n",[33,8089,8090],{"class":35,"line":5068},[33,8091,92],{"emptyLinePlaceholder":91},[33,8093,8094,8096],{"class":35,"line":5077},[33,8095,1332],{"class":163},[33,8097,8098],{"class":167}," combined\n",[33,8100,8101],{"class":35,"line":5082},[33,8102,92],{"emptyLinePlaceholder":91},[33,8104,8105],{"class":35,"line":5089},[33,8106,92],{"emptyLinePlaceholder":91},[33,8108,8109,8111,8113,8115,8117],{"class":35,"line":5098},[33,8110,2491],{"class":163},[33,8112,2494],{"class":50},[33,8114,2497],{"class":163},[33,8116,2500],{"class":54},[33,8118,574],{"class":167},[33,8120,8121,8123,8125,8128,8130],{"class":35,"line":5105},[33,8122,584],{"class":167},[33,8124,242],{"class":163},[33,8126,8127],{"class":167}," 
extract_frames(",[33,8129,7076],{"class":50},[33,8131,221],{"class":167},[33,8133,8134,8136,8138,8140,8143,8145,8148,8150,8153],{"class":35,"line":5110},[33,8135,7268],{"class":50},[33,8137,602],{"class":167},[33,8139,4059],{"class":163},[33,8141,8142],{"class":54},"\"Extracted ",[33,8144,4065],{"class":50},[33,8146,8147],{"class":167},"(frames)",[33,8149,1121],{"class":50},[33,8151,8152],{"class":54}," page frame(s)\"",[33,8154,221],{"class":167},[33,8156,8157],{"class":35,"line":5115},[33,8158,92],{"emptyLinePlaceholder":91},[33,8160,8161,8163,8165],{"class":35,"line":5128},[33,8162,842],{"class":167},[33,8164,242],{"class":163},[33,8166,8167],{"class":167}," drop_repeated_headers(frames)\n",[33,8169,8170,8172,8174,8176,8179,8181,8184,8186,8188],{"class":35,"line":5135},[33,8171,7268],{"class":50},[33,8173,602],{"class":167},[33,8175,4059],{"class":163},[33,8177,8178],{"class":54},"\"Combined shape: ",[33,8180,1115],{"class":50},[33,8182,8183],{"class":167},"combined.shape",[33,8185,1121],{"class":50},[33,8187,274],{"class":54},[33,8189,221],{"class":167},[33,8191,8192,8194],{"class":35,"line":5142},[33,8193,7268],{"class":50},[33,8195,8196],{"class":167},"(combined.dtypes)\n",[33,8198,8199],{"class":35,"line":5151},[33,8200,92],{"emptyLinePlaceholder":91},[33,8202,8203],{"class":35,"line":5156},[33,8204,8205],{"class":39},"    # Verify no header strings leaked\n",[33,8207,8208,8210,8212,8214],{"class":35,"line":5161},[33,8209,656],{"class":163},[33,8211,7985],{"class":167},[33,8213,662],{"class":163},[33,8215,8216],{"class":167}," combined.columns:\n",[33,8218,8219,8222,8224,8227,8229],{"class":35,"line":5167},[33,8220,8221],{"class":163},"        if",[33,8223,620],{"class":163},[33,8225,8226],{"class":167}," col.startswith(",[33,8228,7764],{"class":54},[33,8230,1737],{"class":167},[33,8232,8233,8236,8238,8241,8243,8246,8248],{"class":35,"line":5172},[33,8234,8235],{"class":167},"            leaks ",[33,8237,242],{"class":163},[33,8239,8240],{"class":167}," 
combined[combined[col].astype(",[33,8242,1053],{"class":50},[33,8244,8245],{"class":167},").str.strip() ",[33,8247,1865],{"class":163},[33,8249,8250],{"class":167}," col]\n",[33,8252,8253,8255,8257],{"class":35,"line":5182},[33,8254,5995],{"class":163},[33,8256,620],{"class":163},[33,8258,8259],{"class":167}," leaks.empty:\n",[33,8261,8262,8265,8267,8269,8272,8274,8277,8279,8282,8284,8287,8289,8292],{"class":35,"line":5195},[33,8263,8264],{"class":50},"                print",[33,8266,602],{"class":167},[33,8268,4059],{"class":163},[33,8270,8271],{"class":54},"\"Warning: header leak in column '",[33,8273,1115],{"class":50},[33,8275,8276],{"class":167},"col",[33,8278,1121],{"class":50},[33,8280,8281],{"class":54},"': ",[33,8283,4065],{"class":50},[33,8285,8286],{"class":167},"(leaks)",[33,8288,1121],{"class":50},[33,8290,8291],{"class":54}," row(s)\"",[33,8293,221],{"class":167},[33,8295,8296],{"class":35,"line":5200},[33,8297,92],{"emptyLinePlaceholder":91},[33,8299,8300,8303,8305,8307,8310,8312,8315,8318,8320,8323],{"class":35,"line":5205},[33,8301,8302],{"class":167},"    combined.drop(",[33,8304,740],{"class":238},[33,8306,242],{"class":163},[33,8308,8309],{"class":167},"[",[33,8311,7557],{"class":54},[33,8313,8314],{"class":167},"], ",[33,8316,8317],{"class":238},"errors",[33,8319,242],{"class":163},[33,8321,8322],{"class":54},"\"ignore\"",[33,8324,8325],{"class":167},").to_csv(\n",[33,8327,8328,8331,8333,8335,8337],{"class":35,"line":5210},[33,8329,8330],{"class":54},"        \"output\u002Fcombined.csv\"",[33,8332,365],{"class":167},[33,8334,897],{"class":238},[33,8336,242],{"class":163},[33,8338,8339],{"class":50},"False\n",[33,8341,8342],{"class":35,"line":5215},[33,8343,1202],{"class":167},[14,8345,8346],{},"Key changes from the naive version:",[4211,8348,8349,8358,8368],{},[4214,8350,8351,8354,8355,8357],{},[30,8352,8353],{},"header_mask"," compares every cell value to its column name — a row matches only if ",[1974,8356,7868],{}," cells are header strings, 
which avoids false positives on data rows that happen to contain a column's name in one cell.",[4214,8359,8360,8363,8364,8367],{},[30,8361,8362],{},"ignore_index=True"," in ",[30,8365,8366],{},"pd.concat"," prevents index collisions; each page's index starts at 0, so without it the combined frame has duplicate index values.",[4214,8369,8370,8371,8374,8375,8378],{},"The column standardization loop adds missing columns as ",[30,8372,8373],{},"pd.NA"," rather than raising a ",[30,8376,8377],{},"KeyError"," during concat when column counts differ across pages.",[18,8380,8382],{"id":8381},"variant-fix-1-page-break-row-splits-fragmented-rows","Variant Fix 1: Page-Break Row Splits (Fragmented Rows)",[14,8384,8385],{},"Some PDF generators cut a table row at a page break and continue it on the next page. The first page ends with a partial row (some cells empty), and the next page starts with the row's continuation — not a header.",[23,8387,8389],{"className":126,"code":8388,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Fsplit_rows.pdf\")\n# Assume the last column on each page is \"Notes\" — often the column that gets split\n\n\ndef stitch_split_rows(frames: list[pd.DataFrame], key_col: str) -> pd.DataFrame:\n    \"\"\"\n    Merge a row that was split across a page break.\n    Identifies the fragment by a null value in key_col on the first row\n    of a subsequent frame — that null signals continuation, not a new row.\n\n    Args:\n        frames:   Per-page DataFrames, already header-deduped.\n        key_col:  A column that is always populated on a real new row (e.g., \"ID\").\n    \"\"\"\n    merged: list[pd.DataFrame] = []\n    carry: pd.Series | None = None   # partial row carried from previous page\n\n    for df in frames:\n        rows = []\n        for _, row in df.iterrows():\n            if carry is not None:\n                # First row of this 
frame: if key_col is null, it's a continuation\n                if pd.isna(row.get(key_col)):\n                    # Merge: prefer non-null values from the continuation row\n                    combined_row = carry.combine_first(row)  # carry wins for non-null\n                    rows.append(combined_row)\n                    carry = None\n                    continue\n                else:\n                    rows.append(carry)   # carry was complete; flush it\n                    carry = None\n\n            # Check if this row itself is a split (last cell null → continues next page)\n            if pd.isna(row.iloc[-1]) and not pd.isna(row.get(key_col, pd.NA)):\n                carry = row.copy()      # hold it until next page confirms continuation\n            else:\n                rows.append(row)\n\n        if rows:\n            merged.append(pd.DataFrame(rows))\n\n    if carry is not None:\n        merged.append(pd.DataFrame([carry]))   # flush final carry\n\n    return pd.concat(merged, ignore_index=True) if merged else pd.DataFrame()\n\n\nif __name__ == \"__main__\":\n    frames = extract_frames(PDF_PATH)       # reuse function from main fix above\n    frames = [drop_repeated_headers([f]) for f in frames]\n    result = stitch_split_rows(frames, key_col=\"ID\")\n    
print(result)\n",[30,8390,8391,8395,8405,8411,8421,8425,8438,8443,8447,8451,8465,8469,8474,8479,8484,8488,8492,8497,8502,8506,8515,8531,8535,8545,8554,8566,8581,8586,8593,8598,8611,8616,8625,8630,8637,8645,8653,8657,8662,8688,8701,8708,8713,8717,8724,8729,8733,8747,8755,8759,8783,8787,8791,8803,8819,8838,8858],{"__ignoreMap":28},[33,8392,8393],{"class":35,"line":36},[33,8394,7041],{"class":39},[33,8396,8397,8399,8401,8403],{"class":35,"line":43},[33,8398,190],{"class":163},[33,8400,193],{"class":167},[33,8402,164],{"class":163},[33,8404,198],{"class":167},[33,8406,8407,8409],{"class":35,"line":61},[33,8408,164],{"class":163},[33,8410,485],{"class":167},[33,8412,8413,8415,8417,8419],{"class":35,"line":73},[33,8414,164],{"class":163},[33,8416,492],{"class":167},[33,8418,495],{"class":163},[33,8420,498],{"class":167},[33,8422,8423],{"class":35,"line":88},[33,8424,92],{"emptyLinePlaceholder":91},[33,8426,8427,8429,8431,8433,8436],{"class":35,"line":95},[33,8428,7076],{"class":50},[33,8430,212],{"class":163},[33,8432,215],{"class":167},[33,8434,8435],{"class":54},"\"data\u002Fsplit_rows.pdf\"",[33,8437,221],{"class":167},[33,8439,8440],{"class":35,"line":101},[33,8441,8442],{"class":39},"# Assume the last column on each page is \"Notes\" — often the column that gets split\n",[33,8444,8445],{"class":35,"line":171},[33,8446,92],{"emptyLinePlaceholder":91},[33,8448,8449],{"class":35,"line":179},[33,8450,92],{"emptyLinePlaceholder":91},[33,8452,8453,8455,8458,8461,8463],{"class":35,"line":187},[33,8454,562],{"class":163},[33,8456,8457],{"class":46}," stitch_split_rows",[33,8459,8460],{"class":167},"(frames: list[pd.DataFrame], key_col: ",[33,8462,1053],{"class":50},[33,8464,7668],{"class":167},[33,8466,8467],{"class":35,"line":201},[33,8468,7673],{"class":54},[33,8470,8471],{"class":35,"line":206},[33,8472,8473],{"class":54},"    Merge a row that was split across a page break.\n",[33,8475,8476],{"class":35,"line":224},[33,8477,8478],{"class":54},"    Identifies the fragment 
by a null value in key_col on the first row\n",[33,8480,8481],{"class":35,"line":229},[33,8482,8483],{"class":54},"    of a subsequent frame — that null signals continuation, not a new row.\n",[33,8485,8486],{"class":35,"line":235},[33,8487,92],{"emptyLinePlaceholder":91},[33,8489,8490],{"class":35,"line":250},[33,8491,7692],{"class":54},[33,8493,8494],{"class":35,"line":266},[33,8495,8496],{"class":54},"        frames:   Per-page DataFrames, already header-deduped.\n",[33,8498,8499],{"class":35,"line":290},[33,8500,8501],{"class":54},"        key_col:  A column that is always populated on a real new row (e.g., \"ID\").\n",[33,8503,8504],{"class":35,"line":295},[33,8505,7673],{"class":54},[33,8507,8508,8511,8513],{"class":35,"line":300},[33,8509,8510],{"class":167},"    merged: list[pd.DataFrame] ",[33,8512,242],{"class":163},[33,8514,589],{"class":167},[33,8516,8517,8520,8522,8524,8526,8528],{"class":35,"line":317},[33,8518,8519],{"class":167},"    carry: pd.Series ",[33,8521,7654],{"class":163},[33,8523,7657],{"class":50},[33,8525,212],{"class":163},[33,8527,7657],{"class":50},[33,8529,8530],{"class":39},"   # partial row carried from previous page\n",[33,8532,8533],{"class":35,"line":332},[33,8534,92],{"emptyLinePlaceholder":91},[33,8536,8537,8539,8541,8543],{"class":35,"line":347},[33,8538,656],{"class":163},[33,8540,7810],{"class":167},[33,8542,662],{"class":163},[33,8544,816],{"class":167},[33,8546,8547,8550,8552],{"class":35,"line":374},[33,8548,8549],{"class":167},"        rows ",[33,8551,242],{"class":163},[33,8553,589],{"class":167},[33,8555,8556,8558,8561,8563],{"class":35,"line":397},[33,8557,5973],{"class":163},[33,8559,8560],{"class":167}," _, row ",[33,8562,662],{"class":163},[33,8564,8565],{"class":167}," df.iterrows():\n",[33,8567,8568,8570,8573,8575,8577,8579],{"class":35,"line":653},[33,8569,5995],{"class":163},[33,8571,8572],{"class":167}," carry 
",[33,8574,3847],{"class":163},[33,8576,620],{"class":163},[33,8578,7657],{"class":50},[33,8580,574],{"class":167},[33,8582,8583],{"class":35,"line":667},[33,8584,8585],{"class":39},"                # First row of this frame: if key_col is null, it's a continuation\n",[33,8587,8588,8590],{"class":35,"line":675},[33,8589,7170],{"class":163},[33,8591,8592],{"class":167}," pd.isna(row.get(key_col)):\n",[33,8594,8595],{"class":35,"line":689},[33,8596,8597],{"class":39},"                    # Merge: prefer non-null values from the continuation row\n",[33,8599,8600,8603,8605,8608],{"class":35,"line":703},[33,8601,8602],{"class":167},"                    combined_row ",[33,8604,242],{"class":163},[33,8606,8607],{"class":167}," carry.combine_first(row)  ",[33,8609,8610],{"class":39},"# carry wins for non-null\n",[33,8612,8613],{"class":35,"line":714},[33,8614,8615],{"class":167},"                    rows.append(combined_row)\n",[33,8617,8618,8621,8623],{"class":35,"line":723},[33,8619,8620],{"class":167},"                    carry ",[33,8622,242],{"class":163},[33,8624,3852],{"class":50},[33,8626,8627],{"class":35,"line":754},[33,8628,8629],{"class":163},"                    continue\n",[33,8631,8632,8635],{"class":35,"line":771},[33,8633,8634],{"class":163},"                else",[33,8636,574],{"class":167},[33,8638,8639,8642],{"class":35,"line":777},[33,8640,8641],{"class":167},"                    rows.append(carry)   ",[33,8643,8644],{"class":39},"# carry was complete; flush it\n",[33,8646,8647,8649,8651],{"class":35,"line":788},[33,8648,8620],{"class":167},[33,8650,242],{"class":163},[33,8652,3852],{"class":50},[33,8654,8655],{"class":35,"line":804},[33,8656,92],{"emptyLinePlaceholder":91},[33,8658,8659],{"class":35,"line":809},[33,8660,8661],{"class":39},"            # Check if this row itself is a split (last cell null → continues next 
page)\n",[33,8663,8664,8666,8669,8671,8673,8676,8678,8680,8683,8685],{"class":35,"line":819},[33,8665,5995],{"class":163},[33,8667,8668],{"class":167}," pd.isna(row.iloc[",[33,8670,4126],{"class":163},[33,8672,734],{"class":50},[33,8674,8675],{"class":167},"]) ",[33,8677,6001],{"class":163},[33,8679,620],{"class":163},[33,8681,8682],{"class":167}," pd.isna(row.get(key_col, pd.",[33,8684,8018],{"class":50},[33,8686,8687],{"class":167},")):\n",[33,8689,8690,8693,8695,8698],{"class":35,"line":829},[33,8691,8692],{"class":167},"                carry ",[33,8694,242],{"class":163},[33,8696,8697],{"class":167}," row.copy()      ",[33,8699,8700],{"class":39},"# hold it until next page confirms continuation\n",[33,8702,8703,8706],{"class":35,"line":834},[33,8704,8705],{"class":163},"            else",[33,8707,574],{"class":167},[33,8709,8710],{"class":35,"line":839},[33,8711,8712],{"class":167},"                rows.append(row)\n",[33,8714,8715],{"class":35,"line":860},[33,8716,92],{"emptyLinePlaceholder":91},[33,8718,8719,8721],{"class":35,"line":887},[33,8720,8221],{"class":163},[33,8722,8723],{"class":167}," rows:\n",[33,8725,8726],{"class":35,"line":907},[33,8727,8728],{"class":167},"            merged.append(pd.DataFrame(rows))\n",[33,8730,8731],{"class":35,"line":1826},[33,8732,92],{"emptyLinePlaceholder":91},[33,8734,8735,8737,8739,8741,8743,8745],{"class":35,"line":1844},[33,8736,617],{"class":163},[33,8738,8572],{"class":167},[33,8740,3847],{"class":163},[33,8742,620],{"class":163},[33,8744,7657],{"class":50},[33,8746,574],{"class":167},[33,8748,8749,8752],{"class":35,"line":1858},[33,8750,8751],{"class":167},"        merged.append(pd.DataFrame([carry]))   ",[33,8753,8754],{"class":39},"# flush final carry\n",[33,8756,8757],{"class":35,"line":1871},[33,8758,92],{"emptyLinePlaceholder":91},[33,8760,8761,8763,8766,8768,8770,8772,8774,8776,8779,8781],{"class":35,"line":1877},[33,8762,1332],{"class":163},[33,8764,8765],{"class":167}," pd.concat(merged, 
",[33,8767,850],{"class":238},[33,8769,242],{"class":163},[33,8771,855],{"class":50},[33,8773,1649],{"class":167},[33,8775,2491],{"class":163},[33,8777,8778],{"class":167}," merged ",[33,8780,7489],{"class":163},[33,8782,7721],{"class":167},[33,8784,8785],{"class":35,"line":1883},[33,8786,92],{"emptyLinePlaceholder":91},[33,8788,8789],{"class":35,"line":1915},[33,8790,92],{"emptyLinePlaceholder":91},[33,8792,8793,8795,8797,8799,8801],{"class":35,"line":1926},[33,8794,2491],{"class":163},[33,8796,2494],{"class":50},[33,8798,2497],{"class":163},[33,8800,2500],{"class":54},[33,8802,574],{"class":167},[33,8804,8805,8807,8809,8811,8813,8816],{"class":35,"line":1932},[33,8806,584],{"class":167},[33,8808,242],{"class":163},[33,8810,8127],{"class":167},[33,8812,7076],{"class":50},[33,8814,8815],{"class":167},")       ",[33,8817,8818],{"class":39},"# reuse function from main fix above\n",[33,8820,8821,8823,8825,8828,8830,8833,8835],{"class":35,"line":1938},[33,8822,584],{"class":167},[33,8824,242],{"class":163},[33,8826,8827],{"class":167}," [drop_repeated_headers([f]) ",[33,8829,6124],{"class":163},[33,8831,8832],{"class":167}," f ",[33,8834,662],{"class":163},[33,8836,8837],{"class":167}," frames]\n",[33,8839,8840,8843,8845,8848,8851,8853,8856],{"class":35,"line":1950},[33,8841,8842],{"class":167},"    result ",[33,8844,242],{"class":163},[33,8846,8847],{"class":167}," stitch_split_rows(frames, ",[33,8849,8850],{"class":238},"key_col",[33,8852,242],{"class":163},[33,8854,8855],{"class":54},"\"ID\"",[33,8857,221],{"class":167},[33,8859,8860,8862],{"class":35,"line":1958},[33,8861,7268],{"class":50},[33,8863,8864],{"class":167},"(result)\n",[14,8866,8867],{},"This approach works when a consistent key column (e.g., a row ID or date) is always populated on a genuine new data row. 
If no such column exists, fall back to detecting the split by checking whether the last N cells of the last row on a page are empty.",[18,8869,8871],{"id":8870},"variant-fix-2-inconsistent-column-counts-across-pages","Variant Fix 2: Inconsistent Column Counts Across Pages",[14,8873,8874,8875,8877,8878,8881,8882,8885],{},"A PDF where some pages add a \"footnote\" or \"change\" column causes per-page DataFrames with differing widths. ",[30,8876,8366],{}," with ",[30,8879,8880],{},"sort=False"," pads missing columns with ",[30,8883,8884],{},"NaN",", but column order becomes unpredictable.",[23,8887,8889],{"className":126,"code":8888,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef unify_columns(\n    frames: list[pd.DataFrame],\n    canonical_cols: list[str],\n) -> pd.DataFrame:\n    \"\"\"\n    Force all frames to exactly canonical_cols, dropping extras and\n    inserting NaN columns for any that are absent.\n    \"\"\"\n    unified: list[pd.DataFrame] = []\n    for df in frames:\n        # Drop any columns not in canonical set\n        extra = [c for c in df.columns if c not in canonical_cols and not c.startswith(\"_\")]\n        df = df.drop(columns=extra, errors=\"ignore\")\n        # Add missing canonical columns\n        for col in canonical_cols:\n            if col not in df.columns:\n                df[col] = pd.NA\n        df = df[canonical_cols]   # enforce column order\n        unified.append(df)\n    return pd.concat(unified, ignore_index=True)\n\n\nif __name__ == \"__main__\":\n    # Example: pages 3 and 7 have an extra \"Revision\" column\n    frames = extract_frames(Path(\"data\u002Finconsistent_cols.pdf\"))\n    frames_cleaned = [drop_repeated_headers([f]) for f in frames]\n    CANONICAL = [\"Date\", \"Invoice\", \"Amount\", \"Currency\", \"Status\"]\n    result = unify_columns(frames_cleaned, CANONICAL)\n    print(result.shape)\n    assert list(result.columns) == CANONICAL, \"Column order 
mismatch\"\n",[30,8890,8891,8896,8906,8910,8914,8923,8927,8936,8940,8944,8949,8954,8958,8967,8977,8982,9019,9043,9048,9058,9070,9081,9093,9098,9113,9117,9121,9133,9138,9152,9169,9203,9217,9224],{"__ignoreMap":28},[33,8892,8893],{"class":35,"line":36},[33,8894,8895],{"class":39},"# pip install pandas\n",[33,8897,8898,8900,8902,8904],{"class":35,"line":43},[33,8899,164],{"class":163},[33,8901,492],{"class":167},[33,8903,495],{"class":163},[33,8905,498],{"class":167},[33,8907,8908],{"class":35,"line":61},[33,8909,92],{"emptyLinePlaceholder":91},[33,8911,8912],{"class":35,"line":73},[33,8913,92],{"emptyLinePlaceholder":91},[33,8915,8916,8918,8921],{"class":35,"line":88},[33,8917,562],{"class":163},[33,8919,8920],{"class":46}," unify_columns",[33,8922,7637],{"class":167},[33,8924,8925],{"class":35,"line":95},[33,8926,7642],{"class":167},[33,8928,8929,8931,8933],{"class":35,"line":101},[33,8930,7647],{"class":167},[33,8932,1053],{"class":50},[33,8934,8935],{"class":167},"],\n",[33,8937,8938],{"class":35,"line":171},[33,8939,7668],{"class":167},[33,8941,8942],{"class":35,"line":179},[33,8943,7673],{"class":54},[33,8945,8946],{"class":35,"line":187},[33,8947,8948],{"class":54},"    Force all frames to exactly canonical_cols, dropping extras and\n",[33,8950,8951],{"class":35,"line":201},[33,8952,8953],{"class":54},"    inserting NaN columns for any that are absent.\n",[33,8955,8956],{"class":35,"line":206},[33,8957,7673],{"class":54},[33,8959,8960,8963,8965],{"class":35,"line":224},[33,8961,8962],{"class":167},"    unified: list[pd.DataFrame] ",[33,8964,242],{"class":163},[33,8966,589],{"class":167},[33,8968,8969,8971,8973,8975],{"class":35,"line":229},[33,8970,656],{"class":163},[33,8972,7810],{"class":167},[33,8974,662],{"class":163},[33,8976,816],{"class":167},[33,8978,8979],{"class":35,"line":235},[33,8980,8981],{"class":39},"        # Drop any columns not in canonical 
set\n",[33,8983,8984,8987,8989,8991,8993,8995,8997,8999,9001,9003,9005,9007,9009,9011,9013,9015,9017],{"class":35,"line":250},[33,8985,8986],{"class":167},"        extra ",[33,8988,242],{"class":163},[33,8990,7740],{"class":167},[33,8992,6124],{"class":163},[33,8994,7486],{"class":167},[33,8996,662],{"class":163},[33,8998,7837],{"class":167},[33,9000,2491],{"class":163},[33,9002,7486],{"class":167},[33,9004,7999],{"class":163},[33,9006,8002],{"class":163},[33,9008,7774],{"class":167},[33,9010,6001],{"class":163},[33,9012,620],{"class":163},[33,9014,7761],{"class":167},[33,9016,7764],{"class":54},[33,9018,7767],{"class":167},[33,9020,9021,9023,9025,9028,9030,9032,9035,9037,9039,9041],{"class":35,"line":266},[33,9022,7930],{"class":167},[33,9024,242],{"class":163},[33,9026,9027],{"class":167}," df.drop(",[33,9029,740],{"class":238},[33,9031,242],{"class":163},[33,9033,9034],{"class":167},"extra, ",[33,9036,8317],{"class":238},[33,9038,242],{"class":163},[33,9040,8322],{"class":54},[33,9042,221],{"class":167},[33,9044,9045],{"class":35,"line":290},[33,9046,9047],{"class":39},"        # Add missing canonical columns\n",[33,9049,9050,9052,9054,9056],{"class":35,"line":295},[33,9051,5973],{"class":163},[33,9053,7985],{"class":167},[33,9055,662],{"class":163},[33,9057,7990],{"class":167},[33,9059,9060,9062,9064,9066,9068],{"class":35,"line":300},[33,9061,5995],{"class":163},[33,9063,7985],{"class":167},[33,9065,7999],{"class":163},[33,9067,8002],{"class":163},[33,9069,8005],{"class":167},[33,9071,9072,9074,9076,9078],{"class":35,"line":317},[33,9073,8010],{"class":167},[33,9075,242],{"class":163},[33,9077,8015],{"class":167},[33,9079,9080],{"class":50},"NA\n",[33,9082,9083,9085,9087,9090],{"class":35,"line":332},[33,9084,7930],{"class":167},[33,9086,242],{"class":163},[33,9088,9089],{"class":167}," df[canonical_cols]   ",[33,9091,9092],{"class":39},"# enforce column order\n",[33,9094,9095],{"class":35,"line":347},[33,9096,9097],{"class":167},"        
unified.append(df)\n",[33,9099,9100,9102,9105,9107,9109,9111],{"class":35,"line":374},[33,9101,1332],{"class":163},[33,9103,9104],{"class":167}," pd.concat(unified, ",[33,9106,850],{"class":238},[33,9108,242],{"class":163},[33,9110,855],{"class":50},[33,9112,221],{"class":167},[33,9114,9115],{"class":35,"line":397},[33,9116,92],{"emptyLinePlaceholder":91},[33,9118,9119],{"class":35,"line":653},[33,9120,92],{"emptyLinePlaceholder":91},[33,9122,9123,9125,9127,9129,9131],{"class":35,"line":667},[33,9124,2491],{"class":163},[33,9126,2494],{"class":50},[33,9128,2497],{"class":163},[33,9130,2500],{"class":54},[33,9132,574],{"class":167},[33,9134,9135],{"class":35,"line":675},[33,9136,9137],{"class":39},"    # Example: pages 3 and 7 have an extra \"Revision\" column\n",[33,9139,9140,9142,9144,9147,9150],{"class":35,"line":689},[33,9141,584],{"class":167},[33,9143,242],{"class":163},[33,9145,9146],{"class":167}," extract_frames(Path(",[33,9148,9149],{"class":54},"\"data\u002Finconsistent_cols.pdf\"",[33,9151,371],{"class":167},[33,9153,9154,9157,9159,9161,9163,9165,9167],{"class":35,"line":703},[33,9155,9156],{"class":167},"    frames_cleaned ",[33,9158,242],{"class":163},[33,9160,8827],{"class":167},[33,9162,6124],{"class":163},[33,9164,8832],{"class":167},[33,9166,662],{"class":163},[33,9168,8837],{"class":167},[33,9170,9171,9174,9176,9179,9181,9183,9186,9188,9190,9192,9195,9197,9200],{"class":35,"line":714},[33,9172,9173],{"class":50},"    CANONICAL",[33,9175,212],{"class":163},[33,9177,9178],{"class":167}," 
[",[33,9180,7027],{"class":54},[33,9182,365],{"class":167},[33,9184,9185],{"class":54},"\"Invoice\"",[33,9187,365],{"class":167},[33,9189,7030],{"class":54},[33,9191,365],{"class":167},[33,9193,9194],{"class":54},"\"Currency\"",[33,9196,365],{"class":167},[33,9198,9199],{"class":54},"\"Status\"",[33,9201,9202],{"class":167},"]\n",[33,9204,9205,9207,9209,9212,9215],{"class":35,"line":723},[33,9206,8842],{"class":167},[33,9208,242],{"class":163},[33,9210,9211],{"class":167}," unify_columns(frames_cleaned, ",[33,9213,9214],{"class":50},"CANONICAL",[33,9216,221],{"class":167},[33,9218,9219,9221],{"class":35,"line":754},[33,9220,7268],{"class":50},[33,9222,9223],{"class":167},"(result.shape)\n",[33,9225,9226,9229,9231,9234,9236,9239,9241],{"class":35,"line":771},[33,9227,9228],{"class":163},"    assert",[33,9230,599],{"class":50},[33,9232,9233],{"class":167},"(result.columns) ",[33,9235,1865],{"class":163},[33,9237,9238],{"class":50}," CANONICAL",[33,9240,365],{"class":167},[33,9242,9243],{"class":54},"\"Column order mismatch\"\n",[18,9245,9247],{"id":9246},"verification","Verification",[14,9249,9250],{},"After any fix, confirm that no header strings appear in data columns and that the row count matches a manual count from the source PDF:",[23,9252,9254],{"className":126,"code":9253,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef verify_no_header_leaks(df: pd.DataFrame) -> None:\n    \"\"\"Assert that no cell in a data column exactly matches its column name.\"\"\"\n    leaks_found = False\n    for col in df.columns:\n        if col.startswith(\"_\"):\n            continue\n        leaks = df[df[col].astype(str).str.strip() == str(col).strip()]\n        if not leaks.empty:\n            print(f\"Header leak in '{col}': rows {leaks.index.tolist()}\")\n            leaks_found = True\n    if not leaks_found:\n        print(f\"OK — no header leaks. 
Shape: {df.shape}\")\n    else:\n        raise AssertionError(\"Header rows present in data — re-run drop_repeated_headers()\")\n\n\nif __name__ == \"__main__\":\n    df = pd.read_csv(\"output\u002Fcombined.csv\")\n    verify_no_header_leaks(df)\n    # Compare against a manually counted total from the PDF\n    EXPECTED_ROWS = 243   # set from your source PDF\n    assert df.shape[0] == EXPECTED_ROWS, (\n        f\"Row count mismatch: got {df.shape[0]}, expected {EXPECTED_ROWS}\"\n    )\n    print(\"Verification passed.\")\n",[30,9255,9256,9260,9270,9274,9278,9292,9297,9306,9316,9326,9331,9352,9360,9392,9401,9410,9433,9439,9453,9457,9461,9473,9487,9492,9497,9510,9529,9557,9561],{"__ignoreMap":28},[33,9257,9258],{"class":35,"line":36},[33,9259,8895],{"class":39},[33,9261,9262,9264,9266,9268],{"class":35,"line":43},[33,9263,164],{"class":163},[33,9265,492],{"class":167},[33,9267,495],{"class":163},[33,9269,498],{"class":167},[33,9271,9272],{"class":35,"line":61},[33,9273,92],{"emptyLinePlaceholder":91},[33,9275,9276],{"class":35,"line":73},[33,9277,92],{"emptyLinePlaceholder":91},[33,9279,9280,9282,9285,9288,9290],{"class":35,"line":88},[33,9281,562],{"class":163},[33,9283,9284],{"class":46}," verify_no_header_leaks",[33,9286,9287],{"class":167},"(df: pd.DataFrame) -> ",[33,9289,571],{"class":50},[33,9291,574],{"class":167},[33,9293,9294],{"class":35,"line":95},[33,9295,9296],{"class":54},"    \"\"\"Assert that no cell in a data column exactly matches its column name.\"\"\"\n",[33,9298,9299,9302,9304],{"class":35,"line":101},[33,9300,9301],{"class":167},"    leaks_found 
",[33,9303,242],{"class":163},[33,9305,2903],{"class":50},[33,9307,9308,9310,9312,9314],{"class":35,"line":171},[33,9309,656],{"class":163},[33,9311,7985],{"class":167},[33,9313,662],{"class":163},[33,9315,8005],{"class":167},[33,9317,9318,9320,9322,9324],{"class":35,"line":179},[33,9319,8221],{"class":163},[33,9321,8226],{"class":167},[33,9323,7764],{"class":54},[33,9325,1737],{"class":167},[33,9327,9328],{"class":35,"line":187},[33,9329,9330],{"class":163},"            continue\n",[33,9332,9333,9336,9338,9341,9343,9345,9347,9349],{"class":35,"line":201},[33,9334,9335],{"class":167},"        leaks ",[33,9337,242],{"class":163},[33,9339,9340],{"class":167}," df[df[col].astype(",[33,9342,1053],{"class":50},[33,9344,8245],{"class":167},[33,9346,1865],{"class":163},[33,9348,7887],{"class":50},[33,9350,9351],{"class":167},"(col).strip()]\n",[33,9353,9354,9356,9358],{"class":35,"line":206},[33,9355,8221],{"class":163},[33,9357,620],{"class":163},[33,9359,8259],{"class":167},[33,9361,9362,9365,9367,9369,9372,9374,9376,9378,9381,9383,9386,9388,9390],{"class":35,"line":224},[33,9363,9364],{"class":50},"            print",[33,9366,602],{"class":167},[33,9368,4059],{"class":163},[33,9370,9371],{"class":54},"\"Header leak in '",[33,9373,1115],{"class":50},[33,9375,8276],{"class":167},[33,9377,1121],{"class":50},[33,9379,9380],{"class":54},"': rows ",[33,9382,1115],{"class":50},[33,9384,9385],{"class":167},"leaks.index.tolist()",[33,9387,1121],{"class":50},[33,9389,274],{"class":54},[33,9391,221],{"class":167},[33,9393,9394,9397,9399],{"class":35,"line":229},[33,9395,9396],{"class":167},"            leaks_found ",[33,9398,242],{"class":163},[33,9400,2887],{"class":50},[33,9402,9403,9405,9407],{"class":35,"line":235},[33,9404,617],{"class":163},[33,9406,620],{"class":163},[33,9408,9409],{"class":167}," leaks_found:\n",[33,9411,9412,9415,9417,9419,9422,9424,9427,9429,9431],{"class":35,"line":250},[33,9413,9414],{"class":50},"        
print",[33,9416,602],{"class":167},[33,9418,4059],{"class":163},[33,9420,9421],{"class":54},"\"OK — no header leaks. Shape: ",[33,9423,1115],{"class":50},[33,9425,9426],{"class":167},"df.shape",[33,9428,1121],{"class":50},[33,9430,274],{"class":54},[33,9432,221],{"class":167},[33,9434,9435,9437],{"class":35,"line":266},[33,9436,6864],{"class":163},[33,9438,574],{"class":167},[33,9440,9441,9443,9446,9448,9451],{"class":35,"line":290},[33,9442,4051],{"class":163},[33,9444,9445],{"class":50}," AssertionError",[33,9447,602],{"class":167},[33,9449,9450],{"class":54},"\"Header rows present in data — re-run drop_repeated_headers()\"",[33,9452,221],{"class":167},[33,9454,9455],{"class":35,"line":295},[33,9456,92],{"emptyLinePlaceholder":91},[33,9458,9459],{"class":35,"line":300},[33,9460,92],{"emptyLinePlaceholder":91},[33,9462,9463,9465,9467,9469,9471],{"class":35,"line":317},[33,9464,2491],{"class":163},[33,9466,2494],{"class":50},[33,9468,2497],{"class":163},[33,9470,2500],{"class":54},[33,9472,574],{"class":167},[33,9474,9475,9477,9479,9482,9485],{"class":35,"line":332},[33,9476,4025],{"class":167},[33,9478,242],{"class":163},[33,9480,9481],{"class":167}," pd.read_csv(",[33,9483,9484],{"class":54},"\"output\u002Fcombined.csv\"",[33,9486,221],{"class":167},[33,9488,9489],{"class":35,"line":347},[33,9490,9491],{"class":167},"    verify_no_header_leaks(df)\n",[33,9493,9494],{"class":35,"line":374},[33,9495,9496],{"class":39},"    # Compare against a manually counted total from the PDF\n",[33,9498,9499,9502,9504,9507],{"class":35,"line":397},[33,9500,9501],{"class":50},"    EXPECTED_ROWS",[33,9503,212],{"class":163},[33,9505,9506],{"class":50}," 243",[33,9508,9509],{"class":39},"   # set from your source PDF\n",[33,9511,9512,9514,9517,9519,9521,9523,9526],{"class":35,"line":653},[33,9513,9228],{"class":163},[33,9515,9516],{"class":167}," df.shape[",[33,9518,748],{"class":50},[33,9520,763],{"class":167},[33,9522,1865],{"class":163},[33,9524,9525],{"class":50}," 
EXPECTED_ROWS",[33,9527,9528],{"class":167},", (\n",[33,9530,9531,9534,9537,9539,9542,9544,9547,9549,9552,9555],{"class":35,"line":667},[33,9532,9533],{"class":163},"        f",[33,9535,9536],{"class":54},"\"Row count mismatch: got ",[33,9538,1115],{"class":50},[33,9540,9541],{"class":167},"df.shape[",[33,9543,748],{"class":50},[33,9545,9546],{"class":167},"]",[33,9548,1121],{"class":50},[33,9550,9551],{"class":54},", expected ",[33,9553,9554],{"class":50},"{EXPECTED_ROWS}",[33,9556,7504],{"class":54},[33,9558,9559],{"class":35,"line":675},[33,9560,1202],{"class":167},[33,9562,9563,9565,9567,9570],{"class":35,"line":689},[33,9564,7268],{"class":50},[33,9566,602],{"class":167},[33,9568,9569],{"class":54},"\"Verification passed.\"",[33,9571,221],{"class":167},[14,9573,9574,9575,9578],{},"Run ",[30,9576,9577],{},"verify_no_header_leaks"," after every pipeline run, not just during development. PDF structure changes silently when the upstream source system upgrades its PDF generator.",[18,9580,6918],{"id":6917},[4211,9582,9583,9588,9594,9601],{},[4214,9584,9585,9587],{},[940,9586,948],{"href":947}," — full extraction pipeline: classify, extract, concat, normalize dtypes",[4214,9589,9590,9593],{},[940,9591,9592],{"href":942},"Extracting Tables from PDFs"," — pdfplumber and camelot extraction modes, including the multi-page dedup step",[4214,9595,9596,9600],{},[940,9597,9599],{"href":9598},"\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002F","Cleaning Messy CSV Data with pandas"," — dtype coercion and whitespace-strip patterns that apply after PDF extraction",[4214,9602,9603,9607],{},[940,9604,9606],{"href":9605},"\u002Fautomating-pdf-extraction-generation\u002Fcomparing-pdf-table-extraction-libraries\u002F","pdfplumber vs camelot vs tabula"," — choose the extractor with the best per-page accuracy for your PDF type",[14,9609,6947,9610,3035],{},[940,9611,948],{"href":947},[6953,9613,9614],{},"html pre.shiki code .sAwPA, html 
code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}",{"title":28,"searchDepth":43,"depth":43,"links":9616},[9617,9618,9619,9620,9621,9622],{"id":7020,"depth":43,"text":7021},{"id":7292,"depth":43,"text":7293},{"id":8381,"depth":43,"text":8382},{"id":8870,"depth":43,"text":8871},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Multi-Page PDF Tables","Fix duplicated header rows and misaligned columns when a PDF table spans multiple pages. 
Drop repeated headers, standardize columns, and concat with ignore_index=True.",{},"\u002Fautomating-document-data-pipelines\u002Fextracting-pdf-data-into-pandas\u002Fhandle-multi-page-pdf-tables-in-pandas",{"title":6995,"description":9624},"automating-document-data-pipelines\u002Fextracting-pdf-data-into-pandas\u002Fhandle-multi-page-pdf-tables-in-pandas\u002Findex",[47,9630,9631,943,9632],"pandas","pdf","data cleaning","p1pLjykqYNz0NDiPwXS87ng0VcUC7CEJHzm-oR741x4",{"id":9635,"title":948,"body":9636,"breadcrumbTitle":16131,"canonical":6977,"date":6978,"description":16132,"draft":6980,"extension":6981,"image":6977,"meta":16133,"navigation":91,"path":16134,"robots":6977,"seo":16135,"seoTitle":16136,"stem":16137,"tags":16138,"updatedAt":6978,"__hash__":16140},"content\u002Fautomating-document-data-pipelines\u002Fextracting-pdf-data-into-pandas\u002Findex.md",{"type":7,"value":9637,"toc":16110},[9638,9641,9644,9650,9652,9655,9726,9740,9747,9751,9754,10059,10079,10083,10086,10090,10525,10537,10541,11055,11069,11233,11237,11240,11570,11582,11586,11591,11868,11879,11883,11902,12491,12494,12500,12504,12507,12941,12945,12949,12960,13312,13316,13321,13478,13482,13842,13846,13856,13865,14287,14301,14307,14309,14434,14438,16078,16080,16103,16107],[10,9639,948],{"id":9640},"extracting-pdf-data-into-pandas",[14,9642,9643],{},"PDF files do not store tabular data the way spreadsheets do. A \"table\" in a PDF is usually a set of positioned text characters that happen to look like a grid — there is no schema, no delimiter, and no dtype. 
Pulling those characters into a pandas DataFrame requires choosing the right extractor for the PDF's internal structure, promoting headers, handling repeated header rows across pages, and coercing strings into proper numeric and datetime types before any analysis is possible.",[14,9645,9646,9647,9649],{},"This guide implements the complete workflow: classify the PDF structure, route to ",[940,9648,943],{"href":942}," or camelot based on what you find, build per-page DataFrames, concatenate them into a single tidy frame, and normalize dtypes so the result is analysis-ready.",[18,9651,21],{"id":20},[14,9653,9654],{},"Install Python dependencies and verify the system-level requirements camelot needs:",[23,9656,9658],{"className":25,"code":9657,"language":27,"meta":28,"style":28},"# System dependencies (Ubuntu\u002FDebian — camelot lattice mode requires Ghostscript)\nsudo apt-get install ghostscript libsm6 libxext6\n\n# Python packages\npip install pdfplumber \"camelot-py[cv]\" pandas\n\n# Verify camelot can see Ghostscript\npython -c \"import camelot; print(camelot.__version__)\"\n",[30,9659,9660,9665,9684,9688,9693,9708,9712,9717],{"__ignoreMap":28},[33,9661,9662],{"class":35,"line":36},[33,9663,9664],{"class":39},"# System dependencies (Ubuntu\u002FDebian — camelot lattice mode requires Ghostscript)\n",[33,9666,9667,9670,9673,9675,9678,9681],{"class":35,"line":43},[33,9668,9669],{"class":46},"sudo",[33,9671,9672],{"class":54}," apt-get",[33,9674,79],{"class":54},[33,9676,9677],{"class":54}," ghostscript",[33,9679,9680],{"class":54}," libsm6",[33,9682,9683],{"class":54}," libxext6\n",[33,9685,9686],{"class":35,"line":61},[33,9687,92],{"emptyLinePlaceholder":91},[33,9689,9690],{"class":35,"line":73},[33,9691,9692],{"class":39},"# Python packages\n",[33,9694,9695,9697,9699,9702,9705],{"class":35,"line":88},[33,9696,76],{"class":46},[33,9698,79],{"class":54},[33,9700,9701],{"class":54}," pdfplumber",[33,9703,9704],{"class":54}," 
\"camelot-py[cv]\"",[33,9706,9707],{"class":54}," pandas\n",[33,9709,9710],{"class":35,"line":95},[33,9711,92],{"emptyLinePlaceholder":91},[33,9713,9714],{"class":35,"line":101},[33,9715,9716],{"class":39},"# Verify camelot can see Ghostscript\n",[33,9718,9719,9721,9723],{"class":35,"line":171},[33,9720,47],{"class":46},[33,9722,106],{"class":50},[33,9724,9725],{"class":54}," \"import camelot; print(camelot.__version__)\"\n",[14,9727,9728,9729,9732,9733,9735,9736,3035],{},"If the camelot import raises ",[30,9730,9731],{},"OSError: ghostscript not found",", the Ghostscript binary is missing or not on ",[30,9734,122],{},". Lattice mode will not work without it; stream mode and pdfplumber will. For a full fix, see ",[940,9737,9739],{"href":9738},"\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Ffix-camelot-import-error-on-linux\u002F","Fix Camelot Import Error on Linux",[14,9741,9742,9743,9746],{},"Place a representative PDF at ",[30,9744,9745],{},"data\u002Freport.pdf"," to follow along. A multi-page financial statement or invoice works well.",[18,9748,9750],{"id":9749},"step-1-classify-the-pdf-before-extracting","Step 1: Classify the PDF Before Extracting",[14,9752,9753],{},"Not every PDF responds to the same extractor. 
Running a quick diagnostic before extraction avoids silent failures — camelot lattice returns empty results on PDFs without vector lines, while pdfplumber's default table finder fails on scanned pages.",[23,9755,9757],{"className":126,"code":9756,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nfrom pathlib import Path\nimport pdfplumber\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\n\ndef classify_pdf(path: Path) -> str:\n    \"\"\"\n    Return 'lattice', 'stream', or 'ocr' based on the first page's content.\n    'lattice'  — vector lines found; use camelot lattice\n    'stream'   — text layer present, no vector lines; use camelot stream or pdfplumber\n    'ocr'      — no text and no lines; rasterized scan, needs OCR\n    \"\"\"\n    try:\n        with pdfplumber.open(path) as pdf:\n            page = pdf.pages[0]\n            has_text = bool((page.extract_text() or \"\").strip())\n            has_lines = bool(page.lines or page.rects)\n    except Exception as e:\n        raise RuntimeError(f\"Cannot open {path}: {e}\") from e\n\n    if not has_text and not has_lines:\n        return \"ocr\"\n    return \"lattice\" if has_lines else \"stream\"\n\n\nif __name__ == \"__main__\":\n    mode = classify_pdf(PDF_PATH)\n    print(f\"Recommended extraction mode: {mode}\")\n",[30,9758,9759,9764,9774,9780,9784,9796,9800,9804,9817,9821,9826,9831,9836,9841,9845,9851,9861,9875,9896,9913,9923,9958,9962,9978,9985,10003,10007,10011,10023,10037],{"__ignoreMap":28},[33,9760,9761],{"class":35,"line":36},[33,9762,9763],{"class":39},"# pip install 
pdfplumber\n",[33,9765,9766,9768,9770,9772],{"class":35,"line":43},[33,9767,190],{"class":163},[33,9769,193],{"class":167},[33,9771,164],{"class":163},[33,9773,198],{"class":167},[33,9775,9776,9778],{"class":35,"line":61},[33,9777,164],{"class":163},[33,9779,485],{"class":167},[33,9781,9782],{"class":35,"line":73},[33,9783,92],{"emptyLinePlaceholder":91},[33,9785,9786,9788,9790,9792,9794],{"class":35,"line":88},[33,9787,7076],{"class":50},[33,9789,212],{"class":163},[33,9791,215],{"class":167},[33,9793,7083],{"class":54},[33,9795,221],{"class":167},[33,9797,9798],{"class":35,"line":95},[33,9799,92],{"emptyLinePlaceholder":91},[33,9801,9802],{"class":35,"line":101},[33,9803,92],{"emptyLinePlaceholder":91},[33,9805,9806,9808,9811,9813,9815],{"class":35,"line":171},[33,9807,562],{"class":163},[33,9809,9810],{"class":46}," classify_pdf",[33,9812,3743],{"class":167},[33,9814,1053],{"class":50},[33,9816,574],{"class":167},[33,9818,9819],{"class":35,"line":179},[33,9820,7673],{"class":54},[33,9822,9823],{"class":35,"line":187},[33,9824,9825],{"class":54},"    Return 'lattice', 'stream', or 'ocr' based on the first page's content.\n",[33,9827,9828],{"class":35,"line":201},[33,9829,9830],{"class":54},"    'lattice'  — vector lines found; use camelot lattice\n",[33,9832,9833],{"class":35,"line":206},[33,9834,9835],{"class":54},"    'stream'   — text layer present, no vector lines; use camelot stream or pdfplumber\n",[33,9837,9838],{"class":35,"line":224},[33,9839,9840],{"class":54},"    'ocr'      — no text and no lines; rasterized scan, needs OCR\n",[33,9842,9843],{"class":35,"line":229},[33,9844,7673],{"class":54},[33,9846,9847,9849],{"class":35,"line":235},[33,9848,2424],{"class":163},[33,9850,574],{"class":167},[33,9852,9853,9855,9857,9859],{"class":35,"line":250},[33,9854,2191],{"class":163},[33,9856,7123],{"class":167},[33,9858,495],{"class":163},[33,9860,686],{"class":167},[33,9862,9863,9866,9868,9871,9873],{"class":35,"line":266},[33,9864,9865],{"class":167},"        
    page ",[33,9867,242],{"class":163},[33,9869,9870],{"class":167}," pdf.pages[",[33,9872,748],{"class":50},[33,9874,9202],{"class":167},[33,9876,9877,9880,9882,9885,9888,9890,9893],{"class":35,"line":290},[33,9878,9879],{"class":167},"            has_text ",[33,9881,242],{"class":163},[33,9883,9884],{"class":50}," bool",[33,9886,9887],{"class":167},"((page.extract_text() ",[33,9889,7162],{"class":163},[33,9891,9892],{"class":54}," \"\"",[33,9894,9895],{"class":167},").strip())\n",[33,9897,9898,9901,9903,9905,9908,9910],{"class":35,"line":295},[33,9899,9900],{"class":167},"            has_lines ",[33,9902,242],{"class":163},[33,9904,9884],{"class":50},[33,9906,9907],{"class":167},"(page.lines ",[33,9909,7162],{"class":163},[33,9911,9912],{"class":167}," page.rects)\n",[33,9914,9915,9917,9919,9921],{"class":35,"line":300},[33,9916,2449],{"class":163},[33,9918,783],{"class":50},[33,9920,1852],{"class":163},[33,9922,7583],{"class":167},[33,9924,9925,9927,9929,9931,9933,9936,9938,9940,9942,9944,9946,9948,9950,9952,9954,9956],{"class":35,"line":317},[33,9926,4051],{"class":163},[33,9928,7590],{"class":50},[33,9930,602],{"class":167},[33,9932,4059],{"class":163},[33,9934,9935],{"class":54},"\"Cannot open ",[33,9937,1115],{"class":50},[33,9939,2580],{"class":167},[33,9941,1121],{"class":50},[33,9943,2079],{"class":54},[33,9945,1115],{"class":50},[33,9947,7602],{"class":167},[33,9949,1121],{"class":50},[33,9951,274],{"class":54},[33,9953,1649],{"class":167},[33,9955,190],{"class":163},[33,9957,7613],{"class":167},[33,9959,9960],{"class":35,"line":332},[33,9961,92],{"emptyLinePlaceholder":91},[33,9963,9964,9966,9968,9971,9973,9975],{"class":35,"line":347},[33,9965,617],{"class":163},[33,9967,620],{"class":163},[33,9969,9970],{"class":167}," has_text ",[33,9972,6001],{"class":163},[33,9974,620],{"class":163},[33,9976,9977],{"class":167}," has_lines:\n",[33,9979,9980,9982],{"class":35,"line":374},[33,9981,1659],{"class":163},[33,9983,9984],{"class":54}," 
\"ocr\"\n",[33,9986,9987,9989,9992,9995,9998,10000],{"class":35,"line":397},[33,9988,1332],{"class":163},[33,9990,9991],{"class":54}," \"lattice\"",[33,9993,9994],{"class":163}," if",[33,9996,9997],{"class":167}," has_lines ",[33,9999,7489],{"class":163},[33,10001,10002],{"class":54}," \"stream\"\n",[33,10004,10005],{"class":35,"line":653},[33,10006,92],{"emptyLinePlaceholder":91},[33,10008,10009],{"class":35,"line":667},[33,10010,92],{"emptyLinePlaceholder":91},[33,10012,10013,10015,10017,10019,10021],{"class":35,"line":675},[33,10014,2491],{"class":163},[33,10016,2494],{"class":50},[33,10018,2497],{"class":163},[33,10020,2500],{"class":54},[33,10022,574],{"class":167},[33,10024,10025,10028,10030,10033,10035],{"class":35,"line":689},[33,10026,10027],{"class":167},"    mode ",[33,10029,242],{"class":163},[33,10031,10032],{"class":167}," classify_pdf(",[33,10034,7076],{"class":50},[33,10036,221],{"class":167},[33,10038,10039,10041,10043,10045,10048,10050,10053,10055,10057],{"class":35,"line":703},[33,10040,7268],{"class":50},[33,10042,602],{"class":167},[33,10044,4059],{"class":163},[33,10046,10047],{"class":54},"\"Recommended extraction mode: ",[33,10049,1115],{"class":50},[33,10051,10052],{"class":167},"mode",[33,10054,1121],{"class":50},[33,10056,274],{"class":54},[33,10058,221],{"class":167},[14,10060,10061,10062,10065,10066,10069,10070,10073,10074,10078],{},"The check on ",[30,10063,10064],{},"page.lines"," and ",[30,10067,10068],{},"page.rects"," distinguishes PDFs with drawn borders (camelot lattice) from those with whitespace-delimited columns (camelot stream or pdfplumber). 
For scanned documents the result is ",[30,10071,10072],{},"\"ocr\""," — see ",[940,10075,10077],{"href":10076},"\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Fhow-to-extract-tables-from-scanned-pdfs\u002F","How to Extract Tables from Scanned PDFs"," for that path.",[18,10080,10082],{"id":10081},"step-2-extract-tables-per-page","Step 2: Extract Tables Per Page",[14,10084,10085],{},"Extract each page independently and collect the raw DataFrames in a list. Keeping pages separate at this stage makes header deduplication straightforward in the next step.",[424,10087,10089],{"id":10088},"pdfplumber-whitespace-and-loosely-bordered-tables","pdfplumber — whitespace and loosely bordered tables",[23,10091,10093],{"className":126,"code":10092,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\n\ndef extract_pages_pdfplumber(path: Path) -> list[pd.DataFrame]:\n    \"\"\"Extract tables from every page using pdfplumber.\"\"\"\n    page_frames: list[pd.DataFrame] = []\n    try:\n        with pdfplumber.open(path) as pdf:\n            for page_num, page in enumerate(pdf.pages, start=1):\n                raw_tables = page.extract_tables(table_settings={\n                    \"vertical_strategy\": \"lines_strict\",\n                    \"horizontal_strategy\": \"lines_strict\",\n                    \"snap_tolerance\": 3,\n                }) or []\n                for raw in raw_tables:\n                    if not raw or len(raw) \u003C 2:\n                        continue\n                    # Promote first row as header; replace None with placeholder\n                    header = [\n                        str(c).strip() if c else f\"col_{i}\"\n                        for i, c in enumerate(raw[0])\n                    ]\n                    df = pd.DataFrame(raw[1:], columns=header)\n                    
df[\"_source_page\"] = page_num       # audit column\n                    page_frames.append(df)\n    except Exception as e:\n        raise RuntimeError(f\"pdfplumber extraction failed on {path}: {e}\") from e\n    return page_frames\n\n\nif __name__ == \"__main__\":\n    frames = extract_pages_pdfplumber(PDF_PATH)\n    print(f\"Extracted {len(frames)} table(s) across pages\")\n    for df in frames:\n        print(df.shape, df.columns.tolist())\n",[30,10094,10095,10099,10109,10115,10125,10129,10141,10145,10149,10158,10163,10172,10178,10188,10208,10226,10238,10249,10261,10270,10281,10301,10305,10310,10318,10342,10358,10362,10380,10397,10402,10412,10447,10454,10458,10462,10474,10487,10508,10518],{"__ignoreMap":28},[33,10096,10097],{"class":35,"line":36},[33,10098,7041],{"class":39},[33,10100,10101,10103,10105,10107],{"class":35,"line":43},[33,10102,190],{"class":163},[33,10104,193],{"class":167},[33,10106,164],{"class":163},[33,10108,198],{"class":167},[33,10110,10111,10113],{"class":35,"line":61},[33,10112,164],{"class":163},[33,10114,485],{"class":167},[33,10116,10117,10119,10121,10123],{"class":35,"line":73},[33,10118,164],{"class":163},[33,10120,492],{"class":167},[33,10122,495],{"class":163},[33,10124,498],{"class":167},[33,10126,10127],{"class":35,"line":88},[33,10128,92],{"emptyLinePlaceholder":91},[33,10130,10131,10133,10135,10137,10139],{"class":35,"line":95},[33,10132,7076],{"class":50},[33,10134,212],{"class":163},[33,10136,215],{"class":167},[33,10138,7083],{"class":54},[33,10140,221],{"class":167},[33,10142,10143],{"class":35,"line":101},[33,10144,92],{"emptyLinePlaceholder":91},[33,10146,10147],{"class":35,"line":171},[33,10148,92],{"emptyLinePlaceholder":91},[33,10150,10151,10153,10156],{"class":35,"line":179},[33,10152,562],{"class":163},[33,10154,10155],{"class":46}," extract_pages_pdfplumber",[33,10157,7362],{"class":167},[33,10159,10160],{"class":35,"line":187},[33,10161,10162],{"class":54},"    \"\"\"Extract tables from every page using 
pdfplumber.\"\"\"\n",[33,10164,10165,10168,10170],{"class":35,"line":201},[33,10166,10167],{"class":167},"    page_frames: list[pd.DataFrame] ",[33,10169,242],{"class":163},[33,10171,589],{"class":167},[33,10173,10174,10176],{"class":35,"line":206},[33,10175,2424],{"class":163},[33,10177,574],{"class":167},[33,10179,10180,10182,10184,10186],{"class":35,"line":224},[33,10181,2191],{"class":163},[33,10183,7123],{"class":167},[33,10185,495],{"class":163},[33,10187,686],{"class":167},[33,10189,10190,10192,10194,10196,10198,10200,10202,10204,10206],{"class":35,"line":229},[33,10191,1793],{"class":163},[33,10193,7398],{"class":167},[33,10195,662],{"class":163},[33,10197,7403],{"class":50},[33,10199,7406],{"class":167},[33,10201,7409],{"class":238},[33,10203,242],{"class":163},[33,10205,734],{"class":50},[33,10207,1737],{"class":167},[33,10209,10210,10213,10215,10218,10221,10223],{"class":35,"line":235},[33,10211,10212],{"class":167},"                raw_tables ",[33,10214,242],{"class":163},[33,10216,10217],{"class":167}," page.extract_tables(",[33,10219,10220],{"class":238},"table_settings",[33,10222,242],{"class":163},[33,10224,10225],{"class":167},"{\n",[33,10227,10228,10231,10233,10236],{"class":35,"line":250},[33,10229,10230],{"class":54},"                    \"vertical_strategy\"",[33,10232,2079],{"class":167},[33,10234,10235],{"class":54},"\"lines_strict\"",[33,10237,247],{"class":167},[33,10239,10240,10243,10245,10247],{"class":35,"line":266},[33,10241,10242],{"class":54},"                    \"horizontal_strategy\"",[33,10244,2079],{"class":167},[33,10246,10235],{"class":54},[33,10248,247],{"class":167},[33,10250,10251,10254,10256,10259],{"class":35,"line":290},[33,10252,10253],{"class":54},"                    \"snap_tolerance\"",[33,10255,2079],{"class":167},[33,10257,10258],{"class":50},"3",[33,10260,247],{"class":167},[33,10262,10263,10266,10268],{"class":35,"line":295},[33,10264,10265],{"class":167},"                }) 
",[33,10267,7162],{"class":163},[33,10269,589],{"class":167},[33,10271,10272,10274,10276,10278],{"class":35,"line":300},[33,10273,692],{"class":163},[33,10275,7422],{"class":167},[33,10277,662],{"class":163},[33,10279,10280],{"class":167}," raw_tables:\n",[33,10282,10283,10285,10287,10289,10291,10293,10295,10297,10299],{"class":35,"line":317},[33,10284,717],{"class":163},[33,10286,620],{"class":163},[33,10288,7422],{"class":167},[33,10290,7162],{"class":163},[33,10292,4037],{"class":50},[33,10294,7446],{"class":167},[33,10296,4043],{"class":163},[33,10298,7451],{"class":50},[33,10300,574],{"class":167},[33,10302,10303],{"class":35,"line":332},[33,10304,7458],{"class":163},[33,10306,10307],{"class":35,"line":347},[33,10308,10309],{"class":39},"                    # Promote first row as header; replace None with placeholder\n",[33,10311,10312,10314,10316],{"class":35,"line":374},[33,10313,7468],{"class":167},[33,10315,242],{"class":163},[33,10317,7473],{"class":167},[33,10319,10320,10322,10324,10326,10328,10330,10332,10334,10336,10338,10340],{"class":35,"line":397},[33,10321,7478],{"class":50},[33,10323,7481],{"class":167},[33,10325,2491],{"class":163},[33,10327,7486],{"class":167},[33,10329,7489],{"class":163},[33,10331,1110],{"class":163},[33,10333,7494],{"class":54},[33,10335,1115],{"class":50},[33,10337,7499],{"class":167},[33,10339,1121],{"class":50},[33,10341,7504],{"class":54},[33,10343,10344,10346,10348,10350,10352,10354,10356],{"class":35,"line":653},[33,10345,7509],{"class":163},[33,10347,7512],{"class":167},[33,10349,662],{"class":163},[33,10351,7403],{"class":50},[33,10353,7519],{"class":167},[33,10355,748],{"class":50},[33,10357,751],{"class":167},[33,10359,10360],{"class":35,"line":667},[33,10361,7528],{"class":167},[33,10363,10364,10366,10368,10370,10372,10374,10376,10378],{"class":35,"line":675},[33,10365,7533],{"class":167},[33,10367,242],{"class":163},[33,10369,7538],{"class":167},[33,10371,734],{"class":50},[33,10373,737],{"class":167},[33,10375,740
],{"class":238},[33,10377,242],{"class":163},[33,10379,7549],{"class":167},[33,10381,10382,10384,10387,10389,10391,10394],{"class":35,"line":689},[33,10383,7554],{"class":167},[33,10385,10386],{"class":54},"\"_source_page\"",[33,10388,763],{"class":167},[33,10390,242],{"class":163},[33,10392,10393],{"class":167}," page_num       ",[33,10395,10396],{"class":39},"# audit column\n",[33,10398,10399],{"class":35,"line":703},[33,10400,10401],{"class":167},"                    page_frames.append(df)\n",[33,10403,10404,10406,10408,10410],{"class":35,"line":714},[33,10405,2449],{"class":163},[33,10407,783],{"class":50},[33,10409,1852],{"class":163},[33,10411,7583],{"class":167},[33,10413,10414,10416,10418,10420,10422,10425,10427,10429,10431,10433,10435,10437,10439,10441,10443,10445],{"class":35,"line":723},[33,10415,4051],{"class":163},[33,10417,7590],{"class":50},[33,10419,602],{"class":167},[33,10421,4059],{"class":163},[33,10423,10424],{"class":54},"\"pdfplumber extraction failed on ",[33,10426,1115],{"class":50},[33,10428,2580],{"class":167},[33,10430,1121],{"class":50},[33,10432,2079],{"class":54},[33,10434,1115],{"class":50},[33,10436,7602],{"class":167},[33,10438,1121],{"class":50},[33,10440,274],{"class":54},[33,10442,1649],{"class":167},[33,10444,190],{"class":163},[33,10446,7613],{"class":167},[33,10448,10449,10451],{"class":35,"line":754},[33,10450,1332],{"class":163},[33,10452,10453],{"class":167}," page_frames\n",[33,10455,10456],{"class":35,"line":771},[33,10457,92],{"emptyLinePlaceholder":91},[33,10459,10460],{"class":35,"line":777},[33,10461,92],{"emptyLinePlaceholder":91},[33,10463,10464,10466,10468,10470,10472],{"class":35,"line":788},[33,10465,2491],{"class":163},[33,10467,2494],{"class":50},[33,10469,2497],{"class":163},[33,10471,2500],{"class":54},[33,10473,574],{"class":167},[33,10475,10476,10478,10480,10483,10485],{"class":35,"line":804},[33,10477,584],{"class":167},[33,10479,242],{"class":163},[33,10481,10482],{"class":167}," 
extract_pages_pdfplumber(",[33,10484,7076],{"class":50},[33,10486,221],{"class":167},[33,10488,10489,10491,10493,10495,10497,10499,10501,10503,10506],{"class":35,"line":809},[33,10490,7268],{"class":50},[33,10492,602],{"class":167},[33,10494,4059],{"class":163},[33,10496,8142],{"class":54},[33,10498,4065],{"class":50},[33,10500,8147],{"class":167},[33,10502,1121],{"class":50},[33,10504,10505],{"class":54}," table(s) across pages\"",[33,10507,221],{"class":167},[33,10509,10510,10512,10514,10516],{"class":35,"line":819},[33,10511,656],{"class":163},[33,10513,7810],{"class":167},[33,10515,662],{"class":163},[33,10517,816],{"class":167},[33,10519,10520,10522],{"class":35,"line":829},[33,10521,9414],{"class":50},[33,10523,10524],{"class":167},"(df.shape, df.columns.tolist())\n",[14,10526,10527,10528,10531,10532,10536],{},"If columns are still misaligned after extraction — values appearing in the wrong column — that is coordinate drift. The ",[30,10529,10530],{},"extract_words"," approach in ",[940,10533,10535],{"href":10534},"\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Ffix-pdf-text-extraction-alignment-issues\u002F","Fix PDF Text Extraction Alignment Issues"," resolves it by sorting text objects by their x\u002Fy positions directly.",[424,10538,10540],{"id":10539},"camelot-bordered-and-whitespace-column-tables","camelot — bordered and whitespace-column tables",[23,10542,10544],{"className":126,"code":10543,"language":47,"meta":28,"style":28},"# pip install \"camelot-py[cv]\" pandas\nfrom pathlib import Path\nimport camelot\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\n\ndef extract_pages_camelot(\n    path: Path,\n    pages: str = \"1-end\",\n    flavor: str = \"lattice\",\n) -> list[pd.DataFrame]:\n    \"\"\"Extract tables using camelot, one DataFrame per detected table.\"\"\"\n    try:\n        table_list = camelot.read_pdf(\n            str(path),\n            pages=pages,\n            flavor=flavor,\n       
     process_background=(flavor == \"lattice\"),  # find lines on coloured fills\n        )\n    except Exception as e:\n        raise RuntimeError(f\"camelot ({flavor}) failed on {path}: {e}\") from e\n\n    if table_list.n == 0:\n        return []\n\n    frames: list[pd.DataFrame] = []\n    for t in table_list:\n        df = t.df.copy()\n        # First row is always the header in camelot's raw output\n        df.columns = df.iloc[0].str.strip()\n        df = df.iloc[1:].reset_index(drop=True)\n        df.replace(\"\", pd.NA, inplace=True)\n        df[\"_accuracy\"] = t.parsing_report.get(\"accuracy\", None)  # audit column\n        frames.append(df)\n    return frames\n\n\nif __name__ == \"__main__\":\n    frames = extract_pages_camelot(PDF_PATH, pages=\"1-5\", flavor=\"lattice\")\n    for i, df in enumerate(frames):\n        print(f\"Table {i + 1}: {df.shape}, accuracy={df['_accuracy'].iloc[0]}\")\n",[30,10545,10546,10551,10561,10568,10578,10582,10594,10598,10602,10611,10616,10630,10643,10648,10653,10659,10669,10677,10687,10697,10717,10721,10731,10776,10780,10794,10800,10804,10812,10824,10833,10838,10853,10875,10898,10925,10930,10936,10940,10944,10956,10988,11002],{"__ignoreMap":28},[33,10547,10548],{"class":35,"line":36},[33,10549,10550],{"class":39},"# pip install \"camelot-py[cv]\" pandas\n",[33,10552,10553,10555,10557,10559],{"class":35,"line":43},[33,10554,190],{"class":163},[33,10556,193],{"class":167},[33,10558,164],{"class":163},[33,10560,198],{"class":167},[33,10562,10563,10565],{"class":35,"line":61},[33,10564,164],{"class":163},[33,10566,10567],{"class":167}," 
camelot\n",[33,10569,10570,10572,10574,10576],{"class":35,"line":73},[33,10571,164],{"class":163},[33,10573,492],{"class":167},[33,10575,495],{"class":163},[33,10577,498],{"class":167},[33,10579,10580],{"class":35,"line":88},[33,10581,92],{"emptyLinePlaceholder":91},[33,10583,10584,10586,10588,10590,10592],{"class":35,"line":95},[33,10585,7076],{"class":50},[33,10587,212],{"class":163},[33,10589,215],{"class":167},[33,10591,7083],{"class":54},[33,10593,221],{"class":167},[33,10595,10596],{"class":35,"line":101},[33,10597,92],{"emptyLinePlaceholder":91},[33,10599,10600],{"class":35,"line":171},[33,10601,92],{"emptyLinePlaceholder":91},[33,10603,10604,10606,10609],{"class":35,"line":179},[33,10605,562],{"class":163},[33,10607,10608],{"class":46}," extract_pages_camelot",[33,10610,7637],{"class":167},[33,10612,10613],{"class":35,"line":187},[33,10614,10615],{"class":167},"    path: Path,\n",[33,10617,10618,10621,10623,10625,10628],{"class":35,"line":201},[33,10619,10620],{"class":167},"    pages: ",[33,10622,1053],{"class":50},[33,10624,212],{"class":163},[33,10626,10627],{"class":54}," \"1-end\"",[33,10629,247],{"class":167},[33,10631,10632,10635,10637,10639,10641],{"class":35,"line":206},[33,10633,10634],{"class":167},"    flavor: ",[33,10636,1053],{"class":50},[33,10638,212],{"class":163},[33,10640,9991],{"class":54},[33,10642,247],{"class":167},[33,10644,10645],{"class":35,"line":224},[33,10646,10647],{"class":167},") -> list[pd.DataFrame]:\n",[33,10649,10650],{"class":35,"line":229},[33,10651,10652],{"class":54},"    \"\"\"Extract tables using camelot, one DataFrame per detected table.\"\"\"\n",[33,10654,10655,10657],{"class":35,"line":235},[33,10656,2424],{"class":163},[33,10658,574],{"class":167},[33,10660,10661,10664,10666],{"class":35,"line":250},[33,10662,10663],{"class":167},"        table_list ",[33,10665,242],{"class":163},[33,10667,10668],{"class":167}," camelot.read_pdf(\n",[33,10670,10671,10674],{"class":35,"line":266},[33,10672,10673],{"class":50},"   
         str",[33,10675,10676],{"class":167},"(path),\n",[33,10678,10679,10682,10684],{"class":35,"line":290},[33,10680,10681],{"class":238},"            pages",[33,10683,242],{"class":163},[33,10685,10686],{"class":167},"pages,\n",[33,10688,10689,10692,10694],{"class":35,"line":295},[33,10690,10691],{"class":238},"            flavor",[33,10693,242],{"class":163},[33,10695,10696],{"class":167},"flavor,\n",[33,10698,10699,10702,10704,10707,10709,10711,10714],{"class":35,"line":300},[33,10700,10701],{"class":238},"            process_background",[33,10703,242],{"class":163},[33,10705,10706],{"class":167},"(flavor ",[33,10708,1865],{"class":163},[33,10710,9991],{"class":54},[33,10712,10713],{"class":167},"),  ",[33,10715,10716],{"class":39},"# find lines on coloured fills\n",[33,10718,10719],{"class":35,"line":317},[33,10720,5867],{"class":167},[33,10722,10723,10725,10727,10729],{"class":35,"line":332},[33,10724,2449],{"class":163},[33,10726,783],{"class":50},[33,10728,1852],{"class":163},[33,10730,7583],{"class":167},[33,10732,10733,10735,10737,10739,10741,10744,10746,10749,10751,10754,10756,10758,10760,10762,10764,10766,10768,10770,10772,10774],{"class":35,"line":347},[33,10734,4051],{"class":163},[33,10736,7590],{"class":50},[33,10738,602],{"class":167},[33,10740,4059],{"class":163},[33,10742,10743],{"class":54},"\"camelot (",[33,10745,1115],{"class":50},[33,10747,10748],{"class":167},"flavor",[33,10750,1121],{"class":50},[33,10752,10753],{"class":54},") failed on 
",[33,10755,1115],{"class":50},[33,10757,2580],{"class":167},[33,10759,1121],{"class":50},[33,10761,2079],{"class":54},[33,10763,1115],{"class":50},[33,10765,7602],{"class":167},[33,10767,1121],{"class":50},[33,10769,274],{"class":54},[33,10771,1649],{"class":167},[33,10773,190],{"class":163},[33,10775,7613],{"class":167},[33,10777,10778],{"class":35,"line":374},[33,10779,92],{"emptyLinePlaceholder":91},[33,10781,10782,10784,10787,10789,10792],{"class":35,"line":397},[33,10783,617],{"class":163},[33,10785,10786],{"class":167}," table_list.n ",[33,10788,1865],{"class":163},[33,10790,10791],{"class":50}," 0",[33,10793,574],{"class":167},[33,10795,10796,10798],{"class":35,"line":653},[33,10797,1659],{"class":163},[33,10799,589],{"class":167},[33,10801,10802],{"class":35,"line":667},[33,10803,92],{"emptyLinePlaceholder":91},[33,10805,10806,10808,10810],{"class":35,"line":675},[33,10807,6183],{"class":167},[33,10809,242],{"class":163},[33,10811,589],{"class":167},[33,10813,10814,10816,10819,10821],{"class":35,"line":689},[33,10815,656],{"class":163},[33,10817,10818],{"class":167}," t ",[33,10820,662],{"class":163},[33,10822,10823],{"class":167}," table_list:\n",[33,10825,10826,10828,10830],{"class":35,"line":703},[33,10827,7930],{"class":167},[33,10829,242],{"class":163},[33,10831,10832],{"class":167}," t.df.copy()\n",[33,10834,10835],{"class":35,"line":714},[33,10836,10837],{"class":39},"        # First row is always the header in camelot's raw output\n",[33,10839,10840,10843,10845,10848,10850],{"class":35,"line":723},[33,10841,10842],{"class":167},"        df.columns ",[33,10844,242],{"class":163},[33,10846,10847],{"class":167}," 
df.iloc[",[33,10849,748],{"class":50},[33,10851,10852],{"class":167},"].str.strip()\n",[33,10854,10855,10857,10859,10861,10863,10866,10869,10871,10873],{"class":35,"line":754},[33,10856,7930],{"class":167},[33,10858,242],{"class":163},[33,10860,10847],{"class":167},[33,10862,734],{"class":50},[33,10864,10865],{"class":167},":].reset_index(",[33,10867,10868],{"class":238},"drop",[33,10870,242],{"class":163},[33,10872,855],{"class":50},[33,10874,221],{"class":167},[33,10876,10877,10880,10882,10885,10887,10889,10892,10894,10896],{"class":35,"line":771},[33,10878,10879],{"class":167},"        df.replace(",[33,10881,3198],{"class":54},[33,10883,10884],{"class":167},", pd.",[33,10886,8018],{"class":50},[33,10888,365],{"class":167},[33,10890,10891],{"class":238},"inplace",[33,10893,242],{"class":163},[33,10895,855],{"class":50},[33,10897,221],{"class":167},[33,10899,10900,10903,10906,10908,10910,10913,10916,10918,10920,10923],{"class":35,"line":777},[33,10901,10902],{"class":167},"        df[",[33,10904,10905],{"class":54},"\"_accuracy\"",[33,10907,763],{"class":167},[33,10909,242],{"class":163},[33,10911,10912],{"class":167}," t.parsing_report.get(",[33,10914,10915],{"class":54},"\"accuracy\"",[33,10917,365],{"class":167},[33,10919,571],{"class":50},[33,10921,10922],{"class":167},")  ",[33,10924,10396],{"class":39},[33,10926,10927],{"class":35,"line":788},[33,10928,10929],{"class":167},"        
frames.append(df)\n",[33,10931,10932,10934],{"class":35,"line":804},[33,10933,1332],{"class":163},[33,10935,6065],{"class":167},[33,10937,10938],{"class":35,"line":809},[33,10939,92],{"emptyLinePlaceholder":91},[33,10941,10942],{"class":35,"line":819},[33,10943,92],{"emptyLinePlaceholder":91},[33,10945,10946,10948,10950,10952,10954],{"class":35,"line":829},[33,10947,2491],{"class":163},[33,10949,2494],{"class":50},[33,10951,2497],{"class":163},[33,10953,2500],{"class":54},[33,10955,574],{"class":167},[33,10957,10958,10960,10962,10965,10967,10969,10972,10974,10977,10979,10981,10983,10986],{"class":35,"line":834},[33,10959,584],{"class":167},[33,10961,242],{"class":163},[33,10963,10964],{"class":167}," extract_pages_camelot(",[33,10966,7076],{"class":50},[33,10968,365],{"class":167},[33,10970,10971],{"class":238},"pages",[33,10973,242],{"class":163},[33,10975,10976],{"class":54},"\"1-5\"",[33,10978,365],{"class":167},[33,10980,10748],{"class":238},[33,10982,242],{"class":163},[33,10984,10985],{"class":54},"\"lattice\"",[33,10987,221],{"class":167},[33,10989,10990,10992,10995,10997,10999],{"class":35,"line":839},[33,10991,656],{"class":163},[33,10993,10994],{"class":167}," i, df ",[33,10996,662],{"class":163},[33,10998,7403],{"class":50},[33,11000,11001],{"class":167},"(frames):\n",[33,11003,11004,11006,11008,11010,11013,11015,11018,11020,11023,11025,11027,11029,11031,11034,11036,11039,11042,11045,11047,11049,11051,11053],{"class":35,"line":860},[33,11005,9414],{"class":50},[33,11007,602],{"class":167},[33,11009,4059],{"class":163},[33,11011,11012],{"class":54},"\"Table ",[33,11014,1115],{"class":50},[33,11016,11017],{"class":167},"i ",[33,11019,1811],{"class":163},[33,11021,11022],{"class":50}," 1}",[33,11024,2079],{"class":54},[33,11026,1115],{"class":50},[33,11028,9426],{"class":167},[33,11030,1121],{"class":50},[33,11032,11033],{"class":54},", 
accuracy=",[33,11035,1115],{"class":50},[33,11037,11038],{"class":167},"df[",[33,11040,11041],{"class":54},"'_accuracy'",[33,11043,11044],{"class":167},"].iloc[",[33,11046,748],{"class":50},[33,11048,9546],{"class":167},[33,11050,1121],{"class":50},[33,11052,274],{"class":54},[33,11054,221],{"class":167},[14,11056,11057,11058,11061,11062,11065,11066,3035],{},"camelot's ",[30,11059,11060],{},"parsing_report[\"accuracy\"]"," score runs from 0 to 100. Scores below 80 on lattice mode indicate that line detection missed borders — try increasing ",[30,11063,11064],{},"line_scale"," from the default 15 to 40 or 60, or switch to ",[30,11067,11068],{},"flavor=\"stream\"",[2540,11070,2547,11073,2547,11076,2547,11079,2547,2547,11093,2547,11096,2547,11101,2547,11106,2547,11110,2547,11113,2547,11117,2547,11120,2547,11124,2547,2547,11129,2547,2547,11134,2547,11137,2547,11141,2547,11143,2547,2547,11146,2547,2547,11150,2547,11152,2547,11157,2547,11160,2547,11167,2547,11171,2547,11174,2547,11178,2547,11181,2547,2547,11185,2547,2547,11189,2547,11191,2547,11196,2547,11200,2547,2547,11204,2547,2547,11208,2547,11211,2547,11216,2547,11220,2547,11223,2547,11226,2547,11229],{"viewBox":11071,"role":2543,"ariaLabel":11072,"xmlns":2545,"style":2546},"0 0 760 320","Pipeline from PDF pages through extraction to per-page DataFrames through concat and clean to a tidy DataFrame",[2549,11074,11075],{},"PDF to pandas DataFrame Pipeline",[2553,11077,11078],{},"Five-stage flow: PDF pages are fed into an extractor (pdfplumber or camelot), which produces per-page DataFrames, which are concatenated and cleaned into a single tidy DataFrame ready for analysis or 
export.",[2557,11080,2559,11081,2559,11088,2547],{},[2561,11082,2564,11084,2564,11086,2559],{"id":11083,"x1":748,"y1":748,"x2":734,"y2":748},"pdf-pandas-grad",[2566,11085],{"offset":748,"style":2568},[2566,11087],{"offset":734,"style":2571},[2573,11089,2564,11091,2559],{"id":11090,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"pdf-pandas-arrow",[2580,11092],{"d":2582,"fill":2583},[2585,11094],{"x":2591,"y":2597,"width":11095,"height":2610,"rx":3545,"fill":2592,"stroke":2593,"style":2594},"118",[2000,11097,11100],{"x":11098,"y":11099,"fill":2599,"style":2600},"67","108","PDF Pages",[2585,11102],{"x":11103,"y":11095,"width":2630,"height":11104,"rx":1503,"fill":2615,"stroke":2593,"style":11105},"22","28","stroke-width:1",[2000,11107,11109],{"x":11098,"y":11108,"fill":2583,"style":2605},"136","page 1",[2585,11111],{"x":11103,"y":11112,"width":2630,"height":11104,"rx":1503,"fill":2615,"stroke":2593,"style":11105},"152",[2000,11114,11116],{"x":11098,"y":11115,"fill":2583,"style":2605},"170","page 2",[2585,11118],{"x":11103,"y":11119,"width":2630,"height":11104,"rx":1503,"fill":2615,"stroke":2593,"style":11105},"186",[2000,11121,11123],{"x":11098,"y":11122,"fill":2583,"style":2605},"204","page N",[2000,11125,11128],{"x":11098,"y":11126,"fill":2583,"style":11127},"228","text-anchor:middle;font-size:10px","…",[35,11130],{"x1":11131,"y1":2610,"x2":11132,"y2":2610,"stroke":2583,"markerEnd":11133,"style":2594},"126","158","url(#pdf-pandas-arrow)",[2585,11135],{"x":11132,"y":2679,"width":11095,"height":2650,"rx":3545,"fill":11136,"stroke":2593,"style":2594},"url(#pdf-pandas-grad)",[2000,11138,11140],{"x":11139,"y":2635,"fill":2599,"style":2600},"217","Extractor",[2000,11142,943],{"x":11139,"y":2639,"fill":2599,"style":2605},[2000,11144,11145],{"x":11139,"y":2643,"fill":2583,"style":2605},"or 
camelot",[35,11147],{"x1":11148,"y1":2610,"x2":11149,"y2":2610,"stroke":2583,"markerEnd":11133,"style":2594},"276","308",[2585,11151],{"x":11149,"y":2597,"width":2588,"height":2610,"rx":3545,"fill":2592,"stroke":2593,"style":2594},[2000,11153,11156],{"x":11154,"y":11155,"fill":2599,"style":2685},"373","104","Per-page",[2000,11158,11159],{"x":11154,"y":2589,"fill":2599,"style":2685},"DataFrames",[2585,11161],{"x":11162,"y":2588,"width":11163,"height":11164,"rx":1503,"fill":11165,"stroke":11166,"style":11105},"322","102","26","#dbeafe","#2563eb",[2000,11168,11170],{"x":11154,"y":11169,"fill":2599,"style":2605},"147","df_page_1",[2585,11172],{"x":11162,"y":11173,"width":11163,"height":11164,"rx":1503,"fill":11165,"stroke":11166,"style":11105},"162",[2000,11175,11177],{"x":11154,"y":11176,"fill":2599,"style":2605},"179","df_page_2",[2585,11179],{"x":11162,"y":11180,"width":11163,"height":11164,"rx":1503,"fill":11165,"stroke":11166,"style":11105},"194",[2000,11182,11184],{"x":11154,"y":11183,"fill":2599,"style":2605},"211","df_page_N",[35,11186],{"x1":11187,"y1":2610,"x2":11188,"y2":2610,"stroke":2583,"markerEnd":11133,"style":2594},"438","468",[2585,11190],{"x":11188,"y":2679,"width":11095,"height":2650,"rx":3545,"fill":11136,"stroke":2593,"style":2594},[2000,11192,11195],{"x":11193,"y":11194,"fill":2599,"style":2600},"527","148","Concat",[2000,11197,11199],{"x":11193,"y":11198,"fill":2599,"style":2605},"166","+ Clean",[2000,11201,11203],{"x":11193,"y":11202,"fill":2583,"style":2605},"182","dedup 
headers",[35,11205],{"x1":11206,"y1":2610,"x2":11207,"y2":2610,"stroke":2583,"markerEnd":11133,"style":2594},"586","618",[2585,11209],{"x":11207,"y":2650,"width":2588,"height":2589,"rx":3545,"fill":2592,"stroke":11166,"style":11210},"stroke-width:2",[2000,11212,11215],{"x":11213,"y":11214,"fill":2599,"style":2600},"683","128","Tidy",[2000,11217,11219],{"x":11213,"y":11218,"fill":2599,"style":2600},"146","DataFrame",[2000,11221,11222],{"x":11213,"y":2639,"fill":2583,"style":2605},"correct dtypes",[2000,11224,11225],{"x":11213,"y":2643,"fill":2583,"style":2605},"clean strings",[2000,11227,11228],{"x":11213,"y":2611,"fill":11166,"style":2605},"analysis-ready",[2000,11230,11232],{"x":2626,"y":11231,"fill":2583,"style":2605},"290","\nEach page extracted independently; per-page frames concatenated then dtype-normalized\n",[18,11234,11236],{"id":11235},"step-3-concatenate-per-page-tables-and-drop-repeated-headers","Step 3: Concatenate Per-Page Tables and Drop Repeated Headers",[14,11238,11239],{},"PDFs that paginate a single logical table repeat the header row at the top of each page. 
Concatenating naively pollutes the DataFrame with header strings as data rows.",[23,11241,11243],{"className":126,"code":11242,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\n\ndef concat_drop_headers(frames: list[pd.DataFrame]) -> pd.DataFrame:\n    \"\"\"\n    Concatenate per-page DataFrames and remove repeated header rows.\n    Assumes all frames share the same column set (or a superset).\n    \"\"\"\n    if not frames:\n        return pd.DataFrame()\n\n    canonical_cols = frames[0].columns.tolist()\n\n    cleaned: list[pd.DataFrame] = []\n    for df in frames:\n        # A row is a repeated header if every non-null cell matches its column name\n        non_audit = [c for c in df.columns if not c.startswith(\"_\")]\n        header_mask = df[non_audit].apply(\n            lambda row: [\n                str(v).strip() == str(col).strip()\n                for v, col in zip(row, non_audit)\n            ],\n            axis=1,\n        ).apply(all)\n        df = df[~header_mask].copy()\n        cleaned.append(df)\n\n    combined = pd.concat(cleaned, ignore_index=True)\n    return combined\n\n\nif __name__ == \"__main__\":\n    # Example using pdfplumber frames from Step 2\n    from pathlib import Path\n    frames = extract_pages_pdfplumber(Path(\"data\u002Freport.pdf\"))\n    combined = concat_drop_headers(frames)\n    print(combined.shape)\n    
print(combined.head())\n",[30,11244,11245,11249,11259,11269,11273,11277,11287,11291,11296,11301,11305,11313,11319,11323,11337,11341,11349,11359,11364,11391,11400,11407,11419,11432,11437,11447,11456,11469,11473,11477,11493,11499,11503,11507,11519,11524,11534,11547,11556,11563],{"__ignoreMap":28},[33,11246,11247],{"class":35,"line":36},[33,11248,8895],{"class":39},[33,11250,11251,11253,11255,11257],{"class":35,"line":43},[33,11252,190],{"class":163},[33,11254,193],{"class":167},[33,11256,164],{"class":163},[33,11258,198],{"class":167},[33,11260,11261,11263,11265,11267],{"class":35,"line":61},[33,11262,164],{"class":163},[33,11264,492],{"class":167},[33,11266,495],{"class":163},[33,11268,498],{"class":167},[33,11270,11271],{"class":35,"line":73},[33,11272,92],{"emptyLinePlaceholder":91},[33,11274,11275],{"class":35,"line":88},[33,11276,92],{"emptyLinePlaceholder":91},[33,11278,11279,11281,11284],{"class":35,"line":95},[33,11280,562],{"class":163},[33,11282,11283],{"class":46}," concat_drop_headers",[33,11285,11286],{"class":167},"(frames: list[pd.DataFrame]) -> pd.DataFrame:\n",[33,11288,11289],{"class":35,"line":101},[33,11290,7673],{"class":54},[33,11292,11293],{"class":35,"line":171},[33,11294,11295],{"class":54},"    Concatenate per-page DataFrames and remove repeated header rows.\n",[33,11297,11298],{"class":35,"line":179},[33,11299,11300],{"class":54},"    Assumes all frames share the same column set (or a superset).\n",[33,11302,11303],{"class":35,"line":187},[33,11304,7673],{"class":54},[33,11306,11307,11309,11311],{"class":35,"line":201},[33,11308,617],{"class":163},[33,11310,620],{"class":163},[33,11312,816],{"class":167},[33,11314,11315,11317],{"class":35,"line":206},[33,11316,1659],{"class":163},[33,11318,7721],{"class":167},[33,11320,11321],{"class":35,"line":224},[33,11322,92],{"emptyLinePlaceholder":91},[33,11324,11325,11328,11330,11332,11334],{"class":35,"line":229},[33,11326,11327],{"class":167},"    canonical_cols 
",[33,11329,242],{"class":163},[33,11331,7749],{"class":167},[33,11333,748],{"class":50},[33,11335,11336],{"class":167},"].columns.tolist()\n",[33,11338,11339],{"class":35,"line":235},[33,11340,92],{"emptyLinePlaceholder":91},[33,11342,11343,11345,11347],{"class":35,"line":250},[33,11344,7799],{"class":167},[33,11346,242],{"class":163},[33,11348,589],{"class":167},[33,11350,11351,11353,11355,11357],{"class":35,"line":266},[33,11352,656],{"class":163},[33,11354,7810],{"class":167},[33,11356,662],{"class":163},[33,11358,816],{"class":167},[33,11360,11361],{"class":35,"line":290},[33,11362,11363],{"class":39},"        # A row is a repeated header if every non-null cell matches its column name\n",[33,11365,11366,11369,11371,11373,11375,11377,11379,11381,11383,11385,11387,11389],{"class":35,"line":295},[33,11367,11368],{"class":167},"        non_audit ",[33,11370,242],{"class":163},[33,11372,7740],{"class":167},[33,11374,6124],{"class":163},[33,11376,7486],{"class":167},[33,11378,662],{"class":163},[33,11380,7837],{"class":167},[33,11382,2491],{"class":163},[33,11384,620],{"class":163},[33,11386,7761],{"class":167},[33,11388,7764],{"class":54},[33,11390,7767],{"class":167},[33,11392,11393,11395,11397],{"class":35,"line":300},[33,11394,7852],{"class":167},[33,11396,242],{"class":163},[33,11398,11399],{"class":167}," df[non_audit].apply(\n",[33,11401,11402,11404],{"class":35,"line":317},[33,11403,7862],{"class":163},[33,11405,11406],{"class":167}," row: [\n",[33,11408,11409,11411,11413,11415,11417],{"class":35,"line":332},[33,11410,7879],{"class":50},[33,11412,7882],{"class":167},[33,11414,1865],{"class":163},[33,11416,7887],{"class":50},[33,11418,7890],{"class":167},[33,11420,11421,11423,11425,11427,11429],{"class":35,"line":347},[33,11422,692],{"class":163},[33,11424,7897],{"class":167},[33,11426,662],{"class":163},[33,11428,7902],{"class":50},[33,11430,11431],{"class":167},"(row, non_audit)\n",[33,11433,11434],{"class":35,"line":374},[33,11435,11436],{"class":167},"    
        ],\n",[33,11438,11439,11441,11443,11445],{"class":35,"line":397},[33,11440,7915],{"class":238},[33,11442,242],{"class":163},[33,11444,734],{"class":50},[33,11446,247],{"class":167},[33,11448,11449,11452,11454],{"class":35,"line":653},[33,11450,11451],{"class":167},"        ).apply(",[33,11453,7868],{"class":50},[33,11455,221],{"class":167},[33,11457,11458,11460,11462,11464,11466],{"class":35,"line":667},[33,11459,7930],{"class":167},[33,11461,242],{"class":163},[33,11463,7935],{"class":167},[33,11465,7938],{"class":163},[33,11467,11468],{"class":167},"header_mask].copy()\n",[33,11470,11471],{"class":35,"line":675},[33,11472,8043],{"class":167},[33,11474,11475],{"class":35,"line":689},[33,11476,92],{"emptyLinePlaceholder":91},[33,11478,11479,11481,11483,11485,11487,11489,11491],{"class":35,"line":703},[33,11480,842],{"class":167},[33,11482,242],{"class":163},[33,11484,8061],{"class":167},[33,11486,850],{"class":238},[33,11488,242],{"class":163},[33,11490,855],{"class":50},[33,11492,221],{"class":167},[33,11494,11495,11497],{"class":35,"line":714},[33,11496,1332],{"class":163},[33,11498,8098],{"class":167},[33,11500,11501],{"class":35,"line":723},[33,11502,92],{"emptyLinePlaceholder":91},[33,11504,11505],{"class":35,"line":754},[33,11506,92],{"emptyLinePlaceholder":91},[33,11508,11509,11511,11513,11515,11517],{"class":35,"line":771},[33,11510,2491],{"class":163},[33,11512,2494],{"class":50},[33,11514,2497],{"class":163},[33,11516,2500],{"class":54},[33,11518,574],{"class":167},[33,11520,11521],{"class":35,"line":777},[33,11522,11523],{"class":39},"    # Example using pdfplumber frames from Step 2\n",[33,11525,11526,11528,11530,11532],{"class":35,"line":788},[33,11527,3878],{"class":163},[33,11529,193],{"class":167},[33,11531,164],{"class":163},[33,11533,198],{"class":167},[33,11535,11536,11538,11540,11543,11545],{"class":35,"line":804},[33,11537,584],{"class":167},[33,11539,242],{"class":163},[33,11541,11542],{"class":167}," 
extract_pages_pdfplumber(Path(",[33,11544,7083],{"class":54},[33,11546,371],{"class":167},[33,11548,11549,11551,11553],{"class":35,"line":809},[33,11550,842],{"class":167},[33,11552,242],{"class":163},[33,11554,11555],{"class":167}," concat_drop_headers(frames)\n",[33,11557,11558,11560],{"class":35,"line":819},[33,11559,7268],{"class":50},[33,11561,11562],{"class":167},"(combined.shape)\n",[33,11564,11565,11567],{"class":35,"line":829},[33,11566,7268],{"class":50},[33,11568,11569],{"class":167},"(combined.head())\n",[14,11571,11572,11573,11576,11577,8363,11579,11581],{},"For tables where the header row is not an exact string match — for example, the PDF adds a page number inside the header cell — apply a looser match: ",[30,11574,11575],{},"row.str.contains(col_keyword)",". The variant with ",[30,11578,8362],{},[30,11580,8366],{}," re-sequences the integer index across pages.",[18,11583,11585],{"id":11584},"step-4-forward-fill-merged-cells","Step 4: Forward-Fill Merged Cells",[14,11587,11588,11589,3035],{},"PDF tables frequently use merged cells to avoid repeating a group label (e.g., a date that applies to several rows). 
After extraction each merged cell's value appears only in the first row; the rest are empty or ",[30,11590,8884],{},[23,11592,11594],{"className":126,"code":11593,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef forward_fill_groups(df: pd.DataFrame, group_cols: list[str]) -> pd.DataFrame:\n    \"\"\"\n    Forward-fill a list of columns that represent merged-cell group labels.\n    Only fills within a run of NaN values; stops at the next explicit value.\n    \"\"\"\n    df = df.copy()\n    for col in group_cols:\n        if col in df.columns:\n            # Replace empty strings with NaN first\n            df[col] = df[col].replace(\"\", pd.NA)\n            df[col] = df[col].ffill()\n    return df\n\n\nif __name__ == \"__main__\":\n    sample = pd.DataFrame({\n        \"Region\":   [\"North\", pd.NA, pd.NA, \"South\", pd.NA],\n        \"Month\":    [\"Jan\", \"Feb\", \"Mar\", \"Jan\", \"Feb\"],\n        \"Revenue\":  [1000, 1200, 900, 1500, 1100],\n    })\n    print(forward_fill_groups(sample, group_cols=[\"Region\"]))\n",[30,11595,11596,11600,11610,11614,11618,11633,11637,11642,11647,11651,11660,11671,11681,11686,11704,11713,11720,11724,11728,11740,11750,11780,11811,11844,11849],{"__ignoreMap":28},[33,11597,11598],{"class":35,"line":36},[33,11599,8895],{"class":39},[33,11601,11602,11604,11606,11608],{"class":35,"line":43},[33,11603,164],{"class":163},[33,11605,492],{"class":167},[33,11607,495],{"class":163},[33,11609,498],{"class":167},[33,11611,11612],{"class":35,"line":61},[33,11613,92],{"emptyLinePlaceholder":91},[33,11615,11616],{"class":35,"line":73},[33,11617,92],{"emptyLinePlaceholder":91},[33,11619,11620,11622,11625,11628,11630],{"class":35,"line":88},[33,11621,562],{"class":163},[33,11623,11624],{"class":46}," forward_fill_groups",[33,11626,11627],{"class":167},"(df: pd.DataFrame, group_cols: list[",[33,11629,1053],{"class":50},[33,11631,11632],{"class":167},"]) -> 
pd.DataFrame:\n",[33,11634,11635],{"class":35,"line":95},[33,11636,7673],{"class":54},[33,11638,11639],{"class":35,"line":101},[33,11640,11641],{"class":54},"    Forward-fill a list of columns that represent merged-cell group labels.\n",[33,11643,11644],{"class":35,"line":171},[33,11645,11646],{"class":54},"    Only fills within a run of NaN values; stops at the next explicit value.\n",[33,11648,11649],{"class":35,"line":179},[33,11650,7673],{"class":54},[33,11652,11653,11655,11657],{"class":35,"line":187},[33,11654,4025],{"class":167},[33,11656,242],{"class":163},[33,11658,11659],{"class":167}," df.copy()\n",[33,11661,11662,11664,11666,11668],{"class":35,"line":201},[33,11663,656],{"class":163},[33,11665,7985],{"class":167},[33,11667,662],{"class":163},[33,11669,11670],{"class":167}," group_cols:\n",[33,11672,11673,11675,11677,11679],{"class":35,"line":206},[33,11674,8221],{"class":163},[33,11676,7985],{"class":167},[33,11678,662],{"class":163},[33,11680,8005],{"class":167},[33,11682,11683],{"class":35,"line":224},[33,11684,11685],{"class":39},"            # Replace empty strings with NaN first\n",[33,11687,11688,11691,11693,11696,11698,11700,11702],{"class":35,"line":229},[33,11689,11690],{"class":167},"            df[col] ",[33,11692,242],{"class":163},[33,11694,11695],{"class":167}," df[col].replace(",[33,11697,3198],{"class":54},[33,11699,10884],{"class":167},[33,11701,8018],{"class":50},[33,11703,221],{"class":167},[33,11705,11706,11708,11710],{"class":35,"line":235},[33,11707,11690],{"class":167},[33,11709,242],{"class":163},[33,11711,11712],{"class":167}," df[col].ffill()\n",[33,11714,11715,11717],{"class":35,"line":250},[33,11716,1332],{"class":163},[33,11718,11719],{"class":167}," 
df\n",[33,11721,11722],{"class":35,"line":266},[33,11723,92],{"emptyLinePlaceholder":91},[33,11725,11726],{"class":35,"line":290},[33,11727,92],{"emptyLinePlaceholder":91},[33,11729,11730,11732,11734,11736,11738],{"class":35,"line":295},[33,11731,2491],{"class":163},[33,11733,2494],{"class":50},[33,11735,2497],{"class":163},[33,11737,2500],{"class":54},[33,11739,574],{"class":167},[33,11741,11742,11745,11747],{"class":35,"line":300},[33,11743,11744],{"class":167},"    sample ",[33,11746,242],{"class":163},[33,11748,11749],{"class":167}," pd.DataFrame({\n",[33,11751,11752,11755,11758,11761,11763,11765,11767,11769,11771,11774,11776,11778],{"class":35,"line":317},[33,11753,11754],{"class":54},"        \"Region\"",[33,11756,11757],{"class":167},":   [",[33,11759,11760],{"class":54},"\"North\"",[33,11762,10884],{"class":167},[33,11764,8018],{"class":50},[33,11766,10884],{"class":167},[33,11768,8018],{"class":50},[33,11770,365],{"class":167},[33,11772,11773],{"class":54},"\"South\"",[33,11775,10884],{"class":167},[33,11777,8018],{"class":50},[33,11779,8935],{"class":167},[33,11781,11782,11785,11788,11791,11793,11796,11798,11801,11803,11805,11807,11809],{"class":35,"line":332},[33,11783,11784],{"class":54},"        \"Month\"",[33,11786,11787],{"class":167},":    [",[33,11789,11790],{"class":54},"\"Jan\"",[33,11792,365],{"class":167},[33,11794,11795],{"class":54},"\"Feb\"",[33,11797,365],{"class":167},[33,11799,11800],{"class":54},"\"Mar\"",[33,11802,365],{"class":167},[33,11804,11790],{"class":54},[33,11806,365],{"class":167},[33,11808,11795],{"class":54},[33,11810,8935],{"class":167},[33,11812,11813,11816,11819,11822,11824,11827,11829,11832,11834,11837,11839,11842],{"class":35,"line":347},[33,11814,11815],{"class":54},"        \"Revenue\"",[33,11817,11818],{"class":167},":  
[",[33,11820,11821],{"class":50},"1000",[33,11823,365],{"class":167},[33,11825,11826],{"class":50},"1200",[33,11828,365],{"class":167},[33,11830,11831],{"class":50},"900",[33,11833,365],{"class":167},[33,11835,11836],{"class":50},"1500",[33,11838,365],{"class":167},[33,11840,11841],{"class":50},"1100",[33,11843,8935],{"class":167},[33,11845,11846],{"class":35,"line":374},[33,11847,11848],{"class":167},"    })\n",[33,11850,11851,11853,11856,11859,11861,11863,11866],{"class":35,"line":397},[33,11852,7268],{"class":50},[33,11854,11855],{"class":167},"(forward_fill_groups(sample, ",[33,11857,11858],{"class":238},"group_cols",[33,11860,242],{"class":163},[33,11862,8309],{"class":167},[33,11864,11865],{"class":54},"\"Region\"",[33,11867,7211],{"class":167},[14,11869,11870,11871,11874,11875,11878],{},"Apply ",[30,11872,11873],{},"forward_fill_groups"," immediately after ",[30,11876,11877],{},"concat_drop_headers",", before any dtype coercion.",[18,11880,11882],{"id":11881},"step-5-normalize-dtypes-numbers-dates-currency","Step 5: Normalize Dtypes — Numbers, Dates, Currency",[14,11884,11885,11886,11889,11890,11893,11894,11897,11898,11901],{},"Extracted PDF text is always ",[30,11887,11888],{},"object"," dtype. Numeric columns contain comma-formatted strings (",[30,11891,11892],{},"\"1,250.00\"","), currency symbols (",[30,11895,11896],{},"\"$1,250.00\"","), and percentage suffixes (",[30,11899,11900],{},"\"12.5%\"","). 
Date columns contain free-form strings.",[23,11903,11905],{"className":126,"code":11904,"language":47,"meta":28,"style":28},"# pip install pandas\nimport re\nimport pandas as pd\n\n\ndef clean_currency(series: pd.Series) -> pd.Series:\n    \"\"\"Strip currency symbols, commas, and parentheses (negatives) → float.\"\"\"\n    s = series.astype(str).str.strip()\n    s = s.str.replace(r\"[$€£¥,\\s]\", \"\", regex=True)   # remove symbols and commas\n    s = s.str.replace(r\"\\((.+)\\)\", r\"-\\1\", regex=True)  # (1250.00) → -1250.00\n    s = s.str.replace(r\"%$\", \"\", regex=True)             # strip trailing %\n    return pd.to_numeric(s, errors=\"coerce\")\n\n\ndef coerce_dtypes(df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"\n    Attempt automatic dtype normalization on all columns:\n    - Columns with >70 % parseable numeric values → float\n    - Columns with >70 % parseable dates → datetime\n    \"\"\"\n    df = df.copy()\n    for col in df.columns:\n        if col.startswith(\"_\"):\n            continue  # skip audit columns\n\n        series = df[col].copy()\n\n        # Try numeric (including currency-formatted strings)\n        numeric_candidate = clean_currency(series)\n        if numeric_candidate.notna().mean() > 0.70:\n            df[col] = numeric_candidate\n            continue\n\n        # Try datetime\n        try:\n            date_candidate = pd.to_datetime(series, infer_datetime_format=True, errors=\"coerce\")\n            if date_candidate.notna().mean() > 0.70:\n                df[col] = date_candidate\n                continue\n        except Exception:\n            pass\n\n        # Otherwise strip leading\u002Ftrailing whitespace and keep as string\n        df[col] = series.astype(str).str.strip().replace(\"nan\", pd.NA)\n\n    return df\n\n\nif __name__ == \"__main__\":\n    sample = pd.DataFrame({\n        \"Date\":    [\"2026-01-15\", \"2026-02-01\", \"bad\"],\n        \"Revenue\": [\"$1,250.00\", \"$900.50\", \"n\u002Fa\"],\n        
\"Units\":   [\"120\", \"85\", \"62\"],\n    })\n    cleaned = coerce_dtypes(sample)\n    print(cleaned.dtypes)\n    print(cleaned)\n",[30,11906,11907,11911,11918,11928,11932,11936,11946,11951,11966,12004,12058,12094,12110,12114,12118,12128,12132,12137,12142,12147,12151,12159,12169,12179,12187,12191,12201,12205,12210,12220,12234,12243,12247,12251,12256,12262,12289,12302,12311,12316,12324,12328,12332,12337,12360,12364,12370,12374,12378,12390,12398,12420,12441,12463,12467,12477,12484],{"__ignoreMap":28},[33,11908,11909],{"class":35,"line":36},[33,11910,8895],{"class":39},[33,11912,11913,11915],{"class":35,"line":43},[33,11914,164],{"class":163},[33,11916,11917],{"class":167}," re\n",[33,11919,11920,11922,11924,11926],{"class":35,"line":61},[33,11921,164],{"class":163},[33,11923,492],{"class":167},[33,11925,495],{"class":163},[33,11927,498],{"class":167},[33,11929,11930],{"class":35,"line":73},[33,11931,92],{"emptyLinePlaceholder":91},[33,11933,11934],{"class":35,"line":88},[33,11935,92],{"emptyLinePlaceholder":91},[33,11937,11938,11940,11943],{"class":35,"line":95},[33,11939,562],{"class":163},[33,11941,11942],{"class":46}," clean_currency",[33,11944,11945],{"class":167},"(series: pd.Series) -> pd.Series:\n",[33,11947,11948],{"class":35,"line":101},[33,11949,11950],{"class":54},"    \"\"\"Strip currency symbols, commas, and parentheses (negatives) → float.\"\"\"\n",[33,11952,11953,11956,11958,11961,11963],{"class":35,"line":171},[33,11954,11955],{"class":167},"    s ",[33,11957,242],{"class":163},[33,11959,11960],{"class":167}," series.astype(",[33,11962,1053],{"class":50},[33,11964,11965],{"class":167},").str.strip()\n",[33,11967,11968,11970,11972,11975,11978,11980,11983,11985,11987,11989,11991,11994,11996,11998,12001],{"class":35,"line":179},[33,11969,11955],{"class":167},[33,11971,242],{"class":163},[33,11973,11974],{"class":167}," 
s.str.replace(",[33,11976,11977],{"class":163},"r",[33,11979,274],{"class":54},[33,11981,11982],{"class":50},"[$€£¥,\\s]",[33,11984,274],{"class":54},[33,11986,365],{"class":167},[33,11988,3198],{"class":54},[33,11990,365],{"class":167},[33,11992,11993],{"class":238},"regex",[33,11995,242],{"class":163},[33,11997,855],{"class":50},[33,11999,12000],{"class":167},")   ",[33,12002,12003],{"class":39},"# remove symbols and commas\n",[33,12005,12006,12008,12010,12012,12014,12016,12020,12023,12025,12028,12031,12033,12035,12037,12040,12043,12045,12047,12049,12051,12053,12055],{"class":35,"line":187},[33,12007,11955],{"class":167},[33,12009,242],{"class":163},[33,12011,11974],{"class":167},[33,12013,11977],{"class":163},[33,12015,274],{"class":54},[33,12017,12019],{"class":12018},"s691h","\\(",[33,12021,12022],{"class":50},"(.",[33,12024,1811],{"class":163},[33,12026,12027],{"class":50},")",[33,12029,12030],{"class":12018},"\\)",[33,12032,274],{"class":54},[33,12034,365],{"class":167},[33,12036,11977],{"class":163},[33,12038,12039],{"class":54},"\"-",[33,12041,12042],{"class":2076},"\\1",[33,12044,274],{"class":54},[33,12046,365],{"class":167},[33,12048,11993],{"class":238},[33,12050,242],{"class":163},[33,12052,855],{"class":50},[33,12054,10922],{"class":167},[33,12056,12057],{"class":39},"# (1250.00) → -1250.00\n",[33,12059,12060,12062,12064,12066,12068,12071,12074,12076,12078,12080,12082,12084,12086,12088,12091],{"class":35,"line":201},[33,12061,11955],{"class":167},[33,12063,242],{"class":163},[33,12065,11974],{"class":167},[33,12067,11977],{"class":163},[33,12069,12070],{"class":54},"\"%",[33,12072,12073],{"class":50},"$",[33,12075,274],{"class":54},[33,12077,365],{"class":167},[33,12079,3198],{"class":54},[33,12081,365],{"class":167},[33,12083,11993],{"class":238},[33,12085,242],{"class":163},[33,12087,855],{"class":50},[33,12089,12090],{"class":167},")             ",[33,12092,12093],{"class":39},"# strip trailing 
%\n",[33,12095,12096,12098,12101,12103,12105,12108],{"class":35,"line":206},[33,12097,1332],{"class":163},[33,12099,12100],{"class":167}," pd.to_numeric(s, ",[33,12102,8317],{"class":238},[33,12104,242],{"class":163},[33,12106,12107],{"class":54},"\"coerce\"",[33,12109,221],{"class":167},[33,12111,12112],{"class":35,"line":224},[33,12113,92],{"emptyLinePlaceholder":91},[33,12115,12116],{"class":35,"line":229},[33,12117,92],{"emptyLinePlaceholder":91},[33,12119,12120,12122,12125],{"class":35,"line":235},[33,12121,562],{"class":163},[33,12123,12124],{"class":46}," coerce_dtypes",[33,12126,12127],{"class":167},"(df: pd.DataFrame) -> pd.DataFrame:\n",[33,12129,12130],{"class":35,"line":250},[33,12131,7673],{"class":54},[33,12133,12134],{"class":35,"line":266},[33,12135,12136],{"class":54},"    Attempt automatic dtype normalization on all columns:\n",[33,12138,12139],{"class":35,"line":290},[33,12140,12141],{"class":54},"    - Columns with >70 % parseable numeric values → float\n",[33,12143,12144],{"class":35,"line":295},[33,12145,12146],{"class":54},"    - Columns with >70 % parseable dates → datetime\n",[33,12148,12149],{"class":35,"line":300},[33,12150,7673],{"class":54},[33,12152,12153,12155,12157],{"class":35,"line":317},[33,12154,4025],{"class":167},[33,12156,242],{"class":163},[33,12158,11659],{"class":167},[33,12160,12161,12163,12165,12167],{"class":35,"line":332},[33,12162,656],{"class":163},[33,12164,7985],{"class":167},[33,12166,662],{"class":163},[33,12168,8005],{"class":167},[33,12170,12171,12173,12175,12177],{"class":35,"line":347},[33,12172,8221],{"class":163},[33,12174,8226],{"class":167},[33,12176,7764],{"class":54},[33,12178,1737],{"class":167},[33,12180,12181,12184],{"class":35,"line":374},[33,12182,12183],{"class":163},"            continue",[33,12185,12186],{"class":39},"  # skip audit 
columns\n",[33,12188,12189],{"class":35,"line":397},[33,12190,92],{"emptyLinePlaceholder":91},[33,12192,12193,12196,12198],{"class":35,"line":653},[33,12194,12195],{"class":167},"        series ",[33,12197,242],{"class":163},[33,12199,12200],{"class":167}," df[col].copy()\n",[33,12202,12203],{"class":35,"line":667},[33,12204,92],{"emptyLinePlaceholder":91},[33,12206,12207],{"class":35,"line":675},[33,12208,12209],{"class":39},"        # Try numeric (including currency-formatted strings)\n",[33,12211,12212,12215,12217],{"class":35,"line":689},[33,12213,12214],{"class":167},"        numeric_candidate ",[33,12216,242],{"class":163},[33,12218,12219],{"class":167}," clean_currency(series)\n",[33,12221,12222,12224,12227,12229,12232],{"class":35,"line":703},[33,12223,8221],{"class":163},[33,12225,12226],{"class":167}," numeric_candidate.notna().mean() ",[33,12228,6009],{"class":163},[33,12230,12231],{"class":50}," 0.70",[33,12233,574],{"class":167},[33,12235,12236,12238,12240],{"class":35,"line":714},[33,12237,11690],{"class":167},[33,12239,242],{"class":163},[33,12241,12242],{"class":167}," numeric_candidate\n",[33,12244,12245],{"class":35,"line":723},[33,12246,9330],{"class":163},[33,12248,12249],{"class":35,"line":754},[33,12250,92],{"emptyLinePlaceholder":91},[33,12252,12253],{"class":35,"line":771},[33,12254,12255],{"class":39},"        # Try datetime\n",[33,12257,12258,12260],{"class":35,"line":777},[33,12259,670],{"class":163},[33,12261,574],{"class":167},[33,12263,12264,12267,12269,12272,12275,12277,12279,12281,12283,12285,12287],{"class":35,"line":788},[33,12265,12266],{"class":167},"            date_candidate ",[33,12268,242],{"class":163},[33,12270,12271],{"class":167}," pd.to_datetime(series, 
",[33,12273,12274],{"class":238},"infer_datetime_format",[33,12276,242],{"class":163},[33,12278,855],{"class":50},[33,12280,365],{"class":167},[33,12282,8317],{"class":238},[33,12284,242],{"class":163},[33,12286,12107],{"class":54},[33,12288,221],{"class":167},[33,12290,12291,12293,12296,12298,12300],{"class":35,"line":804},[33,12292,5995],{"class":163},[33,12294,12295],{"class":167}," date_candidate.notna().mean() ",[33,12297,6009],{"class":163},[33,12299,12231],{"class":50},[33,12301,574],{"class":167},[33,12303,12304,12306,12308],{"class":35,"line":809},[33,12305,8010],{"class":167},[33,12307,242],{"class":163},[33,12309,12310],{"class":167}," date_candidate\n",[33,12312,12313],{"class":35,"line":819},[33,12314,12315],{"class":163},"                continue\n",[33,12317,12318,12320,12322],{"class":35,"line":829},[33,12319,780],{"class":163},[33,12321,783],{"class":50},[33,12323,574],{"class":167},[33,12325,12326],{"class":35,"line":834},[33,12327,3552],{"class":163},[33,12329,12330],{"class":35,"line":839},[33,12331,92],{"emptyLinePlaceholder":91},[33,12333,12334],{"class":35,"line":860},[33,12335,12336],{"class":39},"        # Otherwise strip leading\u002Ftrailing whitespace and keep as string\n",[33,12338,12339,12342,12344,12346,12348,12351,12354,12356,12358],{"class":35,"line":887},[33,12340,12341],{"class":167},"        df[col] 
",[33,12343,242],{"class":163},[33,12345,11960],{"class":167},[33,12347,1053],{"class":50},[33,12349,12350],{"class":167},").str.strip().replace(",[33,12352,12353],{"class":54},"\"nan\"",[33,12355,10884],{"class":167},[33,12357,8018],{"class":50},[33,12359,221],{"class":167},[33,12361,12362],{"class":35,"line":907},[33,12363,92],{"emptyLinePlaceholder":91},[33,12365,12366,12368],{"class":35,"line":1826},[33,12367,1332],{"class":163},[33,12369,11719],{"class":167},[33,12371,12372],{"class":35,"line":1844},[33,12373,92],{"emptyLinePlaceholder":91},[33,12375,12376],{"class":35,"line":1858},[33,12377,92],{"emptyLinePlaceholder":91},[33,12379,12380,12382,12384,12386,12388],{"class":35,"line":1871},[33,12381,2491],{"class":163},[33,12383,2494],{"class":50},[33,12385,2497],{"class":163},[33,12387,2500],{"class":54},[33,12389,574],{"class":167},[33,12391,12392,12394,12396],{"class":35,"line":1877},[33,12393,11744],{"class":167},[33,12395,242],{"class":163},[33,12397,11749],{"class":167},[33,12399,12400,12403,12405,12408,12410,12413,12415,12418],{"class":35,"line":1883},[33,12401,12402],{"class":54},"        \"Date\"",[33,12404,11787],{"class":167},[33,12406,12407],{"class":54},"\"2026-01-15\"",[33,12409,365],{"class":167},[33,12411,12412],{"class":54},"\"2026-02-01\"",[33,12414,365],{"class":167},[33,12416,12417],{"class":54},"\"bad\"",[33,12419,8935],{"class":167},[33,12421,12422,12424,12427,12429,12431,12434,12436,12439],{"class":35,"line":1915},[33,12423,11815],{"class":54},[33,12425,12426],{"class":167},": [",[33,12428,11896],{"class":54},[33,12430,365],{"class":167},[33,12432,12433],{"class":54},"\"$900.50\"",[33,12435,365],{"class":167},[33,12437,12438],{"class":54},"\"n\u002Fa\"",[33,12440,8935],{"class":167},[33,12442,12443,12446,12448,12451,12453,12456,12458,12461],{"class":35,"line":1926},[33,12444,12445],{"class":54},"        
\"Units\"",[33,12447,11757],{"class":167},[33,12449,12450],{"class":54},"\"120\"",[33,12452,365],{"class":167},[33,12454,12455],{"class":54},"\"85\"",[33,12457,365],{"class":167},[33,12459,12460],{"class":54},"\"62\"",[33,12462,8935],{"class":167},[33,12464,12465],{"class":35,"line":1932},[33,12466,11848],{"class":167},[33,12468,12469,12472,12474],{"class":35,"line":1938},[33,12470,12471],{"class":167},"    cleaned ",[33,12473,242],{"class":163},[33,12475,12476],{"class":167}," coerce_dtypes(sample)\n",[33,12478,12479,12481],{"class":35,"line":1950},[33,12480,7268],{"class":50},[33,12482,12483],{"class":167},"(cleaned.dtypes)\n",[33,12485,12486,12488],{"class":35,"line":1958},[33,12487,7268],{"class":50},[33,12489,12490],{"class":167},"(cleaned)\n",[14,12492,12493],{},"The 70 % threshold means a column is classified as numeric\u002Fdate even if a minority of cells contain free-form notes or error markers. Adjust the threshold upward for cleaner sources.",[14,12495,12496,12497,12499],{},"For a broader treatment of pandas string-cleaning patterns, see ",[940,12498,9599],{"href":9598}," — the same encoding-fix and whitespace-strip techniques apply to text extracted from PDFs.",[18,12501,12503],{"id":12502},"step-6-validate-before-export","Step 6: Validate Before Export",[14,12505,12506],{},"Row-count and dtype checks catch extraction failures early — before downstream code silently processes a truncated or mistyped DataFrame.",[23,12508,12510],{"className":126,"code":12509,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef validate_dataframe(\n    df: pd.DataFrame,\n    expected_cols: list[str],\n    min_rows: int = 1,\n    max_null_ratio: float = 0.30,\n) -> None:\n    \"\"\"Raise or warn if the DataFrame does not meet basic quality expectations.\"\"\"\n    missing = [c for c in expected_cols if c not in df.columns]\n    if missing:\n        raise ValueError(f\"Missing expected columns: {missing}\")\n\n    if df.shape[0] \u003C 
min_rows:\n        raise ValueError(f\"Too few rows: got {df.shape[0]}, expected >= {min_rows}\")\n\n    null_ratio = df[expected_cols].isnull().mean().mean()\n    if null_ratio > max_null_ratio:\n        print(\n            f\"Warning: {null_ratio:.1%} of values are null in expected columns \"\n            f\"(threshold {max_null_ratio:.0%}). Check forward-fill and header dedup.\"\n        )\n\n    print(f\"OK — shape={df.shape}, null_ratio={null_ratio:.1%}\")\n    print(df.dtypes.to_string())\n\n\nif __name__ == \"__main__\":\n    df = pd.DataFrame({\n        \"Date\":    pd.to_datetime([\"2026-01-15\", \"2026-02-01\"]),\n        \"Revenue\": [1250.0, 900.5],\n        \"Units\":   [120, 85],\n    })\n    validate_dataframe(df, expected_cols=[\"Date\", \"Revenue\", \"Units\"], min_rows=1)\n",[30,12511,12512,12516,12526,12530,12534,12543,12548,12557,12570,12584,12592,12597,12625,12631,12654,12658,12672,12708,12712,12722,12734,12740,12761,12781,12785,12789,12821,12828,12832,12836,12848,12856,12872,12888,12903,12907],{"__ignoreMap":28},[33,12513,12514],{"class":35,"line":36},[33,12515,8895],{"class":39},[33,12517,12518,12520,12522,12524],{"class":35,"line":43},[33,12519,164],{"class":163},[33,12521,492],{"class":167},[33,12523,495],{"class":163},[33,12525,498],{"class":167},[33,12527,12528],{"class":35,"line":61},[33,12529,92],{"emptyLinePlaceholder":91},[33,12531,12532],{"class":35,"line":73},[33,12533,92],{"emptyLinePlaceholder":91},[33,12535,12536,12538,12541],{"class":35,"line":88},[33,12537,562],{"class":163},[33,12539,12540],{"class":46}," validate_dataframe",[33,12542,7637],{"class":167},[33,12544,12545],{"class":35,"line":95},[33,12546,12547],{"class":167},"    df: pd.DataFrame,\n",[33,12549,12550,12553,12555],{"class":35,"line":101},[33,12551,12552],{"class":167},"    expected_cols: list[",[33,12554,1053],{"class":50},[33,12556,8935],{"class":167},[33,12558,12559,12562,12564,12566,12568],{"class":35,"line":171},[33,12560,12561],{"class":167},"    min_rows: 
",[33,12563,1059],{"class":50},[33,12565,212],{"class":163},[33,12567,1814],{"class":50},[33,12569,247],{"class":167},[33,12571,12572,12575,12577,12579,12582],{"class":35,"line":179},[33,12573,12574],{"class":167},"    max_null_ratio: ",[33,12576,1720],{"class":50},[33,12578,212],{"class":163},[33,12580,12581],{"class":50}," 0.30",[33,12583,247],{"class":167},[33,12585,12586,12588,12590],{"class":35,"line":187},[33,12587,1617],{"class":167},[33,12589,571],{"class":50},[33,12591,574],{"class":167},[33,12593,12594],{"class":35,"line":201},[33,12595,12596],{"class":54},"    \"\"\"Raise or warn if the DataFrame does not meet basic quality expectations.\"\"\"\n",[33,12598,12599,12601,12603,12605,12607,12609,12611,12614,12616,12618,12620,12622],{"class":35,"line":206},[33,12600,4118],{"class":167},[33,12602,242],{"class":163},[33,12604,7740],{"class":167},[33,12606,6124],{"class":163},[33,12608,7486],{"class":167},[33,12610,662],{"class":163},[33,12612,12613],{"class":167}," expected_cols ",[33,12615,2491],{"class":163},[33,12617,7486],{"class":167},[33,12619,7999],{"class":163},[33,12621,8002],{"class":163},[33,12623,12624],{"class":167}," df.columns]\n",[33,12626,12627,12629],{"class":35,"line":224},[33,12628,617],{"class":163},[33,12630,4139],{"class":167},[33,12632,12633,12635,12637,12639,12641,12644,12646,12648,12650,12652],{"class":35,"line":229},[33,12634,4051],{"class":163},[33,12636,4054],{"class":50},[33,12638,602],{"class":167},[33,12640,4059],{"class":163},[33,12642,12643],{"class":54},"\"Missing expected columns: 
",[33,12645,1115],{"class":50},[33,12647,4157],{"class":167},[33,12649,1121],{"class":50},[33,12651,274],{"class":54},[33,12653,221],{"class":167},[33,12655,12656],{"class":35,"line":235},[33,12657,92],{"emptyLinePlaceholder":91},[33,12659,12660,12662,12664,12666,12668,12670],{"class":35,"line":250},[33,12661,617],{"class":163},[33,12663,9516],{"class":167},[33,12665,748],{"class":50},[33,12667,763],{"class":167},[33,12669,4043],{"class":163},[33,12671,4046],{"class":167},[33,12673,12674,12676,12678,12680,12682,12685,12687,12689,12691,12693,12695,12698,12700,12702,12704,12706],{"class":35,"line":266},[33,12675,4051],{"class":163},[33,12677,4054],{"class":50},[33,12679,602],{"class":167},[33,12681,4059],{"class":163},[33,12683,12684],{"class":54},"\"Too few rows: got ",[33,12686,1115],{"class":50},[33,12688,9541],{"class":167},[33,12690,748],{"class":50},[33,12692,9546],{"class":167},[33,12694,1121],{"class":50},[33,12696,12697],{"class":54},", expected >= ",[33,12699,1115],{"class":50},[33,12701,4078],{"class":167},[33,12703,1121],{"class":50},[33,12705,274],{"class":54},[33,12707,221],{"class":167},[33,12709,12710],{"class":35,"line":290},[33,12711,92],{"emptyLinePlaceholder":91},[33,12713,12714,12717,12719],{"class":35,"line":295},[33,12715,12716],{"class":167},"    null_ratio ",[33,12718,242],{"class":163},[33,12720,12721],{"class":167}," df[expected_cols].isnull().mean().mean()\n",[33,12723,12724,12726,12729,12731],{"class":35,"line":300},[33,12725,617],{"class":163},[33,12727,12728],{"class":167}," null_ratio ",[33,12730,6009],{"class":163},[33,12732,12733],{"class":167}," max_null_ratio:\n",[33,12735,12736,12738],{"class":35,"line":317},[33,12737,9414],{"class":50},[33,12739,7637],{"class":167},[33,12741,12742,12745,12748,12750,12753,12756,12758],{"class":35,"line":332},[33,12743,12744],{"class":163},"            f",[33,12746,12747],{"class":54},"\"Warning: 
",[33,12749,1115],{"class":50},[33,12751,12752],{"class":167},"null_ratio",[33,12754,12755],{"class":163},":.1%",[33,12757,1121],{"class":50},[33,12759,12760],{"class":54}," of values are null in expected columns \"\n",[33,12762,12763,12765,12768,12770,12773,12776,12778],{"class":35,"line":347},[33,12764,12744],{"class":163},[33,12766,12767],{"class":54},"\"(threshold ",[33,12769,1115],{"class":50},[33,12771,12772],{"class":167},"max_null_ratio",[33,12774,12775],{"class":163},":.0%",[33,12777,1121],{"class":50},[33,12779,12780],{"class":54},"). Check forward-fill and header dedup.\"\n",[33,12782,12783],{"class":35,"line":374},[33,12784,5867],{"class":167},[33,12786,12787],{"class":35,"line":397},[33,12788,92],{"emptyLinePlaceholder":91},[33,12790,12791,12793,12795,12797,12800,12802,12804,12806,12809,12811,12813,12815,12817,12819],{"class":35,"line":653},[33,12792,7268],{"class":50},[33,12794,602],{"class":167},[33,12796,4059],{"class":163},[33,12798,12799],{"class":54},"\"OK — shape=",[33,12801,1115],{"class":50},[33,12803,9426],{"class":167},[33,12805,1121],{"class":50},[33,12807,12808],{"class":54},", 
null_ratio=",[33,12810,1115],{"class":50},[33,12812,12752],{"class":167},[33,12814,12755],{"class":163},[33,12816,1121],{"class":50},[33,12818,274],{"class":54},[33,12820,221],{"class":167},[33,12822,12823,12825],{"class":35,"line":667},[33,12824,7268],{"class":50},[33,12826,12827],{"class":167},"(df.dtypes.to_string())\n",[33,12829,12830],{"class":35,"line":675},[33,12831,92],{"emptyLinePlaceholder":91},[33,12833,12834],{"class":35,"line":689},[33,12835,92],{"emptyLinePlaceholder":91},[33,12837,12838,12840,12842,12844,12846],{"class":35,"line":703},[33,12839,2491],{"class":163},[33,12841,2494],{"class":50},[33,12843,2497],{"class":163},[33,12845,2500],{"class":54},[33,12847,574],{"class":167},[33,12849,12850,12852,12854],{"class":35,"line":714},[33,12851,4025],{"class":167},[33,12853,242],{"class":163},[33,12855,11749],{"class":167},[33,12857,12858,12860,12863,12865,12867,12869],{"class":35,"line":723},[33,12859,12402],{"class":54},[33,12861,12862],{"class":167},":    pd.to_datetime([",[33,12864,12407],{"class":54},[33,12866,365],{"class":167},[33,12868,12412],{"class":54},[33,12870,12871],{"class":167},"]),\n",[33,12873,12874,12876,12878,12881,12883,12886],{"class":35,"line":754},[33,12875,11815],{"class":54},[33,12877,12426],{"class":167},[33,12879,12880],{"class":50},"1250.0",[33,12882,365],{"class":167},[33,12884,12885],{"class":50},"900.5",[33,12887,8935],{"class":167},[33,12889,12890,12892,12894,12896,12898,12901],{"class":35,"line":771},[33,12891,12445],{"class":54},[33,12893,11757],{"class":167},[33,12895,2589],{"class":50},[33,12897,365],{"class":167},[33,12899,12900],{"class":50},"85",[33,12902,8935],{"class":167},[33,12904,12905],{"class":35,"line":777},[33,12906,11848],{"class":167},[33,12908,12909,12912,12915,12917,12919,12921,12923,12926,12928,12931,12933,12935,12937,12939],{"class":35,"line":788},[33,12910,12911],{"class":167},"    validate_dataframe(df, 
",[33,12913,12914],{"class":238},"expected_cols",[33,12916,242],{"class":163},[33,12918,8309],{"class":167},[33,12920,7027],{"class":54},[33,12922,365],{"class":167},[33,12924,12925],{"class":54},"\"Revenue\"",[33,12927,365],{"class":167},[33,12929,12930],{"class":54},"\"Units\"",[33,12932,8314],{"class":167},[33,12934,4078],{"class":238},[33,12936,242],{"class":163},[33,12938,734],{"class":50},[33,12940,221],{"class":167},[18,12942,12944],{"id":12943},"edge-cases-variants","Edge Cases & Variants",[424,12946,12948],{"id":12947},"variant-1-pdfplumber-with-text-based-extraction-no-table-grid-detected","Variant 1: pdfplumber with text-based extraction (no table grid detected)",[14,12950,12951,12952,12955,12956,12959],{},"When ",[30,12953,12954],{},"page.extract_tables()"," returns nothing — the PDF has no detectable grid at all — fall back to ",[30,12957,12958],{},"page.extract_text()"," and parse it line by line:",[23,12961,12963],{"className":126,"code":12962,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\n\ndef extract_text_lines(path: Path, separator: str = r\"\\s{2,}\") -> pd.DataFrame:\n    \"\"\"\n    Parse a whitespace-aligned PDF as fixed-width text rows.\n    Works when no table grid is detectable.\n    \"\"\"\n    import re\n    rows: list[list[str]] = []\n    try:\n        with pdfplumber.open(path) as pdf:\n            for page in pdf.pages:\n                text = page.extract_text() or \"\"\n                for line in text.splitlines():\n                    line = line.strip()\n                    if not line:\n                        continue\n                    cells = re.split(separator, line)\n                    rows.append(cells)\n    except Exception as e:\n        raise RuntimeError(f\"Text extraction failed: {e}\") from e\n\n    if not rows:\n        return pd.DataFrame()\n\n    # Use the first non-empty row 
as header if it looks like a header\n    header = rows[0]\n    return pd.DataFrame(rows[1:], columns=header)\n\n\nif __name__ == \"__main__\":\n    df = extract_text_lines(PDF_PATH)\n    print(df.head())\n",[30,12964,12965,12969,12979,12985,12995,12999,13011,13015,13019,13048,13052,13057,13062,13066,13072,13086,13092,13102,13112,13127,13139,13149,13158,13162,13172,13177,13187,13214,13218,13226,13232,13236,13241,13255,13272,13276,13280,13292,13305],{"__ignoreMap":28},[33,12966,12967],{"class":35,"line":36},[33,12968,7041],{"class":39},[33,12970,12971,12973,12975,12977],{"class":35,"line":43},[33,12972,190],{"class":163},[33,12974,193],{"class":167},[33,12976,164],{"class":163},[33,12978,198],{"class":167},[33,12980,12981,12983],{"class":35,"line":61},[33,12982,164],{"class":163},[33,12984,485],{"class":167},[33,12986,12987,12989,12991,12993],{"class":35,"line":73},[33,12988,164],{"class":163},[33,12990,492],{"class":167},[33,12992,495],{"class":163},[33,12994,498],{"class":167},[33,12996,12997],{"class":35,"line":88},[33,12998,92],{"emptyLinePlaceholder":91},[33,13000,13001,13003,13005,13007,13009],{"class":35,"line":95},[33,13002,7076],{"class":50},[33,13004,212],{"class":163},[33,13006,215],{"class":167},[33,13008,7083],{"class":54},[33,13010,221],{"class":167},[33,13012,13013],{"class":35,"line":101},[33,13014,92],{"emptyLinePlaceholder":91},[33,13016,13017],{"class":35,"line":171},[33,13018,92],{"emptyLinePlaceholder":91},[33,13020,13021,13023,13026,13029,13031,13033,13036,13038,13041,13044,13046],{"class":35,"line":179},[33,13022,562],{"class":163},[33,13024,13025],{"class":46}," extract_text_lines",[33,13027,13028],{"class":167},"(path: Path, separator: ",[33,13030,1053],{"class":50},[33,13032,212],{"class":163},[33,13034,13035],{"class":163}," 
r",[33,13037,274],{"class":54},[33,13039,13040],{"class":50},"\\s",[33,13042,13043],{"class":163},"{2,}",[33,13045,274],{"class":54},[33,13047,7668],{"class":167},[33,13049,13050],{"class":35,"line":187},[33,13051,7673],{"class":54},[33,13053,13054],{"class":35,"line":201},[33,13055,13056],{"class":54},"    Parse a whitespace-aligned PDF as fixed-width text rows.\n",[33,13058,13059],{"class":35,"line":206},[33,13060,13061],{"class":54},"    Works when no table grid is detectable.\n",[33,13063,13064],{"class":35,"line":224},[33,13065,7673],{"class":54},[33,13067,13068,13070],{"class":35,"line":229},[33,13069,1627],{"class":163},[33,13071,11917],{"class":167},[33,13073,13074,13077,13079,13082,13084],{"class":35,"line":235},[33,13075,13076],{"class":167},"    rows: list[list[",[33,13078,1053],{"class":50},[33,13080,13081],{"class":167},"]] ",[33,13083,242],{"class":163},[33,13085,589],{"class":167},[33,13087,13088,13090],{"class":35,"line":250},[33,13089,2424],{"class":163},[33,13091,574],{"class":167},[33,13093,13094,13096,13098,13100],{"class":35,"line":266},[33,13095,2191],{"class":163},[33,13097,7123],{"class":167},[33,13099,495],{"class":163},[33,13101,686],{"class":167},[33,13103,13104,13106,13108,13110],{"class":35,"line":290},[33,13105,1793],{"class":163},[33,13107,695],{"class":167},[33,13109,662],{"class":163},[33,13111,700],{"class":167},[33,13113,13114,13117,13119,13122,13124],{"class":35,"line":295},[33,13115,13116],{"class":167},"                text ",[33,13118,242],{"class":163},[33,13120,13121],{"class":167}," page.extract_text() ",[33,13123,7162],{"class":163},[33,13125,13126],{"class":54}," \"\"\n",[33,13128,13129,13131,13134,13136],{"class":35,"line":300},[33,13130,692],{"class":163},[33,13132,13133],{"class":167}," line ",[33,13135,662],{"class":163},[33,13137,13138],{"class":167}," text.splitlines():\n",[33,13140,13141,13144,13146],{"class":35,"line":317},[33,13142,13143],{"class":167},"                    line 
",[33,13145,242],{"class":163},[33,13147,13148],{"class":167}," line.strip()\n",[33,13150,13151,13153,13155],{"class":35,"line":332},[33,13152,717],{"class":163},[33,13154,620],{"class":163},[33,13156,13157],{"class":167}," line:\n",[33,13159,13160],{"class":35,"line":347},[33,13161,7458],{"class":163},[33,13163,13164,13167,13169],{"class":35,"line":374},[33,13165,13166],{"class":167},"                    cells ",[33,13168,242],{"class":163},[33,13170,13171],{"class":167}," re.split(separator, line)\n",[33,13173,13174],{"class":35,"line":397},[33,13175,13176],{"class":167},"                    rows.append(cells)\n",[33,13178,13179,13181,13183,13185],{"class":35,"line":653},[33,13180,2449],{"class":163},[33,13182,783],{"class":50},[33,13184,1852],{"class":163},[33,13186,7583],{"class":167},[33,13188,13189,13191,13193,13195,13197,13200,13202,13204,13206,13208,13210,13212],{"class":35,"line":667},[33,13190,4051],{"class":163},[33,13192,7590],{"class":50},[33,13194,602],{"class":167},[33,13196,4059],{"class":163},[33,13198,13199],{"class":54},"\"Text extraction failed: ",[33,13201,1115],{"class":50},[33,13203,7602],{"class":167},[33,13205,1121],{"class":50},[33,13207,274],{"class":54},[33,13209,1649],{"class":167},[33,13211,190],{"class":163},[33,13213,7613],{"class":167},[33,13215,13216],{"class":35,"line":675},[33,13217,92],{"emptyLinePlaceholder":91},[33,13219,13220,13222,13224],{"class":35,"line":689},[33,13221,617],{"class":163},[33,13223,620],{"class":163},[33,13225,8723],{"class":167},[33,13227,13228,13230],{"class":35,"line":703},[33,13229,1659],{"class":163},[33,13231,7721],{"class":167},[33,13233,13234],{"class":35,"line":714},[33,13235,92],{"emptyLinePlaceholder":91},[33,13237,13238],{"class":35,"line":723},[33,13239,13240],{"class":39},"    # Use the first non-empty row as header if it looks like a header\n",[33,13242,13243,13246,13248,13251,13253],{"class":35,"line":754},[33,13244,13245],{"class":167},"    header 
",[33,13247,242],{"class":163},[33,13249,13250],{"class":167}," rows[",[33,13252,748],{"class":50},[33,13254,9202],{"class":167},[33,13256,13257,13259,13262,13264,13266,13268,13270],{"class":35,"line":771},[33,13258,1332],{"class":163},[33,13260,13261],{"class":167}," pd.DataFrame(rows[",[33,13263,734],{"class":50},[33,13265,737],{"class":167},[33,13267,740],{"class":238},[33,13269,242],{"class":163},[33,13271,7549],{"class":167},[33,13273,13274],{"class":35,"line":777},[33,13275,92],{"emptyLinePlaceholder":91},[33,13277,13278],{"class":35,"line":788},[33,13279,92],{"emptyLinePlaceholder":91},[33,13281,13282,13284,13286,13288,13290],{"class":35,"line":804},[33,13283,2491],{"class":163},[33,13285,2494],{"class":50},[33,13287,2497],{"class":163},[33,13289,2500],{"class":54},[33,13291,574],{"class":167},[33,13293,13294,13296,13298,13301,13303],{"class":35,"line":809},[33,13295,4025],{"class":167},[33,13297,242],{"class":163},[33,13299,13300],{"class":167}," extract_text_lines(",[33,13302,7076],{"class":50},[33,13304,221],{"class":167},[33,13306,13307,13309],{"class":35,"line":819},[33,13308,7268],{"class":50},[33,13310,13311],{"class":167},"(df.head())\n",[424,13313,13315],{"id":13314},"variant-2-camelot-stream-with-explicit-column-coordinates","Variant 2: camelot stream with explicit column coordinates",[14,13317,12951,13318,13320],{},[30,13319,11068],{}," merges adjacent columns, provide explicit x-coordinates measured from the PDF viewer. 
Open the PDF, note the x-positions of column separators (in points, 72 per inch), and pass them directly:",[23,13322,13324],{"className":126,"code":13323,"language":47,"meta":28,"style":28},"# pip install \"camelot-py[cv]\" pandas\nimport camelot\nfrom pathlib import Path\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ntables = camelot.read_pdf(\n    str(PDF_PATH),\n    pages=\"2\",\n    flavor=\"stream\",\n    columns=[\"120,240,360,480,600\"],   # x-coordinates of column separators in pt\n    edge_tol=500,\n    row_tol=2,\n)\ndf = tables[0].df\nprint(df)\n",[30,13325,13326,13330,13336,13346,13350,13362,13366,13375,13386,13398,13410,13428,13440,13451,13455,13470],{"__ignoreMap":28},[33,13327,13328],{"class":35,"line":36},[33,13329,10550],{"class":39},[33,13331,13332,13334],{"class":35,"line":43},[33,13333,164],{"class":163},[33,13335,10567],{"class":167},[33,13337,13338,13340,13342,13344],{"class":35,"line":61},[33,13339,190],{"class":163},[33,13341,193],{"class":167},[33,13343,164],{"class":163},[33,13345,198],{"class":167},[33,13347,13348],{"class":35,"line":73},[33,13349,92],{"emptyLinePlaceholder":91},[33,13351,13352,13354,13356,13358,13360],{"class":35,"line":88},[33,13353,7076],{"class":50},[33,13355,212],{"class":163},[33,13357,215],{"class":167},[33,13359,7083],{"class":54},[33,13361,221],{"class":167},[33,13363,13364],{"class":35,"line":95},[33,13365,92],{"emptyLinePlaceholder":91},[33,13367,13368,13371,13373],{"class":35,"line":101},[33,13369,13370],{"class":167},"tables ",[33,13372,242],{"class":163},[33,13374,10668],{"class":167},[33,13376,13377,13380,13382,13384],{"class":35,"line":171},[33,13378,13379],{"class":50},"    str",[33,13381,602],{"class":167},[33,13383,7076],{"class":50},[33,13385,1506],{"class":167},[33,13387,13388,13391,13393,13396],{"class":35,"line":179},[33,13389,13390],{"class":238},"    
pages",[33,13392,242],{"class":163},[33,13394,13395],{"class":54},"\"2\"",[33,13397,247],{"class":167},[33,13399,13400,13403,13405,13408],{"class":35,"line":187},[33,13401,13402],{"class":238},"    flavor",[33,13404,242],{"class":163},[33,13406,13407],{"class":54},"\"stream\"",[33,13409,247],{"class":167},[33,13411,13412,13415,13417,13419,13422,13425],{"class":35,"line":201},[33,13413,13414],{"class":238},"    columns",[33,13416,242],{"class":163},[33,13418,8309],{"class":167},[33,13420,13421],{"class":54},"\"120,240,360,480,600\"",[33,13423,13424],{"class":167},"],   ",[33,13426,13427],{"class":39},"# x-coordinates of column separators in pt\n",[33,13429,13430,13433,13435,13438],{"class":35,"line":206},[33,13431,13432],{"class":238},"    edge_tol",[33,13434,242],{"class":163},[33,13436,13437],{"class":50},"500",[33,13439,247],{"class":167},[33,13441,13442,13445,13447,13449],{"class":35,"line":224},[33,13443,13444],{"class":238},"    row_tol",[33,13446,242],{"class":163},[33,13448,1533],{"class":50},[33,13450,247],{"class":167},[33,13452,13453],{"class":35,"line":229},[33,13454,221],{"class":167},[33,13456,13457,13460,13462,13465,13467],{"class":35,"line":235},[33,13458,13459],{"class":167},"df ",[33,13461,242],{"class":163},[33,13463,13464],{"class":167}," tables[",[33,13466,748],{"class":50},[33,13468,13469],{"class":167},"].df\n",[33,13471,13472,13475],{"class":35,"line":250},[33,13473,13474],{"class":50},"print",[33,13476,13477],{"class":167},"(df)\n",[424,13479,13481],{"id":13480},"variant-3-mixed-structure-pdf-some-pages-lattice-some-stream","Variant 3: Mixed-structure PDF — some pages lattice, some stream",[23,13483,13485],{"className":126,"code":13484,"language":47,"meta":28,"style":28},"# pip install pdfplumber \"camelot-py[cv]\" pandas\nfrom pathlib import Path\nimport pdfplumber\nimport camelot\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Fmixed.pdf\")\n\n\ndef extract_mixed(path: Path) -> list[pd.DataFrame]:\n    \"\"\"Route each page to the best 
extractor based on its own structure.\"\"\"\n    frames: list[pd.DataFrame] = []\n    try:\n        with pdfplumber.open(path) as pdf:\n            for page_num, page in enumerate(pdf.pages, start=1):\n                has_lines = bool(page.lines or page.rects)\n                if has_lines:\n                    tbls = camelot.read_pdf(\n                        str(path), pages=str(page_num), flavor=\"lattice\"\n                    )\n                    for t in tbls:\n                        df = t.df.copy()\n                        df.columns = df.iloc[0].str.strip()\n                        df = df.iloc[1:].reset_index(drop=True)\n                        frames.append(df)\n                else:\n                    raw_tables = page.extract_tables() or []\n                    for raw in raw_tables:\n                        if raw and len(raw) > 1:\n                            df = pd.DataFrame(raw[1:], columns=raw[0])\n                            frames.append(df)\n    except Exception as e:\n        raise RuntimeError(f\"Mixed extraction failed: {e}\") from e\n    return frames\n",[30,13486,13487,13492,13502,13508,13514,13524,13528,13541,13545,13549,13558,13563,13571,13577,13587,13607,13622,13628,13637,13660,13664,13676,13684,13697,13717,13721,13727,13741,13751,13770,13794,13799,13809,13836],{"__ignoreMap":28},[33,13488,13489],{"class":35,"line":36},[33,13490,13491],{"class":39},"# pip install pdfplumber \"camelot-py[cv]\" 
pandas\n",[33,13493,13494,13496,13498,13500],{"class":35,"line":43},[33,13495,190],{"class":163},[33,13497,193],{"class":167},[33,13499,164],{"class":163},[33,13501,198],{"class":167},[33,13503,13504,13506],{"class":35,"line":61},[33,13505,164],{"class":163},[33,13507,485],{"class":167},[33,13509,13510,13512],{"class":35,"line":73},[33,13511,164],{"class":163},[33,13513,10567],{"class":167},[33,13515,13516,13518,13520,13522],{"class":35,"line":88},[33,13517,164],{"class":163},[33,13519,492],{"class":167},[33,13521,495],{"class":163},[33,13523,498],{"class":167},[33,13525,13526],{"class":35,"line":95},[33,13527,92],{"emptyLinePlaceholder":91},[33,13529,13530,13532,13534,13536,13539],{"class":35,"line":101},[33,13531,7076],{"class":50},[33,13533,212],{"class":163},[33,13535,215],{"class":167},[33,13537,13538],{"class":54},"\"data\u002Fmixed.pdf\"",[33,13540,221],{"class":167},[33,13542,13543],{"class":35,"line":171},[33,13544,92],{"emptyLinePlaceholder":91},[33,13546,13547],{"class":35,"line":179},[33,13548,92],{"emptyLinePlaceholder":91},[33,13550,13551,13553,13556],{"class":35,"line":187},[33,13552,562],{"class":163},[33,13554,13555],{"class":46}," extract_mixed",[33,13557,7362],{"class":167},[33,13559,13560],{"class":35,"line":201},[33,13561,13562],{"class":54},"    \"\"\"Route each page to the best extractor based on its own 
structure.\"\"\"\n",[33,13564,13565,13567,13569],{"class":35,"line":206},[33,13566,6183],{"class":167},[33,13568,242],{"class":163},[33,13570,589],{"class":167},[33,13572,13573,13575],{"class":35,"line":224},[33,13574,2424],{"class":163},[33,13576,574],{"class":167},[33,13578,13579,13581,13583,13585],{"class":35,"line":229},[33,13580,2191],{"class":163},[33,13582,7123],{"class":167},[33,13584,495],{"class":163},[33,13586,686],{"class":167},[33,13588,13589,13591,13593,13595,13597,13599,13601,13603,13605],{"class":35,"line":235},[33,13590,1793],{"class":163},[33,13592,7398],{"class":167},[33,13594,662],{"class":163},[33,13596,7403],{"class":50},[33,13598,7406],{"class":167},[33,13600,7409],{"class":238},[33,13602,242],{"class":163},[33,13604,734],{"class":50},[33,13606,1737],{"class":167},[33,13608,13609,13612,13614,13616,13618,13620],{"class":35,"line":250},[33,13610,13611],{"class":167},"                has_lines ",[33,13613,242],{"class":163},[33,13615,9884],{"class":50},[33,13617,9907],{"class":167},[33,13619,7162],{"class":163},[33,13621,9912],{"class":167},[33,13623,13624,13626],{"class":35,"line":266},[33,13625,7170],{"class":163},[33,13627,9977],{"class":167},[33,13629,13630,13633,13635],{"class":35,"line":290},[33,13631,13632],{"class":167},"                    tbls ",[33,13634,242],{"class":163},[33,13636,10668],{"class":167},[33,13638,13639,13641,13644,13646,13648,13650,13653,13655,13657],{"class":35,"line":295},[33,13640,7478],{"class":50},[33,13642,13643],{"class":167},"(path), ",[33,13645,10971],{"class":238},[33,13647,242],{"class":163},[33,13649,1053],{"class":50},[33,13651,13652],{"class":167},"(page_num), ",[33,13654,10748],{"class":238},[33,13656,242],{"class":163},[33,13658,13659],{"class":54},"\"lattice\"\n",[33,13661,13662],{"class":35,"line":300},[33,13663,1929],{"class":167},[33,13665,13666,13669,13671,13673],{"class":35,"line":317},[33,13667,13668],{"class":163},"                    
for",[33,13670,10818],{"class":167},[33,13672,662],{"class":163},[33,13674,13675],{"class":167}," tbls:\n",[33,13677,13678,13680,13682],{"class":35,"line":332},[33,13679,726],{"class":167},[33,13681,242],{"class":163},[33,13683,10832],{"class":167},[33,13685,13686,13689,13691,13693,13695],{"class":35,"line":347},[33,13687,13688],{"class":167},"                        df.columns ",[33,13690,242],{"class":163},[33,13692,10847],{"class":167},[33,13694,748],{"class":50},[33,13696,10852],{"class":167},[33,13698,13699,13701,13703,13705,13707,13709,13711,13713,13715],{"class":35,"line":374},[33,13700,726],{"class":167},[33,13702,242],{"class":163},[33,13704,10847],{"class":167},[33,13706,734],{"class":50},[33,13708,10865],{"class":167},[33,13710,10868],{"class":238},[33,13712,242],{"class":163},[33,13714,855],{"class":50},[33,13716,221],{"class":167},[33,13718,13719],{"class":35,"line":397},[33,13720,774],{"class":167},[33,13722,13723,13725],{"class":35,"line":653},[33,13724,8634],{"class":163},[33,13726,574],{"class":167},[33,13728,13729,13732,13734,13737,13739],{"class":35,"line":667},[33,13730,13731],{"class":167},"                    raw_tables ",[33,13733,242],{"class":163},[33,13735,13736],{"class":167}," page.extract_tables() ",[33,13738,7162],{"class":163},[33,13740,589],{"class":167},[33,13742,13743,13745,13747,13749],{"class":35,"line":675},[33,13744,13668],{"class":163},[33,13746,7422],{"class":167},[33,13748,662],{"class":163},[33,13750,10280],{"class":167},[33,13752,13753,13756,13758,13760,13762,13764,13766,13768],{"class":35,"line":689},[33,13754,13755],{"class":163},"                        if",[33,13757,7422],{"class":167},[33,13759,6001],{"class":163},[33,13761,4037],{"class":50},[33,13763,7446],{"class":167},[33,13765,6009],{"class":163},[33,13767,1814],{"class":50},[33,13769,574],{"class":167},[33,13771,13772,13775,13777,13779,13781,13783,13785,13787,13790,13792],{"class":35,"line":703},[33,13773,13774],{"class":167},"                            df 
",[33,13776,242],{"class":163},[33,13778,7538],{"class":167},[33,13780,734],{"class":50},[33,13782,737],{"class":167},[33,13784,740],{"class":238},[33,13786,242],{"class":163},[33,13788,13789],{"class":167},"raw[",[33,13791,748],{"class":50},[33,13793,751],{"class":167},[33,13795,13796],{"class":35,"line":714},[33,13797,13798],{"class":167},"                            frames.append(df)\n",[33,13800,13801,13803,13805,13807],{"class":35,"line":723},[33,13802,2449],{"class":163},[33,13804,783],{"class":50},[33,13806,1852],{"class":163},[33,13808,7583],{"class":167},[33,13810,13811,13813,13815,13817,13819,13822,13824,13826,13828,13830,13832,13834],{"class":35,"line":754},[33,13812,4051],{"class":163},[33,13814,7590],{"class":50},[33,13816,602],{"class":167},[33,13818,4059],{"class":163},[33,13820,13821],{"class":54},"\"Mixed extraction failed: ",[33,13823,1115],{"class":50},[33,13825,7602],{"class":167},[33,13827,1121],{"class":50},[33,13829,274],{"class":54},[33,13831,1649],{"class":167},[33,13833,190],{"class":163},[33,13835,7613],{"class":167},[33,13837,13838,13840],{"class":35,"line":771},[33,13839,1332],{"class":163},[33,13841,6065],{"class":167},[18,13843,13845],{"id":13844},"performance-scale","Performance & Scale",[14,13847,13848,13851,13852,13855],{},[1974,13849,13850],{},"Memory:"," pdfplumber loads the entire file into memory when opened. For PDFs above 100 MB, slice pages directly: ",[30,13853,13854],{},"pdf.pages[0:10]"," — Python slice notation works on pdfplumber's page list.",[14,13857,13858,13861,13862,13864],{},[1974,13859,13860],{},"Concurrency:"," PDF parsing is CPU-bound. 
Use ",[30,13863,4240],{}," to process multiple PDF files simultaneously; do not use threads.",[23,13866,13868],{"className":126,"code":13867,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom pathlib import Path\nimport pandas as pd\n\n\ndef process_pdf(path: Path) -> pd.DataFrame:\n    \"\"\"Import inside function so each worker process gets clean imports.\"\"\"\n    import pdfplumber\n\n    frames = []\n    with pdfplumber.open(path) as pdf:\n        for page in pdf.pages:\n            for raw in (page.extract_tables() or []):\n                if raw and len(raw) > 1:\n                    frames.append(pd.DataFrame(raw[1:], columns=raw[0]))\n    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()\n\n\ndef batch_extract(pdf_dir: Path) -> dict[str, pd.DataFrame]:\n    paths = list(pdf_dir.glob(\"*.pdf\"))\n    results: dict[str, pd.DataFrame] = {}\n    with ProcessPoolExecutor() as pool:\n        futures = {pool.submit(process_pdf, p): p for p in paths}\n        for fut in as_completed(futures):\n            p = futures[fut]\n            try:\n                results[p.name] = fut.result()\n            except Exception as e:\n                print(f\"Failed {p.name}: {e}\")\n    return results\n\n\nif __name__ == \"__main__\":\n    all_frames = batch_extract(Path(\"data\u002Fpdfs\u002F\"))\n    for name, df in all_frames.items():\n        print(f\"{name}: {df.shape}\")\n",[30,13869,13870,13874,13886,13896,13906,13910,13914,13923,13928,13934,13938,13946,13956,13966,13980,13998,14017,14040,14044,14048,14063,14079,14094,14106,14125,14137,14147,14154,14164,14175,14205,14212,14216,14220,14232,14247,14259],{"__ignoreMap":28},[33,13871,13872],{"class":35,"line":36},[33,13873,7041],{"class":39},[33,13875,13876,13878,13881,13883],{"class":35,"line":43},[33,13877,190],{"class":163},[33,13879,13880],{"class":167}," concurrent.futures 
",[33,13882,164],{"class":163},[33,13884,13885],{"class":167}," ProcessPoolExecutor, as_completed\n",[33,13887,13888,13890,13892,13894],{"class":35,"line":61},[33,13889,190],{"class":163},[33,13891,193],{"class":167},[33,13893,164],{"class":163},[33,13895,198],{"class":167},[33,13897,13898,13900,13902,13904],{"class":35,"line":73},[33,13899,164],{"class":163},[33,13901,492],{"class":167},[33,13903,495],{"class":163},[33,13905,498],{"class":167},[33,13907,13908],{"class":35,"line":88},[33,13909,92],{"emptyLinePlaceholder":91},[33,13911,13912],{"class":35,"line":95},[33,13913,92],{"emptyLinePlaceholder":91},[33,13915,13916,13918,13921],{"class":35,"line":101},[33,13917,562],{"class":163},[33,13919,13920],{"class":46}," process_pdf",[33,13922,7103],{"class":167},[33,13924,13925],{"class":35,"line":171},[33,13926,13927],{"class":54},"    \"\"\"Import inside function so each worker process gets clean imports.\"\"\"\n",[33,13929,13930,13932],{"class":35,"line":179},[33,13931,1627],{"class":163},[33,13933,485],{"class":167},[33,13935,13936],{"class":35,"line":187},[33,13937,92],{"emptyLinePlaceholder":91},[33,13939,13940,13942,13944],{"class":35,"line":201},[33,13941,584],{"class":167},[33,13943,242],{"class":163},[33,13945,589],{"class":167},[33,13947,13948,13950,13952,13954],{"class":35,"line":206},[33,13949,1635],{"class":163},[33,13951,7123],{"class":167},[33,13953,495],{"class":163},[33,13955,686],{"class":167},[33,13957,13958,13960,13962,13964],{"class":35,"line":224},[33,13959,5973],{"class":163},[33,13961,695],{"class":167},[33,13963,662],{"class":163},[33,13965,700],{"class":167},[33,13967,13968,13970,13972,13974,13976,13978],{"class":35,"line":229},[33,13969,1793],{"class":163},[33,13971,7422],{"class":167},[33,13973,662],{"class":163},[33,13975,7427],{"class":167},[33,13977,7162],{"class":163},[33,13979,7165],{"class":167},[33,13981,13982,13984,13986,13988,13990,13992,13994,13996],{"class":35,"line":235},[33,13983,7170],{"class":163},[33,13985,7422],{"class":167
},[33,13987,6001],{"class":163},[33,13989,4037],{"class":50},[33,13991,7446],{"class":167},[33,13993,6009],{"class":163},[33,13995,1814],{"class":50},[33,13997,574],{"class":167},[33,13999,14000,14003,14005,14007,14009,14011,14013,14015],{"class":35,"line":250},[33,14001,14002],{"class":167},"                    frames.append(pd.DataFrame(raw[",[33,14004,734],{"class":50},[33,14006,737],{"class":167},[33,14008,740],{"class":238},[33,14010,242],{"class":163},[33,14012,13789],{"class":167},[33,14014,748],{"class":50},[33,14016,7211],{"class":167},[33,14018,14019,14021,14023,14025,14027,14029,14031,14033,14036,14038],{"class":35,"line":266},[33,14020,1332],{"class":163},[33,14022,847],{"class":167},[33,14024,850],{"class":238},[33,14026,242],{"class":163},[33,14028,855],{"class":50},[33,14030,1649],{"class":167},[33,14032,2491],{"class":163},[33,14034,14035],{"class":167}," frames ",[33,14037,7489],{"class":163},[33,14039,7721],{"class":167},[33,14041,14042],{"class":35,"line":290},[33,14043,92],{"emptyLinePlaceholder":91},[33,14045,14046],{"class":35,"line":295},[33,14047,92],{"emptyLinePlaceholder":91},[33,14049,14050,14052,14055,14058,14060],{"class":35,"line":300},[33,14051,562],{"class":163},[33,14053,14054],{"class":46}," batch_extract",[33,14056,14057],{"class":167},"(pdf_dir: Path) -> dict[",[33,14059,1053],{"class":50},[33,14061,14062],{"class":167},", pd.DataFrame]:\n",[33,14064,14065,14068,14070,14072,14075,14077],{"class":35,"line":317},[33,14066,14067],{"class":167},"    paths ",[33,14069,242],{"class":163},[33,14071,599],{"class":50},[33,14073,14074],{"class":167},"(pdf_dir.glob(",[33,14076,610],{"class":54},[33,14078,371],{"class":167},[33,14080,14081,14084,14086,14089,14091],{"class":35,"line":332},[33,14082,14083],{"class":167},"    results: dict[",[33,14085,1053],{"class":50},[33,14087,14088],{"class":167},", pd.DataFrame] ",[33,14090,242],{"class":163},[33,14092,14093],{"class":167}," 
{}\n",[33,14095,14096,14098,14101,14103],{"class":35,"line":347},[33,14097,1635],{"class":163},[33,14099,14100],{"class":167}," ProcessPoolExecutor() ",[33,14102,495],{"class":163},[33,14104,14105],{"class":167}," pool:\n",[33,14107,14108,14111,14113,14116,14118,14120,14122],{"class":35,"line":374},[33,14109,14110],{"class":167},"        futures ",[33,14112,242],{"class":163},[33,14114,14115],{"class":167}," {pool.submit(process_pdf, p): p ",[33,14117,6124],{"class":163},[33,14119,6127],{"class":167},[33,14121,662],{"class":163},[33,14123,14124],{"class":167}," paths}\n",[33,14126,14127,14129,14132,14134],{"class":35,"line":397},[33,14128,5973],{"class":163},[33,14130,14131],{"class":167}," fut ",[33,14133,662],{"class":163},[33,14135,14136],{"class":167}," as_completed(futures):\n",[33,14138,14139,14142,14144],{"class":35,"line":653},[33,14140,14141],{"class":167},"            p ",[33,14143,242],{"class":163},[33,14145,14146],{"class":167}," futures[fut]\n",[33,14148,14149,14152],{"class":35,"line":667},[33,14150,14151],{"class":163},"            try",[33,14153,574],{"class":167},[33,14155,14156,14159,14161],{"class":35,"line":675},[33,14157,14158],{"class":167},"                results[p.name] ",[33,14160,242],{"class":163},[33,14162,14163],{"class":167}," fut.result()\n",[33,14165,14166,14169,14171,14173],{"class":35,"line":689},[33,14167,14168],{"class":163},"            except",[33,14170,783],{"class":50},[33,14172,1852],{"class":163},[33,14174,7583],{"class":167},[33,14176,14177,14179,14181,14183,14186,14188,14191,14193,14195,14197,14199,14201,14203],{"class":35,"line":703},[33,14178,8264],{"class":50},[33,14180,602],{"class":167},[33,14182,4059],{"class":163},[33,14184,14185],{"class":54},"\"Failed 
",[33,14187,1115],{"class":50},[33,14189,14190],{"class":167},"p.name",[33,14192,1121],{"class":50},[33,14194,2079],{"class":54},[33,14196,1115],{"class":50},[33,14198,7602],{"class":167},[33,14200,1121],{"class":50},[33,14202,274],{"class":54},[33,14204,221],{"class":167},[33,14206,14207,14209],{"class":35,"line":714},[33,14208,1332],{"class":163},[33,14210,14211],{"class":167}," results\n",[33,14213,14214],{"class":35,"line":723},[33,14215,92],{"emptyLinePlaceholder":91},[33,14217,14218],{"class":35,"line":754},[33,14219,92],{"emptyLinePlaceholder":91},[33,14221,14222,14224,14226,14228,14230],{"class":35,"line":771},[33,14223,2491],{"class":163},[33,14225,2494],{"class":50},[33,14227,2497],{"class":163},[33,14229,2500],{"class":54},[33,14231,574],{"class":167},[33,14233,14234,14237,14239,14242,14245],{"class":35,"line":777},[33,14235,14236],{"class":167},"    all_frames ",[33,14238,242],{"class":163},[33,14240,14241],{"class":167}," batch_extract(Path(",[33,14243,14244],{"class":54},"\"data\u002Fpdfs\u002F\"",[33,14246,371],{"class":167},[33,14248,14249,14251,14254,14256],{"class":35,"line":788},[33,14250,656],{"class":163},[33,14252,14253],{"class":167}," name, df ",[33,14255,662],{"class":163},[33,14257,14258],{"class":167}," all_frames.items():\n",[33,14260,14261,14263,14265,14267,14269,14271,14273,14275,14277,14279,14281,14283,14285],{"class":35,"line":804},[33,14262,9414],{"class":50},[33,14264,602],{"class":167},[33,14266,4059],{"class":163},[33,14268,274],{"class":54},[33,14270,1115],{"class":50},[33,14272,1118],{"class":167},[33,14274,1121],{"class":50},[33,14276,2079],{"class":54},[33,14278,1115],{"class":50},[33,14280,9426],{"class":167},[33,14282,1121],{"class":50},[33,14284,274],{"class":54},[33,14286,221],{"class":167},[14,14288,14289,14292,14293,14296,14297,14300],{},[1974,14290,14291],{},"Out-of-core:"," for datasets that exceed RAM, write each page's DataFrame directly to Parquet in append mode using ",[30,14294,14295],{},"pyarrow"," and read it 
back with ",[30,14298,14299],{},"pd.read_parquet()"," at analysis time. Parquet preserves dtypes and compresses numeric data efficiently.",[14,14302,14303,14304,14306],{},"For choosing between pdfplumber, camelot, and tabula before building a pipeline, see ",[940,14305,9606],{"href":9605}," — it benchmarks accuracy, speed, and dependency weight on a common set of PDF types.",[18,14308,4271],{"id":4270},[4273,14310,14311,14322],{},[4276,14312,14313],{},[4279,14314,14315,14318,14320],{},[4282,14316,14317],{},"Error",[4282,14319,4287],{},[4282,14321,4290],{},[4292,14323,14324,14341,14355,14374,14396,14419],{},[4279,14325,14326,14332,14335],{},[4297,14327,14328,14331],{},[30,14329,14330],{},"camelot.TableList.n == 0"," on lattice",[4297,14333,14334],{},"No vector lines detected on the target pages",[4297,14336,14337,14338,14340],{},"Switch to ",[30,14339,11068],{}," or pdfplumber; verify with classify_pdf()",[4279,14342,14343,14346,14349],{},[4297,14344,14345],{},"Header row appears mid-DataFrame",[4297,14347,14348],{},"Repeated header rows not removed before concat",[4297,14350,11870,14351,14354],{},[30,14352,14353],{},"concat_drop_headers()"," from Step 3",[4279,14356,14357,14363,14366],{},[4297,14358,14359,14360,14362],{},"Numeric column stays ",[30,14361,11888],{}," dtype",[4297,14364,14365],{},"Values contain currency symbols or commas",[4297,14367,11870,14368,14371,14372],{},[30,14369,14370],{},"clean_currency()"," from Step 5 before ",[30,14373,7013],{},[4279,14375,14376,14382,14385],{},[4297,14377,14378,14381],{},[30,14379,14380],{},"ValueError: shape of values"," on concat",[4297,14383,14384],{},"Per-page tables have differing column counts",[4297,14386,14387,14388,14391,14392,14395],{},"Inspect each frame's ",[30,14389,14390],{},".columns","; use ",[30,14393,14394],{},"pd.concat(..., sort=False)"," with uniform columns",[4279,14397,14398,14403,14406],{},[4297,14399,14400,14401],{},"Date column stays ",[30,14402,11888],{},[4297,14404,14405],{},"Inconsistent 
date formats across pages",[4297,14407,14408,14409,10065,14412,14415,14416],{},"Pass ",[30,14410,14411],{},"format=None",[30,14413,14414],{},"infer_datetime_format=True"," or use ",[30,14417,14418],{},"dateutil.parser.parse",[4279,14420,14421,14425,14428],{},[4297,14422,14423],{},[30,14424,9731],{},[4297,14426,14427],{},"Ghostscript binary not on PATH",[4297,14429,14430,14433],{},[30,14431,14432],{},"sudo apt-get install ghostscript","; lattice mode requires it",[18,14435,14437],{"id":14436},"complete-script","Complete Script",[23,14439,14441],{"className":126,"code":14440,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n\"\"\"\npdf_to_pandas.py — Extract PDF tables into a clean pandas DataFrame.\n\nUsage:\n    python pdf_to_pandas.py report.pdf --pages 1-end --output output\u002Fresult.csv\n    python pdf_to_pandas.py report.pdf --flavor stream --group-cols \"Region,Category\"\n\npip install pdfplumber \"camelot-py[cv]\" pandas\n\"\"\"\nimport argparse\nimport re\nimport sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport pdfplumber\n\n\ndef classify(path: Path) -> str:\n    with pdfplumber.open(path) as pdf:\n        page = pdf.pages[0]\n        has_text = bool((page.extract_text() or \"\").strip())\n        has_lines = bool(page.lines or page.rects)\n    if not has_text and not has_lines:\n        return \"ocr\"\n    return \"lattice\" if has_lines else \"stream\"\n\n\ndef extract_pdfplumber(path: Path) -> list[pd.DataFrame]:\n    frames = []\n    with pdfplumber.open(path) as pdf:\n        for pnum, page in enumerate(pdf.pages, start=1):\n            for raw in (page.extract_tables() or []):\n                if raw and len(raw) > 1:\n                    header = [str(c).strip() if c else f\"col_{i}\" for i, c in enumerate(raw[0])]\n                    df = pd.DataFrame(raw[1:], columns=header)\n                    df[\"_page\"] = pnum\n                    frames.append(df)\n    return frames\n\n\ndef extract_camelot(path: 
Path, pages: str, flavor: str) -> list[pd.DataFrame]:\n    import camelot as cam\n    tbl = cam.read_pdf(str(path), pages=pages, flavor=flavor,\n                       process_background=(flavor == \"lattice\"))\n    frames = []\n    for t in tbl:\n        df = t.df.copy()\n        df.columns = df.iloc[0].str.strip()\n        df = df.iloc[1:].reset_index(drop=True)\n        df.replace(\"\", pd.NA, inplace=True)\n        frames.append(df)\n    return frames\n\n\ndef dedup_headers(frames: list[pd.DataFrame]) -> pd.DataFrame:\n    if not frames:\n        return pd.DataFrame()\n    cleaned = []\n    for df in frames:\n        non_a = [c for c in df.columns if not c.startswith(\"_\")]\n        mask = df[non_a].apply(\n            lambda r: all(str(v).strip() == str(c).strip() for v, c in zip(r, non_a)), axis=1\n        )\n        cleaned.append(df[~mask])\n    return pd.concat(cleaned, ignore_index=True)\n\n\ndef clean_currency(s: pd.Series) -> pd.Series:\n    s = s.astype(str).str.strip()\n    s = s.str.replace(r\"[$€£¥,\\s]\", \"\", regex=True)\n    s = s.str.replace(r\"\\((.+)\\)\", r\"-\\1\", regex=True)\n    s = s.str.replace(r\"%$\", \"\", regex=True)\n    return pd.to_numeric(s, errors=\"coerce\")\n\n\ndef coerce_dtypes(df: pd.DataFrame) -> pd.DataFrame:\n    df = df.copy()\n    for col in df.columns:\n        if col.startswith(\"_\"):\n            continue\n        num = clean_currency(df[col])\n        if num.notna().mean() > 0.70:\n            df[col] = num\n            continue\n        try:\n            dt = pd.to_datetime(df[col], infer_datetime_format=True, errors=\"coerce\")\n            if dt.notna().mean() > 0.70:\n                df[col] = dt\n                continue\n        except Exception:\n            pass\n        df[col] = df[col].astype(str).str.strip().replace(\"nan\", pd.NA)\n    return df\n\n\ndef main() -> None:\n    ap = argparse.ArgumentParser(description=\"Extract PDF tables into pandas\")\n    ap.add_argument(\"pdf\", type=Path)\n    
ap.add_argument(\"--pages\", default=\"1-end\")\n    ap.add_argument(\"--flavor\", choices=[\"auto\", \"lattice\", \"stream\", \"pdfplumber\"],\n                    default=\"auto\")\n    ap.add_argument(\"--group-cols\", default=\"\",\n                    help=\"Comma-separated column names to forward-fill (merged cells)\")\n    ap.add_argument(\"--output\", type=Path, default=Path(\"output\u002Fresult.csv\"))\n    args = ap.parse_args()\n\n    if not args.pdf.exists():\n        sys.exit(f\"File not found: {args.pdf}\")\n\n    flavor = classify(args.pdf) if args.flavor == \"auto\" else args.flavor\n    print(f\"Extraction mode: {flavor}\")\n\n    if flavor == \"ocr\":\n        sys.exit(\"Scanned PDF — use the OCR pipeline (see how-to-extract-tables-from-scanned-pdfs\u002F)\")\n\n    try:\n        frames = (\n            extract_pdfplumber(args.pdf)\n            if flavor == \"pdfplumber\"\n            else extract_camelot(args.pdf, args.pages, flavor)\n        )\n    except Exception as e:\n        sys.exit(f\"Extraction failed: {e}\")\n\n    if not frames:\n        sys.exit(\"No tables found.\")\n\n    combined = dedup_headers(frames)\n\n    group_cols = [c.strip() for c in args.group_cols.split(\",\") if c.strip()]\n    for col in group_cols:\n        if col in combined.columns:\n            combined[col] = combined[col].replace(\"\", pd.NA).ffill()\n\n    combined = coerce_dtypes(combined)\n\n    args.output.parent.mkdir(parents=True, exist_ok=True)\n    combined.to_csv(args.output, index=False)\n    print(f\"Saved {combined.shape[0]} rows × {combined.shape[1]} cols → {args.output}\")\n    print(combined.dtypes.to_string())\n\n\nif __name__ == \"__main__\":\n    
main()\n",[30,14442,14443,14448,14452,14457,14461,14465,14470,14475,14479,14484,14488,14494,14500,14506,14516,14520,14530,14536,14540,14544,14557,14567,14580,14597,14612,14626,14632,14646,14650,14654,14663,14671,14681,14702,14716,14734,14780,14798,14811,14815,14821,14825,14829,14848,14860,14887,14902,14910,14921,14929,14941,14961,14981,14985,14991,14995,14999,15008,15016,15022,15030,15040,15067,15077,15117,15121,15131,15145,15149,15153,15162,15175,15205,15249,15279,15293,15297,15301,15309,15317,15327,15337,15341,15351,15364,15373,15377,15383,15409,15422,15431,15435,15443,15447,15468,15474,15478,15482,15494,15512,15529,15547,15581,15592,15609,15621,15647,15656,15660,15669,15689,15693,15719,15740,15744,15758,15767,15771,15777,15786,15791,15802,15809,15813,15823,15841,15845,15853,15862,15866,15875,15879,15908,15918,15928,15947,15951,15960,15964,15985,15998,16047,16054,16058,16062,16074],{"__ignoreMap":28},[33,14444,14445],{"class":35,"line":36},[33,14446,14447],{"class":39},"#!\u002Fusr\u002Fbin\u002Fenv python3\n",[33,14449,14450],{"class":35,"line":43},[33,14451,139],{"class":54},[33,14453,14454],{"class":35,"line":61},[33,14455,14456],{"class":54},"pdf_to_pandas.py — Extract PDF tables into a clean pandas DataFrame.\n",[33,14458,14459],{"class":35,"line":73},[33,14460,92],{"emptyLinePlaceholder":91},[33,14462,14463],{"class":35,"line":88},[33,14464,4435],{"class":54},[33,14466,14467],{"class":35,"line":95},[33,14468,14469],{"class":54},"    python pdf_to_pandas.py report.pdf --pages 1-end --output output\u002Fresult.csv\n",[33,14471,14472],{"class":35,"line":101},[33,14473,14474],{"class":54},"    python pdf_to_pandas.py report.pdf --flavor stream --group-cols \"Region,Category\"\n",[33,14476,14477],{"class":35,"line":171},[33,14478,92],{"emptyLinePlaceholder":91},[33,14480,14481],{"class":35,"line":179},[33,14482,14483],{"class":54},"pip install pdfplumber \"camelot-py[cv]\" 
pandas\n",[33,14485,14486],{"class":35,"line":187},[33,14487,139],{"class":54},[33,14489,14490,14492],{"class":35,"line":201},[33,14491,164],{"class":163},[33,14493,4461],{"class":167},[33,14495,14496,14498],{"class":35,"line":206},[33,14497,164],{"class":163},[33,14499,11917],{"class":167},[33,14501,14502,14504],{"class":35,"line":224},[33,14503,164],{"class":163},[33,14505,168],{"class":167},[33,14507,14508,14510,14512,14514],{"class":35,"line":229},[33,14509,190],{"class":163},[33,14511,193],{"class":167},[33,14513,164],{"class":163},[33,14515,198],{"class":167},[33,14517,14518],{"class":35,"line":235},[33,14519,92],{"emptyLinePlaceholder":91},[33,14521,14522,14524,14526,14528],{"class":35,"line":250},[33,14523,164],{"class":163},[33,14525,492],{"class":167},[33,14527,495],{"class":163},[33,14529,498],{"class":167},[33,14531,14532,14534],{"class":35,"line":266},[33,14533,164],{"class":163},[33,14535,485],{"class":167},[33,14537,14538],{"class":35,"line":290},[33,14539,92],{"emptyLinePlaceholder":91},[33,14541,14542],{"class":35,"line":295},[33,14543,92],{"emptyLinePlaceholder":91},[33,14545,14546,14548,14551,14553,14555],{"class":35,"line":300},[33,14547,562],{"class":163},[33,14549,14550],{"class":46}," classify",[33,14552,3743],{"class":167},[33,14554,1053],{"class":50},[33,14556,574],{"class":167},[33,14558,14559,14561,14563,14565],{"class":35,"line":317},[33,14560,1635],{"class":163},[33,14562,7123],{"class":167},[33,14564,495],{"class":163},[33,14566,686],{"class":167},[33,14568,14569,14572,14574,14576,14578],{"class":35,"line":332},[33,14570,14571],{"class":167},"        page ",[33,14573,242],{"class":163},[33,14575,9870],{"class":167},[33,14577,748],{"class":50},[33,14579,9202],{"class":167},[33,14581,14582,14585,14587,14589,14591,14593,14595],{"class":35,"line":347},[33,14583,14584],{"class":167},"        has_text 
",[33,14586,242],{"class":163},[33,14588,9884],{"class":50},[33,14590,9887],{"class":167},[33,14592,7162],{"class":163},[33,14594,9892],{"class":54},[33,14596,9895],{"class":167},[33,14598,14599,14602,14604,14606,14608,14610],{"class":35,"line":374},[33,14600,14601],{"class":167},"        has_lines ",[33,14603,242],{"class":163},[33,14605,9884],{"class":50},[33,14607,9907],{"class":167},[33,14609,7162],{"class":163},[33,14611,9912],{"class":167},[33,14613,14614,14616,14618,14620,14622,14624],{"class":35,"line":397},[33,14615,617],{"class":163},[33,14617,620],{"class":163},[33,14619,9970],{"class":167},[33,14621,6001],{"class":163},[33,14623,620],{"class":163},[33,14625,9977],{"class":167},[33,14627,14628,14630],{"class":35,"line":653},[33,14629,1659],{"class":163},[33,14631,9984],{"class":54},[33,14633,14634,14636,14638,14640,14642,14644],{"class":35,"line":667},[33,14635,1332],{"class":163},[33,14637,9991],{"class":54},[33,14639,9994],{"class":163},[33,14641,9997],{"class":167},[33,14643,7489],{"class":163},[33,14645,10002],{"class":54},[33,14647,14648],{"class":35,"line":675},[33,14649,92],{"emptyLinePlaceholder":91},[33,14651,14652],{"class":35,"line":689},[33,14653,92],{"emptyLinePlaceholder":91},[33,14655,14656,14658,14661],{"class":35,"line":703},[33,14657,562],{"class":163},[33,14659,14660],{"class":46}," extract_pdfplumber",[33,14662,7362],{"class":167},[33,14664,14665,14667,14669],{"class":35,"line":714},[33,14666,584],{"class":167},[33,14668,242],{"class":163},[33,14670,589],{"class":167},[33,14672,14673,14675,14677,14679],{"class":35,"line":723},[33,14674,1635],{"class":163},[33,14676,7123],{"class":167},[33,14678,495],{"class":163},[33,14680,686],{"class":167},[33,14682,14683,14685,14688,14690,14692,14694,14696,14698,14700],{"class":35,"line":754},[33,14684,5973],{"class":163},[33,14686,14687],{"class":167}," pnum, page 
",[33,14689,662],{"class":163},[33,14691,7403],{"class":50},[33,14693,7406],{"class":167},[33,14695,7409],{"class":238},[33,14697,242],{"class":163},[33,14699,734],{"class":50},[33,14701,1737],{"class":167},[33,14703,14704,14706,14708,14710,14712,14714],{"class":35,"line":771},[33,14705,1793],{"class":163},[33,14707,7422],{"class":167},[33,14709,662],{"class":163},[33,14711,7427],{"class":167},[33,14713,7162],{"class":163},[33,14715,7165],{"class":167},[33,14717,14718,14720,14722,14724,14726,14728,14730,14732],{"class":35,"line":777},[33,14719,7170],{"class":163},[33,14721,7422],{"class":167},[33,14723,6001],{"class":163},[33,14725,4037],{"class":50},[33,14727,7446],{"class":167},[33,14729,6009],{"class":163},[33,14731,1814],{"class":50},[33,14733,574],{"class":167},[33,14735,14736,14738,14740,14742,14744,14746,14748,14750,14752,14754,14756,14758,14760,14762,14764,14767,14769,14771,14773,14775,14777],{"class":35,"line":788},[33,14737,7468],{"class":167},[33,14739,242],{"class":163},[33,14741,9178],{"class":167},[33,14743,1053],{"class":50},[33,14745,7481],{"class":167},[33,14747,2491],{"class":163},[33,14749,7486],{"class":167},[33,14751,7489],{"class":163},[33,14753,1110],{"class":163},[33,14755,7494],{"class":54},[33,14757,1115],{"class":50},[33,14759,7499],{"class":167},[33,14761,1121],{"class":50},[33,14763,274],{"class":54},[33,14765,14766],{"class":163}," 
for",[33,14768,7512],{"class":167},[33,14770,662],{"class":163},[33,14772,7403],{"class":50},[33,14774,7519],{"class":167},[33,14776,748],{"class":50},[33,14778,14779],{"class":167},"])]\n",[33,14781,14782,14784,14786,14788,14790,14792,14794,14796],{"class":35,"line":804},[33,14783,7533],{"class":167},[33,14785,242],{"class":163},[33,14787,7538],{"class":167},[33,14789,734],{"class":50},[33,14791,737],{"class":167},[33,14793,740],{"class":238},[33,14795,242],{"class":163},[33,14797,7549],{"class":167},[33,14799,14800,14802,14804,14806,14808],{"class":35,"line":809},[33,14801,7554],{"class":167},[33,14803,7557],{"class":54},[33,14805,763],{"class":167},[33,14807,242],{"class":163},[33,14809,14810],{"class":167}," pnum\n",[33,14812,14813],{"class":35,"line":819},[33,14814,7572],{"class":167},[33,14816,14817,14819],{"class":35,"line":829},[33,14818,1332],{"class":163},[33,14820,6065],{"class":167},[33,14822,14823],{"class":35,"line":834},[33,14824,92],{"emptyLinePlaceholder":91},[33,14826,14827],{"class":35,"line":839},[33,14828,92],{"emptyLinePlaceholder":91},[33,14830,14831,14833,14836,14839,14841,14844,14846],{"class":35,"line":860},[33,14832,562],{"class":163},[33,14834,14835],{"class":46}," extract_camelot",[33,14837,14838],{"class":167},"(path: Path, pages: ",[33,14840,1053],{"class":50},[33,14842,14843],{"class":167},", flavor: ",[33,14845,1053],{"class":50},[33,14847,10647],{"class":167},[33,14849,14850,14852,14855,14857],{"class":35,"line":887},[33,14851,1627],{"class":163},[33,14853,14854],{"class":167}," camelot ",[33,14856,495],{"class":163},[33,14858,14859],{"class":167}," cam\n",[33,14861,14862,14865,14867,14870,14872,14874,14876,14878,14881,14883,14885],{"class":35,"line":907},[33,14863,14864],{"class":167},"    tbl ",[33,14866,242],{"class":163},[33,14868,14869],{"class":167}," cam.read_pdf(",[33,14871,1053],{"class":50},[33,14873,13643],{"class":167},[33,14875,10971],{"class":238},[33,14877,242],{"class":163},[33,14879,14880],{"class":167},"pages, 
",[33,14882,10748],{"class":238},[33,14884,242],{"class":163},[33,14886,10696],{"class":167},[33,14888,14889,14892,14894,14896,14898,14900],{"class":35,"line":1826},[33,14890,14891],{"class":238},"                       process_background",[33,14893,242],{"class":163},[33,14895,10706],{"class":167},[33,14897,1865],{"class":163},[33,14899,9991],{"class":54},[33,14901,371],{"class":167},[33,14903,14904,14906,14908],{"class":35,"line":1844},[33,14905,584],{"class":167},[33,14907,242],{"class":163},[33,14909,589],{"class":167},[33,14911,14912,14914,14916,14918],{"class":35,"line":1858},[33,14913,656],{"class":163},[33,14915,10818],{"class":167},[33,14917,662],{"class":163},[33,14919,14920],{"class":167}," tbl:\n",[33,14922,14923,14925,14927],{"class":35,"line":1871},[33,14924,7930],{"class":167},[33,14926,242],{"class":163},[33,14928,10832],{"class":167},[33,14930,14931,14933,14935,14937,14939],{"class":35,"line":1877},[33,14932,10842],{"class":167},[33,14934,242],{"class":163},[33,14936,10847],{"class":167},[33,14938,748],{"class":50},[33,14940,10852],{"class":167},[33,14942,14943,14945,14947,14949,14951,14953,14955,14957,14959],{"class":35,"line":1883},[33,14944,7930],{"class":167},[33,14946,242],{"class":163},[33,14948,10847],{"class":167},[33,14950,734],{"class":50},[33,14952,10865],{"class":167},[33,14954,10868],{"class":238},[33,14956,242],{"class":163},[33,14958,855],{"class":50},[33,14960,221],{"class":167},[33,14962,14963,14965,14967,14969,14971,14973,14975,14977,14979],{"class":35,"line":1915},[33,14964,10879],{"class":167},[33,14966,3198],{"class":54},[33,14968,10884],{"class":167},[33,14970,8018],{"class":50},[33,14972,365],{"class":167},[33,14974,10891],{"class":238},[33,14976,242],{"class":163},[33,14978,855],{"class":50},[33,14980,221],{"class":167},[33,14982,14983],{"class":35,"line":1926},[33,14984,10929],{"class":167},[33,14986,14987,14989],{"class":35,"line":1932},[33,14988,1332],{"class":163},[33,14990,6065],{"class":167},[33,14992,14993],{"class":35
,"line":1938},[33,14994,92],{"emptyLinePlaceholder":91},[33,14996,14997],{"class":35,"line":1950},[33,14998,92],{"emptyLinePlaceholder":91},[33,15000,15001,15003,15006],{"class":35,"line":1958},[33,15002,562],{"class":163},[33,15004,15005],{"class":46}," dedup_headers",[33,15007,11286],{"class":167},[33,15009,15010,15012,15014],{"class":35,"line":4904},[33,15011,617],{"class":163},[33,15013,620],{"class":163},[33,15015,816],{"class":167},[33,15017,15018,15020],{"class":35,"line":4909},[33,15019,1659],{"class":163},[33,15021,7721],{"class":167},[33,15023,15024,15026,15028],{"class":35,"line":4915},[33,15025,12471],{"class":167},[33,15027,242],{"class":163},[33,15029,589],{"class":167},[33,15031,15032,15034,15036,15038],{"class":35,"line":4925},[33,15033,656],{"class":163},[33,15035,7810],{"class":167},[33,15037,662],{"class":163},[33,15039,816],{"class":167},[33,15041,15042,15045,15047,15049,15051,15053,15055,15057,15059,15061,15063,15065],{"class":35,"line":4935},[33,15043,15044],{"class":167},"        non_a ",[33,15046,242],{"class":163},[33,15048,7740],{"class":167},[33,15050,6124],{"class":163},[33,15052,7486],{"class":167},[33,15054,662],{"class":163},[33,15056,7837],{"class":167},[33,15058,2491],{"class":163},[33,15060,620],{"class":163},[33,15062,7761],{"class":167},[33,15064,7764],{"class":54},[33,15066,7767],{"class":167},[33,15068,15069,15072,15074],{"class":35,"line":4941},[33,15070,15071],{"class":167},"        mask ",[33,15073,242],{"class":163},[33,15075,15076],{"class":167}," df[non_a].apply(\n",[33,15078,15079,15081,15084,15086,15088,15090,15092,15094,15096,15098,15100,15103,15105,15107,15110,15112,15114],{"class":35,"line":4950},[33,15080,7862],{"class":163},[33,15082,15083],{"class":167}," r: 
",[33,15085,7868],{"class":50},[33,15087,602],{"class":167},[33,15089,1053],{"class":50},[33,15091,7882],{"class":167},[33,15093,1865],{"class":163},[33,15095,7887],{"class":50},[33,15097,7481],{"class":167},[33,15099,6124],{"class":163},[33,15101,15102],{"class":167}," v, c ",[33,15104,662],{"class":163},[33,15106,7902],{"class":50},[33,15108,15109],{"class":167},"(r, non_a)), ",[33,15111,4177],{"class":238},[33,15113,242],{"class":163},[33,15115,15116],{"class":50},"1\n",[33,15118,15119],{"class":35,"line":4960},[33,15120,5867],{"class":167},[33,15122,15123,15126,15128],{"class":35,"line":4965},[33,15124,15125],{"class":167},"        cleaned.append(df[",[33,15127,7938],{"class":163},[33,15129,15130],{"class":167},"mask])\n",[33,15132,15133,15135,15137,15139,15141,15143],{"class":35,"line":4971},[33,15134,1332],{"class":163},[33,15136,8061],{"class":167},[33,15138,850],{"class":238},[33,15140,242],{"class":163},[33,15142,855],{"class":50},[33,15144,221],{"class":167},[33,15146,15147],{"class":35,"line":4983},[33,15148,92],{"emptyLinePlaceholder":91},[33,15150,15151],{"class":35,"line":4988},[33,15152,92],{"emptyLinePlaceholder":91},[33,15154,15155,15157,15159],{"class":35,"line":4993},[33,15156,562],{"class":163},[33,15158,11942],{"class":46},[33,15160,15161],{"class":167},"(s: pd.Series) -> pd.Series:\n",[33,15163,15164,15166,15168,15171,15173],{"class":35,"line":5003},[33,15165,11955],{"class":167},[33,15167,242],{"class":163},[33,15169,15170],{"class":167}," 
s.astype(",[33,15172,1053],{"class":50},[33,15174,11965],{"class":167},[33,15176,15177,15179,15181,15183,15185,15187,15189,15191,15193,15195,15197,15199,15201,15203],{"class":35,"line":5008},[33,15178,11955],{"class":167},[33,15180,242],{"class":163},[33,15182,11974],{"class":167},[33,15184,11977],{"class":163},[33,15186,274],{"class":54},[33,15188,11982],{"class":50},[33,15190,274],{"class":54},[33,15192,365],{"class":167},[33,15194,3198],{"class":54},[33,15196,365],{"class":167},[33,15198,11993],{"class":238},[33,15200,242],{"class":163},[33,15202,855],{"class":50},[33,15204,221],{"class":167},[33,15206,15207,15209,15211,15213,15215,15217,15219,15221,15223,15225,15227,15229,15231,15233,15235,15237,15239,15241,15243,15245,15247],{"class":35,"line":5014},[33,15208,11955],{"class":167},[33,15210,242],{"class":163},[33,15212,11974],{"class":167},[33,15214,11977],{"class":163},[33,15216,274],{"class":54},[33,15218,12019],{"class":12018},[33,15220,12022],{"class":50},[33,15222,1811],{"class":163},[33,15224,12027],{"class":50},[33,15226,12030],{"class":12018},[33,15228,274],{"class":54},[33,15230,365],{"class":167},[33,15232,11977],{"class":163},[33,15234,12039],{"class":54},[33,15236,12042],{"class":2076},[33,15238,274],{"class":54},[33,15240,365],{"class":167},[33,15242,11993],{"class":238},[33,15244,242],{"class":163},[33,15246,855],{"class":50},[33,15248,221],{"class":167},[33,15250,15251,15253,15255,15257,15259,15261,15263,15265,15267,15269,15271,15273,15275,15277],{"class":35,"line":5019},[33,15252,11955],{"class":167},[33,15254,242],{"class":163},[33,15256,11974],{"class":167},[33,15258,11977],{"class":163},[33,15260,12070],{"class":54},[33,15262,12073],{"class":50},[33,15264,274],{"class":54},[33,15266,365],{"class":167},[33,15268,3198],{"class":54},[33,15270,365],{"class":167},[33,15272,11993],{"class":238},[33,15274,242],{"class":163},[33,15276,855],{"class":50},[33,15278,221],{"class":167},[33,15280,15281,15283,15285,15287,15289,15291],{"class":35,"line":5032}
,[33,15282,1332],{"class":163},[33,15284,12100],{"class":167},[33,15286,8317],{"class":238},[33,15288,242],{"class":163},[33,15290,12107],{"class":54},[33,15292,221],{"class":167},[33,15294,15295],{"class":35,"line":5039},[33,15296,92],{"emptyLinePlaceholder":91},[33,15298,15299],{"class":35,"line":5068},[33,15300,92],{"emptyLinePlaceholder":91},[33,15302,15303,15305,15307],{"class":35,"line":5077},[33,15304,562],{"class":163},[33,15306,12124],{"class":46},[33,15308,12127],{"class":167},[33,15310,15311,15313,15315],{"class":35,"line":5082},[33,15312,4025],{"class":167},[33,15314,242],{"class":163},[33,15316,11659],{"class":167},[33,15318,15319,15321,15323,15325],{"class":35,"line":5089},[33,15320,656],{"class":163},[33,15322,7985],{"class":167},[33,15324,662],{"class":163},[33,15326,8005],{"class":167},[33,15328,15329,15331,15333,15335],{"class":35,"line":5098},[33,15330,8221],{"class":163},[33,15332,8226],{"class":167},[33,15334,7764],{"class":54},[33,15336,1737],{"class":167},[33,15338,15339],{"class":35,"line":5105},[33,15340,9330],{"class":163},[33,15342,15343,15346,15348],{"class":35,"line":5110},[33,15344,15345],{"class":167},"        num ",[33,15347,242],{"class":163},[33,15349,15350],{"class":167}," clean_currency(df[col])\n",[33,15352,15353,15355,15358,15360,15362],{"class":35,"line":5115},[33,15354,8221],{"class":163},[33,15356,15357],{"class":167}," num.notna().mean() ",[33,15359,6009],{"class":163},[33,15361,12231],{"class":50},[33,15363,574],{"class":167},[33,15365,15366,15368,15370],{"class":35,"line":5128},[33,15367,11690],{"class":167},[33,15369,242],{"class":163},[33,15371,15372],{"class":167}," num\n",[33,15374,15375],{"class":35,"line":5135},[33,15376,9330],{"class":163},[33,15378,15379,15381],{"class":35,"line":5142},[33,15380,670],{"class":163},[33,15382,574],{"class":167},[33,15384,15385,15388,15390,15393,15395,15397,15399,15401,15403,15405,15407],{"class":35,"line":5151},[33,15386,15387],{"class":167},"            dt 
",[33,15389,242],{"class":163},[33,15391,15392],{"class":167}," pd.to_datetime(df[col], ",[33,15394,12274],{"class":238},[33,15396,242],{"class":163},[33,15398,855],{"class":50},[33,15400,365],{"class":167},[33,15402,8317],{"class":238},[33,15404,242],{"class":163},[33,15406,12107],{"class":54},[33,15408,221],{"class":167},[33,15410,15411,15413,15416,15418,15420],{"class":35,"line":5156},[33,15412,5995],{"class":163},[33,15414,15415],{"class":167}," dt.notna().mean() ",[33,15417,6009],{"class":163},[33,15419,12231],{"class":50},[33,15421,574],{"class":167},[33,15423,15424,15426,15428],{"class":35,"line":5161},[33,15425,8010],{"class":167},[33,15427,242],{"class":163},[33,15429,15430],{"class":167}," dt\n",[33,15432,15433],{"class":35,"line":5167},[33,15434,12315],{"class":163},[33,15436,15437,15439,15441],{"class":35,"line":5172},[33,15438,780],{"class":163},[33,15440,783],{"class":50},[33,15442,574],{"class":167},[33,15444,15445],{"class":35,"line":5182},[33,15446,3552],{"class":163},[33,15448,15449,15451,15453,15456,15458,15460,15462,15464,15466],{"class":35,"line":5195},[33,15450,12341],{"class":167},[33,15452,242],{"class":163},[33,15454,15455],{"class":167}," df[col].astype(",[33,15457,1053],{"class":50},[33,15459,12350],{"class":167},[33,15461,12353],{"class":54},[33,15463,10884],{"class":167},[33,15465,8018],{"class":50},[33,15467,221],{"class":167},[33,15469,15470,15472],{"class":35,"line":5200},[33,15471,1332],{"class":163},[33,15473,11719],{"class":167},[33,15475,15476],{"class":35,"line":5205},[33,15477,92],{"emptyLinePlaceholder":91},[33,15479,15480],{"class":35,"line":5210},[33,15481,92],{"emptyLinePlaceholder":91},[33,15483,15484,15486,15488,15490,15492],{"class":35,"line":5215},[33,15485,562],{"class":163},[33,15487,6636],{"class":46},[33,15489,568],{"class":167},[33,15491,571],{"class":50},[33,15493,574],{"class":167},[33,15495,15496,15499,15501,15503,15505,15507,15510],{"class":35,"line":5220},[33,15497,15498],{"class":167},"    ap 
",[33,15500,242],{"class":163},[33,15502,6653],{"class":167},[33,15504,6656],{"class":238},[33,15506,242],{"class":163},[33,15508,15509],{"class":54},"\"Extract PDF tables into pandas\"",[33,15511,221],{"class":167},[33,15513,15514,15517,15520,15522,15524,15526],{"class":35,"line":5227},[33,15515,15516],{"class":167},"    ap.add_argument(",[33,15518,15519],{"class":54},"\"pdf\"",[33,15521,365],{"class":167},[33,15523,6677],{"class":238},[33,15525,242],{"class":163},[33,15527,15528],{"class":167},"Path)\n",[33,15530,15531,15533,15536,15538,15540,15542,15545],{"class":35,"line":5232},[33,15532,15516],{"class":167},[33,15534,15535],{"class":54},"\"--pages\"",[33,15537,365],{"class":167},[33,15539,6685],{"class":238},[33,15541,242],{"class":163},[33,15543,15544],{"class":54},"\"1-end\"",[33,15546,221],{"class":167},[33,15548,15549,15551,15554,15556,15559,15561,15563,15566,15568,15570,15572,15574,15576,15579],{"class":35,"line":5237},[33,15550,15516],{"class":167},[33,15552,15553],{"class":54},"\"--flavor\"",[33,15555,365],{"class":167},[33,15557,15558],{"class":238},"choices",[33,15560,242],{"class":163},[33,15562,8309],{"class":167},[33,15564,15565],{"class":54},"\"auto\"",[33,15567,365],{"class":167},[33,15569,10985],{"class":54},[33,15571,365],{"class":167},[33,15573,13407],{"class":54},[33,15575,365],{"class":167},[33,15577,15578],{"class":54},"\"pdfplumber\"",[33,15580,8935],{"class":167},[33,15582,15583,15586,15588,15590],{"class":35,"line":5251},[33,15584,15585],{"class":238},"                    
default",[33,15587,242],{"class":163},[33,15589,15565],{"class":54},[33,15591,221],{"class":167},[33,15593,15594,15596,15599,15601,15603,15605,15607],{"class":35,"line":5259},[33,15595,15516],{"class":167},[33,15597,15598],{"class":54},"\"--group-cols\"",[33,15600,365],{"class":167},[33,15602,6685],{"class":238},[33,15604,242],{"class":163},[33,15606,3198],{"class":54},[33,15608,247],{"class":167},[33,15610,15611,15614,15616,15619],{"class":35,"line":5264},[33,15612,15613],{"class":238},"                    help",[33,15615,242],{"class":163},[33,15617,15618],{"class":54},"\"Comma-separated column names to forward-fill (merged cells)\"",[33,15620,221],{"class":167},[33,15622,15623,15625,15627,15629,15631,15633,15635,15637,15639,15642,15645],{"class":35,"line":5269},[33,15624,15516],{"class":167},[33,15626,6699],{"class":54},[33,15628,365],{"class":167},[33,15630,6677],{"class":238},[33,15632,242],{"class":163},[33,15634,6682],{"class":167},[33,15636,6685],{"class":238},[33,15638,242],{"class":163},[33,15640,15641],{"class":167},"Path(",[33,15643,15644],{"class":54},"\"output\u002Fresult.csv\"",[33,15646,371],{"class":167},[33,15648,15649,15651,15653],{"class":35,"line":5283},[33,15650,6766],{"class":167},[33,15652,242],{"class":163},[33,15654,15655],{"class":167}," ap.parse_args()\n",[33,15657,15658],{"class":35,"line":5293},[33,15659,92],{"emptyLinePlaceholder":91},[33,15661,15662,15664,15666],{"class":35,"line":5303},[33,15663,617],{"class":163},[33,15665,620],{"class":163},[33,15667,15668],{"class":167}," args.pdf.exists():\n",[33,15670,15671,15673,15675,15678,15680,15683,15685,15687],{"class":35,"line":5313},[33,15672,2995],{"class":167},[33,15674,4059],{"class":163},[33,15676,15677],{"class":54},"\"File not found: 
",[33,15679,1115],{"class":50},[33,15681,15682],{"class":167},"args.pdf",[33,15684,1121],{"class":50},[33,15686,274],{"class":54},[33,15688,221],{"class":167},[33,15690,15691],{"class":35,"line":5320},[33,15692,92],{"emptyLinePlaceholder":91},[33,15694,15695,15698,15700,15703,15705,15708,15710,15713,15716],{"class":35,"line":5325},[33,15696,15697],{"class":167},"    flavor ",[33,15699,242],{"class":163},[33,15701,15702],{"class":167}," classify(args.pdf) ",[33,15704,2491],{"class":163},[33,15706,15707],{"class":167}," args.flavor ",[33,15709,1865],{"class":163},[33,15711,15712],{"class":54}," \"auto\"",[33,15714,15715],{"class":163}," else",[33,15717,15718],{"class":167}," args.flavor\n",[33,15720,15721,15723,15725,15727,15730,15732,15734,15736,15738],{"class":35,"line":5330},[33,15722,7268],{"class":50},[33,15724,602],{"class":167},[33,15726,4059],{"class":163},[33,15728,15729],{"class":54},"\"Extraction mode: ",[33,15731,1115],{"class":50},[33,15733,10748],{"class":167},[33,15735,1121],{"class":50},[33,15737,274],{"class":54},[33,15739,221],{"class":167},[33,15741,15742],{"class":35,"line":5344},[33,15743,92],{"emptyLinePlaceholder":91},[33,15745,15746,15748,15751,15753,15756],{"class":35,"line":5349},[33,15747,617],{"class":163},[33,15749,15750],{"class":167}," flavor ",[33,15752,1865],{"class":163},[33,15754,15755],{"class":54}," \"ocr\"",[33,15757,574],{"class":167},[33,15759,15760,15762,15765],{"class":35,"line":5354},[33,15761,2995],{"class":167},[33,15763,15764],{"class":54},"\"Scanned PDF — use the OCR pipeline (see how-to-extract-tables-from-scanned-pdfs\u002F)\"",[33,15766,221],{"class":167},[33,15768,15769],{"class":35,"line":5368},[33,15770,92],{"emptyLinePlaceholder":91},[33,15772,15773,15775],{"class":35,"line":5377},[33,15774,2424],{"class":163},[33,15776,574],{"class":167},[33,15778,15779,15782,15784],{"class":35,"line":5382},[33,15780,15781],{"class":167},"        frames 
",[33,15783,242],{"class":163},[33,15785,1415],{"class":167},[33,15787,15788],{"class":35,"line":5389},[33,15789,15790],{"class":167},"            extract_pdfplumber(args.pdf)\n",[33,15792,15793,15795,15797,15799],{"class":35,"line":5399},[33,15794,5995],{"class":163},[33,15796,15750],{"class":167},[33,15798,1865],{"class":163},[33,15800,15801],{"class":54}," \"pdfplumber\"\n",[33,15803,15804,15806],{"class":35,"line":5404},[33,15805,8705],{"class":163},[33,15807,15808],{"class":167}," extract_camelot(args.pdf, args.pages, flavor)\n",[33,15810,15811],{"class":35,"line":5409},[33,15812,5867],{"class":167},[33,15814,15815,15817,15819,15821],{"class":35,"line":5414},[33,15816,2449],{"class":163},[33,15818,783],{"class":50},[33,15820,1852],{"class":163},[33,15822,7583],{"class":167},[33,15824,15825,15827,15829,15831,15833,15835,15837,15839],{"class":35,"line":5419},[33,15826,2995],{"class":167},[33,15828,4059],{"class":163},[33,15830,7597],{"class":54},[33,15832,1115],{"class":50},[33,15834,7602],{"class":167},[33,15836,1121],{"class":50},[33,15838,274],{"class":54},[33,15840,221],{"class":167},[33,15842,15843],{"class":35,"line":5425},[33,15844,92],{"emptyLinePlaceholder":91},[33,15846,15847,15849,15851],{"class":35,"line":5430},[33,15848,617],{"class":163},[33,15850,620],{"class":163},[33,15852,816],{"class":167},[33,15854,15855,15857,15860],{"class":35,"line":5440},[33,15856,2995],{"class":167},[33,15858,15859],{"class":54},"\"No tables found.\"",[33,15861,221],{"class":167},[33,15863,15864],{"class":35,"line":5451},[33,15865,92],{"emptyLinePlaceholder":91},[33,15867,15868,15870,15872],{"class":35,"line":5464},[33,15869,842],{"class":167},[33,15871,242],{"class":163},[33,15873,15874],{"class":167}," dedup_headers(frames)\n",[33,15876,15877],{"class":35,"line":5497},[33,15878,92],{"emptyLinePlaceholder":91},[33,15880,15881,15884,15886,15889,15891,15893,15895,15898,15901,15903,15905],{"class":35,"line":5514},[33,15882,15883],{"class":167},"    group_cols 
",[33,15885,242],{"class":163},[33,15887,15888],{"class":167}," [c.strip() ",[33,15890,6124],{"class":163},[33,15892,7486],{"class":167},[33,15894,662],{"class":163},[33,15896,15897],{"class":167}," args.group_cols.split(",[33,15899,15900],{"class":54},"\",\"",[33,15902,1649],{"class":167},[33,15904,2491],{"class":163},[33,15906,15907],{"class":167}," c.strip()]\n",[33,15909,15910,15912,15914,15916],{"class":35,"line":5527},[33,15911,656],{"class":163},[33,15913,7985],{"class":167},[33,15915,662],{"class":163},[33,15917,11670],{"class":167},[33,15919,15920,15922,15924,15926],{"class":35,"line":5532},[33,15921,8221],{"class":163},[33,15923,7985],{"class":167},[33,15925,662],{"class":163},[33,15927,8216],{"class":167},[33,15929,15930,15933,15935,15938,15940,15942,15944],{"class":35,"line":5537},[33,15931,15932],{"class":167},"            combined[col] ",[33,15934,242],{"class":163},[33,15936,15937],{"class":167}," combined[col].replace(",[33,15939,3198],{"class":54},[33,15941,10884],{"class":167},[33,15943,8018],{"class":50},[33,15945,15946],{"class":167},").ffill()\n",[33,15948,15949],{"class":35,"line":5543},[33,15950,92],{"emptyLinePlaceholder":91},[33,15952,15953,15955,15957],{"class":35,"line":5548},[33,15954,842],{"class":167},[33,15956,242],{"class":163},[33,15958,15959],{"class":167}," coerce_dtypes(combined)\n",[33,15961,15962],{"class":35,"line":5570},[33,15963,92],{"emptyLinePlaceholder":91},[33,15965,15966,15969,15971,15973,15975,15977,15979,15981,15983],{"class":35,"line":5577},[33,15967,15968],{"class":167},"    args.output.parent.mkdir(",[33,15970,869],{"class":238},[33,15972,242],{"class":163},[33,15974,855],{"class":50},[33,15976,365],{"class":167},[33,15978,878],{"class":238},[33,15980,242],{"class":163},[33,15982,855],{"class":50},[33,15984,221],{"class":167},[33,15986,15987,15990,15992,15994,15996],{"class":35,"line":5584},[33,15988,15989],{"class":167},"    combined.to_csv(args.output, 
",[33,15991,897],{"class":238},[33,15993,242],{"class":163},[33,15995,902],{"class":50},[33,15997,221],{"class":167},[33,15999,16000,16002,16004,16006,16009,16011,16014,16016,16018,16020,16023,16025,16027,16029,16031,16033,16036,16038,16041,16043,16045],{"class":35,"line":5591},[33,16001,7268],{"class":50},[33,16003,602],{"class":167},[33,16005,4059],{"class":163},[33,16007,16008],{"class":54},"\"Saved ",[33,16010,1115],{"class":50},[33,16012,16013],{"class":167},"combined.shape[",[33,16015,748],{"class":50},[33,16017,9546],{"class":167},[33,16019,1121],{"class":50},[33,16021,16022],{"class":54}," rows × ",[33,16024,1115],{"class":50},[33,16026,16013],{"class":167},[33,16028,734],{"class":50},[33,16030,9546],{"class":167},[33,16032,1121],{"class":50},[33,16034,16035],{"class":54}," cols → ",[33,16037,1115],{"class":50},[33,16039,16040],{"class":167},"args.output",[33,16042,1121],{"class":50},[33,16044,274],{"class":54},[33,16046,221],{"class":167},[33,16048,16049,16051],{"class":35,"line":5602},[33,16050,7268],{"class":50},[33,16052,16053],{"class":167},"(combined.dtypes.to_string())\n",[33,16055,16056],{"class":35,"line":5607},[33,16057,92],{"emptyLinePlaceholder":91},[33,16059,16060],{"class":35,"line":5623},[33,16061,92],{"emptyLinePlaceholder":91},[33,16063,16064,16066,16068,16070,16072],{"class":35,"line":5630},[33,16065,2491],{"class":163},[33,16067,2494],{"class":50},[33,16069,2497],{"class":163},[33,16071,2500],{"class":54},[33,16073,574],{"class":167},[33,16075,16076],{"class":35,"line":5640},[33,16077,6914],{"class":167},[18,16079,6918],{"id":6917},[4211,16081,16082,16087,16093,16098],{},[4214,16083,16084,16086],{},[940,16085,9592],{"href":942}," — pdfplumber and camelot extraction modes explained in depth",[4214,16088,16089,16092],{},[940,16090,6995],{"href":16091},"\u002Fautomating-document-data-pipelines\u002Fextracting-pdf-data-into-pandas\u002Fhandle-multi-page-pdf-tables-in-pandas\u002F"," — fix duplicated headers and column misalignment across page 
breaks",[4214,16094,16095,16097],{},[940,16096,9606],{"href":9605}," — choose the right library before building your pipeline",[4214,16099,16100,16102],{},[940,16101,9599],{"href":9598}," — same dtype-normalization patterns for CSV sources",[14,16104,6947,16105,3035],{},[940,16106,6951],{"href":6950},[6953,16108,16109],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .s691h, html code.shiki .s691h{--shiki-default:#22863A;--shiki-default-font-weight:bold}html pre.shiki code .shJU0, html code.shiki 
.shJU0{--shiki-default:#22863A}",{"title":28,"searchDepth":43,"depth":43,"links":16111},[16112,16113,16114,16118,16119,16120,16121,16122,16127,16128,16129,16130],{"id":20,"depth":43,"text":21},{"id":9749,"depth":43,"text":9750},{"id":10081,"depth":43,"text":10082,"children":16115},[16116,16117],{"id":10088,"depth":61,"text":10089},{"id":10539,"depth":61,"text":10540},{"id":11235,"depth":43,"text":11236},{"id":11584,"depth":43,"text":11585},{"id":11881,"depth":43,"text":11882},{"id":12502,"depth":43,"text":12503},{"id":12943,"depth":43,"text":12944,"children":16123},[16124,16125,16126],{"id":12947,"depth":61,"text":12948},{"id":13314,"depth":61,"text":13315},{"id":13480,"depth":61,"text":13481},{"id":13844,"depth":43,"text":13845},{"id":4270,"depth":43,"text":4271},{"id":14436,"depth":43,"text":14437},{"id":6917,"depth":43,"text":6918},"PDF Data into pandas","Turn PDF tables and text into clean pandas DataFrames using pdfplumber and camelot. Covers extraction, dtype normalization, date\u002Fcurrency parsing, and per-page concat.",{},"\u002Fautomating-document-data-pipelines\u002Fextracting-pdf-data-into-pandas",{"title":948,"description":16132},"Extracting PDF Data into pandas with 
Python","automating-document-data-pipelines\u002Fextracting-pdf-data-into-pandas\u002Findex",[47,9630,9631,943,16139],"camelot","vbar6qcL8Tbry5phwoCznM0FlOfIpEXzUnoHD2loOCQ",{"id":16142,"title":4204,"body":16143,"breadcrumbTitle":26224,"canonical":6977,"date":6978,"description":26225,"draft":6980,"extension":6981,"image":6977,"meta":26226,"navigation":91,"path":26227,"robots":6977,"seo":26228,"seoTitle":26229,"stem":26230,"tags":26231,"updatedAt":6978,"__hash__":26233},"content\u002Fautomating-document-data-pipelines\u002Fgenerating-reports-from-pipeline-data\u002Findex.md",{"type":7,"value":16144,"toc":26205},[16145,16148,16154,16157,16159,16165,16202,16205,16522,16526,16529,16918,16921,16923,16926,17053,17057,17068,18032,18036,18056,18993,18997,19008,20267,20269,20273,20280,20387,20391,20401,20854,20872,20876,20892,20934,21304,21308,21311,21799,21807,21811,21814,21826,22002,22024,22040,22048,22361,22364,22366,22495,22497,22511,26103,26106,26165,26172,26174,26198,26202],[10,16146,4204],{"id":16147},"generating-reports-from-pipeline-data",[14,16149,16150,16151,3035],{},"A pipeline's transform step produces a cleaned DataFrame. The mistake is writing a separate script for each output format. One transform, three deliverables — Excel for analysts, Word for managers, PDF for clients — is the practical pattern. This guide wires that fan-out so segment splits, output naming, and row-count validation all run in a single pass over ",[940,16152,16153],{"href":6950},"your pipeline's data in Automating Document & Data Pipelines",[14,16155,16156],{},"The tricky parts are not the individual libraries; they are keeping outputs synchronized (every segment that appears in Excel must appear in Word and PDF), handling template variables that don't exist in some segments, and keeping memory flat when the DataFrame has hundreds of thousands of rows.",[18,16158,21],{"id":20},[14,16160,16161,16162,16164],{},"Install all three output libraries plus pandas before starting. 
If you are ingesting the DataFrame from a PDF source, see the ",[940,16163,948],{"href":947}," guide for that upstream step.",[23,16166,16168],{"className":25,"code":16167,"language":27,"meta":28,"style":28},"# System deps: none beyond Python 3.9+\npip install pandas openpyxl xlsxwriter python-docx docxtpl reportlab weasyprint\n",[30,16169,16170,16175],{"__ignoreMap":28},[33,16171,16172],{"class":35,"line":36},[33,16173,16174],{"class":39},"# System deps: none beyond Python 3.9+\n",[33,16176,16177,16179,16181,16184,16187,16190,16193,16196,16199],{"class":35,"line":43},[33,16178,76],{"class":46},[33,16180,79],{"class":54},[33,16182,16183],{"class":54}," pandas",[33,16185,16186],{"class":54}," openpyxl",[33,16188,16189],{"class":54}," xlsxwriter",[33,16191,16192],{"class":54}," python-docx",[33,16194,16195],{"class":54}," docxtpl",[33,16197,16198],{"class":54}," reportlab",[33,16200,16201],{"class":54}," weasyprint\n",[14,16203,16204],{},"Create a minimal test fixture so every snippet below runs independently:",[23,16206,16208],{"className":126,"code":16207,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nFIXTURE_CSV = Path(\"\u002Ftmp\u002Fpipeline_sample.csv\")\n\ndata = {\n    \"region\": [\"North\", \"North\", \"South\", \"South\", \"East\", \"East\"],\n    \"product\": [\"Widget A\", \"Widget B\", \"Widget A\", \"Widget C\", \"Widget B\", \"Widget C\"],\n    \"units\": [120, 85, 200, 60, 95, 110],\n    \"revenue\": [2400.0, 1700.0, 4000.0, 1200.0, 1900.0, 2200.0],\n    \"cost\": [1440.0, 1020.0, 2400.0, 720.0, 1140.0, 1320.0],\n}\ndf = pd.DataFrame(data)\ndf[\"margin\"] = df[\"revenue\"] - df[\"cost\"]\nFIXTURE_CSV.parent.mkdir(parents=True, exist_ok=True)\ndf.to_csv(FIXTURE_CSV, 
index=False)\nprint(df)\n",[30,16209,16210,16214,16224,16234,16238,16252,16256,16266,16298,16332,16364,16401,16437,16441,16450,16477,16499,16516],{"__ignoreMap":28},[33,16211,16212],{"class":35,"line":36},[33,16213,8895],{"class":39},[33,16215,16216,16218,16220,16222],{"class":35,"line":43},[33,16217,164],{"class":163},[33,16219,492],{"class":167},[33,16221,495],{"class":163},[33,16223,498],{"class":167},[33,16225,16226,16228,16230,16232],{"class":35,"line":61},[33,16227,190],{"class":163},[33,16229,193],{"class":167},[33,16231,164],{"class":163},[33,16233,198],{"class":167},[33,16235,16236],{"class":35,"line":73},[33,16237,92],{"emptyLinePlaceholder":91},[33,16239,16240,16243,16245,16247,16250],{"class":35,"line":88},[33,16241,16242],{"class":50},"FIXTURE_CSV",[33,16244,212],{"class":163},[33,16246,215],{"class":167},[33,16248,16249],{"class":54},"\"\u002Ftmp\u002Fpipeline_sample.csv\"",[33,16251,221],{"class":167},[33,16253,16254],{"class":35,"line":95},[33,16255,92],{"emptyLinePlaceholder":91},[33,16257,16258,16261,16263],{"class":35,"line":101},[33,16259,16260],{"class":167},"data ",[33,16262,242],{"class":163},[33,16264,16265],{"class":167}," {\n",[33,16267,16268,16271,16273,16275,16277,16279,16281,16283,16285,16287,16289,16292,16294,16296],{"class":35,"line":171},[33,16269,16270],{"class":54},"    \"region\"",[33,16272,12426],{"class":167},[33,16274,11760],{"class":54},[33,16276,365],{"class":167},[33,16278,11760],{"class":54},[33,16280,365],{"class":167},[33,16282,11773],{"class":54},[33,16284,365],{"class":167},[33,16286,11773],{"class":54},[33,16288,365],{"class":167},[33,16290,16291],{"class":54},"\"East\"",[33,16293,365],{"class":167},[33,16295,16291],{"class":54},[33,16297,8935],{"class":167},[33,16299,16300,16303,16305,16308,16310,16313,16315,16317,16319,16322,16324,16326,16328,16330],{"class":35,"line":179},[33,16301,16302],{"class":54},"    \"product\"",[33,16304,12426],{"class":167},[33,16306,16307],{"class":54},"\"Widget 
A\"",[33,16309,365],{"class":167},[33,16311,16312],{"class":54},"\"Widget B\"",[33,16314,365],{"class":167},[33,16316,16307],{"class":54},[33,16318,365],{"class":167},[33,16320,16321],{"class":54},"\"Widget C\"",[33,16323,365],{"class":167},[33,16325,16312],{"class":54},[33,16327,365],{"class":167},[33,16329,16321],{"class":54},[33,16331,8935],{"class":167},[33,16333,16334,16337,16339,16341,16343,16345,16347,16349,16351,16353,16355,16358,16360,16362],{"class":35,"line":187},[33,16335,16336],{"class":54},"    \"units\"",[33,16338,12426],{"class":167},[33,16340,2589],{"class":50},[33,16342,365],{"class":167},[33,16344,12900],{"class":50},[33,16346,365],{"class":167},[33,16348,2611],{"class":50},[33,16350,365],{"class":167},[33,16352,2590],{"class":50},[33,16354,365],{"class":167},[33,16356,16357],{"class":50},"95",[33,16359,365],{"class":167},[33,16361,2679],{"class":50},[33,16363,8935],{"class":167},[33,16365,16366,16369,16371,16374,16376,16379,16381,16384,16386,16389,16391,16394,16396,16399],{"class":35,"line":201},[33,16367,16368],{"class":54},"    \"revenue\"",[33,16370,12426],{"class":167},[33,16372,16373],{"class":50},"2400.0",[33,16375,365],{"class":167},[33,16377,16378],{"class":50},"1700.0",[33,16380,365],{"class":167},[33,16382,16383],{"class":50},"4000.0",[33,16385,365],{"class":167},[33,16387,16388],{"class":50},"1200.0",[33,16390,365],{"class":167},[33,16392,16393],{"class":50},"1900.0",[33,16395,365],{"class":167},[33,16397,16398],{"class":50},"2200.0",[33,16400,8935],{"class":167},[33,16402,16403,16406,16408,16411,16413,16416,16418,16420,16422,16425,16427,16430,16432,16435],{"class":35,"line":206},[33,16404,16405],{"class":54},"    
\"cost\"",[33,16407,12426],{"class":167},[33,16409,16410],{"class":50},"1440.0",[33,16412,365],{"class":167},[33,16414,16415],{"class":50},"1020.0",[33,16417,365],{"class":167},[33,16419,16373],{"class":50},[33,16421,365],{"class":167},[33,16423,16424],{"class":50},"720.0",[33,16426,365],{"class":167},[33,16428,16429],{"class":50},"1140.0",[33,16431,365],{"class":167},[33,16433,16434],{"class":50},"1320.0",[33,16436,8935],{"class":167},[33,16438,16439],{"class":35,"line":224},[33,16440,4113],{"class":167},[33,16442,16443,16445,16447],{"class":35,"line":229},[33,16444,13459],{"class":167},[33,16446,242],{"class":163},[33,16448,16449],{"class":167}," pd.DataFrame(data)\n",[33,16451,16452,16454,16457,16459,16461,16463,16466,16468,16470,16472,16475],{"class":35,"line":235},[33,16453,11038],{"class":167},[33,16455,16456],{"class":54},"\"margin\"",[33,16458,763],{"class":167},[33,16460,242],{"class":163},[33,16462,7935],{"class":167},[33,16464,16465],{"class":54},"\"revenue\"",[33,16467,763],{"class":167},[33,16469,4126],{"class":163},[33,16471,7935],{"class":167},[33,16473,16474],{"class":54},"\"cost\"",[33,16476,9202],{"class":167},[33,16478,16479,16481,16483,16485,16487,16489,16491,16493,16495,16497],{"class":35,"line":250},[33,16480,16242],{"class":50},[33,16482,866],{"class":167},[33,16484,869],{"class":238},[33,16486,242],{"class":163},[33,16488,855],{"class":50},[33,16490,365],{"class":167},[33,16492,878],{"class":238},[33,16494,242],{"class":163},[33,16496,855],{"class":50},[33,16498,221],{"class":167},[33,16500,16501,16504,16506,16508,16510,16512,16514],{"class":35,"line":266},[33,16502,16503],{"class":167},"df.to_csv(",[33,16505,16242],{"class":50},[33,16507,365],{"class":167},[33,16509,897],{"class":238},[33,16511,242],{"class":163},[33,16513,902],{"class":50},[33,16515,221],{"class":167},[33,16517,16518,16520],{"class":35,"line":290},[33,16519,13474],{"class":50},[33,16521,13477],{"class":167},[18,16523,16525],{"id":16524},"diagnostic-inspect-the-dataframe-bef
ore-fanning-out","Diagnostic: Inspect the DataFrame Before Fanning Out",[14,16527,16528],{},"Before writing any output, validate shape, dtypes, and the grouping column. A missing column or the wrong dtype for a numeric field will corrupt all three outputs silently.",[23,16530,16532],{"className":126,"code":16531,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nINPUT_CSV = Path(\"\u002Ftmp\u002Fpipeline_sample.csv\")\n\ndef diagnose(path: Path) -> pd.DataFrame:\n    try:\n        df = pd.read_csv(path)\n    except FileNotFoundError:\n        raise SystemExit(f\"Input not found: {path}\")\n\n    required_cols = {\"region\", \"product\", \"units\", \"revenue\", \"cost\", \"margin\"}\n    missing = required_cols - set(df.columns)\n    if missing:\n        raise ValueError(f\"DataFrame missing columns: {missing}\")\n\n    numeric_cols = [\"units\", \"revenue\", \"cost\", \"margin\"]\n    for col in numeric_cols:\n        if not pd.api.types.is_numeric_dtype(df[col]):\n            df[col] = pd.to_numeric(df[col], errors=\"coerce\")\n            null_count = df[col].isna().sum()\n            if null_count:\n                raise ValueError(f\"Column '{col}' has {null_count} non-numeric rows after coercion\")\n\n    segments = df[\"region\"].unique()\n    print(f\"Shape: {df.shape}  |  Segments: {list(segments)}  |  Nulls: {df.isna().sum().sum()}\")\n    return df\n\ndf = 
diagnose(INPUT_CSV)\n",[30,16533,16534,16538,16548,16558,16562,16575,16579,16588,16594,16603,16611,16635,16639,16674,16688,16694,16717,16721,16746,16757,16766,16783,16793,16800,16835,16839,16853,16895,16901,16905],{"__ignoreMap":28},[33,16535,16536],{"class":35,"line":36},[33,16537,8895],{"class":39},[33,16539,16540,16542,16544,16546],{"class":35,"line":43},[33,16541,164],{"class":163},[33,16543,492],{"class":167},[33,16545,495],{"class":163},[33,16547,498],{"class":167},[33,16549,16550,16552,16554,16556],{"class":35,"line":61},[33,16551,190],{"class":163},[33,16553,193],{"class":167},[33,16555,164],{"class":163},[33,16557,198],{"class":167},[33,16559,16560],{"class":35,"line":73},[33,16561,92],{"emptyLinePlaceholder":91},[33,16563,16564,16567,16569,16571,16573],{"class":35,"line":88},[33,16565,16566],{"class":50},"INPUT_CSV",[33,16568,212],{"class":163},[33,16570,215],{"class":167},[33,16572,16249],{"class":54},[33,16574,221],{"class":167},[33,16576,16577],{"class":35,"line":95},[33,16578,92],{"emptyLinePlaceholder":91},[33,16580,16581,16583,16586],{"class":35,"line":101},[33,16582,562],{"class":163},[33,16584,16585],{"class":46}," diagnose",[33,16587,7103],{"class":167},[33,16589,16590,16592],{"class":35,"line":171},[33,16591,2424],{"class":163},[33,16593,574],{"class":167},[33,16595,16596,16598,16600],{"class":35,"line":179},[33,16597,7930],{"class":167},[33,16599,242],{"class":163},[33,16601,16602],{"class":167}," pd.read_csv(path)\n",[33,16604,16605,16607,16609],{"class":35,"line":187},[33,16606,2449],{"class":163},[33,16608,2945],{"class":50},[33,16610,574],{"class":167},[33,16612,16613,16615,16618,16620,16622,16625,16627,16629,16631,16633],{"class":35,"line":201},[33,16614,4051],{"class":163},[33,16616,16617],{"class":50}," SystemExit",[33,16619,602],{"class":167},[33,16621,4059],{"class":163},[33,16623,16624],{"class":54},"\"Input not found: 
",[33,16626,1115],{"class":50},[33,16628,2580],{"class":167},[33,16630,1121],{"class":50},[33,16632,274],{"class":54},[33,16634,221],{"class":167},[33,16636,16637],{"class":35,"line":206},[33,16638,92],{"emptyLinePlaceholder":91},[33,16640,16641,16643,16645,16647,16650,16652,16655,16657,16660,16662,16664,16666,16668,16670,16672],{"class":35,"line":224},[33,16642,4093],{"class":167},[33,16644,242],{"class":163},[33,16646,4098],{"class":167},[33,16648,16649],{"class":54},"\"region\"",[33,16651,365],{"class":167},[33,16653,16654],{"class":54},"\"product\"",[33,16656,365],{"class":167},[33,16658,16659],{"class":54},"\"units\"",[33,16661,365],{"class":167},[33,16663,16465],{"class":54},[33,16665,365],{"class":167},[33,16667,16474],{"class":54},[33,16669,365],{"class":167},[33,16671,16456],{"class":54},[33,16673,4113],{"class":167},[33,16675,16676,16678,16680,16682,16684,16686],{"class":35,"line":229},[33,16677,4118],{"class":167},[33,16679,242],{"class":163},[33,16681,4123],{"class":167},[33,16683,4126],{"class":163},[33,16685,4129],{"class":50},[33,16687,4132],{"class":167},[33,16689,16690,16692],{"class":35,"line":235},[33,16691,617],{"class":163},[33,16693,4139],{"class":167},[33,16695,16696,16698,16700,16702,16704,16707,16709,16711,16713,16715],{"class":35,"line":250},[33,16697,4051],{"class":163},[33,16699,4054],{"class":50},[33,16701,602],{"class":167},[33,16703,4059],{"class":163},[33,16705,16706],{"class":54},"\"DataFrame missing columns: ",[33,16708,1115],{"class":50},[33,16710,4157],{"class":167},[33,16712,1121],{"class":50},[33,16714,274],{"class":54},[33,16716,221],{"class":167},[33,16718,16719],{"class":35,"line":266},[33,16720,92],{"emptyLinePlaceholder":91},[33,16722,16723,16726,16728,16730,16732,16734,16736,16738,16740,16742,16744],{"class":35,"line":290},[33,16724,16725],{"class":167},"    numeric_cols 
",[33,16727,242],{"class":163},[33,16729,9178],{"class":167},[33,16731,16659],{"class":54},[33,16733,365],{"class":167},[33,16735,16465],{"class":54},[33,16737,365],{"class":167},[33,16739,16474],{"class":54},[33,16741,365],{"class":167},[33,16743,16456],{"class":54},[33,16745,9202],{"class":167},[33,16747,16748,16750,16752,16754],{"class":35,"line":295},[33,16749,656],{"class":163},[33,16751,7985],{"class":167},[33,16753,662],{"class":163},[33,16755,16756],{"class":167}," numeric_cols:\n",[33,16758,16759,16761,16763],{"class":35,"line":300},[33,16760,8221],{"class":163},[33,16762,620],{"class":163},[33,16764,16765],{"class":167}," pd.api.types.is_numeric_dtype(df[col]):\n",[33,16767,16768,16770,16772,16775,16777,16779,16781],{"class":35,"line":317},[33,16769,11690],{"class":167},[33,16771,242],{"class":163},[33,16773,16774],{"class":167}," pd.to_numeric(df[col], ",[33,16776,8317],{"class":238},[33,16778,242],{"class":163},[33,16780,12107],{"class":54},[33,16782,221],{"class":167},[33,16784,16785,16788,16790],{"class":35,"line":332},[33,16786,16787],{"class":167},"            null_count ",[33,16789,242],{"class":163},[33,16791,16792],{"class":167}," df[col].isna().sum()\n",[33,16794,16795,16797],{"class":35,"line":347},[33,16796,5995],{"class":163},[33,16798,16799],{"class":167}," null_count:\n",[33,16801,16802,16805,16807,16809,16811,16814,16816,16818,16820,16823,16825,16828,16830,16833],{"class":35,"line":374},[33,16803,16804],{"class":163},"                raise",[33,16806,4054],{"class":50},[33,16808,602],{"class":167},[33,16810,4059],{"class":163},[33,16812,16813],{"class":54},"\"Column '",[33,16815,1115],{"class":50},[33,16817,8276],{"class":167},[33,16819,1121],{"class":50},[33,16821,16822],{"class":54},"' has ",[33,16824,1115],{"class":50},[33,16826,16827],{"class":167},"null_count",[33,16829,1121],{"class":50},[33,16831,16832],{"class":54}," non-numeric rows after 
coercion\"",[33,16834,221],{"class":167},[33,16836,16837],{"class":35,"line":397},[33,16838,92],{"emptyLinePlaceholder":91},[33,16840,16841,16844,16846,16848,16850],{"class":35,"line":653},[33,16842,16843],{"class":167},"    segments ",[33,16845,242],{"class":163},[33,16847,7935],{"class":167},[33,16849,16649],{"class":54},[33,16851,16852],{"class":167},"].unique()\n",[33,16854,16855,16857,16859,16861,16864,16866,16868,16870,16873,16876,16879,16881,16884,16886,16889,16891,16893],{"class":35,"line":667},[33,16856,7268],{"class":50},[33,16858,602],{"class":167},[33,16860,4059],{"class":163},[33,16862,16863],{"class":54},"\"Shape: ",[33,16865,1115],{"class":50},[33,16867,9426],{"class":167},[33,16869,1121],{"class":50},[33,16871,16872],{"class":54},"  |  Segments: ",[33,16874,16875],{"class":50},"{list",[33,16877,16878],{"class":167},"(segments)",[33,16880,1121],{"class":50},[33,16882,16883],{"class":54},"  |  Nulls: ",[33,16885,1115],{"class":50},[33,16887,16888],{"class":167},"df.isna().sum().sum()",[33,16890,1121],{"class":50},[33,16892,274],{"class":54},[33,16894,221],{"class":167},[33,16896,16897,16899],{"class":35,"line":675},[33,16898,1332],{"class":163},[33,16900,11719],{"class":167},[33,16902,16903],{"class":35,"line":689},[33,16904,92],{"emptyLinePlaceholder":91},[33,16906,16907,16909,16911,16914,16916],{"class":35,"line":703},[33,16908,13459],{"class":167},[33,16910,242],{"class":163},[33,16912,16913],{"class":167}," diagnose(",[33,16915,16566],{"class":50},[33,16917,221],{"class":167},[14,16919,16920],{},"Check the segment count matches your expectation. Three regions that should be four means a data issue upstream — not a report issue.",[18,16922,422],{"id":421},[14,16924,16925],{},"The fan-out follows a fixed order: Excel first (pure data, easiest to validate), then Word (narrative summary over aggregates), then PDF (final deliverable). 
Each step reads the same in-memory DataFrame so there is no drift between outputs.",[2540,16927,2547,16929,2547,16932,2547,16935,2547,2547,16974,2547,16977,2547,16980,2547,16984,2547,2547,16988,2547,16993,2547,16995,2547,2547,16999,2547,17002,2547,2547,17006,2547,17011,2547,17016,2547,17020,2547,2547,17023,2547,17027,2547,17031,2547,17034,2547,2547,17038,2547,17042,2547,17046,2547,17050],{"viewBox":2542,"role":2543,"ariaLabel":16928,"xmlns":2545,"style":2546},"DataFrame fan-out to Excel, Word, and PDF outputs",[2549,16930,16931],{},"Pipeline report fan-out diagram",[2553,16933,16934],{},"A cleaned pandas DataFrame on the left fans out via arrows to three output boxes on the right: a formatted Excel workbook, a Word summary document, and a PDF report.",[2557,16936,2559,16937,2559,16944,2559,16953,2559,16961,2559,16969,2547],{},[2561,16938,2564,16940,2564,16942,2559],{"id":16939,"x1":748,"y1":748,"x2":734,"y2":748},"pipeline-reports-grad-src",[2566,16941],{"offset":748,"style":2568},[2566,16943],{"offset":734,"style":2571},[2561,16945,2564,16947,2564,16950,2559],{"id":16946,"x1":748,"y1":748,"x2":748,"y2":734},"pipeline-reports-grad-excel",[2566,16948],{"offset":748,"style":16949},"stop-color:#ffffff",[2566,16951],{"offset":734,"style":16952},"stop-color:#f0fdf4",[2561,16954,2564,16956,2564,16958,2559],{"id":16955,"x1":748,"y1":748,"x2":748,"y2":734},"pipeline-reports-grad-word",[2566,16957],{"offset":748,"style":16949},[2566,16959],{"offset":734,"style":16960},"stop-color:#eff6ff",[2561,16962,2564,16964,2564,16966,2559],{"id":16963,"x1":748,"y1":748,"x2":748,"y2":734},"pipeline-reports-grad-pdf",[2566,16965],{"offset":748,"style":16949},[2566,16967],{"offset":734,"style":16968},"stop-color:#fef2f2",[2573,16970,2564,16972,2559],{"id":16971,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"pipeline-reports-arrow",[2580,16973],{"d":2582,"fill":2583},[2585,16975],{"x":2587,"y":2589,"width":2611,"height":2650,"rx":3545,"fill":16
976,"stroke":11166,"style":2594},"url(#pipeline-reports-grad-src)",[2000,16978,9630],{"x":2589,"y":11132,"fill":2592,"style":16979},"text-anchor:middle;font-size:14px;font-weight:bold",[2000,16981,11219],{"x":2589,"y":16982,"fill":11165,"style":16983},"178","text-anchor:middle;font-size:14px",[2000,16985,16987],{"x":2589,"y":16986,"fill":11165,"style":2685},"198","(cleaned + typed)",[35,16989],{"x1":2701,"y1":2648,"x2":16990,"y2":16991,"stroke":2583,"markerEnd":16992,"style":2594},"490","75","url(#pipeline-reports-arrow)",[35,16994],{"x1":2701,"y1":11115,"x2":16990,"y2":11115,"stroke":2583,"markerEnd":16992,"style":2594},[35,16996],{"x1":2701,"y1":16997,"x2":16990,"y2":16998,"stroke":2583,"markerEnd":16992,"style":2594},"195","263",[2585,17000],{"x":17001,"y":2598,"width":2589,"height":1543,"rx":2681,"fill":2615,"stroke":2593,"style":11105},"275",[2000,17003,17005],{"x":17004,"y":2664,"fill":2583,"style":2685},"335","per-segment split",[2585,17007],{"x":16990,"y":1543,"width":17008,"height":2630,"rx":3545,"fill":17009,"stroke":17010,"style":2594},"240","url(#pipeline-reports-grad-excel)","#16a34a",[2000,17012,17015],{"x":17013,"y":17014,"fill":2599,"style":16979},"610","64","Excel Workbook",[2000,17017,17019],{"x":17013,"y":17018,"fill":2583,"style":2685},"84","openpyxl \u002F xlsxwriter",[2000,17021,17022],{"x":17013,"y":11155,"fill":2583,"style":2685},"formatted table + totals row",[2585,17024],{"x":16990,"y":17025,"width":17008,"height":2630,"rx":3545,"fill":17026,"stroke":11166,"style":2594},"125","url(#pipeline-reports-grad-word)",[2000,17028,17030],{"x":17013,"y":17029,"fill":2599,"style":16979},"159","Word Summary",[2000,17032,17033],{"x":17013,"y":11176,"fill":2583,"style":2685},"python-docx \u002F docxtpl",[2000,17035,17037],{"x":17013,"y":17036,"fill":2583,"style":2685},"199","narrative + aggregate 
table",[2585,17039],{"x":16990,"y":2701,"width":17008,"height":2630,"rx":3545,"fill":17040,"stroke":17041,"style":2594},"url(#pipeline-reports-grad-pdf)","#dc2626",[2000,17043,17045],{"x":17013,"y":17044,"fill":2599,"style":16979},"254","PDF Report",[2000,17047,17049],{"x":17013,"y":17048,"fill":2583,"style":2685},"274","ReportLab \u002F WeasyPrint",[2000,17051,17052],{"x":17013,"y":2689,"fill":2583,"style":2685},"print-ready, per-segment",[424,17054,17056],{"id":17055},"step-1-dataframe-to-formatted-excel-workbook","Step 1 — DataFrame to Formatted Excel Workbook",[14,17058,17059,17060,17063,17064,17067],{},"Use ",[940,17061,17062],{"href":6935},"openpyxl for formatted Excel output"," when you need header styling, column widths, and a totals row. ",[30,17065,17066],{},"xlsxwriter"," is faster for write-only large files; swap the engine if rows exceed 200,000.",[23,17069,17071],{"className":126,"code":17070,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nfrom pathlib import Path\nfrom openpyxl import load_workbook\nfrom openpyxl.styles import Font, PatternFill, Alignment, numbers\nfrom openpyxl.utils import get_column_letter\n\nOUTPUT_DIR = Path(\"\u002Ftmp\u002Freports\")\nOUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n\nHEADER_FILL = PatternFill(\"solid\", fgColor=\"2563EB\")\nTOTAL_FILL  = PatternFill(\"solid\", fgColor=\"DBEAFE\")\nHEADER_FONT = Font(bold=True, color=\"FFFFFF\", size=11)\n\ndef df_to_excel(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:\n    safe_seg = segment.replace(\" \", \"_\").lower()\n    path = output_dir \u002F f\"pipeline-reports-{safe_seg}.xlsx\"\n\n    # Write raw data first with pandas engine\n    try:\n        with pd.ExcelWriter(path, engine=\"openpyxl\") as writer:\n            df.to_excel(writer, sheet_name=\"Data\", index=False)\n    except PermissionError:\n        raise RuntimeError(f\"File locked: {path}\")\n\n    # Re-open with openpyxl to apply formatting\n    wb = 
load_workbook(path)\n    ws = wb[\"Data\"]\n\n    # Header row styling\n    for cell in ws[1]:\n        cell.font = HEADER_FONT\n        cell.fill = HEADER_FILL\n        cell.alignment = Alignment(horizontal=\"center\")\n\n    # Column widths + number formats\n    currency_cols = {col: ws.cell(1, col).value\n                     for col in range(1, ws.max_column + 1)\n                     if ws.cell(1, col).value in (\"revenue\", \"cost\", \"margin\")}\n    for col_idx, header in currency_cols.items():\n        letter = get_column_letter(col_idx)\n        ws.column_dimensions[letter].width = 14\n        for row in ws.iter_rows(min_row=2, min_col=col_idx, max_col=col_idx):\n            for cell in row:\n                cell.number_format = '\"$\"#,##0.00'\n\n    # Totals row\n    total_row = ws.max_row + 1\n    numeric_headers = [\"units\", \"revenue\", \"cost\", \"margin\"]\n    for col_idx in range(1, ws.max_column + 1):\n        header = ws.cell(1, col_idx).value\n        cell = ws.cell(total_row, col_idx)\n        if header in numeric_headers:\n            letter = get_column_letter(col_idx)\n            cell.value = f\"=SUM({letter}2:{letter}{ws.max_row - 1})\"\n            cell.font = Font(bold=True)\n            cell.fill = TOTAL_FILL\n            if header != \"units\":\n                cell.number_format = '\"$\"#,##0.00'\n        elif col_idx == 1:\n            cell.value = \"TOTAL\"\n            cell.font = Font(bold=True)\n            cell.fill = TOTAL_FILL\n\n    wb.save(path)\n    return path\n\ndf = pd.read_csv(\"\u002Ftmp\u002Fpipeline_sample.csv\")\nfor region, group in df.groupby(\"region\"):\n    out = df_to_excel(group.copy(), region, OUTPUT_DIR)\n    print(f\"Excel: {out}  ({len(group)} 
rows)\")\n",[30,17072,17073,17077,17087,17097,17109,17121,17133,17137,17150,17172,17176,17201,17224,17263,17267,17282,17302,17327,17331,17336,17342,17364,17387,17396,17419,17423,17428,17438,17452,17456,17461,17478,17488,17498,17518,17522,17527,17542,17566,17597,17609,17619,17629,17665,17676,17686,17690,17695,17710,17735,17758,17772,17782,17794,17803,17842,17859,17869,17883,17891,17904,17913,17929,17937,17941,17946,17953,17957,17969,17985,17999],{"__ignoreMap":28},[33,17074,17075],{"class":35,"line":36},[33,17076,3952],{"class":39},[33,17078,17079,17081,17083,17085],{"class":35,"line":43},[33,17080,164],{"class":163},[33,17082,492],{"class":167},[33,17084,495],{"class":163},[33,17086,498],{"class":167},[33,17088,17089,17091,17093,17095],{"class":35,"line":61},[33,17090,190],{"class":163},[33,17092,193],{"class":167},[33,17094,164],{"class":163},[33,17096,198],{"class":167},[33,17098,17099,17101,17104,17106],{"class":35,"line":73},[33,17100,190],{"class":163},[33,17102,17103],{"class":167}," openpyxl ",[33,17105,164],{"class":163},[33,17107,17108],{"class":167}," load_workbook\n",[33,17110,17111,17113,17116,17118],{"class":35,"line":88},[33,17112,190],{"class":163},[33,17114,17115],{"class":167}," openpyxl.styles ",[33,17117,164],{"class":163},[33,17119,17120],{"class":167}," Font, PatternFill, Alignment, numbers\n",[33,17122,17123,17125,17128,17130],{"class":35,"line":95},[33,17124,190],{"class":163},[33,17126,17127],{"class":167}," openpyxl.utils ",[33,17129,164],{"class":163},[33,17131,17132],{"class":167}," 
get_column_letter\n",[33,17134,17135],{"class":35,"line":101},[33,17136,92],{"emptyLinePlaceholder":91},[33,17138,17139,17141,17143,17145,17148],{"class":35,"line":171},[33,17140,4615],{"class":50},[33,17142,212],{"class":163},[33,17144,215],{"class":167},[33,17146,17147],{"class":54},"\"\u002Ftmp\u002Freports\"",[33,17149,221],{"class":167},[33,17151,17152,17154,17156,17158,17160,17162,17164,17166,17168,17170],{"class":35,"line":179},[33,17153,4615],{"class":50},[33,17155,1078],{"class":167},[33,17157,869],{"class":238},[33,17159,242],{"class":163},[33,17161,855],{"class":50},[33,17163,365],{"class":167},[33,17165,878],{"class":238},[33,17167,242],{"class":163},[33,17169,855],{"class":50},[33,17171,221],{"class":167},[33,17173,17174],{"class":35,"line":187},[33,17175,92],{"emptyLinePlaceholder":91},[33,17177,17178,17181,17183,17186,17189,17191,17194,17196,17199],{"class":35,"line":201},[33,17179,17180],{"class":50},"HEADER_FILL",[33,17182,212],{"class":163},[33,17184,17185],{"class":167}," PatternFill(",[33,17187,17188],{"class":54},"\"solid\"",[33,17190,365],{"class":167},[33,17192,17193],{"class":238},"fgColor",[33,17195,242],{"class":163},[33,17197,17198],{"class":54},"\"2563EB\"",[33,17200,221],{"class":167},[33,17202,17203,17206,17209,17211,17213,17215,17217,17219,17222],{"class":35,"line":206},[33,17204,17205],{"class":50},"TOTAL_FILL",[33,17207,17208],{"class":163},"  =",[33,17210,17185],{"class":167},[33,17212,17188],{"class":54},[33,17214,365],{"class":167},[33,17216,17193],{"class":238},[33,17218,242],{"class":163},[33,17220,17221],{"class":54},"\"DBEAFE\"",[33,17223,221],{"class":167},[33,17225,17226,17229,17231,17234,17237,17239,17241,17243,17246,17248,17251,17253,17256,17258,17261],{"class":35,"line":224},[33,17227,17228],{"class":50},"HEADER_FONT",[33,17230,212],{"class":163},[33,17232,17233],{"class":167}," 
Font(",[33,17235,17236],{"class":238},"bold",[33,17238,242],{"class":163},[33,17240,855],{"class":50},[33,17242,365],{"class":167},[33,17244,17245],{"class":238},"color",[33,17247,242],{"class":163},[33,17249,17250],{"class":54},"\"FFFFFF\"",[33,17252,365],{"class":167},[33,17254,17255],{"class":238},"size",[33,17257,242],{"class":163},[33,17259,17260],{"class":50},"11",[33,17262,221],{"class":167},[33,17264,17265],{"class":35,"line":229},[33,17266,92],{"emptyLinePlaceholder":91},[33,17268,17269,17271,17274,17277,17279],{"class":35,"line":235},[33,17270,562],{"class":163},[33,17272,17273],{"class":46}," df_to_excel",[33,17275,17276],{"class":167},"(df: pd.DataFrame, segment: ",[33,17278,1053],{"class":50},[33,17280,17281],{"class":167},", output_dir: Path) -> Path:\n",[33,17283,17284,17287,17289,17292,17295,17297,17299],{"class":35,"line":250},[33,17285,17286],{"class":167},"    safe_seg ",[33,17288,242],{"class":163},[33,17290,17291],{"class":167}," segment.replace(",[33,17293,17294],{"class":54},"\" \"",[33,17296,365],{"class":167},[33,17298,7764],{"class":54},[33,17300,17301],{"class":167},").lower()\n",[33,17303,17304,17307,17309,17311,17313,17315,17318,17320,17323,17325],{"class":35,"line":266},[33,17305,17306],{"class":167},"    path ",[33,17308,242],{"class":163},[33,17310,6393],{"class":167},[33,17312,1351],{"class":163},[33,17314,1110],{"class":163},[33,17316,17317],{"class":54},"\"pipeline-reports-",[33,17319,1115],{"class":50},[33,17321,17322],{"class":167},"safe_seg",[33,17324,1121],{"class":50},[33,17326,6410],{"class":54},[33,17328,17329],{"class":35,"line":290},[33,17330,92],{"emptyLinePlaceholder":91},[33,17332,17333],{"class":35,"line":295},[33,17334,17335],{"class":39},"    # Write raw data first with pandas 
engine\n",[33,17337,17338,17340],{"class":35,"line":300},[33,17339,2424],{"class":163},[33,17341,574],{"class":167},[33,17343,17344,17346,17349,17352,17354,17357,17359,17361],{"class":35,"line":317},[33,17345,2191],{"class":163},[33,17347,17348],{"class":167}," pd.ExcelWriter(path, ",[33,17350,17351],{"class":238},"engine",[33,17353,242],{"class":163},[33,17355,17356],{"class":54},"\"openpyxl\"",[33,17358,1649],{"class":167},[33,17360,495],{"class":163},[33,17362,17363],{"class":167}," writer:\n",[33,17365,17366,17369,17372,17374,17377,17379,17381,17383,17385],{"class":35,"line":332},[33,17367,17368],{"class":167},"            df.to_excel(writer, ",[33,17370,17371],{"class":238},"sheet_name",[33,17373,242],{"class":163},[33,17375,17376],{"class":54},"\"Data\"",[33,17378,365],{"class":167},[33,17380,897],{"class":238},[33,17382,242],{"class":163},[33,17384,902],{"class":50},[33,17386,221],{"class":167},[33,17388,17389,17391,17394],{"class":35,"line":347},[33,17390,2449],{"class":163},[33,17392,17393],{"class":50}," PermissionError",[33,17395,574],{"class":167},[33,17397,17398,17400,17402,17404,17406,17409,17411,17413,17415,17417],{"class":35,"line":374},[33,17399,4051],{"class":163},[33,17401,7590],{"class":50},[33,17403,602],{"class":167},[33,17405,4059],{"class":163},[33,17407,17408],{"class":54},"\"File locked: ",[33,17410,1115],{"class":50},[33,17412,2580],{"class":167},[33,17414,1121],{"class":50},[33,17416,274],{"class":54},[33,17418,221],{"class":167},[33,17420,17421],{"class":35,"line":397},[33,17422,92],{"emptyLinePlaceholder":91},[33,17424,17425],{"class":35,"line":653},[33,17426,17427],{"class":39},"    # Re-open with openpyxl to apply formatting\n",[33,17429,17430,17433,17435],{"class":35,"line":667},[33,17431,17432],{"class":167},"    wb ",[33,17434,242],{"class":163},[33,17436,17437],{"class":167}," load_workbook(path)\n",[33,17439,17440,17443,17445,17448,17450],{"class":35,"line":675},[33,17441,17442],{"class":167},"    ws 
",[33,17444,242],{"class":163},[33,17446,17447],{"class":167}," wb[",[33,17449,17376],{"class":54},[33,17451,9202],{"class":167},[33,17453,17454],{"class":35,"line":689},[33,17455,92],{"emptyLinePlaceholder":91},[33,17457,17458],{"class":35,"line":703},[33,17459,17460],{"class":39},"    # Header row styling\n",[33,17462,17463,17465,17468,17470,17473,17475],{"class":35,"line":714},[33,17464,656],{"class":163},[33,17466,17467],{"class":167}," cell ",[33,17469,662],{"class":163},[33,17471,17472],{"class":167}," ws[",[33,17474,734],{"class":50},[33,17476,17477],{"class":167},"]:\n",[33,17479,17480,17483,17485],{"class":35,"line":723},[33,17481,17482],{"class":167},"        cell.font ",[33,17484,242],{"class":163},[33,17486,17487],{"class":50}," HEADER_FONT\n",[33,17489,17490,17493,17495],{"class":35,"line":754},[33,17491,17492],{"class":167},"        cell.fill ",[33,17494,242],{"class":163},[33,17496,17497],{"class":50}," HEADER_FILL\n",[33,17499,17500,17503,17505,17508,17511,17513,17516],{"class":35,"line":771},[33,17501,17502],{"class":167},"        cell.alignment ",[33,17504,242],{"class":163},[33,17506,17507],{"class":167}," Alignment(",[33,17509,17510],{"class":238},"horizontal",[33,17512,242],{"class":163},[33,17514,17515],{"class":54},"\"center\"",[33,17517,221],{"class":167},[33,17519,17520],{"class":35,"line":777},[33,17521,92],{"emptyLinePlaceholder":91},[33,17523,17524],{"class":35,"line":788},[33,17525,17526],{"class":39},"    # Column widths + number formats\n",[33,17528,17529,17532,17534,17537,17539],{"class":35,"line":804},[33,17530,17531],{"class":167},"    currency_cols ",[33,17533,242],{"class":163},[33,17535,17536],{"class":167}," {col: ws.cell(",[33,17538,734],{"class":50},[33,17540,17541],{"class":167},", col).value\n",[33,17543,17544,17547,17549,17551,17553,17555,17557,17560,17562,17564],{"class":35,"line":809},[33,17545,17546],{"class":163},"                     
for",[33,17548,7985],{"class":167},[33,17550,662],{"class":163},[33,17552,1801],{"class":50},[33,17554,602],{"class":167},[33,17556,734],{"class":50},[33,17558,17559],{"class":167},", ws.max_column ",[33,17561,1811],{"class":163},[33,17563,1814],{"class":50},[33,17565,221],{"class":167},[33,17567,17568,17571,17574,17576,17579,17581,17584,17586,17588,17590,17592,17594],{"class":35,"line":819},[33,17569,17570],{"class":163},"                     if",[33,17572,17573],{"class":167}," ws.cell(",[33,17575,734],{"class":50},[33,17577,17578],{"class":167},", col).value ",[33,17580,662],{"class":163},[33,17582,17583],{"class":167}," (",[33,17585,16465],{"class":54},[33,17587,365],{"class":167},[33,17589,16474],{"class":54},[33,17591,365],{"class":167},[33,17593,16456],{"class":54},[33,17595,17596],{"class":167},")}\n",[33,17598,17599,17601,17604,17606],{"class":35,"line":829},[33,17600,656],{"class":163},[33,17602,17603],{"class":167}," col_idx, header ",[33,17605,662],{"class":163},[33,17607,17608],{"class":167}," currency_cols.items():\n",[33,17610,17611,17614,17616],{"class":35,"line":834},[33,17612,17613],{"class":167},"        letter ",[33,17615,242],{"class":163},[33,17617,17618],{"class":167}," get_column_letter(col_idx)\n",[33,17620,17621,17624,17626],{"class":35,"line":839},[33,17622,17623],{"class":167},"        ws.column_dimensions[letter].width ",[33,17625,242],{"class":163},[33,17627,17628],{"class":50}," 14\n",[33,17630,17631,17633,17635,17637,17640,17643,17645,17647,17649,17652,17654,17657,17660,17662],{"class":35,"line":860},[33,17632,5973],{"class":163},[33,17634,3844],{"class":167},[33,17636,662],{"class":163},[33,17638,17639],{"class":167}," ws.iter_rows(",[33,17641,17642],{"class":238},"min_row",[33,17644,242],{"class":163},[33,17646,1533],{"class":50},[33,17648,365],{"class":167},[33,17650,17651],{"class":238},"min_col",[33,17653,242],{"class":163},[33,17655,17656],{"class":167},"col_idx, 
",[33,17658,17659],{"class":238},"max_col",[33,17661,242],{"class":163},[33,17663,17664],{"class":167},"col_idx):\n",[33,17666,17667,17669,17671,17673],{"class":35,"line":887},[33,17668,1793],{"class":163},[33,17670,17467],{"class":167},[33,17672,662],{"class":163},[33,17674,17675],{"class":167}," row:\n",[33,17677,17678,17681,17683],{"class":35,"line":907},[33,17679,17680],{"class":167},"                cell.number_format ",[33,17682,242],{"class":163},[33,17684,17685],{"class":54}," '\"$\"#,##0.00'\n",[33,17687,17688],{"class":35,"line":1826},[33,17689,92],{"emptyLinePlaceholder":91},[33,17691,17692],{"class":35,"line":1844},[33,17693,17694],{"class":39},"    # Totals row\n",[33,17696,17697,17700,17702,17705,17707],{"class":35,"line":1858},[33,17698,17699],{"class":167},"    total_row ",[33,17701,242],{"class":163},[33,17703,17704],{"class":167}," ws.max_row ",[33,17706,1811],{"class":163},[33,17708,17709],{"class":50}," 1\n",[33,17711,17712,17715,17717,17719,17721,17723,17725,17727,17729,17731,17733],{"class":35,"line":1871},[33,17713,17714],{"class":167},"    numeric_headers ",[33,17716,242],{"class":163},[33,17718,9178],{"class":167},[33,17720,16659],{"class":54},[33,17722,365],{"class":167},[33,17724,16465],{"class":54},[33,17726,365],{"class":167},[33,17728,16474],{"class":54},[33,17730,365],{"class":167},[33,17732,16456],{"class":54},[33,17734,9202],{"class":167},[33,17736,17737,17739,17742,17744,17746,17748,17750,17752,17754,17756],{"class":35,"line":1877},[33,17738,656],{"class":163},[33,17740,17741],{"class":167}," col_idx ",[33,17743,662],{"class":163},[33,17745,1801],{"class":50},[33,17747,602],{"class":167},[33,17749,734],{"class":50},[33,17751,17559],{"class":167},[33,17753,1811],{"class":163},[33,17755,1814],{"class":50},[33,17757,1737],{"class":167},[33,17759,17760,17763,17765,17767,17769],{"class":35,"line":1883},[33,17761,17762],{"class":167},"        header 
",[33,17764,242],{"class":163},[33,17766,17573],{"class":167},[33,17768,734],{"class":50},[33,17770,17771],{"class":167},", col_idx).value\n",[33,17773,17774,17777,17779],{"class":35,"line":1915},[33,17775,17776],{"class":167},"        cell ",[33,17778,242],{"class":163},[33,17780,17781],{"class":167}," ws.cell(total_row, col_idx)\n",[33,17783,17784,17786,17789,17791],{"class":35,"line":1926},[33,17785,8221],{"class":163},[33,17787,17788],{"class":167}," header ",[33,17790,662],{"class":163},[33,17792,17793],{"class":167}," numeric_headers:\n",[33,17795,17796,17799,17801],{"class":35,"line":1932},[33,17797,17798],{"class":167},"            letter ",[33,17800,242],{"class":163},[33,17802,17618],{"class":167},[33,17804,17805,17808,17810,17812,17815,17817,17820,17822,17825,17827,17829,17832,17835,17837,17839],{"class":35,"line":1938},[33,17806,17807],{"class":167},"            cell.value ",[33,17809,242],{"class":163},[33,17811,1110],{"class":163},[33,17813,17814],{"class":54},"\"=SUM(",[33,17816,1115],{"class":50},[33,17818,17819],{"class":167},"letter",[33,17821,1121],{"class":50},[33,17823,17824],{"class":54},"2:",[33,17826,1115],{"class":50},[33,17828,17819],{"class":167},[33,17830,17831],{"class":50},"}{",[33,17833,17834],{"class":167},"ws.max_row ",[33,17836,4126],{"class":163},[33,17838,11022],{"class":50},[33,17840,17841],{"class":54},")\"\n",[33,17843,17844,17847,17849,17851,17853,17855,17857],{"class":35,"line":1950},[33,17845,17846],{"class":167},"            cell.font ",[33,17848,242],{"class":163},[33,17850,17233],{"class":167},[33,17852,17236],{"class":238},[33,17854,242],{"class":163},[33,17856,855],{"class":50},[33,17858,221],{"class":167},[33,17860,17861,17864,17866],{"class":35,"line":1958},[33,17862,17863],{"class":167},"            cell.fill ",[33,17865,242],{"class":163},[33,17867,17868],{"class":50}," 
TOTAL_FILL\n",[33,17870,17871,17873,17875,17878,17881],{"class":35,"line":4904},[33,17872,5995],{"class":163},[33,17874,17788],{"class":167},[33,17876,17877],{"class":163},"!=",[33,17879,17880],{"class":54}," \"units\"",[33,17882,574],{"class":167},[33,17884,17885,17887,17889],{"class":35,"line":4909},[33,17886,17680],{"class":167},[33,17888,242],{"class":163},[33,17890,17685],{"class":54},[33,17892,17893,17896,17898,17900,17902],{"class":35,"line":4915},[33,17894,17895],{"class":163},"        elif",[33,17897,17741],{"class":167},[33,17899,1865],{"class":163},[33,17901,1814],{"class":50},[33,17903,574],{"class":167},[33,17905,17906,17908,17910],{"class":35,"line":4925},[33,17907,17807],{"class":167},[33,17909,242],{"class":163},[33,17911,17912],{"class":54}," \"TOTAL\"\n",[33,17914,17915,17917,17919,17921,17923,17925,17927],{"class":35,"line":4935},[33,17916,17846],{"class":167},[33,17918,242],{"class":163},[33,17920,17233],{"class":167},[33,17922,17236],{"class":238},[33,17924,242],{"class":163},[33,17926,855],{"class":50},[33,17928,221],{"class":167},[33,17930,17931,17933,17935],{"class":35,"line":4941},[33,17932,17863],{"class":167},[33,17934,242],{"class":163},[33,17936,17868],{"class":50},[33,17938,17939],{"class":35,"line":4950},[33,17940,92],{"emptyLinePlaceholder":91},[33,17942,17943],{"class":35,"line":4960},[33,17944,17945],{"class":167},"    wb.save(path)\n",[33,17947,17948,17950],{"class":35,"line":4965},[33,17949,1332],{"class":163},[33,17951,17952],{"class":167}," path\n",[33,17954,17955],{"class":35,"line":4971},[33,17956,92],{"emptyLinePlaceholder":91},[33,17958,17959,17961,17963,17965,17967],{"class":35,"line":4983},[33,17960,13459],{"class":167},[33,17962,242],{"class":163},[33,17964,9481],{"class":167},[33,17966,16249],{"class":54},[33,17968,221],{"class":167},[33,17970,17971,17973,17976,17978,17981,17983],{"class":35,"line":4988},[33,17972,6124],{"class":163},[33,17974,17975],{"class":167}," region, group 
",[33,17977,662],{"class":163},[33,17979,17980],{"class":167}," df.groupby(",[33,17982,16649],{"class":54},[33,17984,1737],{"class":167},[33,17986,17987,17990,17992,17995,17997],{"class":35,"line":4993},[33,17988,17989],{"class":167},"    out ",[33,17991,242],{"class":163},[33,17993,17994],{"class":167}," df_to_excel(group.copy(), region, ",[33,17996,4615],{"class":50},[33,17998,221],{"class":167},[33,18000,18001,18003,18005,18007,18010,18012,18015,18017,18020,18022,18025,18027,18030],{"class":35,"line":5003},[33,18002,7268],{"class":50},[33,18004,602],{"class":167},[33,18006,4059],{"class":163},[33,18008,18009],{"class":54},"\"Excel: ",[33,18011,1115],{"class":50},[33,18013,18014],{"class":167},"out",[33,18016,1121],{"class":50},[33,18018,18019],{"class":54},"  (",[33,18021,4065],{"class":50},[33,18023,18024],{"class":167},"(group)",[33,18026,1121],{"class":50},[33,18028,18029],{"class":54}," rows)\"",[33,18031,221],{"class":167},[424,18033,18035],{"id":18034},"step-2-dataframe-to-word-summary","Step 2 — DataFrame to Word Summary",[14,18037,18038,18042,18043,18048,18049,18052,18053,18055],{},[940,18039,18041],{"href":18040},"\u002Fword-document-templating-batch-processing\u002Fdynamic-mail-merge-with-python\u002F","python-docx"," writes structure programmatically; ",[940,18044,18047],{"href":18045,"rel":18046},"https:\u002F\u002Fdocxtpl.readthedocs.io",[1367],"docxtpl"," is better when you have an existing ",[30,18050,18051],{},".docx"," template with Jinja2 placeholders. 
The snippet below uses ",[30,18054,18041],{}," directly so there is no external template file dependency.",[23,18057,18059],{"className":126,"code":18058,"language":47,"meta":28,"style":28},"# pip install pandas python-docx\nimport pandas as pd\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Pt, RGBColor, Inches\nfrom docx.enum.text import WD_ALIGN_PARAGRAPH\n\nOUTPUT_DIR = Path(\"\u002Ftmp\u002Freports\")\nOUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n\ndef df_to_word(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:\n    safe_seg = segment.replace(\" \", \"_\").lower()\n    path = output_dir \u002F f\"pipeline-reports-{safe_seg}.docx\"\n\n    doc = Document()\n\n    # Title\n    title = doc.add_heading(f\"Pipeline Report — {segment}\", level=1)\n    title.runs[0].font.color.rgb = RGBColor(0x0F, 0x17, 0x2A)\n\n    # Executive summary paragraph\n    total_revenue = df[\"revenue\"].sum()\n    total_margin  = df[\"margin\"].sum()\n    margin_pct    = (total_margin \u002F total_revenue * 100) if total_revenue else 0.0\n    summary = (\n        f\"Region {segment} generated ${total_revenue:,.0f} in revenue across \"\n        f\"{len(df)} product lines with a net margin of {margin_pct:.1f}%.\"\n    )\n    p = doc.add_paragraph(summary)\n    p.runs[0].font.size = Pt(11)\n\n    # Aggregate by product\n    agg = (\n        df.groupby(\"product\")\n          .agg(units=(\"units\", \"sum\"), revenue=(\"revenue\", \"sum\"), margin=(\"margin\", \"sum\"))\n          .reset_index()\n          .sort_values(\"revenue\", ascending=False)\n    )\n\n    doc.add_heading(\"Product Breakdown\", level=2)\n    cols = list(agg.columns)\n    table = doc.add_table(rows=1, cols=len(cols))\n    table.style = \"Table Grid\"\n\n    # Header row\n    hdr = table.rows[0].cells\n    for i, col in enumerate(cols):\n        hdr[i].text = col.capitalize()\n        hdr[i].paragraphs[0].runs[0].font.bold = True\n\n    # Data rows\n    for _, row in 
agg.iterrows():\n        cells = table.add_row().cells\n        cells[0].text = str(row[\"product\"])\n        cells[1].text = f\"{int(row['units']):,}\"\n        cells[2].text = f\"${row['revenue']:,.0f}\"\n        cells[3].text = f\"${row['margin']:,.0f}\"\n\n    try:\n        doc.save(path)\n    except PermissionError:\n        raise RuntimeError(f\"File locked: {path}\")\n\n    return path\n\ndf = pd.read_csv(\"\u002Ftmp\u002Fpipeline_sample.csv\")\nfor region, group in df.groupby(\"region\"):\n    out = df_to_word(group.copy(), region, OUTPUT_DIR)\n    print(f\"Word: {out}  ({len(group)} rows)\")\n",[30,18060,18061,18066,18076,18086,18098,18110,18122,18126,18138,18160,18164,18177,18193,18216,18220,18230,18234,18239,18274,18311,18315,18320,18334,18347,18378,18387,18416,18444,18448,18458,18477,18481,18486,18495,18504,18556,18561,18579,18583,18587,18605,18617,18646,18656,18660,18665,18680,18694,18704,18723,18727,18732,18743,18753,18774,18806,18837,18866,18870,18876,18881,18889,18911,18915,18921,18925,18937,18951,18964],{"__ignoreMap":28},[33,18062,18063],{"class":35,"line":36},[33,18064,18065],{"class":39},"# pip install pandas python-docx\n",[33,18067,18068,18070,18072,18074],{"class":35,"line":43},[33,18069,164],{"class":163},[33,18071,492],{"class":167},[33,18073,495],{"class":163},[33,18075,498],{"class":167},[33,18077,18078,18080,18082,18084],{"class":35,"line":61},[33,18079,190],{"class":163},[33,18081,193],{"class":167},[33,18083,164],{"class":163},[33,18085,198],{"class":167},[33,18087,18088,18090,18093,18095],{"class":35,"line":73},[33,18089,190],{"class":163},[33,18091,18092],{"class":167}," docx ",[33,18094,164],{"class":163},[33,18096,18097],{"class":167}," Document\n",[33,18099,18100,18102,18105,18107],{"class":35,"line":88},[33,18101,190],{"class":163},[33,18103,18104],{"class":167}," docx.shared ",[33,18106,164],{"class":163},[33,18108,18109],{"class":167}," Pt, RGBColor, 
Inches\n",[33,18111,18112,18114,18117,18119],{"class":35,"line":95},[33,18113,190],{"class":163},[33,18115,18116],{"class":167}," docx.enum.text ",[33,18118,164],{"class":163},[33,18120,18121],{"class":50}," WD_ALIGN_PARAGRAPH\n",[33,18123,18124],{"class":35,"line":101},[33,18125,92],{"emptyLinePlaceholder":91},[33,18127,18128,18130,18132,18134,18136],{"class":35,"line":171},[33,18129,4615],{"class":50},[33,18131,212],{"class":163},[33,18133,215],{"class":167},[33,18135,17147],{"class":54},[33,18137,221],{"class":167},[33,18139,18140,18142,18144,18146,18148,18150,18152,18154,18156,18158],{"class":35,"line":179},[33,18141,4615],{"class":50},[33,18143,1078],{"class":167},[33,18145,869],{"class":238},[33,18147,242],{"class":163},[33,18149,855],{"class":50},[33,18151,365],{"class":167},[33,18153,878],{"class":238},[33,18155,242],{"class":163},[33,18157,855],{"class":50},[33,18159,221],{"class":167},[33,18161,18162],{"class":35,"line":187},[33,18163,92],{"emptyLinePlaceholder":91},[33,18165,18166,18168,18171,18173,18175],{"class":35,"line":201},[33,18167,562],{"class":163},[33,18169,18170],{"class":46}," 
df_to_word",[33,18172,17276],{"class":167},[33,18174,1053],{"class":50},[33,18176,17281],{"class":167},[33,18178,18179,18181,18183,18185,18187,18189,18191],{"class":35,"line":206},[33,18180,17286],{"class":167},[33,18182,242],{"class":163},[33,18184,17291],{"class":167},[33,18186,17294],{"class":54},[33,18188,365],{"class":167},[33,18190,7764],{"class":54},[33,18192,17301],{"class":167},[33,18194,18195,18197,18199,18201,18203,18205,18207,18209,18211,18213],{"class":35,"line":224},[33,18196,17306],{"class":167},[33,18198,242],{"class":163},[33,18200,6393],{"class":167},[33,18202,1351],{"class":163},[33,18204,1110],{"class":163},[33,18206,17317],{"class":54},[33,18208,1115],{"class":50},[33,18210,17322],{"class":167},[33,18212,1121],{"class":50},[33,18214,18215],{"class":54},".docx\"\n",[33,18217,18218],{"class":35,"line":229},[33,18219,92],{"emptyLinePlaceholder":91},[33,18221,18222,18225,18227],{"class":35,"line":235},[33,18223,18224],{"class":167},"    doc ",[33,18226,242],{"class":163},[33,18228,18229],{"class":167}," Document()\n",[33,18231,18232],{"class":35,"line":250},[33,18233,92],{"emptyLinePlaceholder":91},[33,18235,18236],{"class":35,"line":266},[33,18237,18238],{"class":39},"    # Title\n",[33,18240,18241,18244,18246,18249,18251,18254,18256,18259,18261,18263,18265,18268,18270,18272],{"class":35,"line":290},[33,18242,18243],{"class":167},"    title ",[33,18245,242],{"class":163},[33,18247,18248],{"class":167}," doc.add_heading(",[33,18250,4059],{"class":163},[33,18252,18253],{"class":54},"\"Pipeline Report — ",[33,18255,1115],{"class":50},[33,18257,18258],{"class":167},"segment",[33,18260,1121],{"class":50},[33,18262,274],{"class":54},[33,18264,365],{"class":167},[33,18266,18267],{"class":238},"level",[33,18269,242],{"class":163},[33,18271,734],{"class":50},[33,18273,221],{"class":167},[33,18275,18276,18279,18281,18284,18286,18289,18292,18295,18297,18299,18302,18304,18306,18309],{"class":35,"line":295},[33,18277,18278],{"class":167},"    
title.runs[",[33,18280,748],{"class":50},[33,18282,18283],{"class":167},"].font.color.rgb ",[33,18285,242],{"class":163},[33,18287,18288],{"class":167}," RGBColor(",[33,18290,18291],{"class":163},"0x",[33,18293,18294],{"class":50},"0F",[33,18296,365],{"class":167},[33,18298,18291],{"class":163},[33,18300,18301],{"class":50},"17",[33,18303,365],{"class":167},[33,18305,18291],{"class":163},[33,18307,18308],{"class":50},"2A",[33,18310,221],{"class":167},[33,18312,18313],{"class":35,"line":300},[33,18314,92],{"emptyLinePlaceholder":91},[33,18316,18317],{"class":35,"line":317},[33,18318,18319],{"class":39},"    # Executive summary paragraph\n",[33,18321,18322,18325,18327,18329,18331],{"class":35,"line":332},[33,18323,18324],{"class":167},"    total_revenue ",[33,18326,242],{"class":163},[33,18328,7935],{"class":167},[33,18330,16465],{"class":54},[33,18332,18333],{"class":167},"].sum()\n",[33,18335,18336,18339,18341,18343,18345],{"class":35,"line":347},[33,18337,18338],{"class":167},"    total_margin  ",[33,18340,242],{"class":163},[33,18342,7935],{"class":167},[33,18344,16456],{"class":54},[33,18346,18333],{"class":167},[33,18348,18349,18352,18354,18357,18359,18362,18364,18367,18369,18371,18373,18375],{"class":35,"line":374},[33,18350,18351],{"class":167},"    margin_pct    ",[33,18353,242],{"class":163},[33,18355,18356],{"class":167}," (total_margin ",[33,18358,1351],{"class":163},[33,18360,18361],{"class":167}," total_revenue ",[33,18363,1769],{"class":163},[33,18365,18366],{"class":50}," 100",[33,18368,1649],{"class":167},[33,18370,2491],{"class":163},[33,18372,18361],{"class":167},[33,18374,7489],{"class":163},[33,18376,18377],{"class":50}," 0.0\n",[33,18379,18380,18383,18385],{"class":35,"line":397},[33,18381,18382],{"class":167},"    summary 
",[33,18384,242],{"class":163},[33,18386,1415],{"class":167},[33,18388,18389,18391,18394,18396,18398,18400,18403,18405,18408,18411,18413],{"class":35,"line":653},[33,18390,9533],{"class":163},[33,18392,18393],{"class":54},"\"Region ",[33,18395,1115],{"class":50},[33,18397,18258],{"class":167},[33,18399,1121],{"class":50},[33,18401,18402],{"class":54}," generated $",[33,18404,1115],{"class":50},[33,18406,18407],{"class":167},"total_revenue",[33,18409,18410],{"class":163},":,.0f",[33,18412,1121],{"class":50},[33,18414,18415],{"class":54}," in revenue across \"\n",[33,18417,18418,18420,18422,18424,18426,18428,18431,18433,18436,18439,18441],{"class":35,"line":667},[33,18419,9533],{"class":163},[33,18421,274],{"class":54},[33,18423,4065],{"class":50},[33,18425,4068],{"class":167},[33,18427,1121],{"class":50},[33,18429,18430],{"class":54}," product lines with a net margin of ",[33,18432,1115],{"class":50},[33,18434,18435],{"class":167},"margin_pct",[33,18437,18438],{"class":163},":.1f",[33,18440,1121],{"class":50},[33,18442,18443],{"class":54},"%.\"\n",[33,18445,18446],{"class":35,"line":675},[33,18447,1202],{"class":167},[33,18449,18450,18453,18455],{"class":35,"line":689},[33,18451,18452],{"class":167},"    p ",[33,18454,242],{"class":163},[33,18456,18457],{"class":167}," doc.add_paragraph(summary)\n",[33,18459,18460,18463,18465,18468,18470,18473,18475],{"class":35,"line":703},[33,18461,18462],{"class":167},"    p.runs[",[33,18464,748],{"class":50},[33,18466,18467],{"class":167},"].font.size ",[33,18469,242],{"class":163},[33,18471,18472],{"class":167}," Pt(",[33,18474,17260],{"class":50},[33,18476,221],{"class":167},[33,18478,18479],{"class":35,"line":714},[33,18480,92],{"emptyLinePlaceholder":91},[33,18482,18483],{"class":35,"line":723},[33,18484,18485],{"class":39},"    # Aggregate by product\n",[33,18487,18488,18491,18493],{"class":35,"line":754},[33,18489,18490],{"class":167},"    agg 
",[33,18492,242],{"class":163},[33,18494,1415],{"class":167},[33,18496,18497,18500,18502],{"class":35,"line":771},[33,18498,18499],{"class":167},"        df.groupby(",[33,18501,16654],{"class":54},[33,18503,221],{"class":167},[33,18505,18506,18509,18512,18514,18516,18518,18520,18523,18526,18529,18531,18533,18535,18537,18539,18541,18544,18546,18548,18550,18552,18554],{"class":35,"line":777},[33,18507,18508],{"class":167},"          .agg(",[33,18510,18511],{"class":238},"units",[33,18513,242],{"class":163},[33,18515,602],{"class":167},[33,18517,16659],{"class":54},[33,18519,365],{"class":167},[33,18521,18522],{"class":54},"\"sum\"",[33,18524,18525],{"class":167},"), ",[33,18527,18528],{"class":238},"revenue",[33,18530,242],{"class":163},[33,18532,602],{"class":167},[33,18534,16465],{"class":54},[33,18536,365],{"class":167},[33,18538,18522],{"class":54},[33,18540,18525],{"class":167},[33,18542,18543],{"class":238},"margin",[33,18545,242],{"class":163},[33,18547,602],{"class":167},[33,18549,16456],{"class":54},[33,18551,365],{"class":167},[33,18553,18522],{"class":54},[33,18555,371],{"class":167},[33,18557,18558],{"class":35,"line":788},[33,18559,18560],{"class":167},"          .reset_index()\n",[33,18562,18563,18566,18568,18570,18573,18575,18577],{"class":35,"line":804},[33,18564,18565],{"class":167},"          .sort_values(",[33,18567,16465],{"class":54},[33,18569,365],{"class":167},[33,18571,18572],{"class":238},"ascending",[33,18574,242],{"class":163},[33,18576,902],{"class":50},[33,18578,221],{"class":167},[33,18580,18581],{"class":35,"line":809},[33,18582,1202],{"class":167},[33,18584,18585],{"class":35,"line":819},[33,18586,92],{"emptyLinePlaceholder":91},[33,18588,18589,18592,18595,18597,18599,18601,18603],{"class":35,"line":829},[33,18590,18591],{"class":167},"    doc.add_heading(",[33,18593,18594],{"class":54},"\"Product 
Breakdown\"",[33,18596,365],{"class":167},[33,18598,18267],{"class":238},[33,18600,242],{"class":163},[33,18602,1533],{"class":50},[33,18604,221],{"class":167},[33,18606,18607,18610,18612,18614],{"class":35,"line":834},[33,18608,18609],{"class":167},"    cols ",[33,18611,242],{"class":163},[33,18613,599],{"class":50},[33,18615,18616],{"class":167},"(agg.columns)\n",[33,18618,18619,18622,18624,18627,18630,18632,18634,18636,18639,18641,18643],{"class":35,"line":839},[33,18620,18621],{"class":167},"    table ",[33,18623,242],{"class":163},[33,18625,18626],{"class":167}," doc.add_table(",[33,18628,18629],{"class":238},"rows",[33,18631,242],{"class":163},[33,18633,734],{"class":50},[33,18635,365],{"class":167},[33,18637,18638],{"class":238},"cols",[33,18640,242],{"class":163},[33,18642,928],{"class":50},[33,18644,18645],{"class":167},"(cols))\n",[33,18647,18648,18651,18653],{"class":35,"line":860},[33,18649,18650],{"class":167},"    table.style ",[33,18652,242],{"class":163},[33,18654,18655],{"class":54}," \"Table Grid\"\n",[33,18657,18658],{"class":35,"line":887},[33,18659,92],{"emptyLinePlaceholder":91},[33,18661,18662],{"class":35,"line":907},[33,18663,18664],{"class":39},"    # Header row\n",[33,18666,18667,18670,18672,18675,18677],{"class":35,"line":1826},[33,18668,18669],{"class":167},"    hdr ",[33,18671,242],{"class":163},[33,18673,18674],{"class":167}," table.rows[",[33,18676,748],{"class":50},[33,18678,18679],{"class":167},"].cells\n",[33,18681,18682,18684,18687,18689,18691],{"class":35,"line":1844},[33,18683,656],{"class":163},[33,18685,18686],{"class":167}," i, col ",[33,18688,662],{"class":163},[33,18690,7403],{"class":50},[33,18692,18693],{"class":167},"(cols):\n",[33,18695,18696,18699,18701],{"class":35,"line":1858},[33,18697,18698],{"class":167},"        hdr[i].text ",[33,18700,242],{"class":163},[33,18702,18703],{"class":167}," 
col.capitalize()\n",[33,18705,18706,18709,18711,18714,18716,18719,18721],{"class":35,"line":1871},[33,18707,18708],{"class":167},"        hdr[i].paragraphs[",[33,18710,748],{"class":50},[33,18712,18713],{"class":167},"].runs[",[33,18715,748],{"class":50},[33,18717,18718],{"class":167},"].font.bold ",[33,18720,242],{"class":163},[33,18722,2887],{"class":50},[33,18724,18725],{"class":35,"line":1877},[33,18726,92],{"emptyLinePlaceholder":91},[33,18728,18729],{"class":35,"line":1883},[33,18730,18731],{"class":39},"    # Data rows\n",[33,18733,18734,18736,18738,18740],{"class":35,"line":1915},[33,18735,656],{"class":163},[33,18737,8560],{"class":167},[33,18739,662],{"class":163},[33,18741,18742],{"class":167}," agg.iterrows():\n",[33,18744,18745,18748,18750],{"class":35,"line":1926},[33,18746,18747],{"class":167},"        cells ",[33,18749,242],{"class":163},[33,18751,18752],{"class":167}," table.add_row().cells\n",[33,18754,18755,18758,18760,18763,18765,18767,18770,18772],{"class":35,"line":1932},[33,18756,18757],{"class":167},"        cells[",[33,18759,748],{"class":50},[33,18761,18762],{"class":167},"].text 
",[33,18764,242],{"class":163},[33,18766,7887],{"class":50},[33,18768,18769],{"class":167},"(row[",[33,18771,16654],{"class":54},[33,18773,751],{"class":167},[33,18775,18776,18778,18780,18782,18784,18786,18788,18791,18793,18796,18799,18802,18804],{"class":35,"line":1938},[33,18777,18757],{"class":167},[33,18779,734],{"class":50},[33,18781,18762],{"class":167},[33,18783,242],{"class":163},[33,18785,1110],{"class":163},[33,18787,274],{"class":54},[33,18789,18790],{"class":50},"{int",[33,18792,18769],{"class":167},[33,18794,18795],{"class":54},"'units'",[33,18797,18798],{"class":167},"])",[33,18800,18801],{"class":163},":,",[33,18803,1121],{"class":50},[33,18805,7504],{"class":54},[33,18807,18808,18810,18812,18814,18816,18818,18821,18823,18826,18829,18831,18833,18835],{"class":35,"line":1950},[33,18809,18757],{"class":167},[33,18811,1533],{"class":50},[33,18813,18762],{"class":167},[33,18815,242],{"class":163},[33,18817,1110],{"class":163},[33,18819,18820],{"class":54},"\"$",[33,18822,1115],{"class":50},[33,18824,18825],{"class":167},"row[",[33,18827,18828],{"class":54},"'revenue'",[33,18830,9546],{"class":167},[33,18832,18410],{"class":163},[33,18834,1121],{"class":50},[33,18836,7504],{"class":54},[33,18838,18839,18841,18843,18845,18847,18849,18851,18853,18855,18858,18860,18862,18864],{"class":35,"line":1958},[33,18840,18757],{"class":167},[33,18842,10258],{"class":50},[33,18844,18762],{"class":167},[33,18846,242],{"class":163},[33,18848,1110],{"class":163},[33,18850,18820],{"class":54},[33,18852,1115],{"class":50},[33,18854,18825],{"class":167},[33,18856,18857],{"class":54},"'margin'",[33,18859,9546],{"class":167},[33,18861,18410],{"class":163},[33,18863,1121],{"class":50},[33,18865,7504],{"class":54},[33,18867,18868],{"class":35,"line":4904},[33,18869,92],{"emptyLinePlaceholder":91},[33,18871,18872,18874],{"class":35,"line":4909},[33,18873,2424],{"class":163},[33,18875,574],{"class":167},[33,18877,18878],{"class":35,"line":4915},[33,18879,18880],{"class":167},"     
   doc.save(path)\n",[33,18882,18883,18885,18887],{"class":35,"line":4925},[33,18884,2449],{"class":163},[33,18886,17393],{"class":50},[33,18888,574],{"class":167},[33,18890,18891,18893,18895,18897,18899,18901,18903,18905,18907,18909],{"class":35,"line":4935},[33,18892,4051],{"class":163},[33,18894,7590],{"class":50},[33,18896,602],{"class":167},[33,18898,4059],{"class":163},[33,18900,17408],{"class":54},[33,18902,1115],{"class":50},[33,18904,2580],{"class":167},[33,18906,1121],{"class":50},[33,18908,274],{"class":54},[33,18910,221],{"class":167},[33,18912,18913],{"class":35,"line":4941},[33,18914,92],{"emptyLinePlaceholder":91},[33,18916,18917,18919],{"class":35,"line":4950},[33,18918,1332],{"class":163},[33,18920,17952],{"class":167},[33,18922,18923],{"class":35,"line":4960},[33,18924,92],{"emptyLinePlaceholder":91},[33,18926,18927,18929,18931,18933,18935],{"class":35,"line":4965},[33,18928,13459],{"class":167},[33,18930,242],{"class":163},[33,18932,9481],{"class":167},[33,18934,16249],{"class":54},[33,18936,221],{"class":167},[33,18938,18939,18941,18943,18945,18947,18949],{"class":35,"line":4971},[33,18940,6124],{"class":163},[33,18942,17975],{"class":167},[33,18944,662],{"class":163},[33,18946,17980],{"class":167},[33,18948,16649],{"class":54},[33,18950,1737],{"class":167},[33,18952,18953,18955,18957,18960,18962],{"class":35,"line":4983},[33,18954,17989],{"class":167},[33,18956,242],{"class":163},[33,18958,18959],{"class":167}," df_to_word(group.copy(), region, ",[33,18961,4615],{"class":50},[33,18963,221],{"class":167},[33,18965,18966,18968,18970,18972,18975,18977,18979,18981,18983,18985,18987,18989,18991],{"class":35,"line":4988},[33,18967,7268],{"class":50},[33,18969,602],{"class":167},[33,18971,4059],{"class":163},[33,18973,18974],{"class":54},"\"Word: 
",[33,18976,1115],{"class":50},[33,18978,18014],{"class":167},[33,18980,1121],{"class":50},[33,18982,18019],{"class":54},[33,18984,4065],{"class":50},[33,18986,18024],{"class":167},[33,18988,1121],{"class":50},[33,18990,18029],{"class":54},[33,18992,221],{"class":167},[424,18994,18996],{"id":18995},"step-3-dataframe-to-pdf-report","Step 3 — DataFrame to PDF Report",[14,18998,18999,19003,19004,19007],{},[940,19000,19002],{"href":19001},"\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002F","ReportLab"," gives fine-grained layout control. WeasyPrint is simpler when you already have an HTML template. The snippet below uses ReportLab's ",[30,19005,19006],{},"Platypus"," story API, which handles page breaks automatically across long segment tables.",[23,19009,19011],{"className":126,"code":19010,"language":47,"meta":28,"style":28},"# pip install pandas reportlab\nimport pandas as pd\nfrom pathlib import Path\nfrom reportlab.lib.pagesizes import A4\nfrom reportlab.lib import colors\nfrom reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle\nfrom reportlab.lib.units import cm\nfrom reportlab.platypus import (\n    SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle\n)\n\nOUTPUT_DIR = Path(\"\u002Ftmp\u002Freports\")\nOUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n\nACCENT = colors.HexColor(\"#2563EB\")\nACCENT_SOFT = colors.HexColor(\"#DBEAFE\")\nBORDER = colors.HexColor(\"#E2E8F0\")\n\ndef df_to_pdf(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:\n    safe_seg = segment.replace(\" \", \"_\").lower()\n    path = output_dir \u002F f\"pipeline-reports-{safe_seg}.pdf\"\n\n    styles = getSampleStyleSheet()\n    title_style = ParagraphStyle(\n        \"ReportTitle\", parent=styles[\"Heading1\"],\n        fontSize=18, textColor=ACCENT, spaceAfter=6\n    )\n    body_style = ParagraphStyle(\n        \"ReportBody\", parent=styles[\"Normal\"],\n        fontSize=10, leading=14, spaceAfter=8\n    )\n\n    total_revenue = 
df[\"revenue\"].sum()\n    total_margin  = df[\"margin\"].sum()\n    margin_pct    = (total_margin \u002F total_revenue * 100) if total_revenue else 0.0\n\n    story = [\n        Paragraph(f\"Pipeline Report — {segment}\", title_style),\n        Paragraph(\n            f\"Total revenue: \u003Cb>${total_revenue:,.0f}\u003C\u002Fb> | \"\n            f\"Net margin: \u003Cb>{margin_pct:.1f}%\u003C\u002Fb> | \"\n            f\"Rows: \u003Cb>{len(df)}\u003C\u002Fb>\",\n            body_style,\n        ),\n        Spacer(1, 0.4 * cm),\n    ]\n\n    # Detail table\n    headers = [\"Product\", \"Units\", \"Revenue\", \"Cost\", \"Margin\"]\n    table_data = [headers] + [\n        [\n            row[\"product\"],\n            f\"{int(row['units']):,}\",\n            f\"${row['revenue']:,.0f}\",\n            f\"${row['cost']:,.0f}\",\n            f\"${row['margin']:,.0f}\",\n        ]\n        for _, row in df.iterrows()\n    ]\n\n    col_widths = [5 * cm, 2.5 * cm, 3 * cm, 3 * cm, 3 * cm]\n    tbl = Table(table_data, colWidths=col_widths, repeatRows=1)\n    tbl.setStyle(TableStyle([\n        (\"BACKGROUND\",  (0, 0), (-1, 0), ACCENT),\n        (\"TEXTCOLOR\",   (0, 0), (-1, 0), colors.white),\n        (\"FONTNAME\",    (0, 0), (-1, 0), \"Helvetica-Bold\"),\n        (\"FONTSIZE\",    (0, 0), (-1, -1), 9),\n        (\"ROWBACKGROUNDS\", (0, 1), (-1, -1), [colors.white, ACCENT_SOFT]),\n        (\"GRID\",        (0, 0), (-1, -1), 0.5, BORDER),\n        (\"ALIGN\",       (1, 0), (-1, -1), \"RIGHT\"),\n    ]))\n    story.append(tbl)\n\n    try:\n        doc = SimpleDocTemplate(\n            str(path), pagesize=A4,\n            leftMargin=2 * cm, rightMargin=2 * cm,\n            topMargin=2 * cm, bottomMargin=2 * cm,\n        )\n        doc.build(story)\n    except PermissionError:\n        raise RuntimeError(f\"File locked: {path}\")\n\n    return path\n\ndf = pd.read_csv(\"\u002Ftmp\u002Fpipeline_sample.csv\")\nfor region, group in df.groupby(\"region\"):\n    out = 
df_to_pdf(group.copy(), region, OUTPUT_DIR)\n    print(f\"PDF: {out}  ({len(group)} rows)\")\n",[30,19012,19013,19018,19028,19038,19050,19062,19074,19086,19097,19102,19106,19110,19122,19144,19148,19163,19177,19191,19195,19208,19224,19247,19251,19261,19271,19291,19320,19324,19333,19351,19378,19382,19386,19398,19410,19436,19440,19449,19469,19474,19492,19510,19528,19533,19538,19555,19560,19564,19569,19601,19615,19620,19629,19651,19673,19696,19718,19723,19734,19738,19742,19784,19810,19815,19849,19878,19911,19944,19979,20018,20053,20058,20063,20067,20073,20083,20097,20122,20146,20150,20155,20163,20185,20189,20195,20199,20211,20225,20238],{"__ignoreMap":28},[33,19014,19015],{"class":35,"line":36},[33,19016,19017],{"class":39},"# pip install pandas reportlab\n",[33,19019,19020,19022,19024,19026],{"class":35,"line":43},[33,19021,164],{"class":163},[33,19023,492],{"class":167},[33,19025,495],{"class":163},[33,19027,498],{"class":167},[33,19029,19030,19032,19034,19036],{"class":35,"line":61},[33,19031,190],{"class":163},[33,19033,193],{"class":167},[33,19035,164],{"class":163},[33,19037,198],{"class":167},[33,19039,19040,19042,19045,19047],{"class":35,"line":73},[33,19041,190],{"class":163},[33,19043,19044],{"class":167}," reportlab.lib.pagesizes ",[33,19046,164],{"class":163},[33,19048,19049],{"class":167}," A4\n",[33,19051,19052,19054,19057,19059],{"class":35,"line":88},[33,19053,190],{"class":163},[33,19055,19056],{"class":167}," reportlab.lib ",[33,19058,164],{"class":163},[33,19060,19061],{"class":167}," colors\n",[33,19063,19064,19066,19069,19071],{"class":35,"line":95},[33,19065,190],{"class":163},[33,19067,19068],{"class":167}," reportlab.lib.styles ",[33,19070,164],{"class":163},[33,19072,19073],{"class":167}," getSampleStyleSheet, ParagraphStyle\n",[33,19075,19076,19078,19081,19083],{"class":35,"line":101},[33,19077,190],{"class":163},[33,19079,19080],{"class":167}," reportlab.lib.units ",[33,19082,164],{"class":163},[33,19084,19085],{"class":167}," 
cm\n",[33,19087,19088,19090,19093,19095],{"class":35,"line":171},[33,19089,190],{"class":163},[33,19091,19092],{"class":167}," reportlab.platypus ",[33,19094,164],{"class":163},[33,19096,1415],{"class":167},[33,19098,19099],{"class":35,"line":179},[33,19100,19101],{"class":167},"    SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle\n",[33,19103,19104],{"class":35,"line":187},[33,19105,221],{"class":167},[33,19107,19108],{"class":35,"line":201},[33,19109,92],{"emptyLinePlaceholder":91},[33,19111,19112,19114,19116,19118,19120],{"class":35,"line":206},[33,19113,4615],{"class":50},[33,19115,212],{"class":163},[33,19117,215],{"class":167},[33,19119,17147],{"class":54},[33,19121,221],{"class":167},[33,19123,19124,19126,19128,19130,19132,19134,19136,19138,19140,19142],{"class":35,"line":224},[33,19125,4615],{"class":50},[33,19127,1078],{"class":167},[33,19129,869],{"class":238},[33,19131,242],{"class":163},[33,19133,855],{"class":50},[33,19135,365],{"class":167},[33,19137,878],{"class":238},[33,19139,242],{"class":163},[33,19141,855],{"class":50},[33,19143,221],{"class":167},[33,19145,19146],{"class":35,"line":229},[33,19147,92],{"emptyLinePlaceholder":91},[33,19149,19150,19153,19155,19158,19161],{"class":35,"line":235},[33,19151,19152],{"class":50},"ACCENT",[33,19154,212],{"class":163},[33,19156,19157],{"class":167}," 
colors.HexColor(",[33,19159,19160],{"class":54},"\"#2563EB\"",[33,19162,221],{"class":167},[33,19164,19165,19168,19170,19172,19175],{"class":35,"line":250},[33,19166,19167],{"class":50},"ACCENT_SOFT",[33,19169,212],{"class":163},[33,19171,19157],{"class":167},[33,19173,19174],{"class":54},"\"#DBEAFE\"",[33,19176,221],{"class":167},[33,19178,19179,19182,19184,19186,19189],{"class":35,"line":266},[33,19180,19181],{"class":50},"BORDER",[33,19183,212],{"class":163},[33,19185,19157],{"class":167},[33,19187,19188],{"class":54},"\"#E2E8F0\"",[33,19190,221],{"class":167},[33,19192,19193],{"class":35,"line":290},[33,19194,92],{"emptyLinePlaceholder":91},[33,19196,19197,19199,19202,19204,19206],{"class":35,"line":295},[33,19198,562],{"class":163},[33,19200,19201],{"class":46}," df_to_pdf",[33,19203,17276],{"class":167},[33,19205,1053],{"class":50},[33,19207,17281],{"class":167},[33,19209,19210,19212,19214,19216,19218,19220,19222],{"class":35,"line":300},[33,19211,17286],{"class":167},[33,19213,242],{"class":163},[33,19215,17291],{"class":167},[33,19217,17294],{"class":54},[33,19219,365],{"class":167},[33,19221,7764],{"class":54},[33,19223,17301],{"class":167},[33,19225,19226,19228,19230,19232,19234,19236,19238,19240,19242,19244],{"class":35,"line":317},[33,19227,17306],{"class":167},[33,19229,242],{"class":163},[33,19231,6393],{"class":167},[33,19233,1351],{"class":163},[33,19235,1110],{"class":163},[33,19237,17317],{"class":54},[33,19239,1115],{"class":50},[33,19241,17322],{"class":167},[33,19243,1121],{"class":50},[33,19245,19246],{"class":54},".pdf\"\n",[33,19248,19249],{"class":35,"line":332},[33,19250,92],{"emptyLinePlaceholder":91},[33,19252,19253,19256,19258],{"class":35,"line":347},[33,19254,19255],{"class":167},"    styles ",[33,19257,242],{"class":163},[33,19259,19260],{"class":167}," getSampleStyleSheet()\n",[33,19262,19263,19266,19268],{"class":35,"line":374},[33,19264,19265],{"class":167},"    title_style 
",[33,19267,242],{"class":163},[33,19269,19270],{"class":167}," ParagraphStyle(\n",[33,19272,19273,19276,19278,19281,19283,19286,19289],{"class":35,"line":397},[33,19274,19275],{"class":54},"        \"ReportTitle\"",[33,19277,365],{"class":167},[33,19279,19280],{"class":238},"parent",[33,19282,242],{"class":163},[33,19284,19285],{"class":167},"styles[",[33,19287,19288],{"class":54},"\"Heading1\"",[33,19290,8935],{"class":167},[33,19292,19293,19296,19298,19301,19303,19306,19308,19310,19312,19315,19317],{"class":35,"line":653},[33,19294,19295],{"class":238},"        fontSize",[33,19297,242],{"class":163},[33,19299,19300],{"class":50},"18",[33,19302,365],{"class":167},[33,19304,19305],{"class":238},"textColor",[33,19307,242],{"class":163},[33,19309,19152],{"class":50},[33,19311,365],{"class":167},[33,19313,19314],{"class":238},"spaceAfter",[33,19316,242],{"class":163},[33,19318,19319],{"class":50},"6\n",[33,19321,19322],{"class":35,"line":667},[33,19323,1202],{"class":167},[33,19325,19326,19329,19331],{"class":35,"line":675},[33,19327,19328],{"class":167},"    body_style ",[33,19330,242],{"class":163},[33,19332,19270],{"class":167},[33,19334,19335,19338,19340,19342,19344,19346,19349],{"class":35,"line":689},[33,19336,19337],{"class":54},"        
\"ReportBody\"",[33,19339,365],{"class":167},[33,19341,19280],{"class":238},[33,19343,242],{"class":163},[33,19345,19285],{"class":167},[33,19347,19348],{"class":54},"\"Normal\"",[33,19350,8935],{"class":167},[33,19352,19353,19355,19357,19359,19361,19364,19366,19369,19371,19373,19375],{"class":35,"line":703},[33,19354,19295],{"class":238},[33,19356,242],{"class":163},[33,19358,3545],{"class":50},[33,19360,365],{"class":167},[33,19362,19363],{"class":238},"leading",[33,19365,242],{"class":163},[33,19367,19368],{"class":50},"14",[33,19370,365],{"class":167},[33,19372,19314],{"class":238},[33,19374,242],{"class":163},[33,19376,19377],{"class":50},"8\n",[33,19379,19380],{"class":35,"line":714},[33,19381,1202],{"class":167},[33,19383,19384],{"class":35,"line":723},[33,19385,92],{"emptyLinePlaceholder":91},[33,19387,19388,19390,19392,19394,19396],{"class":35,"line":754},[33,19389,18324],{"class":167},[33,19391,242],{"class":163},[33,19393,7935],{"class":167},[33,19395,16465],{"class":54},[33,19397,18333],{"class":167},[33,19399,19400,19402,19404,19406,19408],{"class":35,"line":771},[33,19401,18338],{"class":167},[33,19403,242],{"class":163},[33,19405,7935],{"class":167},[33,19407,16456],{"class":54},[33,19409,18333],{"class":167},[33,19411,19412,19414,19416,19418,19420,19422,19424,19426,19428,19430,19432,19434],{"class":35,"line":777},[33,19413,18351],{"class":167},[33,19415,242],{"class":163},[33,19417,18356],{"class":167},[33,19419,1351],{"class":163},[33,19421,18361],{"class":167},[33,19423,1769],{"class":163},[33,19425,18366],{"class":50},[33,19427,1649],{"class":167},[33,19429,2491],{"class":163},[33,19431,18361],{"class":167},[33,19433,7489],{"class":163},[33,19435,18377],{"class":50},[33,19437,19438],{"class":35,"line":788},[33,19439,92],{"emptyLinePlaceholder":91},[33,19441,19442,19445,19447],{"class":35,"line":804},[33,19443,19444],{"class":167},"    story 
",[33,19446,242],{"class":163},[33,19448,7473],{"class":167},[33,19450,19451,19454,19456,19458,19460,19462,19464,19466],{"class":35,"line":809},[33,19452,19453],{"class":167},"        Paragraph(",[33,19455,4059],{"class":163},[33,19457,18253],{"class":54},[33,19459,1115],{"class":50},[33,19461,18258],{"class":167},[33,19463,1121],{"class":50},[33,19465,274],{"class":54},[33,19467,19468],{"class":167},", title_style),\n",[33,19470,19471],{"class":35,"line":819},[33,19472,19473],{"class":167},"        Paragraph(\n",[33,19475,19476,19478,19481,19483,19485,19487,19489],{"class":35,"line":829},[33,19477,12744],{"class":163},[33,19479,19480],{"class":54},"\"Total revenue: \u003Cb>$",[33,19482,1115],{"class":50},[33,19484,18407],{"class":167},[33,19486,18410],{"class":163},[33,19488,1121],{"class":50},[33,19490,19491],{"class":54},"\u003C\u002Fb> | \"\n",[33,19493,19494,19496,19499,19501,19503,19505,19507],{"class":35,"line":834},[33,19495,12744],{"class":163},[33,19497,19498],{"class":54},"\"Net margin: \u003Cb>",[33,19500,1115],{"class":50},[33,19502,18435],{"class":167},[33,19504,18438],{"class":163},[33,19506,1121],{"class":50},[33,19508,19509],{"class":54},"%\u003C\u002Fb> | \"\n",[33,19511,19512,19514,19517,19519,19521,19523,19526],{"class":35,"line":839},[33,19513,12744],{"class":163},[33,19515,19516],{"class":54},"\"Rows: \u003Cb>",[33,19518,4065],{"class":50},[33,19520,4068],{"class":167},[33,19522,1121],{"class":50},[33,19524,19525],{"class":54},"\u003C\u002Fb>\"",[33,19527,247],{"class":167},[33,19529,19530],{"class":35,"line":860},[33,19531,19532],{"class":167},"            body_style,\n",[33,19534,19535],{"class":35,"line":887},[33,19536,19537],{"class":167},"        ),\n",[33,19539,19540,19543,19545,19547,19550,19552],{"class":35,"line":907},[33,19541,19542],{"class":167},"        Spacer(",[33,19544,734],{"class":50},[33,19546,365],{"class":167},[33,19548,19549],{"class":50},"0.4",[33,19551,1156],{"class":163},[33,19553,19554],{"class":167}," 
cm),\n",[33,19556,19557],{"class":35,"line":1826},[33,19558,19559],{"class":167},"    ]\n",[33,19561,19562],{"class":35,"line":1844},[33,19563,92],{"emptyLinePlaceholder":91},[33,19565,19566],{"class":35,"line":1858},[33,19567,19568],{"class":39},"    # Detail table\n",[33,19570,19571,19574,19576,19578,19581,19583,19585,19587,19589,19591,19594,19596,19599],{"class":35,"line":1871},[33,19572,19573],{"class":167},"    headers ",[33,19575,242],{"class":163},[33,19577,9178],{"class":167},[33,19579,19580],{"class":54},"\"Product\"",[33,19582,365],{"class":167},[33,19584,12930],{"class":54},[33,19586,365],{"class":167},[33,19588,12925],{"class":54},[33,19590,365],{"class":167},[33,19592,19593],{"class":54},"\"Cost\"",[33,19595,365],{"class":167},[33,19597,19598],{"class":54},"\"Margin\"",[33,19600,9202],{"class":167},[33,19602,19603,19606,19608,19611,19613],{"class":35,"line":1877},[33,19604,19605],{"class":167},"    table_data ",[33,19607,242],{"class":163},[33,19609,19610],{"class":167}," [headers] ",[33,19612,1811],{"class":163},[33,19614,7473],{"class":167},[33,19616,19617],{"class":35,"line":1883},[33,19618,19619],{"class":167},"        [\n",[33,19621,19622,19625,19627],{"class":35,"line":1915},[33,19623,19624],{"class":167},"            
row[",[33,19626,16654],{"class":54},[33,19628,8935],{"class":167},[33,19630,19631,19633,19635,19637,19639,19641,19643,19645,19647,19649],{"class":35,"line":1926},[33,19632,12744],{"class":163},[33,19634,274],{"class":54},[33,19636,18790],{"class":50},[33,19638,18769],{"class":167},[33,19640,18795],{"class":54},[33,19642,18798],{"class":167},[33,19644,18801],{"class":163},[33,19646,1121],{"class":50},[33,19648,274],{"class":54},[33,19650,247],{"class":167},[33,19652,19653,19655,19657,19659,19661,19663,19665,19667,19669,19671],{"class":35,"line":1932},[33,19654,12744],{"class":163},[33,19656,18820],{"class":54},[33,19658,1115],{"class":50},[33,19660,18825],{"class":167},[33,19662,18828],{"class":54},[33,19664,9546],{"class":167},[33,19666,18410],{"class":163},[33,19668,1121],{"class":50},[33,19670,274],{"class":54},[33,19672,247],{"class":167},[33,19674,19675,19677,19679,19681,19683,19686,19688,19690,19692,19694],{"class":35,"line":1938},[33,19676,12744],{"class":163},[33,19678,18820],{"class":54},[33,19680,1115],{"class":50},[33,19682,18825],{"class":167},[33,19684,19685],{"class":54},"'cost'",[33,19687,9546],{"class":167},[33,19689,18410],{"class":163},[33,19691,1121],{"class":50},[33,19693,274],{"class":54},[33,19695,247],{"class":167},[33,19697,19698,19700,19702,19704,19706,19708,19710,19712,19714,19716],{"class":35,"line":1950},[33,19699,12744],{"class":163},[33,19701,18820],{"class":54},[33,19703,1115],{"class":50},[33,19705,18825],{"class":167},[33,19707,18857],{"class":54},[33,19709,9546],{"class":167},[33,19711,18410],{"class":163},[33,19713,1121],{"class":50},[33,19715,274],{"class":54},[33,19717,247],{"class":167},[33,19719,19720],{"class":35,"line":1958},[33,19721,19722],{"class":167},"        ]\n",[33,19724,19725,19727,19729,19731],{"class":35,"line":4904},[33,19726,5973],{"class":163},[33,19728,8560],{"class":167},[33,19730,662],{"class":163},[33,19732,19733],{"class":167}," 
df.iterrows()\n",[33,19735,19736],{"class":35,"line":4909},[33,19737,19559],{"class":167},[33,19739,19740],{"class":35,"line":4915},[33,19741,92],{"emptyLinePlaceholder":91},[33,19743,19744,19747,19749,19751,19753,19755,19758,19761,19763,19765,19767,19769,19771,19773,19775,19777,19779,19781],{"class":35,"line":4925},[33,19745,19746],{"class":167},"    col_widths ",[33,19748,242],{"class":163},[33,19750,9178],{"class":167},[33,19752,1153],{"class":50},[33,19754,1156],{"class":163},[33,19756,19757],{"class":167}," cm, ",[33,19759,19760],{"class":50},"2.5",[33,19762,1156],{"class":163},[33,19764,19757],{"class":167},[33,19766,10258],{"class":50},[33,19768,1156],{"class":163},[33,19770,19757],{"class":167},[33,19772,10258],{"class":50},[33,19774,1156],{"class":163},[33,19776,19757],{"class":167},[33,19778,10258],{"class":50},[33,19780,1156],{"class":163},[33,19782,19783],{"class":167}," cm]\n",[33,19785,19786,19788,19790,19793,19796,19798,19801,19804,19806,19808],{"class":35,"line":4935},[33,19787,14864],{"class":167},[33,19789,242],{"class":163},[33,19791,19792],{"class":167}," Table(table_data, ",[33,19794,19795],{"class":238},"colWidths",[33,19797,242],{"class":163},[33,19799,19800],{"class":167},"col_widths, ",[33,19802,19803],{"class":238},"repeatRows",[33,19805,242],{"class":163},[33,19807,734],{"class":50},[33,19809,221],{"class":167},[33,19811,19812],{"class":35,"line":4941},[33,19813,19814],{"class":167},"    tbl.setStyle(TableStyle([\n",[33,19816,19817,19820,19823,19826,19828,19830,19832,19835,19837,19839,19841,19843,19845,19847],{"class":35,"line":4950},[33,19818,19819],{"class":167},"        (",[33,19821,19822],{"class":54},"\"BACKGROUND\"",[33,19824,19825],{"class":167},",  (",[33,19827,748],{"class":50},[33,19829,365],{"class":167},[33,19831,748],{"class":50},[33,19833,19834],{"class":167},"), 
(",[33,19836,4126],{"class":163},[33,19838,734],{"class":50},[33,19840,365],{"class":167},[33,19842,748],{"class":50},[33,19844,18525],{"class":167},[33,19846,19152],{"class":50},[33,19848,1506],{"class":167},[33,19850,19851,19853,19856,19859,19861,19863,19865,19867,19869,19871,19873,19875],{"class":35,"line":4960},[33,19852,19819],{"class":167},[33,19854,19855],{"class":54},"\"TEXTCOLOR\"",[33,19857,19858],{"class":167},",   (",[33,19860,748],{"class":50},[33,19862,365],{"class":167},[33,19864,748],{"class":50},[33,19866,19834],{"class":167},[33,19868,4126],{"class":163},[33,19870,734],{"class":50},[33,19872,365],{"class":167},[33,19874,748],{"class":50},[33,19876,19877],{"class":167},"), colors.white),\n",[33,19879,19880,19882,19885,19888,19890,19892,19894,19896,19898,19900,19902,19904,19906,19909],{"class":35,"line":4965},[33,19881,19819],{"class":167},[33,19883,19884],{"class":54},"\"FONTNAME\"",[33,19886,19887],{"class":167},",    (",[33,19889,748],{"class":50},[33,19891,365],{"class":167},[33,19893,748],{"class":50},[33,19895,19834],{"class":167},[33,19897,4126],{"class":163},[33,19899,734],{"class":50},[33,19901,365],{"class":167},[33,19903,748],{"class":50},[33,19905,18525],{"class":167},[33,19907,19908],{"class":54},"\"Helvetica-Bold\"",[33,19910,1506],{"class":167},[33,19912,19913,19915,19918,19920,19922,19924,19926,19928,19930,19932,19934,19936,19938,19940,19942],{"class":35,"line":4971},[33,19914,19819],{"class":167},[33,19916,19917],{"class":54},"\"FONTSIZE\"",[33,19919,19887],{"class":167},[33,19921,748],{"class":50},[33,19923,365],{"class":167},[33,19925,748],{"class":50},[33,19927,19834],{"class":167},[33,19929,4126],{"class":163},[33,19931,734],{"class":50},[33,19933,365],{"class":167},[33,19935,4126],{"class":163},[33,19937,734],{"class":50},[33,19939,18525],{"class":167},[33,19941,2577],{"class":50},[33,19943,1506],{"class":167},[33,19945,19946,19948,19951,19954,19956,19958,19960,19962,19964,19966,19968,19970,19972,19975,19977],{"class":35,"line":
4983},[33,19947,19819],{"class":167},[33,19949,19950],{"class":54},"\"ROWBACKGROUNDS\"",[33,19952,19953],{"class":167},", (",[33,19955,748],{"class":50},[33,19957,365],{"class":167},[33,19959,734],{"class":50},[33,19961,19834],{"class":167},[33,19963,4126],{"class":163},[33,19965,734],{"class":50},[33,19967,365],{"class":167},[33,19969,4126],{"class":163},[33,19971,734],{"class":50},[33,19973,19974],{"class":167},"), [colors.white, ",[33,19976,19167],{"class":50},[33,19978,12871],{"class":167},[33,19980,19981,19983,19986,19989,19991,19993,19995,19997,19999,20001,20003,20005,20007,20009,20012,20014,20016],{"class":35,"line":4988},[33,19982,19819],{"class":167},[33,19984,19985],{"class":54},"\"GRID\"",[33,19987,19988],{"class":167},",        (",[33,19990,748],{"class":50},[33,19992,365],{"class":167},[33,19994,748],{"class":50},[33,19996,19834],{"class":167},[33,19998,4126],{"class":163},[33,20000,734],{"class":50},[33,20002,365],{"class":167},[33,20004,4126],{"class":163},[33,20006,734],{"class":50},[33,20008,18525],{"class":167},[33,20010,20011],{"class":50},"0.5",[33,20013,365],{"class":167},[33,20015,19181],{"class":50},[33,20017,1506],{"class":167},[33,20019,20020,20022,20025,20028,20030,20032,20034,20036,20038,20040,20042,20044,20046,20048,20051],{"class":35,"line":4993},[33,20021,19819],{"class":167},[33,20023,20024],{"class":54},"\"ALIGN\"",[33,20026,20027],{"class":167},",       (",[33,20029,734],{"class":50},[33,20031,365],{"class":167},[33,20033,748],{"class":50},[33,20035,19834],{"class":167},[33,20037,4126],{"class":163},[33,20039,734],{"class":50},[33,20041,365],{"class":167},[33,20043,4126],{"class":163},[33,20045,734],{"class":50},[33,20047,18525],{"class":167},[33,20049,20050],{"class":54},"\"RIGHT\"",[33,20052,1506],{"class":167},[33,20054,20055],{"class":35,"line":5003},[33,20056,20057],{"class":167},"    ]))\n",[33,20059,20060],{"class":35,"line":5008},[33,20061,20062],{"class":167},"    
story.append(tbl)\n",[33,20064,20065],{"class":35,"line":5014},[33,20066,92],{"emptyLinePlaceholder":91},[33,20068,20069,20071],{"class":35,"line":5019},[33,20070,2424],{"class":163},[33,20072,574],{"class":167},[33,20074,20075,20078,20080],{"class":35,"line":5032},[33,20076,20077],{"class":167},"        doc ",[33,20079,242],{"class":163},[33,20081,20082],{"class":167}," SimpleDocTemplate(\n",[33,20084,20085,20087,20089,20092,20094],{"class":35,"line":5039},[33,20086,10673],{"class":50},[33,20088,13643],{"class":167},[33,20090,20091],{"class":238},"pagesize",[33,20093,242],{"class":163},[33,20095,20096],{"class":167},"A4,\n",[33,20098,20099,20102,20104,20106,20108,20110,20113,20115,20117,20119],{"class":35,"line":5068},[33,20100,20101],{"class":238},"            leftMargin",[33,20103,242],{"class":163},[33,20105,1533],{"class":50},[33,20107,1156],{"class":163},[33,20109,19757],{"class":167},[33,20111,20112],{"class":238},"rightMargin",[33,20114,242],{"class":163},[33,20116,1533],{"class":50},[33,20118,1156],{"class":163},[33,20120,20121],{"class":167}," cm,\n",[33,20123,20124,20127,20129,20131,20133,20135,20138,20140,20142,20144],{"class":35,"line":5077},[33,20125,20126],{"class":238},"            topMargin",[33,20128,242],{"class":163},[33,20130,1533],{"class":50},[33,20132,1156],{"class":163},[33,20134,19757],{"class":167},[33,20136,20137],{"class":238},"bottomMargin",[33,20139,242],{"class":163},[33,20141,1533],{"class":50},[33,20143,1156],{"class":163},[33,20145,20121],{"class":167},[33,20147,20148],{"class":35,"line":5082},[33,20149,5867],{"class":167},[33,20151,20152],{"class":35,"line":5089},[33,20153,20154],{"class":167},"        
doc.build(story)\n",[33,20156,20157,20159,20161],{"class":35,"line":5098},[33,20158,2449],{"class":163},[33,20160,17393],{"class":50},[33,20162,574],{"class":167},[33,20164,20165,20167,20169,20171,20173,20175,20177,20179,20181,20183],{"class":35,"line":5105},[33,20166,4051],{"class":163},[33,20168,7590],{"class":50},[33,20170,602],{"class":167},[33,20172,4059],{"class":163},[33,20174,17408],{"class":54},[33,20176,1115],{"class":50},[33,20178,2580],{"class":167},[33,20180,1121],{"class":50},[33,20182,274],{"class":54},[33,20184,221],{"class":167},[33,20186,20187],{"class":35,"line":5110},[33,20188,92],{"emptyLinePlaceholder":91},[33,20190,20191,20193],{"class":35,"line":5115},[33,20192,1332],{"class":163},[33,20194,17952],{"class":167},[33,20196,20197],{"class":35,"line":5128},[33,20198,92],{"emptyLinePlaceholder":91},[33,20200,20201,20203,20205,20207,20209],{"class":35,"line":5135},[33,20202,13459],{"class":167},[33,20204,242],{"class":163},[33,20206,9481],{"class":167},[33,20208,16249],{"class":54},[33,20210,221],{"class":167},[33,20212,20213,20215,20217,20219,20221,20223],{"class":35,"line":5142},[33,20214,6124],{"class":163},[33,20216,17975],{"class":167},[33,20218,662],{"class":163},[33,20220,17980],{"class":167},[33,20222,16649],{"class":54},[33,20224,1737],{"class":167},[33,20226,20227,20229,20231,20234,20236],{"class":35,"line":5151},[33,20228,17989],{"class":167},[33,20230,242],{"class":163},[33,20232,20233],{"class":167}," df_to_pdf(group.copy(), region, ",[33,20235,4615],{"class":50},[33,20237,221],{"class":167},[33,20239,20240,20242,20244,20246,20249,20251,20253,20255,20257,20259,20261,20263,20265],{"class":35,"line":5156},[33,20241,7268],{"class":50},[33,20243,602],{"class":167},[33,20245,4059],{"class":163},[33,20247,20248],{"class":54},"\"PDF: 
",[33,20250,1115],{"class":50},[33,20252,18014],{"class":167},[33,20254,1121],{"class":50},[33,20256,18019],{"class":54},[33,20258,4065],{"class":50},[33,20260,18024],{"class":167},[33,20262,1121],{"class":50},[33,20264,18029],{"class":54},[33,20266,221],{"class":167},[18,20268,2709],{"id":2708},[424,20270,20272],{"id":20271},"variant-1-segment-with-a-single-row","Variant 1 — Segment with a Single Row",[14,20274,20275,20276,20279],{},"A one-row segment breaks some table libraries (ReportLab raises ",[30,20277,20278],{},"LayoutError"," on a table with only a header and one data row if column widths exceed the page). Guard explicitly:",[23,20281,20283],{"className":126,"code":20282,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom reportlab.platypus import Table\n\ndef safe_table(data, col_widths):\n    if len(data) \u003C 2:\n        # Pad with a placeholder row so Platypus doesn't raise\n        data = data + [[\"—\"] * len(data[0])]\n    return Table(data, colWidths=col_widths, repeatRows=1)\n",[30,20284,20285,20290,20301,20305,20315,20330,20335,20366],{"__ignoreMap":28},[33,20286,20287],{"class":35,"line":36},[33,20288,20289],{"class":39},"# pip install reportlab\n",[33,20291,20292,20294,20296,20298],{"class":35,"line":43},[33,20293,190],{"class":163},[33,20295,19092],{"class":167},[33,20297,164],{"class":163},[33,20299,20300],{"class":167}," Table\n",[33,20302,20303],{"class":35,"line":61},[33,20304,92],{"emptyLinePlaceholder":91},[33,20306,20307,20309,20312],{"class":35,"line":73},[33,20308,562],{"class":163},[33,20310,20311],{"class":46}," safe_table",[33,20313,20314],{"class":167},"(data, col_widths):\n",[33,20316,20317,20319,20321,20324,20326,20328],{"class":35,"line":88},[33,20318,617],{"class":163},[33,20320,4037],{"class":50},[33,20322,20323],{"class":167},"(data) ",[33,20325,4043],{"class":163},[33,20327,7451],{"class":50},[33,20329,574],{"class":167},[33,20331,20332],{"class":35,"line":95},[33,20333,20334],{"class":39},"        # Pad 
with a placeholder row so Platypus doesn't raise\n",[33,20336,20337,20340,20342,20345,20347,20350,20353,20355,20357,20359,20362,20364],{"class":35,"line":101},[33,20338,20339],{"class":167},"        data ",[33,20341,242],{"class":163},[33,20343,20344],{"class":167}," data ",[33,20346,1811],{"class":163},[33,20348,20349],{"class":167}," [[",[33,20351,20352],{"class":54},"\"—\"",[33,20354,763],{"class":167},[33,20356,1769],{"class":163},[33,20358,4037],{"class":50},[33,20360,20361],{"class":167},"(data[",[33,20363,748],{"class":50},[33,20365,14779],{"class":167},[33,20367,20368,20370,20373,20375,20377,20379,20381,20383,20385],{"class":35,"line":171},[33,20369,1332],{"class":163},[33,20371,20372],{"class":167}," Table(data, ",[33,20374,19795],{"class":238},[33,20376,242],{"class":163},[33,20378,19800],{"class":167},[33,20380,19803],{"class":238},[33,20382,242],{"class":163},[33,20384,734],{"class":50},[33,20386,221],{"class":167},[424,20388,20390],{"id":20389},"variant-2-docxtpl-mail-merge-for-word-templates","Variant 2 — docxtpl Mail Merge for Word Templates",[14,20392,20393,20394,20396,20397,20400],{},"When the Word file is an existing template maintained by a designer, use ",[940,20395,18047],{"href":18040}," instead of building structure in code. 
The template uses ",[30,20398,20399],{},"{{ variable }}"," Jinja2 syntax.",[23,20402,20404],{"className":126,"code":20403,"language":47,"meta":28,"style":28},"# pip install pandas docxtpl\nimport pandas as pd\nfrom pathlib import Path\nfrom docxtpl import DocxTemplate\n\nTEMPLATE_PATH = Path(\"\u002Ftmp\u002Freport_template.docx\")   # existing .docx with {{ }} placeholders\nOUTPUT_DIR    = Path(\"\u002Ftmp\u002Freports\")\nOUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n\ndef merge_word_template(df: pd.DataFrame, segment: str, template: Path, output_dir: Path) -> Path:\n    if not template.exists():\n        raise FileNotFoundError(f\"Template not found: {template}\")\n    safe_seg = segment.replace(\" \", \"_\").lower()\n    path = output_dir \u002F f\"pipeline-reports-{safe_seg}-merged.docx\"\n\n    tpl = DocxTemplate(template)\n    context = {\n        \"segment\":        segment,\n        \"total_revenue\":  f\"${df['revenue'].sum():,.0f}\",\n        \"total_margin\":   f\"${df['margin'].sum():,.0f}\",\n        \"row_count\":      len(df),\n        \"rows\": [\n            {\n                \"product\": row[\"product\"],\n                \"units\":   int(row[\"units\"]),\n                \"revenue\": f\"${row['revenue']:,.0f}\",\n            }\n            for _, row in df.iterrows()\n        ],\n    }\n    try:\n        tpl.render(context)\n        tpl.save(path)\n    except Exception as exc:\n        raise RuntimeError(f\"Template render failed for {segment}: {exc}\") from exc\n\n    return path\n",[30,20405,20406,20411,20421,20431,20443,20447,20464,20477,20499,20503,20517,20526,20550,20566,20589,20593,20603,20612,20620,20649,20677,20690,20698,20703,20715,20730,20757,20762,20772,20777,20782,20788,20793,20798,20808,20844,20848],{"__ignoreMap":28},[33,20407,20408],{"class":35,"line":36},[33,20409,20410],{"class":39},"# pip install pandas 
docxtpl\n",[33,20412,20413,20415,20417,20419],{"class":35,"line":43},[33,20414,164],{"class":163},[33,20416,492],{"class":167},[33,20418,495],{"class":163},[33,20420,498],{"class":167},[33,20422,20423,20425,20427,20429],{"class":35,"line":61},[33,20424,190],{"class":163},[33,20426,193],{"class":167},[33,20428,164],{"class":163},[33,20430,198],{"class":167},[33,20432,20433,20435,20438,20440],{"class":35,"line":73},[33,20434,190],{"class":163},[33,20436,20437],{"class":167}," docxtpl ",[33,20439,164],{"class":163},[33,20441,20442],{"class":167}," DocxTemplate\n",[33,20444,20445],{"class":35,"line":88},[33,20446,92],{"emptyLinePlaceholder":91},[33,20448,20449,20452,20454,20456,20459,20461],{"class":35,"line":95},[33,20450,20451],{"class":50},"TEMPLATE_PATH",[33,20453,212],{"class":163},[33,20455,215],{"class":167},[33,20457,20458],{"class":54},"\"\u002Ftmp\u002Freport_template.docx\"",[33,20460,12000],{"class":167},[33,20462,20463],{"class":39},"# existing .docx with {{ }} placeholders\n",[33,20465,20466,20468,20471,20473,20475],{"class":35,"line":101},[33,20467,4615],{"class":50},[33,20469,20470],{"class":163},"    =",[33,20472,215],{"class":167},[33,20474,17147],{"class":54},[33,20476,221],{"class":167},[33,20478,20479,20481,20483,20485,20487,20489,20491,20493,20495,20497],{"class":35,"line":171},[33,20480,4615],{"class":50},[33,20482,1078],{"class":167},[33,20484,869],{"class":238},[33,20486,242],{"class":163},[33,20488,855],{"class":50},[33,20490,365],{"class":167},[33,20492,878],{"class":238},[33,20494,242],{"class":163},[33,20496,855],{"class":50},[33,20498,221],{"class":167},[33,20500,20501],{"class":35,"line":179},[33,20502,92],{"emptyLinePlaceholder":91},[33,20504,20505,20507,20510,20512,20514],{"class":35,"line":187},[33,20506,562],{"class":163},[33,20508,20509],{"class":46}," merge_word_template",[33,20511,17276],{"class":167},[33,20513,1053],{"class":50},[33,20515,20516],{"class":167},", template: Path, output_dir: Path) -> 
Path:\n",[33,20518,20519,20521,20523],{"class":35,"line":201},[33,20520,617],{"class":163},[33,20522,620],{"class":163},[33,20524,20525],{"class":167}," template.exists():\n",[33,20527,20528,20530,20532,20534,20536,20539,20541,20544,20546,20548],{"class":35,"line":206},[33,20529,4051],{"class":163},[33,20531,2945],{"class":50},[33,20533,602],{"class":167},[33,20535,4059],{"class":163},[33,20537,20538],{"class":54},"\"Template not found: ",[33,20540,1115],{"class":50},[33,20542,20543],{"class":167},"template",[33,20545,1121],{"class":50},[33,20547,274],{"class":54},[33,20549,221],{"class":167},[33,20551,20552,20554,20556,20558,20560,20562,20564],{"class":35,"line":224},[33,20553,17286],{"class":167},[33,20555,242],{"class":163},[33,20557,17291],{"class":167},[33,20559,17294],{"class":54},[33,20561,365],{"class":167},[33,20563,7764],{"class":54},[33,20565,17301],{"class":167},[33,20567,20568,20570,20572,20574,20576,20578,20580,20582,20584,20586],{"class":35,"line":229},[33,20569,17306],{"class":167},[33,20571,242],{"class":163},[33,20573,6393],{"class":167},[33,20575,1351],{"class":163},[33,20577,1110],{"class":163},[33,20579,17317],{"class":54},[33,20581,1115],{"class":50},[33,20583,17322],{"class":167},[33,20585,1121],{"class":50},[33,20587,20588],{"class":54},"-merged.docx\"\n",[33,20590,20591],{"class":35,"line":235},[33,20592,92],{"emptyLinePlaceholder":91},[33,20594,20595,20598,20600],{"class":35,"line":250},[33,20596,20597],{"class":167},"    tpl ",[33,20599,242],{"class":163},[33,20601,20602],{"class":167}," DocxTemplate(template)\n",[33,20604,20605,20608,20610],{"class":35,"line":266},[33,20606,20607],{"class":167},"    context ",[33,20609,242],{"class":163},[33,20611,16265],{"class":167},[33,20613,20614,20617],{"class":35,"line":290},[33,20615,20616],{"class":54},"        \"segment\"",[33,20618,20619],{"class":167},":        
segment,\n",[33,20621,20622,20625,20628,20630,20632,20634,20636,20638,20641,20643,20645,20647],{"class":35,"line":295},[33,20623,20624],{"class":54},"        \"total_revenue\"",[33,20626,20627],{"class":167},":  ",[33,20629,4059],{"class":163},[33,20631,18820],{"class":54},[33,20633,1115],{"class":50},[33,20635,11038],{"class":167},[33,20637,18828],{"class":54},[33,20639,20640],{"class":167},"].sum()",[33,20642,18410],{"class":163},[33,20644,1121],{"class":50},[33,20646,274],{"class":54},[33,20648,247],{"class":167},[33,20650,20651,20654,20657,20659,20661,20663,20665,20667,20669,20671,20673,20675],{"class":35,"line":300},[33,20652,20653],{"class":54},"        \"total_margin\"",[33,20655,20656],{"class":167},":   ",[33,20658,4059],{"class":163},[33,20660,18820],{"class":54},[33,20662,1115],{"class":50},[33,20664,11038],{"class":167},[33,20666,18857],{"class":54},[33,20668,20640],{"class":167},[33,20670,18410],{"class":163},[33,20672,1121],{"class":50},[33,20674,274],{"class":54},[33,20676,247],{"class":167},[33,20678,20679,20682,20685,20687],{"class":35,"line":317},[33,20680,20681],{"class":54},"        \"row_count\"",[33,20683,20684],{"class":167},":      ",[33,20686,928],{"class":50},[33,20688,20689],{"class":167},"(df),\n",[33,20691,20692,20695],{"class":35,"line":332},[33,20693,20694],{"class":54},"        \"rows\"",[33,20696,20697],{"class":167},": [\n",[33,20699,20700],{"class":35,"line":347},[33,20701,20702],{"class":167},"            {\n",[33,20704,20705,20708,20711,20713],{"class":35,"line":374},[33,20706,20707],{"class":54},"                \"product\"",[33,20709,20710],{"class":167},": row[",[33,20712,16654],{"class":54},[33,20714,8935],{"class":167},[33,20716,20717,20720,20722,20724,20726,20728],{"class":35,"line":397},[33,20718,20719],{"class":54},"                
\"units\"",[33,20721,20656],{"class":167},[33,20723,1059],{"class":50},[33,20725,18769],{"class":167},[33,20727,16659],{"class":54},[33,20729,12871],{"class":167},[33,20731,20732,20735,20737,20739,20741,20743,20745,20747,20749,20751,20753,20755],{"class":35,"line":653},[33,20733,20734],{"class":54},"                \"revenue\"",[33,20736,2079],{"class":167},[33,20738,4059],{"class":163},[33,20740,18820],{"class":54},[33,20742,1115],{"class":50},[33,20744,18825],{"class":167},[33,20746,18828],{"class":54},[33,20748,9546],{"class":167},[33,20750,18410],{"class":163},[33,20752,1121],{"class":50},[33,20754,274],{"class":54},[33,20756,247],{"class":167},[33,20758,20759],{"class":35,"line":667},[33,20760,20761],{"class":167},"            }\n",[33,20763,20764,20766,20768,20770],{"class":35,"line":675},[33,20765,1793],{"class":163},[33,20767,8560],{"class":167},[33,20769,662],{"class":163},[33,20771,19733],{"class":167},[33,20773,20774],{"class":35,"line":689},[33,20775,20776],{"class":167},"        ],\n",[33,20778,20779],{"class":35,"line":703},[33,20780,20781],{"class":167},"    }\n",[33,20783,20784,20786],{"class":35,"line":714},[33,20785,2424],{"class":163},[33,20787,574],{"class":167},[33,20789,20790],{"class":35,"line":723},[33,20791,20792],{"class":167},"        tpl.render(context)\n",[33,20794,20795],{"class":35,"line":754},[33,20796,20797],{"class":167},"        tpl.save(path)\n",[33,20799,20800,20802,20804,20806],{"class":35,"line":771},[33,20801,2449],{"class":163},[33,20803,783],{"class":50},[33,20805,1852],{"class":163},[33,20807,1855],{"class":167},[33,20809,20810,20812,20814,20816,20818,20821,20823,20825,20827,20829,20831,20833,20835,20837,20839,20841],{"class":35,"line":777},[33,20811,4051],{"class":163},[33,20813,7590],{"class":50},[33,20815,602],{"class":167},[33,20817,4059],{"class":163},[33,20819,20820],{"class":54},"\"Template render failed for 
",[33,20822,1115],{"class":50},[33,20824,18258],{"class":167},[33,20826,1121],{"class":50},[33,20828,2079],{"class":54},[33,20830,1115],{"class":50},[33,20832,6565],{"class":167},[33,20834,1121],{"class":50},[33,20836,274],{"class":54},[33,20838,1649],{"class":167},[33,20840,190],{"class":163},[33,20842,20843],{"class":167}," exc\n",[33,20845,20846],{"class":35,"line":788},[33,20847,92],{"emptyLinePlaceholder":91},[33,20849,20850,20852],{"class":35,"line":804},[33,20851,1332],{"class":163},[33,20853,17952],{"class":167},[14,20855,20856,20857,20859,20860,20863,20864,20867,20868,20871],{},"If a template variable is missing from the context, ",[30,20858,18047],{}," raises ",[30,20861,20862],{},"jinja2.exceptions.UndefinedError",". Pass ",[30,20865,20866],{},"jinja_env=tpl.get_jinja_env(autoescape=False)"," and set ",[30,20869,20870],{},"undefined=jinja2.Undefined"," to silently skip missing keys during development.",[424,20873,20875],{"id":20874},"variant-3-weasyprint-for-html-to-pdf","Variant 3 — WeasyPrint for HTML-to-PDF",[14,20877,20878,20879,20884,20885,10065,20888,20891],{},"When your team writes Jinja2 HTML templates, ",[940,20880,20883],{"href":20881,"rel":20882},"https:\u002F\u002Fweasyprint.org\u002F",[1367],"WeasyPrint"," converts them to PDF without installing a headless browser. 
It requires system-level ",[30,20886,20887],{},"libpango",[30,20889,20890],{},"libcairo",":",[23,20893,20895],{"className":25,"code":20894,"language":27,"meta":28,"style":28},"# Ubuntu\u002FDebian\nsudo apt-get install -y libpango-1.0-0 libcairo2 libpangocairo-1.0-0\npip install weasyprint jinja2\n",[30,20896,20897,20902,20922],{"__ignoreMap":28},[33,20898,20899],{"class":35,"line":36},[33,20900,20901],{"class":39},"# Ubuntu\u002FDebian\n",[33,20903,20904,20906,20908,20910,20913,20916,20919],{"class":35,"line":43},[33,20905,9669],{"class":46},[33,20907,9672],{"class":54},[33,20909,79],{"class":54},[33,20911,20912],{"class":50}," -y",[33,20914,20915],{"class":54}," libpango-1.0-0",[33,20917,20918],{"class":54}," libcairo2",[33,20920,20921],{"class":54}," libpangocairo-1.0-0\n",[33,20923,20924,20926,20928,20931],{"class":35,"line":61},[33,20925,76],{"class":46},[33,20927,79],{"class":54},[33,20929,20930],{"class":54}," weasyprint",[33,20932,20933],{"class":54}," jinja2\n",[23,20935,20937],{"className":126,"code":20936,"language":47,"meta":28,"style":28},"# pip install weasyprint jinja2\nfrom pathlib import Path\nimport pandas as pd\nfrom jinja2 import Environment, FileSystemLoader\nfrom weasyprint import HTML\n\nTEMPLATE_DIR = Path(\"\u002Ftmp\u002Ftemplates\")\nOUTPUT_DIR   = Path(\"\u002Ftmp\u002Freports\")\nOUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n\ndef html_to_pdf(df: pd.DataFrame, segment: str, template_dir: Path, output_dir: Path) -> Path:\n    safe_seg = segment.replace(\" \", \"_\").lower()\n    path = output_dir \u002F f\"pipeline-reports-{safe_seg}-web.pdf\"\n\n    env  = Environment(loader=FileSystemLoader(str(template_dir)))\n    try:\n        tmpl = env.get_template(\"report.html\")\n    except Exception as exc:\n        raise FileNotFoundError(f\"Template 'report.html' not found in {template_dir}: {exc}\") from exc\n\n    html_str = tmpl.render(segment=segment, rows=df.to_dict(\"records\"))\n    try:\n        
HTML(string=html_str).write_pdf(str(path))\n    except Exception as exc:\n        raise RuntimeError(f\"WeasyPrint failed for {segment}: {exc}\") from exc\n\n    return path\n",[30,20938,20939,20944,20954,20964,20976,20988,20992,21006,21019,21041,21045,21059,21075,21098,21102,21125,21131,21146,21156,21192,21196,21225,21231,21249,21259,21294,21298],{"__ignoreMap":28},[33,20940,20941],{"class":35,"line":36},[33,20942,20943],{"class":39},"# pip install weasyprint jinja2\n",[33,20945,20946,20948,20950,20952],{"class":35,"line":43},[33,20947,190],{"class":163},[33,20949,193],{"class":167},[33,20951,164],{"class":163},[33,20953,198],{"class":167},[33,20955,20956,20958,20960,20962],{"class":35,"line":61},[33,20957,164],{"class":163},[33,20959,492],{"class":167},[33,20961,495],{"class":163},[33,20963,498],{"class":167},[33,20965,20966,20968,20971,20973],{"class":35,"line":73},[33,20967,190],{"class":163},[33,20969,20970],{"class":167}," jinja2 ",[33,20972,164],{"class":163},[33,20974,20975],{"class":167}," Environment, FileSystemLoader\n",[33,20977,20978,20980,20983,20985],{"class":35,"line":88},[33,20979,190],{"class":163},[33,20981,20982],{"class":167}," weasyprint ",[33,20984,164],{"class":163},[33,20986,20987],{"class":50}," HTML\n",[33,20989,20990],{"class":35,"line":95},[33,20991,92],{"emptyLinePlaceholder":91},[33,20993,20994,20997,20999,21001,21004],{"class":35,"line":101},[33,20995,20996],{"class":50},"TEMPLATE_DIR",[33,20998,212],{"class":163},[33,21000,215],{"class":167},[33,21002,21003],{"class":54},"\"\u002Ftmp\u002Ftemplates\"",[33,21005,221],{"class":167},[33,21007,21008,21010,21013,21015,21017],{"class":35,"line":171},[33,21009,4615],{"class":50},[33,21011,21012],{"class":163},"   
=",[33,21014,215],{"class":167},[33,21016,17147],{"class":54},[33,21018,221],{"class":167},[33,21020,21021,21023,21025,21027,21029,21031,21033,21035,21037,21039],{"class":35,"line":179},[33,21022,4615],{"class":50},[33,21024,1078],{"class":167},[33,21026,869],{"class":238},[33,21028,242],{"class":163},[33,21030,855],{"class":50},[33,21032,365],{"class":167},[33,21034,878],{"class":238},[33,21036,242],{"class":163},[33,21038,855],{"class":50},[33,21040,221],{"class":167},[33,21042,21043],{"class":35,"line":187},[33,21044,92],{"emptyLinePlaceholder":91},[33,21046,21047,21049,21052,21054,21056],{"class":35,"line":201},[33,21048,562],{"class":163},[33,21050,21051],{"class":46}," html_to_pdf",[33,21053,17276],{"class":167},[33,21055,1053],{"class":50},[33,21057,21058],{"class":167},", template_dir: Path, output_dir: Path) -> Path:\n",[33,21060,21061,21063,21065,21067,21069,21071,21073],{"class":35,"line":206},[33,21062,17286],{"class":167},[33,21064,242],{"class":163},[33,21066,17291],{"class":167},[33,21068,17294],{"class":54},[33,21070,365],{"class":167},[33,21072,7764],{"class":54},[33,21074,17301],{"class":167},[33,21076,21077,21079,21081,21083,21085,21087,21089,21091,21093,21095],{"class":35,"line":224},[33,21078,17306],{"class":167},[33,21080,242],{"class":163},[33,21082,6393],{"class":167},[33,21084,1351],{"class":163},[33,21086,1110],{"class":163},[33,21088,17317],{"class":54},[33,21090,1115],{"class":50},[33,21092,17322],{"class":167},[33,21094,1121],{"class":50},[33,21096,21097],{"class":54},"-web.pdf\"\n",[33,21099,21100],{"class":35,"line":229},[33,21101,92],{"emptyLinePlaceholder":91},[33,21103,21104,21107,21109,21112,21115,21117,21120,21122],{"class":35,"line":235},[33,21105,21106],{"class":167},"    env  ",[33,21108,242],{"class":163},[33,21110,21111],{"class":167}," 
Environment(",[33,21113,21114],{"class":238},"loader",[33,21116,242],{"class":163},[33,21118,21119],{"class":167},"FileSystemLoader(",[33,21121,1053],{"class":50},[33,21123,21124],{"class":167},"(template_dir)))\n",[33,21126,21127,21129],{"class":35,"line":250},[33,21128,2424],{"class":163},[33,21130,574],{"class":167},[33,21132,21133,21136,21138,21141,21144],{"class":35,"line":266},[33,21134,21135],{"class":167},"        tmpl ",[33,21137,242],{"class":163},[33,21139,21140],{"class":167}," env.get_template(",[33,21142,21143],{"class":54},"\"report.html\"",[33,21145,221],{"class":167},[33,21147,21148,21150,21152,21154],{"class":35,"line":290},[33,21149,2449],{"class":163},[33,21151,783],{"class":50},[33,21153,1852],{"class":163},[33,21155,1855],{"class":167},[33,21157,21158,21160,21162,21164,21166,21169,21171,21174,21176,21178,21180,21182,21184,21186,21188,21190],{"class":35,"line":295},[33,21159,4051],{"class":163},[33,21161,2945],{"class":50},[33,21163,602],{"class":167},[33,21165,4059],{"class":163},[33,21167,21168],{"class":54},"\"Template 'report.html' not found in ",[33,21170,1115],{"class":50},[33,21172,21173],{"class":167},"template_dir",[33,21175,1121],{"class":50},[33,21177,2079],{"class":54},[33,21179,1115],{"class":50},[33,21181,6565],{"class":167},[33,21183,1121],{"class":50},[33,21185,274],{"class":54},[33,21187,1649],{"class":167},[33,21189,190],{"class":163},[33,21191,20843],{"class":167},[33,21193,21194],{"class":35,"line":300},[33,21195,92],{"emptyLinePlaceholder":91},[33,21197,21198,21201,21203,21206,21208,21210,21213,21215,21217,21220,21223],{"class":35,"line":317},[33,21199,21200],{"class":167},"    html_str ",[33,21202,242],{"class":163},[33,21204,21205],{"class":167}," tmpl.render(",[33,21207,18258],{"class":238},[33,21209,242],{"class":163},[33,21211,21212],{"class":167},"segment, 
",[33,21214,18629],{"class":238},[33,21216,242],{"class":163},[33,21218,21219],{"class":167},"df.to_dict(",[33,21221,21222],{"class":54},"\"records\"",[33,21224,371],{"class":167},[33,21226,21227,21229],{"class":35,"line":332},[33,21228,2424],{"class":163},[33,21230,574],{"class":167},[33,21232,21233,21236,21239,21241,21244,21246],{"class":35,"line":347},[33,21234,21235],{"class":167},"        HTML(",[33,21237,21238],{"class":238},"string",[33,21240,242],{"class":163},[33,21242,21243],{"class":167},"html_str).write_pdf(",[33,21245,1053],{"class":50},[33,21247,21248],{"class":167},"(path))\n",[33,21250,21251,21253,21255,21257],{"class":35,"line":374},[33,21252,2449],{"class":163},[33,21254,783],{"class":50},[33,21256,1852],{"class":163},[33,21258,1855],{"class":167},[33,21260,21261,21263,21265,21267,21269,21272,21274,21276,21278,21280,21282,21284,21286,21288,21290,21292],{"class":35,"line":397},[33,21262,4051],{"class":163},[33,21264,7590],{"class":50},[33,21266,602],{"class":167},[33,21268,4059],{"class":163},[33,21270,21271],{"class":54},"\"WeasyPrint failed for ",[33,21273,1115],{"class":50},[33,21275,18258],{"class":167},[33,21277,1121],{"class":50},[33,21279,2079],{"class":54},[33,21281,1115],{"class":50},[33,21283,6565],{"class":167},[33,21285,1121],{"class":50},[33,21287,274],{"class":54},[33,21289,1649],{"class":167},[33,21291,190],{"class":163},[33,21293,20843],{"class":167},[33,21295,21296],{"class":35,"line":653},[33,21297,92],{"emptyLinePlaceholder":91},[33,21299,21300,21302],{"class":35,"line":667},[33,21301,1332],{"class":163},[33,21303,17952],{"class":167},[18,21305,21307],{"id":21306},"validation-assert-outputs-match-row-counts","Validation — Assert Outputs Match Row Counts",[14,21309,21310],{},"After writing all three output types for a segment, assert that the row counts agree. 
Mismatches mean a silent filter crept into one of the writer functions.",[23,21312,21314],{"className":126,"code":21313,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nfrom pathlib import Path\n\nOUTPUT_DIR = Path(\"\u002Ftmp\u002Freports\")\n\ndef validate_outputs(segment: str, expected_rows: int, output_dir: Path) -> None:\n    safe_seg = segment.replace(\" \", \"_\").lower()\n\n    xlsx_path = output_dir \u002F f\"pipeline-reports-{safe_seg}.xlsx\"\n    if xlsx_path.exists():\n        xl_df = pd.read_excel(xlsx_path, sheet_name=\"Data\", engine=\"openpyxl\")\n        # Subtract 1 for the TOTAL row appended during writing\n        actual_rows = len(xl_df) - 1\n        assert actual_rows == expected_rows, (\n            f\"[{segment}] Excel row mismatch: expected {expected_rows}, got {actual_rows}\"\n        )\n        print(f\"[{segment}] Excel OK ({actual_rows} rows)\")\n\n    pdf_path = output_dir \u002F f\"pipeline-reports-{safe_seg}.pdf\"\n    if pdf_path.exists():\n        size = pdf_path.stat().st_size\n        assert size > 1024, f\"[{segment}] PDF suspiciously small: {size} bytes\"\n        print(f\"[{segment}] PDF OK ({size:,} bytes)\")\n\n    docx_path = output_dir \u002F f\"pipeline-reports-{safe_seg}.docx\"\n    if docx_path.exists():\n        assert docx_path.stat().st_size > 512, f\"[{segment}] DOCX suspiciously small\"\n        print(f\"[{segment}] DOCX OK\")\n\ndf = pd.read_csv(\"\u002Ftmp\u002Fpipeline_sample.csv\")\nfor region, group in df.groupby(\"region\"):\n    validate_outputs(region, len(group), 
OUTPUT_DIR)\n",[30,21315,21316,21320,21330,21340,21344,21356,21360,21384,21400,21404,21427,21434,21460,21465,21481,21494,21529,21533,21562,21566,21589,21596,21606,21641,21673,21677,21700,21707,21734,21755,21759,21771,21785],{"__ignoreMap":28},[33,21317,21318],{"class":35,"line":36},[33,21319,3952],{"class":39},[33,21321,21322,21324,21326,21328],{"class":35,"line":43},[33,21323,164],{"class":163},[33,21325,492],{"class":167},[33,21327,495],{"class":163},[33,21329,498],{"class":167},[33,21331,21332,21334,21336,21338],{"class":35,"line":61},[33,21333,190],{"class":163},[33,21335,193],{"class":167},[33,21337,164],{"class":163},[33,21339,198],{"class":167},[33,21341,21342],{"class":35,"line":73},[33,21343,92],{"emptyLinePlaceholder":91},[33,21345,21346,21348,21350,21352,21354],{"class":35,"line":88},[33,21347,4615],{"class":50},[33,21349,212],{"class":163},[33,21351,215],{"class":167},[33,21353,17147],{"class":54},[33,21355,221],{"class":167},[33,21357,21358],{"class":35,"line":95},[33,21359,92],{"emptyLinePlaceholder":91},[33,21361,21362,21364,21367,21370,21372,21375,21377,21380,21382],{"class":35,"line":101},[33,21363,562],{"class":163},[33,21365,21366],{"class":46}," validate_outputs",[33,21368,21369],{"class":167},"(segment: ",[33,21371,1053],{"class":50},[33,21373,21374],{"class":167},", expected_rows: ",[33,21376,1059],{"class":50},[33,21378,21379],{"class":167},", output_dir: Path) -> ",[33,21381,571],{"class":50},[33,21383,574],{"class":167},[33,21385,21386,21388,21390,21392,21394,21396,21398],{"class":35,"line":171},[33,21387,17286],{"class":167},[33,21389,242],{"class":163},[33,21391,17291],{"class":167},[33,21393,17294],{"class":54},[33,21395,365],{"class":167},[33,21397,7764],{"class":54},[33,21399,17301],{"class":167},[33,21401,21402],{"class":35,"line":179},[33,21403,92],{"emptyLinePlaceholder":91},[33,21405,21406,21409,21411,21413,21415,21417,21419,21421,21423,21425],{"class":35,"line":187},[33,21407,21408],{"class":167},"    xlsx_path 
",[33,21410,242],{"class":163},[33,21412,6393],{"class":167},[33,21414,1351],{"class":163},[33,21416,1110],{"class":163},[33,21418,17317],{"class":54},[33,21420,1115],{"class":50},[33,21422,17322],{"class":167},[33,21424,1121],{"class":50},[33,21426,6410],{"class":54},[33,21428,21429,21431],{"class":35,"line":201},[33,21430,617],{"class":163},[33,21432,21433],{"class":167}," xlsx_path.exists():\n",[33,21435,21436,21439,21441,21444,21446,21448,21450,21452,21454,21456,21458],{"class":35,"line":206},[33,21437,21438],{"class":167},"        xl_df ",[33,21440,242],{"class":163},[33,21442,21443],{"class":167}," pd.read_excel(xlsx_path, ",[33,21445,17371],{"class":238},[33,21447,242],{"class":163},[33,21449,17376],{"class":54},[33,21451,365],{"class":167},[33,21453,17351],{"class":238},[33,21455,242],{"class":163},[33,21457,17356],{"class":54},[33,21459,221],{"class":167},[33,21461,21462],{"class":35,"line":224},[33,21463,21464],{"class":39},"        # Subtract 1 for the TOTAL row appended during writing\n",[33,21466,21467,21470,21472,21474,21477,21479],{"class":35,"line":229},[33,21468,21469],{"class":167},"        actual_rows ",[33,21471,242],{"class":163},[33,21473,4037],{"class":50},[33,21475,21476],{"class":167},"(xl_df) ",[33,21478,4126],{"class":163},[33,21480,17709],{"class":50},[33,21482,21483,21486,21489,21491],{"class":35,"line":235},[33,21484,21485],{"class":163},"        assert",[33,21487,21488],{"class":167}," actual_rows ",[33,21490,1865],{"class":163},[33,21492,21493],{"class":167}," expected_rows, (\n",[33,21495,21496,21498,21501,21503,21505,21507,21510,21512,21515,21517,21520,21522,21525,21527],{"class":35,"line":250},[33,21497,12744],{"class":163},[33,21499,21500],{"class":54},"\"[",[33,21502,1115],{"class":50},[33,21504,18258],{"class":167},[33,21506,1121],{"class":50},[33,21508,21509],{"class":54},"] Excel row mismatch: expected 
",[33,21511,1115],{"class":50},[33,21513,21514],{"class":167},"expected_rows",[33,21516,1121],{"class":50},[33,21518,21519],{"class":54},", got ",[33,21521,1115],{"class":50},[33,21523,21524],{"class":167},"actual_rows",[33,21526,1121],{"class":50},[33,21528,7504],{"class":54},[33,21530,21531],{"class":35,"line":266},[33,21532,5867],{"class":167},[33,21534,21535,21537,21539,21541,21543,21545,21547,21549,21552,21554,21556,21558,21560],{"class":35,"line":290},[33,21536,9414],{"class":50},[33,21538,602],{"class":167},[33,21540,4059],{"class":163},[33,21542,21500],{"class":54},[33,21544,1115],{"class":50},[33,21546,18258],{"class":167},[33,21548,1121],{"class":50},[33,21550,21551],{"class":54},"] Excel OK (",[33,21553,1115],{"class":50},[33,21555,21524],{"class":167},[33,21557,1121],{"class":50},[33,21559,18029],{"class":54},[33,21561,221],{"class":167},[33,21563,21564],{"class":35,"line":295},[33,21565,92],{"emptyLinePlaceholder":91},[33,21567,21568,21571,21573,21575,21577,21579,21581,21583,21585,21587],{"class":35,"line":300},[33,21569,21570],{"class":167},"    pdf_path ",[33,21572,242],{"class":163},[33,21574,6393],{"class":167},[33,21576,1351],{"class":163},[33,21578,1110],{"class":163},[33,21580,17317],{"class":54},[33,21582,1115],{"class":50},[33,21584,17322],{"class":167},[33,21586,1121],{"class":50},[33,21588,19246],{"class":54},[33,21590,21591,21593],{"class":35,"line":317},[33,21592,617],{"class":163},[33,21594,21595],{"class":167}," pdf_path.exists():\n",[33,21597,21598,21601,21603],{"class":35,"line":332},[33,21599,21600],{"class":167},"        size ",[33,21602,242],{"class":163},[33,21604,21605],{"class":167}," pdf_path.stat().st_size\n",[33,21607,21608,21610,21613,21615,21617,21619,21621,21623,21625,21627,21629,21632,21634,21636,21638],{"class":35,"line":347},[33,21609,21485],{"class":163},[33,21611,21612],{"class":167}," size 
",[33,21614,6009],{"class":163},[33,21616,1159],{"class":50},[33,21618,365],{"class":167},[33,21620,4059],{"class":163},[33,21622,21500],{"class":54},[33,21624,1115],{"class":50},[33,21626,18258],{"class":167},[33,21628,1121],{"class":50},[33,21630,21631],{"class":54},"] PDF suspiciously small: ",[33,21633,1115],{"class":50},[33,21635,17255],{"class":167},[33,21637,1121],{"class":50},[33,21639,21640],{"class":54}," bytes\"\n",[33,21642,21643,21645,21647,21649,21651,21653,21655,21657,21660,21662,21664,21666,21668,21671],{"class":35,"line":374},[33,21644,9414],{"class":50},[33,21646,602],{"class":167},[33,21648,4059],{"class":163},[33,21650,21500],{"class":54},[33,21652,1115],{"class":50},[33,21654,18258],{"class":167},[33,21656,1121],{"class":50},[33,21658,21659],{"class":54},"] PDF OK (",[33,21661,1115],{"class":50},[33,21663,17255],{"class":167},[33,21665,18801],{"class":163},[33,21667,1121],{"class":50},[33,21669,21670],{"class":54}," bytes)\"",[33,21672,221],{"class":167},[33,21674,21675],{"class":35,"line":397},[33,21676,92],{"emptyLinePlaceholder":91},[33,21678,21679,21682,21684,21686,21688,21690,21692,21694,21696,21698],{"class":35,"line":653},[33,21680,21681],{"class":167},"    docx_path ",[33,21683,242],{"class":163},[33,21685,6393],{"class":167},[33,21687,1351],{"class":163},[33,21689,1110],{"class":163},[33,21691,17317],{"class":54},[33,21693,1115],{"class":50},[33,21695,17322],{"class":167},[33,21697,1121],{"class":50},[33,21699,18215],{"class":54},[33,21701,21702,21704],{"class":35,"line":667},[33,21703,617],{"class":163},[33,21705,21706],{"class":167}," docx_path.exists():\n",[33,21708,21709,21711,21714,21716,21719,21721,21723,21725,21727,21729,21731],{"class":35,"line":675},[33,21710,21485],{"class":163},[33,21712,21713],{"class":167}," docx_path.stat().st_size ",[33,21715,6009],{"class":163},[33,21717,21718],{"class":50}," 
512",[33,21720,365],{"class":167},[33,21722,4059],{"class":163},[33,21724,21500],{"class":54},[33,21726,1115],{"class":50},[33,21728,18258],{"class":167},[33,21730,1121],{"class":50},[33,21732,21733],{"class":54},"] DOCX suspiciously small\"\n",[33,21735,21736,21738,21740,21742,21744,21746,21748,21750,21753],{"class":35,"line":689},[33,21737,9414],{"class":50},[33,21739,602],{"class":167},[33,21741,4059],{"class":163},[33,21743,21500],{"class":54},[33,21745,1115],{"class":50},[33,21747,18258],{"class":167},[33,21749,1121],{"class":50},[33,21751,21752],{"class":54},"] DOCX OK\"",[33,21754,221],{"class":167},[33,21756,21757],{"class":35,"line":703},[33,21758,92],{"emptyLinePlaceholder":91},[33,21760,21761,21763,21765,21767,21769],{"class":35,"line":714},[33,21762,13459],{"class":167},[33,21764,242],{"class":163},[33,21766,9481],{"class":167},[33,21768,16249],{"class":54},[33,21770,221],{"class":167},[33,21772,21773,21775,21777,21779,21781,21783],{"class":35,"line":723},[33,21774,6124],{"class":163},[33,21776,17975],{"class":167},[33,21778,662],{"class":163},[33,21780,17980],{"class":167},[33,21782,16649],{"class":54},[33,21784,1737],{"class":167},[33,21786,21787,21790,21792,21795,21797],{"class":35,"line":754},[33,21788,21789],{"class":167},"    validate_outputs(region, ",[33,21791,928],{"class":50},[33,21793,21794],{"class":167},"(group), ",[33,21796,4615],{"class":50},[33,21798,221],{"class":167},[14,21800,21801,21802,21806],{},"For the ",[940,21803,21805],{"href":21804},"\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Fautomating-monthly-sales-reports-in-excel\u002F","automating-monthly-sales-reports-in-excel"," pattern, add a checksum column to the DataFrame before writing and re-derive it after reading back to catch silent numeric rounding.",[18,21808,21810],{"id":21809},"performance-and-scale","Performance and Scale",[14,21812,21813],{},"For DataFrames under 100,000 rows, the per-segment loop above runs in seconds. 
Beyond that:",[14,21815,21816,6242,21818,21821,21822,21825],{},[1974,21817,4218],{},[30,21819,21820],{},"groupby"," materialises each group as a copy. With 50 segments and 1 M rows, peak RAM is roughly ",[30,21823,21824],{},"(total_df_size) * 2",". Switch to a generator that streams segments from disk:",[23,21827,21829],{"className":126,"code":21828,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndef iter_segments(csv_path: Path, segment_col: str, chunksize: int = 50_000):\n    \"\"\"Yield (segment_value, chunk_df) without loading the full file.\"\"\"\n    # Read unique segments first (cheap header scan)\n    segments = pd.read_csv(csv_path, usecols=[segment_col])[segment_col].unique()\n    for seg in segments:\n        chunks = []\n        for chunk in pd.read_csv(csv_path, chunksize=chunksize):\n            subset = chunk[chunk[segment_col] == seg]\n            if not subset.empty:\n                chunks.append(subset)\n        if chunks:\n            yield seg, pd.concat(chunks, ignore_index=True)\n",[30,21830,21831,21835,21845,21855,21859,21883,21888,21893,21910,21922,21931,21950,21965,21974,21979,21986],{"__ignoreMap":28},[33,21832,21833],{"class":35,"line":36},[33,21834,8895],{"class":39},[33,21836,21837,21839,21841,21843],{"class":35,"line":43},[33,21838,164],{"class":163},[33,21840,492],{"class":167},[33,21842,495],{"class":163},[33,21844,498],{"class":167},[33,21846,21847,21849,21851,21853],{"class":35,"line":61},[33,21848,190],{"class":163},[33,21850,193],{"class":167},[33,21852,164],{"class":163},[33,21854,198],{"class":167},[33,21856,21857],{"class":35,"line":73},[33,21858,92],{"emptyLinePlaceholder":91},[33,21860,21861,21863,21866,21869,21871,21874,21876,21878,21881],{"class":35,"line":88},[33,21862,562],{"class":163},[33,21864,21865],{"class":46}," iter_segments",[33,21867,21868],{"class":167},"(csv_path: Path, segment_col: ",[33,21870,1053],{"class":50},[33,21872,21873],{"class":167},", 
chunksize: ",[33,21875,1059],{"class":50},[33,21877,212],{"class":163},[33,21879,21880],{"class":50}," 50_000",[33,21882,1737],{"class":167},[33,21884,21885],{"class":35,"line":95},[33,21886,21887],{"class":54},"    \"\"\"Yield (segment_value, chunk_df) without loading the full file.\"\"\"\n",[33,21889,21890],{"class":35,"line":101},[33,21891,21892],{"class":39},"    # Read unique segments first (cheap header scan)\n",[33,21894,21895,21897,21899,21902,21905,21907],{"class":35,"line":171},[33,21896,16843],{"class":167},[33,21898,242],{"class":163},[33,21900,21901],{"class":167}," pd.read_csv(csv_path, ",[33,21903,21904],{"class":238},"usecols",[33,21906,242],{"class":163},[33,21908,21909],{"class":167},"[segment_col])[segment_col].unique()\n",[33,21911,21912,21914,21917,21919],{"class":35,"line":179},[33,21913,656],{"class":163},[33,21915,21916],{"class":167}," seg ",[33,21918,662],{"class":163},[33,21920,21921],{"class":167}," segments:\n",[33,21923,21924,21927,21929],{"class":35,"line":187},[33,21925,21926],{"class":167},"        chunks ",[33,21928,242],{"class":163},[33,21930,589],{"class":167},[33,21932,21933,21935,21938,21940,21942,21945,21947],{"class":35,"line":201},[33,21934,5973],{"class":163},[33,21936,21937],{"class":167}," chunk ",[33,21939,662],{"class":163},[33,21941,21901],{"class":167},[33,21943,21944],{"class":238},"chunksize",[33,21946,242],{"class":163},[33,21948,21949],{"class":167},"chunksize):\n",[33,21951,21952,21955,21957,21960,21962],{"class":35,"line":206},[33,21953,21954],{"class":167},"            subset ",[33,21956,242],{"class":163},[33,21958,21959],{"class":167}," chunk[chunk[segment_col] ",[33,21961,1865],{"class":163},[33,21963,21964],{"class":167}," seg]\n",[33,21966,21967,21969,21971],{"class":35,"line":224},[33,21968,5995],{"class":163},[33,21970,620],{"class":163},[33,21972,21973],{"class":167}," subset.empty:\n",[33,21975,21976],{"class":35,"line":229},[33,21977,21978],{"class":167},"                
chunks.append(subset)\n",[33,21980,21981,21983],{"class":35,"line":235},[33,21982,8221],{"class":163},[33,21984,21985],{"class":167}," chunks:\n",[33,21987,21988,21991,21994,21996,21998,22000],{"class":35,"line":250},[33,21989,21990],{"class":163},"            yield",[33,21992,21993],{"class":167}," seg, pd.concat(chunks, ",[33,21995,850],{"class":238},[33,21997,242],{"class":163},[33,21999,855],{"class":50},[33,22001,221],{"class":167},[14,22003,22004,6242,22007,22010,22011,22014,22015,8877,22017,22020,22021,22023],{},[1974,22005,22006],{},"Excel at scale",[30,22008,22009],{},"openpyxl"," in write-only mode (",[30,22012,22013],{},"write_only=True",") cuts memory by 60% but loses the re-open step for totals rows. Use ",[30,22016,17066],{},[30,22018,22019],{},"constant_memory=True"," for write-only Excel above 200,000 rows; note that ",[30,22022,17066],{}," cannot add a formula row after closing — pre-calculate totals with pandas instead.",[14,22025,22026,22029,22030,22032,22033,8877,22036,22039],{},[1974,22027,22028],{},"PDF at scale"," — ReportLab's ",[30,22031,19006],{}," builds the full story in RAM. For very long tables, split the story into batches of 500 rows and use ",[30,22034,22035],{},"BaseDocTemplate",[30,22037,22038],{},"PageBreak()"," between batches. 
WeasyPrint is slower per page but can stream HTML from a file, which avoids loading the entire DOM.",[14,22041,22042,22045,22046,20891],{},[1974,22043,22044],{},"Parallel segments"," — if segments are independent, use ",[30,22047,4240],{},[23,22049,22051],{"className":126,"code":22050,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl python-docx reportlab\nimport pandas as pd\nfrom pathlib import Path\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\n\nOUTPUT_DIR = Path(\"\u002Ftmp\u002Freports\")\n\ndef write_segment(args):\n    segment, rows_json, output_dir_str = args\n    import pandas as pd  # re-import inside worker process\n    from pathlib import Path\n    # Import your actual writer functions here\n    group = pd.read_json(rows_json, orient=\"records\")\n    output_dir = Path(output_dir_str)\n    # ... call df_to_excel, df_to_word, df_to_pdf\n    return segment, len(group)\n\ndf = pd.read_csv(\"\u002Ftmp\u002Fpipeline_sample.csv\")\ntasks = [\n    (region, group.to_json(orient=\"records\"), str(OUTPUT_DIR))\n    for region, group in df.groupby(\"region\")\n]\nwith ProcessPoolExecutor(max_workers=4) as pool:\n    futures = {pool.submit(write_segment, t): t[0] for t in tasks}\n    for fut in as_completed(futures):\n        seg, n = fut.result()\n        print(f\"Done: {seg} ({n} rows)\")\n",[30,22052,22053,22058,22068,22078,22088,22092,22104,22108,22118,22128,22142,22152,22157,22176,22186,22191,22203,22207,22219,22228,22249,22263,22267,22288,22311,22321,22330],{"__ignoreMap":28},[33,22054,22055],{"class":35,"line":36},[33,22056,22057],{"class":39},"# pip install pandas openpyxl python-docx 
reportlab\n",[33,22059,22060,22062,22064,22066],{"class":35,"line":43},[33,22061,164],{"class":163},[33,22063,492],{"class":167},[33,22065,495],{"class":163},[33,22067,498],{"class":167},[33,22069,22070,22072,22074,22076],{"class":35,"line":61},[33,22071,190],{"class":163},[33,22073,193],{"class":167},[33,22075,164],{"class":163},[33,22077,198],{"class":167},[33,22079,22080,22082,22084,22086],{"class":35,"line":73},[33,22081,190],{"class":163},[33,22083,13880],{"class":167},[33,22085,164],{"class":163},[33,22087,13885],{"class":167},[33,22089,22090],{"class":35,"line":88},[33,22091,92],{"emptyLinePlaceholder":91},[33,22093,22094,22096,22098,22100,22102],{"class":35,"line":95},[33,22095,4615],{"class":50},[33,22097,212],{"class":163},[33,22099,215],{"class":167},[33,22101,17147],{"class":54},[33,22103,221],{"class":167},[33,22105,22106],{"class":35,"line":101},[33,22107,92],{"emptyLinePlaceholder":91},[33,22109,22110,22112,22115],{"class":35,"line":171},[33,22111,562],{"class":163},[33,22113,22114],{"class":46}," write_segment",[33,22116,22117],{"class":167},"(args):\n",[33,22119,22120,22123,22125],{"class":35,"line":179},[33,22121,22122],{"class":167},"    segment, rows_json, output_dir_str ",[33,22124,242],{"class":163},[33,22126,22127],{"class":167}," args\n",[33,22129,22130,22132,22134,22136,22139],{"class":35,"line":187},[33,22131,1627],{"class":163},[33,22133,492],{"class":167},[33,22135,495],{"class":163},[33,22137,22138],{"class":167}," pd  ",[33,22140,22141],{"class":39},"# re-import inside worker process\n",[33,22143,22144,22146,22148,22150],{"class":35,"line":201},[33,22145,3878],{"class":163},[33,22147,193],{"class":167},[33,22149,164],{"class":163},[33,22151,198],{"class":167},[33,22153,22154],{"class":35,"line":206},[33,22155,22156],{"class":39},"    # Import your actual writer functions here\n",[33,22158,22159,22162,22164,22167,22170,22172,22174],{"class":35,"line":224},[33,22160,22161],{"class":167},"    group 
",[33,22163,242],{"class":163},[33,22165,22166],{"class":167}," pd.read_json(rows_json, ",[33,22168,22169],{"class":238},"orient",[33,22171,242],{"class":163},[33,22173,21222],{"class":54},[33,22175,221],{"class":167},[33,22177,22178,22181,22183],{"class":35,"line":229},[33,22179,22180],{"class":167},"    output_dir ",[33,22182,242],{"class":163},[33,22184,22185],{"class":167}," Path(output_dir_str)\n",[33,22187,22188],{"class":35,"line":235},[33,22189,22190],{"class":39},"    # ... call df_to_excel, df_to_word, df_to_pdf\n",[33,22192,22193,22195,22198,22200],{"class":35,"line":250},[33,22194,1332],{"class":163},[33,22196,22197],{"class":167}," segment, ",[33,22199,928],{"class":50},[33,22201,22202],{"class":167},"(group)\n",[33,22204,22205],{"class":35,"line":266},[33,22206,92],{"emptyLinePlaceholder":91},[33,22208,22209,22211,22213,22215,22217],{"class":35,"line":290},[33,22210,13459],{"class":167},[33,22212,242],{"class":163},[33,22214,9481],{"class":167},[33,22216,16249],{"class":54},[33,22218,221],{"class":167},[33,22220,22221,22224,22226],{"class":35,"line":295},[33,22222,22223],{"class":167},"tasks ",[33,22225,242],{"class":163},[33,22227,7473],{"class":167},[33,22229,22230,22233,22235,22237,22239,22241,22243,22245,22247],{"class":35,"line":300},[33,22231,22232],{"class":167},"    (region, 
group.to_json(",[33,22234,22169],{"class":238},[33,22236,242],{"class":163},[33,22238,21222],{"class":54},[33,22240,18525],{"class":167},[33,22242,1053],{"class":50},[33,22244,602],{"class":167},[33,22246,4615],{"class":50},[33,22248,371],{"class":167},[33,22250,22251,22253,22255,22257,22259,22261],{"class":35,"line":317},[33,22252,656],{"class":163},[33,22254,17975],{"class":167},[33,22256,662],{"class":163},[33,22258,17980],{"class":167},[33,22260,16649],{"class":54},[33,22262,221],{"class":167},[33,22264,22265],{"class":35,"line":332},[33,22266,9202],{"class":167},[33,22268,22269,22272,22275,22278,22280,22282,22284,22286],{"class":35,"line":347},[33,22270,22271],{"class":163},"with",[33,22273,22274],{"class":167}," ProcessPoolExecutor(",[33,22276,22277],{"class":238},"max_workers",[33,22279,242],{"class":163},[33,22281,1503],{"class":50},[33,22283,1649],{"class":167},[33,22285,495],{"class":163},[33,22287,14105],{"class":167},[33,22289,22290,22293,22295,22298,22300,22302,22304,22306,22308],{"class":35,"line":374},[33,22291,22292],{"class":167},"    futures ",[33,22294,242],{"class":163},[33,22296,22297],{"class":167}," {pool.submit(write_segment, t): t[",[33,22299,748],{"class":50},[33,22301,763],{"class":167},[33,22303,6124],{"class":163},[33,22305,10818],{"class":167},[33,22307,662],{"class":163},[33,22309,22310],{"class":167}," tasks}\n",[33,22312,22313,22315,22317,22319],{"class":35,"line":397},[33,22314,656],{"class":163},[33,22316,14131],{"class":167},[33,22318,662],{"class":163},[33,22320,14136],{"class":167},[33,22322,22323,22326,22328],{"class":35,"line":653},[33,22324,22325],{"class":167},"        seg, n ",[33,22327,242],{"class":163},[33,22329,14163],{"class":167},[33,22331,22332,22334,22336,22338,22341,22343,22346,22348,22350,22352,22355,22357,22359],{"class":35,"line":667},[33,22333,9414],{"class":50},[33,22335,602],{"class":167},[33,22337,4059],{"class":163},[33,22339,22340],{"class":54},"\"Done: 
",[33,22342,1115],{"class":50},[33,22344,22345],{"class":167},"seg",[33,22347,1121],{"class":50},[33,22349,17583],{"class":54},[33,22351,1115],{"class":50},[33,22353,22354],{"class":167},"n",[33,22356,1121],{"class":50},[33,22358,18029],{"class":54},[33,22360,221],{"class":167},[14,22362,22363],{},"Pass the group as JSON (not a raw DataFrame) to avoid pickling issues with some openpyxl internals.",[18,22365,4271],{"id":4270},[4273,22367,22368,22378],{},[4276,22369,22370],{},[4279,22371,22372,22374,22376],{},[4282,22373,14317],{},[4282,22375,4287],{},[4282,22377,4290],{},[4292,22379,22380,22405,22425,22445,22460,22477],{},[4279,22381,22382,22388,22397],{},[4297,22383,22384,22387],{},[30,22385,22386],{},"openpyxl.utils.exceptions.InvalidFileException"," when re-opening",[4297,22389,22390,8877,22393,22396],{},[30,22391,22392],{},"pd.ExcelWriter",[30,22394,22395],{},"engine=\"openpyxl\""," left the file handle open",[4297,22398,17059,22399,22401,22402],{},[30,22400,22271],{}," statement; flush before ",[30,22403,22404],{},"load_workbook",[4279,22406,22407,22412,22415],{},[4297,22408,22409],{},[30,22410,22411],{},"reportlab.platypus.doctemplate.LayoutError: Flowable ... 
too large",[4297,22413,22414],{},"A single table cell overflows the page width",[4297,22416,22417,22418,22420,22421,22424],{},"Reduce ",[30,22419,19795],{},"; wrap long strings with ",[30,22422,22423],{},"Paragraph()"," inside cells",[4279,22426,22427,22432,22435],{},[4297,22428,22429,22431],{},[30,22430,20862],{}," in docxtpl",[4297,22433,22434],{},"Template placeholder has no matching key in context",[4297,22436,22437,22438,22441,22442,22444],{},"Audit ",[30,22439,22440],{},"context.keys()"," against template variables; set ",[30,22443,20870],{}," to skip missing",[4279,22446,22447,22452,22455],{},[4297,22448,22449],{},[30,22450,22451],{},"WeasyPrint OSError: no library called \"libpango\"",[4297,22453,22454],{},"System C libraries not installed",[4297,22456,22457],{},[30,22458,22459],{},"sudo apt-get install libpango-1.0-0 libcairo2 libpangocairo-1.0-0",[4279,22461,22462,22465,22471],{},[4297,22463,22464],{},"PDF file is valid but all numeric columns show 0",[4297,22466,22467,22468,22470],{},"Numeric column dtype is ",[30,22469,11888],{}," after reading CSV",[4297,22472,9574,22473,22476],{},[30,22474,22475],{},"pd.to_numeric(df[col], errors=\"coerce\")"," on all numeric columns before writing",[4279,22478,22479,22485,22488],{},[4297,22480,22481,22482],{},"Excel totals row shows ",[30,22483,22484],{},"#REF!",[4297,22486,22487],{},"Row reference in SUM formula miscalculated after rows removed",[4297,22489,22490,22491,22494],{},"Recompute ",[30,22492,22493],{},"ws.max_row"," after all data rows are written; use explicit row indices",[18,22496,4402],{"id":4401},[14,22498,22499,22500,10065,22503,22506,22507,22510],{},"This script reads a CSV, validates shape, fans out to all three formats per segment, and validates outputs. 
Pass ",[30,22501,22502],{},"--input",[30,22504,22505],{},"--output-dir","; ",[30,22508,22509],{},"--formats"," controls which outputs to generate.",[23,22512,22514],{"className":126,"code":22513,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl python-docx reportlab\n\"\"\"\npipeline_report_fanout.py — fan-out pipeline DataFrame to Excel, Word, and PDF.\n\nUsage:\n    python pipeline_report_fanout.py \\\n        --input \u002Ftmp\u002Fpipeline_sample.csv \\\n        --output-dir \u002Ftmp\u002Freports \\\n        --segment-col region \\\n        --formats excel word pdf\n\"\"\"\nimport argparse\nimport sys\nfrom pathlib import Path\n\nimport pandas as pd\nfrom openpyxl import load_workbook\nfrom openpyxl.styles import Font, PatternFill, Alignment\nfrom openpyxl.utils import get_column_letter\nfrom docx import Document\nfrom docx.shared import Pt, RGBColor\nfrom reportlab.lib.pagesizes import A4\nfrom reportlab.lib import colors\nfrom reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle\nfrom reportlab.lib.units import cm\nfrom reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle\n\n\n# ── helpers ──────────────────────────────────────────────────────────────────\n\ndef safe_name(segment: str) -> str:\n    return segment.replace(\" \", \"_\").lower()\n\n\ndef validate_df(df: pd.DataFrame, segment_col: str) -> pd.DataFrame:\n    if segment_col not in df.columns:\n        raise ValueError(f\"Segment column '{segment_col}' not found in DataFrame\")\n    numeric_cols = [c for c in df.columns if c != segment_col and\n                    pd.api.types.is_numeric_dtype(df[c])]\n    for col in numeric_cols:\n        df[col] = pd.to_numeric(df[col], errors=\"coerce\")\n    null_total = df.isna().sum().sum()\n    if null_total:\n        print(f\"Warning: {null_total} null values found after coercion\", file=sys.stderr)\n    return df\n\n\n# ── Excel 
─────────────────────────────────────────────────────────────────────\n\ndef write_excel(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:\n    path = output_dir \u002F f\"pipeline-reports-{safe_name(segment)}.xlsx\"\n    try:\n        with pd.ExcelWriter(path, engine=\"openpyxl\") as writer:\n            df.to_excel(writer, sheet_name=\"Data\", index=False)\n    except PermissionError:\n        raise RuntimeError(f\"Cannot write Excel (file locked?): {path}\")\n\n    wb = load_workbook(path)\n    ws = wb[\"Data\"]\n    header_fill = PatternFill(\"solid\", fgColor=\"2563EB\")\n    total_fill  = PatternFill(\"solid\", fgColor=\"DBEAFE\")\n    for cell in ws[1]:\n        cell.font = Font(bold=True, color=\"FFFFFF\", size=11)\n        cell.fill = header_fill\n        cell.alignment = Alignment(horizontal=\"center\")\n\n    numeric_headers = [\n        ws.cell(1, c).value for c in range(1, ws.max_column + 1)\n        if ws.cell(1, c).value and pd.api.types.is_numeric_dtype(df.get(ws.cell(1, c).value, pd.Series(dtype=float)))\n    ]\n    total_row = ws.max_row + 1\n    for col_idx in range(1, ws.max_column + 1):\n        header = ws.cell(1, col_idx).value\n        cell   = ws.cell(total_row, col_idx)\n        letter = get_column_letter(col_idx)\n        if header in numeric_headers:\n            cell.value = f\"=SUM({letter}2:{letter}{ws.max_row - 1})\"\n            cell.font  = Font(bold=True)\n            cell.fill  = total_fill\n        elif col_idx == 1:\n            cell.value = \"TOTAL\"\n            cell.font  = Font(bold=True)\n            cell.fill  = total_fill\n\n    wb.save(path)\n    return path\n\n\n# ── Word ──────────────────────────────────────────────────────────────────────\n\ndef write_word(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:\n    path = output_dir \u002F f\"pipeline-reports-{safe_name(segment)}.docx\"\n    doc  = Document()\n\n    heading = doc.add_heading(f\"Pipeline Report — {segment}\", level=1)\n    
heading.runs[0].font.color.rgb = RGBColor(0x0F, 0x17, 0x2A)\n\n    numeric_df = df.select_dtypes(include=\"number\")\n    totals     = numeric_df.sum()\n    revenue    = totals.get(\"revenue\", 0)\n    margin     = totals.get(\"margin\", revenue - totals.get(\"cost\", 0))\n    margin_pct = (margin \u002F revenue * 100) if revenue else 0.0\n\n    p = doc.add_paragraph(\n        f\"Segment {segment}: {len(df)} rows, revenue ${revenue:,.0f}, margin {margin_pct:.1f}%.\"\n    )\n    p.runs[0].font.size = Pt(11)\n\n    doc.add_heading(\"Detail\", level=2)\n    cols  = list(df.columns)\n    table = doc.add_table(rows=1, cols=len(cols))\n    table.style = \"Table Grid\"\n    hdr = table.rows[0].cells\n    for i, col in enumerate(cols):\n        hdr[i].text = col\n        hdr[i].paragraphs[0].runs[0].font.bold = True\n\n    for _, row in df.iterrows():\n        cells = table.add_row().cells\n        for i, col in enumerate(cols):\n            val = row[col]\n            cells[i].text = (\n                f\"${val:,.0f}\" if col in (\"revenue\", \"cost\", \"margin\")\n                else f\"{int(val):,}\" if col == \"units\"\n                else str(val)\n            )\n\n    try:\n        doc.save(path)\n    except PermissionError:\n        raise RuntimeError(f\"Cannot write Word doc (file locked?): {path}\")\n\n    return path\n\n\n# ── PDF ───────────────────────────────────────────────────────────────────────\n\ndef write_pdf(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:\n    path   = output_dir \u002F f\"pipeline-reports-{safe_name(segment)}.pdf\"\n    styles = getSampleStyleSheet()\n    accent = colors.HexColor(\"#2563EB\")\n    soft   = colors.HexColor(\"#DBEAFE\")\n    border = colors.HexColor(\"#E2E8F0\")\n\n    title_style = ParagraphStyle(\n        \"Title2\", parent=styles[\"Heading1\"], fontSize=16, textColor=accent, spaceAfter=6\n    )\n    body_style = ParagraphStyle(\n        \"Body2\", parent=styles[\"Normal\"], fontSize=10, leading=14, 
spaceAfter=8\n    )\n\n    numeric_df = df.select_dtypes(include=\"number\")\n    totals     = numeric_df.sum()\n    revenue    = totals.get(\"revenue\", 0)\n    margin     = totals.get(\"margin\", 0)\n    margin_pct = (margin \u002F revenue * 100) if revenue else 0.0\n\n    story = [\n        Paragraph(f\"Pipeline Report — {segment}\", title_style),\n        Paragraph(\n            f\"Revenue: \u003Cb>${revenue:,.0f}\u003C\u002Fb>  |  Margin: \u003Cb>{margin_pct:.1f}%\u003C\u002Fb>  |  Rows: \u003Cb>{len(df)}\u003C\u002Fb>\",\n            body_style,\n        ),\n        Spacer(1, 0.3 * cm),\n    ]\n\n    cols = list(df.columns)\n    header_row = [c.capitalize() for c in cols]\n    data = [header_row]\n    for _, row in df.iterrows():\n        data.append([\n            f\"${row[c]:,.0f}\" if c in (\"revenue\", \"cost\", \"margin\")\n            else f\"{int(row[c]):,}\" if c == \"units\"\n            else str(row[c])\n            for c in cols\n        ])\n\n    col_count = len(cols)\n    col_width  = 16.0 \u002F col_count * cm\n    tbl = Table(data, colWidths=[col_width] * col_count, repeatRows=1)\n    tbl.setStyle(TableStyle([\n        (\"BACKGROUND\",     (0, 0), (-1, 0),  accent),\n        (\"TEXTCOLOR\",      (0, 0), (-1, 0),  colors.white),\n        (\"FONTNAME\",       (0, 0), (-1, 0),  \"Helvetica-Bold\"),\n        (\"FONTSIZE\",       (0, 0), (-1, -1), 9),\n        (\"ROWBACKGROUNDS\", (0, 1), (-1, -1), [colors.white, soft]),\n        (\"GRID\",           (0, 0), (-1, -1), 0.5, border),\n        (\"ALIGN\",          (1, 0), (-1, -1), \"RIGHT\"),\n    ]))\n    story.append(tbl)\n\n    try:\n        doc = SimpleDocTemplate(\n            str(path), pagesize=A4,\n            leftMargin=2 * cm, rightMargin=2 * cm,\n            topMargin=2 * cm, bottomMargin=2 * cm,\n        )\n        doc.build(story)\n    except PermissionError:\n        raise RuntimeError(f\"Cannot write PDF (file locked?): {path}\")\n\n    return path\n\n\n# ── validation 
────────────────────────────────────────────────────────────────\n\ndef validate(segment: str, expected_rows: int, output_dir: Path, formats: list) -> None:\n    sn = safe_name(segment)\n    if \"excel\" in formats:\n        p = output_dir \u002F f\"pipeline-reports-{sn}.xlsx\"\n        xl = pd.read_excel(p, sheet_name=\"Data\", engine=\"openpyxl\")\n        actual = len(xl) - 1  # subtract TOTAL row\n        assert actual == expected_rows, f\"[{segment}] Excel: expected {expected_rows}, got {actual}\"\n        print(f\"  PASS Excel  {p.name} ({actual} rows)\")\n    if \"word\" in formats:\n        p = output_dir \u002F f\"pipeline-reports-{sn}.docx\"\n        assert p.stat().st_size > 512, f\"[{segment}] DOCX too small\"\n        print(f\"  PASS Word   {p.name}\")\n    if \"pdf\" in formats:\n        p = output_dir \u002F f\"pipeline-reports-{sn}.pdf\"\n        assert p.stat().st_size > 1024, f\"[{segment}] PDF too small\"\n        print(f\"  PASS PDF    {p.name}\")\n\n\n# ── main ──────────────────────────────────────────────────────────────────────\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Fan out pipeline DataFrame to multiple report formats.\")\n    parser.add_argument(\"--input\",       required=True, type=Path, help=\"Input CSV file\")\n    parser.add_argument(\"--output-dir\",  required=True, type=Path, help=\"Output directory\")\n    parser.add_argument(\"--segment-col\", default=\"region\", help=\"Column to split on (default: region)\")\n    parser.add_argument(\"--formats\",     nargs=\"+\", default=[\"excel\", \"word\", \"pdf\"],\n                        choices=[\"excel\", \"word\", \"pdf\"], help=\"Output formats to generate\")\n    args = parser.parse_args()\n\n    if not args.input.exists():\n        sys.exit(f\"Input file not found: {args.input}\")\n\n    args.output_dir.mkdir(parents=True, exist_ok=True)\n\n    try:\n        df = pd.read_csv(args.input)\n    except Exception as exc:\n        sys.exit(f\"Failed to read CSV: 
{exc}\")\n\n    df = validate_df(df, args.segment_col)\n\n    print(f\"Input shape: {df.shape}  |  Segments: {df[args.segment_col].nunique()}\")\n\n    writers = {\n        \"excel\": write_excel,\n        \"word\":  write_word,\n        \"pdf\":   write_pdf,\n    }\n\n    for segment, group in df.groupby(args.segment_col):\n        print(f\"\\nSegment: {segment}  ({len(group)} rows)\")\n        for fmt in args.formats:\n            try:\n                out = writers[fmt](group.copy(), str(segment), args.output_dir)\n                print(f\"  Wrote {fmt}: {out}\")\n            except Exception as exc:\n                print(f\"  ERROR {fmt} [{segment}]: {exc}\", file=sys.stderr)\n\n        print(f\"  Validating…\")\n        try:\n            validate(str(segment), len(group), args.output_dir, args.formats)\n        except AssertionError as exc:\n            print(f\"  VALIDATION FAILED: {exc}\", file=sys.stderr)\n\n    print(\"\\nDone.\")\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,22515,22516,22520,22524,22529,22533,22537,22545,22552,22559,22566,22571,22575,22581,22587,22597,22601,22611,22621,22632,22642,22652,22663,22673,22683,22693,22703,22714,22718,22722,22727,22731,22748,22762,22766,22770,22784,22797,22822,22849,22854,22864,22880,22890,22897,22927,22933,22937,22941,22946,22950,22963,22986,22992,23010,23030,23038,23061,23065,23073,23085,23106,23127,23141,23173,23182,23198,23202,23210,23240,23270,23274,23286,23308,23320,23329,23337,23347,23379,23396,23406,23418,23426,23442,23450,23454,23458,23464,23468,23472,23477,23481,23494,23516,23525,23529,23560,23591,23595,23615,23625,23643,23669,23698,23702,23711,23756,23760,23776,23780,23797,23808,23832,23840,23852,23864,23873,23889,23893,23903,23911,23923,23933,23942,23980,24008,24017,24022,24026,24032,24036,24044,24067,24071,24077,24081,24085,24090,24094,24107,24130,24138,24151,24164,24177,24181,24189,24229,24233,24241,24280,24284,24288,24304,24312,24328,24344,24370,24374,24382,24400,24404,24443,24447,24451,24
466,24470,24474,24484,24503,24513,24523,24528,24565,24592,24601,24612,24617,24621,24633,24652,24680,24684,24712,24740,24770,24802,24831,24865,24898,24902,24906,24910,24916,24924,24936,24958,24980,24984,24988,24996,25019,25023,25029,25033,25037,25042,25046,25073,25083,25095,25119,25145,25164,25206,25235,25246,25268,25294,25315,25326,25348,25373,25394,25398,25402,25407,25411,25420,25437,25471,25504,25530,25570,25600,25608,25612,25621,25641,25645,25666,25670,25676,25685,25695,25714,25718,25727,25731,25761,25765,25774,25782,25790,25798,25802,25806,25818,25852,25864,25870,25886,25917,25928,25973,25978,25992,25999,26015,26026,26054,26059,26075,26080,26085,26098],{"__ignoreMap":28},[33,22517,22518],{"class":35,"line":36},[33,22519,22057],{"class":39},[33,22521,22522],{"class":35,"line":43},[33,22523,139],{"class":54},[33,22525,22526],{"class":35,"line":61},[33,22527,22528],{"class":54},"pipeline_report_fanout.py — fan-out pipeline DataFrame to Excel, Word, and PDF.\n",[33,22530,22531],{"class":35,"line":73},[33,22532,92],{"emptyLinePlaceholder":91},[33,22534,22535],{"class":35,"line":88},[33,22536,4435],{"class":54},[33,22538,22539,22542],{"class":35,"line":95},[33,22540,22541],{"class":54},"    python pipeline_report_fanout.py ",[33,22543,22544],{"class":50},"\\\n",[33,22546,22547,22550],{"class":35,"line":101},[33,22548,22549],{"class":54},"        --input \u002Ftmp\u002Fpipeline_sample.csv ",[33,22551,22544],{"class":50},[33,22553,22554,22557],{"class":35,"line":171},[33,22555,22556],{"class":54},"        --output-dir \u002Ftmp\u002Freports ",[33,22558,22544],{"class":50},[33,22560,22561,22564],{"class":35,"line":179},[33,22562,22563],{"class":54},"        --segment-col region ",[33,22565,22544],{"class":50},[33,22567,22568],{"class":35,"line":187},[33,22569,22570],{"class":54},"        --formats excel word 
pdf\n",[33,22572,22573],{"class":35,"line":201},[33,22574,139],{"class":54},[33,22576,22577,22579],{"class":35,"line":206},[33,22578,164],{"class":163},[33,22580,4461],{"class":167},[33,22582,22583,22585],{"class":35,"line":224},[33,22584,164],{"class":163},[33,22586,168],{"class":167},[33,22588,22589,22591,22593,22595],{"class":35,"line":229},[33,22590,190],{"class":163},[33,22592,193],{"class":167},[33,22594,164],{"class":163},[33,22596,198],{"class":167},[33,22598,22599],{"class":35,"line":235},[33,22600,92],{"emptyLinePlaceholder":91},[33,22602,22603,22605,22607,22609],{"class":35,"line":250},[33,22604,164],{"class":163},[33,22606,492],{"class":167},[33,22608,495],{"class":163},[33,22610,498],{"class":167},[33,22612,22613,22615,22617,22619],{"class":35,"line":266},[33,22614,190],{"class":163},[33,22616,17103],{"class":167},[33,22618,164],{"class":163},[33,22620,17108],{"class":167},[33,22622,22623,22625,22627,22629],{"class":35,"line":290},[33,22624,190],{"class":163},[33,22626,17115],{"class":167},[33,22628,164],{"class":163},[33,22630,22631],{"class":167}," Font, PatternFill, Alignment\n",[33,22633,22634,22636,22638,22640],{"class":35,"line":295},[33,22635,190],{"class":163},[33,22637,17127],{"class":167},[33,22639,164],{"class":163},[33,22641,17132],{"class":167},[33,22643,22644,22646,22648,22650],{"class":35,"line":300},[33,22645,190],{"class":163},[33,22647,18092],{"class":167},[33,22649,164],{"class":163},[33,22651,18097],{"class":167},[33,22653,22654,22656,22658,22660],{"class":35,"line":317},[33,22655,190],{"class":163},[33,22657,18104],{"class":167},[33,22659,164],{"class":163},[33,22661,22662],{"class":167}," Pt, 
RGBColor\n",[33,22664,22665,22667,22669,22671],{"class":35,"line":332},[33,22666,190],{"class":163},[33,22668,19044],{"class":167},[33,22670,164],{"class":163},[33,22672,19049],{"class":167},[33,22674,22675,22677,22679,22681],{"class":35,"line":347},[33,22676,190],{"class":163},[33,22678,19056],{"class":167},[33,22680,164],{"class":163},[33,22682,19061],{"class":167},[33,22684,22685,22687,22689,22691],{"class":35,"line":374},[33,22686,190],{"class":163},[33,22688,19068],{"class":167},[33,22690,164],{"class":163},[33,22692,19073],{"class":167},[33,22694,22695,22697,22699,22701],{"class":35,"line":397},[33,22696,190],{"class":163},[33,22698,19080],{"class":167},[33,22700,164],{"class":163},[33,22702,19085],{"class":167},[33,22704,22705,22707,22709,22711],{"class":35,"line":653},[33,22706,190],{"class":163},[33,22708,19092],{"class":167},[33,22710,164],{"class":163},[33,22712,22713],{"class":167}," SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle\n",[33,22715,22716],{"class":35,"line":667},[33,22717,92],{"emptyLinePlaceholder":91},[33,22719,22720],{"class":35,"line":675},[33,22721,92],{"emptyLinePlaceholder":91},[33,22723,22724],{"class":35,"line":689},[33,22725,22726],{"class":39},"# ── helpers ──────────────────────────────────────────────────────────────────\n",[33,22728,22729],{"class":35,"line":703},[33,22730,92],{"emptyLinePlaceholder":91},[33,22732,22733,22735,22738,22740,22742,22744,22746],{"class":35,"line":714},[33,22734,562],{"class":163},[33,22736,22737],{"class":46}," 
safe_name",[33,22739,21369],{"class":167},[33,22741,1053],{"class":50},[33,22743,1617],{"class":167},[33,22745,1053],{"class":50},[33,22747,574],{"class":167},[33,22749,22750,22752,22754,22756,22758,22760],{"class":35,"line":723},[33,22751,1332],{"class":163},[33,22753,17291],{"class":167},[33,22755,17294],{"class":54},[33,22757,365],{"class":167},[33,22759,7764],{"class":54},[33,22761,17301],{"class":167},[33,22763,22764],{"class":35,"line":754},[33,22765,92],{"emptyLinePlaceholder":91},[33,22767,22768],{"class":35,"line":771},[33,22769,92],{"emptyLinePlaceholder":91},[33,22771,22772,22774,22777,22780,22782],{"class":35,"line":777},[33,22773,562],{"class":163},[33,22775,22776],{"class":46}," validate_df",[33,22778,22779],{"class":167},"(df: pd.DataFrame, segment_col: ",[33,22781,1053],{"class":50},[33,22783,7668],{"class":167},[33,22785,22786,22788,22791,22793,22795],{"class":35,"line":788},[33,22787,617],{"class":163},[33,22789,22790],{"class":167}," segment_col ",[33,22792,7999],{"class":163},[33,22794,8002],{"class":163},[33,22796,8005],{"class":167},[33,22798,22799,22801,22803,22805,22807,22810,22812,22815,22817,22820],{"class":35,"line":804},[33,22800,4051],{"class":163},[33,22802,4054],{"class":50},[33,22804,602],{"class":167},[33,22806,4059],{"class":163},[33,22808,22809],{"class":54},"\"Segment column '",[33,22811,1115],{"class":50},[33,22813,22814],{"class":167},"segment_col",[33,22816,1121],{"class":50},[33,22818,22819],{"class":54},"' not found in 
DataFrame\"",[33,22821,221],{"class":167},[33,22823,22824,22826,22828,22830,22832,22834,22836,22838,22840,22842,22844,22846],{"class":35,"line":809},[33,22825,16725],{"class":167},[33,22827,242],{"class":163},[33,22829,7740],{"class":167},[33,22831,6124],{"class":163},[33,22833,7486],{"class":167},[33,22835,662],{"class":163},[33,22837,7837],{"class":167},[33,22839,2491],{"class":163},[33,22841,7486],{"class":167},[33,22843,17877],{"class":163},[33,22845,22790],{"class":167},[33,22847,22848],{"class":163},"and\n",[33,22850,22851],{"class":35,"line":819},[33,22852,22853],{"class":167},"                    pd.api.types.is_numeric_dtype(df[c])]\n",[33,22855,22856,22858,22860,22862],{"class":35,"line":829},[33,22857,656],{"class":163},[33,22859,7985],{"class":167},[33,22861,662],{"class":163},[33,22863,16756],{"class":167},[33,22865,22866,22868,22870,22872,22874,22876,22878],{"class":35,"line":834},[33,22867,12341],{"class":167},[33,22869,242],{"class":163},[33,22871,16774],{"class":167},[33,22873,8317],{"class":238},[33,22875,242],{"class":163},[33,22877,12107],{"class":54},[33,22879,221],{"class":167},[33,22881,22882,22885,22887],{"class":35,"line":839},[33,22883,22884],{"class":167},"    null_total ",[33,22886,242],{"class":163},[33,22888,22889],{"class":167}," df.isna().sum().sum()\n",[33,22891,22892,22894],{"class":35,"line":860},[33,22893,617],{"class":163},[33,22895,22896],{"class":167}," null_total:\n",[33,22898,22899,22901,22903,22905,22907,22909,22912,22914,22917,22919,22922,22924],{"class":35,"line":887},[33,22900,9414],{"class":50},[33,22902,602],{"class":167},[33,22904,4059],{"class":163},[33,22906,12747],{"class":54},[33,22908,1115],{"class":50},[33,22910,22911],{"class":167},"null_total",[33,22913,1121],{"class":50},[33,22915,22916],{"class":54}," null values found after 
coercion\"",[33,22918,365],{"class":167},[33,22920,22921],{"class":238},"file",[33,22923,242],{"class":163},[33,22925,22926],{"class":167},"sys.stderr)\n",[33,22928,22929,22931],{"class":35,"line":907},[33,22930,1332],{"class":163},[33,22932,11719],{"class":167},[33,22934,22935],{"class":35,"line":1826},[33,22936,92],{"emptyLinePlaceholder":91},[33,22938,22939],{"class":35,"line":1844},[33,22940,92],{"emptyLinePlaceholder":91},[33,22942,22943],{"class":35,"line":1858},[33,22944,22945],{"class":39},"# ── Excel ─────────────────────────────────────────────────────────────────────\n",[33,22947,22948],{"class":35,"line":1871},[33,22949,92],{"emptyLinePlaceholder":91},[33,22951,22952,22954,22957,22959,22961],{"class":35,"line":1877},[33,22953,562],{"class":163},[33,22955,22956],{"class":46}," write_excel",[33,22958,17276],{"class":167},[33,22960,1053],{"class":50},[33,22962,17281],{"class":167},[33,22964,22965,22967,22969,22971,22973,22975,22977,22979,22982,22984],{"class":35,"line":1883},[33,22966,17306],{"class":167},[33,22968,242],{"class":163},[33,22970,6393],{"class":167},[33,22972,1351],{"class":163},[33,22974,1110],{"class":163},[33,22976,17317],{"class":54},[33,22978,1115],{"class":50},[33,22980,22981],{"class":167},"safe_name(segment)",[33,22983,1121],{"class":50},[33,22985,6410],{"class":54},[33,22987,22988,22990],{"class":35,"line":1915},[33,22989,2424],{"class":163},[33,22991,574],{"class":167},[33,22993,22994,22996,22998,23000,23002,23004,23006,23008],{"class":35,"line":1926},[33,22995,2191],{"class":163},[33,22997,17348],{"class":167},[33,22999,17351],{"class":238},[33,23001,242],{"class":163},[33,23003,17356],{"class":54},[33,23005,1649],{"class":167},[33,23007,495],{"class":163},[33,23009,17363],{"class":167},[33,23011,23012,23014,23016,23018,23020,23022,23024,23026,23028],{"class":35,"line":1932},[33,23013,17368],{"class":167},[33,23015,17371],{"class":238},[33,23017,242],{"class":163},[33,23019,17376],{"class":54},[33,23021,365],{"class":167},[33,23023,
897],{"class":238},[33,23025,242],{"class":163},[33,23027,902],{"class":50},[33,23029,221],{"class":167},[33,23031,23032,23034,23036],{"class":35,"line":1938},[33,23033,2449],{"class":163},[33,23035,17393],{"class":50},[33,23037,574],{"class":167},[33,23039,23040,23042,23044,23046,23048,23051,23053,23055,23057,23059],{"class":35,"line":1950},[33,23041,4051],{"class":163},[33,23043,7590],{"class":50},[33,23045,602],{"class":167},[33,23047,4059],{"class":163},[33,23049,23050],{"class":54},"\"Cannot write Excel (file locked?): ",[33,23052,1115],{"class":50},[33,23054,2580],{"class":167},[33,23056,1121],{"class":50},[33,23058,274],{"class":54},[33,23060,221],{"class":167},[33,23062,23063],{"class":35,"line":1958},[33,23064,92],{"emptyLinePlaceholder":91},[33,23066,23067,23069,23071],{"class":35,"line":4904},[33,23068,17432],{"class":167},[33,23070,242],{"class":163},[33,23072,17437],{"class":167},[33,23074,23075,23077,23079,23081,23083],{"class":35,"line":4909},[33,23076,17442],{"class":167},[33,23078,242],{"class":163},[33,23080,17447],{"class":167},[33,23082,17376],{"class":54},[33,23084,9202],{"class":167},[33,23086,23087,23090,23092,23094,23096,23098,23100,23102,23104],{"class":35,"line":4915},[33,23088,23089],{"class":167},"    header_fill ",[33,23091,242],{"class":163},[33,23093,17185],{"class":167},[33,23095,17188],{"class":54},[33,23097,365],{"class":167},[33,23099,17193],{"class":238},[33,23101,242],{"class":163},[33,23103,17198],{"class":54},[33,23105,221],{"class":167},[33,23107,23108,23111,23113,23115,23117,23119,23121,23123,23125],{"class":35,"line":4925},[33,23109,23110],{"class":167},"    total_fill  
",[33,23112,242],{"class":163},[33,23114,17185],{"class":167},[33,23116,17188],{"class":54},[33,23118,365],{"class":167},[33,23120,17193],{"class":238},[33,23122,242],{"class":163},[33,23124,17221],{"class":54},[33,23126,221],{"class":167},[33,23128,23129,23131,23133,23135,23137,23139],{"class":35,"line":4935},[33,23130,656],{"class":163},[33,23132,17467],{"class":167},[33,23134,662],{"class":163},[33,23136,17472],{"class":167},[33,23138,734],{"class":50},[33,23140,17477],{"class":167},[33,23142,23143,23145,23147,23149,23151,23153,23155,23157,23159,23161,23163,23165,23167,23169,23171],{"class":35,"line":4941},[33,23144,17482],{"class":167},[33,23146,242],{"class":163},[33,23148,17233],{"class":167},[33,23150,17236],{"class":238},[33,23152,242],{"class":163},[33,23154,855],{"class":50},[33,23156,365],{"class":167},[33,23158,17245],{"class":238},[33,23160,242],{"class":163},[33,23162,17250],{"class":54},[33,23164,365],{"class":167},[33,23166,17255],{"class":238},[33,23168,242],{"class":163},[33,23170,17260],{"class":50},[33,23172,221],{"class":167},[33,23174,23175,23177,23179],{"class":35,"line":4950},[33,23176,17492],{"class":167},[33,23178,242],{"class":163},[33,23180,23181],{"class":167}," header_fill\n",[33,23183,23184,23186,23188,23190,23192,23194,23196],{"class":35,"line":4960},[33,23185,17502],{"class":167},[33,23187,242],{"class":163},[33,23189,17507],{"class":167},[33,23191,17510],{"class":238},[33,23193,242],{"class":163},[33,23195,17515],{"class":54},[33,23197,221],{"class":167},[33,23199,23200],{"class":35,"line":4965},[33,23201,92],{"emptyLinePlaceholder":91},[33,23203,23204,23206,23208],{"class":35,"line":4971},[33,23205,17714],{"class":167},[33,23207,242],{"class":163},[33,23209,7473],{"class":167},[33,23211,23212,23215,23217,23220,23222,23224,23226,23228,23230,23232,23234,23236,23238],{"class":35,"line":4983},[33,23213,23214],{"class":167},"        ws.cell(",[33,23216,734],{"class":50},[33,23218,23219],{"class":167},", c).value 
",[33,23221,6124],{"class":163},[33,23223,7486],{"class":167},[33,23225,662],{"class":163},[33,23227,1801],{"class":50},[33,23229,602],{"class":167},[33,23231,734],{"class":50},[33,23233,17559],{"class":167},[33,23235,1811],{"class":163},[33,23237,1814],{"class":50},[33,23239,221],{"class":167},[33,23241,23242,23244,23246,23248,23250,23252,23255,23257,23260,23263,23265,23267],{"class":35,"line":4988},[33,23243,8221],{"class":163},[33,23245,17573],{"class":167},[33,23247,734],{"class":50},[33,23249,23219],{"class":167},[33,23251,6001],{"class":163},[33,23253,23254],{"class":167}," pd.api.types.is_numeric_dtype(df.get(ws.cell(",[33,23256,734],{"class":50},[33,23258,23259],{"class":167},", c).value, pd.Series(",[33,23261,23262],{"class":238},"dtype",[33,23264,242],{"class":163},[33,23266,1720],{"class":50},[33,23268,23269],{"class":167},")))\n",[33,23271,23272],{"class":35,"line":4993},[33,23273,19559],{"class":167},[33,23275,23276,23278,23280,23282,23284],{"class":35,"line":5003},[33,23277,17699],{"class":167},[33,23279,242],{"class":163},[33,23281,17704],{"class":167},[33,23283,1811],{"class":163},[33,23285,17709],{"class":50},[33,23287,23288,23290,23292,23294,23296,23298,23300,23302,23304,23306],{"class":35,"line":5008},[33,23289,656],{"class":163},[33,23291,17741],{"class":167},[33,23293,662],{"class":163},[33,23295,1801],{"class":50},[33,23297,602],{"class":167},[33,23299,734],{"class":50},[33,23301,17559],{"class":167},[33,23303,1811],{"class":163},[33,23305,1814],{"class":50},[33,23307,1737],{"class":167},[33,23309,23310,23312,23314,23316,23318],{"class":35,"line":5014},[33,23311,17762],{"class":167},[33,23313,242],{"class":163},[33,23315,17573],{"class":167},[33,23317,734],{"class":50},[33,23319,17771],{"class":167},[33,23321,23322,23325,23327],{"class":35,"line":5019},[33,23323,23324],{"class":167},"        cell   
",[33,23326,242],{"class":163},[33,23328,17781],{"class":167},[33,23330,23331,23333,23335],{"class":35,"line":5032},[33,23332,17613],{"class":167},[33,23334,242],{"class":163},[33,23336,17618],{"class":167},[33,23338,23339,23341,23343,23345],{"class":35,"line":5039},[33,23340,8221],{"class":163},[33,23342,17788],{"class":167},[33,23344,662],{"class":163},[33,23346,17793],{"class":167},[33,23348,23349,23351,23353,23355,23357,23359,23361,23363,23365,23367,23369,23371,23373,23375,23377],{"class":35,"line":5068},[33,23350,17807],{"class":167},[33,23352,242],{"class":163},[33,23354,1110],{"class":163},[33,23356,17814],{"class":54},[33,23358,1115],{"class":50},[33,23360,17819],{"class":167},[33,23362,1121],{"class":50},[33,23364,17824],{"class":54},[33,23366,1115],{"class":50},[33,23368,17819],{"class":167},[33,23370,17831],{"class":50},[33,23372,17834],{"class":167},[33,23374,4126],{"class":163},[33,23376,11022],{"class":50},[33,23378,17841],{"class":54},[33,23380,23381,23384,23386,23388,23390,23392,23394],{"class":35,"line":5077},[33,23382,23383],{"class":167},"            cell.font  ",[33,23385,242],{"class":163},[33,23387,17233],{"class":167},[33,23389,17236],{"class":238},[33,23391,242],{"class":163},[33,23393,855],{"class":50},[33,23395,221],{"class":167},[33,23397,23398,23401,23403],{"class":35,"line":5082},[33,23399,23400],{"class":167},"            cell.fill  ",[33,23402,242],{"class":163},[33,23404,23405],{"class":167}," 
total_fill\n",[33,23407,23408,23410,23412,23414,23416],{"class":35,"line":5089},[33,23409,17895],{"class":163},[33,23411,17741],{"class":167},[33,23413,1865],{"class":163},[33,23415,1814],{"class":50},[33,23417,574],{"class":167},[33,23419,23420,23422,23424],{"class":35,"line":5098},[33,23421,17807],{"class":167},[33,23423,242],{"class":163},[33,23425,17912],{"class":54},[33,23427,23428,23430,23432,23434,23436,23438,23440],{"class":35,"line":5105},[33,23429,23383],{"class":167},[33,23431,242],{"class":163},[33,23433,17233],{"class":167},[33,23435,17236],{"class":238},[33,23437,242],{"class":163},[33,23439,855],{"class":50},[33,23441,221],{"class":167},[33,23443,23444,23446,23448],{"class":35,"line":5110},[33,23445,23400],{"class":167},[33,23447,242],{"class":163},[33,23449,23405],{"class":167},[33,23451,23452],{"class":35,"line":5115},[33,23453,92],{"emptyLinePlaceholder":91},[33,23455,23456],{"class":35,"line":5128},[33,23457,17945],{"class":167},[33,23459,23460,23462],{"class":35,"line":5135},[33,23461,1332],{"class":163},[33,23463,17952],{"class":167},[33,23465,23466],{"class":35,"line":5142},[33,23467,92],{"emptyLinePlaceholder":91},[33,23469,23470],{"class":35,"line":5151},[33,23471,92],{"emptyLinePlaceholder":91},[33,23473,23474],{"class":35,"line":5156},[33,23475,23476],{"class":39},"# ── Word ──────────────────────────────────────────────────────────────────────\n",[33,23478,23479],{"class":35,"line":5161},[33,23480,92],{"emptyLinePlaceholder":91},[33,23482,23483,23485,23488,23490,23492],{"class":35,"line":5167},[33,23484,562],{"class":163},[33,23486,23487],{"class":46}," 
write_word",[33,23489,17276],{"class":167},[33,23491,1053],{"class":50},[33,23493,17281],{"class":167},[33,23495,23496,23498,23500,23502,23504,23506,23508,23510,23512,23514],{"class":35,"line":5172},[33,23497,17306],{"class":167},[33,23499,242],{"class":163},[33,23501,6393],{"class":167},[33,23503,1351],{"class":163},[33,23505,1110],{"class":163},[33,23507,17317],{"class":54},[33,23509,1115],{"class":50},[33,23511,22981],{"class":167},[33,23513,1121],{"class":50},[33,23515,18215],{"class":54},[33,23517,23518,23521,23523],{"class":35,"line":5182},[33,23519,23520],{"class":167},"    doc  ",[33,23522,242],{"class":163},[33,23524,18229],{"class":167},[33,23526,23527],{"class":35,"line":5195},[33,23528,92],{"emptyLinePlaceholder":91},[33,23530,23531,23534,23536,23538,23540,23542,23544,23546,23548,23550,23552,23554,23556,23558],{"class":35,"line":5200},[33,23532,23533],{"class":167},"    heading ",[33,23535,242],{"class":163},[33,23537,18248],{"class":167},[33,23539,4059],{"class":163},[33,23541,18253],{"class":54},[33,23543,1115],{"class":50},[33,23545,18258],{"class":167},[33,23547,1121],{"class":50},[33,23549,274],{"class":54},[33,23551,365],{"class":167},[33,23553,18267],{"class":238},[33,23555,242],{"class":163},[33,23557,734],{"class":50},[33,23559,221],{"class":167},[33,23561,23562,23565,23567,23569,23571,23573,23575,23577,23579,23581,23583,23585,23587,23589],{"class":35,"line":5205},[33,23563,23564],{"class":167},"    
heading.runs[",[33,23566,748],{"class":50},[33,23568,18283],{"class":167},[33,23570,242],{"class":163},[33,23572,18288],{"class":167},[33,23574,18291],{"class":163},[33,23576,18294],{"class":50},[33,23578,365],{"class":167},[33,23580,18291],{"class":163},[33,23582,18301],{"class":50},[33,23584,365],{"class":167},[33,23586,18291],{"class":163},[33,23588,18308],{"class":50},[33,23590,221],{"class":167},[33,23592,23593],{"class":35,"line":5210},[33,23594,92],{"emptyLinePlaceholder":91},[33,23596,23597,23600,23602,23605,23608,23610,23613],{"class":35,"line":5215},[33,23598,23599],{"class":167},"    numeric_df ",[33,23601,242],{"class":163},[33,23603,23604],{"class":167}," df.select_dtypes(",[33,23606,23607],{"class":238},"include",[33,23609,242],{"class":163},[33,23611,23612],{"class":54},"\"number\"",[33,23614,221],{"class":167},[33,23616,23617,23620,23622],{"class":35,"line":5220},[33,23618,23619],{"class":167},"    totals     ",[33,23621,242],{"class":163},[33,23623,23624],{"class":167}," numeric_df.sum()\n",[33,23626,23627,23630,23632,23635,23637,23639,23641],{"class":35,"line":5227},[33,23628,23629],{"class":167},"    revenue    ",[33,23631,242],{"class":163},[33,23633,23634],{"class":167}," totals.get(",[33,23636,16465],{"class":54},[33,23638,365],{"class":167},[33,23640,748],{"class":50},[33,23642,221],{"class":167},[33,23644,23645,23648,23650,23652,23654,23657,23659,23661,23663,23665,23667],{"class":35,"line":5232},[33,23646,23647],{"class":167},"    margin     ",[33,23649,242],{"class":163},[33,23651,23634],{"class":167},[33,23653,16456],{"class":54},[33,23655,23656],{"class":167},", revenue ",[33,23658,4126],{"class":163},[33,23660,23634],{"class":167},[33,23662,16474],{"class":54},[33,23664,365],{"class":167},[33,23666,748],{"class":50},[33,23668,371],{"class":167},[33,23670,23671,23674,23676,23679,23681,23684,23686,23688,23690,23692,23694,23696],{"class":35,"line":5237},[33,23672,23673],{"class":167},"    margin_pct 
",[33,23675,242],{"class":163},[33,23677,23678],{"class":167}," (margin ",[33,23680,1351],{"class":163},[33,23682,23683],{"class":167}," revenue ",[33,23685,1769],{"class":163},[33,23687,18366],{"class":50},[33,23689,1649],{"class":167},[33,23691,2491],{"class":163},[33,23693,23683],{"class":167},[33,23695,7489],{"class":163},[33,23697,18377],{"class":50},[33,23699,23700],{"class":35,"line":5251},[33,23701,92],{"emptyLinePlaceholder":91},[33,23703,23704,23706,23708],{"class":35,"line":5259},[33,23705,18452],{"class":167},[33,23707,242],{"class":163},[33,23709,23710],{"class":167}," doc.add_paragraph(\n",[33,23712,23713,23715,23718,23720,23722,23724,23726,23728,23730,23732,23735,23737,23739,23741,23743,23746,23748,23750,23752,23754],{"class":35,"line":5264},[33,23714,9533],{"class":163},[33,23716,23717],{"class":54},"\"Segment ",[33,23719,1115],{"class":50},[33,23721,18258],{"class":167},[33,23723,1121],{"class":50},[33,23725,2079],{"class":54},[33,23727,4065],{"class":50},[33,23729,4068],{"class":167},[33,23731,1121],{"class":50},[33,23733,23734],{"class":54}," rows, revenue $",[33,23736,1115],{"class":50},[33,23738,18528],{"class":167},[33,23740,18410],{"class":163},[33,23742,1121],{"class":50},[33,23744,23745],{"class":54},", margin 
",[33,23747,1115],{"class":50},[33,23749,18435],{"class":167},[33,23751,18438],{"class":163},[33,23753,1121],{"class":50},[33,23755,18443],{"class":54},[33,23757,23758],{"class":35,"line":5269},[33,23759,1202],{"class":167},[33,23761,23762,23764,23766,23768,23770,23772,23774],{"class":35,"line":5283},[33,23763,18462],{"class":167},[33,23765,748],{"class":50},[33,23767,18467],{"class":167},[33,23769,242],{"class":163},[33,23771,18472],{"class":167},[33,23773,17260],{"class":50},[33,23775,221],{"class":167},[33,23777,23778],{"class":35,"line":5293},[33,23779,92],{"emptyLinePlaceholder":91},[33,23781,23782,23784,23787,23789,23791,23793,23795],{"class":35,"line":5303},[33,23783,18591],{"class":167},[33,23785,23786],{"class":54},"\"Detail\"",[33,23788,365],{"class":167},[33,23790,18267],{"class":238},[33,23792,242],{"class":163},[33,23794,1533],{"class":50},[33,23796,221],{"class":167},[33,23798,23799,23802,23804,23806],{"class":35,"line":5313},[33,23800,23801],{"class":167},"    cols  ",[33,23803,242],{"class":163},[33,23805,599],{"class":50},[33,23807,4132],{"class":167},[33,23809,23810,23812,23814,23816,23818,23820,23822,23824,23826,23828,23830],{"class":35,"line":5320},[33,23811,18621],{"class":167},[33,23813,242],{"class":163},[33,23815,18626],{"class":167},[33,23817,18629],{"class":238},[33,23819,242],{"class":163},[33,23821,734],{"class":50},[33,23823,365],{"class":167},[33,23825,18638],{"class":238},[33,23827,242],{"class":163},[33,23829,928],{"class":50},[33,23831,18645],{"class":167},[33,23833,23834,23836,23838],{"class":35,"line":5325},[33,23835,18650],{"class":167},[33,23837,242],{"class":163},[33,23839,18655],{"class":54},[33,23841,23842,23844,23846,23848,23850],{"class":35,"line":5330},[33,23843,18669],{"class":167},[33,23845,242],{"class":163},[33,23847,18674],{"class":167},[33,23849,748],{"class":50},[33,23851,18679],{"class":167},[33,23853,23854,23856,23858,23860,23862],{"class":35,"line":5344},[33,23855,656],{"class":163},[33,23857,18686],{"class":167},
[33,23859,662],{"class":163},[33,23861,7403],{"class":50},[33,23863,18693],{"class":167},[33,23865,23866,23868,23870],{"class":35,"line":5349},[33,23867,18698],{"class":167},[33,23869,242],{"class":163},[33,23871,23872],{"class":167}," col\n",[33,23874,23875,23877,23879,23881,23883,23885,23887],{"class":35,"line":5354},[33,23876,18708],{"class":167},[33,23878,748],{"class":50},[33,23880,18713],{"class":167},[33,23882,748],{"class":50},[33,23884,18718],{"class":167},[33,23886,242],{"class":163},[33,23888,2887],{"class":50},[33,23890,23891],{"class":35,"line":5368},[33,23892,92],{"emptyLinePlaceholder":91},[33,23894,23895,23897,23899,23901],{"class":35,"line":5377},[33,23896,656],{"class":163},[33,23898,8560],{"class":167},[33,23900,662],{"class":163},[33,23902,8565],{"class":167},[33,23904,23905,23907,23909],{"class":35,"line":5382},[33,23906,18747],{"class":167},[33,23908,242],{"class":163},[33,23910,18752],{"class":167},[33,23912,23913,23915,23917,23919,23921],{"class":35,"line":5389},[33,23914,5973],{"class":163},[33,23916,18686],{"class":167},[33,23918,662],{"class":163},[33,23920,7403],{"class":50},[33,23922,18693],{"class":167},[33,23924,23925,23928,23930],{"class":35,"line":5399},[33,23926,23927],{"class":167},"            val ",[33,23929,242],{"class":163},[33,23931,23932],{"class":167}," row[col]\n",[33,23934,23935,23938,23940],{"class":35,"line":5404},[33,23936,23937],{"class":167},"            cells[i].text ",[33,23939,242],{"class":163},[33,23941,1415],{"class":167},[33,23943,23944,23947,23949,23951,23954,23956,23958,23960,23962,23964,23966,23968,23970,23972,23974,23976,23978],{"class":35,"line":5409},[33,23945,23946],{"class":163},"                
f",[33,23948,18820],{"class":54},[33,23950,1115],{"class":50},[33,23952,23953],{"class":167},"val",[33,23955,18410],{"class":163},[33,23957,1121],{"class":50},[33,23959,274],{"class":54},[33,23961,9994],{"class":163},[33,23963,7985],{"class":167},[33,23965,662],{"class":163},[33,23967,17583],{"class":167},[33,23969,16465],{"class":54},[33,23971,365],{"class":167},[33,23973,16474],{"class":54},[33,23975,365],{"class":167},[33,23977,16456],{"class":54},[33,23979,221],{"class":167},[33,23981,23982,23984,23986,23988,23990,23993,23995,23997,23999,24001,24003,24005],{"class":35,"line":5414},[33,23983,8634],{"class":163},[33,23985,1110],{"class":163},[33,23987,274],{"class":54},[33,23989,18790],{"class":50},[33,23991,23992],{"class":167},"(val)",[33,23994,18801],{"class":163},[33,23996,1121],{"class":50},[33,23998,274],{"class":54},[33,24000,9994],{"class":163},[33,24002,7985],{"class":167},[33,24004,1865],{"class":163},[33,24006,24007],{"class":54}," \"units\"\n",[33,24009,24010,24012,24014],{"class":35,"line":5419},[33,24011,8634],{"class":163},[33,24013,7887],{"class":50},[33,24015,24016],{"class":167},"(val)\n",[33,24018,24019],{"class":35,"line":5425},[33,24020,24021],{"class":167},"            )\n",[33,24023,24024],{"class":35,"line":5430},[33,24025,92],{"emptyLinePlaceholder":91},[33,24027,24028,24030],{"class":35,"line":5440},[33,24029,2424],{"class":163},[33,24031,574],{"class":167},[33,24033,24034],{"class":35,"line":5451},[33,24035,18880],{"class":167},[33,24037,24038,24040,24042],{"class":35,"line":5464},[33,24039,2449],{"class":163},[33,24041,17393],{"class":50},[33,24043,574],{"class":167},[33,24045,24046,24048,24050,24052,24054,24057,24059,24061,24063,24065],{"class":35,"line":5497},[33,24047,4051],{"class":163},[33,24049,7590],{"class":50},[33,24051,602],{"class":167},[33,24053,4059],{"class":163},[33,24055,24056],{"class":54},"\"Cannot write Word doc (file locked?): 
",[33,24058,1115],{"class":50},[33,24060,2580],{"class":167},[33,24062,1121],{"class":50},[33,24064,274],{"class":54},[33,24066,221],{"class":167},[33,24068,24069],{"class":35,"line":5514},[33,24070,92],{"emptyLinePlaceholder":91},[33,24072,24073,24075],{"class":35,"line":5527},[33,24074,1332],{"class":163},[33,24076,17952],{"class":167},[33,24078,24079],{"class":35,"line":5532},[33,24080,92],{"emptyLinePlaceholder":91},[33,24082,24083],{"class":35,"line":5537},[33,24084,92],{"emptyLinePlaceholder":91},[33,24086,24087],{"class":35,"line":5543},[33,24088,24089],{"class":39},"# ── PDF ───────────────────────────────────────────────────────────────────────\n",[33,24091,24092],{"class":35,"line":5548},[33,24093,92],{"emptyLinePlaceholder":91},[33,24095,24096,24098,24101,24103,24105],{"class":35,"line":5570},[33,24097,562],{"class":163},[33,24099,24100],{"class":46}," write_pdf",[33,24102,17276],{"class":167},[33,24104,1053],{"class":50},[33,24106,17281],{"class":167},[33,24108,24109,24112,24114,24116,24118,24120,24122,24124,24126,24128],{"class":35,"line":5577},[33,24110,24111],{"class":167},"    path   ",[33,24113,242],{"class":163},[33,24115,6393],{"class":167},[33,24117,1351],{"class":163},[33,24119,1110],{"class":163},[33,24121,17317],{"class":54},[33,24123,1115],{"class":50},[33,24125,22981],{"class":167},[33,24127,1121],{"class":50},[33,24129,19246],{"class":54},[33,24131,24132,24134,24136],{"class":35,"line":5584},[33,24133,19255],{"class":167},[33,24135,242],{"class":163},[33,24137,19260],{"class":167},[33,24139,24140,24143,24145,24147,24149],{"class":35,"line":5591},[33,24141,24142],{"class":167},"    accent ",[33,24144,242],{"class":163},[33,24146,19157],{"class":167},[33,24148,19160],{"class":54},[33,24150,221],{"class":167},[33,24152,24153,24156,24158,24160,24162],{"class":35,"line":5602},[33,24154,24155],{"class":167},"    soft   
",[33,24157,242],{"class":163},[33,24159,19157],{"class":167},[33,24161,19174],{"class":54},[33,24163,221],{"class":167},[33,24165,24166,24169,24171,24173,24175],{"class":35,"line":5607},[33,24167,24168],{"class":167},"    border ",[33,24170,242],{"class":163},[33,24172,19157],{"class":167},[33,24174,19188],{"class":54},[33,24176,221],{"class":167},[33,24178,24179],{"class":35,"line":5623},[33,24180,92],{"emptyLinePlaceholder":91},[33,24182,24183,24185,24187],{"class":35,"line":5630},[33,24184,19265],{"class":167},[33,24186,242],{"class":163},[33,24188,19270],{"class":167},[33,24190,24191,24194,24196,24198,24200,24202,24204,24206,24209,24211,24214,24216,24218,24220,24223,24225,24227],{"class":35,"line":5640},[33,24192,24193],{"class":54},"        \"Title2\"",[33,24195,365],{"class":167},[33,24197,19280],{"class":238},[33,24199,242],{"class":163},[33,24201,19285],{"class":167},[33,24203,19288],{"class":54},[33,24205,8314],{"class":167},[33,24207,24208],{"class":238},"fontSize",[33,24210,242],{"class":163},[33,24212,24213],{"class":50},"16",[33,24215,365],{"class":167},[33,24217,19305],{"class":238},[33,24219,242],{"class":163},[33,24221,24222],{"class":167},"accent, ",[33,24224,19314],{"class":238},[33,24226,242],{"class":163},[33,24228,19319],{"class":50},[33,24230,24231],{"class":35,"line":5654},[33,24232,1202],{"class":167},[33,24234,24235,24237,24239],{"class":35,"line":5667},[33,24236,19328],{"class":167},[33,24238,242],{"class":163},[33,24240,19270],{"class":167},[33,24242,24243,24246,24248,24250,24252,24254,24256,24258,24260,24262,24264,24266,24268,24270,24272,24274,24276,24278],{"class":35,"line":5680},[33,24244,24245],{"class":54},"        
\"Body2\"",[33,24247,365],{"class":167},[33,24249,19280],{"class":238},[33,24251,242],{"class":163},[33,24253,19285],{"class":167},[33,24255,19348],{"class":54},[33,24257,8314],{"class":167},[33,24259,24208],{"class":238},[33,24261,242],{"class":163},[33,24263,3545],{"class":50},[33,24265,365],{"class":167},[33,24267,19363],{"class":238},[33,24269,242],{"class":163},[33,24271,19368],{"class":50},[33,24273,365],{"class":167},[33,24275,19314],{"class":238},[33,24277,242],{"class":163},[33,24279,19377],{"class":50},[33,24281,24282],{"class":35,"line":5686},[33,24283,1202],{"class":167},[33,24285,24286],{"class":35,"line":5706},[33,24287,92],{"emptyLinePlaceholder":91},[33,24289,24290,24292,24294,24296,24298,24300,24302],{"class":35,"line":5712},[33,24291,23599],{"class":167},[33,24293,242],{"class":163},[33,24295,23604],{"class":167},[33,24297,23607],{"class":238},[33,24299,242],{"class":163},[33,24301,23612],{"class":54},[33,24303,221],{"class":167},[33,24305,24306,24308,24310],{"class":35,"line":5726},[33,24307,23619],{"class":167},[33,24309,242],{"class":163},[33,24311,23624],{"class":167},[33,24313,24314,24316,24318,24320,24322,24324,24326],{"class":35,"line":5732},[33,24315,23629],{"class":167},[33,24317,242],{"class":163},[33,24319,23634],{"class":167},[33,24321,16465],{"class":54},[33,24323,365],{"class":167},[33,24325,748],{"class":50},[33,24327,221],{"class":167},[33,24329,24330,24332,24334,24336,24338,24340,24342],{"class":35,"line":5741},[33,24331,23647],{"class":167},[33,24333,242],{"class":163},[33,24335,23634],{"class":167},[33,24337,16456],{"class":54},[33,24339,365],{"class":167},[33,24341,748],{"class":50},[33,24343,221],{"class":167},[33,24345,24346,24348,24350,24352,24354,24356,24358,24360,24362,24364,24366,24368],{"class":35,"line":5751},[33,24347,23673],{"class":167},[33,24349,242],{"class":163},[33,24351,23678],{"class":167},[33,24353,1351],{"class":163},[33,24355,23683],{"class":167},[33,24357,1769],{"class":163},[33,24359,18366],{"class":50},[33
,24361,1649],{"class":167},[33,24363,2491],{"class":163},[33,24365,23683],{"class":167},[33,24367,7489],{"class":163},[33,24369,18377],{"class":50},[33,24371,24372],{"class":35,"line":5756},[33,24373,92],{"emptyLinePlaceholder":91},[33,24375,24376,24378,24380],{"class":35,"line":5765},[33,24377,19444],{"class":167},[33,24379,242],{"class":163},[33,24381,7473],{"class":167},[33,24383,24384,24386,24388,24390,24392,24394,24396,24398],{"class":35,"line":5806},[33,24385,19453],{"class":167},[33,24387,4059],{"class":163},[33,24389,18253],{"class":54},[33,24391,1115],{"class":50},[33,24393,18258],{"class":167},[33,24395,1121],{"class":50},[33,24397,274],{"class":54},[33,24399,19468],{"class":167},[33,24401,24402],{"class":35,"line":5816},[33,24403,19473],{"class":167},[33,24405,24406,24408,24411,24413,24415,24417,24419,24422,24424,24426,24428,24430,24433,24435,24437,24439,24441],{"class":35,"line":5824},[33,24407,12744],{"class":163},[33,24409,24410],{"class":54},"\"Revenue: \u003Cb>$",[33,24412,1115],{"class":50},[33,24414,18528],{"class":167},[33,24416,18410],{"class":163},[33,24418,1121],{"class":50},[33,24420,24421],{"class":54},"\u003C\u002Fb>  |  Margin: \u003Cb>",[33,24423,1115],{"class":50},[33,24425,18435],{"class":167},[33,24427,18438],{"class":163},[33,24429,1121],{"class":50},[33,24431,24432],{"class":54},"%\u003C\u002Fb>  |  Rows: 
\u003Cb>",[33,24434,4065],{"class":50},[33,24436,4068],{"class":167},[33,24438,1121],{"class":50},[33,24440,19525],{"class":54},[33,24442,247],{"class":167},[33,24444,24445],{"class":35,"line":5834},[33,24446,19532],{"class":167},[33,24448,24449],{"class":35,"line":5852},[33,24450,19537],{"class":167},[33,24452,24453,24455,24457,24459,24462,24464],{"class":35,"line":5864},[33,24454,19542],{"class":167},[33,24456,734],{"class":50},[33,24458,365],{"class":167},[33,24460,24461],{"class":50},"0.3",[33,24463,1156],{"class":163},[33,24465,19554],{"class":167},[33,24467,24468],{"class":35,"line":5870},[33,24469,19559],{"class":167},[33,24471,24472],{"class":35,"line":5877},[33,24473,92],{"emptyLinePlaceholder":91},[33,24475,24476,24478,24480,24482],{"class":35,"line":5892},[33,24477,18609],{"class":167},[33,24479,242],{"class":163},[33,24481,599],{"class":50},[33,24483,4132],{"class":167},[33,24485,24486,24489,24491,24494,24496,24498,24500],{"class":35,"line":5898},[33,24487,24488],{"class":167},"    header_row ",[33,24490,242],{"class":163},[33,24492,24493],{"class":167}," [c.capitalize() ",[33,24495,6124],{"class":163},[33,24497,7486],{"class":167},[33,24499,662],{"class":163},[33,24501,24502],{"class":167}," cols]\n",[33,24504,24505,24508,24510],{"class":35,"line":5907},[33,24506,24507],{"class":167},"    data ",[33,24509,242],{"class":163},[33,24511,24512],{"class":167}," [header_row]\n",[33,24514,24515,24517,24519,24521],{"class":35,"line":5917},[33,24516,656],{"class":163},[33,24518,8560],{"class":167},[33,24520,662],{"class":163},[33,24522,8565],{"class":167},[33,24524,24525],{"class":35,"line":5922},[33,24526,24527],{"class":167},"        
data.append([\n",[33,24529,24530,24532,24534,24536,24539,24541,24543,24545,24547,24549,24551,24553,24555,24557,24559,24561,24563],{"class":35,"line":5928},[33,24531,12744],{"class":163},[33,24533,18820],{"class":54},[33,24535,1115],{"class":50},[33,24537,24538],{"class":167},"row[c]",[33,24540,18410],{"class":163},[33,24542,1121],{"class":50},[33,24544,274],{"class":54},[33,24546,9994],{"class":163},[33,24548,7486],{"class":167},[33,24550,662],{"class":163},[33,24552,17583],{"class":167},[33,24554,16465],{"class":54},[33,24556,365],{"class":167},[33,24558,16474],{"class":54},[33,24560,365],{"class":167},[33,24562,16456],{"class":54},[33,24564,221],{"class":167},[33,24566,24567,24569,24571,24573,24575,24578,24580,24582,24584,24586,24588,24590],{"class":35,"line":5933},[33,24568,8705],{"class":163},[33,24570,1110],{"class":163},[33,24572,274],{"class":54},[33,24574,18790],{"class":50},[33,24576,24577],{"class":167},"(row[c])",[33,24579,18801],{"class":163},[33,24581,1121],{"class":50},[33,24583,274],{"class":54},[33,24585,9994],{"class":163},[33,24587,7486],{"class":167},[33,24589,1865],{"class":163},[33,24591,24007],{"class":54},[33,24593,24594,24596,24598],{"class":35,"line":5939},[33,24595,8705],{"class":163},[33,24597,7887],{"class":50},[33,24599,24600],{"class":167},"(row[c])\n",[33,24602,24603,24605,24607,24609],{"class":35,"line":5950},[33,24604,1793],{"class":163},[33,24606,7486],{"class":167},[33,24608,662],{"class":163},[33,24610,24611],{"class":167}," cols\n",[33,24613,24614],{"class":35,"line":5959},[33,24615,24616],{"class":167},"        ])\n",[33,24618,24619],{"class":35,"line":5970},[33,24620,92],{"emptyLinePlaceholder":91},[33,24622,24623,24626,24628,24630],{"class":35,"line":5982},[33,24624,24625],{"class":167},"    col_count ",[33,24627,242],{"class":163},[33,24629,4037],{"class":50},[33,24631,24632],{"class":167},"(cols)\n",[33,24634,24635,24638,24640,24643,24645,24648,24650],{"class":35,"line":5992},[33,24636,24637],{"class":167},"    col_width  
",[33,24639,242],{"class":163},[33,24641,24642],{"class":50}," 16.0",[33,24644,1107],{"class":163},[33,24646,24647],{"class":167}," col_count ",[33,24649,1769],{"class":163},[33,24651,19085],{"class":167},[33,24653,24654,24656,24658,24660,24662,24664,24667,24669,24672,24674,24676,24678],{"class":35,"line":6016},[33,24655,14864],{"class":167},[33,24657,242],{"class":163},[33,24659,20372],{"class":167},[33,24661,19795],{"class":238},[33,24663,242],{"class":163},[33,24665,24666],{"class":167},"[col_width] ",[33,24668,1769],{"class":163},[33,24670,24671],{"class":167}," col_count, ",[33,24673,19803],{"class":238},[33,24675,242],{"class":163},[33,24677,734],{"class":50},[33,24679,221],{"class":167},[33,24681,24682],{"class":35,"line":6040},[33,24683,19814],{"class":167},[33,24685,24686,24688,24690,24693,24695,24697,24699,24701,24703,24705,24707,24709],{"class":35,"line":6054},[33,24687,19819],{"class":167},[33,24689,19822],{"class":54},[33,24691,24692],{"class":167},",     (",[33,24694,748],{"class":50},[33,24696,365],{"class":167},[33,24698,748],{"class":50},[33,24700,19834],{"class":167},[33,24702,4126],{"class":163},[33,24704,734],{"class":50},[33,24706,365],{"class":167},[33,24708,748],{"class":50},[33,24710,24711],{"class":167},"),  accent),\n",[33,24713,24714,24716,24718,24721,24723,24725,24727,24729,24731,24733,24735,24737],{"class":35,"line":6060},[33,24715,19819],{"class":167},[33,24717,19855],{"class":54},[33,24719,24720],{"class":167},",      (",[33,24722,748],{"class":50},[33,24724,365],{"class":167},[33,24726,748],{"class":50},[33,24728,19834],{"class":167},[33,24730,4126],{"class":163},[33,24732,734],{"class":50},[33,24734,365],{"class":167},[33,24736,748],{"class":50},[33,24738,24739],{"class":167},"),  
colors.white),\n",[33,24741,24742,24744,24746,24748,24750,24752,24754,24756,24758,24760,24762,24764,24766,24768],{"class":35,"line":6068},[33,24743,19819],{"class":167},[33,24745,19884],{"class":54},[33,24747,20027],{"class":167},[33,24749,748],{"class":50},[33,24751,365],{"class":167},[33,24753,748],{"class":50},[33,24755,19834],{"class":167},[33,24757,4126],{"class":163},[33,24759,734],{"class":50},[33,24761,365],{"class":167},[33,24763,748],{"class":50},[33,24765,10713],{"class":167},[33,24767,19908],{"class":54},[33,24769,1506],{"class":167},[33,24771,24772,24774,24776,24778,24780,24782,24784,24786,24788,24790,24792,24794,24796,24798,24800],{"class":35,"line":6073},[33,24773,19819],{"class":167},[33,24775,19917],{"class":54},[33,24777,20027],{"class":167},[33,24779,748],{"class":50},[33,24781,365],{"class":167},[33,24783,748],{"class":50},[33,24785,19834],{"class":167},[33,24787,4126],{"class":163},[33,24789,734],{"class":50},[33,24791,365],{"class":167},[33,24793,4126],{"class":163},[33,24795,734],{"class":50},[33,24797,18525],{"class":167},[33,24799,2577],{"class":50},[33,24801,1506],{"class":167},[33,24803,24804,24806,24808,24810,24812,24814,24816,24818,24820,24822,24824,24826,24828],{"class":35,"line":6079},[33,24805,19819],{"class":167},[33,24807,19950],{"class":54},[33,24809,19953],{"class":167},[33,24811,748],{"class":50},[33,24813,365],{"class":167},[33,24815,734],{"class":50},[33,24817,19834],{"class":167},[33,24819,4126],{"class":163},[33,24821,734],{"class":50},[33,24823,365],{"class":167},[33,24825,4126],{"class":163},[33,24827,734],{"class":50},[33,24829,24830],{"class":167},"), [colors.white, soft]),\n",[33,24832,24833,24835,24837,24840,24842,24844,24846,24848,24850,24852,24854,24856,24858,24860,24862],{"class":35,"line":6084},[33,24834,19819],{"class":167},[33,24836,19985],{"class":54},[33,24838,24839],{"class":167},",           
(",[33,24841,748],{"class":50},[33,24843,365],{"class":167},[33,24845,748],{"class":50},[33,24847,19834],{"class":167},[33,24849,4126],{"class":163},[33,24851,734],{"class":50},[33,24853,365],{"class":167},[33,24855,4126],{"class":163},[33,24857,734],{"class":50},[33,24859,18525],{"class":167},[33,24861,20011],{"class":50},[33,24863,24864],{"class":167},", border),\n",[33,24866,24867,24869,24871,24874,24876,24878,24880,24882,24884,24886,24888,24890,24892,24894,24896],{"class":35,"line":6099},[33,24868,19819],{"class":167},[33,24870,20024],{"class":54},[33,24872,24873],{"class":167},",          (",[33,24875,734],{"class":50},[33,24877,365],{"class":167},[33,24879,748],{"class":50},[33,24881,19834],{"class":167},[33,24883,4126],{"class":163},[33,24885,734],{"class":50},[33,24887,365],{"class":167},[33,24889,4126],{"class":163},[33,24891,734],{"class":50},[33,24893,18525],{"class":167},[33,24895,20050],{"class":54},[33,24897,1506],{"class":167},[33,24899,24900],{"class":35,"line":6114},[33,24901,20057],{"class":167},[33,24903,24904],{"class":35,"line":6146},[33,24905,20062],{"class":167},[33,24907,24908],{"class":35,"line":6151},[33,24909,92],{"emptyLinePlaceholder":91},[33,24911,24912,24914],{"class":35,"line":6160},[33,24913,2424],{"class":163},[33,24915,574],{"class":167},[33,24917,24918,24920,24922],{"class":35,"line":6170},[33,24919,20077],{"class":167},[33,24921,242],{"class":163},[33,24923,20082],{"class":167},[33,24925,24926,24928,24930,24932,24934],{"class":35,"line":6175},[33,24927,10673],{"class":50},[33,24929,13643],{"class":167},[33,24931,20091],{"class":238},[33,24933,242],{"class":163},[33,24935,20096],{"class":167},[33,24937,24938,24940,24942,24944,24946,24948,24950,24952,24954,24956],{"class":35,"line":6180},[33,24939,20101],{"class":238},[33,24941,242],{"class":163},[33,24943,1533],{"class":50},[33,24945,1156],{"class":163},[33,24947,19757],{"class":167},[33,24949,20112],{"class":238},[33,24951,242],{"class":163},[33,24953,1533],{"class":50},[33,24955
,1156],{"class":163},[33,24957,20121],{"class":167},[33,24959,24960,24962,24964,24966,24968,24970,24972,24974,24976,24978],{"class":35,"line":6190},[33,24961,20126],{"class":238},[33,24963,242],{"class":163},[33,24965,1533],{"class":50},[33,24967,1156],{"class":163},[33,24969,19757],{"class":167},[33,24971,20137],{"class":238},[33,24973,242],{"class":163},[33,24975,1533],{"class":50},[33,24977,1156],{"class":163},[33,24979,20121],{"class":167},[33,24981,24982],{"class":35,"line":6201},[33,24983,5867],{"class":167},[33,24985,24986],{"class":35,"line":6208},[33,24987,20154],{"class":167},[33,24989,24990,24992,24994],{"class":35,"line":6219},[33,24991,2449],{"class":163},[33,24993,17393],{"class":50},[33,24995,574],{"class":167},[33,24997,24998,25000,25002,25004,25006,25009,25011,25013,25015,25017],{"class":35,"line":6225},[33,24999,4051],{"class":163},[33,25001,7590],{"class":50},[33,25003,602],{"class":167},[33,25005,4059],{"class":163},[33,25007,25008],{"class":54},"\"Cannot write PDF (file locked?): ",[33,25010,1115],{"class":50},[33,25012,2580],{"class":167},[33,25014,1121],{"class":50},[33,25016,274],{"class":54},[33,25018,221],{"class":167},[33,25020,25021],{"class":35,"line":6231},[33,25022,92],{"emptyLinePlaceholder":91},[33,25024,25025,25027],{"class":35,"line":6258},[33,25026,1332],{"class":163},[33,25028,17952],{"class":167},[33,25030,25031],{"class":35,"line":6267},[33,25032,92],{"emptyLinePlaceholder":91},[33,25034,25035],{"class":35,"line":6282},[33,25036,92],{"emptyLinePlaceholder":91},[33,25038,25039],{"class":35,"line":6287},[33,25040,25041],{"class":39},"# ── validation ────────────────────────────────────────────────────────────────\n",[33,25043,25044],{"class":35,"line":6296},[33,25045,92],{"emptyLinePlaceholder":91},[33,25047,25048,25050,25053,25055,25057,25059,25061,25064,25067,25069,25071],{"class":35,"line":6316},[33,25049,562],{"class":163},[33,25051,25052],{"class":46}," 
validate",[33,25054,21369],{"class":167},[33,25056,1053],{"class":50},[33,25058,21374],{"class":167},[33,25060,1059],{"class":50},[33,25062,25063],{"class":167},", output_dir: Path, formats: ",[33,25065,25066],{"class":50},"list",[33,25068,1617],{"class":167},[33,25070,571],{"class":50},[33,25072,574],{"class":167},[33,25074,25075,25078,25080],{"class":35,"line":6321},[33,25076,25077],{"class":167},"    sn ",[33,25079,242],{"class":163},[33,25081,25082],{"class":167}," safe_name(segment)\n",[33,25084,25085,25087,25090,25092],{"class":35,"line":6326},[33,25086,617],{"class":163},[33,25088,25089],{"class":54}," \"excel\"",[33,25091,8002],{"class":163},[33,25093,25094],{"class":167}," formats:\n",[33,25096,25097,25100,25102,25104,25106,25108,25110,25112,25115,25117],{"class":35,"line":6343},[33,25098,25099],{"class":167},"        p ",[33,25101,242],{"class":163},[33,25103,6393],{"class":167},[33,25105,1351],{"class":163},[33,25107,1110],{"class":163},[33,25109,17317],{"class":54},[33,25111,1115],{"class":50},[33,25113,25114],{"class":167},"sn",[33,25116,1121],{"class":50},[33,25118,6410],{"class":54},[33,25120,25121,25124,25126,25129,25131,25133,25135,25137,25139,25141,25143],{"class":35,"line":6365},[33,25122,25123],{"class":167},"        xl ",[33,25125,242],{"class":163},[33,25127,25128],{"class":167}," pd.read_excel(p, ",[33,25130,17371],{"class":238},[33,25132,242],{"class":163},[33,25134,17376],{"class":54},[33,25136,365],{"class":167},[33,25138,17351],{"class":238},[33,25140,242],{"class":163},[33,25142,17356],{"class":54},[33,25144,221],{"class":167},[33,25146,25147,25150,25152,25154,25157,25159,25161],{"class":35,"line":6385},[33,25148,25149],{"class":167},"        actual ",[33,25151,242],{"class":163},[33,25153,4037],{"class":50},[33,25155,25156],{"class":167},"(xl) ",[33,25158,4126],{"class":163},[33,25160,1814],{"class":50},[33,25162,25163],{"class":39},"  # subtract TOTAL 
row\n",[33,25165,25166,25168,25171,25173,25176,25178,25180,25182,25184,25186,25189,25191,25193,25195,25197,25199,25202,25204],{"class":35,"line":6413},[33,25167,21485],{"class":163},[33,25169,25170],{"class":167}," actual ",[33,25172,1865],{"class":163},[33,25174,25175],{"class":167}," expected_rows, ",[33,25177,4059],{"class":163},[33,25179,21500],{"class":54},[33,25181,1115],{"class":50},[33,25183,18258],{"class":167},[33,25185,1121],{"class":50},[33,25187,25188],{"class":54},"] Excel: expected ",[33,25190,1115],{"class":50},[33,25192,21514],{"class":167},[33,25194,1121],{"class":50},[33,25196,21519],{"class":54},[33,25198,1115],{"class":50},[33,25200,25201],{"class":167},"actual",[33,25203,1121],{"class":50},[33,25205,7504],{"class":54},[33,25207,25208,25210,25212,25214,25217,25219,25221,25223,25225,25227,25229,25231,25233],{"class":35,"line":6427},[33,25209,9414],{"class":50},[33,25211,602],{"class":167},[33,25213,4059],{"class":163},[33,25215,25216],{"class":54},"\"  PASS Excel  ",[33,25218,1115],{"class":50},[33,25220,14190],{"class":167},[33,25222,1121],{"class":50},[33,25224,17583],{"class":54},[33,25226,1115],{"class":50},[33,25228,25201],{"class":167},[33,25230,1121],{"class":50},[33,25232,18029],{"class":54},[33,25234,221],{"class":167},[33,25236,25237,25239,25242,25244],{"class":35,"line":6449},[33,25238,617],{"class":163},[33,25240,25241],{"class":54}," 
\"word\"",[33,25243,8002],{"class":163},[33,25245,25094],{"class":167},[33,25247,25248,25250,25252,25254,25256,25258,25260,25262,25264,25266],{"class":35,"line":6454},[33,25249,25099],{"class":167},[33,25251,242],{"class":163},[33,25253,6393],{"class":167},[33,25255,1351],{"class":163},[33,25257,1110],{"class":163},[33,25259,17317],{"class":54},[33,25261,1115],{"class":50},[33,25263,25114],{"class":167},[33,25265,1121],{"class":50},[33,25267,18215],{"class":54},[33,25269,25270,25272,25275,25277,25279,25281,25283,25285,25287,25289,25291],{"class":35,"line":6459},[33,25271,21485],{"class":163},[33,25273,25274],{"class":167}," p.stat().st_size ",[33,25276,6009],{"class":163},[33,25278,21718],{"class":50},[33,25280,365],{"class":167},[33,25282,4059],{"class":163},[33,25284,21500],{"class":54},[33,25286,1115],{"class":50},[33,25288,18258],{"class":167},[33,25290,1121],{"class":50},[33,25292,25293],{"class":54},"] DOCX too small\"\n",[33,25295,25296,25298,25300,25302,25305,25307,25309,25311,25313],{"class":35,"line":6473},[33,25297,9414],{"class":50},[33,25299,602],{"class":167},[33,25301,4059],{"class":163},[33,25303,25304],{"class":54},"\"  PASS Word   ",[33,25306,1115],{"class":50},[33,25308,14190],{"class":167},[33,25310,1121],{"class":50},[33,25312,274],{"class":54},[33,25314,221],{"class":167},[33,25316,25317,25319,25322,25324],{"class":35,"line":6482},[33,25318,617],{"class":163},[33,25320,25321],{"class":54}," 
\"pdf\"",[33,25323,8002],{"class":163},[33,25325,25094],{"class":167},[33,25327,25328,25330,25332,25334,25336,25338,25340,25342,25344,25346],{"class":35,"line":6492},[33,25329,25099],{"class":167},[33,25331,242],{"class":163},[33,25333,6393],{"class":167},[33,25335,1351],{"class":163},[33,25337,1110],{"class":163},[33,25339,17317],{"class":54},[33,25341,1115],{"class":50},[33,25343,25114],{"class":167},[33,25345,1121],{"class":50},[33,25347,19246],{"class":54},[33,25349,25350,25352,25354,25356,25358,25360,25362,25364,25366,25368,25370],{"class":35,"line":6497},[33,25351,21485],{"class":163},[33,25353,25274],{"class":167},[33,25355,6009],{"class":163},[33,25357,1159],{"class":50},[33,25359,365],{"class":167},[33,25361,4059],{"class":163},[33,25363,21500],{"class":54},[33,25365,1115],{"class":50},[33,25367,18258],{"class":167},[33,25369,1121],{"class":50},[33,25371,25372],{"class":54},"] PDF too small\"\n",[33,25374,25375,25377,25379,25381,25384,25386,25388,25390,25392],{"class":35,"line":6504},[33,25376,9414],{"class":50},[33,25378,602],{"class":167},[33,25380,4059],{"class":163},[33,25382,25383],{"class":54},"\"  PASS PDF    ",[33,25385,1115],{"class":50},[33,25387,14190],{"class":167},[33,25389,1121],{"class":50},[33,25391,274],{"class":54},[33,25393,221],{"class":167},[33,25395,25396],{"class":35,"line":6510},[33,25397,92],{"emptyLinePlaceholder":91},[33,25399,25400],{"class":35,"line":6521},[33,25401,92],{"emptyLinePlaceholder":91},[33,25403,25404],{"class":35,"line":6531},[33,25405,25406],{"class":39},"# ── main 
──────────────────────────────────────────────────────────────────────\n",[33,25408,25409],{"class":35,"line":6537},[33,25410,92],{"emptyLinePlaceholder":91},[33,25412,25413,25415,25417],{"class":35,"line":6550},[33,25414,562],{"class":163},[33,25416,6636],{"class":46},[33,25418,25419],{"class":167},"():\n",[33,25421,25422,25424,25426,25428,25430,25432,25435],{"class":35,"line":6589},[33,25423,6648],{"class":167},[33,25425,242],{"class":163},[33,25427,6653],{"class":167},[33,25429,6656],{"class":238},[33,25431,242],{"class":163},[33,25433,25434],{"class":54},"\"Fan out pipeline DataFrame to multiple report formats.\"",[33,25436,221],{"class":167},[33,25438,25439,25441,25443,25446,25449,25451,25453,25455,25457,25459,25461,25464,25466,25469],{"class":35,"line":6594},[33,25440,6669],{"class":167},[33,25442,6672],{"class":54},[33,25444,25445],{"class":167},",       ",[33,25447,25448],{"class":238},"required",[33,25450,242],{"class":163},[33,25452,855],{"class":50},[33,25454,365],{"class":167},[33,25456,6677],{"class":238},[33,25458,242],{"class":163},[33,25460,6682],{"class":167},[33,25462,25463],{"class":238},"help",[33,25465,242],{"class":163},[33,25467,25468],{"class":54},"\"Input CSV file\"",[33,25470,221],{"class":167},[33,25472,25473,25475,25478,25481,25483,25485,25487,25489,25491,25493,25495,25497,25499,25502],{"class":35,"line":6603},[33,25474,6669],{"class":167},[33,25476,25477],{"class":54},"\"--output-dir\"",[33,25479,25480],{"class":167},",  ",[33,25482,25448],{"class":238},[33,25484,242],{"class":163},[33,25486,855],{"class":50},[33,25488,365],{"class":167},[33,25490,6677],{"class":238},[33,25492,242],{"class":163},[33,25494,6682],{"class":167},[33,25496,25463],{"class":238},[33,25498,242],{"class":163},[33,25500,25501],{"class":54},"\"Output 
directory\"",[33,25503,221],{"class":167},[33,25505,25506,25508,25511,25513,25515,25517,25519,25521,25523,25525,25528],{"class":35,"line":6610},[33,25507,6669],{"class":167},[33,25509,25510],{"class":54},"\"--segment-col\"",[33,25512,365],{"class":167},[33,25514,6685],{"class":238},[33,25516,242],{"class":163},[33,25518,16649],{"class":54},[33,25520,365],{"class":167},[33,25522,25463],{"class":238},[33,25524,242],{"class":163},[33,25526,25527],{"class":54},"\"Column to split on (default: region)\"",[33,25529,221],{"class":167},[33,25531,25532,25534,25537,25540,25543,25545,25548,25550,25552,25554,25556,25559,25561,25564,25566,25568],{"class":35,"line":6615},[33,25533,6669],{"class":167},[33,25535,25536],{"class":54},"\"--formats\"",[33,25538,25539],{"class":167},",     ",[33,25541,25542],{"class":238},"nargs",[33,25544,242],{"class":163},[33,25546,25547],{"class":54},"\"+\"",[33,25549,365],{"class":167},[33,25551,6685],{"class":238},[33,25553,242],{"class":163},[33,25555,8309],{"class":167},[33,25557,25558],{"class":54},"\"excel\"",[33,25560,365],{"class":167},[33,25562,25563],{"class":54},"\"word\"",[33,25565,365],{"class":167},[33,25567,15519],{"class":54},[33,25569,8935],{"class":167},[33,25571,25572,25575,25577,25579,25581,25583,25585,25587,25589,25591,25593,25595,25598],{"class":35,"line":6620},[33,25573,25574],{"class":238},"                        choices",[33,25576,242],{"class":163},[33,25578,8309],{"class":167},[33,25580,25558],{"class":54},[33,25582,365],{"class":167},[33,25584,25563],{"class":54},[33,25586,365],{"class":167},[33,25588,15519],{"class":54},[33,25590,8314],{"class":167},[33,25592,25463],{"class":238},[33,25594,242],{"class":163},[33,25596,25597],{"class":54},"\"Output formats to 
generate\"",[33,25599,221],{"class":167},[33,25601,25602,25604,25606],{"class":35,"line":6626},[33,25603,6766],{"class":167},[33,25605,242],{"class":163},[33,25607,6771],{"class":167},[33,25609,25610],{"class":35,"line":6631},[33,25611,92],{"emptyLinePlaceholder":91},[33,25613,25614,25616,25618],{"class":35,"line":6645},[33,25615,617],{"class":163},[33,25617,620],{"class":163},[33,25619,25620],{"class":167}," args.input.exists():\n",[33,25622,25623,25625,25627,25630,25632,25635,25637,25639],{"class":35,"line":6666},[33,25624,2995],{"class":167},[33,25626,4059],{"class":163},[33,25628,25629],{"class":54},"\"Input file not found: ",[33,25631,1115],{"class":50},[33,25633,25634],{"class":167},"args.input",[33,25636,1121],{"class":50},[33,25638,274],{"class":54},[33,25640,221],{"class":167},[33,25642,25643],{"class":35,"line":6694},[33,25644,92],{"emptyLinePlaceholder":91},[33,25646,25647,25650,25652,25654,25656,25658,25660,25662,25664],{"class":35,"line":6718},[33,25648,25649],{"class":167},"    args.output_dir.mkdir(",[33,25651,869],{"class":238},[33,25653,242],{"class":163},[33,25655,855],{"class":50},[33,25657,365],{"class":167},[33,25659,878],{"class":238},[33,25661,242],{"class":163},[33,25663,855],{"class":50},[33,25665,221],{"class":167},[33,25667,25668],{"class":35,"line":6724},[33,25669,92],{"emptyLinePlaceholder":91},[33,25671,25672,25674],{"class":35,"line":6732},[33,25673,2424],{"class":163},[33,25675,574],{"class":167},[33,25677,25678,25680,25682],{"class":35,"line":6745},[33,25679,7930],{"class":167},[33,25681,242],{"class":163},[33,25683,25684],{"class":167}," pd.read_csv(args.input)\n",[33,25686,25687,25689,25691,25693],{"class":35,"line":6758},[33,25688,2449],{"class":163},[33,25690,783],{"class":50},[33,25692,1852],{"class":163},[33,25694,1855],{"class":167},[33,25696,25697,25699,25701,25704,25706,25708,25710,25712],{"class":35,"line":6763},[33,25698,2995],{"class":167},[33,25700,4059],{"class":163},[33,25702,25703],{"class":54},"\"Failed to read CSV: 
",[33,25705,1115],{"class":50},[33,25707,6565],{"class":167},[33,25709,1121],{"class":50},[33,25711,274],{"class":54},[33,25713,221],{"class":167},[33,25715,25716],{"class":35,"line":6774},[33,25717,92],{"emptyLinePlaceholder":91},[33,25719,25720,25722,25724],{"class":35,"line":6779},[33,25721,4025],{"class":167},[33,25723,242],{"class":163},[33,25725,25726],{"class":167}," validate_df(df, args.segment_col)\n",[33,25728,25729],{"class":35,"line":6787},[33,25730,92],{"emptyLinePlaceholder":91},[33,25732,25733,25735,25737,25739,25742,25744,25746,25748,25750,25752,25755,25757,25759],{"class":35,"line":6797},[33,25734,7268],{"class":50},[33,25736,602],{"class":167},[33,25738,4059],{"class":163},[33,25740,25741],{"class":54},"\"Input shape: ",[33,25743,1115],{"class":50},[33,25745,9426],{"class":167},[33,25747,1121],{"class":50},[33,25749,16872],{"class":54},[33,25751,1115],{"class":50},[33,25753,25754],{"class":167},"df[args.segment_col].nunique()",[33,25756,1121],{"class":50},[33,25758,274],{"class":54},[33,25760,221],{"class":167},[33,25762,25763],{"class":35,"line":6808},[33,25764,92],{"emptyLinePlaceholder":91},[33,25766,25767,25770,25772],{"class":35,"line":6830},[33,25768,25769],{"class":167},"    writers ",[33,25771,242],{"class":163},[33,25773,16265],{"class":167},[33,25775,25776,25779],{"class":35,"line":6835},[33,25777,25778],{"class":54},"        \"excel\"",[33,25780,25781],{"class":167},": write_excel,\n",[33,25783,25784,25787],{"class":35,"line":6845},[33,25785,25786],{"class":54},"        \"word\"",[33,25788,25789],{"class":167},":  write_word,\n",[33,25791,25792,25795],{"class":35,"line":6851},[33,25793,25794],{"class":54},"        \"pdf\"",[33,25796,25797],{"class":167},":   
write_pdf,\n",[33,25799,25800],{"class":35,"line":6861},[33,25801,20781],{"class":167},[33,25803,25804],{"class":35,"line":6869},[33,25805,92],{"emptyLinePlaceholder":91},[33,25807,25808,25810,25813,25815],{"class":35,"line":6888},[33,25809,656],{"class":163},[33,25811,25812],{"class":167}," segment, group ",[33,25814,662],{"class":163},[33,25816,25817],{"class":167}," df.groupby(args.segment_col):\n",[33,25819,25820,25822,25824,25826,25828,25831,25834,25836,25838,25840,25842,25844,25846,25848,25850],{"class":35,"line":6893},[33,25821,9414],{"class":50},[33,25823,602],{"class":167},[33,25825,4059],{"class":163},[33,25827,274],{"class":54},[33,25829,25830],{"class":50},"\\n",[33,25832,25833],{"class":54},"Segment: ",[33,25835,1115],{"class":50},[33,25837,18258],{"class":167},[33,25839,1121],{"class":50},[33,25841,18019],{"class":54},[33,25843,4065],{"class":50},[33,25845,18024],{"class":167},[33,25847,1121],{"class":50},[33,25849,18029],{"class":54},[33,25851,221],{"class":167},[33,25853,25854,25856,25859,25861],{"class":35,"line":6898},[33,25855,5973],{"class":163},[33,25857,25858],{"class":167}," fmt ",[33,25860,662],{"class":163},[33,25862,25863],{"class":167}," args.formats:\n",[33,25865,25866,25868],{"class":35,"line":6911},[33,25867,14151],{"class":163},[33,25869,574],{"class":167},[33,25871,25873,25876,25878,25881,25883],{"class":35,"line":25872},260,[33,25874,25875],{"class":167},"                out ",[33,25877,242],{"class":163},[33,25879,25880],{"class":167}," writers[fmt](group.copy(), ",[33,25882,1053],{"class":50},[33,25884,25885],{"class":167},"(segment), args.output_dir)\n",[33,25887,25889,25891,25893,25895,25898,25900,25903,25905,25907,25909,25911,25913,25915],{"class":35,"line":25888},261,[33,25890,8264],{"class":50},[33,25892,602],{"class":167},[33,25894,4059],{"class":163},[33,25896,25897],{"class":54},"\"  Wrote 
",[33,25899,1115],{"class":50},[33,25901,25902],{"class":167},"fmt",[33,25904,1121],{"class":50},[33,25906,2079],{"class":54},[33,25908,1115],{"class":50},[33,25910,18014],{"class":167},[33,25912,1121],{"class":50},[33,25914,274],{"class":54},[33,25916,221],{"class":167},[33,25918,25920,25922,25924,25926],{"class":35,"line":25919},262,[33,25921,14168],{"class":163},[33,25923,783],{"class":50},[33,25925,1852],{"class":163},[33,25927,1855],{"class":167},[33,25929,25931,25933,25935,25937,25940,25942,25944,25946,25948,25950,25952,25954,25957,25959,25961,25963,25965,25967,25969,25971],{"class":35,"line":25930},263,[33,25932,8264],{"class":50},[33,25934,602],{"class":167},[33,25936,4059],{"class":163},[33,25938,25939],{"class":54},"\"  ERROR ",[33,25941,1115],{"class":50},[33,25943,25902],{"class":167},[33,25945,1121],{"class":50},[33,25947,9178],{"class":54},[33,25949,1115],{"class":50},[33,25951,18258],{"class":167},[33,25953,1121],{"class":50},[33,25955,25956],{"class":54},"]: ",[33,25958,1115],{"class":50},[33,25960,6565],{"class":167},[33,25962,1121],{"class":50},[33,25964,274],{"class":54},[33,25966,365],{"class":167},[33,25968,22921],{"class":238},[33,25970,242],{"class":163},[33,25972,22926],{"class":167},[33,25974,25976],{"class":35,"line":25975},264,[33,25977,92],{"emptyLinePlaceholder":91},[33,25979,25981,25983,25985,25987,25990],{"class":35,"line":25980},265,[33,25982,9414],{"class":50},[33,25984,602],{"class":167},[33,25986,4059],{"class":163},[33,25988,25989],{"class":54},"\"  Validating…\"",[33,25991,221],{"class":167},[33,25993,25995,25997],{"class":35,"line":25994},266,[33,25996,670],{"class":163},[33,25998,574],{"class":167},[33,26000,26002,26005,26007,26010,26012],{"class":35,"line":26001},267,[33,26003,26004],{"class":167},"            validate(",[33,26006,1053],{"class":50},[33,26008,26009],{"class":167},"(segment), ",[33,26011,928],{"class":50},[33,26013,26014],{"class":167},"(group), args.output_dir, 
args.formats)\n",[33,26016,26018,26020,26022,26024],{"class":35,"line":26017},268,[33,26019,780],{"class":163},[33,26021,9445],{"class":50},[33,26023,1852],{"class":163},[33,26025,1855],{"class":167},[33,26027,26029,26031,26033,26035,26038,26040,26042,26044,26046,26048,26050,26052],{"class":35,"line":26028},269,[33,26030,9364],{"class":50},[33,26032,602],{"class":167},[33,26034,4059],{"class":163},[33,26036,26037],{"class":54},"\"  VALIDATION FAILED: ",[33,26039,1115],{"class":50},[33,26041,6565],{"class":167},[33,26043,1121],{"class":50},[33,26045,274],{"class":54},[33,26047,365],{"class":167},[33,26049,22921],{"class":238},[33,26051,242],{"class":163},[33,26053,22926],{"class":167},[33,26055,26057],{"class":35,"line":26056},270,[33,26058,92],{"emptyLinePlaceholder":91},[33,26060,26062,26064,26066,26068,26070,26073],{"class":35,"line":26061},271,[33,26063,7268],{"class":50},[33,26065,602],{"class":167},[33,26067,274],{"class":54},[33,26069,25830],{"class":50},[33,26071,26072],{"class":54},"Done.\"",[33,26074,221],{"class":167},[33,26076,26078],{"class":35,"line":26077},272,[33,26079,92],{"emptyLinePlaceholder":91},[33,26081,26083],{"class":35,"line":26082},273,[33,26084,92],{"emptyLinePlaceholder":91},[33,26086,26088,26090,26092,26094,26096],{"class":35,"line":26087},274,[33,26089,2491],{"class":163},[33,26091,2494],{"class":50},[33,26093,2497],{"class":163},[33,26095,2500],{"class":54},[33,26097,574],{"class":167},[33,26099,26101],{"class":35,"line":26100},275,[33,26102,6914],{"class":167},[14,26104,26105],{},"Run it against the fixture:",[23,26107,26109],{"className":25,"code":26108,"language":27,"meta":28,"style":28},"python pipeline_report_fanout.py \\\n    --input \u002Ftmp\u002Fpipeline_sample.csv \\\n    --output-dir \u002Ftmp\u002Freports \\\n    --segment-col region \\\n    --formats excel word 
pdf\n",[30,26110,26111,26121,26131,26141,26151],{"__ignoreMap":28},[33,26112,26113,26115,26118],{"class":35,"line":36},[33,26114,47],{"class":46},[33,26116,26117],{"class":54}," pipeline_report_fanout.py",[33,26119,26120],{"class":50}," \\\n",[33,26122,26123,26126,26129],{"class":35,"line":43},[33,26124,26125],{"class":50},"    --input",[33,26127,26128],{"class":54}," \u002Ftmp\u002Fpipeline_sample.csv",[33,26130,26120],{"class":50},[33,26132,26133,26136,26139],{"class":35,"line":61},[33,26134,26135],{"class":50},"    --output-dir",[33,26137,26138],{"class":54}," \u002Ftmp\u002Freports",[33,26140,26120],{"class":50},[33,26142,26143,26146,26149],{"class":35,"line":73},[33,26144,26145],{"class":50},"    --segment-col",[33,26147,26148],{"class":54}," region",[33,26150,26120],{"class":50},[33,26152,26153,26156,26159,26162],{"class":35,"line":88},[33,26154,26155],{"class":50},"    --formats",[33,26157,26158],{"class":54}," excel",[33,26160,26161],{"class":54}," word",[33,26163,26164],{"class":54}," pdf\n",[14,26166,26167,26168,26171],{},"Expected output: nine files under ",[30,26169,26170],{},"\u002Ftmp\u002Freports\u002F"," — three formats per three regions — each passing the validation step.",[18,26173,6918],{"id":6917},[4211,26175,26176,26181,26187,26193],{},[4214,26177,26178,26180],{},[940,26179,6936],{"href":6935}," — openpyxl formatting, charts, and scheduled workbook creation",[4214,26182,26183,26186],{},[940,26184,26185],{"href":18040},"Dynamic Mail Merge with Python"," — docxtpl Jinja2 templates and per-recipient Word output",[4214,26188,26189,26192],{},[940,26190,26191],{"href":19001},"Generating PDF Reports Dynamically"," — ReportLab layout, fonts, and invoice-style PDF generation",[4214,26194,26195,26197],{},[940,26196,948],{"href":947}," — the upstream step that produces the DataFrame this guide consumes",[14,26199,6947,26200,3035],{},[940,26201,6951],{"href":6950},[6953,26203,26204],{},"html pre.shiki code .sAwPA, html code.shiki 
.sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":26206},[26207,26208,26209,26214,26219,26220,26221,26222,26223],{"id":20,"depth":43,"text":21},{"id":16524,"depth":43,"text":16525},{"id":421,"depth":43,"text":422,"children":26210},[26211,26212,26213],{"id":17055,"depth":61,"text":17056},{"id":18034,"depth":61,"text":18035},{"id":18995,"depth":61,"text":18996},{"id":2708,"depth":43,"text":2709,"children":26215},[26216,26217,26218],{"id":20271,"depth":61,"text":20272},{"id":20389,"depth":61,"text":20390},{"id":20874,"depth":61,"text":20875},{"id":21306,"depth":43,"text":21307},{"id":21809,"depth":43,"text":21810},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":6917,"depth":43,"text":6918},"Generating Reports","Turn a cleaned pandas DataFrame into Excel workbooks, Word summaries, and PDF reports in one pass — fan-out templating, per-segment splitting, and validated output naming.",{},"\u002Fautomating-document-data-pipelines\u002Fgenerating-reports-from-pipeline-data",{"title":4204,"description":26225},"Generating 
Reports from Pipeline Data in Python","automating-document-data-pipelines\u002Fgenerating-reports-from-pipeline-data\u002Findex",[47,9630,22009,18041,26232],"reportlab","N68oWl_Sxdd-gxooZYJGqnOvEkYIgvvV3q9H83OeRvw",{"id":26235,"title":6951,"body":26236,"breadcrumbTitle":29281,"canonical":6977,"date":6978,"description":29282,"draft":6980,"extension":6981,"image":6977,"meta":29283,"navigation":91,"path":29284,"robots":6977,"seo":29285,"seoTitle":29286,"stem":29287,"tags":29288,"updatedAt":6978,"__hash__":29290},"content\u002Fautomating-document-data-pipelines\u002Findex.md",{"type":7,"value":26237,"toc":29268},[26238,26241,26249,26265,26269,26296,26447,26466,26470,26477,26610,26616,26620,26626,26713,26716,26722,26725,26931,26938,26942,26949,26964,27244,27255,27426,27432,27436,27442,27811,27832,27836,27846,28094,28120,28124,28137,28149,28161,28602,28613,28617,28622,28628,28634,28644,29061,29068,29072,29181,29185,29193,29199,29212,29225,29231,29233,29260,29265],[10,26239,6951],{"id":26240},"automating-document-data-pipelines",[14,26242,26243,26244,26248],{},"Point tools solve point problems. A script that scrapes one PDF, a notebook that cleans one CSV, a macro that builds one workbook — each works in isolation, and none of them survive contact with a recurring business process. The real job is almost never \"extract this table.\" It is \"every morning, pull yesterday's invoices out of fifty PDFs, reconcile them against the ERP export, and email finance a formatted workbook plus a signed PDF summary — without a human touching it.\" That is a ",[26245,26246,26247],"em",{},"pipeline",", not a tool, and stitching one together from disconnected scripts fails the moment any link breaks silently: a vendor changes a PDF layout, an encoding error corrupts one CSV, a scheduled run dies at 3 a.m. and nobody notices until the numbers are wrong. 
A wired pipeline treats the whole flow — ingest, transform, consolidate, generate, deliver — as one observable system with logging, retries, and idempotency baked in, so a failure is loud and recoverable instead of quiet and corrupting. This overview shows how to build that system in Python, using the same libraries you would reach for piecemeal, but connected through a single pandas hub and run by a real scheduler.",[14,26250,26251,26252,26254,26255,26259,26260,26264],{},"The other three areas of this site each own a stage of that flow. ",[940,26253,6943],{"href":6942}," handles the extract and final-render stages; ",[940,26256,26258],{"href":26257},"\u002Fpython-for-excel-csv-data-processing\u002F","Python for Excel & CSV Data Processing"," owns the transform hub; ",[940,26261,26263],{"href":26262},"\u002Fword-document-templating-batch-processing\u002F","Word Document Templating & Batch Processing"," owns templated document output. This guide is the connective tissue: how those stages hand data to each other and run as one unattended job.",[18,26266,26268],{"id":26267},"the-pipeline-as-one-system","The pipeline as one system",[14,26270,26271,26272,26275,26276,26279,26280,26283,26284,26287,26288,26291,26292,26295],{},"Every document-automation job, however bespoke it looks, decomposes into the same five stages. Data enters from heterogeneous ",[1974,26273,26274],{},"sources"," (PDFs, Excel workbooks, CSV dumps), gets ",[1974,26277,26278],{},"extracted"," into rows, is ",[1974,26281,26282],{},"transformed"," in pandas into a clean typed frame, gets ",[1974,26285,26286],{},"consolidated"," across sources, and is then ",[1974,26289,26290],{},"generated"," out as Excel, Word, or PDF artifacts and ",[1974,26293,26294],{},"delivered"," on a schedule. 
Drawing the flow once, before writing any code, tells you exactly where each library plugs in and where a failure can hide.",[2540,26297,2547,26300,2547,26303,2547,26306,2547,26320,2547,26324,2547,26328,2547,26330,2547,26334,2547,26336,2547,26339,2547,26343,2547,26347,2547,26352,2547,26356,2547,26358,2547,26362,2547,26365,2547,26367,2547,26371,2547,26374,2547,26377,2547,26381,2547,26383,2547,26387,2547,26390,2547,26393,2547,26397,2547,26399,2547,26404,2547,26408,2547,26412,2547,26416,2547,26419,2547,26423,2547,26426,2547,26429,2547,26431,2547,26435,2547,26438,2547,26442,2547,26444],{"viewBox":26298,"role":2543,"ariaLabel":26299,"xmlns":2545,"style":2546},"0 0 760 360","End-to-end document data pipeline from PDF, Excel, and CSV sources through extract, transform, consolidate, generate, and deliver stages",[2549,26301,26302],{},"End-to-end document and data pipeline architecture",[2553,26304,26305],{},"PDF, Excel, and CSV sources feed an extract stage (pdfplumber, Tesseract), then a pandas transform hub, then consolidation, then a generation fan-out to Excel, Word, and PDF, all driven by a scheduling and logging 
harness.",[2557,26307,2559,26308,2559,26315,2547],{},[2561,26309,2564,26311,2564,26313,2559],{"id":26310,"x1":748,"y1":748,"x2":734,"y2":748},"pipelines-grad",[2566,26312],{"offset":748,"style":2568},[2566,26314],{"offset":734,"style":2571},[2573,26316,2564,26318,2559],{"id":26317,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"pipelines-arrow",[2580,26319],{"d":2582,"fill":2583},[2585,26321],{"x":24213,"y":2590,"width":26322,"height":26323,"rx":2591,"fill":2592,"stroke":2593,"style":2594},"116","40",[2000,26325,26327],{"x":26326,"y":17018,"fill":2599,"style":2600},"74","PDF",[2585,26329],{"x":24213,"y":2679,"width":26322,"height":26323,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,26331,26333],{"x":26326,"y":26332,"fill":2599,"style":2600},"134","Excel",[2585,26335],{"x":24213,"y":2610,"width":26322,"height":26323,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,26337,26338],{"x":26326,"y":2643,"fill":2599,"style":2600},"CSV",[2000,26340,26342],{"x":26326,"y":26341,"fill":2583,"style":2685},"44","Sources",[35,26344],{"x1":26345,"y1":2588,"x2":11115,"y2":2588,"stroke":2583,"markerEnd":26346,"style":2594},"132","url(#pipelines-arrow)",[2585,26348],{"x":11115,"y":26349,"width":2589,"height":26350,"rx":3545,"fill":26351,"stroke":2593,"style":2594},"92","76","url(#pipelines-grad)",[2000,26353,26355],{"x":26354,"y":2589,"fill":2599,"style":16983},"230","Extract",[2000,26357,943],{"x":26354,"y":2609,"fill":2583,"style":2685},[2000,26359,26361],{"x":26354,"y":26360,"fill":2583,"style":2685},"156","Tesseract",[35,26363],{"x1":11231,"y1":2588,"x2":26364,"y2":2588,"stroke":2583,"markerEnd":26346,"style":2594},"328",[2585,26366],{"x":26364,"y":26349,"width":2589,"height":26350,"rx":3545,"fill":26351,"stroke":2593,"style":2594},[2000,26368,26370],{"x":26369,"y":2589,"fill":2599,"style":16983},"388","Transform",[2000,26372,26373],{"x":26369,"y":2609,"fill":2583,"style":2685},"pandas 
hub",[2000,26375,26376],{"x":26369,"y":26360,"fill":2583,"style":2685},"clean · type",[35,26378],{"x1":26379,"y1":2588,"x2":26380,"y2":2588,"stroke":2583,"markerEnd":26346,"style":2594},"448","486",[2585,26382],{"x":26380,"y":26349,"width":2589,"height":26350,"rx":3545,"fill":26351,"stroke":2593,"style":2594},[2000,26384,26386],{"x":26385,"y":2589,"fill":2599,"style":16983},"546","Consolidate",[2000,26388,26389],{"x":26385,"y":2609,"fill":2583,"style":2685},"join · dedup",[2000,26391,26392],{"x":26385,"y":26360,"fill":2583,"style":2685},"one dataset",[35,26394],{"x1":26395,"y1":2588,"x2":26396,"y2":2588,"stroke":2583,"markerEnd":26346,"style":2594},"606","644",[2585,26398],{"x":26396,"y":26349,"width":2650,"height":26350,"rx":3545,"fill":26351,"stroke":2593,"style":2594},[2000,26400,26403],{"x":26401,"y":26402,"fill":2599,"style":16983},"694","124","Generate",[2000,26405,26407],{"x":26401,"y":26406,"fill":2583,"style":2685},"144","fan-out",[2585,26409],{"x":13437,"y":26410,"width":2597,"height":26411,"rx":2591,"fill":2592,"stroke":2593,"style":2594},"232","38",[2000,26413,26333],{"x":26414,"y":26415,"fill":2599,"style":2685},"540","256",[2585,26417],{"x":26418,"y":26410,"width":2597,"height":26411,"rx":2591,"fill":2592,"stroke":2593,"style":2594},"592",[2000,26420,26422],{"x":26421,"y":26415,"fill":2599,"style":2685},"632","Word",[2585,26424],{"x":26425,"y":26410,"width":2590,"height":26411,"rx":2591,"fill":2592,"stroke":2593,"style":2594},"684",[2000,26427,26327],{"x":26428,"y":26415,"fill":2599,"style":2685},"714",[35,26430],{"x1":26401,"y1":2639,"x2":26401,"y2":26410,"stroke":2583,"markerEnd":26346,"style":2594},[2585,26432],{"x":11115,"y":26433,"width":26434,"height":26341,"rx":3545,"fill":2615,"stroke":2593,"style":2594},"300","436",[2000,26436,26437],{"x":26369,"y":2625,"fill":2599,"style":2600},"Scheduling & logging harness",[2000,26439,26441],{"x":26369,"y":26440,"fill":2583,"style":2685},"337","cron \u002F Prefect · retries · idempotency · 
alerting",[35,26443],{"x1":26354,"y1":26433,"x2":26354,"y2":2639,"stroke":2583,"markerEnd":26346,"style":2594},[35,26445],{"x1":26385,"y1":26433,"x2":26385,"y2":26446,"stroke":2583,"markerEnd":26346,"style":2594},"270",[14,26448,26449,26450,26453,26454,26457,26458,365,26460,26462,26463,3035],{},"The discipline this diagram enforces: data flows one direction through typed boundaries, and the harness underneath every stage owns ",[26245,26451,26452],{},"when"," things run and ",[26245,26455,26456],{},"what happens when they break",". The rest of this overview walks the stages in order. If you already know which stage you are stuck on, jump to the dedicated guide: ",[940,26459,948],{"href":947},[940,26461,4204],{"href":4203},", or ",[940,26464,5],{"href":26465},"\u002Fautomating-document-data-pipelines\u002Fscheduling-and-logging-automation-jobs\u002F",[18,26467,26469],{"id":26468},"library-ecosystem","Library ecosystem",[14,26471,26472,26473,26476],{},"A pipeline is an assembly of single-purpose libraries, each owning exactly one stage. The failure mode is using a library outside its lane — generating PDFs with a Word library, scheduling with a ",[30,26474,26475],{},"while True: sleep()"," loop. 
Pick per stage, and let pandas be the one shared currency every stage trades in.",[4273,26478,26479,26495],{},[4276,26480,26481],{},[4279,26482,26483,26486,26489,26492],{},[4282,26484,26485],{},"Library",[4282,26487,26488],{},"Role in pipeline",[4282,26490,26491],{},"Install",[4282,26493,26494],{},"When NOT to use",[4292,26496,26497,26514,26531,26556,26574,26591],{},[4279,26498,26499,26503,26506,26511],{},[4297,26500,26501],{},[940,26502,943],{"href":942},[4297,26504,26505],{},"Extract: text + tables from born-digital PDFs into rows",[4297,26507,26508],{},[30,26509,26510],{},"pip install pdfplumber",[4297,26512,26513],{},"Scanned\u002Fimage pages — there's no text layer to read",[4279,26515,26516,26520,26523,26528],{},[4297,26517,26518],{},[940,26519,9630],{"href":9598},[4297,26521,26522],{},"Transform & consolidate: the typed hub every source flows through",[4297,26524,26525],{},[30,26526,26527],{},"pip install pandas",[4297,26529,26530],{},"Streaming row-by-row at millions of rows — use Polars\u002FDuckDB",[4279,26532,26533,26537,26544,26549],{},[4297,26534,26535],{},[940,26536,22009],{"href":6935},[4297,26538,26539,26540,26543],{},"Generate: formatted ",[30,26541,26542],{},".xlsx"," with formulas, charts, styling",[4297,26545,26546],{},[30,26547,26548],{},"pip install openpyxl",[4297,26550,26551,26552,26555],{},"Plain tabular dumps — ",[30,26553,26554],{},"df.to_csv"," is simpler and faster",[4279,26557,26558,26563,26566,26571],{},[4297,26559,26560],{},[940,26561,18041],{"href":26562},"\u002Fword-document-templating-batch-processing\u002Fautomating-word-document-creation\u002F",[4297,26564,26565],{},"Generate: Word documents and templated mail-merge output",[4297,26567,26568],{},[30,26569,26570],{},"pip install python-docx",[4297,26572,26573],{},"Pixel-precise fixed layouts — ReportLab fits those",[4279,26575,26576,26580,26583,26588],{},[4297,26577,26578],{},[940,26579,19002],{"href":19001},[4297,26581,26582],{},"Generate: data-driven, pixel-precise PDF 
reports",[4297,26584,26585],{},[30,26586,26587],{},"pip install reportlab",[4297,26589,26590],{},"HTML-first layouts — WeasyPrint is a better match",[4279,26592,26593,26599,26602,26607],{},[4297,26594,26595,26596],{},"schedule \u002F cron \u002F ",[940,26597,26598],{"href":26465},"Prefect",[4297,26600,26601],{},"Deliver: run the job on a cadence, with retries and observability",[4297,26603,26604],{},[30,26605,26606],{},"pip install prefect",[4297,26608,26609],{},"A single nightly batch — plain cron needs no dependency",[14,26611,26612,26613,26615],{},"A rule of thumb on the scheduler column: reach for cron first. It is already on every Linux host, it survives reboots, and a one-line crontab entry runs your pipeline nightly with zero added dependencies. Graduate to Prefect only when you need a dependency graph between tasks, a UI to inspect failed runs, or backfills — orchestration you would otherwise hand-roll badly. The ",[30,26614,2325],{}," library sits awkwardly between the two: convenient for a long-running process, but it dies when the process dies, so it is the wrong tool for anything that must survive a reboot.",[18,26617,26619],{"id":26618},"environment-setup","Environment setup",[14,26621,26622,26623,3035],{},"Pin everything. A pipeline that pulls in pdfplumber, pandas, openpyxl, python-docx, and ReportLab has a deep transitive dependency tree, and an unpinned rebuild three months out will not reproduce. 
Isolate in a virtualenv and freeze a ",[30,26624,26625],{},"requirements.txt",[23,26627,26629],{"className":25,"code":26628,"language":27,"meta":28,"style":28},"# Create and activate an isolated environment\npython3 -m venv .venv\nsource .venv\u002Fbin\u002Factivate            # Windows: .venv\\Scripts\\activate\npython -m pip install --upgrade pip\n\n# System deps for the OCR branch and (optional) DOCX->PDF conversion\nsudo apt-get install -y tesseract-ocr libreoffice\n\npip install -r requirements.txt\n",[30,26630,26631,26636,26646,26655,26672,26676,26681,26697,26701],{"__ignoreMap":28},[33,26632,26633],{"class":35,"line":36},[33,26634,26635],{"class":39},"# Create and activate an isolated environment\n",[33,26637,26638,26640,26642,26644],{"class":35,"line":43},[33,26639,2011],{"class":46},[33,26641,51],{"class":50},[33,26643,55],{"class":54},[33,26645,58],{"class":54},[33,26647,26648,26650,26652],{"class":35,"line":61},[33,26649,64],{"class":50},[33,26651,67],{"class":54},[33,26653,26654],{"class":39},"            # Windows: .venv\\Scripts\\activate\n",[33,26656,26657,26659,26661,26664,26666,26669],{"class":35,"line":73},[33,26658,47],{"class":46},[33,26660,51],{"class":50},[33,26662,26663],{"class":54}," pip",[33,26665,79],{"class":54},[33,26667,26668],{"class":50}," --upgrade",[33,26670,26671],{"class":54}," pip\n",[33,26673,26674],{"class":35,"line":88},[33,26675,92],{"emptyLinePlaceholder":91},[33,26677,26678],{"class":35,"line":95},[33,26679,26680],{"class":39},"# System deps for the OCR branch and (optional) DOCX->PDF conversion\n",[33,26682,26683,26685,26687,26689,26691,26694],{"class":35,"line":101},[33,26684,9669],{"class":46},[33,26686,9672],{"class":54},[33,26688,79],{"class":54},[33,26690,20912],{"class":50},[33,26692,26693],{"class":54}," tesseract-ocr",[33,26695,26696],{"class":54}," 
libreoffice\n",[33,26698,26699],{"class":35,"line":171},[33,26700,92],{"emptyLinePlaceholder":91},[33,26702,26703,26705,26707,26710],{"class":35,"line":179},[33,26704,76],{"class":46},[33,26706,79],{"class":54},[33,26708,26709],{"class":50}," -r",[33,26711,26712],{"class":54}," requirements.txt\n",[14,26714,26715],{},"Pin the versions so a CI rebuild produces identical bytes. These are known-good as of this writing; bump them deliberately.",[23,26717,26720],{"className":26718,"code":26719,"language":2000,"meta":28},[1998],"# requirements.txt\npdfplumber==0.11.4\npandas==2.2.2\nopenpyxl==3.1.5\npython-docx==1.1.2\nreportlab==4.2.2\npyarrow==17.0.0\nprefect==2.20.3\n",[30,26721,26719],{"__ignoreMap":28},[14,26723,26724],{},"Wire logging once, at module load, so every stage writes to one stream. Unattended jobs are only debuggable if they leave a trail — and the trail is what turns a silent 3 a.m. failure into an actionable alert.",[23,26726,26728],{"className":126,"code":26727,"language":47,"meta":28,"style":28},"# pip install (stdlib only)\nimport logging\nfrom pathlib import Path\n\nLOG_DIR = Path(\"logs\")\nLOG_DIR.mkdir(exist_ok=True)\n\nlogging.basicConfig(\n    level=logging.INFO,\n    format=\"%(asctime)s | %(levelname)s | %(stage)s | %(message)s\",\n    datefmt=\"%Y-%m-%d %H:%M:%S\",\n    handlers=[\n        logging.FileHandler(LOG_DIR \u002F \"pipeline.log\"),\n        logging.StreamHandler(),\n    ],\n)\n\n\ndef stage_logger(stage: str) -> logging.LoggerAdapter:\n    \"\"\"Return a logger that tags every line with its pipeline stage.\"\"\"\n    return logging.LoggerAdapter(logging.getLogger(\"pipeline\"), {\"stage\": stage})\n",[30,26729,26730,26735,26741,26751,26755,26768,26782,26786,26790,26802,26832,26848,26858,26871,26876,26881,26885,26889,26893,26908,26913],{"__ignoreMap":28},[33,26731,26732],{"class":35,"line":36},[33,26733,26734],{"class":39},"# pip install (stdlib 
only)\n",[33,26736,26737,26739],{"class":35,"line":43},[33,26738,164],{"class":163},[33,26740,184],{"class":167},[33,26742,26743,26745,26747,26749],{"class":35,"line":61},[33,26744,190],{"class":163},[33,26746,193],{"class":167},[33,26748,164],{"class":163},[33,26750,198],{"class":167},[33,26752,26753],{"class":35,"line":73},[33,26754,92],{"emptyLinePlaceholder":91},[33,26756,26757,26759,26761,26763,26766],{"class":35,"line":88},[33,26758,1023],{"class":50},[33,26760,212],{"class":163},[33,26762,215],{"class":167},[33,26764,26765],{"class":54},"\"logs\"",[33,26767,221],{"class":167},[33,26769,26770,26772,26774,26776,26778,26780],{"class":35,"line":95},[33,26771,1023],{"class":50},[33,26773,1078],{"class":167},[33,26775,878],{"class":238},[33,26777,242],{"class":163},[33,26779,855],{"class":50},[33,26781,221],{"class":167},[33,26783,26784],{"class":35,"line":101},[33,26785,92],{"emptyLinePlaceholder":91},[33,26787,26788],{"class":35,"line":171},[33,26789,232],{"class":167},[33,26791,26792,26794,26796,26798,26800],{"class":35,"line":179},[33,26793,253],{"class":238},[33,26795,242],{"class":163},[33,26797,258],{"class":167},[33,26799,1067],{"class":50},[33,26801,247],{"class":167},[33,26803,26804,26806,26808,26810,26812,26815,26818,26820,26823,26825,26828,26830],{"class":35,"line":187},[33,26805,269],{"class":238},[33,26807,242],{"class":163},[33,26809,274],{"class":54},[33,26811,277],{"class":50},[33,26813,26814],{"class":54}," | ",[33,26816,26817],{"class":50},"%(levelname)s",[33,26819,26814],{"class":54},[33,26821,26822],{"class":50},"%(stage)s",[33,26824,26814],{"class":54},[33,26826,26827],{"class":50},"%(message)s",[33,26829,274],{"class":54},[33,26831,247],{"class":167},[33,26833,26834,26837,26839,26841,26843,26846],{"class":35,"line":201},[33,26835,26836],{"class":238},"    datefmt",[33,26838,242],{"class":163},[33,26840,1244],{"class":54},[33,26842,916],{"class":50},[33,26844,26845],{"class":54}," 
%H:%M:%S\"",[33,26847,247],{"class":167},[33,26849,26850,26853,26855],{"class":35,"line":206},[33,26851,26852],{"class":238},"    handlers",[33,26854,242],{"class":163},[33,26856,26857],{"class":167},"[\n",[33,26859,26860,26863,26865,26867,26869],{"class":35,"line":224},[33,26861,26862],{"class":167},"        logging.FileHandler(",[33,26864,1023],{"class":50},[33,26866,1107],{"class":163},[33,26868,4817],{"class":54},[33,26870,1506],{"class":167},[33,26872,26873],{"class":35,"line":229},[33,26874,26875],{"class":167},"        logging.StreamHandler(),\n",[33,26877,26878],{"class":35,"line":235},[33,26879,26880],{"class":167},"    ],\n",[33,26882,26883],{"class":35,"line":250},[33,26884,221],{"class":167},[33,26886,26887],{"class":35,"line":266},[33,26888,92],{"emptyLinePlaceholder":91},[33,26890,26891],{"class":35,"line":290},[33,26892,92],{"emptyLinePlaceholder":91},[33,26894,26895,26897,26900,26903,26905],{"class":35,"line":295},[33,26896,562],{"class":163},[33,26898,26899],{"class":46}," stage_logger",[33,26901,26902],{"class":167},"(stage: ",[33,26904,1053],{"class":50},[33,26906,26907],{"class":167},") -> logging.LoggerAdapter:\n",[33,26909,26910],{"class":35,"line":300},[33,26911,26912],{"class":54},"    \"\"\"Return a logger that tags every line with its pipeline stage.\"\"\"\n",[33,26914,26915,26917,26920,26922,26925,26928],{"class":35,"line":317},[33,26916,1332],{"class":163},[33,26918,26919],{"class":167}," logging.LoggerAdapter(logging.getLogger(",[33,26921,4978],{"class":54},[33,26923,26924],{"class":167},"), {",[33,26926,26927],{"class":54},"\"stage\"",[33,26929,26930],{"class":167},": stage})\n",[14,26932,26933,26934,26937],{},"Tagging each log line with its stage means a ",[30,26935,26936],{},"grep \"| extract |\" logs\u002Fpipeline.log"," reconstructs exactly what one stage did — invaluable when reconciliation fails and you need to find where a number went wrong.",[18,26939,26941],{"id":26940},"ingestion-patterns","Ingestion 
patterns",[14,26943,26944,26945,26948],{},"Each source format has its own reader and its own classic failure, and the ingestion stage's job is to turn all of them into the same shape: a list of rows, ready for pandas. Normalize the ",[26245,26946,26947],{},"interface"," here so the transform stage never has to know whether a record came from a PDF or a spreadsheet.",[14,26950,26951,26952,26954,26955,26958,26959,26961,26962,3035],{},"PDFs are the hardest source because they encode visual position, not structure. Use ",[940,26953,943],{"href":942}," for born-digital files and route scans to ",[940,26956,26361],{"href":26957},"\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002F"," OCR — the classify-before-you-parse rule from ",[940,26960,9592],{"href":942}," applies directly, and pulling the result into a frame is the dedicated subject of ",[940,26963,948],{"href":947},[23,26965,26967],{"className":126,"code":26966,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\n\nimport pandas as pd\nimport pdfplumber\n\nSOURCE_PDF = Path(\"inbox\u002Finvoices_2026_05.pdf\")\n\n\ndef read_pdf(pdf_path: Path) -> pd.DataFrame:\n    \"\"\"Pull every table row across all pages into one raw frame.\"\"\"\n    if not pdf_path.exists():\n        raise FileNotFoundError(f\"Source PDF missing: {pdf_path}\")\n    rows: list[list[str]] = []\n    try:\n        with pdfplumber.open(pdf_path) as pdf:\n            for page in pdf.pages:\n                for table in page.extract_tables():\n                    rows.extend([(c or \"\").strip() for c in row] for row in table)\n    except Exception as exc:                       # corrupt \u002F encrypted file\n        raise RuntimeError(f\"Could not extract {pdf_path.name}: {exc}\") from exc\n    if not rows:\n        return pd.DataFrame()\n    return pd.DataFrame(rows[1:], columns=rows[0])  # first row is the 
header\n",[30,26968,26969,26973,26983,26987,26997,27003,27007,27021,27025,27029,27039,27044,27052,27076,27088,27094,27104,27114,27125,27155,27169,27205,27213,27219],{"__ignoreMap":28},[33,26970,26971],{"class":35,"line":36},[33,26972,7041],{"class":39},[33,26974,26975,26977,26979,26981],{"class":35,"line":43},[33,26976,190],{"class":163},[33,26978,193],{"class":167},[33,26980,164],{"class":163},[33,26982,198],{"class":167},[33,26984,26985],{"class":35,"line":61},[33,26986,92],{"emptyLinePlaceholder":91},[33,26988,26989,26991,26993,26995],{"class":35,"line":73},[33,26990,164],{"class":163},[33,26992,492],{"class":167},[33,26994,495],{"class":163},[33,26996,498],{"class":167},[33,26998,26999,27001],{"class":35,"line":88},[33,27000,164],{"class":163},[33,27002,485],{"class":167},[33,27004,27005],{"class":35,"line":95},[33,27006,92],{"emptyLinePlaceholder":91},[33,27008,27009,27012,27014,27016,27019],{"class":35,"line":101},[33,27010,27011],{"class":50},"SOURCE_PDF",[33,27013,212],{"class":163},[33,27015,215],{"class":167},[33,27017,27018],{"class":54},"\"inbox\u002Finvoices_2026_05.pdf\"",[33,27020,221],{"class":167},[33,27022,27023],{"class":35,"line":171},[33,27024,92],{"emptyLinePlaceholder":91},[33,27026,27027],{"class":35,"line":179},[33,27028,92],{"emptyLinePlaceholder":91},[33,27030,27031,27033,27036],{"class":35,"line":187},[33,27032,562],{"class":163},[33,27034,27035],{"class":46}," read_pdf",[33,27037,27038],{"class":167},"(pdf_path: Path) -> pd.DataFrame:\n",[33,27040,27041],{"class":35,"line":201},[33,27042,27043],{"class":54},"    \"\"\"Pull every table row across all pages into one raw 
frame.\"\"\"\n",[33,27045,27046,27048,27050],{"class":35,"line":206},[33,27047,617],{"class":163},[33,27049,620],{"class":163},[33,27051,21595],{"class":167},[33,27053,27054,27056,27058,27060,27062,27065,27067,27070,27072,27074],{"class":35,"line":224},[33,27055,4051],{"class":163},[33,27057,2945],{"class":50},[33,27059,602],{"class":167},[33,27061,4059],{"class":163},[33,27063,27064],{"class":54},"\"Source PDF missing: ",[33,27066,1115],{"class":50},[33,27068,27069],{"class":167},"pdf_path",[33,27071,1121],{"class":50},[33,27073,274],{"class":54},[33,27075,221],{"class":167},[33,27077,27078,27080,27082,27084,27086],{"class":35,"line":229},[33,27079,13076],{"class":167},[33,27081,1053],{"class":50},[33,27083,13081],{"class":167},[33,27085,242],{"class":163},[33,27087,589],{"class":167},[33,27089,27090,27092],{"class":35,"line":235},[33,27091,2424],{"class":163},[33,27093,574],{"class":167},[33,27095,27096,27098,27100,27102],{"class":35,"line":250},[33,27097,2191],{"class":163},[33,27099,681],{"class":167},[33,27101,495],{"class":163},[33,27103,686],{"class":167},[33,27105,27106,27108,27110,27112],{"class":35,"line":266},[33,27107,1793],{"class":163},[33,27109,695],{"class":167},[33,27111,662],{"class":163},[33,27113,700],{"class":167},[33,27115,27116,27118,27120,27122],{"class":35,"line":290},[33,27117,692],{"class":163},[33,27119,5998],{"class":167},[33,27121,662],{"class":163},[33,27123,27124],{"class":167}," page.extract_tables():\n",[33,27126,27127,27130,27132,27134,27137,27139,27141,27143,27146,27148,27150,27152],{"class":35,"line":295},[33,27128,27129],{"class":167},"                    rows.extend([(c ",[33,27131,7162],{"class":163},[33,27133,9892],{"class":54},[33,27135,27136],{"class":167},").strip() ",[33,27138,6124],{"class":163},[33,27140,7486],{"class":167},[33,27142,662],{"class":163},[33,27144,27145],{"class":167}," row] ",[33,27147,6124],{"class":163},[33,27149,3844],{"class":167},[33,27151,662],{"class":163},[33,27153,27154],{"class":167}," 
table)\n",[33,27156,27157,27159,27161,27163,27166],{"class":35,"line":300},[33,27158,2449],{"class":163},[33,27160,783],{"class":50},[33,27162,1852],{"class":163},[33,27164,27165],{"class":167}," exc:                       ",[33,27167,27168],{"class":39},"# corrupt \u002F encrypted file\n",[33,27170,27171,27173,27175,27177,27179,27182,27184,27187,27189,27191,27193,27195,27197,27199,27201,27203],{"class":35,"line":317},[33,27172,4051],{"class":163},[33,27174,7590],{"class":50},[33,27176,602],{"class":167},[33,27178,4059],{"class":163},[33,27180,27181],{"class":54},"\"Could not extract ",[33,27183,1115],{"class":50},[33,27185,27186],{"class":167},"pdf_path.name",[33,27188,1121],{"class":50},[33,27190,2079],{"class":54},[33,27192,1115],{"class":50},[33,27194,6565],{"class":167},[33,27196,1121],{"class":50},[33,27198,274],{"class":54},[33,27200,1649],{"class":167},[33,27202,190],{"class":163},[33,27204,20843],{"class":167},[33,27206,27207,27209,27211],{"class":35,"line":332},[33,27208,617],{"class":163},[33,27210,620],{"class":163},[33,27212,8723],{"class":167},[33,27214,27215,27217],{"class":35,"line":347},[33,27216,1659],{"class":163},[33,27218,7721],{"class":167},[33,27220,27221,27223,27225,27227,27229,27231,27233,27236,27238,27241],{"class":35,"line":374},[33,27222,1332],{"class":163},[33,27224,13261],{"class":167},[33,27226,734],{"class":50},[33,27228,737],{"class":167},[33,27230,740],{"class":238},[33,27232,242],{"class":163},[33,27234,27235],{"class":167},"rows[",[33,27237,748],{"class":50},[33,27239,27240],{"class":167},"])  ",[33,27242,27243],{"class":39},"# first row is the header\n",[14,27245,27246,27247,27250,27251,3035],{},"Excel and CSV sources are simpler but carry their own traps: pandas guesses dtypes, and a guess on an ID column that looks numeric will silently strip leading zeros. 
Read identifier columns as strings explicitly, and set ",[30,27248,27249],{},"encoding"," so accented data survives — the encoding failures are catalogued in ",[940,27252,27254],{"href":27253},"\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Ffixing-encoding-errors-in-csv-files\u002F","Fixing Encoding Errors in CSV Files",[23,27256,27258],{"className":126,"code":27257,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\n\nimport pandas as pd\n\n\ndef read_tabular(path: Path) -> pd.DataFrame:\n    \"\"\"Read an Excel or CSV source, forcing the id column to stay a string.\"\"\"\n    if not path.exists():\n        raise FileNotFoundError(f\"Source missing: {path}\")\n    str_cols = {\"invoice_id\": \"string\"}\n    if path.suffix.lower() in {\".xlsx\", \".xlsm\"}:\n        return pd.read_excel(path, dtype=str_cols, engine=\"openpyxl\")\n    return pd.read_csv(path, dtype=str_cols, encoding=\"utf-8\")\n",[30,27259,27260,27264,27274,27278,27288,27292,27296,27305,27310,27319,27342,27361,27383,27405],{"__ignoreMap":28},[33,27261,27262],{"class":35,"line":36},[33,27263,3952],{"class":39},[33,27265,27266,27268,27270,27272],{"class":35,"line":43},[33,27267,190],{"class":163},[33,27269,193],{"class":167},[33,27271,164],{"class":163},[33,27273,198],{"class":167},[33,27275,27276],{"class":35,"line":61},[33,27277,92],{"emptyLinePlaceholder":91},[33,27279,27280,27282,27284,27286],{"class":35,"line":73},[33,27281,164],{"class":163},[33,27283,492],{"class":167},[33,27285,495],{"class":163},[33,27287,498],{"class":167},[33,27289,27290],{"class":35,"line":88},[33,27291,92],{"emptyLinePlaceholder":91},[33,27293,27294],{"class":35,"line":95},[33,27295,92],{"emptyLinePlaceholder":91},[33,27297,27298,27300,27303],{"class":35,"line":101},[33,27299,562],{"class":163},[33,27301,27302],{"class":46}," 
read_tabular",[33,27304,7103],{"class":167},[33,27306,27307],{"class":35,"line":171},[33,27308,27309],{"class":54},"    \"\"\"Read an Excel or CSV source, forcing the id column to stay a string.\"\"\"\n",[33,27311,27312,27314,27316],{"class":35,"line":179},[33,27313,617],{"class":163},[33,27315,620],{"class":163},[33,27317,27318],{"class":167}," path.exists():\n",[33,27320,27321,27323,27325,27327,27329,27332,27334,27336,27338,27340],{"class":35,"line":187},[33,27322,4051],{"class":163},[33,27324,2945],{"class":50},[33,27326,602],{"class":167},[33,27328,4059],{"class":163},[33,27330,27331],{"class":54},"\"Source missing: ",[33,27333,1115],{"class":50},[33,27335,2580],{"class":167},[33,27337,1121],{"class":50},[33,27339,274],{"class":54},[33,27341,221],{"class":167},[33,27343,27344,27347,27349,27351,27354,27356,27359],{"class":35,"line":201},[33,27345,27346],{"class":167},"    str_cols ",[33,27348,242],{"class":163},[33,27350,4098],{"class":167},[33,27352,27353],{"class":54},"\"invoice_id\"",[33,27355,2079],{"class":167},[33,27357,27358],{"class":54},"\"string\"",[33,27360,4113],{"class":167},[33,27362,27363,27365,27368,27370,27372,27375,27377,27380],{"class":35,"line":206},[33,27364,617],{"class":163},[33,27366,27367],{"class":167}," path.suffix.lower() ",[33,27369,662],{"class":163},[33,27371,4098],{"class":167},[33,27373,27374],{"class":54},"\".xlsx\"",[33,27376,365],{"class":167},[33,27378,27379],{"class":54},"\".xlsm\"",[33,27381,27382],{"class":167},"}:\n",[33,27384,27385,27387,27390,27392,27394,27397,27399,27401,27403],{"class":35,"line":224},[33,27386,1659],{"class":163},[33,27388,27389],{"class":167}," pd.read_excel(path, ",[33,27391,23262],{"class":238},[33,27393,242],{"class":163},[33,27395,27396],{"class":167},"str_cols, 
",[33,27398,17351],{"class":238},[33,27400,242],{"class":163},[33,27402,17356],{"class":54},[33,27404,221],{"class":167},[33,27406,27407,27409,27412,27414,27416,27418,27420,27422,27424],{"class":35,"line":229},[33,27408,1332],{"class":163},[33,27410,27411],{"class":167}," pd.read_csv(path, ",[33,27413,23262],{"class":238},[33,27415,242],{"class":163},[33,27417,27396],{"class":167},[33,27419,27249],{"class":238},[33,27421,242],{"class":163},[33,27423,1195],{"class":54},[33,27425,221],{"class":167},[14,27427,27428,27429,27431],{},"The contract every reader honors is the same: take a path, return a ",[30,27430,11219],{},", raise on unrecoverable errors. With that interface fixed, the rest of the pipeline is source-agnostic.",[18,27433,27435],{"id":27434},"transformation-pipeline","Transformation pipeline",[14,27437,27438,27439,27441],{},"pandas is the hub. Every source, whatever its origin, lands in a dataframe, and the transform stage is where raw strings become a typed, schema-stable table you can trust downstream. 
This is the same dataframe discipline used throughout ",[940,27440,9599],{"href":9598},": coerce types explicitly, normalize the schema, and quarantine bad rows rather than letting them poison a total.",[23,27443,27445],{"className":126,"code":27444,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nEXPECTED = [\"invoice_id\", \"date\", \"amount\"]\n\n\ndef transform(df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"Normalize headers, coerce types, and quarantine rows that fail coercion.\"\"\"\n    if df.empty:\n        return pd.DataFrame(columns=EXPECTED)\n\n    df = df.copy()\n    df.columns = [c.strip().lower().replace(\" \", \"_\") for c in df.columns]\n\n    # Strip currency\u002Fgrouping noise, then coerce; junk becomes NaN, not a crash\n    df[\"amount\"] = (\n        df[\"amount\"].astype(str).str.replace(r\"[,$\\s]\", \"\", regex=True)\n    )\n    df[\"amount\"] = pd.to_numeric(df[\"amount\"], errors=\"coerce\")\n    df[\"date\"] = pd.to_datetime(df[\"date\"], errors=\"coerce\", dayfirst=False)\n\n    bad = df[df[\"amount\"].isna() | df[\"date\"].isna()]\n    if not bad.empty:\n        df = df.drop(bad.index)                    # quarantine, don't keep silently\n\n    missing = set(EXPECTED) - set(df.columns)\n    if missing:\n        raise ValueError(f\"Schema mismatch, missing columns: {missing}\")\n    return 
df[EXPECTED].reset_index(drop=True)\n",[30,27446,27447,27451,27461,27465,27486,27490,27494,27503,27508,27515,27530,27534,27542,27568,27572,27577,27590,27627,27631,27656,27690,27694,27718,27727,27739,27743,27763,27769,27792],{"__ignoreMap":28},[33,27448,27449],{"class":35,"line":36},[33,27450,8895],{"class":39},[33,27452,27453,27455,27457,27459],{"class":35,"line":43},[33,27454,164],{"class":163},[33,27456,492],{"class":167},[33,27458,495],{"class":163},[33,27460,498],{"class":167},[33,27462,27463],{"class":35,"line":61},[33,27464,92],{"emptyLinePlaceholder":91},[33,27466,27467,27470,27472,27474,27476,27478,27480,27482,27484],{"class":35,"line":73},[33,27468,27469],{"class":50},"EXPECTED",[33,27471,212],{"class":163},[33,27473,9178],{"class":167},[33,27475,27353],{"class":54},[33,27477,365],{"class":167},[33,27479,4101],{"class":54},[33,27481,365],{"class":167},[33,27483,4106],{"class":54},[33,27485,9202],{"class":167},[33,27487,27488],{"class":35,"line":88},[33,27489,92],{"emptyLinePlaceholder":91},[33,27491,27492],{"class":35,"line":95},[33,27493,92],{"emptyLinePlaceholder":91},[33,27495,27496,27498,27501],{"class":35,"line":101},[33,27497,562],{"class":163},[33,27499,27500],{"class":46}," transform",[33,27502,12127],{"class":167},[33,27504,27505],{"class":35,"line":171},[33,27506,27507],{"class":54},"    \"\"\"Normalize headers, coerce types, and quarantine rows that fail coercion.\"\"\"\n",[33,27509,27510,27512],{"class":35,"line":179},[33,27511,617],{"class":163},[33,27513,27514],{"class":167}," df.empty:\n",[33,27516,27517,27519,27522,27524,27526,27528],{"class":35,"line":187},[33,27518,1659],{"class":163},[33,27520,27521],{"class":167}," 
pd.DataFrame(",[33,27523,740],{"class":238},[33,27525,242],{"class":163},[33,27527,27469],{"class":50},[33,27529,221],{"class":167},[33,27531,27532],{"class":35,"line":201},[33,27533,92],{"emptyLinePlaceholder":91},[33,27535,27536,27538,27540],{"class":35,"line":206},[33,27537,4025],{"class":167},[33,27539,242],{"class":163},[33,27541,11659],{"class":167},[33,27543,27544,27547,27549,27552,27554,27556,27558,27560,27562,27564,27566],{"class":35,"line":224},[33,27545,27546],{"class":167},"    df.columns ",[33,27548,242],{"class":163},[33,27550,27551],{"class":167}," [c.strip().lower().replace(",[33,27553,17294],{"class":54},[33,27555,365],{"class":167},[33,27557,7764],{"class":54},[33,27559,1649],{"class":167},[33,27561,6124],{"class":163},[33,27563,7486],{"class":167},[33,27565,662],{"class":163},[33,27567,12624],{"class":167},[33,27569,27570],{"class":35,"line":229},[33,27571,92],{"emptyLinePlaceholder":91},[33,27573,27574],{"class":35,"line":235},[33,27575,27576],{"class":39},"    # Strip currency\u002Fgrouping noise, then coerce; junk becomes NaN, not a crash\n",[33,27578,27579,27582,27584,27586,27588],{"class":35,"line":250},[33,27580,27581],{"class":167},"    
df[",[33,27583,4106],{"class":54},[33,27585,763],{"class":167},[33,27587,242],{"class":163},[33,27589,1415],{"class":167},[33,27591,27592,27594,27596,27599,27601,27604,27606,27608,27611,27613,27615,27617,27619,27621,27623,27625],{"class":35,"line":266},[33,27593,10902],{"class":167},[33,27595,4106],{"class":54},[33,27597,27598],{"class":167},"].astype(",[33,27600,1053],{"class":50},[33,27602,27603],{"class":167},").str.replace(",[33,27605,11977],{"class":163},[33,27607,274],{"class":54},[33,27609,27610],{"class":50},"[,$\\s]",[33,27612,274],{"class":54},[33,27614,365],{"class":167},[33,27616,3198],{"class":54},[33,27618,365],{"class":167},[33,27620,11993],{"class":238},[33,27622,242],{"class":163},[33,27624,855],{"class":50},[33,27626,221],{"class":167},[33,27628,27629],{"class":35,"line":290},[33,27630,1202],{"class":167},[33,27632,27633,27635,27637,27639,27641,27644,27646,27648,27650,27652,27654],{"class":35,"line":295},[33,27634,27581],{"class":167},[33,27636,4106],{"class":54},[33,27638,763],{"class":167},[33,27640,242],{"class":163},[33,27642,27643],{"class":167}," pd.to_numeric(df[",[33,27645,4106],{"class":54},[33,27647,8314],{"class":167},[33,27649,8317],{"class":238},[33,27651,242],{"class":163},[33,27653,12107],{"class":54},[33,27655,221],{"class":167},[33,27657,27658,27660,27662,27664,27666,27669,27671,27673,27675,27677,27679,27681,27684,27686,27688],{"class":35,"line":300},[33,27659,27581],{"class":167},[33,27661,4101],{"class":54},[33,27663,763],{"class":167},[33,27665,242],{"class":163},[33,27667,27668],{"class":167}," 
pd.to_datetime(df[",[33,27670,4101],{"class":54},[33,27672,8314],{"class":167},[33,27674,8317],{"class":238},[33,27676,242],{"class":163},[33,27678,12107],{"class":54},[33,27680,365],{"class":167},[33,27682,27683],{"class":238},"dayfirst",[33,27685,242],{"class":163},[33,27687,902],{"class":50},[33,27689,221],{"class":167},[33,27691,27692],{"class":35,"line":317},[33,27693,92],{"emptyLinePlaceholder":91},[33,27695,27696,27699,27701,27704,27706,27709,27711,27713,27715],{"class":35,"line":332},[33,27697,27698],{"class":167},"    bad ",[33,27700,242],{"class":163},[33,27702,27703],{"class":167}," df[df[",[33,27705,4106],{"class":54},[33,27707,27708],{"class":167},"].isna() ",[33,27710,7654],{"class":163},[33,27712,7935],{"class":167},[33,27714,4101],{"class":54},[33,27716,27717],{"class":167},"].isna()]\n",[33,27719,27720,27722,27724],{"class":35,"line":347},[33,27721,617],{"class":163},[33,27723,620],{"class":163},[33,27725,27726],{"class":167}," bad.empty:\n",[33,27728,27729,27731,27733,27736],{"class":35,"line":374},[33,27730,7930],{"class":167},[33,27732,242],{"class":163},[33,27734,27735],{"class":167}," df.drop(bad.index)                    ",[33,27737,27738],{"class":39},"# quarantine, don't keep 
silently\n",[33,27740,27741],{"class":35,"line":397},[33,27742,92],{"emptyLinePlaceholder":91},[33,27744,27745,27747,27749,27751,27753,27755,27757,27759,27761],{"class":35,"line":653},[33,27746,4118],{"class":167},[33,27748,242],{"class":163},[33,27750,4129],{"class":50},[33,27752,602],{"class":167},[33,27754,27469],{"class":50},[33,27756,1649],{"class":167},[33,27758,4126],{"class":163},[33,27760,4129],{"class":50},[33,27762,4132],{"class":167},[33,27764,27765,27767],{"class":35,"line":667},[33,27766,617],{"class":163},[33,27768,4139],{"class":167},[33,27770,27771,27773,27775,27777,27779,27782,27784,27786,27788,27790],{"class":35,"line":675},[33,27772,4051],{"class":163},[33,27774,4054],{"class":50},[33,27776,602],{"class":167},[33,27778,4059],{"class":163},[33,27780,27781],{"class":54},"\"Schema mismatch, missing columns: ",[33,27783,1115],{"class":50},[33,27785,4157],{"class":167},[33,27787,1121],{"class":50},[33,27789,274],{"class":54},[33,27791,221],{"class":167},[33,27793,27794,27796,27798,27800,27803,27805,27807,27809],{"class":35,"line":689},[33,27795,1332],{"class":163},[33,27797,7935],{"class":167},[33,27799,27469],{"class":50},[33,27801,27802],{"class":167},"].reset_index(",[33,27804,10868],{"class":238},[33,27806,242],{"class":163},[33,27808,855],{"class":50},[33,27810,221],{"class":167},[14,27812,27813,27814,27817,27818,27821,27822,27825,27826,27828,27829,27831],{},"The load-bearing pattern is ",[30,27815,27816],{},"errors=\"coerce\""," plus an explicit quarantine: a malformed ",[30,27819,27820],{},"\"1,234\""," or a stray ",[30,27823,27824],{},"\"N\u002FA\""," becomes ",[30,27827,8884],{},", gets logged and dropped, and never reaches a report as a silent ",[30,27830,748],{},". Normalizing headers — lowercase, underscored — is what lets sources with different column casing line up for consolidation.",[18,27833,27835],{"id":27834},"consolidation","Consolidation",[14,27837,27838,27839,27841,27842,27845],{},"A single source is rarely the whole picture. 
The PDF gives you invoice rows; the ERP CSV gives you the customer master; the spreadsheet gives you cost centers. Consolidation joins them into one dataset, and the operation you pick changes the answer. Use ",[30,27840,8366],{}," to stack same-schema frames from many files (a month of invoice PDFs); use ",[30,27843,27844],{},"merge"," to enrich rows with a lookup keyed on a shared id. Then dedup, because a reprocessed file otherwise double-counts.",[23,27847,27849],{"className":126,"code":27848,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef consolidate(frames: list[pd.DataFrame], master: pd.DataFrame | None = None) -> pd.DataFrame:\n    \"\"\"Stack per-source frames, enrich via a left join, then dedup on id.\"\"\"\n    frames = [f for f in frames if not f.empty]\n    if not frames:\n        return pd.DataFrame()\n\n    combined = pd.concat(frames, ignore_index=True)            # stack same-schema rows\n\n    if master is not None:\n        # Normalize the join key on BOTH sides or the merge silently yields NaN\n        combined[\"invoice_id\"] = combined[\"invoice_id\"].str.strip()\n        master = master.assign(invoice_id=master[\"invoice_id\"].str.strip())\n        combined = combined.merge(master, on=\"invoice_id\", how=\"left\")\n\n    return combined.drop_duplicates(subset=[\"invoice_id\"], 
keep=\"first\").reset_index(drop=True)\n",[30,27850,27851,27855,27865,27869,27873,27893,27898,27922,27930,27936,27940,27960,27964,27979,27984,28002,28025,28053,28057],{"__ignoreMap":28},[33,27852,27853],{"class":35,"line":36},[33,27854,8895],{"class":39},[33,27856,27857,27859,27861,27863],{"class":35,"line":43},[33,27858,164],{"class":163},[33,27860,492],{"class":167},[33,27862,495],{"class":163},[33,27864,498],{"class":167},[33,27866,27867],{"class":35,"line":61},[33,27868,92],{"emptyLinePlaceholder":91},[33,27870,27871],{"class":35,"line":73},[33,27872,92],{"emptyLinePlaceholder":91},[33,27874,27875,27877,27880,27883,27885,27887,27889,27891],{"class":35,"line":88},[33,27876,562],{"class":163},[33,27878,27879],{"class":46}," consolidate",[33,27881,27882],{"class":167},"(frames: list[pd.DataFrame], master: pd.DataFrame ",[33,27884,7654],{"class":163},[33,27886,7657],{"class":50},[33,27888,212],{"class":163},[33,27890,7657],{"class":50},[33,27892,7668],{"class":167},[33,27894,27895],{"class":35,"line":95},[33,27896,27897],{"class":54},"    \"\"\"Stack per-source frames, enrich via a left join, then dedup on id.\"\"\"\n",[33,27899,27900,27902,27904,27907,27909,27911,27913,27915,27917,27919],{"class":35,"line":101},[33,27901,584],{"class":167},[33,27903,242],{"class":163},[33,27905,27906],{"class":167}," [f ",[33,27908,6124],{"class":163},[33,27910,8832],{"class":167},[33,27912,662],{"class":163},[33,27914,14035],{"class":167},[33,27916,2491],{"class":163},[33,27918,620],{"class":163},[33,27920,27921],{"class":167}," 
f.empty]\n",[33,27923,27924,27926,27928],{"class":35,"line":171},[33,27925,617],{"class":163},[33,27927,620],{"class":163},[33,27929,816],{"class":167},[33,27931,27932,27934],{"class":35,"line":179},[33,27933,1659],{"class":163},[33,27935,7721],{"class":167},[33,27937,27938],{"class":35,"line":187},[33,27939,92],{"emptyLinePlaceholder":91},[33,27941,27942,27944,27946,27948,27950,27952,27954,27957],{"class":35,"line":201},[33,27943,842],{"class":167},[33,27945,242],{"class":163},[33,27947,847],{"class":167},[33,27949,850],{"class":238},[33,27951,242],{"class":163},[33,27953,855],{"class":50},[33,27955,27956],{"class":167},")            ",[33,27958,27959],{"class":39},"# stack same-schema rows\n",[33,27961,27962],{"class":35,"line":206},[33,27963,92],{"emptyLinePlaceholder":91},[33,27965,27966,27968,27971,27973,27975,27977],{"class":35,"line":224},[33,27967,617],{"class":163},[33,27969,27970],{"class":167}," master ",[33,27972,3847],{"class":163},[33,27974,620],{"class":163},[33,27976,7657],{"class":50},[33,27978,574],{"class":167},[33,27980,27981],{"class":35,"line":229},[33,27982,27983],{"class":39},"        # Normalize the join key on BOTH sides or the merge silently yields NaN\n",[33,27985,27986,27989,27991,27993,27995,27998,28000],{"class":35,"line":235},[33,27987,27988],{"class":167},"        combined[",[33,27990,27353],{"class":54},[33,27992,763],{"class":167},[33,27994,242],{"class":163},[33,27996,27997],{"class":167}," combined[",[33,27999,27353],{"class":54},[33,28001,10852],{"class":167},[33,28003,28004,28007,28009,28012,28015,28017,28020,28022],{"class":35,"line":250},[33,28005,28006],{"class":167},"        master ",[33,28008,242],{"class":163},[33,28010,28011],{"class":167}," 
master.assign(",[33,28013,28014],{"class":238},"invoice_id",[33,28016,242],{"class":163},[33,28018,28019],{"class":167},"master[",[33,28021,27353],{"class":54},[33,28023,28024],{"class":167},"].str.strip())\n",[33,28026,28027,28030,28032,28035,28037,28039,28041,28043,28046,28048,28051],{"class":35,"line":266},[33,28028,28029],{"class":167},"        combined ",[33,28031,242],{"class":163},[33,28033,28034],{"class":167}," combined.merge(master, ",[33,28036,2091],{"class":238},[33,28038,242],{"class":163},[33,28040,27353],{"class":54},[33,28042,365],{"class":167},[33,28044,28045],{"class":238},"how",[33,28047,242],{"class":163},[33,28049,28050],{"class":54},"\"left\"",[33,28052,221],{"class":167},[33,28054,28055],{"class":35,"line":290},[33,28056,92],{"emptyLinePlaceholder":91},[33,28058,28059,28061,28064,28067,28069,28071,28073,28075,28078,28080,28083,28086,28088,28090,28092],{"class":35,"line":295},[33,28060,1332],{"class":163},[33,28062,28063],{"class":167}," combined.drop_duplicates(",[33,28065,28066],{"class":238},"subset",[33,28068,242],{"class":163},[33,28070,8309],{"class":167},[33,28072,27353],{"class":54},[33,28074,8314],{"class":167},[33,28076,28077],{"class":238},"keep",[33,28079,242],{"class":163},[33,28081,28082],{"class":54},"\"first\"",[33,28084,28085],{"class":167},").reset_index(",[33,28087,10868],{"class":238},[33,28089,242],{"class":163},[33,28091,855],{"class":50},[33,28093,221],{"class":167},[14,28095,28096,28097,28099,28100,28103,28104,1351,28107,28110,28111,28115,28116,3035],{},"Two traps dominate. A join key with inconsistent whitespace or casing produces all-",[30,28098,8884],{}," enrichment columns from a merge that ",[26245,28101,28102],{},"looks"," like it worked — normalize the key on both sides first, as the code does. 
And when two frames share a non-key column name, pandas appends ",[30,28105,28106],{},"_x",[30,28108,28109],{},"_y"," suffixes that quietly break downstream references; that exact failure and its fix live in ",[940,28112,28114],{"href":28113},"\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Ffix-pandas-merge-overlapping-columns\u002F","Fix pandas merge Overlapping Column Suffixes",". The broader patterns for combining many inputs are in ",[940,28117,28119],{"href":28118},"\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002F","Merging Multiple Spreadsheets",[18,28121,28123],{"id":28122},"output-serialization","Output & serialization",[14,28125,28126,28127,28130,28131,28134,28135,3035],{},"The consolidated frame fans out two ways: machine-readable ",[26245,28128,28129],{},"data"," for downstream systems, and formatted ",[26245,28132,28133],{},"documents"," for humans. The generation stage is the inverse of extraction — you now place clean data into a structured target. This fan-out is the dedicated subject of ",[940,28136,4204],{"href":4203},[14,28138,28139,28140,28143,28144,28148],{},"For the data path, write CSV or Parquet. Always pass ",[30,28141,28142],{},"index=False"," so pandas does not emit a phantom index column — the fix detailed in ",[940,28145,28147],{"href":28146},"\u002Fpython-for-excel-csv-data-processing\u002Fexporting-data-to-csv-formats\u002Ffix-pandas-to-csv-extra-index-column\u002F","Fix pandas to_csv Adding an Extra Index Column"," — and prefer Parquet when a BI tool consumes the output, since it preserves dtypes CSV flattens to strings.",[14,28150,28151,28152,28154,28155,28157,28158,28160],{},"For the human path, fan the same frame out to three formats. 
A formatted Excel workbook with ",[940,28153,22009],{"href":6935}," for analysts, a templated Word document via ",[940,28156,18041],{"href":18040}," for mail-merge correspondence, and a pixel-precise PDF via ",[940,28159,19002],{"href":19001}," for signed records.",[23,28162,28164],{"className":126,"code":28163,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl python-docx reportlab pyarrow\nfrom pathlib import Path\n\nimport pandas as pd\nfrom docx import Document\nfrom reportlab.lib.pagesizes import A4\nfrom reportlab.pdfgen import canvas\n\n\ndef fan_out(df: pd.DataFrame, out_dir: Path) -> None:\n    \"\"\"Emit the consolidated frame as data + Excel + Word + PDF artifacts.\"\"\"\n    out_dir.mkdir(parents=True, exist_ok=True)\n\n    # Data artifacts for downstream systems\n    df.to_csv(out_dir \u002F \"invoices.csv\", index=False, encoding=\"utf-8\")  # no phantom index\n    df.to_parquet(out_dir \u002F \"invoices.parquet\", index=False)           # dtypes preserved\n\n    # Excel for analysts (openpyxl engine writes the styled workbook)\n    df.to_excel(out_dir \u002F \"invoices.xlsx\", index=False, engine=\"openpyxl\")\n\n    # Word summary for correspondence\n    doc = Document()\n    doc.add_heading(\"Invoice Summary\", level=1)\n    doc.add_paragraph(f\"Records: {len(df)}    Total: {df['amount'].sum():,.2f}\")\n    doc.save(out_dir \u002F \"summary.docx\")\n\n    # PDF record, placed at explicit canvas coordinates\n    c = canvas.Canvas(str(out_dir \u002F \"summary.pdf\"), pagesize=A4)\n    _, height = A4\n    c.setFont(\"Helvetica-Bold\", 16)\n    c.drawString(50, height - 60, \"Invoice Summary\")\n    c.setFont(\"Helvetica\", 11)\n    c.drawString(50, height - 90, f\"Records: {len(df)}   Total: {df['amount'].sum():,.2f}\")\n    
c.save()\n",[30,28165,28166,28171,28181,28185,28195,28205,28215,28227,28231,28235,28249,28254,28275,28279,28284,28315,28339,28343,28348,28376,28380,28385,28393,28410,28447,28459,28463,28468,28497,28506,28519,28540,28553,28597],{"__ignoreMap":28},[33,28167,28168],{"class":35,"line":36},[33,28169,28170],{"class":39},"# pip install pandas openpyxl python-docx reportlab pyarrow\n",[33,28172,28173,28175,28177,28179],{"class":35,"line":43},[33,28174,190],{"class":163},[33,28176,193],{"class":167},[33,28178,164],{"class":163},[33,28180,198],{"class":167},[33,28182,28183],{"class":35,"line":61},[33,28184,92],{"emptyLinePlaceholder":91},[33,28186,28187,28189,28191,28193],{"class":35,"line":73},[33,28188,164],{"class":163},[33,28190,492],{"class":167},[33,28192,495],{"class":163},[33,28194,498],{"class":167},[33,28196,28197,28199,28201,28203],{"class":35,"line":88},[33,28198,190],{"class":163},[33,28200,18092],{"class":167},[33,28202,164],{"class":163},[33,28204,18097],{"class":167},[33,28206,28207,28209,28211,28213],{"class":35,"line":95},[33,28208,190],{"class":163},[33,28210,19044],{"class":167},[33,28212,164],{"class":163},[33,28214,19049],{"class":167},[33,28216,28217,28219,28222,28224],{"class":35,"line":101},[33,28218,190],{"class":163},[33,28220,28221],{"class":167}," reportlab.pdfgen ",[33,28223,164],{"class":163},[33,28225,28226],{"class":167}," canvas\n",[33,28228,28229],{"class":35,"line":171},[33,28230,92],{"emptyLinePlaceholder":91},[33,28232,28233],{"class":35,"line":179},[33,28234,92],{"emptyLinePlaceholder":91},[33,28236,28237,28239,28242,28245,28247],{"class":35,"line":187},[33,28238,562],{"class":163},[33,28240,28241],{"class":46}," fan_out",[33,28243,28244],{"class":167},"(df: pd.DataFrame, out_dir: Path) -> ",[33,28246,571],{"class":50},[33,28248,574],{"class":167},[33,28250,28251],{"class":35,"line":201},[33,28252,28253],{"class":54},"    \"\"\"Emit the consolidated frame as data + Excel + Word + PDF 
artifacts.\"\"\"\n",[33,28255,28256,28259,28261,28263,28265,28267,28269,28271,28273],{"class":35,"line":206},[33,28257,28258],{"class":167},"    out_dir.mkdir(",[33,28260,869],{"class":238},[33,28262,242],{"class":163},[33,28264,855],{"class":50},[33,28266,365],{"class":167},[33,28268,878],{"class":238},[33,28270,242],{"class":163},[33,28272,855],{"class":50},[33,28274,221],{"class":167},[33,28276,28277],{"class":35,"line":224},[33,28278,92],{"emptyLinePlaceholder":91},[33,28280,28281],{"class":35,"line":229},[33,28282,28283],{"class":39},"    # Data artifacts for downstream systems\n",[33,28285,28286,28289,28291,28294,28296,28298,28300,28302,28304,28306,28308,28310,28312],{"class":35,"line":235},[33,28287,28288],{"class":167},"    df.to_csv(out_dir ",[33,28290,1351],{"class":163},[33,28292,28293],{"class":54}," \"invoices.csv\"",[33,28295,365],{"class":167},[33,28297,897],{"class":238},[33,28299,242],{"class":163},[33,28301,902],{"class":50},[33,28303,365],{"class":167},[33,28305,27249],{"class":238},[33,28307,242],{"class":163},[33,28309,1195],{"class":54},[33,28311,10922],{"class":167},[33,28313,28314],{"class":39},"# no phantom index\n",[33,28316,28317,28320,28322,28325,28327,28329,28331,28333,28336],{"class":35,"line":250},[33,28318,28319],{"class":167},"    df.to_parquet(out_dir ",[33,28321,1351],{"class":163},[33,28323,28324],{"class":54}," \"invoices.parquet\"",[33,28326,365],{"class":167},[33,28328,897],{"class":238},[33,28330,242],{"class":163},[33,28332,902],{"class":50},[33,28334,28335],{"class":167},")           ",[33,28337,28338],{"class":39},"# dtypes preserved\n",[33,28340,28341],{"class":35,"line":266},[33,28342,92],{"emptyLinePlaceholder":91},[33,28344,28345],{"class":35,"line":290},[33,28346,28347],{"class":39},"    # Excel for analysts (openpyxl engine writes the styled workbook)\n",[33,28349,28350,28353,28355,28358,28360,28362,28364,28366,28368,28370,28372,28374],{"class":35,"line":295},[33,28351,28352],{"class":167},"    df.to_excel(out_dir 
",[33,28354,1351],{"class":163},[33,28356,28357],{"class":54}," \"invoices.xlsx\"",[33,28359,365],{"class":167},[33,28361,897],{"class":238},[33,28363,242],{"class":163},[33,28365,902],{"class":50},[33,28367,365],{"class":167},[33,28369,17351],{"class":238},[33,28371,242],{"class":163},[33,28373,17356],{"class":54},[33,28375,221],{"class":167},[33,28377,28378],{"class":35,"line":300},[33,28379,92],{"emptyLinePlaceholder":91},[33,28381,28382],{"class":35,"line":317},[33,28383,28384],{"class":39},"    # Word summary for correspondence\n",[33,28386,28387,28389,28391],{"class":35,"line":332},[33,28388,18224],{"class":167},[33,28390,242],{"class":163},[33,28392,18229],{"class":167},[33,28394,28395,28397,28400,28402,28404,28406,28408],{"class":35,"line":347},[33,28396,18591],{"class":167},[33,28398,28399],{"class":54},"\"Invoice Summary\"",[33,28401,365],{"class":167},[33,28403,18267],{"class":238},[33,28405,242],{"class":163},[33,28407,734],{"class":50},[33,28409,221],{"class":167},[33,28411,28412,28415,28417,28420,28422,28424,28426,28429,28431,28433,28436,28438,28441,28443,28445],{"class":35,"line":374},[33,28413,28414],{"class":167},"    doc.add_paragraph(",[33,28416,4059],{"class":163},[33,28418,28419],{"class":54},"\"Records: ",[33,28421,4065],{"class":50},[33,28423,4068],{"class":167},[33,28425,1121],{"class":50},[33,28427,28428],{"class":54},"    Total: ",[33,28430,1115],{"class":50},[33,28432,11038],{"class":167},[33,28434,28435],{"class":54},"'amount'",[33,28437,20640],{"class":167},[33,28439,28440],{"class":163},":,.2f",[33,28442,1121],{"class":50},[33,28444,274],{"class":54},[33,28446,221],{"class":167},[33,28448,28449,28452,28454,28457],{"class":35,"line":397},[33,28450,28451],{"class":167},"    doc.save(out_dir ",[33,28453,1351],{"class":163},[33,28455,28456],{"class":54}," 
\"summary.docx\"",[33,28458,221],{"class":167},[33,28460,28461],{"class":35,"line":653},[33,28462,92],{"emptyLinePlaceholder":91},[33,28464,28465],{"class":35,"line":667},[33,28466,28467],{"class":39},"    # PDF record, placed at explicit canvas coordinates\n",[33,28469,28470,28473,28475,28478,28480,28483,28485,28488,28490,28492,28494],{"class":35,"line":675},[33,28471,28472],{"class":167},"    c ",[33,28474,242],{"class":163},[33,28476,28477],{"class":167}," canvas.Canvas(",[33,28479,1053],{"class":50},[33,28481,28482],{"class":167},"(out_dir ",[33,28484,1351],{"class":163},[33,28486,28487],{"class":54}," \"summary.pdf\"",[33,28489,18525],{"class":167},[33,28491,20091],{"class":238},[33,28493,242],{"class":163},[33,28495,28496],{"class":167},"A4)\n",[33,28498,28499,28502,28504],{"class":35,"line":689},[33,28500,28501],{"class":167},"    _, height ",[33,28503,242],{"class":163},[33,28505,19049],{"class":167},[33,28507,28508,28511,28513,28515,28517],{"class":35,"line":703},[33,28509,28510],{"class":167},"    c.setFont(",[33,28512,19908],{"class":54},[33,28514,365],{"class":167},[33,28516,24213],{"class":50},[33,28518,221],{"class":167},[33,28520,28521,28524,28526,28529,28531,28534,28536,28538],{"class":35,"line":714},[33,28522,28523],{"class":167},"    c.drawString(",[33,28525,2680],{"class":50},[33,28527,28528],{"class":167},", height ",[33,28530,4126],{"class":163},[33,28532,28533],{"class":50}," 
60",[33,28535,365],{"class":167},[33,28537,28399],{"class":54},[33,28539,221],{"class":167},[33,28541,28542,28544,28547,28549,28551],{"class":35,"line":723},[33,28543,28510],{"class":167},[33,28545,28546],{"class":54},"\"Helvetica\"",[33,28548,365],{"class":167},[33,28550,17260],{"class":50},[33,28552,221],{"class":167},[33,28554,28555,28557,28559,28561,28563,28566,28568,28570,28572,28574,28576,28578,28581,28583,28585,28587,28589,28591,28593,28595],{"class":35,"line":754},[33,28556,28523],{"class":167},[33,28558,2680],{"class":50},[33,28560,28528],{"class":167},[33,28562,4126],{"class":163},[33,28564,28565],{"class":50}," 90",[33,28567,365],{"class":167},[33,28569,4059],{"class":163},[33,28571,28419],{"class":54},[33,28573,4065],{"class":50},[33,28575,4068],{"class":167},[33,28577,1121],{"class":50},[33,28579,28580],{"class":54},"   Total: ",[33,28582,1115],{"class":50},[33,28584,11038],{"class":167},[33,28586,28435],{"class":54},[33,28588,20640],{"class":167},[33,28590,28440],{"class":163},[33,28592,1121],{"class":50},[33,28594,274],{"class":54},[33,28596,221],{"class":167},[33,28598,28599],{"class":35,"line":771},[33,28600,28601],{"class":167},"    c.save()\n",[14,28603,28604,28605,28609,28610,28612],{},"Two output gotchas recur. ReportLab silently drops non-Latin glyphs unless you register a Unicode TTF — see ",[940,28606,28608],{"href":28607},"\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Ffix-reportlab-unicode-font-errors\u002F","Fix ReportLab Unicode Font Errors",". And templated Word output through ",[940,28611,26185],{"href":18040}," scales the per-record fan-out to one document per row when you need individualized letters rather than a single summary.",[18,28614,28616],{"id":28615},"production-hardening","Production hardening",[14,28618,28619,28620,3035],{},"A pipeline that runs once on your laptop is a demo. 
Production means it runs unattended on a schedule, survives the one corrupt file in a batch of a thousand, retries transient failures, refuses to double-count on a re-run, and tells you when something breaks. Five disciplines make that real, and they are the dedicated subject of ",[940,28621,5],{"href":26465},[14,28623,28624,28627],{},[1974,28625,28626],{},"Scheduling."," Start with cron — already present, reboot-survivable, one line. Graduate to Prefect when you need a task graph or a failure UI.",[23,28629,28632],{"className":28630,"code":28631,"language":2000,"meta":28},[1998],"# crontab -e  — run the pipeline every weekday at 03:00\n0 3 * * 1-5  cd \u002Fopt\u002Fpipeline && .venv\u002Fbin\u002Fpython run_pipeline.py --in inbox --out out >> logs\u002Fcron.log 2>&1\n",[30,28633,28631],{"__ignoreMap":28},[14,28635,28636,28639,28640,28643],{},[1974,28637,28638],{},"Retries, idempotency, and alerting."," Wrap each file so one bad input cannot abort the batch, retry I\u002FO-bound steps with exponential backoff, key outputs on a stable identifier so a re-run overwrites instead of duplicating, and alert on the failure count at the end so a partial run is ",[26245,28641,28642],{},"visible",", not silent.",[23,28645,28647],{"className":126,"code":28646,"language":47,"meta":28,"style":28},"# pip install (stdlib only)\nimport logging\nimport time\nfrom pathlib import Path\nfrom typing import Callable\n\nlog = logging.getLogger(\"pipeline\")\n\n\ndef with_retry(fn: Callable, *args, attempts: int = 3, base_delay: float = 1.0):\n    \"\"\"Retry a callable with exponential backoff; re-raise after the last try.\"\"\"\n    for attempt in range(1, attempts + 1):\n        try:\n            return fn(*args)\n        except Exception as exc:\n            if attempt == attempts:\n                raise\n            delay = base_delay * (2 ** (attempt - 1))\n            log.warning(\"Attempt %d failed (%s); retrying in %.1fs\", attempt, exc, delay)\n            
time.sleep(delay)\n\n\ndef run_batch(src_dir: Path, process: Callable[[Path], None]) -> int:\n    \"\"\"Process every source, isolating per-file failures. Returns failure count.\"\"\"\n    failures = 0\n    sources = sorted(src_dir.glob(\"*.*\"))\n    for src in sources:\n        try:\n            with_retry(process, src)               # idempotent: process overwrites by id\n        except Exception as exc:\n            failures += 1\n            log.error(\"Permanently failed on %s: %s\", src.name, exc)\n    log.info(\"Batch done: %d ok, %d failed\", len(sources) - failures, failures)\n    if failures:\n        log.warning(\"ALERT: %d file(s) need manual review\", failures)  # hook email\u002FSlack here\n    return failures\n",[30,28648,28649,28653,28659,28665,28675,28687,28691,28704,28708,28712,28745,28750,28772,28778,28790,28800,28810,28815,28843,28868,28873,28877,28881,28900,28905,28915,28933,28945,28951,28959,28969,28979,28998,29028,29035,29054],{"__ignoreMap":28},[33,28650,28651],{"class":35,"line":36},[33,28652,26734],{"class":39},[33,28654,28655,28657],{"class":35,"line":43},[33,28656,164],{"class":163},[33,28658,184],{"class":167},[33,28660,28661,28663],{"class":35,"line":61},[33,28662,164],{"class":163},[33,28664,1689],{"class":167},[33,28666,28667,28669,28671,28673],{"class":35,"line":73},[33,28668,190],{"class":163},[33,28670,193],{"class":167},[33,28672,164],{"class":163},[33,28674,198],{"class":167},[33,28676,28677,28679,28682,28684],{"class":35,"line":88},[33,28678,190],{"class":163},[33,28680,28681],{"class":167}," typing ",[33,28683,164],{"class":163},[33,28685,28686],{"class":167}," Callable\n",[33,28688,28689],{"class":35,"line":95},[33,28690,92],{"emptyLinePlaceholder":91},[33,28692,28693,28696,28698,28700,28702],{"class":35,"line":101},[33,28694,28695],{"class":167},"log 
",[33,28697,242],{"class":163},[33,28699,544],{"class":167},[33,28701,4978],{"class":54},[33,28703,221],{"class":167},[33,28705,28706],{"class":35,"line":171},[33,28707,92],{"emptyLinePlaceholder":91},[33,28709,28710],{"class":35,"line":179},[33,28711,92],{"emptyLinePlaceholder":91},[33,28713,28714,28716,28719,28722,28724,28727,28729,28731,28733,28736,28738,28740,28743],{"class":35,"line":187},[33,28715,562],{"class":163},[33,28717,28718],{"class":46}," with_retry",[33,28720,28721],{"class":167},"(fn: Callable, ",[33,28723,1769],{"class":163},[33,28725,28726],{"class":167},"args, attempts: ",[33,28728,1059],{"class":50},[33,28730,212],{"class":163},[33,28732,1714],{"class":50},[33,28734,28735],{"class":167},", base_delay: ",[33,28737,1720],{"class":50},[33,28739,212],{"class":163},[33,28741,28742],{"class":50}," 1.0",[33,28744,1737],{"class":167},[33,28746,28747],{"class":35,"line":201},[33,28748,28749],{"class":54},"    \"\"\"Retry a callable with exponential backoff; re-raise after the last try.\"\"\"\n",[33,28751,28752,28754,28756,28758,28760,28762,28764,28766,28768,28770],{"class":35,"line":206},[33,28753,656],{"class":163},[33,28755,1796],{"class":167},[33,28757,662],{"class":163},[33,28759,1801],{"class":50},[33,28761,602],{"class":167},[33,28763,734],{"class":50},[33,28765,1808],{"class":167},[33,28767,1811],{"class":163},[33,28769,1814],{"class":50},[33,28771,1737],{"class":167},[33,28773,28774,28776],{"class":35,"line":224},[33,28775,670],{"class":163},[33,28777,574],{"class":167},[33,28779,28780,28783,28785,28787],{"class":35,"line":229},[33,28781,28782],{"class":163},"            
return",[33,28784,1832],{"class":167},[33,28786,1769],{"class":163},[33,28788,28789],{"class":167},"args)\n",[33,28791,28792,28794,28796,28798],{"class":35,"line":235},[33,28793,780],{"class":163},[33,28795,783],{"class":50},[33,28797,1852],{"class":163},[33,28799,1855],{"class":167},[33,28801,28802,28804,28806,28808],{"class":35,"line":250},[33,28803,5995],{"class":163},[33,28805,1796],{"class":167},[33,28807,1865],{"class":163},[33,28809,1868],{"class":167},[33,28811,28812],{"class":35,"line":266},[33,28813,28814],{"class":163},"                raise\n",[33,28816,28817,28820,28822,28825,28827,28829,28831,28834,28837,28839,28841],{"class":35,"line":290},[33,28818,28819],{"class":167},"            delay ",[33,28821,242],{"class":163},[33,28823,28824],{"class":167}," base_delay ",[33,28826,1769],{"class":163},[33,28828,17583],{"class":167},[33,28830,1533],{"class":50},[33,28832,28833],{"class":163}," **",[33,28835,28836],{"class":167}," (attempt ",[33,28838,4126],{"class":163},[33,28840,1814],{"class":50},[33,28842,371],{"class":167},[33,28844,28845,28848,28851,28853,28856,28858,28861,28863,28865],{"class":35,"line":295},[33,28846,28847],{"class":167},"            log.warning(",[33,28849,28850],{"class":54},"\"Attempt ",[33,28852,916],{"class":50},[33,28854,28855],{"class":54}," failed (",[33,28857,309],{"class":50},[33,28859,28860],{"class":54},"); retrying in ",[33,28862,1907],{"class":50},[33,28864,1910],{"class":54},[33,28866,28867],{"class":167},", attempt, exc, delay)\n",[33,28869,28870],{"class":35,"line":300},[33,28871,28872],{"class":167},"            time.sleep(delay)\n",[33,28874,28875],{"class":35,"line":317},[33,28876,92],{"emptyLinePlaceholder":91},[33,28878,28879],{"class":35,"line":332},[33,28880,92],{"emptyLinePlaceholder":91},[33,28882,28883,28885,28888,28891,28893,28896,28898],{"class":35,"line":347},[33,28884,562],{"class":163},[33,28886,28887],{"class":46}," run_batch",[33,28889,28890],{"class":167},"(src_dir: Path, process: Callable[[Path], 
",[33,28892,571],{"class":50},[33,28894,28895],{"class":167},"]) -> ",[33,28897,1059],{"class":50},[33,28899,574],{"class":167},[33,28901,28902],{"class":35,"line":374},[33,28903,28904],{"class":54},"    \"\"\"Process every source, isolating per-file failures. Returns failure count.\"\"\"\n",[33,28906,28907,28910,28912],{"class":35,"line":397},[33,28908,28909],{"class":167},"    failures ",[33,28911,242],{"class":163},[33,28913,28914],{"class":50}," 0\n",[33,28916,28917,28920,28922,28925,28928,28931],{"class":35,"line":653},[33,28918,28919],{"class":167},"    sources ",[33,28921,242],{"class":163},[33,28923,28924],{"class":50}," sorted",[33,28926,28927],{"class":167},"(src_dir.glob(",[33,28929,28930],{"class":54},"\"*.*\"",[33,28932,371],{"class":167},[33,28934,28935,28937,28940,28942],{"class":35,"line":667},[33,28936,656],{"class":163},[33,28938,28939],{"class":167}," src ",[33,28941,662],{"class":163},[33,28943,28944],{"class":167}," sources:\n",[33,28946,28947,28949],{"class":35,"line":675},[33,28948,670],{"class":163},[33,28950,574],{"class":167},[33,28952,28953,28956],{"class":35,"line":689},[33,28954,28955],{"class":167},"            with_retry(process, src)               ",[33,28957,28958],{"class":39},"# idempotent: process overwrites by id\n",[33,28960,28961,28963,28965,28967],{"class":35,"line":703},[33,28962,780],{"class":163},[33,28964,783],{"class":50},[33,28966,1852],{"class":163},[33,28968,1855],{"class":167},[33,28970,28971,28974,28977],{"class":35,"line":714},[33,28972,28973],{"class":167},"            failures ",[33,28975,28976],{"class":163},"+=",[33,28978,17709],{"class":50},[33,28980,28981,28984,28987,28989,28991,28993,28995],{"class":35,"line":723},[33,28982,28983],{"class":167},"            log.error(",[33,28985,28986],{"class":54},"\"Permanently failed on ",[33,28988,309],{"class":50},[33,28990,2079],{"class":54},[33,28992,309],{"class":50},[33,28994,274],{"class":54},[33,28996,28997],{"class":167},", src.name, 
exc)\n",[33,28999,29000,29003,29006,29008,29011,29013,29016,29018,29020,29023,29025],{"class":35,"line":754},[33,29001,29002],{"class":167},"    log.info(",[33,29004,29005],{"class":54},"\"Batch done: ",[33,29007,916],{"class":50},[33,29009,29010],{"class":54}," ok, ",[33,29012,916],{"class":50},[33,29014,29015],{"class":54}," failed\"",[33,29017,365],{"class":167},[33,29019,928],{"class":50},[33,29021,29022],{"class":167},"(sources) ",[33,29024,4126],{"class":163},[33,29026,29027],{"class":167}," failures, failures)\n",[33,29029,29030,29032],{"class":35,"line":771},[33,29031,617],{"class":163},[33,29033,29034],{"class":167}," failures:\n",[33,29036,29037,29040,29043,29045,29048,29051],{"class":35,"line":777},[33,29038,29039],{"class":167},"        log.warning(",[33,29041,29042],{"class":54},"\"ALERT: ",[33,29044,916],{"class":50},[33,29046,29047],{"class":54}," file(s) need manual review\"",[33,29049,29050],{"class":167},", failures)  ",[33,29052,29053],{"class":39},"# hook email\u002FSlack here\n",[33,29055,29056,29058],{"class":35,"line":788},[33,29057,1332],{"class":163},[33,29059,29060],{"class":167}," failures\n",[14,29062,29063,29064,29067],{},"Idempotency is the subtle one: make ",[30,29065,29066],{},"process"," write its output keyed on a stable id (the invoice number, not the row position), so re-running yesterday's failed job simply overwrites and never double-counts. 
That single property is what makes \"just re-run it\" a safe operational response instead of a data-corruption risk.",[18,29069,29071],{"id":29070},"common-mistakes","Common mistakes",[4273,29073,29074,29085],{},[4276,29075,29076],{},[4279,29077,29078,29081,29083],{},[4282,29079,29080],{},"Issue",[4282,29082,4287],{},[4282,29084,4290],{},[4292,29086,29087,29102,29118,29133,29144,29159],{},[4279,29088,29089,29092,29095],{},[4297,29090,29091],{},"Re-run doubles every total",[4297,29093,29094],{},"Non-idempotent outputs keyed on position, not id",[4297,29096,29097,29098,29101],{},"Key outputs on a stable id; ",[30,29099,29100],{},"drop_duplicates"," and overwrite",[4279,29103,29104,29110,29113],{},[4297,29105,29106,29107,29109],{},"Enrichment columns all ",[30,29108,8884],{}," after merge",[4297,29111,29112],{},"Join key differs by whitespace\u002Fcasing across sources",[4297,29114,29115,29116],{},"Normalize the key on both sides before ",[30,29117,27844],{},[4279,29119,29120,29123,29126],{},[4297,29121,29122],{},"Leading zeros vanish from IDs",[4297,29124,29125],{},"pandas inferred a numeric dtype on read",[4297,29127,29128,29129,29132],{},"Read id columns with ",[30,29130,29131],{},"dtype=\"string\""," explicitly",[4279,29134,29135,29138,29141],{},[4297,29136,29137],{},"Nightly job dies silently at 3 a.m.",[4297,29139,29140],{},"No logging or alerting on the scheduled run",[4297,29142,29143],{},"Log every stage; alert on the end-of-batch failure count",[4279,29145,29146,29149,29152],{},[4297,29147,29148],{},"One corrupt file aborts the whole batch",[4297,29150,29151],{},"Unguarded loop, no per-file isolation",[4297,29153,29154,29155,29158],{},"Wrap each file in ",[30,29156,29157],{},"try\u002Fexcept","; count and continue",[4279,29160,29161,29164,29173],{},[4297,29162,29163],{},"Totals wrong but no error raised",[4297,29165,29166,29167,29169,29170,29172],{},"String cells like ",[30,29168,27820],{}," coerced to ",[30,29171,748],{}," or left 
untyped",[4297,29174,29175,29178,29179],{},[30,29176,29177],{},"pd.to_numeric(errors=\"coerce\")",", then quarantine ",[30,29180,8884],{},[18,29182,29184],{"id":29183},"frequently-asked-questions","Frequently asked questions",[14,29186,29187,29190,29191,3035],{},[1974,29188,29189],{},"Do I need an orchestration tool like Prefect or Airflow to start?","\nNo. A cron entry plus the logging and retry helpers above runs a robust nightly pipeline with zero orchestration dependencies. Reach for Prefect only when you need a dependency graph between tasks, backfills, or a UI to inspect failed runs — the trade-offs are laid out in ",[940,29192,5],{"href":26465},[14,29194,29195,29198],{},[1974,29196,29197],{},"Why route everything through pandas instead of passing data stage to stage directly?","\nBecause a dataframe is a typed, schema-checkable boundary every stage understands. Making pandas the single hub means each source reader and each output writer only has to speak one interface, so adding a new source or output format never touches the stages in between.",[14,29200,29201,29204,29205,29207,29208,29211],{},[1974,29202,29203],{},"How do I keep one bad PDF from killing the whole nightly batch?","\nWrap each file's processing in its own ",[30,29206,29157],{},", log it, increment a failure counter, and continue — then alert on the count at the end. The ",[30,29209,29210],{},"run_batch"," helper above implements exactly that, so a partial run is visible rather than a silent abort.",[14,29213,29214,29217,29218,29221,29222,29224],{},[1974,29215,29216],{},"Can the same pipeline output to Excel, Word, and PDF at once?","\nYes — that is the point of the fan-out stage. The consolidated frame is the single source; ",[30,29219,29220],{},"fan_out"," above writes all three from it. 
See ",[940,29223,4204],{"href":4203}," for the templated, paginated versions of each.",[14,29226,29227,29230],{},[1974,29228,29229],{},"What makes a re-run safe instead of a data-corruption risk?","\nIdempotency. Key every output on a stable business identifier (invoice number, account id) rather than row order, so re-processing overwrites the prior result instead of appending a duplicate. With that property, \"re-run the failed job\" is always safe.",[18,29232,6918],{"id":6917},[4211,29234,29235,29240,29245,29250,29255],{},[4214,29236,29237,29239],{},[940,29238,948],{"href":947}," — the extract-to-dataframe stage in depth, including multi-page tables.",[4214,29241,29242,29244],{},[940,29243,4204],{"href":4203}," — the Excel\u002FWord\u002FPDF fan-out with templating and pagination.",[4214,29246,29247,29249],{},[940,29248,5],{"href":26465}," — cron vs Prefect, structured logging, retries, and alerting.",[4214,29251,29252,29254],{},[940,29253,6943],{"href":6942}," — the extract and render stages owned end to end.",[4214,29256,29257,29259],{},[940,29258,26258],{"href":26257}," — the pandas transformation hub at the center of every pipeline.",[14,29261,6947,29262,3035],{},[940,29263,29264],{"href":1351},"Python Doc & Data Automation",[6953,29266,29267],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: 
var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":29269},[29270,29271,29272,29273,29274,29275,29276,29277,29278,29279,29280],{"id":26267,"depth":43,"text":26268},{"id":26468,"depth":43,"text":26469},{"id":26618,"depth":43,"text":26619},{"id":26940,"depth":43,"text":26941},{"id":27434,"depth":43,"text":27435},{"id":27834,"depth":43,"text":27835},{"id":28122,"depth":43,"text":28123},{"id":28615,"depth":43,"text":28616},{"id":29070,"depth":43,"text":29071},{"id":29183,"depth":43,"text":29184},{"id":6917,"depth":43,"text":6918},"Document Pipelines","Wire PDF extraction, pandas transformation, and Excel\u002FWord\u002FPDF generation into one scheduled, logged, idempotent Python pipeline that runs unattended end to end.",{},"\u002Fautomating-document-data-pipelines",{"title":6951,"description":29282},"Python Document & Data Pipeline 
Automation","automating-document-data-pipelines\u002Findex",[47,26247,6989,9630,29289],"etl","PNrUGQfSqrPXCRxrILlqkb5rkqDDHji7sViZm2BFTz0",{"id":4,"title":5,"body":29292,"breadcrumbTitle":6976,"canonical":6977,"date":6978,"description":6979,"draft":6980,"extension":6981,"image":6977,"meta":34948,"navigation":91,"path":6983,"robots":6977,"seo":34949,"seoTitle":6985,"stem":6986,"tags":34950,"updatedAt":6978,"__hash__":6991},{"type":7,"value":29293,"toc":34928},[29294,29296,29298,29300,29352,29354,29356,29360,29580,29582,29584,29586,29588,29988,29994,29996,30002,30312,30324,30326,30331,30809,30811,30813,30817,30831,30836,30846,30850,30870,30872,30876,31090,31096,31278,31280,31350,31352,31354,31356,31358,31634,31640,31642,32110,32114,32116,32118,32418,32420,32422,32640,32644,32646,32682,32684,32766,32768,34902,34904,34922,34926],[10,29295,5],{"id":12},[14,29297,16],{},[18,29299,21],{"id":20},[23,29301,29302],{"className":25,"code":26,"language":27,"meta":28,"style":28},[30,29303,29304,29308,29318,29326,29336,29340,29344],{"__ignoreMap":28},[33,29305,29306],{"class":35,"line":36},[33,29307,40],{"class":39},[33,29309,29310,29312,29314,29316],{"class":35,"line":43},[33,29311,47],{"class":46},[33,29313,51],{"class":50},[33,29315,55],{"class":54},[33,29317,58],{"class":54},[33,29319,29320,29322,29324],{"class":35,"line":61},[33,29321,64],{"class":50},[33,29323,67],{"class":54},[33,29325,70],{"class":39},[33,29327,29328,29330,29332,29334],{"class":35,"line":73},[33,29329,76],{"class":46},[33,29331,79],{"class":54},[33,29333,82],{"class":54},[33,29335,85],{"class":54},[33,29337,29338],{"class":35,"line":88},[33,29339,92],{"emptyLinePlaceholder":91},[33,29341,29342],{"class":35,"line":95},[33,29343,98],{"class":39},[33,29345,29346,29348,29350],{"class":35,"line":101},[33,29347,47],{"class":46},[33,29349,106],{"class":50},[33,29351,109],{"class":54},[14,29353,112],{},[18,29355,116],{"id":115},[14,29357,119,29358,123],{},[30,29359,122],{},[23,29361,29362],{"className":126,"code":
127,"language":47,"meta":28,"style":28},[30,29363,29364,29368,29372,29376,29380,29384,29388,29394,29400,29406,29416,29420,29432,29436,29440,29450,29462,29480,29484,29488,29500,29512,29524,29544,29564],{"__ignoreMap":28},[33,29365,29366],{"class":35,"line":36},[33,29367,134],{"class":39},[33,29369,29370],{"class":35,"line":43},[33,29371,139],{"class":54},[33,29373,29374],{"class":35,"line":61},[33,29375,144],{"class":54},[33,29377,29378],{"class":35,"line":73},[33,29379,149],{"class":54},[33,29381,29382],{"class":35,"line":88},[33,29383,154],{"class":54},[33,29385,29386],{"class":35,"line":95},[33,29387,139],{"class":54},[33,29389,29390,29392],{"class":35,"line":101},[33,29391,164],{"class":163},[33,29393,168],{"class":167},[33,29395,29396,29398],{"class":35,"line":171},[33,29397,164],{"class":163},[33,29399,176],{"class":167},[33,29401,29402,29404],{"class":35,"line":179},[33,29403,164],{"class":163},[33,29405,184],{"class":167},[33,29407,29408,29410,29412,29414],{"class":35,"line":187},[33,29409,190],{"class":163},[33,29411,193],{"class":167},[33,29413,164],{"class":163},[33,29415,198],{"class":167},[33,29417,29418],{"class":35,"line":201},[33,29419,92],{"emptyLinePlaceholder":91},[33,29421,29422,29424,29426,29428,29430],{"class":35,"line":206},[33,29423,209],{"class":50},[33,29425,212],{"class":163},[33,29427,215],{"class":167},[33,29429,218],{"class":54},[33,29431,221],{"class":167},[33,29433,29434],{"class":35,"line":224},[33,29435,92],{"emptyLinePlaceholder":91},[33,29437,29438],{"class":35,"line":229},[33,29439,232],{"class":167},[33,29441,29442,29444,29446,29448],{"class":35,"line":235},[33,29443,239],{"class":238},[33,29445,242],{"class":163},[33,29447,209],{"class":50},[33,29449,247],{"class":167},[33,29451,29452,29454,29456,29458,29460],{"class":35,"line":250},[33,29453,253],{"class":238},[33,29455,242],{"class":163},[33,29457,258],{"class":167},[33,29459,261],{"class":50},[33,29461,247],{"class":167},[33,29463,29464,29466,29468,29470,29472,29474,29476,294
78],{"class":35,"line":266},[33,29465,269],{"class":238},[33,29467,242],{"class":163},[33,29469,274],{"class":54},[33,29471,277],{"class":50},[33,29473,280],{"class":50},[33,29475,283],{"class":50},[33,29477,274],{"class":54},[33,29479,247],{"class":167},[33,29481,29482],{"class":35,"line":290},[33,29483,221],{"class":167},[33,29485,29486],{"class":35,"line":295},[33,29487,92],{"emptyLinePlaceholder":91},[33,29489,29490,29492,29494,29496,29498],{"class":35,"line":300},[33,29491,303],{"class":167},[33,29493,306],{"class":54},[33,29495,309],{"class":50},[33,29497,274],{"class":54},[33,29499,314],{"class":167},[33,29501,29502,29504,29506,29508,29510],{"class":35,"line":317},[33,29503,303],{"class":167},[33,29505,322],{"class":54},[33,29507,309],{"class":50},[33,29509,274],{"class":54},[33,29511,329],{"class":167},[33,29513,29514,29516,29518,29520,29522],{"class":35,"line":332},[33,29515,303],{"class":167},[33,29517,337],{"class":54},[33,29519,309],{"class":50},[33,29521,274],{"class":54},[33,29523,344],{"class":167},[33,29525,29526,29528,29530,29532,29534,29536,29538,29540,29542],{"class":35,"line":347},[33,29527,303],{"class":167},[33,29529,352],{"class":54},[33,29531,309],{"class":50},[33,29533,274],{"class":54},[33,29535,359],{"class":167},[33,29537,362],{"class":54},[33,29539,365],{"class":167},[33,29541,368],{"class":54},[33,29543,371],{"class":167},[33,29545,29546,29548,29550,29552,29554,29556,29558,29560,29562],{"class":35,"line":374},[33,29547,303],{"class":167},[33,29549,379],{"class":54},[33,29551,309],{"class":50},[33,29553,274],{"class":54},[33,29555,359],{"class":167},[33,29557,388],{"class":54},[33,29559,365],{"class":167},[33,29561,368],{"class":54},[33,29563,371],{"class":167},[33,29565,29566,29568,29570,29572,29574,29576,29578],{"class":35,"line":397},[33,29567,303],{"class":167},[33,29569,402],{"class":54},[33,29571,309],{"class":50},[33,29573,274],{"class":54},[33,29575,409],{"class":167},[33,29577,412],{"class":50},[33,29579,415],{"class":167},[14,2
9581,418],{},[18,29583,422],{"id":421},[424,29585,427],{"id":426},[14,29587,430],{},[23,29589,29590],{"className":126,"code":433,"language":47,"meta":28,"style":28},[30,29591,29592,29596,29600,29604,29608,29612,29618,29628,29632,29638,29648,29652,29664,29676,29680,29692,29696,29700,29712,29716,29724,29742,29750,29766,29770,29774,29784,29790,29800,29810,29818,29824,29846,29858,29862,29870,29882,29886,29894,29902,29906,29910,29926,29948,29964],{"__ignoreMap":28},[33,29593,29594],{"class":35,"line":36},[33,29595,440],{"class":39},[33,29597,29598],{"class":35,"line":43},[33,29599,139],{"class":54},[33,29601,29602],{"class":35,"line":61},[33,29603,449],{"class":54},[33,29605,29606],{"class":35,"line":73},[33,29607,454],{"class":54},[33,29609,29610],{"class":35,"line":88},[33,29611,139],{"class":54},[33,29613,29614,29616],{"class":35,"line":95},[33,29615,164],{"class":163},[33,29617,184],{"class":167},[33,29619,29620,29622,29624,29626],{"class":35,"line":101},[33,29621,190],{"class":163},[33,29623,193],{"class":167},[33,29625,164],{"class":163},[33,29627,198],{"class":167},[33,29629,29630],{"class":35,"line":171},[33,29631,92],{"emptyLinePlaceholder":91},[33,29633,29634,29636],{"class":35,"line":179},[33,29635,164],{"class":163},[33,29637,485],{"class":167},[33,29639,29640,29642,29644,29646],{"class":35,"line":187},[33,29641,164],{"class":163},[33,29643,492],{"class":167},[33,29645,495],{"class":163},[33,29647,498],{"class":167},[33,29649,29650],{"class":35,"line":201},[33,29651,92],{"emptyLinePlaceholder":91},[33,29653,29654,29656,29658,29660,29662],{"class":35,"line":206},[33,29655,507],{"class":50},[33,29657,212],{"class":163},[33,29659,215],{"class":167},[33,29661,514],{"class":54},[33,29663,221],{"class":167},[33,29665,29666,29668,29670,29672,29674],{"class":35,"line":224},[33,29667,521],{"class":50},[33,29669,212],{"class":163},[33,29671,215],{"class":167},[33,29673,528],{"class":54},[33,29675,221],{"class":167},[33,29677,29678],{"class":35,"line":229},[33,29679,92]
,{"emptyLinePlaceholder":91},[33,29681,29682,29684,29686,29688,29690],{"class":35,"line":235},[33,29683,539],{"class":167},[33,29685,242],{"class":163},[33,29687,544],{"class":167},[33,29689,547],{"class":50},[33,29691,221],{"class":167},[33,29693,29694],{"class":35,"line":250},[33,29695,92],{"emptyLinePlaceholder":91},[33,29697,29698],{"class":35,"line":266},[33,29699,92],{"emptyLinePlaceholder":91},[33,29701,29702,29704,29706,29708,29710],{"class":35,"line":290},[33,29703,562],{"class":163},[33,29705,565],{"class":46},[33,29707,568],{"class":167},[33,29709,571],{"class":50},[33,29711,574],{"class":167},[33,29713,29714],{"class":35,"line":295},[33,29715,579],{"class":54},[33,29717,29718,29720,29722],{"class":35,"line":300},[33,29719,584],{"class":167},[33,29721,242],{"class":163},[33,29723,589],{"class":167},[33,29725,29726,29728,29730,29732,29734,29736,29738,29740],{"class":35,"line":317},[33,29727,594],{"class":167},[33,29729,242],{"class":163},[33,29731,599],{"class":50},[33,29733,602],{"class":167},[33,29735,507],{"class":50},[33,29737,607],{"class":167},[33,29739,610],{"class":54},[33,29741,371],{"class":167},[33,29743,29744,29746,29748],{"class":35,"line":332},[33,29745,617],{"class":163},[33,29747,620],{"class":163},[33,29749,623],{"class":167},[33,29751,29752,29754,29756,29758,29760,29762,29764],{"class":35,"line":347},[33,29753,628],{"class":167},[33,29755,631],{"class":54},[33,29757,309],{"class":50},[33,29759,274],{"class":54},[33,29761,365],{"class":167},[33,29763,507],{"class":50},[33,29765,221],{"class":167},[33,29767,29768],{"class":35,"line":374},[33,29769,646],{"class":163},[33,29771,29772],{"class":35,"line":397},[33,29773,92],{"emptyLinePlaceholder":91},[33,29775,29776,29778,29780,29782],{"class":35,"line":653},[33,29777,656],{"class":163},[33,29779,659],{"class":167},[33,29781,662],{"class":163},[33,29783,623],{"class":167},[33,29785,29786,29788],{"class":35,"line":667},[33,29787,670],{"class":163},[33,29789,574],{"class":167},[33,29791,29792,29
794,29796,29798],{"class":35,"line":675},[33,29793,678],{"class":163},[33,29795,681],{"class":167},[33,29797,495],{"class":163},[33,29799,686],{"class":167},[33,29801,29802,29804,29806,29808],{"class":35,"line":689},[33,29803,692],{"class":163},[33,29805,695],{"class":167},[33,29807,662],{"class":163},[33,29809,700],{"class":167},[33,29811,29812,29814,29816],{"class":35,"line":703},[33,29813,706],{"class":167},[33,29815,242],{"class":163},[33,29817,711],{"class":167},[33,29819,29820,29822],{"class":35,"line":714},[33,29821,717],{"class":163},[33,29823,720],{"class":167},[33,29825,29826,29828,29830,29832,29834,29836,29838,29840,29842,29844],{"class":35,"line":723},[33,29827,726],{"class":167},[33,29829,242],{"class":163},[33,29831,731],{"class":167},[33,29833,734],{"class":50},[33,29835,737],{"class":167},[33,29837,740],{"class":238},[33,29839,242],{"class":163},[33,29841,745],{"class":167},[33,29843,748],{"class":50},[33,29845,751],{"class":167},[33,29847,29848,29850,29852,29854,29856],{"class":35,"line":754},[33,29849,757],{"class":167},[33,29851,760],{"class":54},[33,29853,763],{"class":167},[33,29855,242],{"class":163},[33,29857,768],{"class":167},[33,29859,29860],{"class":35,"line":771},[33,29861,774],{"class":167},[33,29863,29864,29866,29868],{"class":35,"line":777},[33,29865,780],{"class":163},[33,29867,783],{"class":50},[33,29869,574],{"class":167},[33,29871,29872,29874,29876,29878,29880],{"class":35,"line":788},[33,29873,791],{"class":167},[33,29875,794],{"class":54},[33,29877,309],{"class":50},[33,29879,274],{"class":54},[33,29881,801],{"class":167},[33,29883,29884],{"class":35,"line":804},[33,29885,92],{"emptyLinePlaceholder":91},[33,29887,29888,29890,29892],{"class":35,"line":809},[33,29889,617],{"class":163},[33,29891,620],{"class":163},[33,29893,816],{"class":167},[33,29895,29896,29898,29900],{"class":35,"line":819},[33,29897,628],{"class":167},[33,29899,824],{"class":54},[33,29901,221],{"class":167},[33,29903,29904],{"class":35,"line":829},[33,29905,64
6],{"class":163},[33,29907,29908],{"class":35,"line":834},[33,29909,92],{"emptyLinePlaceholder":91},[33,29911,29912,29914,29916,29918,29920,29922,29924],{"class":35,"line":839},[33,29913,842],{"class":167},[33,29915,242],{"class":163},[33,29917,847],{"class":167},[33,29919,850],{"class":238},[33,29921,242],{"class":163},[33,29923,855],{"class":50},[33,29925,221],{"class":167},[33,29927,29928,29930,29932,29934,29936,29938,29940,29942,29944,29946],{"class":35,"line":860},[33,29929,863],{"class":50},[33,29931,866],{"class":167},[33,29933,869],{"class":238},[33,29935,242],{"class":163},[33,29937,855],{"class":50},[33,29939,365],{"class":167},[33,29941,878],{"class":238},[33,29943,242],{"class":163},[33,29945,855],{"class":50},[33,29947,221],{"class":167},[33,29949,29950,29952,29954,29956,29958,29960,29962],{"class":35,"line":887},[33,29951,890],{"class":167},[33,29953,521],{"class":50},[33,29955,365],{"class":167},[33,29957,897],{"class":238},[33,29959,242],{"class":163},[33,29961,902],{"class":50},[33,29963,221],{"class":167},[33,29965,29966,29968,29970,29972,29974,29976,29978,29980,29982,29984,29986],{"class":35,"line":907},[33,29967,910],{"class":167},[33,29969,913],{"class":54},[33,29971,916],{"class":50},[33,29973,919],{"class":54},[33,29975,309],{"class":50},[33,29977,274],{"class":54},[33,29979,365],{"class":167},[33,29981,928],{"class":50},[33,29983,931],{"class":167},[33,29985,521],{"class":50},[33,29987,221],{"class":167},[14,29989,938,29990,944,29992,949],{},[940,29991,943],{"href":942},[940,29993,948],{"href":947},[424,29995,953],{"id":952},[14,29997,956,29998,960,30000,964],{},[30,29999,959],{},[30,30001,963],{},[23,30003,30004],{"className":126,"code":967,"language":47,"meta":28,"style":28},[30,30005,30006,30010,30014,30018,30022,30026,30032,30038,30048,30052,30064,30068,30072,30094,30116,30138,30142,30150,30154,30174,30186,30196,30200,30208,30228,30242,30246,30250,30254,30262,30266,30270,30274,30278,30286,30294,30298,30302,30306],{"__ignoreMap":28},[33,30
007,30008],{"class":35,"line":36},[33,30009,134],{"class":39},[33,30011,30012],{"class":35,"line":43},[33,30013,139],{"class":54},[33,30015,30016],{"class":35,"line":61},[33,30017,982],{"class":54},[33,30019,30020],{"class":35,"line":73},[33,30021,987],{"class":54},[33,30023,30024],{"class":35,"line":88},[33,30025,139],{"class":54},[33,30027,30028,30030],{"class":35,"line":95},[33,30029,164],{"class":163},[33,30031,184],{"class":167},[33,30033,30034,30036],{"class":35,"line":101},[33,30035,164],{"class":163},[33,30037,1004],{"class":167},[33,30039,30040,30042,30044,30046],{"class":35,"line":171},[33,30041,190],{"class":163},[33,30043,193],{"class":167},[33,30045,164],{"class":163},[33,30047,198],{"class":167},[33,30049,30050],{"class":35,"line":179},[33,30051,92],{"emptyLinePlaceholder":91},[33,30053,30054,30056,30058,30060,30062],{"class":35,"line":187},[33,30055,1023],{"class":50},[33,30057,212],{"class":163},[33,30059,215],{"class":167},[33,30061,1030],{"class":54},[33,30063,221],{"class":167},[33,30065,30066],{"class":35,"line":201},[33,30067,92],{"emptyLinePlaceholder":91},[33,30069,30070],{"class":35,"line":206},[33,30071,92],{"emptyLinePlaceholder":91},[33,30073,30074,30076,30078,30080,30082,30084,30086,30088,30090,30092],{"class":35,"line":224},[33,30075,562],{"class":163},[33,30077,1047],{"class":46},[33,30079,1050],{"class":167},[33,30081,1053],{"class":50},[33,30083,1056],{"class":167},[33,30085,1059],{"class":50},[33,30087,212],{"class":163},[33,30089,1064],{"class":167},[33,30091,1067],{"class":50},[33,30093,1070],{"class":167},[33,30095,30096,30098,30100,30102,30104,30106,30108,30110,30112,30114],{"class":35,"line":229},[33,30097,1075],{"class":50},[33,30099,1078],{"class":167},[33,30101,869],{"class":238},[33,30103,242],{"class":163},[33,30105,855],{"class":50},[33,30107,365],{"class":167},[33,30109,878],{"class":238},[33,30111,242],{"class":163},[33,30113,855],{"class":50},[33,30115,221],{"class":167},[33,30117,30118,30120,30122,30124,30126,30128,301
30,30132,30134,30136],{"class":35,"line":235},[33,30119,1099],{"class":167},[33,30121,242],{"class":163},[33,30123,1104],{"class":50},[33,30125,1107],{"class":163},[33,30127,1110],{"class":163},[33,30129,274],{"class":54},[33,30131,1115],{"class":50},[33,30133,1118],{"class":167},[33,30135,1121],{"class":50},[33,30137,1124],{"class":54},[33,30139,30140],{"class":35,"line":250},[33,30141,92],{"emptyLinePlaceholder":91},[33,30143,30144,30146,30148],{"class":35,"line":266},[33,30145,1133],{"class":167},[33,30147,242],{"class":163},[33,30149,1138],{"class":167},[33,30151,30152],{"class":35,"line":290},[33,30153,1143],{"class":167},[33,30155,30156,30158,30160,30162,30164,30166,30168,30170,30172],{"class":35,"line":295},[33,30157,1148],{"class":238},[33,30159,242],{"class":163},[33,30161,1153],{"class":50},[33,30163,1156],{"class":163},[33,30165,1159],{"class":50},[33,30167,1156],{"class":163},[33,30169,1159],{"class":50},[33,30171,1166],{"class":167},[33,30173,1169],{"class":39},[33,30175,30176,30178,30180,30182,30184],{"class":35,"line":300},[33,30177,1174],{"class":238},[33,30179,242],{"class":163},[33,30181,1179],{"class":50},[33,30183,1182],{"class":167},[33,30185,1185],{"class":39},[33,30187,30188,30190,30192,30194],{"class":35,"line":317},[33,30189,1190],{"class":238},[33,30191,242],{"class":163},[33,30193,1195],{"class":54},[33,30195,247],{"class":167},[33,30197,30198],{"class":35,"line":332},[33,30199,1202],{"class":167},[33,30201,30202,30204,30206],{"class":35,"line":347},[33,30203,1207],{"class":167},[33,30205,242],{"class":163},[33,30207,1212],{"class":167},[33,30209,30210,30212,30214,30216,30218,30220,30222,30224,30226],{"class":35,"line":374},[33,30211,1217],{"class":238},[33,30213,242],{"class":163},[33,30215,274],{"class":54},[33,30217,277],{"class":50},[33,30219,1226],{"class":50},[33,30221,280],{"class":50},[33,30223,283],{"class":50},[33,30225,274],{"class":54},[33,30227,247],{"class":167},[33,30229,30230,30232,30234,30236,30238,30240],{"class":35,"line
":397},[33,30231,1239],{"class":238},[33,30233,242],{"class":163},[33,30235,1244],{"class":54},[33,30237,916],{"class":50},[33,30239,1249],{"class":54},[33,30241,247],{"class":167},[33,30243,30244],{"class":35,"line":653},[33,30245,1202],{"class":167},[33,30247,30248],{"class":35,"line":667},[33,30249,1260],{"class":167},[33,30251,30252],{"class":35,"line":675},[33,30253,92],{"emptyLinePlaceholder":91},[33,30255,30256,30258,30260],{"class":35,"line":689},[33,30257,1269],{"class":167},[33,30259,242],{"class":163},[33,30261,1274],{"class":167},[33,30263,30264],{"class":35,"line":703},[33,30265,1279],{"class":167},[33,30267,30268],{"class":35,"line":714},[33,30269,1284],{"class":167},[33,30271,30272],{"class":35,"line":723},[33,30273,92],{"emptyLinePlaceholder":91},[33,30275,30276],{"class":35,"line":754},[33,30277,1293],{"class":39},[33,30279,30280,30282,30284],{"class":35,"line":771},[33,30281,1298],{"class":167},[33,30283,242],{"class":163},[33,30285,1303],{"class":167},[33,30287,30288,30290,30292],{"class":35,"line":777},[33,30289,1308],{"class":167},[33,30291,1311],{"class":50},[33,30293,221],{"class":167},[33,30295,30296],{"class":35,"line":788},[33,30297,1318],{"class":167},[33,30299,30300],{"class":35,"line":804},[33,30301,1323],{"class":167},[33,30303,30304],{"class":35,"line":809},[33,30305,92],{"emptyLinePlaceholder":91},[33,30307,30308,30310],{"class":35,"line":819},[33,30309,1332],{"class":163},[33,30311,1335],{"class":167},[14,30313,1338,30314,1341,30316,1344,30318,1347,30320,1351,30322,1355],{},[30,30315,261],{},[30,30317,1067],{},[30,30319,1311],{},[30,30321,1350],{},[30,30323,1354],{},[424,30325,1359],{"id":1358},[14,30327,1362,30328,1369],{},[940,30329,1368],{"href":1365,"rel":30330},[1367],[23,30332,30333],{"className":126,"code":1372,"language":47,"meta":28,"style":28},[30,30334,30335,30339,30343,30347,30351,30355,30361,30371,30375,30379,30383,30387,30391,30395,30399,30411,30415,30419,30427,30437,30449,30481,30501,30513,30517,30521,30525,30529,30545
,30551,30569,30575,30579,30583,30587,30593,30599,30603,30607,30639,30647,30653,30669,30677,30699,30705,30719,30729,30739,30743,30747,30773,30781,30785,30789,30797,30803],{"__ignoreMap":28},[33,30336,30337],{"class":35,"line":36},[33,30338,1379],{"class":39},[33,30340,30341],{"class":35,"line":43},[33,30342,139],{"class":54},[33,30344,30345],{"class":35,"line":61},[33,30346,1388],{"class":54},[33,30348,30349],{"class":35,"line":73},[33,30350,1393],{"class":54},[33,30352,30353],{"class":35,"line":88},[33,30354,139],{"class":54},[33,30356,30357,30359],{"class":35,"line":95},[33,30358,164],{"class":163},[33,30360,184],{"class":167},[33,30362,30363,30365,30367,30369],{"class":35,"line":101},[33,30364,190],{"class":163},[33,30366,1410],{"class":167},[33,30368,164],{"class":163},[33,30370,1415],{"class":167},[33,30372,30373],{"class":35,"line":171},[33,30374,1420],{"class":167},[33,30376,30377],{"class":35,"line":179},[33,30378,1425],{"class":167},[33,30380,30381],{"class":35,"line":187},[33,30382,1430],{"class":167},[33,30384,30385],{"class":35,"line":201},[33,30386,1435],{"class":167},[33,30388,30389],{"class":35,"line":206},[33,30390,1440],{"class":167},[33,30392,30393],{"class":35,"line":224},[33,30394,221],{"class":167},[33,30396,30397],{"class":35,"line":229},[33,30398,92],{"emptyLinePlaceholder":91},[33,30400,30401,30403,30405,30407,30409],{"class":35,"line":235},[33,30402,539],{"class":167},[33,30404,242],{"class":163},[33,30406,544],{"class":167},[33,30408,547],{"class":50},[33,30410,221],{"class":167},[33,30412,30413],{"class":35,"line":250},[33,30414,92],{"emptyLinePlaceholder":91},[33,30416,30417],{"class":35,"line":266},[33,30418,1469],{"class":39},[33,30420,30421,30423,30425],{"class":35,"line":290},[33,30422,1474],{"class":167},[33,30424,242],{"class":163},[33,30426,1479],{"class":167},[33,30428,30429,30431,30433,30435],{"class":35,"line":295},[33,30430,1484],{"class":238},[33,30432,242],{"class":163},[33,30434,855],{"class":50},[33,30436,247],{"class":167},
[33,30438,30439,30441,30443,30445,30447],{"class":35,"line":300},[33,30440,1495],{"class":238},[33,30442,242],{"class":163},[33,30444,1500],{"class":167},[33,30446,1503],{"class":50},[33,30448,1506],{"class":167},[33,30450,30451,30453,30455,30457,30459,30461,30463,30465,30467,30469,30471,30473,30475,30477,30479],{"class":35,"line":317},[33,30452,1511],{"class":238},[33,30454,242],{"class":163},[33,30456,1516],{"class":167},[33,30458,1519],{"class":238},[33,30460,242],{"class":163},[33,30462,734],{"class":50},[33,30464,365],{"class":167},[33,30466,1528],{"class":238},[33,30468,242],{"class":163},[33,30470,1533],{"class":50},[33,30472,365],{"class":167},[33,30474,1538],{"class":238},[33,30476,242],{"class":163},[33,30478,1543],{"class":50},[33,30480,1506],{"class":167},[33,30482,30483,30485,30487,30489,30491,30493,30495,30497,30499],{"class":35,"line":332},[33,30484,1550],{"class":238},[33,30486,242],{"class":163},[33,30488,1555],{"class":167},[33,30490,1558],{"class":50},[33,30492,365],{"class":167},[33,30494,1563],{"class":50},[33,30496,365],{"class":167},[33,30498,1568],{"class":50},[33,30500,1571],{"class":167},[33,30502,30503,30505,30507,30509,30511],{"class":35,"line":347},[33,30504,1576],{"class":238},[33,30506,242],{"class":163},[33,30508,1581],{"class":167},[33,30510,1311],{"class":50},[33,30512,1506],{"class":167},[33,30514,30515],{"class":35,"line":374},[33,30516,221],{"class":167},[33,30518,30519],{"class":35,"line":397},[33,30520,92],{"emptyLinePlaceholder":91},[33,30522,30523],{"class":35,"line":653},[33,30524,92],{"emptyLinePlaceholder":91},[33,30526,30527],{"class":35,"line":667},[33,30528,1602],{"class":46},[33,30530,30531,30533,30535,30537,30539,30541,30543],{"class":35,"line":675},[33,30532,562],{"class":163},[33,30534,1609],{"class":46},[33,30536,1612],{"class":167},[33,30538,1053],{"class":50},[33,30540,1617],{"class":167},[33,30542,1620],{"class":50},[33,30544,574],{"class":167},[33,30546,30547,30549],{"class":35,"line":689},[33,30548,1627],{"cla
ss":163},[33,30550,1630],{"class":167},[33,30552,30553,30555,30557,30559,30561,30563,30565,30567],{"class":35,"line":703},[33,30554,1635],{"class":163},[33,30556,1638],{"class":167},[33,30558,1641],{"class":238},[33,30560,242],{"class":163},[33,30562,1646],{"class":50},[33,30564,1649],{"class":167},[33,30566,495],{"class":163},[33,30568,1654],{"class":167},[33,30570,30571,30573],{"class":35,"line":714},[33,30572,1659],{"class":163},[33,30574,1662],{"class":167},[33,30576,30577],{"class":35,"line":723},[33,30578,92],{"emptyLinePlaceholder":91},[33,30580,30581],{"class":35,"line":754},[33,30582,92],{"emptyLinePlaceholder":91},[33,30584,30585],{"class":35,"line":771},[33,30586,1675],{"class":39},[33,30588,30589,30591],{"class":35,"line":777},[33,30590,164],{"class":163},[33,30592,1682],{"class":167},[33,30594,30595,30597],{"class":35,"line":788},[33,30596,164],{"class":163},[33,30598,1689],{"class":167},[33,30600,30601],{"class":35,"line":804},[33,30602,92],{"emptyLinePlaceholder":91},[33,30604,30605],{"class":35,"line":809},[33,30606,92],{"emptyLinePlaceholder":91},[33,30608,30609,30611,30613,30615,30617,30619,30621,30623,30625,30627,30629,30631,30633,30635,30637],{"class":35,"line":819},[33,30610,562],{"class":163},[33,30612,1704],{"class":46},[33,30614,1707],{"class":167},[33,30616,1059],{"class":50},[33,30618,212],{"class":163},[33,30620,1714],{"class":50},[33,30622,1717],{"class":167},[33,30624,1720],{"class":50},[33,30626,212],{"class":163},[33,30628,1725],{"class":50},[33,30630,1728],{"class":167},[33,30632,1720],{"class":50},[33,30634,212],{"class":163},[33,30636,1725],{"class":50},[33,30638,1737],{"class":167},[33,30640,30641,30643,30645],{"class":35,"line":829},[33,30642,1742],{"class":163},[33,30644,1745],{"class":46},[33,30646,1748],{"class":167},[33,30648,30649,30651],{"class":35,"line":834},[33,30650,1753],{"class":46},[33,30652,1756],{"class":167},[33,30654,30655,30657,30659,30661,30663,30665,30667],{"class":35,"line":839},[33,30656,1761],{"class":163},[
33,30658,1764],{"class":46},[33,30660,602],{"class":167},[33,30662,1769],{"class":163},[33,30664,1772],{"class":167},[33,30666,1775],{"class":163},[33,30668,1778],{"class":167},[33,30670,30671,30673,30675],{"class":35,"line":860},[33,30672,1783],{"class":167},[33,30674,242],{"class":163},[33,30676,1788],{"class":167},[33,30678,30679,30681,30683,30685,30687,30689,30691,30693,30695,30697],{"class":35,"line":887},[33,30680,1793],{"class":163},[33,30682,1796],{"class":167},[33,30684,662],{"class":163},[33,30686,1801],{"class":50},[33,30688,602],{"class":167},[33,30690,734],{"class":50},[33,30692,1808],{"class":167},[33,30694,1811],{"class":163},[33,30696,1814],{"class":50},[33,30698,1737],{"class":167},[33,30700,30701,30703],{"class":35,"line":907},[33,30702,1821],{"class":163},[33,30704,574],{"class":167},[33,30706,30707,30709,30711,30713,30715,30717],{"class":35,"line":1826},[33,30708,1829],{"class":163},[33,30710,1832],{"class":167},[33,30712,1769],{"class":163},[33,30714,1772],{"class":167},[33,30716,1775],{"class":163},[33,30718,1841],{"class":167},[33,30720,30721,30723,30725,30727],{"class":35,"line":1844},[33,30722,1847],{"class":163},[33,30724,783],{"class":50},[33,30726,1852],{"class":163},[33,30728,1855],{"class":167},[33,30730,30731,30733,30735,30737],{"class":35,"line":1858},[33,30732,717],{"class":163},[33,30734,1796],{"class":167},[33,30736,1865],{"class":163},[33,30738,1868],{"class":167},[33,30740,30741],{"class":35,"line":1871},[33,30742,1874],{"class":163},[33,30744,30745],{"class":35,"line":1877},[33,30746,1880],{"class":167},[33,30748,30749,30751,30753,30755,30757,30759,30761,30763,30765,30767,30769,30771],{"class":35,"line":1883},[33,30750,1886],{"class":54},[33,30752,309],{"class":50},[33,30754,1796],{"class":54},[33,30756,916],{"class":50},[33,30758,1351],{"class":54},[33,30760,916],{"class":50},[33,30762,1899],{"class":54},[33,30764,309],{"class":50},[33,30766,1904],{"class":54},[33,30768,1907],{"class":50},[33,30770,1910],{"class":54},[33,30772,
247],{"class":167},[33,30774,30775,30777,30779],{"class":35,"line":1915},[33,30776,1918],{"class":167},[33,30778,547],{"class":50},[33,30780,1923],{"class":167},[33,30782,30783],{"class":35,"line":1926},[33,30784,1929],{"class":167},[33,30786,30787],{"class":35,"line":1932},[33,30788,1935],{"class":167},[33,30790,30791,30793,30795],{"class":35,"line":1938},[33,30792,1941],{"class":167},[33,30794,1944],{"class":163},[33,30796,1947],{"class":167},[33,30798,30799,30801],{"class":35,"line":1950},[33,30800,1659],{"class":163},[33,30802,1955],{"class":167},[33,30804,30805,30807],{"class":35,"line":1958},[33,30806,1332],{"class":163},[33,30808,1963],{"class":167},[424,30810,1967],{"id":1966},[14,30812,1970],{},[14,30814,30815],{},[1974,30816,1976],{},[23,30818,30819],{"className":25,"code":1979,"language":27,"meta":28,"style":28},[30,30820,30821,30825],{"__ignoreMap":28},[33,30822,30823],{"class":35,"line":36},[33,30824,1986],{"class":39},[33,30826,30827,30829],{"class":35,"line":43},[33,30828,1991],{"class":46},[33,30830,1994],{"class":50},[23,30832,30834],{"className":30833,"code":1999,"language":2000},[1998],[30,30835,1999],{"__ignoreMap":28},[14,30837,2005,30838,2008,30840,2012,30842,2015,30844,2019],{},[30,30839,47],{},[30,30841,2011],{},[30,30843,47],{},[30,30845,2018],{},[14,30847,30848],{},[1974,30849,2024],{},[23,30851,30852],{"className":2027,"code":2028,"language":2029,"meta":28,"style":28},[30,30853,30854,30858,30862,30866],{"__ignoreMap":28},[33,30855,30856],{"class":35,"line":36},[33,30857,2036],{},[33,30859,30860],{"class":35,"line":43},[33,30861,2041],{},[33,30863,30864],{"class":35,"line":61},[33,30865,2046],{},[33,30867,30868],{"class":35,"line":73},[33,30869,2051],{},[14,30871,2054],{},[14,30873,30874],{},[1974,30875,2059],{},[23,30877,30878],{"className":2062,"code":2063,"language":2064,"meta":28,"style":28},[30,30879,30880,30884,30892,30896,30902,30908,30920,30928,30932,30938,30944,30952,30958,30968,30978,30984,30992,31002,31010,31020,31026,31034,31042
,31050,31060,31068,31074,31082],{"__ignoreMap":28},[33,30881,30882],{"class":35,"line":36},[33,30883,2071],{"class":39},[33,30885,30886,30888,30890],{"class":35,"line":43},[33,30887,1118],{"class":2076},[33,30889,2079],{"class":167},[33,30891,2082],{"class":54},[33,30893,30894],{"class":35,"line":61},[33,30895,92],{"emptyLinePlaceholder":91},[33,30897,30898,30900],{"class":35,"line":73},[33,30899,2091],{"class":50},[33,30901,574],{"class":167},[33,30903,30904,30906],{"class":35,"line":88},[33,30905,2098],{"class":2076},[33,30907,574],{"class":167},[33,30909,30910,30912,30914,30916,30918],{"class":35,"line":95},[33,30911,2105],{"class":167},[33,30913,2108],{"class":2076},[33,30915,2079],{"class":167},[33,30917,2113],{"class":54},[33,30919,2116],{"class":39},[33,30921,30922,30924,30926],{"class":35,"line":101},[33,30923,2121],{"class":2076},[33,30925,2124],{"class":167},[33,30927,2127],{"class":39},[33,30929,30930],{"class":35,"line":171},[33,30931,92],{"emptyLinePlaceholder":91},[33,30933,30934,30936],{"class":35,"line":179},[33,30935,2136],{"class":2076},[33,30937,574],{"class":167},[33,30939,30940,30942],{"class":35,"line":187},[33,30941,2143],{"class":2076},[33,30943,574],{"class":167},[33,30945,30946,30948,30950],{"class":35,"line":201},[33,30947,2150],{"class":2076},[33,30949,2079],{"class":167},[33,30951,2155],{"class":54},[33,30953,30954,30956],{"class":35,"line":206},[33,30955,2160],{"class":2076},[33,30957,574],{"class":167},[33,30959,30960,30962,30964,30966],{"class":35,"line":224},[33,30961,2167],{"class":167},[33,30963,2170],{"class":2076},[33,30965,2079],{"class":167},[33,30967,2175],{"class":54},[33,30969,30970,30972,30974,30976],{"class":35,"line":229},[33,30971,2167],{"class":167},[33,30973,2170],{"class":2076},[33,30975,2079],{"class":167},[33,30977,2186],{"class":54},[33,30979,30980,30982],{"class":35,"line":235},[33,30981,2191],{"class":2076},[33,30983,574],{"class":167},[33,30985,30986,30988,30990],{"class":35,"line":250},[33,30987,2198],{"class":
2076},[33,30989,2079],{"class":167},[33,30991,2203],{"class":54},[33,30993,30994,30996,30998,31000],{"class":35,"line":266},[33,30995,2167],{"class":167},[33,30997,1118],{"class":2076},[33,30999,2079],{"class":167},[33,31001,2214],{"class":54},[33,31003,31004,31006,31008],{"class":35,"line":290},[33,31005,2219],{"class":2076},[33,31007,2079],{"class":167},[33,31009,2224],{"class":54},[33,31011,31012,31014,31016,31018],{"class":35,"line":295},[33,31013,2167],{"class":167},[33,31015,1118],{"class":2076},[33,31017,2079],{"class":167},[33,31019,2235],{"class":54},[33,31021,31022,31024],{"class":35,"line":300},[33,31023,2240],{"class":2076},[33,31025,574],{"class":167},[33,31027,31028,31030,31032],{"class":35,"line":317},[33,31029,2247],{"class":2076},[33,31031,2079],{"class":167},[33,31033,2252],{"class":54},[33,31035,31036,31038,31040],{"class":35,"line":332},[33,31037,2257],{"class":2076},[33,31039,2079],{"class":167},[33,31041,2262],{"class":54},[33,31043,31044,31046,31048],{"class":35,"line":347},[33,31045,2219],{"class":2076},[33,31047,2079],{"class":167},[33,31049,2271],{"class":54},[33,31051,31052,31054,31056,31058],{"class":35,"line":374},[33,31053,2167],{"class":167},[33,31055,1118],{"class":2076},[33,31057,2079],{"class":167},[33,31059,2282],{"class":54},[33,31061,31062,31064,31066],{"class":35,"line":397},[33,31063,2287],{"class":2076},[33,31065,2079],{"class":167},[33,31067,2292],{"class":54},[33,31069,31070,31072],{"class":35,"line":653},[33,31071,2191],{"class":2076},[33,31073,574],{"class":167},[33,31075,31076,31078,31080],{"class":35,"line":667},[33,31077,2303],{"class":2076},[33,31079,2079],{"class":167},[33,31081,2308],{"class":54},[33,31083,31084,31086,31088],{"class":35,"line":675},[33,31085,2313],{"class":2076},[33,31087,2079],{"class":167},[33,31089,2318],{"class":54},[14,31091,31092],{},[1974,31093,31094,2326],{},[30,31095,2325],{},[23,31097,31098],{"className":126,"code":2329,"language":47,"meta":28,"style":28},[30,31099,31100,31104,31108,31112,3
1116,31120,31126,31132,31138,31142,31154,31158,31162,31174,31182,31188,31194,31202,31210,31218,31222,31226,31234,31238,31250,31258,31266,31270],{"__ignoreMap":28},[33,31101,31102],{"class":35,"line":36},[33,31103,2336],{"class":39},[33,31105,31106],{"class":35,"line":43},[33,31107,139],{"class":54},[33,31109,31110],{"class":35,"line":61},[33,31111,2345],{"class":54},[33,31113,31114],{"class":35,"line":73},[33,31115,2350],{"class":54},[33,31117,31118],{"class":35,"line":88},[33,31119,139],{"class":54},[33,31121,31122,31124],{"class":35,"line":95},[33,31123,164],{"class":163},[33,31125,1689],{"class":167},[33,31127,31128,31130],{"class":35,"line":101},[33,31129,164],{"class":163},[33,31131,184],{"class":167},[33,31133,31134,31136],{"class":35,"line":171},[33,31135,164],{"class":163},[33,31137,2373],{"class":167},[33,31139,31140],{"class":35,"line":179},[33,31141,92],{"emptyLinePlaceholder":91},[33,31143,31144,31146,31148,31150,31152],{"class":35,"line":187},[33,31145,539],{"class":167},[33,31147,242],{"class":163},[33,31149,544],{"class":167},[33,31151,547],{"class":50},[33,31153,221],{"class":167},[33,31155,31156],{"class":35,"line":201},[33,31157,92],{"emptyLinePlaceholder":91},[33,31159,31160],{"class":35,"line":206},[33,31161,92],{"emptyLinePlaceholder":91},[33,31163,31164,31166,31168,31170,31172],{"class":35,"line":224},[33,31165,562],{"class":163},[33,31167,2404],{"class":46},[33,31169,568],{"class":167},[33,31171,571],{"class":50},[33,31173,574],{"class":167},[33,31175,31176,31178,31180],{"class":35,"line":229},[33,31177,910],{"class":167},[33,31179,2417],{"class":54},[33,31181,221],{"class":167},[33,31183,31184,31186],{"class":35,"line":235},[33,31185,2424],{"class":163},[33,31187,574],{"class":167},[33,31189,31190,31192],{"class":35,"line":250},[33,31191,2431],{"class":167},[33,31193,2434],{"class":39},[33,31195,31196,31198,31200],{"class":35,"line":266},[33,31197,2439],{"class":167},[33,31199,2442],{"class":54},[33,31201,221],{"class":167},[33,31203,31204,31
206,31208],{"class":35,"line":290},[33,31205,2449],{"class":163},[33,31207,783],{"class":50},[33,31209,574],{"class":167},[33,31211,31212,31214,31216],{"class":35,"line":295},[33,31213,2458],{"class":167},[33,31215,2461],{"class":54},[33,31217,221],{"class":167},[33,31219,31220],{"class":35,"line":300},[33,31221,92],{"emptyLinePlaceholder":91},[33,31223,31224],{"class":35,"line":317},[33,31225,92],{"emptyLinePlaceholder":91},[33,31227,31228,31230,31232],{"class":35,"line":332},[33,31229,2476],{"class":167},[33,31231,2479],{"class":54},[33,31233,2482],{"class":167},[33,31235,31236],{"class":35,"line":347},[33,31237,92],{"emptyLinePlaceholder":91},[33,31239,31240,31242,31244,31246,31248],{"class":35,"line":374},[33,31241,2491],{"class":163},[33,31243,2494],{"class":50},[33,31245,2497],{"class":163},[33,31247,2500],{"class":54},[33,31249,574],{"class":167},[33,31251,31252,31254,31256],{"class":35,"line":397},[33,31253,910],{"class":167},[33,31255,2509],{"class":54},[33,31257,221],{"class":167},[33,31259,31260,31262,31264],{"class":35,"line":653},[33,31261,2516],{"class":163},[33,31263,2519],{"class":50},[33,31265,574],{"class":167},[33,31267,31268],{"class":35,"line":667},[33,31269,2526],{"class":167},[33,31271,31272,31274,31276],{"class":35,"line":675},[33,31273,2531],{"class":167},[33,31275,1543],{"class":50},[33,31277,221],{"class":167},[2537,31279],{},[2540,31281,2547,31282,2547,31284,2547,31286,2547,2547,31298,2547,31300,2547,31302,2547,2547,31304,2547,2547,31306,2547,31308,2547,31310,2547,2547,31312,2547,2547,31314,2547,31316,2547,31318,2547,31320,2547,2547,31322,2547,2547,31324,2547,31326,2547,31328,2547,2547,31330,2547,2547,31332,2547,31334,2547,31336,2547,2547,31338,2547,31340,2547,31342,2547,2547,31344,2547,2547,31346,2547,31348],{"viewBox":2542,"role":2543,"ariaLabel":2544,"xmlns":2545,"style":2546},[2549,31283,2551],{},[2553,31285,2555],{},[2557,31287,2559,31288,2559,31294,2547],{},[2561,31289,2564,31290,2564,31292,2559],{"id":2563,"x1":748,"y1":748,"x2":73
4,"y2":748},[2566,31291],{"offset":748,"style":2568},[2566,31293],{"offset":734,"style":2571},[2573,31295,2564,31296,2559],{"id":2575,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},[2580,31297],{"d":2582,"fill":2583},[2585,31299],{"x":2587,"y":2588,"width":2589,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,31301,2601],{"x":2597,"y":2598,"fill":2599,"style":2600},[2000,31303,2606],{"x":2597,"y":2604,"fill":2583,"style":2605},[35,31305],{"x1":2609,"y1":2610,"x2":2611,"y2":2610,"stroke":2583,"markerEnd":2612,"style":2594},[2585,31307],{"x":2611,"y":2588,"width":2589,"height":2590,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,31309,2619],{"x":2618,"y":2598,"fill":2599,"style":2600},[2000,31311,2622],{"x":2618,"y":2604,"fill":2583,"style":2605},[35,31313],{"x1":2625,"y1":2610,"x2":2626,"y2":2610,"stroke":2583,"markerEnd":2612,"style":2594},[2585,31315],{"x":2626,"y":2629,"width":2588,"height":2630,"rx":2591,"fill":2631,"stroke":2593,"style":2594},[2000,31317,2636],{"x":2634,"y":2635,"fill":2599,"style":2600},[2000,31319,2640],{"x":2634,"y":2639,"fill":2599,"style":2605},[2000,31321,2644],{"x":2634,"y":2643,"fill":2599,"style":2605},[35,31323],{"x1":2647,"y1":2648,"x2":2649,"y2":2650,"stroke":2583,"markerEnd":2612,"style":2594},[2585,31325],{"x":2649,"y":2653,"width":2609,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,31327,2657],{"x":2656,"y":2630,"fill":2599,"style":2600},[2000,31329,2661],{"x":2656,"y":2660,"fill":2583,"style":2605},[35,31331],{"x1":2647,"y1":2664,"x2":2649,"y2":2665,"stroke":2583,"markerEnd":2612,"style":2594},[2585,31333],{"x":2649,"y":2611,"width":2609,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,31335,2670],{"x":2656,"y":2665,"fill":2599,"style":2600},[2000,31337,2674],{"x":2656,"y":2673,"fill":2583,"style":2605},[2585,31339],{"x":2677,"y":2678,"width":2679,"height":2680,"rx":2681,"fill":2615,"stroke":2593,"style":2594},[2
000,31341,2686],{"x":2634,"y":2684,"fill":2599,"style":2685},[2000,31343,2690],{"x":2634,"y":2689,"fill":2583,"style":2605},[35,31345],{"x1":2634,"y1":2693,"x2":2634,"y2":2678,"stroke":2583,"markerEnd":2612,"style":2694},[35,31347],{"x1":2618,"y1":2697,"x2":2618,"y2":2698,"stroke":2593,"style":2594},[2000,31349,2703],{"x":2701,"y":2702,"fill":2583,"style":2605},[2537,31351],{},[18,31353,2709],{"id":2708},[424,31355,2713],{"id":2712},[14,31357,2716],{},[23,31359,31360],{"className":126,"code":2719,"language":47,"meta":28,"style":28},[30,31361,31362,31366,31370,31374,31378,31382,31388,31394,31400,31410,31414,31426,31438,31442,31446,31458,31464,31492,31500,31504,31510,31518,31524,31528,31532,31544,31550,31556,31564,31568,31572,31576,31588,31596,31604,31612,31618,31624,31630],{"__ignoreMap":28},[33,31363,31364],{"class":35,"line":36},[33,31365,134],{"class":39},[33,31367,31368],{"class":35,"line":43},[33,31369,139],{"class":54},[33,31371,31372],{"class":35,"line":61},[33,31373,2734],{"class":54},[33,31375,31376],{"class":35,"line":73},[33,31377,2739],{"class":54},[33,31379,31380],{"class":35,"line":88},[33,31381,139],{"class":54},[33,31383,31384,31386],{"class":35,"line":95},[33,31385,164],{"class":163},[33,31387,176],{"class":167},[33,31389,31390,31392],{"class":35,"line":101},[33,31391,164],{"class":163},[33,31393,168],{"class":167},[33,31395,31396,31398],{"class":35,"line":171},[33,31397,164],{"class":163},[33,31399,184],{"class":167},[33,31401,31402,31404,31406,31408],{"class":35,"line":179},[33,31403,190],{"class":163},[33,31405,193],{"class":167},[33,31407,164],{"class":163},[33,31409,198],{"class":167},[33,31411,31412],{"class":35,"line":187},[33,31413,92],{"emptyLinePlaceholder":91},[33,31415,31416,31418,31420,31422,31424],{"class":35,"line":201},[33,31417,2780],{"class":50},[33,31419,212],{"class":163},[33,31421,215],{"class":167},[33,31423,2787],{"class":54},[33,31425,221],{"class":167},[33,31427,31428,31430,31432,31434,31436],{"class":35,"line":206},[33,31429
,539],{"class":167},[33,31431,242],{"class":163},[33,31433,544],{"class":167},[33,31435,547],{"class":50},[33,31437,221],{"class":167},[33,31439,31440],{"class":35,"line":224},[33,31441,92],{"emptyLinePlaceholder":91},[33,31443,31444],{"class":35,"line":229},[33,31445,92],{"emptyLinePlaceholder":91},[33,31447,31448,31450,31452,31454,31456],{"class":35,"line":235},[33,31449,562],{"class":163},[33,31451,2816],{"class":46},[33,31453,568],{"class":167},[33,31455,2821],{"class":50},[33,31457,574],{"class":167},[33,31459,31460,31462],{"class":35,"line":250},[33,31461,2424],{"class":163},[33,31463,574],{"class":167},[33,31465,31466,31468,31470,31472,31474,31476,31478,31480,31482,31484,31486,31488,31490],{"class":35,"line":266},[33,31467,2834],{"class":167},[33,31469,242],{"class":163},[33,31471,2839],{"class":167},[33,31473,2780],{"class":50},[33,31475,2844],{"class":167},[33,31477,2847],{"class":50},[33,31479,2850],{"class":163},[33,31481,2853],{"class":167},[33,31483,2856],{"class":50},[33,31485,2850],{"class":163},[33,31487,2853],{"class":167},[33,31489,2863],{"class":50},[33,31491,221],{"class":167},[33,31493,31494,31496,31498],{"class":35,"line":290},[33,31495,2870],{"class":167},[33,31497,1053],{"class":50},[33,31499,2875],{"class":167},[33,31501,31502],{"class":35,"line":295},[33,31503,2880],{"class":167},[33,31505,31506,31508],{"class":35,"line":300},[33,31507,1659],{"class":163},[33,31509,2887],{"class":50},[33,31511,31512,31514,31516],{"class":35,"line":317},[33,31513,2449],{"class":163},[33,31515,2894],{"class":50},[33,31517,574],{"class":167},[33,31519,31520,31522],{"class":35,"line":332},[33,31521,1659],{"class":163},[33,31523,2903],{"class":50},[33,31525,31526],{"class":35,"line":347},[33,31527,92],{"emptyLinePlaceholder":91},[33,31529,31530],{"class":35,"line":374},[33,31531,92],{"emptyLinePlaceholder":91},[33,31533,31534,31536,31538,31540,31542],{"class":35,"line":397},[33,31535,562],{"class":163},[33,31537,2918],{"class":46},[33,31539,568],{"class":167},[3
3,31541,571],{"class":50},[33,31543,574],{"class":167},[33,31545,31546,31548],{"class":35,"line":653},[33,31547,2424],{"class":163},[33,31549,574],{"class":167},[33,31551,31552,31554],{"class":35,"line":667},[33,31553,2935],{"class":50},[33,31555,2938],{"class":167},[33,31557,31558,31560,31562],{"class":35,"line":675},[33,31559,2449],{"class":163},[33,31561,2945],{"class":50},[33,31563,574],{"class":167},[33,31565,31566],{"class":35,"line":689},[33,31567,2952],{"class":163},[33,31569,31570],{"class":35,"line":703},[33,31571,92],{"emptyLinePlaceholder":91},[33,31573,31574],{"class":35,"line":714},[33,31575,92],{"emptyLinePlaceholder":91},[33,31577,31578,31580,31582,31584,31586],{"class":35,"line":723},[33,31579,2491],{"class":163},[33,31581,2494],{"class":50},[33,31583,2497],{"class":163},[33,31585,2500],{"class":54},[33,31587,574],{"class":167},[33,31589,31590,31592,31594],{"class":35,"line":754},[33,31591,617],{"class":163},[33,31593,620],{"class":163},[33,31595,2981],{"class":167},[33,31597,31598,31600,31602],{"class":35,"line":771},[33,31599,628],{"class":167},[33,31601,2988],{"class":54},[33,31603,221],{"class":167},[33,31605,31606,31608,31610],{"class":35,"line":777},[33,31607,2995],{"class":167},[33,31609,748],{"class":50},[33,31611,221],{"class":167},[33,31613,31614,31616],{"class":35,"line":788},[33,31615,2424],{"class":163},[33,31617,574],{"class":167},[33,31619,31620,31622],{"class":35,"line":804},[33,31621,3010],{"class":167},[33,31623,3013],{"class":39},[33,31625,31626,31628],{"class":35,"line":809},[33,31627,3018],{"class":163},[33,31629,574],{"class":167},[33,31631,31632],{"class":35,"line":819},[33,31633,3025],{"class":167},[14,31635,3028,31636,3032,31638,3035],{},[30,31637,3031],{},[30,31639,2856],{},[424,31641,3039],{"id":3038},[23,31643,31644],{"className":126,"code":3042,"language":47,"meta":28,"style":28},[30,31645,31646,31650,31654,31658,31662,31666,31672,31678,31684,31690,31700,31710,31714,31730,31748,31764,31780,31796,31812,31816,31820,31840,3
1848,31860,31872,31884,31888,31894,31912,31916,31928,31932,31940,31946,31958,31962,31966,31982,31992,31996,32008,32016,32022,32030,32046,32056,32060,32066,32080,32084,32092,32098],{"__ignoreMap":28},[33,31647,31648],{"class":35,"line":36},[33,31649,3049],{"class":39},[33,31651,31652],{"class":35,"line":43},[33,31653,139],{"class":54},[33,31655,31656],{"class":35,"line":61},[33,31657,3058],{"class":54},[33,31659,31660],{"class":35,"line":73},[33,31661,3063],{"class":54},[33,31663,31664],{"class":35,"line":88},[33,31665,139],{"class":54},[33,31667,31668,31670],{"class":35,"line":95},[33,31669,164],{"class":163},[33,31671,3074],{"class":167},[33,31673,31674,31676],{"class":35,"line":101},[33,31675,164],{"class":163},[33,31677,3081],{"class":167},[33,31679,31680,31682],{"class":35,"line":171},[33,31681,164],{"class":163},[33,31683,1630],{"class":167},[33,31685,31686,31688],{"class":35,"line":179},[33,31687,164],{"class":163},[33,31689,176],{"class":167},[33,31691,31692,31694,31696,31698],{"class":35,"line":187},[33,31693,190],{"class":163},[33,31695,3100],{"class":167},[33,31697,164],{"class":163},[33,31699,3105],{"class":167},[33,31701,31702,31704,31706,31708],{"class":35,"line":201},[33,31703,190],{"class":163},[33,31705,193],{"class":167},[33,31707,164],{"class":163},[33,31709,198],{"class":167},[33,31711,31712],{"class":35,"line":206},[33,31713,92],{"emptyLinePlaceholder":91},[33,31715,31716,31718,31720,31722,31724,31726,31728],{"class":35,"line":224},[33,31717,3124],{"class":50},[33,31719,212],{"class":163},[33,31721,3129],{"class":167},[33,31723,3132],{"class":54},[33,31725,365],{"class":167},[33,31727,3137],{"class":54},[33,31729,221],{"class":167},[33,31731,31732,31734,31736,31738,31740,31742,31744,31746],{"class":35,"line":229},[33,31733,3144],{"class":50},[33,31735,212],{"class":163},[33,31737,3149],{"class":50},[33,31739,3152],{"class":167},[33,31741,3155],{"class":54},[33,31743,365],{"class":167},[33,31745,3160],{"class":54},[33,31747,371],{"class":167},[33,
31749,31750,31752,31754,31756,31758,31760,31762],{"class":35,"line":235},[33,31751,3167],{"class":50},[33,31753,212],{"class":163},[33,31755,3129],{"class":167},[33,31757,3174],{"class":54},[33,31759,365],{"class":167},[33,31761,3179],{"class":54},[33,31763,221],{"class":167},[33,31765,31766,31768,31770,31772,31774,31776,31778],{"class":35,"line":250},[33,31767,3186],{"class":50},[33,31769,212],{"class":163},[33,31771,3129],{"class":167},[33,31773,3193],{"class":54},[33,31775,365],{"class":167},[33,31777,3198],{"class":54},[33,31779,221],{"class":167},[33,31781,31782,31784,31786,31788,31790,31792,31794],{"class":35,"line":266},[33,31783,3205],{"class":50},[33,31785,212],{"class":163},[33,31787,3129],{"class":167},[33,31789,3212],{"class":54},[33,31791,365],{"class":167},[33,31793,3217],{"class":54},[33,31795,221],{"class":167},[33,31797,31798,31800,31802,31804,31806,31808,31810],{"class":35,"line":290},[33,31799,3224],{"class":50},[33,31801,212],{"class":163},[33,31803,3129],{"class":167},[33,31805,3231],{"class":54},[33,31807,365],{"class":167},[33,31809,3198],{"class":54},[33,31811,221],{"class":167},[33,31813,31814],{"class":35,"line":295},[33,31815,92],{"emptyLinePlaceholder":91},[33,31817,31818],{"class":35,"line":300},[33,31819,92],{"emptyLinePlaceholder":91},[33,31821,31822,31824,31826,31828,31830,31832,31834,31836,31838],{"class":35,"line":317},[33,31823,562],{"class":163},[33,31825,3252],{"class":46},[33,31827,3255],{"class":167},[33,31829,1053],{"class":50},[33,31831,3260],{"class":167},[33,31833,1053],{"class":50},[33,31835,1617],{"class":167},[33,31837,571],{"class":50},[33,31839,574],{"class":167},[33,31841,31842,31844,31846],{"class":35,"line":332},[33,31843,3273],{"class":167},[33,31845,242],{"class":163},[33,31847,3278],{"class":167},[33,31849,31850,31852,31854,31856,31858],{"class":35,"line":347},[33,31851,3283],{"class":167},[33,31853,3286],{"class":54},[33,31855,763],{"class":167},[33,31857,242],{"class":163},[33,31859,3293],{"class":167},[33,3186
1,31862,31864,31866,31868,31870],{"class":35,"line":374},[33,31863,3283],{"class":167},[33,31865,3300],{"class":54},[33,31867,763],{"class":167},[33,31869,242],{"class":163},[33,31871,3307],{"class":50},[33,31873,31874,31876,31878,31880,31882],{"class":35,"line":397},[33,31875,3283],{"class":167},[33,31877,3314],{"class":54},[33,31879,763],{"class":167},[33,31881,242],{"class":163},[33,31883,3321],{"class":50},[33,31885,31886],{"class":35,"line":653},[33,31887,3326],{"class":167},[33,31889,31890,31892],{"class":35,"line":667},[33,31891,2424],{"class":163},[33,31893,574],{"class":167},[33,31895,31896,31898,31900,31902,31904,31906,31908,31910],{"class":35,"line":675},[33,31897,2191],{"class":163},[33,31899,3339],{"class":167},[33,31901,3124],{"class":50},[33,31903,365],{"class":167},[33,31905,3144],{"class":50},[33,31907,1649],{"class":167},[33,31909,495],{"class":163},[33,31911,3352],{"class":167},[33,31913,31914],{"class":35,"line":689},[33,31915,3357],{"class":167},[33,31917,31918,31920,31922,31924,31926],{"class":35,"line":703},[33,31919,3362],{"class":167},[33,31921,3167],{"class":50},[33,31923,365],{"class":167},[33,31925,3186],{"class":50},[33,31927,221],{"class":167},[33,31929,31930],{"class":35,"line":714},[33,31931,3375],{"class":167},[33,31933,31934,31936,31938],{"class":35,"line":723},[33,31935,2449],{"class":163},[33,31937,783],{"class":50},[33,31939,574],{"class":167},[33,31941,31942,31944],{"class":35,"line":754},[33,31943,3388],{"class":163},[33,31945,184],{"class":167},[33,31947,31948,31950,31952,31954,31956],{"class":35,"line":771},[33,31949,3395],{"class":167},[33,31951,547],{"class":50},[33,31953,3400],{"class":167},[33,31955,3403],{"class":54},[33,31957,221],{"class":167},[33,31959,31960],{"class":35,"line":777},[33,31961,92],{"emptyLinePlaceholder":91},[33,31963,31964],{"class":35,"line":788},[33,31965,92],{"emptyLinePlaceholder":91},[33,31967,31968,31970,31972,31974,31976,31978,31980],{"class":35,"line":804},[33,31969,562],{"class":163},[33,3197
1,3420],{"class":46},[33,31973,3423],{"class":167},[33,31975,1053],{"class":50},[33,31977,1617],{"class":167},[33,31979,571],{"class":50},[33,31981,574],{"class":167},[33,31983,31984,31986,31988,31990],{"class":35,"line":809},[33,31985,617],{"class":163},[33,31987,620],{"class":163},[33,31989,3440],{"class":50},[33,31991,574],{"class":167},[33,31993,31994],{"class":35,"line":819},[33,31995,646],{"class":163},[33,31997,31998,32000,32002,32004,32006],{"class":35,"line":829},[33,31999,3451],{"class":167},[33,32001,242],{"class":163},[33,32003,3456],{"class":167},[33,32005,3459],{"class":54},[33,32007,3462],{"class":167},[33,32009,32010,32012,32014],{"class":35,"line":834},[33,32011,3467],{"class":167},[33,32013,242],{"class":163},[33,32015,3472],{"class":167},[33,32017,32018,32020],{"class":35,"line":839},[33,32019,3477],{"class":50},[33,32021,247],{"class":167},[33,32023,32024,32026,32028],{"class":35,"line":860},[33,32025,3484],{"class":238},[33,32027,242],{"class":163},[33,32029,3489],{"class":167},[33,32031,32032,32034,32036,32038,32040,32042,32044],{"class":35,"line":887},[33,32033,3494],{"class":238},[33,32035,242],{"class":163},[33,32037,1115],{"class":167},[33,32039,3501],{"class":54},[33,32041,2079],{"class":167},[33,32043,3506],{"class":54},[33,32045,3509],{"class":167},[33,32047,32048,32050,32052,32054],{"class":35,"line":907},[33,32049,3514],{"class":238},[33,32051,242],{"class":163},[33,32053,3519],{"class":54},[33,32055,247],{"class":167},[33,32057,32058],{"class":35,"line":1826},[33,32059,1202],{"class":167},[33,32061,32062,32064],{"class":35,"line":1844},[33,32063,2424],{"class":163},[33,32065,574],{"class":167},[33,32067,32068,32070,32072,32074,32076,32078],{"class":35,"line":1858},[33,32069,2191],{"class":163},[33,32071,3538],{"class":167},[33,32073,1641],{"class":238},[33,32075,242],{"class":163},[33,32077,3545],{"class":50},[33,32079,1737],{"class":167},[33,32081,32082],{"class":35,"line":1871},[33,32083,3552],{"class":163},[33,32085,32086,32088,320
90],{"class":35,"line":1877},[33,32087,2449],{"class":163},[33,32089,783],{"class":50},[33,32091,574],{"class":167},[33,32093,32094,32096],{"class":35,"line":1883},[33,32095,3388],{"class":163},[33,32097,184],{"class":167},[33,32099,32100,32102,32104,32106,32108],{"class":35,"line":1915},[33,32101,3395],{"class":167},[33,32103,547],{"class":50},[33,32105,3400],{"class":167},[33,32107,3577],{"class":54},[33,32109,221],{"class":167},[14,32111,3582,32112,3586],{},[30,32113,3585],{},[424,32115,3590],{"id":3589},[14,32117,3593],{},[23,32119,32120],{"className":126,"code":3596,"language":47,"meta":28,"style":28},[30,32121,32122,32126,32130,32134,32138,32142,32148,32154,32164,32168,32180,32184,32188,32196,32208,32212,32216,32220,32224,32228,32234,32238,32242,32254,32262,32266,32272,32276,32280,32292,32300,32308,32316,32322,32326,32330,32342,32346,32350,32362,32372,32380,32388,32392,32398,32406,32410,32414],{"__ignoreMap":28},[33,32123,32124],{"class":35,"line":36},[33,32125,134],{"class":39},[33,32127,32128],{"class":35,"line":43},[33,32129,139],{"class":54},[33,32131,32132],{"class":35,"line":61},[33,32133,3611],{"class":54},[33,32135,32136],{"class":35,"line":73},[33,32137,3616],{"class":54},[33,32139,32140],{"class":35,"line":88},[33,32141,139],{"class":54},[33,32143,32144,32146],{"class":35,"line":95},[33,32145,164],{"class":163},[33,32147,3627],{"class":167},[33,32149,32150,32152],{"class":35,"line":101},[33,32151,164],{"class":163},[33,32153,3634],{"class":167},[33,32155,32156,32158,32160,32162],{"class":35,"line":171},[33,32157,190],{"class":163},[33,32159,193],{"class":167},[33,32161,164],{"class":163},[33,32163,198],{"class":167},[33,32165,32166],{"class":35,"line":179},[33,32167,92],{"emptyLinePlaceholder":91},[33,32169,32170,32172,32174,32176,32178],{"class":35,"line":187},[33,32171,3653],{"class":50},[33,32173,212],{"class":163},[33,32175,215],{"class":167},[33,32177,3660],{"class":54},[33,32179,221],{"class":167},[33,32181,32182],{"class":35,"line":201},[33,32
183,92],{"emptyLinePlaceholder":91},[33,32185,32186],{"class":35,"line":206},[33,32187,92],{"emptyLinePlaceholder":91},[33,32189,32190,32192,32194],{"class":35,"line":224},[33,32191,562],{"class":163},[33,32193,3677],{"class":46},[33,32195,3680],{"class":167},[33,32197,32198,32200,32202,32204,32206],{"class":35,"line":229},[33,32199,3685],{"class":167},[33,32201,242],{"class":163},[33,32203,3690],{"class":167},[33,32205,3653],{"class":50},[33,32207,221],{"class":167},[33,32209,32210],{"class":35,"line":235},[33,32211,3699],{"class":167},[33,32213,32214],{"class":35,"line":250},[33,32215,3704],{"class":54},[33,32217,32218],{"class":35,"line":266},[33,32219,3709],{"class":54},[33,32221,32222],{"class":35,"line":290},[33,32223,1202],{"class":167},[33,32225,32226],{"class":35,"line":295},[33,32227,3718],{"class":167},[33,32229,32230,32232],{"class":35,"line":300},[33,32231,1332],{"class":163},[33,32233,3725],{"class":167},[33,32235,32236],{"class":35,"line":317},[33,32237,92],{"emptyLinePlaceholder":91},[33,32239,32240],{"class":35,"line":332},[33,32241,92],{"emptyLinePlaceholder":91},[33,32243,32244,32246,32248,32250,32252],{"class":35,"line":347},[33,32245,562],{"class":163},[33,32247,3740],{"class":46},[33,32249,3743],{"class":167},[33,32251,1053],{"class":50},[33,32253,574],{"class":167},[33,32255,32256,32258,32260],{"class":35,"line":374},[33,32257,3752],{"class":167},[33,32259,242],{"class":163},[33,32261,3757],{"class":167},[33,32263,32264],{"class":35,"line":397},[33,32265,3762],{"class":167},[33,32267,32268,32270],{"class":35,"line":653},[33,32269,1332],{"class":163},[33,32271,3769],{"class":167},[33,32273,32274],{"class":35,"line":667},[33,32275,92],{"emptyLinePlaceholder":91},[33,32277,32278],{"class":35,"line":675},[33,32279,92],{"emptyLinePlaceholder":91},[33,32281,32282,32284,32286,32288,32290],{"class":35,"line":689},[33,32283,562],{"class":163},[33,32285,3784],{"class":46},[33,32287,3743],{"class":167},[33,32289,2821],{"class":50},[33,32291,574],{"class"
:167},[33,32293,32294,32296,32298],{"class":35,"line":703},[33,32295,3795],{"class":167},[33,32297,242],{"class":163},[33,32299,3800],{"class":167},[33,32301,32302,32304,32306],{"class":35,"line":714},[33,32303,3685],{"class":167},[33,32305,242],{"class":163},[33,32307,3809],{"class":167},[33,32309,32310,32312,32314],{"class":35,"line":723},[33,32311,3814],{"class":167},[33,32313,242],{"class":163},[33,32315,3819],{"class":167},[33,32317,32318,32320],{"class":35,"line":754},[33,32319,3824],{"class":54},[33,32321,3827],{"class":167},[33,32323,32324],{"class":35,"line":771},[33,32325,3832],{"class":167},[33,32327,32328],{"class":35,"line":777},[33,32329,3837],{"class":167},[33,32331,32332,32334,32336,32338,32340],{"class":35,"line":788},[33,32333,1332],{"class":163},[33,32335,3844],{"class":167},[33,32337,3847],{"class":163},[33,32339,620],{"class":163},[33,32341,3852],{"class":50},[33,32343,32344],{"class":35,"line":804},[33,32345,92],{"emptyLinePlaceholder":91},[33,32347,32348],{"class":35,"line":809},[33,32349,92],{"emptyLinePlaceholder":91},[33,32351,32352,32354,32356,32358,32360],{"class":35,"line":819},[33,32353,562],{"class":163},[33,32355,3867],{"class":46},[33,32357,3743],{"class":167},[33,32359,571],{"class":50},[33,32361,574],{"class":167},[33,32363,32364,32366,32368,32370],{"class":35,"line":829},[33,32365,3878],{"class":163},[33,32367,3881],{"class":167},[33,32369,164],{"class":163},[33,32371,3886],{"class":167},[33,32373,32374,32376,32378],{"class":35,"line":834},[33,32375,3795],{"class":167},[33,32377,242],{"class":163},[33,32379,3800],{"class":167},[33,32381,32382,32384,32386],{"class":35,"line":839},[33,32383,3685],{"class":167},[33,32385,242],{"class":163},[33,32387,3809],{"class":167},[33,32389,32390],{"class":35,"line":860},[33,32391,3699],{"class":167},[33,32393,32394,32396],{"class":35,"line":887},[33,32395,3911],{"class":54},[33,32397,247],{"class":167},[33,32399,32400,32402,32404],{"class":35,"line":907},[33,32401,3918],{"class":167},[33,32403,
1053],{"class":50},[33,32405,3923],{"class":167},[33,32407,32408],{"class":35,"line":1826},[33,32409,1202],{"class":167},[33,32411,32412],{"class":35,"line":1844},[33,32413,3718],{"class":167},[33,32415,32416],{"class":35,"line":1858},[33,32417,3837],{"class":167},[18,32419,3939],{"id":3938},[14,32421,3942],{},[23,32423,32424],{"className":126,"code":3945,"language":47,"meta":28,"style":28},[30,32425,32426,32430,32434,32438,32442,32446,32456,32466,32470,32474,32494,32502,32514,32544,32548,32568,32582,32588,32610,32614,32628],{"__ignoreMap":28},[33,32427,32428],{"class":35,"line":36},[33,32429,3952],{"class":39},[33,32431,32432],{"class":35,"line":43},[33,32433,139],{"class":54},[33,32435,32436],{"class":35,"line":61},[33,32437,3961],{"class":54},[33,32439,32440],{"class":35,"line":73},[33,32441,3966],{"class":54},[33,32443,32444],{"class":35,"line":88},[33,32445,139],{"class":54},[33,32447,32448,32450,32452,32454],{"class":35,"line":95},[33,32449,164],{"class":163},[33,32451,492],{"class":167},[33,32453,495],{"class":163},[33,32455,498],{"class":167},[33,32457,32458,32460,32462,32464],{"class":35,"line":101},[33,32459,190],{"class":163},[33,32461,193],{"class":167},[33,32463,164],{"class":163},[33,32465,198],{"class":167},[33,32467,32468],{"class":35,"line":171},[33,32469,92],{"emptyLinePlaceholder":91},[33,32471,32472],{"class":35,"line":179},[33,32473,92],{"emptyLinePlaceholder":91},[33,32475,32476,32478,32480,32482,32484,32486,32488,32490,32492],{"class":35,"line":187},[33,32477,562],{"class":163},[33,32479,4005],{"class":46},[33,32481,4008],{"class":167},[33,32483,1059],{"class":50},[33,32485,212],{"class":163},[33,32487,1814],{"class":50},[33,32489,1617],{"class":167},[33,32491,571],{"class":50},[33,32493,574],{"class":167},[33,32495,32496,32498,32500],{"class":35,"line":201},[33,32497,4025],{"class":167},[33,32499,242],{"class":163},[33,32501,4030],{"class":167},[33,32503,32504,32506,32508,32510,32512],{"class":35,"line":206},[33,32505,617],{"class":163},[33,3
2507,4037],{"class":50},[33,32509,4040],{"class":167},[33,32511,4043],{"class":163},[33,32513,4046],{"class":167},[33,32515,32516,32518,32520,32522,32524,32526,32528,32530,32532,32534,32536,32538,32540,32542],{"class":35,"line":224},[33,32517,4051],{"class":163},[33,32519,4054],{"class":50},[33,32521,602],{"class":167},[33,32523,4059],{"class":163},[33,32525,4062],{"class":54},[33,32527,4065],{"class":50},[33,32529,4068],{"class":167},[33,32531,1121],{"class":50},[33,32533,4073],{"class":54},[33,32535,1115],{"class":50},[33,32537,4078],{"class":167},[33,32539,1121],{"class":50},[33,32541,274],{"class":54},[33,32543,221],{"class":167},[33,32545,32546],{"class":35,"line":229},[33,32547,92],{"emptyLinePlaceholder":91},[33,32549,32550,32552,32554,32556,32558,32560,32562,32564,32566],{"class":35,"line":235},[33,32551,4093],{"class":167},[33,32553,242],{"class":163},[33,32555,4098],{"class":167},[33,32557,4101],{"class":54},[33,32559,365],{"class":167},[33,32561,4106],{"class":54},[33,32563,365],{"class":167},[33,32565,760],{"class":54},[33,32567,4113],{"class":167},[33,32569,32570,32572,32574,32576,32578,32580],{"class":35,"line":250},[33,32571,4118],{"class":167},[33,32573,242],{"class":163},[33,32575,4123],{"class":167},[33,32577,4126],{"class":163},[33,32579,4129],{"class":50},[33,32581,4132],{"class":167},[33,32583,32584,32586],{"class":35,"line":266},[33,32585,617],{"class":163},[33,32587,4139],{"class":167},[33,32589,32590,32592,32594,32596,32598,32600,32602,32604,32606,32608],{"class":35,"line":290},[33,32591,4051],{"class":163},[33,32593,4054],{"class":50},[33,32595,602],{"class":167},[33,32597,4059],{"class":163},[33,32599,4152],{"class":54},[33,32601,1115],{"class":50},[33,32603,4157],{"class":167},[33,32605,1121],{"class":50},[33,32607,274],{"class":54},[33,32609,221],{"class":167},[33,32611,32612],{"class":35,"line":295},[33,32613,92],{"emptyLinePlaceholder":91},[33,32615,32616,32618,32620,32622,32624,32626],{"class":35,"line":300},[33,32617,617],{"class":163
},[33,32619,4174],{"class":167},[33,32621,4177],{"class":238},[33,32623,242],{"class":163},[33,32625,734],{"class":50},[33,32627,4184],{"class":167},[33,32629,32630,32632,32634,32636,32638],{"class":35,"line":317},[33,32631,4051],{"class":163},[33,32633,4054],{"class":50},[33,32635,602],{"class":167},[33,32637,4195],{"class":54},[33,32639,221],{"class":167},[14,32641,4200,32642,4205],{},[940,32643,4204],{"href":4203},[18,32645,4209],{"id":4208},[4211,32647,32648,32654,32660,32668,32676],{},[4214,32649,32650,4219,32652,4222],{},[1974,32651,4218],{},[30,32653,507],{},[4214,32655,32656,4228,32658,4232],{},[1974,32657,4227],{},[30,32659,4231],{},[4214,32661,32662,2079,32664,4241,32666,4245],{},[1974,32663,4237],{},[30,32665,4240],{},[30,32667,4244],{},[4214,32669,32670,4251,32672,4254,32674,4258],{},[1974,32671,4250],{},[30,32673,76],{},[30,32675,4257],{},[4214,32677,32678,4264,32680,4267],{},[1974,32679,4263],{},[30,32681,2325],{},[18,32683,4271],{"id":4270},[4273,32685,32686,32696],{},[4276,32687,32688],{},[4279,32689,32690,32692,32694],{},[4282,32691,4284],{},[4282,32693,4287],{},[4282,32695,4290],{},[4292,32697,32698,32712,32724,32744,32756],{},[4279,32699,32700,32704,32708],{},[4297,32701,32702,4302],{},[30,32703,4301],{},[4297,32705,4305,32706,4309],{},[30,32707,4308],{},[4297,32709,4312,32710,4316],{},[30,32711,4315],{},[4279,32713,32714,32716,32720],{},[4297,32715,4321],{},[4297,32717,4324,32718],{},[30,32719,4327],{},[4297,32721,4330,32722,4334],{},[30,32723,4333],{},[4279,32725,32726,32730,32738],{},[4297,32727,32728,4342],{},[30,32729,4341],{},[4297,32731,4345,32732,4348,32734,4352,32736],{},[30,32733,122],{},[30,32735,4351],{},[30,32737,4355],{},[4297,32739,4358,32740,4362,32742],{},[30,32741,4361],{},[30,32743,4365],{},[4279,32745,32746,32748,32752],{},[4297,32747,4370],{},[4297,32749,4373,32750],{},[30,32751,4376],{},[4297,32753,4379,32754,4383],{},[30,32755,4382],{},[4279,32757,32758,32760,32762],{},[4297,32759,4388],{},[4297,32761,4391],{},[4297,32763,43
94,32764,4398],{},[30,32765,4397],{},[18,32767,4402],{"id":4401},[23,32769,32770],{"className":126,"code":4405,"language":47,"meta":28,"style":28},[30,32771,32772,32776,32780,32784,32788,32792,32796,32800,32804,32808,32812,32818,32824,32830,32836,32842,32848,32854,32860,32870,32880,32884,32890,32900,32906,32916,32920,32924,32928,32932,32936,32940,32944,32948,32964,32980,32996,33012,33024,33040,33056,33072,33088,33104,33108,33112,33116,33124,33146,33154,33164,33182,33192,33202,33206,33214,33230,33244,33248,33252,33260,33268,33272,33280,33288,33292,33296,33306,33310,33314,33322,33326,33330,33334,33346,33352,33380,33388,33392,33398,33406,33412,33416,33420,33432,33438,33444,33452,33456,33460,33464,33468,33476,33488,33492,33496,33500,33504,33508,33514,33518,33522,33534,33540,33544,33548,33560,33568,33576,33584,33590,33594,33598,33610,33614,33618,33630,33638,33642,33648,33656,33660,33664,33668,33672,33676,33680,33688,33698,33710,33742,33758,33770,33774,33778,33782,33786,33806,33812,33818,33824,33834,33838,33850,33856,33864,33876,33888,33900,33904,33922,33926,33938,33942,33950,33958,33962,33970,34004,34012,34018,34026,34042,34052,34056,34062,34076,34080,34088,34096,34100,34104,34108,34112,34120,34128,34138,34148,34156,34174,34196,34208,34212,34218,34222,34226,34230,34242,34254,34280,34284,34292,34300,34304,34308,34316,34326,34332,34340,34344,34348,34368,34376,34388,34392,34400,34416,34420,34424,34440,34460,34476,34498,34510,34530,34534,34538,34550,34558,34566,34570,34576,34580,34590,34598,34602,34612,34642,34646,34654,34660,34664,34668,34672,34676,34688,34704,34726,34748,34752,34758,34768,34778,34782,34790,34794,34800,34808,34816,34832,34836,34844,34848,34856,34862,34878,34882,34886,34898],{"__ignoreMap":28},[33,32773,32774],{"class":35,"line":36},[33,32775,4412],{"class":39},[33,32777,32778],{"class":35,"line":43},[33,32779,139],{"class":54},[33,32781,32782],{"class":35,"line":61},[33,32783,4421],{"class":54},[33,32785,32786],{"class":35,"line":73},[33,32787,4426],{"class
":54},[33,32789,32790],{"class":35,"line":88},[33,32791,92],{"emptyLinePlaceholder":91},[33,32793,32794],{"class":35,"line":95},[33,32795,4435],{"class":54},[33,32797,32798],{"class":35,"line":101},[33,32799,4440],{"class":54},[33,32801,32802],{"class":35,"line":171},[33,32803,4445],{"class":54},[33,32805,32806],{"class":35,"line":179},[33,32807,4450],{"class":54},[33,32809,32810],{"class":35,"line":187},[33,32811,139],{"class":54},[33,32813,32814,32816],{"class":35,"line":201},[33,32815,164],{"class":163},[33,32817,4461],{"class":167},[33,32819,32820,32822],{"class":35,"line":206},[33,32821,164],{"class":163},[33,32823,3634],{"class":167},[33,32825,32826,32828],{"class":35,"line":224},[33,32827,164],{"class":163},[33,32829,184],{"class":167},[33,32831,32832,32834],{"class":35,"line":229},[33,32833,164],{"class":163},[33,32835,1004],{"class":167},[33,32837,32838,32840],{"class":35,"line":235},[33,32839,164],{"class":163},[33,32841,176],{"class":167},[33,32843,32844,32846],{"class":35,"line":250},[33,32845,164],{"class":163},[33,32847,3627],{"class":167},[33,32849,32850,32852],{"class":35,"line":266},[33,32851,164],{"class":163},[33,32853,168],{"class":167},[33,32855,32856,32858],{"class":35,"line":290},[33,32857,164],{"class":163},[33,32859,1689],{"class":167},[33,32861,32862,32864,32866,32868],{"class":35,"line":295},[33,32863,190],{"class":163},[33,32865,3881],{"class":167},[33,32867,164],{"class":163},[33,32869,3886],{"class":167},[33,32871,32872,32874,32876,32878],{"class":35,"line":300},[33,32873,190],{"class":163},[33,32875,193],{"class":167},[33,32877,164],{"class":163},[33,32879,198],{"class":167},[33,32881,32882],{"class":35,"line":317},[33,32883,92],{"emptyLinePlaceholder":91},[33,32885,32886,32888],{"class":35,"line":332},[33,32887,164],{"class":163},[33,32889,485],{"class":167},[33,32891,32892,32894,32896,32898],{"class":35,"line":347},[33,32893,164],{"class":163},[33,32895,492],{"class":167},[33,32897,495],{"class":163},[33,32899,498],{"class":167},[33,
32901,32902,32904],{"class":35,"line":374},[33,32903,164],{"class":163},[33,32905,2373],{"class":167},[33,32907,32908,32910,32912,32914],{"class":35,"line":397},[33,32909,190],{"class":163},[33,32911,1410],{"class":167},[33,32913,164],{"class":163},[33,32915,1415],{"class":167},[33,32917,32918],{"class":35,"line":653},[33,32919,1420],{"class":167},[33,32921,32922],{"class":35,"line":667},[33,32923,1425],{"class":167},[33,32925,32926],{"class":35,"line":675},[33,32927,1430],{"class":167},[33,32929,32930],{"class":35,"line":689},[33,32931,1435],{"class":167},[33,32933,32934],{"class":35,"line":703},[33,32935,1440],{"class":167},[33,32937,32938],{"class":35,"line":714},[33,32939,221],{"class":167},[33,32941,32942],{"class":35,"line":723},[33,32943,92],{"emptyLinePlaceholder":91},[33,32945,32946],{"class":35,"line":754},[33,32947,4592],{"class":39},[33,32949,32950,32952,32954,32956,32958,32960,32962],{"class":35,"line":771},[33,32951,507],{"class":50},[33,32953,212],{"class":163},[33,32955,4601],{"class":167},[33,32957,4604],{"class":54},[33,32959,365],{"class":167},[33,32961,514],{"class":54},[33,32963,371],{"class":167},[33,32965,32966,32968,32970,32972,32974,32976,32978],{"class":35,"line":777},[33,32967,4615],{"class":50},[33,32969,212],{"class":163},[33,32971,4601],{"class":167},[33,32973,4622],{"class":54},[33,32975,365],{"class":167},[33,32977,4627],{"class":54},[33,32979,371],{"class":167},[33,32981,32982,32984,32986,32988,32990,32992,32994],{"class":35,"line":788},[33,32983,1023],{"class":50},[33,32985,212],{"class":163},[33,32987,4601],{"class":167},[33,32989,4640],{"class":54},[33,32991,365],{"class":167},[33,32993,1030],{"class":54},[33,32995,371],{"class":167},[33,32997,32998,33000,33002,33004,33006,33008,33010],{"class":35,"line":804},[33,32999,3653],{"class":50},[33,33001,212],{"class":163},[33,33003,4601],{"class":167},[33,33005,4657],{"class":54},[33,33007,365],{"class":167},[33,33009,3660],{"class":54},[33,33011,371],{"class":167},[33,33013,33014,33016
,33018,33020,33022],{"class":35,"line":809},[33,33015,2780],{"class":50},[33,33017,212],{"class":163},[33,33019,215],{"class":167},[33,33021,2787],{"class":54},[33,33023,221],{"class":167},[33,33025,33026,33028,33030,33032,33034,33036,33038],{"class":35,"line":819},[33,33027,3124],{"class":50},[33,33029,212],{"class":163},[33,33031,3129],{"class":167},[33,33033,3132],{"class":54},[33,33035,365],{"class":167},[33,33037,3198],{"class":54},[33,33039,221],{"class":167},[33,33041,33042,33044,33046,33048,33050,33052,33054],{"class":35,"line":829},[33,33043,3167],{"class":50},[33,33045,212],{"class":163},[33,33047,3129],{"class":167},[33,33049,3174],{"class":54},[33,33051,365],{"class":167},[33,33053,3198],{"class":54},[33,33055,221],{"class":167},[33,33057,33058,33060,33062,33064,33066,33068,33070],{"class":35,"line":834},[33,33059,3186],{"class":50},[33,33061,212],{"class":163},[33,33063,3129],{"class":167},[33,33065,3193],{"class":54},[33,33067,365],{"class":167},[33,33069,3198],{"class":54},[33,33071,221],{"class":167},[33,33073,33074,33076,33078,33080,33082,33084,33086],{"class":35,"line":839},[33,33075,3205],{"class":50},[33,33077,212],{"class":163},[33,33079,3129],{"class":167},[33,33081,3212],{"class":54},[33,33083,365],{"class":167},[33,33085,3198],{"class":54},[33,33087,221],{"class":167},[33,33089,33090,33092,33094,33096,33098,33100,33102],{"class":35,"line":860},[33,33091,3224],{"class":50},[33,33093,212],{"class":163},[33,33095,3129],{"class":167},[33,33097,3231],{"class":54},[33,33099,365],{"class":167},[33,33101,3198],{"class":54},[33,33103,221],{"class":167},[33,33105,33106],{"class":35,"line":887},[33,33107,92],{"emptyLinePlaceholder":91},[33,33109,33110],{"class":35,"line":907},[33,33111,4764],{"class":39},[33,33113,33114],{"class":35,"line":1826},[33,33115,92],{"emptyLinePlaceholder":91},[33,33117,33118,33120,33122],{"class":35,"line":1844},[33,33119,562],{"class":163},[33,33121,1047],{"class":46},[33,33123,4777],{"class":167},[33,33125,33126,33128,33130
,33132,33134,33136,33138,33140,33142,33144],{"class":35,"line":1858},[33,33127,1075],{"class":50},[33,33129,1078],{"class":167},[33,33131,869],{"class":238},[33,33133,242],{"class":163},[33,33135,855],{"class":50},[33,33137,365],{"class":167},[33,33139,878],{"class":238},[33,33141,242],{"class":163},[33,33143,855],{"class":50},[33,33145,221],{"class":167},[33,33147,33148,33150,33152],{"class":35,"line":1871},[33,33149,1133],{"class":167},[33,33151,242],{"class":163},[33,33153,1138],{"class":167},[33,33155,33156,33158,33160,33162],{"class":35,"line":1877},[33,33157,4812],{"class":50},[33,33159,1107],{"class":163},[33,33161,4817],{"class":54},[33,33163,247],{"class":167},[33,33165,33166,33168,33170,33172,33174,33176,33178,33180],{"class":35,"line":1883},[33,33167,1148],{"class":238},[33,33169,242],{"class":163},[33,33171,1153],{"class":50},[33,33173,1156],{"class":163},[33,33175,1159],{"class":50},[33,33177,1156],{"class":163},[33,33179,1159],{"class":50},[33,33181,247],{"class":167},[33,33183,33184,33186,33188,33190],{"class":35,"line":1915},[33,33185,1174],{"class":238},[33,33187,242],{"class":163},[33,33189,1179],{"class":50},[33,33191,247],{"class":167},[33,33193,33194,33196,33198,33200],{"class":35,"line":1926},[33,33195,1190],{"class":238},[33,33197,242],{"class":163},[33,33199,1195],{"class":54},[33,33201,247],{"class":167},[33,33203,33204],{"class":35,"line":1932},[33,33205,1202],{"class":167},[33,33207,33208,33210,33212],{"class":35,"line":1938},[33,33209,4866],{"class":167},[33,33211,242],{"class":163},[33,33213,1212],{"class":167},[33,33215,33216,33218,33220,33222,33224,33226,33228],{"class":35,"line":1950},[33,33217,4875],{"class":54},[33,33219,277],{"class":50},[33,33221,1226],{"class":50},[33,33223,280],{"class":50},[33,33225,283],{"class":50},[33,33227,274],{"class":54},[33,33229,247],{"class":167},[33,33231,33232,33234,33236,33238,33240,33242],{"class":35,"line":1958},[33,33233,1239],{"class":238},[33,33235,242],{"class":163},[33,33237,1244],{"class":5
4},[33,33239,916],{"class":50},[33,33241,1249],{"class":54},[33,33243,247],{"class":167},[33,33245,33246],{"class":35,"line":4904},[33,33247,1202],{"class":167},[33,33249,33250],{"class":35,"line":4909},[33,33251,4912],{"class":167},[33,33253,33254,33256,33258],{"class":35,"line":4915},[33,33255,4918],{"class":167},[33,33257,242],{"class":163},[33,33259,1303],{"class":167},[33,33261,33262,33264,33266],{"class":35,"line":4925},[33,33263,4928],{"class":167},[33,33265,1311],{"class":50},[33,33267,221],{"class":167},[33,33269,33270],{"class":35,"line":4935},[33,33271,4938],{"class":167},[33,33273,33274,33276,33278],{"class":35,"line":4941},[33,33275,1269],{"class":167},[33,33277,242],{"class":163},[33,33279,1274],{"class":167},[33,33281,33282,33284,33286],{"class":35,"line":4950},[33,33283,4953],{"class":167},[33,33285,1067],{"class":50},[33,33287,221],{"class":167},[33,33289,33290],{"class":35,"line":4960},[33,33291,1284],{"class":167},[33,33293,33294],{"class":35,"line":4965},[33,33295,4968],{"class":167},[33,33297,33298,33300,33302,33304],{"class":35,"line":4971},[33,33299,1332],{"class":163},[33,33301,544],{"class":167},[33,33303,4978],{"class":54},[33,33305,221],{"class":167},[33,33307,33308],{"class":35,"line":4983},[33,33309,92],{"emptyLinePlaceholder":91},[33,33311,33312],{"class":35,"line":4988},[33,33313,92],{"emptyLinePlaceholder":91},[33,33315,33316,33318,33320],{"class":35,"line":4993},[33,33317,539],{"class":167},[33,33319,242],{"class":163},[33,33321,5000],{"class":167},[33,33323,33324],{"class":35,"line":5003},[33,33325,92],{"emptyLinePlaceholder":91},[33,33327,33328],{"class":35,"line":5008},[33,33329,5011],{"class":39},[33,33331,33332],{"class":35,"line":5014},[33,33333,92],{"emptyLinePlaceholder":91},[33,33335,33336,33338,33340,33342,33344],{"class":35,"line":5019},[33,33337,562],{"class":163},[33,33339,2816],{"class":46},[33,33341,568],{"class":167},[33,33343,2821],{"class":50},[33,33345,574],{"class":167},[33,33347,33348,33350],{"class":35,"line":50
32},[33,33349,2424],{"class":163},[33,33351,574],{"class":167},[33,33353,33354,33356,33358,33360,33362,33364,33366,33368,33370,33372,33374,33376,33378],{"class":35,"line":5039},[33,33355,2834],{"class":167},[33,33357,242],{"class":163},[33,33359,2839],{"class":167},[33,33361,2780],{"class":50},[33,33363,2844],{"class":167},[33,33365,2847],{"class":50},[33,33367,2850],{"class":163},[33,33369,2853],{"class":167},[33,33371,2856],{"class":50},[33,33373,2850],{"class":163},[33,33375,2853],{"class":167},[33,33377,2863],{"class":50},[33,33379,221],{"class":167},[33,33381,33382,33384,33386],{"class":35,"line":5068},[33,33383,2870],{"class":167},[33,33385,1053],{"class":50},[33,33387,2875],{"class":167},[33,33389,33390],{"class":35,"line":5077},[33,33391,2880],{"class":167},[33,33393,33394,33396],{"class":35,"line":5082},[33,33395,1659],{"class":163},[33,33397,2887],{"class":50},[33,33399,33400,33402,33404],{"class":35,"line":5089},[33,33401,2449],{"class":163},[33,33403,2894],{"class":50},[33,33405,574],{"class":167},[33,33407,33408,33410],{"class":35,"line":5098},[33,33409,1659],{"class":163},[33,33411,2903],{"class":50},[33,33413,33414],{"class":35,"line":5105},[33,33415,92],{"emptyLinePlaceholder":91},[33,33417,33418],{"class":35,"line":5110},[33,33419,92],{"emptyLinePlaceholder":91},[33,33421,33422,33424,33426,33428,33430],{"class":35,"line":5115},[33,33423,562],{"class":163},[33,33425,2918],{"class":46},[33,33427,568],{"class":167},[33,33429,571],{"class":50},[33,33431,574],{"class":167},[33,33433,33434,33436],{"class":35,"line":5128},[33,33435,2424],{"class":163},[33,33437,574],{"class":167},[33,33439,33440,33442],{"class":35,"line":5135},[33,33441,2935],{"class":50},[33,33443,2938],{"class":167},[33,33445,33446,33448,33450],{"class":35,"line":5142},[33,33447,2449],{"class":163},[33,33449,2945],{"class":50},[33,33451,574],{"class":167},[33,33453,33454],{"class":35,"line":5151},[33,33455,2952],{"class":163},[33,33457,33458],{"class":35,"line":5156},[33,33459,92],{"empt
yLinePlaceholder":91},[33,33461,33462],{"class":35,"line":5161},[33,33463,5164],{"class":39},[33,33465,33466],{"class":35,"line":5167},[33,33467,92],{"emptyLinePlaceholder":91},[33,33469,33470,33472,33474],{"class":35,"line":5172},[33,33471,562],{"class":163},[33,33473,5177],{"class":46},[33,33475,3680],{"class":167},[33,33477,33478,33480,33482,33484,33486],{"class":35,"line":5182},[33,33479,3685],{"class":167},[33,33481,242],{"class":163},[33,33483,3690],{"class":167},[33,33485,3653],{"class":50},[33,33487,221],{"class":167},[33,33489,33490],{"class":35,"line":5195},[33,33491,3699],{"class":167},[33,33493,33494],{"class":35,"line":5200},[33,33495,3704],{"class":54},[33,33497,33498],{"class":35,"line":5205},[33,33499,3709],{"class":54},[33,33501,33502],{"class":35,"line":5210},[33,33503,1202],{"class":167},[33,33505,33506],{"class":35,"line":5215},[33,33507,3718],{"class":167},[33,33509,33510,33512],{"class":35,"line":5220},[33,33511,1332],{"class":163},[33,33513,3725],{"class":167},[33,33515,33516],{"class":35,"line":5227},[33,33517,92],{"emptyLinePlaceholder":91},[33,33519,33520],{"class":35,"line":5232},[33,33521,92],{"emptyLinePlaceholder":91},[33,33523,33524,33526,33528,33530,33532],{"class":35,"line":5237},[33,33525,562],{"class":163},[33,33527,5242],{"class":46},[33,33529,3743],{"class":167},[33,33531,1053],{"class":50},[33,33533,574],{"class":167},[33,33535,33536,33538],{"class":35,"line":5251},[33,33537,1332],{"class":163},[33,33539,5256],{"class":167},[33,33541,33542],{"class":35,"line":5259},[33,33543,92],{"emptyLinePlaceholder":91},[33,33545,33546],{"class":35,"line":5264},[33,33547,92],{"emptyLinePlaceholder":91},[33,33549,33550,33552,33554,33556,33558],{"class":35,"line":5269},[33,33551,562],{"class":163},[33,33553,5274],{"class":46},[33,33555,3743],{"class":167},[33,33557,2821],{"class":50},[33,33559,574],{"class":167},[33,33561,33562,33564,33566],{"class":35,"line":5283},[33,33563,3795],{"class":167},[33,33565,242],{"class":163},[33,33567,5290],{"cla
ss":167},[33,33569,33570,33572,33574],{"class":35,"line":5293},[33,33571,3685],{"class":167},[33,33573,242],{"class":163},[33,33575,5300],{"class":167},[33,33577,33578,33580,33582],{"class":35,"line":5303},[33,33579,5306],{"class":167},[33,33581,242],{"class":163},[33,33583,3819],{"class":167},[33,33585,33586,33588],{"class":35,"line":5313},[33,33587,3824],{"class":54},[33,33589,3827],{"class":167},[33,33591,33592],{"class":35,"line":5320},[33,33593,3832],{"class":167},[33,33595,33596],{"class":35,"line":5325},[33,33597,3837],{"class":167},[33,33599,33600,33602,33604,33606,33608],{"class":35,"line":5330},[33,33601,1332],{"class":163},[33,33603,5335],{"class":167},[33,33605,3847],{"class":163},[33,33607,620],{"class":163},[33,33609,3852],{"class":50},[33,33611,33612],{"class":35,"line":5344},[33,33613,92],{"emptyLinePlaceholder":91},[33,33615,33616],{"class":35,"line":5349},[33,33617,92],{"emptyLinePlaceholder":91},[33,33619,33620,33622,33624,33626,33628],{"class":35,"line":5354},[33,33621,562],{"class":163},[33,33623,5359],{"class":46},[33,33625,3743],{"class":167},[33,33627,571],{"class":50},[33,33629,574],{"class":167},[33,33631,33632,33634,33636],{"class":35,"line":5368},[33,33633,3685],{"class":167},[33,33635,242],{"class":163},[33,33637,5300],{"class":167},[33,33639,33640],{"class":35,"line":5377},[33,33641,3699],{"class":167},[33,33643,33644,33646],{"class":35,"line":5382},[33,33645,3911],{"class":54},[33,33647,247],{"class":167},[33,33649,33650,33652,33654],{"class":35,"line":5389},[33,33651,5392],{"class":167},[33,33653,1053],{"class":50},[33,33655,3923],{"class":167},[33,33657,33658],{"class":35,"line":5399},[33,33659,1202],{"class":167},[33,33661,33662],{"class":35,"line":5404},[33,33663,3718],{"class":167},[33,33665,33666],{"class":35,"line":5409},[33,33667,3837],{"class":167},[33,33669,33670],{"class":35,"line":5414},[33,33671,92],{"emptyLinePlaceholder":91},[33,33673,33674],{"class":35,"line":5419},[33,33675,5422],{"class":39},[33,33677,33678],{"class":
35,"line":5425},[33,33679,92],{"emptyLinePlaceholder":91},[33,33681,33682,33684,33686],{"class":35,"line":5430},[33,33683,5433],{"class":167},[33,33685,242],{"class":163},[33,33687,1479],{"class":167},[33,33689,33690,33692,33694,33696],{"class":35,"line":5440},[33,33691,1484],{"class":238},[33,33693,242],{"class":163},[33,33695,855],{"class":50},[33,33697,247],{"class":167},[33,33699,33700,33702,33704,33706,33708],{"class":35,"line":5451},[33,33701,1495],{"class":238},[33,33703,242],{"class":163},[33,33705,1500],{"class":167},[33,33707,1503],{"class":50},[33,33709,1506],{"class":167},[33,33711,33712,33714,33716,33718,33720,33722,33724,33726,33728,33730,33732,33734,33736,33738,33740],{"class":35,"line":5464},[33,33713,1511],{"class":238},[33,33715,242],{"class":163},[33,33717,1516],{"class":167},[33,33719,1519],{"class":238},[33,33721,242],{"class":163},[33,33723,734],{"class":50},[33,33725,365],{"class":167},[33,33727,1528],{"class":238},[33,33729,242],{"class":163},[33,33731,1533],{"class":50},[33,33733,365],{"class":167},[33,33735,1538],{"class":238},[33,33737,242],{"class":163},[33,33739,1543],{"class":50},[33,33741,1506],{"class":167},[33,33743,33744,33746,33748,33750,33752,33754,33756],{"class":35,"line":5497},[33,33745,1550],{"class":238},[33,33747,242],{"class":163},[33,33749,1555],{"class":167},[33,33751,1558],{"class":50},[33,33753,365],{"class":167},[33,33755,1563],{"class":50},[33,33757,1571],{"class":167},[33,33759,33760,33762,33764,33766,33768],{"class":35,"line":5514},[33,33761,1576],{"class":238},[33,33763,242],{"class":163},[33,33765,1581],{"class":167},[33,33767,1311],{"class":50},[33,33769,1506],{"class":167},[33,33771,33772],{"class":35,"line":5527},[33,33773,221],{"class":167},[33,33775,33776],{"class":35,"line":5532},[33,33777,92],{"emptyLinePlaceholder":91},[33,33779,33780],{"class":35,"line":5537},[33,33781,5540],{"class":39},[33,33783,33784],{"class":35,"line":5543},[33,33785,92],{"emptyLinePlaceholder":91},[33,33787,33788,33790,33792,33794,3
3796,33798,33800,33802,33804],{"class":35,"line":5548},[33,33789,562],{"class":163},[33,33791,5553],{"class":46},[33,33793,3255],{"class":167},[33,33795,1053],{"class":50},[33,33797,3260],{"class":167},[33,33799,1053],{"class":50},[33,33801,1617],{"class":167},[33,33803,571],{"class":50},[33,33805,574],{"class":167},[33,33807,33808,33810],{"class":35,"line":5570},[33,33809,1627],{"class":163},[33,33811,3081],{"class":167},[33,33813,33814,33816],{"class":35,"line":5577},[33,33815,1627],{"class":163},[33,33817,3074],{"class":167},[33,33819,33820,33822],{"class":35,"line":5584},[33,33821,1627],{"class":163},[33,33823,1630],{"class":167},[33,33825,33826,33828,33830,33832],{"class":35,"line":5591},[33,33827,3878],{"class":163},[33,33829,3100],{"class":167},[33,33831,164],{"class":163},[33,33833,3105],{"class":167},[33,33835,33836],{"class":35,"line":5602},[33,33837,92],{"emptyLinePlaceholder":91},[33,33839,33840,33842,33844,33846,33848],{"class":35,"line":5607},[33,33841,617],{"class":163},[33,33843,5612],{"class":50},[33,33845,5615],{"class":163},[33,33847,5618],{"class":50},[33,33849,574],{"class":167},[33,33851,33852,33854],{"class":35,"line":5623},[33,33853,670],{"class":163},[33,33855,574],{"class":167},[33,33857,33858,33860,33862],{"class":35,"line":5630},[33,33859,5633],{"class":167},[33,33861,242],{"class":163},[33,33863,3278],{"class":167},[33,33865,33866,33868,33870,33872,33874],{"class":35,"line":5640},[33,33867,5643],{"class":167},[33,33869,3286],{"class":54},[33,33871,763],{"class":167},[33,33873,242],{"class":163},[33,33875,3293],{"class":167},[33,33877,33878,33880,33882,33884,33886],{"class":35,"line":5654},[33,33879,5643],{"class":167},[33,33881,3300],{"class":54},[33,33883,763],{"class":167},[33,33885,242],{"class":163},[33,33887,3307],{"class":50},[33,33889,33890,33892,33894,33896,33898],{"class":35,"line":5667},[33,33891,5643],{"class":167},[33,33893,3314],{"class":54},[33,33895,763],{"class":167},[33,33897,242],{"class":163},[33,33899,3321],{"class":5
0},[33,33901,33902],{"class":35,"line":5680},[33,33903,5683],{"class":167},[33,33905,33906,33908,33910,33912,33914,33916,33918,33920],{"class":35,"line":5686},[33,33907,678],{"class":163},[33,33909,3339],{"class":167},[33,33911,3124],{"class":50},[33,33913,365],{"class":167},[33,33915,5697],{"class":50},[33,33917,1649],{"class":167},[33,33919,495],{"class":163},[33,33921,3352],{"class":167},[33,33923,33924],{"class":35,"line":5706},[33,33925,5709],{"class":167},[33,33927,33928,33930,33932,33934,33936],{"class":35,"line":5712},[33,33929,5715],{"class":167},[33,33931,3167],{"class":50},[33,33933,365],{"class":167},[33,33935,3186],{"class":50},[33,33937,221],{"class":167},[33,33939,33940],{"class":35,"line":5726},[33,33941,5729],{"class":167},[33,33943,33944,33946,33948],{"class":35,"line":5732},[33,33945,780],{"class":163},[33,33947,783],{"class":50},[33,33949,574],{"class":167},[33,33951,33952,33954,33956],{"class":35,"line":5741},[33,33953,791],{"class":167},[33,33955,5746],{"class":54},[33,33957,221],{"class":167},[33,33959,33960],{"class":35,"line":5751},[33,33961,92],{"emptyLinePlaceholder":91},[33,33963,33964,33966,33968],{"class":35,"line":5756},[33,33965,617],{"class":163},[33,33967,3440],{"class":50},[33,33969,574],{"class":167},[33,33971,33972,33974,33976,33978,33980,33982,33984,33986,33988,33990,33992,33994,33996,33998,34000,34002],{"class":35,"line":5765},[33,33973,5768],{"class":167},[33,33975,242],{"class":163},[33,33977,3456],{"class":167},[33,33979,3459],{"class":54},[33,33981,2079],{"class":167},[33,33983,4059],{"class":163},[33,33985,5781],{"class":54},[33,33987,1115],{"class":50},[33,33989,5786],{"class":167},[33,33991,1121],{"class":50},[33,33993,1769],{"class":54},[33,33995,5793],{"class":50},[33,33997,5796],{"class":167},[33,33999,1121],{"class":50},[33,34001,274],{"class":54},[33,34003,5803],{"class":167},[33,34005,34006,34008,34010],{"class":35,"line":5806},[33,34007,5809],{"class":167},[33,34009,242],{"class":163},[33,34011,3472],{"class":167}
,[33,34013,34014,34016],{"class":35,"line":5816},[33,34015,5819],{"class":50},[33,34017,247],{"class":167},[33,34019,34020,34022,34024],{"class":35,"line":5824},[33,34021,5827],{"class":238},[33,34023,242],{"class":163},[33,34025,3489],{"class":167},[33,34027,34028,34030,34032,34034,34036,34038,34040],{"class":35,"line":5834},[33,34029,5837],{"class":238},[33,34031,242],{"class":163},[33,34033,1115],{"class":167},[33,34035,3501],{"class":54},[33,34037,2079],{"class":167},[33,34039,3506],{"class":54},[33,34041,3509],{"class":167},[33,34043,34044,34046,34048,34050],{"class":35,"line":5852},[33,34045,5855],{"class":238},[33,34047,242],{"class":163},[33,34049,3519],{"class":54},[33,34051,247],{"class":167},[33,34053,34054],{"class":35,"line":5864},[33,34055,5867],{"class":167},[33,34057,34058,34060],{"class":35,"line":5870},[33,34059,670],{"class":163},[33,34061,574],{"class":167},[33,34063,34064,34066,34068,34070,34072,34074],{"class":35,"line":5877},[33,34065,678],{"class":163},[33,34067,3538],{"class":167},[33,34069,1641],{"class":238},[33,34071,242],{"class":163},[33,34073,3545],{"class":50},[33,34075,1737],{"class":167},[33,34077,34078],{"class":35,"line":5892},[33,34079,5895],{"class":163},[33,34081,34082,34084,34086],{"class":35,"line":5898},[33,34083,780],{"class":163},[33,34085,783],{"class":50},[33,34087,574],{"class":167},[33,34089,34090,34092,34094],{"class":35,"line":5907},[33,34091,791],{"class":167},[33,34093,5912],{"class":54},[33,34095,221],{"class":167},[33,34097,34098],{"class":35,"line":5917},[33,34099,92],{"emptyLinePlaceholder":91},[33,34101,34102],{"class":35,"line":5922},[33,34103,5925],{"class":39},[33,34105,34106],{"class":35,"line":5928},[33,34107,92],{"emptyLinePlaceholder":91},[33,34109,34110],{"class":35,"line":5933},[33,34111,5936],{"class":46},[33,34113,34114,34116,34118],{"class":35,"line":5939},[33,34115,562],{"class":163},[33,34117,5944],{"class":46},[33,34119,5947],{"class":167},[33,34121,34122,34124,34126],{"class":35,"line":5950},[3
3,34123,584],{"class":167},[33,34125,242],{"class":163},[33,34127,589],{"class":167},[33,34129,34130,34132,34134,34136],{"class":35,"line":5959},[33,34131,1635],{"class":163},[33,34133,681],{"class":167},[33,34135,495],{"class":163},[33,34137,686],{"class":167},[33,34139,34140,34142,34144,34146],{"class":35,"line":5970},[33,34141,5973],{"class":163},[33,34143,695],{"class":167},[33,34145,662],{"class":163},[33,34147,700],{"class":167},[33,34149,34150,34152,34154],{"class":35,"line":5982},[33,34151,5985],{"class":167},[33,34153,242],{"class":163},[33,34155,711],{"class":167},[33,34157,34158,34160,34162,34164,34166,34168,34170,34172],{"class":35,"line":5992},[33,34159,5995],{"class":163},[33,34161,5998],{"class":167},[33,34163,6001],{"class":163},[33,34165,4037],{"class":50},[33,34167,6006],{"class":167},[33,34169,6009],{"class":163},[33,34171,1814],{"class":50},[33,34173,574],{"class":167},[33,34175,34176,34178,34180,34182,34184,34186,34188,34190,34192,34194],{"class":35,"line":6016},[33,34177,6019],{"class":167},[33,34179,242],{"class":163},[33,34181,731],{"class":167},[33,34183,734],{"class":50},[33,34185,737],{"class":167},[33,34187,740],{"class":238},[33,34189,242],{"class":163},[33,34191,745],{"class":167},[33,34193,748],{"class":50},[33,34195,751],{"class":167},[33,34197,34198,34200,34202,34204,34206],{"class":35,"line":6040},[33,34199,6043],{"class":167},[33,34201,760],{"class":54},[33,34203,763],{"class":167},[33,34205,242],{"class":163},[33,34207,768],{"class":167},[33,34209,34210],{"class":35,"line":6054},[33,34211,6057],{"class":167},[33,34213,34214,34216],{"class":35,"line":6060},[33,34215,1332],{"class":163},[33,34217,6065],{"class":167},[33,34219,34220],{"class":35,"line":6068},[33,34221,92],{"emptyLinePlaceholder":91},[33,34223,34224],{"class":35,"line":6073},[33,34225,6076],{"class":39},[33,34227,34228],{"class":35,"line":6079},[33,34229,92],{"emptyLinePlaceholder":91},[33,34231,34232,34234,34236,34238,34240],{"class":35,"line":6084},[33,34233,562],{"
class":163},[33,34235,6089],{"class":46},[33,34237,6092],{"class":167},[33,34239,571],{"class":50},[33,34241,574],{"class":167},[33,34243,34244,34246,34248,34250,34252],{"class":35,"line":6099},[33,34245,910],{"class":167},[33,34247,6104],{"class":54},[33,34249,309],{"class":50},[33,34251,274],{"class":54},[33,34253,6111],{"class":167},[33,34255,34256,34258,34260,34262,34264,34266,34268,34270,34272,34274,34276,34278],{"class":35,"line":6114},[33,34257,594],{"class":167},[33,34259,242],{"class":163},[33,34261,6121],{"class":167},[33,34263,6124],{"class":163},[33,34265,6127],{"class":167},[33,34267,662],{"class":163},[33,34269,6132],{"class":167},[33,34271,610],{"class":54},[33,34273,1649],{"class":167},[33,34275,2491],{"class":163},[33,34277,620],{"class":163},[33,34279,6143],{"class":167},[33,34281,34282],{"class":35,"line":6146},[33,34283,92],{"emptyLinePlaceholder":91},[33,34285,34286,34288,34290],{"class":35,"line":6151},[33,34287,617],{"class":163},[33,34289,620],{"class":163},[33,34291,623],{"class":167},[33,34293,34294,34296,34298],{"class":35,"line":6160},[33,34295,2439],{"class":167},[33,34297,6165],{"class":54},[33,34299,221],{"class":167},[33,34301,34302],{"class":35,"line":6170},[33,34303,646],{"class":163},[33,34305,34306],{"class":35,"line":6175},[33,34307,92],{"emptyLinePlaceholder":91},[33,34309,34310,34312,34314],{"class":35,"line":6180},[33,34311,6183],{"class":167},[33,34313,242],{"class":163},[33,34315,589],{"class":167},[33,34317,34318,34320,34322,34324],{"class":35,"line":6190},[33,34319,656],{"class":163},[33,34321,659],{"class":167},[33,34323,662],{"class":163},[33,34325,623],{"class":167},[33,34327,34328,34330],{"class":35,"line":6201},[33,34329,670],{"class":163},[33,34331,574],{"class":167},[33,34333,34334,34336,34338],{"class":35,"line":6208},[33,34335,6211],{"class":167},[33,34337,242],{"class":163},[33,34339,6216],{"class":167},[33,34341,34342],{"class":35,"line":6219},[33,34343,6222],{"class":167},[33,34345,34346],{"class":35,"line":622
5},[33,34347,6228],{"class":167},[33,34349,34350,34352,34354,34356,34358,34360,34362,34364,34366],{"class":35,"line":6231},[33,34351,6234],{"class":167},[33,34353,6237],{"class":54},[33,34355,309],{"class":50},[33,34357,6242],{"class":54},[33,34359,916],{"class":50},[33,34361,6247],{"class":54},[33,34363,6250],{"class":167},[33,34365,928],{"class":50},[33,34367,6255],{"class":167},[33,34369,34370,34372,34374],{"class":35,"line":6258},[33,34371,780],{"class":163},[33,34373,783],{"class":50},[33,34375,574],{"class":167},[33,34377,34378,34380,34382,34384,34386],{"class":35,"line":6267},[33,34379,791],{"class":167},[33,34381,6272],{"class":54},[33,34383,309],{"class":50},[33,34385,6277],{"class":54},[33,34387,801],{"class":167},[33,34389,34390],{"class":35,"line":6282},[33,34391,92],{"emptyLinePlaceholder":91},[33,34393,34394,34396,34398],{"class":35,"line":6287},[33,34395,617],{"class":163},[33,34397,620],{"class":163},[33,34399,816],{"class":167},[33,34401,34402,34404,34406,34408,34410,34412,34414],{"class":35,"line":6296},[33,34403,628],{"class":167},[33,34405,6301],{"class":54},[33,34407,916],{"class":50},[33,34409,6306],{"class":54},[33,34411,365],{"class":167},[33,34413,928],{"class":50},[33,34415,6313],{"class":167},[33,34417,34418],{"class":35,"line":6316},[33,34419,646],{"class":163},[33,34421,34422],{"class":35,"line":6321},[33,34423,92],{"emptyLinePlaceholder":91},[33,34425,34426,34428,34430,34432,34434,34436,34438],{"class":35,"line":6326},[33,34427,842],{"class":167},[33,34429,242],{"class":163},[33,34431,847],{"class":167},[33,34433,850],{"class":238},[33,34435,242],{"class":163},[33,34437,855],{"class":50},[33,34439,221],{"class":167},[33,34441,34442,34444,34446,34448,34450,34452,34454,34456,34458],{"class":35,"line":6343},[33,34443,6346],{"class":167},[33,34445,869],{"class":238},[33,34447,242],{"class":163},[33,34449,855],{"class":50},[33,34451,365],{"class":167},[33,34453,878],{"class":238},[33,34455,242],{"class":163},[33,34457,855],{"class":50},[33,3
4459,221],{"class":167},[33,34461,34462,34464,34466,34468,34470,34472,34474],{"class":35,"line":6365},[33,34463,6368],{"class":167},[33,34465,242],{"class":163},[33,34467,6373],{"class":167},[33,34469,6376],{"class":54},[33,34471,916],{"class":50},[33,34473,274],{"class":54},[33,34475,221],{"class":167},[33,34477,34478,34480,34482,34484,34486,34488,34490,34492,34494,34496],{"class":35,"line":6385},[33,34479,6388],{"class":167},[33,34481,242],{"class":163},[33,34483,6393],{"class":167},[33,34485,1351],{"class":163},[33,34487,1110],{"class":163},[33,34489,6400],{"class":54},[33,34491,1115],{"class":50},[33,34493,6405],{"class":167},[33,34495,1121],{"class":50},[33,34497,6410],{"class":54},[33,34499,34500,34502,34504,34506,34508],{"class":35,"line":6413},[33,34501,6416],{"class":167},[33,34503,897],{"class":238},[33,34505,242],{"class":163},[33,34507,902],{"class":50},[33,34509,221],{"class":167},[33,34511,34512,34514,34516,34518,34520,34522,34524,34526,34528],{"class":35,"line":6427},[33,34513,910],{"class":167},[33,34515,913],{"class":54},[33,34517,916],{"class":50},[33,34519,919],{"class":54},[33,34521,309],{"class":50},[33,34523,274],{"class":54},[33,34525,365],{"class":167},[33,34527,928],{"class":50},[33,34529,6446],{"class":167},[33,34531,34532],{"class":35,"line":6449},[33,34533,92],{"emptyLinePlaceholder":91},[33,34535,34536],{"class":35,"line":6454},[33,34537,92],{"emptyLinePlaceholder":91},[33,34539,34540,34542,34544,34546,34548],{"class":35,"line":6459},[33,34541,562],{"class":163},[33,34543,6464],{"class":46},[33,34545,6092],{"class":167},[33,34547,571],{"class":50},[33,34549,574],{"class":167},[33,34551,34552,34554,34556],{"class":35,"line":6473},[33,34553,617],{"class":163},[33,34555,620],{"class":163},[33,34557,2981],{"class":167},[33,34559,34560,34562,34564],{"class":35,"line":6482},[33,34561,628],{"class":167},[33,34563,6487],{"class":54},[33,34565,221],{"class":167},[33,34567,34568],{"class":35,"line":6492},[33,34569,646],{"class":163},[33,34571,3457
2,34574],{"class":35,"line":6497},[33,34573,2424],{"class":163},[33,34575,574],{"class":167},[33,34577,34578],{"class":35,"line":6504},[33,34579,6507],{"class":167},[33,34581,34582,34584,34586,34588],{"class":35,"line":6510},[33,34583,2449],{"class":163},[33,34585,783],{"class":50},[33,34587,1852],{"class":163},[33,34589,1855],{"class":167},[33,34591,34592,34594,34596],{"class":35,"line":6521},[33,34593,2458],{"class":167},[33,34595,6526],{"class":54},[33,34597,221],{"class":167},[33,34599,34600],{"class":35,"line":6531},[33,34601,6534],{"class":167},[33,34603,34604,34606,34608,34610],{"class":35,"line":6537},[33,34605,6540],{"class":238},[33,34607,242],{"class":163},[33,34609,6545],{"class":54},[33,34611,247],{"class":167},[33,34613,34614,34616,34618,34620,34622,34624,34626,34628,34630,34632,34634,34636,34638,34640],{"class":35,"line":6550},[33,34615,6553],{"class":238},[33,34617,242],{"class":163},[33,34619,4059],{"class":163},[33,34621,6560],{"class":54},[33,34623,1115],{"class":50},[33,34625,6565],{"class":167},[33,34627,6568],{"class":50},[33,34629,6571],{"class":54},[33,34631,6574],{"class":50},[33,34633,1107],{"class":163},[33,34635,6579],{"class":54},[33,34637,1121],{"class":50},[33,34639,6584],{"class":54},[33,34641,247],{"class":167},[33,34643,34644],{"class":35,"line":6589},[33,34645,5867],{"class":167},[33,34647,34648,34650,34652],{"class":35,"line":6594},[33,34649,2995],{"class":167},[33,34651,734],{"class":50},[33,34653,221],{"class":167},[33,34655,34656,34658],{"class":35,"line":6603},[33,34657,3018],{"class":163},[33,34659,574],{"class":167},[33,34661,34662],{"class":35,"line":6610},[33,34663,3025],{"class":167},[33,34665,34666],{"class":35,"line":6615},[33,34667,92],{"emptyLinePlaceholder":91},[33,34669,34670],{"class":35,"line":6620},[33,34671,6623],{"class":39},[33,34673,34674],{"class":35,"line":6626},[33,34675,92],{"emptyLinePlaceholder":91},[33,34677,34678,34680,34682,34684,34686],{"class":35,"line":6631},[33,34679,562],{"class":163},[33,34681,
6636],{"class":46},[33,34683,568],{"class":167},[33,34685,571],{"class":50},[33,34687,574],{"class":167},[33,34689,34690,34692,34694,34696,34698,34700,34702],{"class":35,"line":6645},[33,34691,6648],{"class":167},[33,34693,242],{"class":163},[33,34695,6653],{"class":167},[33,34697,6656],{"class":238},[33,34699,242],{"class":163},[33,34701,6661],{"class":54},[33,34703,221],{"class":167},[33,34705,34706,34708,34710,34712,34714,34716,34718,34720,34722,34724],{"class":35,"line":6666},[33,34707,6669],{"class":167},[33,34709,6672],{"class":54},[33,34711,365],{"class":167},[33,34713,6677],{"class":238},[33,34715,242],{"class":163},[33,34717,6682],{"class":167},[33,34719,6685],{"class":238},[33,34721,242],{"class":163},[33,34723,507],{"class":50},[33,34725,221],{"class":167},[33,34727,34728,34730,34732,34734,34736,34738,34740,34742,34744,34746],{"class":35,"line":6694},[33,34729,6669],{"class":167},[33,34731,6699],{"class":54},[33,34733,365],{"class":167},[33,34735,6677],{"class":238},[33,34737,242],{"class":163},[33,34739,6682],{"class":167},[33,34741,6685],{"class":238},[33,34743,242],{"class":163},[33,34745,4615],{"class":50},[33,34747,221],{"class":167},[33,34749,34750],{"class":35,"line":6718},[33,34751,6721],{"class":167},[33,34753,34754,34756],{"class":35,"line":6724},[33,34755,6727],{"class":54},[33,34757,247],{"class":167},[33,34759,34760,34762,34764,34766],{"class":35,"line":6732},[33,34761,6735],{"class":238},[33,34763,242],{"class":163},[33,34765,6740],{"class":54},[33,34767,247],{"class":167},[33,34769,34770,34772,34774,34776],{"class":35,"line":6745},[33,34771,6748],{"class":238},[33,34773,242],{"class":163},[33,34775,6753],{"class":54},[33,34777,247],{"class":167},[33,34779,34780],{"class":35,"line":6758},[33,34781,1202],{"class":167},[33,34783,34784,34786,34788],{"class":35,"line":6763},[33,34785,6766],{"class":167},[33,34787,242],{"class":163},[33,34789,6771],{"class":167},[33,34791,34792],{"class":35,"line":6774},[33,34793,92],{"emptyLinePlaceholder":91},[
33,34795,34796,34798],{"class":35,"line":6779},[33,34797,617],{"class":163},[33,34799,6784],{"class":167},[33,34801,34802,34804,34806],{"class":35,"line":6787},[33,34803,2439],{"class":167},[33,34805,6792],{"class":54},[33,34807,221],{"class":167},[33,34809,34810,34812,34814],{"class":35,"line":6797},[33,34811,6800],{"class":167},[33,34813,2479],{"class":54},[33,34815,6805],{"class":167},[33,34817,34818,34820,34822,34824,34826,34828,34830],{"class":35,"line":6808},[33,34819,6811],{"class":167},[33,34821,6814],{"class":238},[33,34823,242],{"class":163},[33,34825,6819],{"class":167},[33,34827,6822],{"class":238},[33,34829,242],{"class":163},[33,34831,6827],{"class":167},[33,34833,34834],{"class":35,"line":6830},[33,34835,5867],{"class":167},[33,34837,34838,34840,34842],{"class":35,"line":6835},[33,34839,6838],{"class":163},[33,34841,2519],{"class":50},[33,34843,574],{"class":167},[33,34845,34846],{"class":35,"line":6845},[33,34847,6848],{"class":167},[33,34849,34850,34852,34854],{"class":35,"line":6851},[33,34851,6854],{"class":167},[33,34853,1543],{"class":50},[33,34855,221],{"class":167},[33,34857,34858,34860],{"class":35,"line":6861},[33,34859,6864],{"class":163},[33,34861,574],{"class":167},[33,34863,34864,34866,34868,34870,34872,34874,34876],{"class":35,"line":6869},[33,34865,6872],{"class":167},[33,34867,6814],{"class":238},[33,34869,242],{"class":163},[33,34871,6819],{"class":167},[33,34873,6822],{"class":238},[33,34875,242],{"class":163},[33,34877,6885],{"class":167},[33,34879,34880],{"class":35,"line":6888},[33,34881,92],{"emptyLinePlaceholder":91},[33,34883,34884],{"class":35,"line":6893},[33,34885,92],{"emptyLinePlaceholder":91},[33,34887,34888,34890,34892,34894,34896],{"class":35,"line":6898},[33,34889,2491],{"class":163},[33,34891,2494],{"class":50},[33,34893,2497],{"class":163},[33,34895,2500],{"class":54},[33,34897,574],{"class":167},[33,34899,34900],{"class":35,"line":6911},[33,34901,6914],{"class":167},[18,34903,6918],{"id":6917},[4211,34905,34906,349
10,34914,34918],{},[4214,34907,34908,6925],{},[940,34909,948],{"href":947},[4214,34911,34912,6930],{},[940,34913,4204],{"href":4203},[4214,34915,34916,6937],{},[940,34917,6936],{"href":6935},[4214,34919,34920,6944],{},[940,34921,6943],{"href":6942},[14,34923,6947,34924,3035],{},[940,34925,6951],{"href":6950},[6953,34927,6955],{},{"title":28,"searchDepth":43,"depth":43,"links":34929},[34930,34931,34932,34938,34943,34944,34945,34946,34947],{"id":20,"depth":43,"text":21},{"id":115,"depth":43,"text":116},{"id":421,"depth":43,"text":422,"children":34933},[34934,34935,34936,34937],{"id":426,"depth":61,"text":427},{"id":952,"depth":61,"text":953},{"id":1358,"depth":61,"text":1359},{"id":1966,"depth":61,"text":1967},{"id":2708,"depth":43,"text":2709,"children":34939},[34940,34941,34942],{"id":2712,"depth":61,"text":2713},{"id":3038,"depth":61,"text":3039},{"id":3589,"depth":61,"text":3590},{"id":3938,"depth":43,"text":3939},{"id":4208,"depth":43,"text":4209},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":6917,"depth":43,"text":6918},{},{"title":5,"description":6979},[47,6988,959,6989,6990],{"id":34952,"title":34953,"body":34954,"breadcrumbTitle":36783,"canonical":6977,"date":6978,"description":36784,"draft":6980,"extension":6981,"image":6977,"meta":36785,"navigation":91,"path":36786,"robots":6977,"seo":36787,"seoTitle":36788,"stem":36789,"tags":36790,"updatedAt":6978,"__hash__":36792},"content\u002Fautomating-pdf-extraction-generation\u002Fcomparing-pdf-table-extraction-libraries\u002Ffix-tabula-java-not-found-error\u002Findex.md","Fix tabula-py \"java not found\" 
Error",{"type":7,"value":34955,"toc":36765},[34956,34959,34980,34982,34999,35005,35014,35018,35024,35174,35185,35189,35193,35285,35290,35294,35368,35382,35386,35422,35438,35443,35499,35502,35506,35513,35766,35769,35799,35803,35807,35813,35822,36016,36032,36036,36046,36134,36144,36148,36155,36293,36314,36316,36319,36586,36589,36595,36617,36623,36627,36669,36682,36688,36716,36733,36735,36758,36762],[10,34957,34953],{"id":34958},"fix-tabula-py-java-not-found-error",[14,34960,34961,34962,34965,34966,34969,34970,34972,34973,34976,34977,34979],{},"tabula-py raises ",[30,34963,34964],{},"JavaNotFoundError"," — or the message ",[30,34967,34968],{},"java command is not found from this Python process"," — when it cannot locate a Java runtime on ",[30,34971,122],{},". This is the single most common tabula-py failure and has nothing to do with your PDF or your Python code. The fix is to install a JRE\u002FJDK and make sure the ",[30,34974,34975],{},"java"," binary is on the ",[30,34978,122],{}," visible to the Python subprocess.",[18,34981,7021],{"id":7020},[14,34983,34984,34985,34988,34989,34992,34993,34995,34996,34998],{},"tabula-py does not extract PDF tables in Python. It calls a bundled Java JAR (",[30,34986,34987],{},"tabula-1.x.x-jar-with-dependencies.jar",") via ",[30,34990,34991],{},"subprocess.run([\"java\", \"-jar\", ...])",". 
If ",[30,34994,34975],{}," is not on the system ",[30,34997,122],{}," — or if it is installed but not in the PATH that Python's subprocess environment inherits — the call fails immediately with one of these messages:",[23,35000,35003],{"className":35001,"code":35002,"language":2000},[1998],"java.lang.Exception: Error in Java call\ntabula.errors.JavaNotFoundError: `java` command is not found from this Python process.\nPlease ensure Java is installed and PATH is set for `java`\nFileNotFoundError: [Errno 2] No such file or directory: 'java'\n",[30,35004,35002],{"__ignoreMap":28},[14,35006,35007,35008,35010,35011,35013],{},"The error is raised before tabula-py reads a single byte of your PDF. Confirming ",[30,35009,34975],{}," is on ",[30,35012,122],{}," in your shell is not sufficient — the PATH seen by a Python subprocess can differ, especially in virtual environments, Docker containers, or GUI-launched processes on macOS.",[18,35015,35017],{"id":35016},"minimal-diagnostic","Minimal Diagnostic",[14,35019,35020,35021,35023],{},"Reproduce the failure with the shortest possible snippet to confirm whether ",[30,35022,34975],{}," is reachable from Python:",[23,35025,35027],{"className":126,"code":35026,"language":47,"meta":28,"style":28},"# pip install tabula-py\nimport subprocess\nimport sys\n\nresult = subprocess.run(\n    [\"java\", \"-version\"],\n    capture_output=True,\n    text=True,\n)\nif result.returncode != 0:\n    print(\"java NOT found by Python subprocess\")\n    print(\"stderr:\", result.stderr)\n    sys.exit(1)\nelse:\n    print(\"java found:\")\n    print(result.stderr)  # java -version writes to stderr by convention\n",[30,35028,35029,35034,35041,35047,35051,35061,35076,35087,35098,35102,35115,35126,35138,35147,35153,35164],{"__ignoreMap":28},[33,35030,35031],{"class":35,"line":36},[33,35032,35033],{"class":39},"# pip install tabula-py\n",[33,35035,35036,35038],{"class":35,"line":43},[33,35037,164],{"class":163},[33,35039,35040],{"class":167}," 
subprocess\n",[33,35042,35043,35045],{"class":35,"line":61},[33,35044,164],{"class":163},[33,35046,168],{"class":167},[33,35048,35049],{"class":35,"line":73},[33,35050,92],{"emptyLinePlaceholder":91},[33,35052,35053,35056,35058],{"class":35,"line":88},[33,35054,35055],{"class":167},"result ",[33,35057,242],{"class":163},[33,35059,35060],{"class":167}," subprocess.run(\n",[33,35062,35063,35066,35069,35071,35074],{"class":35,"line":95},[33,35064,35065],{"class":167},"    [",[33,35067,35068],{"class":54},"\"java\"",[33,35070,365],{"class":167},[33,35072,35073],{"class":54},"\"-version\"",[33,35075,8935],{"class":167},[33,35077,35078,35081,35083,35085],{"class":35,"line":101},[33,35079,35080],{"class":238},"    capture_output",[33,35082,242],{"class":163},[33,35084,855],{"class":50},[33,35086,247],{"class":167},[33,35088,35089,35092,35094,35096],{"class":35,"line":171},[33,35090,35091],{"class":238},"    text",[33,35093,242],{"class":163},[33,35095,855],{"class":50},[33,35097,247],{"class":167},[33,35099,35100],{"class":35,"line":179},[33,35101,221],{"class":167},[33,35103,35104,35106,35109,35111,35113],{"class":35,"line":187},[33,35105,2491],{"class":163},[33,35107,35108],{"class":167}," result.returncode ",[33,35110,17877],{"class":163},[33,35112,10791],{"class":50},[33,35114,574],{"class":167},[33,35116,35117,35119,35121,35124],{"class":35,"line":201},[33,35118,7268],{"class":50},[33,35120,602],{"class":167},[33,35122,35123],{"class":54},"\"java NOT found by Python subprocess\"",[33,35125,221],{"class":167},[33,35127,35128,35130,35132,35135],{"class":35,"line":206},[33,35129,7268],{"class":50},[33,35131,602],{"class":167},[33,35133,35134],{"class":54},"\"stderr:\"",[33,35136,35137],{"class":167},", result.stderr)\n",[33,35139,35140,35143,35145],{"class":35,"line":224},[33,35141,35142],{"class":167},"    
sys.exit(",[33,35144,734],{"class":50},[33,35146,221],{"class":167},[33,35148,35149,35151],{"class":35,"line":229},[33,35150,7489],{"class":163},[33,35152,574],{"class":167},[33,35154,35155,35157,35159,35162],{"class":35,"line":235},[33,35156,7268],{"class":50},[33,35158,602],{"class":167},[33,35160,35161],{"class":54},"\"java found:\"",[33,35163,221],{"class":167},[33,35165,35166,35168,35171],{"class":35,"line":250},[33,35167,7268],{"class":50},[33,35169,35170],{"class":167},"(result.stderr)  ",[33,35172,35173],{"class":39},"# java -version writes to stderr by convention\n",[14,35175,35176,35177,35180,35181,35184],{},"If this prints ",[30,35178,35179],{},"java NOT found",", the issue is PATH or a missing JRE. If it prints the version string, tabula-py should work — re-run ",[30,35182,35183],{},"tabula.read_pdf()"," and capture the exact error for the variant fixes below.",[18,35186,35188],{"id":35187},"fix-install-java-and-add-it-to-path","Fix: Install Java and Add It to PATH",[424,35190,35192],{"id":35191},"ubuntu-debian","Ubuntu \u002F Debian",[23,35194,35196],{"className":25,"code":35195,"language":27,"meta":28,"style":28},"# Install the default JRE (Java 17 on Ubuntu 22.04+)\nsudo apt-get update && sudo apt-get install -y default-jre\n\n# Confirm the binary is on PATH\njava -version\n# Expected: openjdk version \"17.x.x\" ...\n\n# Find the install location if you need JAVA_HOME\nreadlink -f $(which java)\n# e.g. 
\u002Fusr\u002Flib\u002Fjvm\u002Fjava-17-openjdk-amd64\u002Fbin\u002Fjava\n# → JAVA_HOME = \u002Fusr\u002Flib\u002Fjvm\u002Fjava-17-openjdk-amd64\n",[30,35197,35198,35203,35226,35230,35235,35242,35247,35251,35256,35275,35280],{"__ignoreMap":28},[33,35199,35200],{"class":35,"line":36},[33,35201,35202],{"class":39},"# Install the default JRE (Java 17 on Ubuntu 22.04+)\n",[33,35204,35205,35207,35209,35212,35215,35217,35219,35221,35223],{"class":35,"line":43},[33,35206,9669],{"class":46},[33,35208,9672],{"class":54},[33,35210,35211],{"class":54}," update",[33,35213,35214],{"class":167}," && ",[33,35216,9669],{"class":46},[33,35218,9672],{"class":54},[33,35220,79],{"class":54},[33,35222,20912],{"class":50},[33,35224,35225],{"class":54}," default-jre\n",[33,35227,35228],{"class":35,"line":61},[33,35229,92],{"emptyLinePlaceholder":91},[33,35231,35232],{"class":35,"line":73},[33,35233,35234],{"class":39},"# Confirm the binary is on PATH\n",[33,35236,35237,35239],{"class":35,"line":88},[33,35238,34975],{"class":46},[33,35240,35241],{"class":50}," -version\n",[33,35243,35244],{"class":35,"line":95},[33,35245,35246],{"class":39},"# Expected: openjdk version \"17.x.x\" ...\n",[33,35248,35249],{"class":35,"line":101},[33,35250,92],{"emptyLinePlaceholder":91},[33,35252,35253],{"class":35,"line":171},[33,35254,35255],{"class":39},"# Find the install location if you need JAVA_HOME\n",[33,35257,35258,35261,35264,35267,35270,35273],{"class":35,"line":179},[33,35259,35260],{"class":46},"readlink",[33,35262,35263],{"class":50}," -f",[33,35265,35266],{"class":167}," $(",[33,35268,35269],{"class":50},"which",[33,35271,35272],{"class":54}," java",[33,35274,221],{"class":167},[33,35276,35277],{"class":35,"line":187},[33,35278,35279],{"class":39},"# e.g. 
\u002Fusr\u002Flib\u002Fjvm\u002Fjava-17-openjdk-amd64\u002Fbin\u002Fjava\n",[33,35281,35282],{"class":35,"line":201},[33,35283,35284],{"class":39},"# → JAVA_HOME = \u002Fusr\u002Flib\u002Fjvm\u002Fjava-17-openjdk-amd64\n",[14,35286,35287,35288,3035],{},"After installation, re-run the diagnostic snippet above from within your virtual environment to confirm the Python subprocess sees ",[30,35289,34975],{},[424,35291,35293],{"id":35292},"macos","macOS",[23,35295,35297],{"className":25,"code":35296,"language":27,"meta":28,"style":28},"# Install OpenJDK via Homebrew\nbrew install openjdk\n\n# Homebrew does not symlink openjdk to \u002Fusr\u002Flocal\u002Fbin by default.\n# Add it to PATH manually (add this line to ~\u002F.zshrc or ~\u002F.bash_profile):\nexport PATH=\"\u002Fopt\u002Fhomebrew\u002Fopt\u002Fopenjdk\u002Fbin:$PATH\"\n\n# Reload the profile and verify\nsource ~\u002F.zshrc\njava -version\n",[30,35298,35299,35304,35314,35318,35323,35328,35346,35350,35355,35362],{"__ignoreMap":28},[33,35300,35301],{"class":35,"line":36},[33,35302,35303],{"class":39},"# Install OpenJDK via Homebrew\n",[33,35305,35306,35309,35311],{"class":35,"line":43},[33,35307,35308],{"class":46},"brew",[33,35310,79],{"class":54},[33,35312,35313],{"class":54}," openjdk\n",[33,35315,35316],{"class":35,"line":61},[33,35317,92],{"emptyLinePlaceholder":91},[33,35319,35320],{"class":35,"line":73},[33,35321,35322],{"class":39},"# Homebrew does not symlink openjdk to \u002Fusr\u002Flocal\u002Fbin by default.\n",[33,35324,35325],{"class":35,"line":88},[33,35326,35327],{"class":39},"# Add it to PATH manually (add this line to ~\u002F.zshrc or ~\u002F.bash_profile):\n",[33,35329,35330,35333,35336,35338,35341,35344],{"class":35,"line":95},[33,35331,35332],{"class":163},"export",[33,35334,35335],{"class":167}," 
PATH",[33,35337,242],{"class":163},[33,35339,35340],{"class":54},"\"\u002Fopt\u002Fhomebrew\u002Fopt\u002Fopenjdk\u002Fbin:",[33,35342,35343],{"class":167},"$PATH",[33,35345,7504],{"class":54},[33,35347,35348],{"class":35,"line":101},[33,35349,92],{"emptyLinePlaceholder":91},[33,35351,35352],{"class":35,"line":171},[33,35353,35354],{"class":39},"# Reload the profile and verify\n",[33,35356,35357,35359],{"class":35,"line":179},[33,35358,64],{"class":50},[33,35360,35361],{"class":54}," ~\u002F.zshrc\n",[33,35363,35364,35366],{"class":35,"line":187},[33,35365,34975],{"class":46},[33,35367,35241],{"class":50},[14,35369,35370,35371,35374,35375,35378,35379,3035],{},"On Apple Silicon, the Homebrew prefix is ",[30,35372,35373],{},"\u002Fopt\u002Fhomebrew",". On Intel Macs it is ",[30,35376,35377],{},"\u002Fusr\u002Flocal",". Check with ",[30,35380,35381],{},"brew --prefix openjdk",[424,35383,35385],{"id":35384},"windows","Windows",[35387,35388,35389,35402,35415],"ol",{},[4214,35390,35391,35392,35397,35398,35401],{},"Download a JDK installer from ",[940,35393,35396],{"href":35394,"rel":35395},"https:\u002F\u002Fadoptium.net\u002F",[1367],"Adoptium"," (choose the ",[30,35399,35400],{},".msi"," for Temurin 17 LTS).",[4214,35403,35404,35405,10065,35411,35414],{},"Run the installer — it offers to ",[1974,35406,35407,35408],{},"set ",[30,35409,35410],{},"JAVA_HOME",[1974,35412,35413],{},"add to PATH"," automatically. 
Accept both.",[4214,35416,35417,35418,35421],{},"Open a ",[1974,35419,35420],{},"new"," Command Prompt or PowerShell (existing terminals do not inherit the new PATH):",[23,35423,35427],{"className":35424,"code":35425,"language":35426,"meta":28,"style":28},"language-powershell shiki shiki-themes github-light","java -version\n# Expected: openjdk version \"17.x.x\" ...\n","powershell",[30,35428,35429,35434],{"__ignoreMap":28},[33,35430,35431],{"class":35,"line":36},[33,35432,35433],{},"java -version\n",[33,35435,35436],{"class":35,"line":43},[33,35437,35246],{},[35387,35439,35440],{"start":73},[4214,35441,35442],{},"If the installer did not update PATH, add manually:",[23,35444,35446],{"className":35424,"code":35445,"language":35426,"meta":28,"style":28},"# PowerShell (permanent, current user)\n[System.Environment]::SetEnvironmentVariable(\n    \"JAVA_HOME\",\n    \"C:\\Program Files\\Eclipse Adoptium\\jdk-17.x.x-hotspot\",\n    \"User\"\n)\n[System.Environment]::SetEnvironmentVariable(\n    \"PATH\",\n    \"$env:PATH;$env:JAVA_HOME\\bin\",\n    \"User\"\n)\n",[30,35447,35448,35453,35458,35463,35468,35473,35477,35481,35486,35491,35495],{"__ignoreMap":28},[33,35449,35450],{"class":35,"line":36},[33,35451,35452],{},"# PowerShell (permanent, current user)\n",[33,35454,35455],{"class":35,"line":43},[33,35456,35457],{},"[System.Environment]::SetEnvironmentVariable(\n",[33,35459,35460],{"class":35,"line":61},[33,35461,35462],{},"    \"JAVA_HOME\",\n",[33,35464,35465],{"class":35,"line":73},[33,35466,35467],{},"    \"C:\\Program Files\\Eclipse Adoptium\\jdk-17.x.x-hotspot\",\n",[33,35469,35470],{"class":35,"line":88},[33,35471,35472],{},"    \"User\"\n",[33,35474,35475],{"class":35,"line":95},[33,35476,221],{},[33,35478,35479],{"class":35,"line":101},[33,35480,35457],{},[33,35482,35483],{"class":35,"line":171},[33,35484,35485],{},"    \"PATH\",\n",[33,35487,35488],{"class":35,"line":179},[33,35489,35490],{},"    
\"$env:PATH;$env:JAVA_HOME\\bin\",\n",[33,35492,35493],{"class":35,"line":187},[33,35494,35472],{},[33,35496,35497],{"class":35,"line":201},[33,35498,221],{},[14,35500,35501],{},"Then restart your terminal and relaunch any IDE or script runner.",[18,35503,35505],{"id":35504},"fix-implementation-corrected-tabula-py-call","Fix Implementation: Corrected tabula-py Call",[14,35507,35508,35509,35512],{},"Once ",[30,35510,35511],{},"java -version"," works from the Python diagnostic script, this minimal call should succeed:",[23,35514,35516],{"className":126,"code":35515,"language":47,"meta":28,"style":28},"# pip install tabula-py pandas\nfrom pathlib import Path\nimport tabula\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ntry:\n    # read_pdf shells out to the bundled JAR; java must be on PATH\n    dfs: list[pd.DataFrame] = tabula.read_pdf(\n        str(PDF_PATH),       # tabula-py requires a str path, not Path\n        pages=\"all\",\n        multiple_tables=True,\n        silent=True,         # suppress Java stderr noise in output\n    )\n    print(f\"Extracted {len(dfs)} table(s)\")\n    for i, df in enumerate(dfs):\n        print(f\"Table {i}: {df.shape}\")\n        print(df.head(3))\nexcept FileNotFoundError as e:\n    # Raised when 'java' is genuinely not found on PATH\n    raise RuntimeError(\n        \"java not found. Install a JRE and ensure 'java' is on PATH. 
\"\n        \"Run: java -version to verify.\"\n    ) from e\n",[30,35517,35518,35523,35533,35540,35550,35554,35566,35570,35577,35582,35592,35607,35619,35630,35645,35649,35670,35683,35711,35722,35733,35738,35747,35752,35757],{"__ignoreMap":28},[33,35519,35520],{"class":35,"line":36},[33,35521,35522],{"class":39},"# pip install tabula-py pandas\n",[33,35524,35525,35527,35529,35531],{"class":35,"line":43},[33,35526,190],{"class":163},[33,35528,193],{"class":167},[33,35530,164],{"class":163},[33,35532,198],{"class":167},[33,35534,35535,35537],{"class":35,"line":61},[33,35536,164],{"class":163},[33,35538,35539],{"class":167}," tabula\n",[33,35541,35542,35544,35546,35548],{"class":35,"line":73},[33,35543,164],{"class":163},[33,35545,492],{"class":167},[33,35547,495],{"class":163},[33,35549,498],{"class":167},[33,35551,35552],{"class":35,"line":88},[33,35553,92],{"emptyLinePlaceholder":91},[33,35555,35556,35558,35560,35562,35564],{"class":35,"line":95},[33,35557,7076],{"class":50},[33,35559,212],{"class":163},[33,35561,215],{"class":167},[33,35563,7083],{"class":54},[33,35565,221],{"class":167},[33,35567,35568],{"class":35,"line":101},[33,35569,92],{"emptyLinePlaceholder":91},[33,35571,35572,35575],{"class":35,"line":171},[33,35573,35574],{"class":163},"try",[33,35576,574],{"class":167},[33,35578,35579],{"class":35,"line":179},[33,35580,35581],{"class":39},"    # read_pdf shells out to the bundled JAR; java must be on PATH\n",[33,35583,35584,35587,35589],{"class":35,"line":187},[33,35585,35586],{"class":167},"    dfs: list[pd.DataFrame] ",[33,35588,242],{"class":163},[33,35590,35591],{"class":167}," tabula.read_pdf(\n",[33,35593,35594,35597,35599,35601,35604],{"class":35,"line":201},[33,35595,35596],{"class":50},"        str",[33,35598,602],{"class":167},[33,35600,7076],{"class":50},[33,35602,35603],{"class":167},"),       ",[33,35605,35606],{"class":39},"# tabula-py requires a str path, not 
Path\n",[33,35608,35609,35612,35614,35617],{"class":35,"line":206},[33,35610,35611],{"class":238},"        pages",[33,35613,242],{"class":163},[33,35615,35616],{"class":54},"\"all\"",[33,35618,247],{"class":167},[33,35620,35621,35624,35626,35628],{"class":35,"line":224},[33,35622,35623],{"class":238},"        multiple_tables",[33,35625,242],{"class":163},[33,35627,855],{"class":50},[33,35629,247],{"class":167},[33,35631,35632,35635,35637,35639,35642],{"class":35,"line":229},[33,35633,35634],{"class":238},"        silent",[33,35636,242],{"class":163},[33,35638,855],{"class":50},[33,35640,35641],{"class":167},",         ",[33,35643,35644],{"class":39},"# suppress Java stderr noise in output\n",[33,35646,35647],{"class":35,"line":235},[33,35648,1202],{"class":167},[33,35650,35651,35653,35655,35657,35659,35661,35664,35666,35668],{"class":35,"line":250},[33,35652,7268],{"class":50},[33,35654,602],{"class":167},[33,35656,4059],{"class":163},[33,35658,8142],{"class":54},[33,35660,4065],{"class":50},[33,35662,35663],{"class":167},"(dfs)",[33,35665,1121],{"class":50},[33,35667,6247],{"class":54},[33,35669,221],{"class":167},[33,35671,35672,35674,35676,35678,35680],{"class":35,"line":266},[33,35673,656],{"class":163},[33,35675,10994],{"class":167},[33,35677,662],{"class":163},[33,35679,7403],{"class":50},[33,35681,35682],{"class":167},"(dfs):\n",[33,35684,35685,35687,35689,35691,35693,35695,35697,35699,35701,35703,35705,35707,35709],{"class":35,"line":290},[33,35686,9414],{"class":50},[33,35688,602],{"class":167},[33,35690,4059],{"class":163},[33,35692,11012],{"class":54},[33,35694,1115],{"class":50},[33,35696,7499],{"class":167},[33,35698,1121],{"class":50},[33,35700,2079],{"class":54},[33,35702,1115],{"class":50},[33,35704,9426],{"class":167},[33,35706,1121],{"class":50},[33,35708,274],{"class":54},[33,35710,221],{"class":167},[33,35712,35713,35715,35718,35720],{"class":35,"line":295},[33,35714,9414],{"class":50},[33,35716,35717],{"class":167},"(df.head(",[33,35719,10258],{
"class":50},[33,35721,371],{"class":167},[33,35723,35724,35727,35729,35731],{"class":35,"line":300},[33,35725,35726],{"class":163},"except",[33,35728,2945],{"class":50},[33,35730,1852],{"class":163},[33,35732,7583],{"class":167},[33,35734,35735],{"class":35,"line":317},[33,35736,35737],{"class":39},"    # Raised when 'java' is genuinely not found on PATH\n",[33,35739,35740,35743,35745],{"class":35,"line":332},[33,35741,35742],{"class":163},"    raise",[33,35744,7590],{"class":50},[33,35746,7637],{"class":167},[33,35748,35749],{"class":35,"line":347},[33,35750,35751],{"class":54},"        \"java not found. Install a JRE and ensure 'java' is on PATH. \"\n",[33,35753,35754],{"class":35,"line":374},[33,35755,35756],{"class":54},"        \"Run: java -version to verify.\"\n",[33,35758,35759,35762,35764],{"class":35,"line":397},[33,35760,35761],{"class":167},"    ) ",[33,35763,190],{"class":163},[33,35765,7613],{"class":167},[14,35767,35768],{},"Key changed lines:",[4211,35770,35771,35781,35787],{},[4214,35772,35773,35776,35777,35780],{},[30,35774,35775],{},"str(PDF_PATH)"," — tabula-py passes this to Java; use a string, not a ",[30,35778,35779],{},"Path"," object.",[4214,35782,35783,35786],{},[30,35784,35785],{},"silent=True"," — suppresses Java's own stderr output so your logs stay clean.",[4214,35788,35789,35791,35792,35795,35796,35798],{},[30,35790,4341],{}," catch — raised by ",[30,35793,35794],{},"subprocess"," when the ",[30,35797,34975],{}," binary cannot be found; re-raise with a clear message.",[18,35800,35802],{"id":35801},"variant-fixes","Variant Fixes",[424,35804,35806],{"id":35805},"variant-a-java-is-installed-but-not-in-the-subprocess-path","Variant A: Java Is Installed but Not in the Subprocess PATH",[14,35808,35809,35810,35812],{},"This happens with virtual environments activated in terminals where ",[30,35811,35410],{}," was set for a different shell session, or with systemd services that use a stripped 
environment.",[14,35814,35815,35816,35818,35819,35821],{},"Set ",[30,35817,35410],{}," and prepend it to ",[30,35820,122],{}," before your Python process starts, or set it inside the process:",[23,35823,35825],{"className":126,"code":35824,"language":47,"meta":28,"style":28},"# pip install tabula-py pandas\nimport os\nfrom pathlib import Path\nimport tabula\n\n# Explicitly extend PATH before calling tabula\n# Replace with your actual JDK path\nJAVA_HOME = Path(\"\u002Fusr\u002Flib\u002Fjvm\u002Fjava-17-openjdk-amd64\")\nos.environ[\"JAVA_HOME\"] = str(JAVA_HOME)\nos.environ[\"PATH\"] = str(JAVA_HOME \u002F \"bin\") + os.pathsep + os.environ.get(\"PATH\", \"\")\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ndfs = tabula.read_pdf(str(PDF_PATH), pages=\"1\", silent=True)\nprint(f\"Extracted {len(dfs)} table(s)\")\n",[30,35826,35827,35831,35837,35847,35853,35857,35862,35867,35880,35900,35940,35944,35956,35960,35996],{"__ignoreMap":28},[33,35828,35829],{"class":35,"line":36},[33,35830,35522],{"class":39},[33,35832,35833,35835],{"class":35,"line":43},[33,35834,164],{"class":163},[33,35836,176],{"class":167},[33,35838,35839,35841,35843,35845],{"class":35,"line":61},[33,35840,190],{"class":163},[33,35842,193],{"class":167},[33,35844,164],{"class":163},[33,35846,198],{"class":167},[33,35848,35849,35851],{"class":35,"line":73},[33,35850,164],{"class":163},[33,35852,35539],{"class":167},[33,35854,35855],{"class":35,"line":88},[33,35856,92],{"emptyLinePlaceholder":91},[33,35858,35859],{"class":35,"line":95},[33,35860,35861],{"class":39},"# Explicitly extend PATH before calling tabula\n",[33,35863,35864],{"class":35,"line":101},[33,35865,35866],{"class":39},"# Replace with your actual JDK 
path\n",[33,35868,35869,35871,35873,35875,35878],{"class":35,"line":171},[33,35870,35410],{"class":50},[33,35872,212],{"class":163},[33,35874,215],{"class":167},[33,35876,35877],{"class":54},"\"\u002Fusr\u002Flib\u002Fjvm\u002Fjava-17-openjdk-amd64\"",[33,35879,221],{"class":167},[33,35881,35882,35885,35888,35890,35892,35894,35896,35898],{"class":35,"line":179},[33,35883,35884],{"class":167},"os.environ[",[33,35886,35887],{"class":54},"\"JAVA_HOME\"",[33,35889,763],{"class":167},[33,35891,242],{"class":163},[33,35893,7887],{"class":50},[33,35895,602],{"class":167},[33,35897,35410],{"class":50},[33,35899,221],{"class":167},[33,35901,35902,35904,35906,35908,35910,35912,35914,35916,35918,35921,35923,35925,35928,35930,35932,35934,35936,35938],{"class":35,"line":187},[33,35903,35884],{"class":167},[33,35905,362],{"class":54},[33,35907,763],{"class":167},[33,35909,242],{"class":163},[33,35911,7887],{"class":50},[33,35913,602],{"class":167},[33,35915,35410],{"class":50},[33,35917,1107],{"class":163},[33,35919,35920],{"class":54}," \"bin\"",[33,35922,1649],{"class":167},[33,35924,1811],{"class":163},[33,35926,35927],{"class":167}," os.pathsep ",[33,35929,1811],{"class":163},[33,35931,3129],{"class":167},[33,35933,362],{"class":54},[33,35935,365],{"class":167},[33,35937,3198],{"class":54},[33,35939,221],{"class":167},[33,35941,35942],{"class":35,"line":201},[33,35943,92],{"emptyLinePlaceholder":91},[33,35945,35946,35948,35950,35952,35954],{"class":35,"line":206},[33,35947,7076],{"class":50},[33,35949,212],{"class":163},[33,35951,215],{"class":167},[33,35953,7083],{"class":54},[33,35955,221],{"class":167},[33,35957,35958],{"class":35,"line":224},[33,35959,92],{"emptyLinePlaceholder":91},[33,35961,35962,35965,35967,35970,35972,35974,35976,35978,35980,35982,35985,35987,35990,35992,35994],{"class":35,"line":229},[33,35963,35964],{"class":167},"dfs ",[33,35966,242],{"class":163},[33,35968,35969],{"class":167}," 
tabula.read_pdf(",[33,35971,1053],{"class":50},[33,35973,602],{"class":167},[33,35975,7076],{"class":50},[33,35977,18525],{"class":167},[33,35979,10971],{"class":238},[33,35981,242],{"class":163},[33,35983,35984],{"class":54},"\"1\"",[33,35986,365],{"class":167},[33,35988,35989],{"class":238},"silent",[33,35991,242],{"class":163},[33,35993,855],{"class":50},[33,35995,221],{"class":167},[33,35997,35998,36000,36002,36004,36006,36008,36010,36012,36014],{"class":35,"line":235},[33,35999,13474],{"class":50},[33,36001,602],{"class":167},[33,36003,4059],{"class":163},[33,36005,8142],{"class":54},[33,36007,4065],{"class":50},[33,36009,35663],{"class":167},[33,36011,1121],{"class":50},[33,36013,6247],{"class":54},[33,36015,221],{"class":167},[14,36017,36018,36019,36022,36023,36025,36026,36028,36029,36031],{},"Setting ",[30,36020,36021],{},"os.environ"," before the first ",[30,36024,35183],{}," call is sufficient — Python's ",[30,36027,35794],{}," inherits ",[30,36030,36021],{}," at call time, not at import time.",[424,36033,36035],{"id":36034},"variant-b-docker-container-with-no-jre","Variant B: Docker Container with No JRE",[14,36037,36038,36039,36042,36043,20891],{},"A minimal Python base image (",[30,36040,36041],{},"python:3.12-slim",") ships without Java. Add the install step to your ",[30,36044,36045],{},"Dockerfile",[23,36047,36051],{"className":36048,"code":36049,"language":36050,"meta":28,"style":28},"language-dockerfile shiki shiki-themes github-light","FROM python:3.12-slim\n\n# Install JRE and ghostscript (needed by camelot; include if you use both)\nRUN apt-get update \\\n && apt-get install -y --no-install-recommends \\\n      default-jre-headless \\\n && rm -rf \u002Fvar\u002Flib\u002Fapt\u002Flists\u002F*\n\n# Verify java is on PATH for the Python subprocess\nRUN java -version\n\nCOPY requirements.txt .\nRUN pip install --no-cache-dir -r requirements.txt\n\nCOPY . 
\u002Fapp\nWORKDIR \u002Fapp\nCMD [\"python\", \"extract_tables.py\"]\n","dockerfile",[30,36052,36053,36058,36062,36067,36072,36077,36082,36087,36091,36096,36101,36105,36110,36115,36119,36124,36129],{"__ignoreMap":28},[33,36054,36055],{"class":35,"line":36},[33,36056,36057],{},"FROM python:3.12-slim\n",[33,36059,36060],{"class":35,"line":43},[33,36061,92],{"emptyLinePlaceholder":91},[33,36063,36064],{"class":35,"line":61},[33,36065,36066],{},"# Install JRE and ghostscript (needed by camelot; include if you use both)\n",[33,36068,36069],{"class":35,"line":73},[33,36070,36071],{},"RUN apt-get update \\\n",[33,36073,36074],{"class":35,"line":88},[33,36075,36076],{}," && apt-get install -y --no-install-recommends \\\n",[33,36078,36079],{"class":35,"line":95},[33,36080,36081],{},"      default-jre-headless \\\n",[33,36083,36084],{"class":35,"line":101},[33,36085,36086],{}," && rm -rf \u002Fvar\u002Flib\u002Fapt\u002Flists\u002F*\n",[33,36088,36089],{"class":35,"line":171},[33,36090,92],{"emptyLinePlaceholder":91},[33,36092,36093],{"class":35,"line":179},[33,36094,36095],{},"# Verify java is on PATH for the Python subprocess\n",[33,36097,36098],{"class":35,"line":187},[33,36099,36100],{},"RUN java -version\n",[33,36102,36103],{"class":35,"line":201},[33,36104,92],{"emptyLinePlaceholder":91},[33,36106,36107],{"class":35,"line":206},[33,36108,36109],{},"COPY requirements.txt .\n",[33,36111,36112],{"class":35,"line":224},[33,36113,36114],{},"RUN pip install --no-cache-dir -r requirements.txt\n",[33,36116,36117],{"class":35,"line":229},[33,36118,92],{"emptyLinePlaceholder":91},[33,36120,36121],{"class":35,"line":235},[33,36122,36123],{},"COPY . 
\u002Fapp\n",[33,36125,36126],{"class":35,"line":250},[33,36127,36128],{},"WORKDIR \u002Fapp\n",[33,36130,36131],{"class":35,"line":266},[33,36132,36133],{},"CMD [\"python\", \"extract_tables.py\"]\n",[14,36135,36136,36139,36140,36143],{},[30,36137,36138],{},"default-jre-headless"," is smaller than ",[30,36141,36142],{},"default-jre"," — it omits graphical components that a server container does not need.",[424,36145,36147],{"id":36146},"variant-c-passing-java_options-to-tabula-py","Variant C: Passing java_options to tabula-py",[14,36149,36150,36151,36154],{},"If Java is found but the extraction fails with ",[30,36152,36153],{},"OutOfMemoryError"," on large files, pass JVM flags:",[23,36156,36158],{"className":126,"code":36157,"language":47,"meta":28,"style":28},"# pip install tabula-py\nimport tabula\nfrom pathlib import Path\n\nPDF_PATH = Path(\"data\u002Flarge_report.pdf\")\n\ndfs = tabula.read_pdf(\n    str(PDF_PATH),\n    pages=\"all\",\n    multiple_tables=True,\n    java_options=[\"-Xmx512m\"],  # cap heap at 512 MB\n    silent=True,\n)\nprint(f\"Extracted {len(dfs)} 
table(s)\")\n",[30,36159,36160,36164,36170,36180,36184,36197,36201,36209,36219,36229,36240,36258,36269,36273],{"__ignoreMap":28},[33,36161,36162],{"class":35,"line":36},[33,36163,35033],{"class":39},[33,36165,36166,36168],{"class":35,"line":43},[33,36167,164],{"class":163},[33,36169,35539],{"class":167},[33,36171,36172,36174,36176,36178],{"class":35,"line":61},[33,36173,190],{"class":163},[33,36175,193],{"class":167},[33,36177,164],{"class":163},[33,36179,198],{"class":167},[33,36181,36182],{"class":35,"line":73},[33,36183,92],{"emptyLinePlaceholder":91},[33,36185,36186,36188,36190,36192,36195],{"class":35,"line":88},[33,36187,7076],{"class":50},[33,36189,212],{"class":163},[33,36191,215],{"class":167},[33,36193,36194],{"class":54},"\"data\u002Flarge_report.pdf\"",[33,36196,221],{"class":167},[33,36198,36199],{"class":35,"line":95},[33,36200,92],{"emptyLinePlaceholder":91},[33,36202,36203,36205,36207],{"class":35,"line":101},[33,36204,35964],{"class":167},[33,36206,242],{"class":163},[33,36208,35591],{"class":167},[33,36210,36211,36213,36215,36217],{"class":35,"line":171},[33,36212,13379],{"class":50},[33,36214,602],{"class":167},[33,36216,7076],{"class":50},[33,36218,1506],{"class":167},[33,36220,36221,36223,36225,36227],{"class":35,"line":179},[33,36222,13390],{"class":238},[33,36224,242],{"class":163},[33,36226,35616],{"class":54},[33,36228,247],{"class":167},[33,36230,36231,36234,36236,36238],{"class":35,"line":187},[33,36232,36233],{"class":238},"    multiple_tables",[33,36235,242],{"class":163},[33,36237,855],{"class":50},[33,36239,247],{"class":167},[33,36241,36242,36245,36247,36249,36252,36255],{"class":35,"line":201},[33,36243,36244],{"class":238},"    java_options",[33,36246,242],{"class":163},[33,36248,8309],{"class":167},[33,36250,36251],{"class":54},"\"-Xmx512m\"",[33,36253,36254],{"class":167},"],  ",[33,36256,36257],{"class":39},"# cap heap at 512 MB\n",[33,36259,36260,36263,36265,36267],{"class":35,"line":206},[33,36261,36262],{"class":238},"    
silent",[33,36264,242],{"class":163},[33,36266,855],{"class":50},[33,36268,247],{"class":167},[33,36270,36271],{"class":35,"line":224},[33,36272,221],{"class":167},[33,36274,36275,36277,36279,36281,36283,36285,36287,36289,36291],{"class":35,"line":229},[33,36276,13474],{"class":50},[33,36278,602],{"class":167},[33,36280,4059],{"class":163},[33,36282,8142],{"class":54},[33,36284,4065],{"class":50},[33,36286,35663],{"class":167},[33,36288,1121],{"class":50},[33,36290,6247],{"class":54},[33,36292,221],{"class":167},[14,36294,36295,36298,36299,36301,36302,36305,36306,36309,36310,36313],{},[30,36296,36297],{},"java_options"," is a list of strings appended to the ",[30,36300,34975],{}," command before ",[30,36303,36304],{},"-jar",". Common options: ",[30,36307,36308],{},"-Xmx512m"," (heap cap), ",[30,36311,36312],{},"-Djava.awt.headless=true"," (suppress AWT warnings in headless environments).",[18,36315,9247],{"id":9246},[14,36317,36318],{},"Run the diagnostic snippet from the root cause section one final time. 
Then confirm a real extraction works end-to-end:",[23,36320,36322],{"className":126,"code":36321,"language":47,"meta":28,"style":28},"# pip install tabula-py pandas\nimport subprocess, sys, tabula, pandas as pd\nfrom pathlib import Path\n\n# Step 1: confirm java is reachable\nr = subprocess.run([\"java\", \"-version\"], capture_output=True, text=True)\nassert r.returncode == 0, f\"java still not found: {r.stderr}\"\nprint(\"java OK:\", r.stderr.splitlines()[0])\n\n# Step 2: confirm tabula can read a PDF\nPDF_PATH = Path(\"data\u002Freport.pdf\")\nassert PDF_PATH.exists(), f\"Test PDF not found at {PDF_PATH}\"\n\ndfs = tabula.read_pdf(str(PDF_PATH), pages=\"1\", silent=True)\nassert len(dfs) > 0, \"tabula returned no tables — check the PDF has a bordered table on page 1\"\nassert isinstance(dfs[0], pd.DataFrame), \"Expected a DataFrame\"\nprint(f\"tabula OK: extracted {len(dfs)} table(s), first table shape {dfs[0].shape}\")\n",[30,36323,36324,36328,36339,36349,36353,36358,36393,36421,36437,36441,36446,36458,36478,36482,36514,36532,36550],{"__ignoreMap":28},[33,36325,36326],{"class":35,"line":36},[33,36327,35522],{"class":39},[33,36329,36330,36332,36335,36337],{"class":35,"line":43},[33,36331,164],{"class":163},[33,36333,36334],{"class":167}," subprocess, sys, tabula, pandas ",[33,36336,495],{"class":163},[33,36338,498],{"class":167},[33,36340,36341,36343,36345,36347],{"class":35,"line":61},[33,36342,190],{"class":163},[33,36344,193],{"class":167},[33,36346,164],{"class":163},[33,36348,198],{"class":167},[33,36350,36351],{"class":35,"line":73},[33,36352,92],{"emptyLinePlaceholder":91},[33,36354,36355],{"class":35,"line":88},[33,36356,36357],{"class":39},"# Step 1: confirm java is reachable\n",[33,36359,36360,36363,36365,36368,36370,36372,36374,36376,36379,36381,36383,36385,36387,36389,36391],{"class":35,"line":95},[33,36361,36362],{"class":167},"r ",[33,36364,242],{"class":163},[33,36366,36367],{"class":167}," 
subprocess.run([",[33,36369,35068],{"class":54},[33,36371,365],{"class":167},[33,36373,35073],{"class":54},[33,36375,8314],{"class":167},[33,36377,36378],{"class":238},"capture_output",[33,36380,242],{"class":163},[33,36382,855],{"class":50},[33,36384,365],{"class":167},[33,36386,2000],{"class":238},[33,36388,242],{"class":163},[33,36390,855],{"class":50},[33,36392,221],{"class":167},[33,36394,36395,36398,36401,36403,36405,36407,36409,36412,36414,36417,36419],{"class":35,"line":101},[33,36396,36397],{"class":163},"assert",[33,36399,36400],{"class":167}," r.returncode ",[33,36402,1865],{"class":163},[33,36404,10791],{"class":50},[33,36406,365],{"class":167},[33,36408,4059],{"class":163},[33,36410,36411],{"class":54},"\"java still not found: ",[33,36413,1115],{"class":50},[33,36415,36416],{"class":167},"r.stderr",[33,36418,1121],{"class":50},[33,36420,7504],{"class":54},[33,36422,36423,36425,36427,36430,36433,36435],{"class":35,"line":171},[33,36424,13474],{"class":50},[33,36426,602],{"class":167},[33,36428,36429],{"class":54},"\"java OK:\"",[33,36431,36432],{"class":167},", r.stderr.splitlines()[",[33,36434,748],{"class":50},[33,36436,751],{"class":167},[33,36438,36439],{"class":35,"line":179},[33,36440,92],{"emptyLinePlaceholder":91},[33,36442,36443],{"class":35,"line":187},[33,36444,36445],{"class":39},"# Step 2: confirm tabula can read a PDF\n",[33,36447,36448,36450,36452,36454,36456],{"class":35,"line":201},[33,36449,7076],{"class":50},[33,36451,212],{"class":163},[33,36453,215],{"class":167},[33,36455,7083],{"class":54},[33,36457,221],{"class":167},[33,36459,36460,36462,36465,36468,36470,36473,36476],{"class":35,"line":206},[33,36461,36397],{"class":163},[33,36463,36464],{"class":50}," PDF_PATH",[33,36466,36467],{"class":167},".exists(), ",[33,36469,4059],{"class":163},[33,36471,36472],{"class":54},"\"Test PDF not found at 
",[33,36474,36475],{"class":50},"{PDF_PATH}",[33,36477,7504],{"class":54},[33,36479,36480],{"class":35,"line":224},[33,36481,92],{"emptyLinePlaceholder":91},[33,36483,36484,36486,36488,36490,36492,36494,36496,36498,36500,36502,36504,36506,36508,36510,36512],{"class":35,"line":229},[33,36485,35964],{"class":167},[33,36487,242],{"class":163},[33,36489,35969],{"class":167},[33,36491,1053],{"class":50},[33,36493,602],{"class":167},[33,36495,7076],{"class":50},[33,36497,18525],{"class":167},[33,36499,10971],{"class":238},[33,36501,242],{"class":163},[33,36503,35984],{"class":54},[33,36505,365],{"class":167},[33,36507,35989],{"class":238},[33,36509,242],{"class":163},[33,36511,855],{"class":50},[33,36513,221],{"class":167},[33,36515,36516,36518,36520,36523,36525,36527,36529],{"class":35,"line":235},[33,36517,36397],{"class":163},[33,36519,4037],{"class":50},[33,36521,36522],{"class":167},"(dfs) ",[33,36524,6009],{"class":163},[33,36526,10791],{"class":50},[33,36528,365],{"class":167},[33,36530,36531],{"class":54},"\"tabula returned no tables — check the PDF has a bordered table on page 1\"\n",[33,36533,36534,36536,36539,36542,36544,36547],{"class":35,"line":250},[33,36535,36397],{"class":163},[33,36537,36538],{"class":50}," isinstance",[33,36540,36541],{"class":167},"(dfs[",[33,36543,748],{"class":50},[33,36545,36546],{"class":167},"], pd.DataFrame), ",[33,36548,36549],{"class":54},"\"Expected a DataFrame\"\n",[33,36551,36552,36554,36556,36558,36561,36563,36565,36567,36570,36572,36575,36577,36580,36582,36584],{"class":35,"line":266},[33,36553,13474],{"class":50},[33,36555,602],{"class":167},[33,36557,4059],{"class":163},[33,36559,36560],{"class":54},"\"tabula OK: extracted ",[33,36562,4065],{"class":50},[33,36564,35663],{"class":167},[33,36566,1121],{"class":50},[33,36568,36569],{"class":54}," table(s), first table shape 
",[33,36571,1115],{"class":50},[33,36573,36574],{"class":167},"dfs[",[33,36576,748],{"class":50},[33,36578,36579],{"class":167},"].shape",[33,36581,1121],{"class":50},[33,36583,274],{"class":54},[33,36585,221],{"class":167},[14,36587,36588],{},"Expected output:",[23,36590,36593],{"className":36591,"code":36592,"language":2000},[1998],"java OK: openjdk version \"17.0.x\" ...\ntabula OK: extracted 1 table(s), first table shape (12, 5)\n",[30,36594,36592],{"__ignoreMap":28},[14,36596,36597,36598,36600,36601,36604,36605,36608,36609,36612,36613,36616],{},"If you still see ",[30,36599,34964],{}," after installing Java, check whether your IDE or process runner has its own PATH override that excludes the Java binary directory. In VS Code, set ",[30,36602,36603],{},"\"terminal.integrated.env.linux\""," (or ",[30,36606,36607],{},".mac"," \u002F ",[30,36610,36611],{},".windows",") to include the Java ",[30,36614,36615],{},"bin"," path.",[14,36618,36619,36620,36622],{},"If Java is found but your PDF returns 0 tables or garbled Unicode, switch to ",[940,36621,9606],{"href":9605}," and try the pdfplumber fallback — tabula-py does not handle all PDF font encodings.",[18,36624,36626],{"id":36625},"faq","FAQ",[14,36628,36629,36638,36639,2012,36642,36645,36646,36648,36649,36651,36652,36654,36655,36657,36658,36661,36662,36664,36665,36668],{},[1974,36630,36631,36632,36634,36635,36637],{},"Why does ",[30,36633,35511],{}," work in my terminal but tabula-py still raises ",[30,36636,34964],{},"?","\nYour interactive shell sources ",[30,36640,36641],{},"~\u002F.bashrc",[30,36643,36644],{},"~\u002F.zshrc",", which sets ",[30,36647,122],{},". Python's subprocess inherits the environment of the process that launched it — not your shell profile. If you start Python from an IDE launcher, a cron job, or a systemd service, the ",[30,36650,122],{}," in that environment may not include the Java ",[30,36653,36615],{}," directory. 
Fix: set ",[30,36656,35410],{}," and prepend ",[30,36659,36660],{},"$JAVA_HOME\u002Fbin"," to ",[30,36663,122],{}," in the same script, or set them as system-level environment variables (see the Windows ",[30,36666,36667],{},"SetEnvironmentVariable"," example above).",[14,36670,36671,36674,36675,36677,36678,36681],{},[1974,36672,36673],{},"Does tabula-py require a full JDK or just a JRE?","\nA JRE (Java Runtime Environment) is sufficient — tabula-py only runs a pre-compiled JAR, it does not compile Java code. ",[30,36676,36142],{}," on Debian\u002FUbuntu or ",[30,36679,36680],{},"openjdk"," on macOS both install a JRE. A full JDK works too and is fine if you already have one installed.",[14,36683,36684,36687],{},[1974,36685,36686],{},"Which Java version does tabula-py support?","\ntabula-py's bundled JAR targets Java 8+ and has been tested through Java 21 LTS. Java 17 LTS is the recommended choice for new installs — it is the current Ubuntu LTS default and is available on all major platforms via Adoptium. Avoid Java 8 on new setups; it is end-of-life.",[14,36689,36690,36693,36694,36697,36698,36700,36701,36704,36705,36708,36709,36712,36713,36715],{},[1974,36691,36692],{},"Can I use tabula-py in a GitHub Actions workflow?","\nYes. The ",[30,36695,36696],{},"ubuntu-latest"," runner ships with Java pre-installed. Confirm with ",[30,36699,35511],{}," in a ",[30,36702,36703],{},"run:"," step before calling your Python script. 
If you use the ",[30,36706,36707],{},"python:3.x-slim"," Docker container action instead, add a ",[30,36710,36711],{},"RUN apt-get install -y default-jre-headless"," step to your ",[30,36714,36045],{}," first.",[14,36717,36718,36721,36722,36725,36726,36729,36730,36732],{},[1974,36719,36720],{},"tabula-py works locally but fails in production — what is different?","\nCommon causes: (1) production runs in a Docker container based on ",[30,36723,36724],{},"python:slim"," which has no JRE; (2) a systemd service uses ",[30,36727,36728],{},"EnvironmentFile"," that does not set ",[30,36731,35410],{},"; (3) an AWS Lambda \u002F Cloud Run function uses a minimal runtime image. Use the Dockerfile fix in Variant B, or switch to pdfplumber for serverless environments where system dependencies are impractical to install.",[18,36734,6918],{"id":6917},[4211,36736,36737,36742,36747,36752],{},[4214,36738,36739,36741],{},[940,36740,9606],{"href":9605}," — choose the right library for your table type and environment",[4214,36743,36744,36746],{},[940,36745,9592],{"href":942}," — full extraction pipeline with pdfplumber and camelot",[4214,36748,36749,36751],{},[940,36750,9739],{"href":9738}," — similar system-dependency issue for camelot's Ghostscript requirement",[4214,36753,36754,36757],{},[940,36755,36756],{"href":26957},"Scanning and OCR Processing with Python"," — handle image-only PDFs that no table extractor can read directly",[14,36759,6947,36760,3035],{},[940,36761,9606],{"href":9605},[6953,36763,36764],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html .default .shiki span 
{color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}",{"title":28,"searchDepth":43,"depth":43,"links":36766},[36767,36768,36769,36774,36775,36780,36781,36782],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":35187,"depth":43,"text":35188,"children":36770},[36771,36772,36773],{"id":35191,"depth":61,"text":35192},{"id":35292,"depth":61,"text":35293},{"id":35384,"depth":61,"text":35385},{"id":35504,"depth":43,"text":35505},{"id":35801,"depth":43,"text":35802,"children":36776},[36777,36778,36779],{"id":35805,"depth":61,"text":35806},{"id":36034,"depth":61,"text":36035},{"id":36146,"depth":61,"text":36147},{"id":9246,"depth":43,"text":9247},{"id":36625,"depth":43,"text":36626},{"id":6917,"depth":43,"text":6918},"Fix Java Not Found","Fix tabula-py raising JavaNotFoundError or \"java command is not found from this Python process\". 
Install a JRE, add it to PATH, and set JAVA_HOME correctly.",{},"\u002Fautomating-pdf-extraction-generation\u002Fcomparing-pdf-table-extraction-libraries\u002Ffix-tabula-java-not-found-error",{"title":34953,"description":36784},"Fix tabula-py JavaNotFoundError java not found","automating-pdf-extraction-generation\u002Fcomparing-pdf-table-extraction-libraries\u002Ffix-tabula-java-not-found-error\u002Findex",[47,9631,36791,34975],"tabula-py","L-8qz-xCrwAKwSeBdvcO7ULvNvb0zj4lYctlMM88f4Q",{"id":36794,"title":9606,"body":36795,"breadcrumbTitle":41427,"canonical":6977,"date":6978,"description":41428,"draft":6980,"extension":6981,"image":6977,"meta":41429,"navigation":91,"path":41430,"robots":6977,"seo":41431,"seoTitle":41432,"stem":41433,"tags":41434,"updatedAt":6978,"__hash__":41435},"content\u002Fautomating-pdf-extraction-generation\u002Fcomparing-pdf-table-extraction-libraries\u002Findex.md",{"type":7,"value":36796,"toc":41405},[36797,36800,36806,36809,36813,36816,36841,36847,36851,36854,36983,36986,37019,37028,37036,37040,37043,37389,37411,37415,37419,37766,37772,37776,38162,38168,38174,38178,38489,38495,38499,38692,38892,38896,38900,38907,39176,39180,39183,39548,39554,39558,39565,39682,39696,39700,39703,39925,39931,39935,39948,39957,39963,40190,40194,40313,40317,40320,40363,40367,40374,41345,41348,41369,41371,41398,41402],[10,36798,9606],{"id":36799},"pdfplumber-vs-camelot-vs-tabula",[14,36801,36802,36803,36805],{},"Three libraries dominate Python PDF table extraction: ",[940,36804,943],{"href":942},", camelot, and tabula-py. Each wraps a different extraction engine with different runtime requirements, and each wins in a distinct scenario. 
Choosing the wrong one costs hours of debugging — choose based on your table type, your infrastructure constraints, and whether you can install system-level dependencies.",[14,36807,36808],{},"This guide covers installation, runtime requirements, accuracy across table types, output format, speed, and a complete decision script.",[18,36810,36812],{"id":36811},"_1-problem-framing","1. Problem Framing",[14,36814,36815],{},"Generic PDF-to-text converters strip structural information. A PDF table is not a semantic grid — it is a set of independently positioned rectangles or text runs with no native row\u002Fcolumn semantics. Each library reconstructs that structure differently:",[4211,36817,36818,36823,36836],{},[4214,36819,36820,36822],{},[1974,36821,943],{}," uses geometric line detection and spatial clustering on the raw PDF content stream. No system deps. Pure Python install.",[4214,36824,36825,36827,36828,36831,36832,36835],{},[1974,36826,16139],{}," implements two algorithms: ",[30,36829,36830],{},"lattice"," (explicit border lines) and ",[30,36833,36834],{},"stream"," (whitespace inference). Requires Ghostscript and OpenCV at runtime.",[4214,36837,36838,36840],{},[1974,36839,36791],{}," wraps the Tabula Java library. It shells out to a bundled JAR, so a JRE\u002FJDK must be on PATH. Fast for standard bordered tables on documents without unusual encodings.",[14,36842,36843,36844,3035],{},"If your deployment environment is a locked-down container or a serverless function, camelot's Ghostscript dependency and tabula-py's JVM requirement are immediate blockers. pdfplumber installs with a single ",[30,36845,36846],{},"pip install",[18,36848,36850],{"id":36849},"_2-prerequisites","2. 
Prerequisites",[14,36852,36853],{},"Install all three libraries in a virtual environment to run the comparisons in this guide:",[23,36855,36857],{"className":25,"code":36856,"language":27,"meta":28,"style":28},"# System deps — install before pip installs\n# Ghostscript (camelot lattice mode)\n# Ubuntu\u002FDebian:\nsudo apt-get install ghostscript libgs-dev\n# macOS:\nbrew install ghostscript\n\n# Java JRE\u002FJDK (tabula-py)\n# Ubuntu\u002FDebian:\nsudo apt-get install default-jre\n# macOS:\nbrew install openjdk\n# Windows: download from https:\u002F\u002Fadoptium.net\u002F and add to PATH\n\n# Python packages\npip install pdfplumber camelot-py[cv] tabula-py pandas opencv-python-headless\n\n# Verify Java is available to Python subprocess\njava -version\n",[30,36858,36859,36864,36869,36874,36887,36892,36901,36905,36910,36914,36924,36928,36936,36941,36945,36949,36968,36972,36977],{"__ignoreMap":28},[33,36860,36861],{"class":35,"line":36},[33,36862,36863],{"class":39},"# System deps — install before pip installs\n",[33,36865,36866],{"class":35,"line":43},[33,36867,36868],{"class":39},"# Ghostscript (camelot lattice mode)\n",[33,36870,36871],{"class":35,"line":61},[33,36872,36873],{"class":39},"# Ubuntu\u002FDebian:\n",[33,36875,36876,36878,36880,36882,36884],{"class":35,"line":73},[33,36877,9669],{"class":46},[33,36879,9672],{"class":54},[33,36881,79],{"class":54},[33,36883,9677],{"class":54},[33,36885,36886],{"class":54}," libgs-dev\n",[33,36888,36889],{"class":35,"line":88},[33,36890,36891],{"class":39},"# macOS:\n",[33,36893,36894,36896,36898],{"class":35,"line":95},[33,36895,35308],{"class":46},[33,36897,79],{"class":54},[33,36899,36900],{"class":54}," ghostscript\n",[33,36902,36903],{"class":35,"line":101},[33,36904,92],{"emptyLinePlaceholder":91},[33,36906,36907],{"class":35,"line":171},[33,36908,36909],{"class":39},"# Java JRE\u002FJDK 
(tabula-py)\n",[33,36911,36912],{"class":35,"line":179},[33,36913,36873],{"class":39},[33,36915,36916,36918,36920,36922],{"class":35,"line":187},[33,36917,9669],{"class":46},[33,36919,9672],{"class":54},[33,36921,79],{"class":54},[33,36923,35225],{"class":54},[33,36925,36926],{"class":35,"line":201},[33,36927,36891],{"class":39},[33,36929,36930,36932,36934],{"class":35,"line":206},[33,36931,35308],{"class":46},[33,36933,79],{"class":54},[33,36935,35313],{"class":54},[33,36937,36938],{"class":35,"line":224},[33,36939,36940],{"class":39},"# Windows: download from https:\u002F\u002Fadoptium.net\u002F and add to PATH\n",[33,36942,36943],{"class":35,"line":229},[33,36944,92],{"emptyLinePlaceholder":91},[33,36946,36947],{"class":35,"line":235},[33,36948,9692],{"class":39},[33,36950,36951,36953,36955,36957,36960,36963,36965],{"class":35,"line":250},[33,36952,76],{"class":46},[33,36954,79],{"class":54},[33,36956,9701],{"class":54},[33,36958,36959],{"class":54}," camelot-py[cv]",[33,36961,36962],{"class":54}," tabula-py",[33,36964,16183],{"class":54},[33,36966,36967],{"class":54}," opencv-python-headless\n",[33,36969,36970],{"class":35,"line":266},[33,36971,92],{"emptyLinePlaceholder":91},[33,36973,36974],{"class":35,"line":290},[33,36975,36976],{"class":39},"# Verify Java is available to Python subprocess\n",[33,36978,36979,36981],{"class":35,"line":295},[33,36980,34975],{"class":46},[33,36982,35241],{"class":50},[14,36984,36985],{},"Confirm camelot can find Ghostscript:",[23,36987,36989],{"className":126,"code":36988,"language":47,"meta":28,"style":28},"# pip install camelot-py[cv]\nimport camelot\n# This import alone triggers the Ghostscript check on some platforms\nprint(camelot.__version__)\n",[30,36990,36991,36996,37002,37007],{"__ignoreMap":28},[33,36992,36993],{"class":35,"line":36},[33,36994,36995],{"class":39},"# pip install 
camelot-py[cv]\n",[33,36997,36998,37000],{"class":35,"line":43},[33,36999,164],{"class":163},[33,37001,10567],{"class":167},[33,37003,37004],{"class":35,"line":61},[33,37005,37006],{"class":39},"# This import alone triggers the Ghostscript check on some platforms\n",[33,37008,37009,37011,37014,37017],{"class":35,"line":73},[33,37010,13474],{"class":50},[33,37012,37013],{"class":167},"(camelot.",[33,37015,37016],{"class":50},"__version__",[33,37018,221],{"class":167},[14,37020,37021,37022,37025,37026,3035],{},"If you see ",[30,37023,37024],{},"OSError: Ghostscript is not installed",", see ",[940,37027,9739],{"href":9738},[14,37029,37030,37031,37025,37033,3035],{},"If tabula-py raises ",[30,37032,34964],{},[940,37034,34953],{"href":37035},"\u002Fautomating-pdf-extraction-generation\u002Fcomparing-pdf-table-extraction-libraries\u002Ffix-tabula-java-not-found-error\u002F",[18,37037,37039],{"id":37038},"_3-diagnostic-step-classify-your-table-before-choosing-a-library","3. Diagnostic Step: Classify Your Table Before Choosing a Library",[14,37041,37042],{},"Run this inspection snippet on any new PDF before committing to a library. 
It checks for explicit vector lines (bordered table indicator) and text density (borderless indicator):",[23,37044,37046],{"className":126,"code":37045,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nfrom pathlib import Path\nimport pdfplumber\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ndef classify_table_type(pdf_path: Path) -> dict:\n    \"\"\"Inspect page geometry to recommend an extraction strategy.\"\"\"\n    results = []\n    try:\n        with pdfplumber.open(pdf_path) as pdf:\n            for i, page in enumerate(pdf.pages[:3]):  # Sample first 3 pages\n                lines = page.lines\n                rects = page.rects\n                words = page.extract_words()\n                results.append({\n                    \"page\": i + 1,\n                    \"vector_lines\": len(lines),\n                    \"rects\": len(rects),\n                    \"words\": len(words),\n                    \"recommendation\": (\n                        \"lattice\"  if len(lines) > 10 or len(rects) > 5\n                        else \"stream\" if len(words) > 20\n                        else \"ocr\"\n                    ),\n                })\n    except Exception as e:\n        return {\"error\": str(e)}\n    return results\n\nif __name__ == \"__main__\":\n    import json\n    print(json.dumps(classify_table_type(PDF_PATH), 
indent=2))\n",[30,37047,37048,37052,37062,37068,37072,37084,37088,37103,37108,37117,37123,37133,37155,37165,37175,37185,37190,37204,37216,37228,37240,37248,37279,37299,37305,37310,37315,37325,37341,37347,37351,37363,37369],{"__ignoreMap":28},[33,37049,37050],{"class":35,"line":36},[33,37051,9763],{"class":39},[33,37053,37054,37056,37058,37060],{"class":35,"line":43},[33,37055,190],{"class":163},[33,37057,193],{"class":167},[33,37059,164],{"class":163},[33,37061,198],{"class":167},[33,37063,37064,37066],{"class":35,"line":61},[33,37065,164],{"class":163},[33,37067,485],{"class":167},[33,37069,37070],{"class":35,"line":73},[33,37071,92],{"emptyLinePlaceholder":91},[33,37073,37074,37076,37078,37080,37082],{"class":35,"line":88},[33,37075,7076],{"class":50},[33,37077,212],{"class":163},[33,37079,215],{"class":167},[33,37081,7083],{"class":54},[33,37083,221],{"class":167},[33,37085,37086],{"class":35,"line":95},[33,37087,92],{"emptyLinePlaceholder":91},[33,37089,37090,37092,37095,37098,37101],{"class":35,"line":101},[33,37091,562],{"class":163},[33,37093,37094],{"class":46}," classify_table_type",[33,37096,37097],{"class":167},"(pdf_path: Path) -> ",[33,37099,37100],{"class":50},"dict",[33,37102,574],{"class":167},[33,37104,37105],{"class":35,"line":171},[33,37106,37107],{"class":54},"    \"\"\"Inspect page geometry to recommend an extraction strategy.\"\"\"\n",[33,37109,37110,37113,37115],{"class":35,"line":179},[33,37111,37112],{"class":167},"    results ",[33,37114,242],{"class":163},[33,37116,589],{"class":167},[33,37118,37119,37121],{"class":35,"line":187},[33,37120,2424],{"class":163},[33,37122,574],{"class":167},[33,37124,37125,37127,37129,37131],{"class":35,"line":201},[33,37126,2191],{"class":163},[33,37128,681],{"class":167},[33,37130,495],{"class":163},[33,37132,686],{"class":167},[33,37134,37135,37137,37140,37142,37144,37147,37149,37152],{"class":35,"line":206},[33,37136,1793],{"class":163},[33,37138,37139],{"class":167}," i, page 
",[33,37141,662],{"class":163},[33,37143,7403],{"class":50},[33,37145,37146],{"class":167},"(pdf.pages[:",[33,37148,10258],{"class":50},[33,37150,37151],{"class":167},"]):  ",[33,37153,37154],{"class":39},"# Sample first 3 pages\n",[33,37156,37157,37160,37162],{"class":35,"line":224},[33,37158,37159],{"class":167},"                lines ",[33,37161,242],{"class":163},[33,37163,37164],{"class":167}," page.lines\n",[33,37166,37167,37170,37172],{"class":35,"line":229},[33,37168,37169],{"class":167},"                rects ",[33,37171,242],{"class":163},[33,37173,37174],{"class":167}," page.rects\n",[33,37176,37177,37180,37182],{"class":35,"line":235},[33,37178,37179],{"class":167},"                words ",[33,37181,242],{"class":163},[33,37183,37184],{"class":167}," page.extract_words()\n",[33,37186,37187],{"class":35,"line":250},[33,37188,37189],{"class":167},"                results.append({\n",[33,37191,37192,37195,37198,37200,37202],{"class":35,"line":266},[33,37193,37194],{"class":54},"                    \"page\"",[33,37196,37197],{"class":167},": i ",[33,37199,1811],{"class":163},[33,37201,1814],{"class":50},[33,37203,247],{"class":167},[33,37205,37206,37209,37211,37213],{"class":35,"line":290},[33,37207,37208],{"class":54},"                    \"vector_lines\"",[33,37210,2079],{"class":167},[33,37212,928],{"class":50},[33,37214,37215],{"class":167},"(lines),\n",[33,37217,37218,37221,37223,37225],{"class":35,"line":295},[33,37219,37220],{"class":54},"                    \"rects\"",[33,37222,2079],{"class":167},[33,37224,928],{"class":50},[33,37226,37227],{"class":167},"(rects),\n",[33,37229,37230,37233,37235,37237],{"class":35,"line":300},[33,37231,37232],{"class":54},"                    \"words\"",[33,37234,2079],{"class":167},[33,37236,928],{"class":50},[33,37238,37239],{"class":167},"(words),\n",[33,37241,37242,37245],{"class":35,"line":317},[33,37243,37244],{"class":54},"                    \"recommendation\"",[33,37246,37247],{"class":167},": 
(\n",[33,37249,37250,37253,37256,37258,37261,37263,37266,37269,37271,37274,37276],{"class":35,"line":332},[33,37251,37252],{"class":54},"                        \"lattice\"",[33,37254,37255],{"class":163},"  if",[33,37257,4037],{"class":50},[33,37259,37260],{"class":167},"(lines) ",[33,37262,6009],{"class":163},[33,37264,37265],{"class":50}," 10",[33,37267,37268],{"class":163}," or",[33,37270,4037],{"class":50},[33,37272,37273],{"class":167},"(rects) ",[33,37275,6009],{"class":163},[33,37277,37278],{"class":50}," 5\n",[33,37280,37281,37284,37287,37289,37291,37294,37296],{"class":35,"line":347},[33,37282,37283],{"class":163},"                        else",[33,37285,37286],{"class":54}," \"stream\"",[33,37288,9994],{"class":163},[33,37290,4037],{"class":50},[33,37292,37293],{"class":167},"(words) ",[33,37295,6009],{"class":163},[33,37297,37298],{"class":50}," 20\n",[33,37300,37301,37303],{"class":35,"line":374},[33,37302,37283],{"class":163},[33,37304,9984],{"class":54},[33,37306,37307],{"class":35,"line":397},[33,37308,37309],{"class":167},"                    ),\n",[33,37311,37312],{"class":35,"line":653},[33,37313,37314],{"class":167},"                
})\n",[33,37316,37317,37319,37321,37323],{"class":35,"line":667},[33,37318,2449],{"class":163},[33,37320,783],{"class":50},[33,37322,1852],{"class":163},[33,37324,7583],{"class":167},[33,37326,37327,37329,37331,37334,37336,37338],{"class":35,"line":675},[33,37328,1659],{"class":163},[33,37330,4098],{"class":167},[33,37332,37333],{"class":54},"\"error\"",[33,37335,2079],{"class":167},[33,37337,1053],{"class":50},[33,37339,37340],{"class":167},"(e)}\n",[33,37342,37343,37345],{"class":35,"line":689},[33,37344,1332],{"class":163},[33,37346,14211],{"class":167},[33,37348,37349],{"class":35,"line":703},[33,37350,92],{"emptyLinePlaceholder":91},[33,37352,37353,37355,37357,37359,37361],{"class":35,"line":714},[33,37354,2491],{"class":163},[33,37356,2494],{"class":50},[33,37358,2497],{"class":163},[33,37360,2500],{"class":54},[33,37362,574],{"class":167},[33,37364,37365,37367],{"class":35,"line":723},[33,37366,1627],{"class":163},[33,37368,3081],{"class":167},[33,37370,37371,37373,37376,37378,37380,37383,37385,37387],{"class":35,"line":754},[33,37372,7268],{"class":50},[33,37374,37375],{"class":167},"(json.dumps(classify_table_type(",[33,37377,7076],{"class":50},[33,37379,18525],{"class":167},[33,37381,37382],{"class":238},"indent",[33,37384,242],{"class":163},[33,37386,1533],{"class":50},[33,37388,371],{"class":167},[4211,37390,37391,37400,37406],{},[4214,37392,37393,2012,37396,37399],{},[30,37394,37395],{},"vector_lines > 10",[30,37397,37398],{},"rects > 5",": bordered table — use camelot lattice or pdfplumber.",[4214,37401,37402,37403,3035],{},"Few lines, many words: borderless\u002Fstream table — use camelot stream or pdfplumber with ",[30,37404,37405],{},"vertical_strategy=\"text\"",[4214,37407,37408,37409,3035],{},"Near-zero words: scanned image — OCR required first. See ",[940,37410,36756],{"href":26957},[18,37412,37414],{"id":37413},"_4-library-by-library-walkthrough","4. 
Library-by-Library Walkthrough",[424,37416,37418],{"id":37417},"step-1-extract-with-pdfplumber","Step 1 — Extract with pdfplumber",[23,37420,37422],{"className":126,"code":37421,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ndef extract_pdfplumber(pdf_path: Path, page_num: int = 0) -> list[pd.DataFrame]:\n    \"\"\"Extract all tables from a single page using pdfplumber.\"\"\"\n    dfs = []\n    try:\n        with pdfplumber.open(pdf_path) as pdf:\n            page = pdf.pages[page_num]\n            # extract_tables() uses both horizontal and vertical line detection\n            raw = page.extract_tables({\n                \"vertical_strategy\": \"lines\",\n                \"horizontal_strategy\": \"lines\",\n                \"snap_tolerance\": 3,\n                \"join_tolerance\": 3,\n            })\n            for tbl in raw:\n                if tbl and len(tbl) > 1:\n                    df = pd.DataFrame(tbl[1:], columns=tbl[0])\n                    dfs.append(df)\n    except Exception as e:\n        print(f\"pdfplumber error: {e}\")\n    return dfs\n\nif __name__ == \"__main__\":\n    tables = extract_pdfplumber(PDF_PATH)\n    for i, df in enumerate(tables):\n        print(f\"Table {i}: {df.shape}\")\n        
print(df.head())\n",[30,37423,37424,37428,37438,37444,37454,37458,37470,37474,37491,37496,37505,37511,37521,37530,37535,37544,37556,37567,37578,37589,37594,37605,37623,37646,37651,37661,37682,37689,37693,37705,37719,37732,37760],{"__ignoreMap":28},[33,37425,37426],{"class":35,"line":36},[33,37427,7041],{"class":39},[33,37429,37430,37432,37434,37436],{"class":35,"line":43},[33,37431,190],{"class":163},[33,37433,193],{"class":167},[33,37435,164],{"class":163},[33,37437,198],{"class":167},[33,37439,37440,37442],{"class":35,"line":61},[33,37441,164],{"class":163},[33,37443,485],{"class":167},[33,37445,37446,37448,37450,37452],{"class":35,"line":73},[33,37447,164],{"class":163},[33,37449,492],{"class":167},[33,37451,495],{"class":163},[33,37453,498],{"class":167},[33,37455,37456],{"class":35,"line":88},[33,37457,92],{"emptyLinePlaceholder":91},[33,37459,37460,37462,37464,37466,37468],{"class":35,"line":95},[33,37461,7076],{"class":50},[33,37463,212],{"class":163},[33,37465,215],{"class":167},[33,37467,7083],{"class":54},[33,37469,221],{"class":167},[33,37471,37472],{"class":35,"line":101},[33,37473,92],{"emptyLinePlaceholder":91},[33,37475,37476,37478,37480,37483,37485,37487,37489],{"class":35,"line":171},[33,37477,562],{"class":163},[33,37479,14660],{"class":46},[33,37481,37482],{"class":167},"(pdf_path: Path, page_num: ",[33,37484,1059],{"class":50},[33,37486,212],{"class":163},[33,37488,10791],{"class":50},[33,37490,10647],{"class":167},[33,37492,37493],{"class":35,"line":179},[33,37494,37495],{"class":54},"    \"\"\"Extract all tables from a single page using pdfplumber.\"\"\"\n",[33,37497,37498,37501,37503],{"class":35,"line":187},[33,37499,37500],{"class":167},"    dfs 
",[33,37502,242],{"class":163},[33,37504,589],{"class":167},[33,37506,37507,37509],{"class":35,"line":201},[33,37508,2424],{"class":163},[33,37510,574],{"class":167},[33,37512,37513,37515,37517,37519],{"class":35,"line":206},[33,37514,2191],{"class":163},[33,37516,681],{"class":167},[33,37518,495],{"class":163},[33,37520,686],{"class":167},[33,37522,37523,37525,37527],{"class":35,"line":224},[33,37524,9865],{"class":167},[33,37526,242],{"class":163},[33,37528,37529],{"class":167}," pdf.pages[page_num]\n",[33,37531,37532],{"class":35,"line":229},[33,37533,37534],{"class":39},"            # extract_tables() uses both horizontal and vertical line detection\n",[33,37536,37537,37539,37541],{"class":35,"line":235},[33,37538,7142],{"class":167},[33,37540,242],{"class":163},[33,37542,37543],{"class":167}," page.extract_tables({\n",[33,37545,37546,37549,37551,37554],{"class":35,"line":250},[33,37547,37548],{"class":54},"                \"vertical_strategy\"",[33,37550,2079],{"class":167},[33,37552,37553],{"class":54},"\"lines\"",[33,37555,247],{"class":167},[33,37557,37558,37561,37563,37565],{"class":35,"line":266},[33,37559,37560],{"class":54},"                \"horizontal_strategy\"",[33,37562,2079],{"class":167},[33,37564,37553],{"class":54},[33,37566,247],{"class":167},[33,37568,37569,37572,37574,37576],{"class":35,"line":290},[33,37570,37571],{"class":54},"                \"snap_tolerance\"",[33,37573,2079],{"class":167},[33,37575,10258],{"class":50},[33,37577,247],{"class":167},[33,37579,37580,37583,37585,37587],{"class":35,"line":295},[33,37581,37582],{"class":54},"                \"join_tolerance\"",[33,37584,2079],{"class":167},[33,37586,10258],{"class":50},[33,37588,247],{"class":167},[33,37590,37591],{"class":35,"line":300},[33,37592,37593],{"class":167},"            })\n",[33,37595,37596,37598,37600,37602],{"class":35,"line":317},[33,37597,1793],{"class":163},[33,37599,7154],{"class":167},[33,37601,662],{"class":163},[33,37603,37604],{"class":167}," 
raw:\n",[33,37606,37607,37609,37611,37613,37615,37617,37619,37621],{"class":35,"line":332},[33,37608,7170],{"class":163},[33,37610,7154],{"class":167},[33,37612,6001],{"class":163},[33,37614,4037],{"class":50},[33,37616,7179],{"class":167},[33,37618,6009],{"class":163},[33,37620,1814],{"class":50},[33,37622,574],{"class":167},[33,37624,37625,37627,37629,37632,37634,37636,37638,37640,37642,37644],{"class":35,"line":347},[33,37626,7533],{"class":167},[33,37628,242],{"class":163},[33,37630,37631],{"class":167}," pd.DataFrame(tbl[",[33,37633,734],{"class":50},[33,37635,737],{"class":167},[33,37637,740],{"class":238},[33,37639,242],{"class":163},[33,37641,7206],{"class":167},[33,37643,748],{"class":50},[33,37645,751],{"class":167},[33,37647,37648],{"class":35,"line":374},[33,37649,37650],{"class":167},"                    dfs.append(df)\n",[33,37652,37653,37655,37657,37659],{"class":35,"line":397},[33,37654,2449],{"class":163},[33,37656,783],{"class":50},[33,37658,1852],{"class":163},[33,37660,7583],{"class":167},[33,37662,37663,37665,37667,37669,37672,37674,37676,37678,37680],{"class":35,"line":653},[33,37664,9414],{"class":50},[33,37666,602],{"class":167},[33,37668,4059],{"class":163},[33,37670,37671],{"class":54},"\"pdfplumber error: ",[33,37673,1115],{"class":50},[33,37675,7602],{"class":167},[33,37677,1121],{"class":50},[33,37679,274],{"class":54},[33,37681,221],{"class":167},[33,37683,37684,37686],{"class":35,"line":667},[33,37685,1332],{"class":163},[33,37687,37688],{"class":167}," dfs\n",[33,37690,37691],{"class":35,"line":675},[33,37692,92],{"emptyLinePlaceholder":91},[33,37694,37695,37697,37699,37701,37703],{"class":35,"line":689},[33,37696,2491],{"class":163},[33,37698,2494],{"class":50},[33,37700,2497],{"class":163},[33,37702,2500],{"class":54},[33,37704,574],{"class":167},[33,37706,37707,37710,37712,37715,37717],{"class":35,"line":703},[33,37708,37709],{"class":167},"    tables ",[33,37711,242],{"class":163},[33,37713,37714],{"class":167}," 
extract_pdfplumber(",[33,37716,7076],{"class":50},[33,37718,221],{"class":167},[33,37720,37721,37723,37725,37727,37729],{"class":35,"line":714},[33,37722,656],{"class":163},[33,37724,10994],{"class":167},[33,37726,662],{"class":163},[33,37728,7403],{"class":50},[33,37730,37731],{"class":167},"(tables):\n",[33,37733,37734,37736,37738,37740,37742,37744,37746,37748,37750,37752,37754,37756,37758],{"class":35,"line":723},[33,37735,9414],{"class":50},[33,37737,602],{"class":167},[33,37739,4059],{"class":163},[33,37741,11012],{"class":54},[33,37743,1115],{"class":50},[33,37745,7499],{"class":167},[33,37747,1121],{"class":50},[33,37749,2079],{"class":54},[33,37751,1115],{"class":50},[33,37753,9426],{"class":167},[33,37755,1121],{"class":50},[33,37757,274],{"class":54},[33,37759,221],{"class":167},[33,37761,37762,37764],{"class":35,"line":754},[33,37763,9414],{"class":50},[33,37765,13311],{"class":167},[14,37767,37768,37771],{},[1974,37769,37770],{},"When pdfplumber wins:"," borderless tables, complex coordinate geometry, embedded font edge cases, or any environment where system deps are unavailable.",[424,37773,37775],{"id":37774},"step-2-extract-with-camelot-lattice-and-stream","Step 2 — Extract with camelot (lattice and stream)",[23,37777,37779],{"className":126,"code":37778,"language":47,"meta":28,"style":28},"# pip install camelot-py[cv] pandas\n# System: ghostscript must be on PATH\nfrom pathlib import Path\nimport camelot\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ndef extract_camelot(pdf_path: Path, flavor: str = \"lattice\") -> list[pd.DataFrame]:\n    \"\"\"\n    Extract tables with camelot.\n    flavor=\"lattice\"  — bordered tables with explicit grid lines\n    flavor=\"stream\"   — borderless tables inferred from whitespace\n    \"\"\"\n    dfs = []\n    try:\n        tables = camelot.read_pdf(\n            str(pdf_path),\n            pages=\"all\",\n            flavor=flavor,\n            # lattice-specific: copy_text handles cells 
that span rows\n            copy_text=[\"v\"] if flavor == \"lattice\" else [],\n        )\n        print(f\"camelot ({flavor}): found {tables.n} table(s)\")\n        for tbl in tables:\n            # tbl.parsing_report gives accuracy score (0-100)\n            print(f\"  accuracy={tbl.parsing_report['accuracy']:.1f}%  \"\n                  f\"whitespace={tbl.parsing_report['whitespace']:.1f}%\")\n            dfs.append(tbl.df)\n    except Exception as e:\n        print(f\"camelot error: {e}\")\n    return dfs\n\nif __name__ == \"__main__\":\n    lattice_dfs = extract_camelot(PDF_PATH, flavor=\"lattice\")\n    stream_dfs  = extract_camelot(PDF_PATH, flavor=\"stream\")\n",[30,37780,37781,37786,37791,37801,37807,37817,37821,37833,37837,37854,37858,37863,37868,37873,37877,37885,37891,37900,37907,37917,37925,37930,37957,37961,37991,38002,38007,38035,38061,38066,38076,38097,38103,38107,38119,38141],{"__ignoreMap":28},[33,37782,37783],{"class":35,"line":36},[33,37784,37785],{"class":39},"# pip install camelot-py[cv] pandas\n",[33,37787,37788],{"class":35,"line":43},[33,37789,37790],{"class":39},"# System: ghostscript must be on 
PATH\n",[33,37792,37793,37795,37797,37799],{"class":35,"line":61},[33,37794,190],{"class":163},[33,37796,193],{"class":167},[33,37798,164],{"class":163},[33,37800,198],{"class":167},[33,37802,37803,37805],{"class":35,"line":73},[33,37804,164],{"class":163},[33,37806,10567],{"class":167},[33,37808,37809,37811,37813,37815],{"class":35,"line":88},[33,37810,164],{"class":163},[33,37812,492],{"class":167},[33,37814,495],{"class":163},[33,37816,498],{"class":167},[33,37818,37819],{"class":35,"line":95},[33,37820,92],{"emptyLinePlaceholder":91},[33,37822,37823,37825,37827,37829,37831],{"class":35,"line":101},[33,37824,7076],{"class":50},[33,37826,212],{"class":163},[33,37828,215],{"class":167},[33,37830,7083],{"class":54},[33,37832,221],{"class":167},[33,37834,37835],{"class":35,"line":171},[33,37836,92],{"emptyLinePlaceholder":91},[33,37838,37839,37841,37843,37846,37848,37850,37852],{"class":35,"line":179},[33,37840,562],{"class":163},[33,37842,14835],{"class":46},[33,37844,37845],{"class":167},"(pdf_path: Path, flavor: ",[33,37847,1053],{"class":50},[33,37849,212],{"class":163},[33,37851,9991],{"class":54},[33,37853,10647],{"class":167},[33,37855,37856],{"class":35,"line":187},[33,37857,7673],{"class":54},[33,37859,37860],{"class":35,"line":201},[33,37861,37862],{"class":54},"    Extract tables with camelot.\n",[33,37864,37865],{"class":35,"line":206},[33,37866,37867],{"class":54},"    flavor=\"lattice\"  — bordered tables with explicit grid lines\n",[33,37869,37870],{"class":35,"line":224},[33,37871,37872],{"class":54},"    flavor=\"stream\"   — borderless tables inferred from 
whitespace\n",[33,37874,37875],{"class":35,"line":229},[33,37876,7673],{"class":54},[33,37878,37879,37881,37883],{"class":35,"line":235},[33,37880,37500],{"class":167},[33,37882,242],{"class":163},[33,37884,589],{"class":167},[33,37886,37887,37889],{"class":35,"line":250},[33,37888,2424],{"class":163},[33,37890,574],{"class":167},[33,37892,37893,37896,37898],{"class":35,"line":266},[33,37894,37895],{"class":167},"        tables ",[33,37897,242],{"class":163},[33,37899,10668],{"class":167},[33,37901,37902,37904],{"class":35,"line":290},[33,37903,10673],{"class":50},[33,37905,37906],{"class":167},"(pdf_path),\n",[33,37908,37909,37911,37913,37915],{"class":35,"line":295},[33,37910,10681],{"class":238},[33,37912,242],{"class":163},[33,37914,35616],{"class":54},[33,37916,247],{"class":167},[33,37918,37919,37921,37923],{"class":35,"line":300},[33,37920,10691],{"class":238},[33,37922,242],{"class":163},[33,37924,10696],{"class":167},[33,37926,37927],{"class":35,"line":317},[33,37928,37929],{"class":39},"            # lattice-specific: copy_text handles cells that span rows\n",[33,37931,37932,37935,37937,37939,37942,37944,37946,37948,37950,37952,37954],{"class":35,"line":332},[33,37933,37934],{"class":238},"            copy_text",[33,37936,242],{"class":163},[33,37938,8309],{"class":167},[33,37940,37941],{"class":54},"\"v\"",[33,37943,763],{"class":167},[33,37945,2491],{"class":163},[33,37947,15750],{"class":167},[33,37949,1865],{"class":163},[33,37951,9991],{"class":54},[33,37953,15715],{"class":163},[33,37955,37956],{"class":167}," [],\n",[33,37958,37959],{"class":35,"line":347},[33,37960,5867],{"class":167},[33,37962,37963,37965,37967,37969,37971,37973,37975,37977,37980,37982,37985,37987,37989],{"class":35,"line":374},[33,37964,9414],{"class":50},[33,37966,602],{"class":167},[33,37968,4059],{"class":163},[33,37970,10743],{"class":54},[33,37972,1115],{"class":50},[33,37974,10748],{"class":167},[33,37976,1121],{"class":50},[33,37978,37979],{"class":54},"): found 
",[33,37981,1115],{"class":50},[33,37983,37984],{"class":167},"tables.n",[33,37986,1121],{"class":50},[33,37988,6247],{"class":54},[33,37990,221],{"class":167},[33,37992,37993,37995,37997,37999],{"class":35,"line":397},[33,37994,5973],{"class":163},[33,37996,7154],{"class":167},[33,37998,662],{"class":163},[33,38000,38001],{"class":167}," tables:\n",[33,38003,38004],{"class":35,"line":653},[33,38005,38006],{"class":39},"            # tbl.parsing_report gives accuracy score (0-100)\n",[33,38008,38009,38011,38013,38015,38018,38020,38023,38026,38028,38030,38032],{"class":35,"line":667},[33,38010,9364],{"class":50},[33,38012,602],{"class":167},[33,38014,4059],{"class":163},[33,38016,38017],{"class":54},"\"  accuracy=",[33,38019,1115],{"class":50},[33,38021,38022],{"class":167},"tbl.parsing_report[",[33,38024,38025],{"class":54},"'accuracy'",[33,38027,9546],{"class":167},[33,38029,18438],{"class":163},[33,38031,1121],{"class":50},[33,38033,38034],{"class":54},"%  \"\n",[33,38036,38037,38040,38043,38045,38047,38050,38052,38054,38056,38059],{"class":35,"line":675},[33,38038,38039],{"class":163},"                  f",[33,38041,38042],{"class":54},"\"whitespace=",[33,38044,1115],{"class":50},[33,38046,38022],{"class":167},[33,38048,38049],{"class":54},"'whitespace'",[33,38051,9546],{"class":167},[33,38053,18438],{"class":163},[33,38055,1121],{"class":50},[33,38057,38058],{"class":54},"%\"",[33,38060,221],{"class":167},[33,38062,38063],{"class":35,"line":689},[33,38064,38065],{"class":167},"            dfs.append(tbl.df)\n",[33,38067,38068,38070,38072,38074],{"class":35,"line":703},[33,38069,2449],{"class":163},[33,38071,783],{"class":50},[33,38073,1852],{"class":163},[33,38075,7583],{"class":167},[33,38077,38078,38080,38082,38084,38087,38089,38091,38093,38095],{"class":35,"line":714},[33,38079,9414],{"class":50},[33,38081,602],{"class":167},[33,38083,4059],{"class":163},[33,38085,38086],{"class":54},"\"camelot error: 
",[33,38088,1115],{"class":50},[33,38090,7602],{"class":167},[33,38092,1121],{"class":50},[33,38094,274],{"class":54},[33,38096,221],{"class":167},[33,38098,38099,38101],{"class":35,"line":723},[33,38100,1332],{"class":163},[33,38102,37688],{"class":167},[33,38104,38105],{"class":35,"line":754},[33,38106,92],{"emptyLinePlaceholder":91},[33,38108,38109,38111,38113,38115,38117],{"class":35,"line":771},[33,38110,2491],{"class":163},[33,38112,2494],{"class":50},[33,38114,2497],{"class":163},[33,38116,2500],{"class":54},[33,38118,574],{"class":167},[33,38120,38121,38124,38126,38129,38131,38133,38135,38137,38139],{"class":35,"line":777},[33,38122,38123],{"class":167},"    lattice_dfs ",[33,38125,242],{"class":163},[33,38127,38128],{"class":167}," extract_camelot(",[33,38130,7076],{"class":50},[33,38132,365],{"class":167},[33,38134,10748],{"class":238},[33,38136,242],{"class":163},[33,38138,10985],{"class":54},[33,38140,221],{"class":167},[33,38142,38143,38146,38148,38150,38152,38154,38156,38158,38160],{"class":35,"line":788},[33,38144,38145],{"class":167},"    stream_dfs  ",[33,38147,242],{"class":163},[33,38149,38128],{"class":167},[33,38151,7076],{"class":50},[33,38153,365],{"class":167},[33,38155,10748],{"class":238},[33,38157,242],{"class":163},[33,38159,13407],{"class":54},[33,38161,221],{"class":167},[14,38163,38164,38167],{},[1974,38165,38166],{},"Camelot's accuracy score"," is unique among the three libraries — it quantifies how much whitespace was left in cells, giving you a confidence signal without manually inspecting every table.",[14,38169,38170,38173],{},[1974,38171,38172],{},"When camelot wins:"," two-pass comparison (run both flavors, take the higher accuracy score), official financial or government PDFs with clean bordered grids, or situations where you need that accuracy signal for automated QA.",[424,38175,38177],{"id":38176},"step-3-extract-with-tabula-py","Step 3 — Extract with 
tabula-py",[23,38179,38181],{"className":126,"code":38180,"language":47,"meta":28,"style":28},"# pip install tabula-py pandas\n# System: Java JRE\u002FJDK must be on PATH\nfrom pathlib import Path\nimport tabula\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ndef extract_tabula(pdf_path: Path) -> list[pd.DataFrame]:\n    \"\"\"Extract all tables from all pages using tabula-py.\"\"\"\n    try:\n        # read_pdf returns a list of DataFrames, one per detected table\n        dfs = tabula.read_pdf(\n            str(pdf_path),\n            pages=\"all\",\n            multiple_tables=True,\n            pandas_options={\"header\": 0},\n            lattice=True,   # True = bordered; False = stream-style\n            silent=True,    # suppress Java stderr\n        )\n        print(f\"tabula: found {len(dfs)} table(s)\")\n        return dfs\n    except Exception as e:\n        print(f\"tabula error: {e}\")\n        return []\n\nif __name__ == \"__main__\":\n    tables = extract_tabula(PDF_PATH)\n    for i, df in enumerate(tables):\n        print(f\"Table {i}: {df.shape}\")\n        print(df.head())\n",[30,38182,38183,38187,38192,38202,38208,38218,38222,38234,38238,38247,38252,38258,38263,38272,38278,38288,38299,38317,38331,38346,38350,38371,38377,38387,38408,38414,38418,38430,38443,38455,38483],{"__ignoreMap":28},[33,38184,38185],{"class":35,"line":36},[33,38186,35522],{"class":39},[33,38188,38189],{"class":35,"line":43},[33,38190,38191],{"class":39},"# System: Java JRE\u002FJDK must be on 
PATH\n",[33,38193,38194,38196,38198,38200],{"class":35,"line":61},[33,38195,190],{"class":163},[33,38197,193],{"class":167},[33,38199,164],{"class":163},[33,38201,198],{"class":167},[33,38203,38204,38206],{"class":35,"line":73},[33,38205,164],{"class":163},[33,38207,35539],{"class":167},[33,38209,38210,38212,38214,38216],{"class":35,"line":88},[33,38211,164],{"class":163},[33,38213,492],{"class":167},[33,38215,495],{"class":163},[33,38217,498],{"class":167},[33,38219,38220],{"class":35,"line":95},[33,38221,92],{"emptyLinePlaceholder":91},[33,38223,38224,38226,38228,38230,38232],{"class":35,"line":101},[33,38225,7076],{"class":50},[33,38227,212],{"class":163},[33,38229,215],{"class":167},[33,38231,7083],{"class":54},[33,38233,221],{"class":167},[33,38235,38236],{"class":35,"line":171},[33,38237,92],{"emptyLinePlaceholder":91},[33,38239,38240,38242,38245],{"class":35,"line":179},[33,38241,562],{"class":163},[33,38243,38244],{"class":46}," extract_tabula",[33,38246,5947],{"class":167},[33,38248,38249],{"class":35,"line":187},[33,38250,38251],{"class":54},"    \"\"\"Extract all tables from all pages using tabula-py.\"\"\"\n",[33,38253,38254,38256],{"class":35,"line":201},[33,38255,2424],{"class":163},[33,38257,574],{"class":167},[33,38259,38260],{"class":35,"line":206},[33,38261,38262],{"class":39},"        # read_pdf returns a list of DataFrames, one per detected table\n",[33,38264,38265,38268,38270],{"class":35,"line":224},[33,38266,38267],{"class":167},"        dfs ",[33,38269,242],{"class":163},[33,38271,35591],{"class":167},[33,38273,38274,38276],{"class":35,"line":229},[33,38275,10673],{"class":50},[33,38277,37906],{"class":167},[33,38279,38280,38282,38284,38286],{"class":35,"line":235},[33,38281,10681],{"class":238},[33,38283,242],{"class":163},[33,38285,35616],{"class":54},[33,38287,247],{"class":167},[33,38289,38290,38293,38295,38297],{"class":35,"line":250},[33,38291,38292],{"class":238},"            
multiple_tables",[33,38294,242],{"class":163},[33,38296,855],{"class":50},[33,38298,247],{"class":167},[33,38300,38301,38304,38306,38308,38311,38313,38315],{"class":35,"line":266},[33,38302,38303],{"class":238},"            pandas_options",[33,38305,242],{"class":163},[33,38307,1115],{"class":167},[33,38309,38310],{"class":54},"\"header\"",[33,38312,2079],{"class":167},[33,38314,748],{"class":50},[33,38316,3509],{"class":167},[33,38318,38319,38322,38324,38326,38328],{"class":35,"line":290},[33,38320,38321],{"class":238},"            lattice",[33,38323,242],{"class":163},[33,38325,855],{"class":50},[33,38327,1166],{"class":167},[33,38329,38330],{"class":39},"# True = bordered; False = stream-style\n",[33,38332,38333,38336,38338,38340,38343],{"class":35,"line":295},[33,38334,38335],{"class":238},"            silent",[33,38337,242],{"class":163},[33,38339,855],{"class":50},[33,38341,38342],{"class":167},",    ",[33,38344,38345],{"class":39},"# suppress Java stderr\n",[33,38347,38348],{"class":35,"line":300},[33,38349,5867],{"class":167},[33,38351,38352,38354,38356,38358,38361,38363,38365,38367,38369],{"class":35,"line":317},[33,38353,9414],{"class":50},[33,38355,602],{"class":167},[33,38357,4059],{"class":163},[33,38359,38360],{"class":54},"\"tabula: found ",[33,38362,4065],{"class":50},[33,38364,35663],{"class":167},[33,38366,1121],{"class":50},[33,38368,6247],{"class":54},[33,38370,221],{"class":167},[33,38372,38373,38375],{"class":35,"line":332},[33,38374,1659],{"class":163},[33,38376,37688],{"class":167},[33,38378,38379,38381,38383,38385],{"class":35,"line":347},[33,38380,2449],{"class":163},[33,38382,783],{"class":50},[33,38384,1852],{"class":163},[33,38386,7583],{"class":167},[33,38388,38389,38391,38393,38395,38398,38400,38402,38404,38406],{"class":35,"line":374},[33,38390,9414],{"class":50},[33,38392,602],{"class":167},[33,38394,4059],{"class":163},[33,38396,38397],{"class":54},"\"tabula error: 
",[33,38399,1115],{"class":50},[33,38401,7602],{"class":167},[33,38403,1121],{"class":50},[33,38405,274],{"class":54},[33,38407,221],{"class":167},[33,38409,38410,38412],{"class":35,"line":397},[33,38411,1659],{"class":163},[33,38413,589],{"class":167},[33,38415,38416],{"class":35,"line":653},[33,38417,92],{"emptyLinePlaceholder":91},[33,38419,38420,38422,38424,38426,38428],{"class":35,"line":667},[33,38421,2491],{"class":163},[33,38423,2494],{"class":50},[33,38425,2497],{"class":163},[33,38427,2500],{"class":54},[33,38429,574],{"class":167},[33,38431,38432,38434,38436,38439,38441],{"class":35,"line":675},[33,38433,37709],{"class":167},[33,38435,242],{"class":163},[33,38437,38438],{"class":167}," extract_tabula(",[33,38440,7076],{"class":50},[33,38442,221],{"class":167},[33,38444,38445,38447,38449,38451,38453],{"class":35,"line":689},[33,38446,656],{"class":163},[33,38448,10994],{"class":167},[33,38450,662],{"class":163},[33,38452,7403],{"class":50},[33,38454,37731],{"class":167},[33,38456,38457,38459,38461,38463,38465,38467,38469,38471,38473,38475,38477,38479,38481],{"class":35,"line":703},[33,38458,9414],{"class":50},[33,38460,602],{"class":167},[33,38462,4059],{"class":163},[33,38464,11012],{"class":54},[33,38466,1115],{"class":50},[33,38468,7499],{"class":167},[33,38470,1121],{"class":50},[33,38472,2079],{"class":54},[33,38474,1115],{"class":50},[33,38476,9426],{"class":167},[33,38478,1121],{"class":50},[33,38480,274],{"class":54},[33,38482,221],{"class":167},[33,38484,38485,38487],{"class":35,"line":714},[33,38486,9414],{"class":50},[33,38488,13311],{"class":167},[14,38490,38491,38494],{},[1974,38492,38493],{},"When tabula wins:"," standard bordered tables in mainstream PDF generators (Word exports, Excel-to-PDF, LibreOffice), bulk processing where JVM startup time amortizes over many files, or teams already running Java tooling.",[18,38496,38498],{"id":38497},"_5-comparison-matrix","5. 
Comparison Matrix",[4273,38500,38501,38514],{},[4276,38502,38503],{},[4279,38504,38505,38508,38510,38512],{},[4282,38506,38507],{},"Feature",[4282,38509,943],{},[4282,38511,16139],{},[4282,38513,36791],{},[4292,38515,38516,38529,38541,38553,38579,38593,38605,38621,38634,38645,38657,38674],{},[4279,38517,38518,38521,38524,38527],{},[4297,38519,38520],{},"Bordered tables (lattice)",[4297,38522,38523],{},"Good",[4297,38525,38526],{},"Excellent",[4297,38528,38526],{},[4279,38530,38531,38534,38536,38538],{},[4297,38532,38533],{},"Borderless tables (stream)",[4297,38535,38523],{},[4297,38537,38523],{},[4297,38539,38540],{},"Fair",[4279,38542,38543,38546,38549,38551],{},[4297,38544,38545],{},"Scanned \u002F image tables",[4297,38547,38548],{},"None (needs OCR first)",[4297,38550,571],{},[4297,38552,571],{},[4279,38554,38555,38558,38564,38573],{},[4297,38556,38557],{},"Output type",[4297,38559,38560,38563],{},[30,38561,38562],{},"list[list]"," → DataFrame",[4297,38565,38566,38569,38570],{},[30,38567,38568],{},"TableList"," → DataFrame via ",[30,38571,38572],{},".df",[4297,38574,38575,38578],{},[30,38576,38577],{},"list[DataFrame]"," direct",[4279,38580,38581,38584,38587,38590],{},[4297,38582,38583],{},"Speed (single page)",[4297,38585,38586],{},"Fast",[4297,38588,38589],{},"Moderate",[4297,38591,38592],{},"Moderate (JVM warmup)",[4279,38594,38595,38598,38600,38602],{},[4297,38596,38597],{},"Speed (batch 100 pages)",[4297,38599,38586],{},[4297,38601,38589],{},[4297,38603,38604],{},"Fast (JVM amortized)",[4279,38606,38607,38610,38613,38619],{},[4297,38608,38609],{},"Accuracy signal",[4297,38611,38612],{},"None built-in",[4297,38614,38615,38618],{},[30,38616,38617],{},"parsing_report"," score",[4297,38620,38612],{},[4279,38622,38623,38626,38629,38632],{},[4297,38624,38625],{},"Ghostscript required",[4297,38627,38628],{},"No",[4297,38630,38631],{},"Yes",[4297,38633,38628],{},[4279,38635,38636,38639,38641,38643],{},[4297,38637,38638],{},"Java JRE 
required",[4297,38640,38628],{},[4297,38642,38628],{},[4297,38644,38631],{},[4279,38646,38647,38650,38653,38655],{},[4297,38648,38649],{},"License",[4297,38651,38652],{},"MIT",[4297,38654,38652],{},[4297,38656,38652],{},[4279,38658,38659,38661,38665,38670],{},[4297,38660,36846],{},[4297,38662,38663],{},[30,38664,943],{},[4297,38666,38667],{},[30,38668,38669],{},"camelot-py[cv]",[4297,38671,38672],{},[30,38673,36791],{},[4279,38675,38676,38679,38683,38687],{},[4297,38677,38678],{},"Stream\u002Fwhitespace mode",[4297,38680,38681],{},[30,38682,37405],{},[4297,38684,38685],{},[30,38686,11068],{},[4297,38688,38689],{},[30,38690,38691],{},"lattice=False",[2540,38693,2547,38695,2547,38698,2547,38701,2547,2547,38711,2547,38715,38720,38724,38728,38732,2547,2547,38736,2547,38738,2547,38742,2547,2547,38746,2547,38750,2547,38753,2547,2547,38757,2547,38760,2547,38763,2547,2547,38766,2547,38770,2547,38774,2547,2547,38777,2547,38780,2547,38783,2547,2547,38786,2547,38789,2547,38792,2547,2547,38795,2547,38797,2547,38799,2547,2547,38802,2547,38804,2547,38807,2547,2547,38811,2547,38813,2547,38815,2547,2547,38818,2547,38820,2547,38822,2547,2547,38824,2547,38827,2547,38830,2547,2547,38833,2547,38836,2547,38841,2547,2547,38845,2547,38848,2547,38851,2547,2547,38854,2547,38857,2547,38860,2547,2547,38864,2547,38866,2547,38868,2547,2547,38870,2547,38872,2547,38874,2547,2547,38876,2547,38878,2547,38881,2547,2547,38884,2547,38886,2547,38889],{"viewBox":2542,"role":2543,"ariaLabel":38694,"xmlns":2545,"style":2546},"Comparison matrix of pdfplumber, camelot, and tabula-py across five dimensions",[2549,38696,38697],{},"Library Comparison Matrix",[2553,38699,38700],{},"Heatmap comparing pdfplumber, camelot, and tabula-py across bordered tables, borderless tables, scanned PDFs, speed, and system 
dependencies.",[2557,38702,2559,38703,2547],{},[2561,38704,2564,38706,2564,38708,2559],{"id":38705,"x1":748,"y1":748,"x2":734,"y2":748},"compare-tables-header-grad",[2566,38707],{"offset":748,"style":2568},[2566,38709],{"offset":734,"style":38710},"stop-color:#3b82f6",[2585,38712],{"x":2591,"y":2591,"width":38713,"height":26341,"rx":2681,"fill":38714},"744","url(#compare-tables-header-grad)",[2000,38716,38719],{"x":11155,"y":38717,"fill":2592,"style":38718},"34","text-anchor:middle;font-size:13px;font-weight:bold","\nBordered\n",[2000,38721,38723],{"x":38722,"y":38717,"fill":2592,"style":38718},"248","\nBorderless\n",[2000,38725,38727],{"x":38726,"y":38717,"fill":2592,"style":38718},"392","\nScanned\n",[2000,38729,38731],{"x":38730,"y":38717,"fill":2592,"style":38718},"536","\nSpeed\n",[2000,38733,38735],{"x":38734,"y":38717,"fill":2592,"style":38718},"672","\nSys Deps\n",[2585,38737],{"x":2591,"y":2590,"width":38713,"height":2597,"rx":1503,"fill":2615,"stroke":2593,"style":11105},[2000,38739,943],{"x":38740,"y":38741,"fill":2599,"style":16979},"56","96",[2000,38743,38745],{"x":38740,"y":38744,"fill":2583,"style":2605},"114","MIT · pure Python",[2585,38747],{"x":38748,"y":38749,"width":26402,"height":2590,"rx":1503,"fill":11165,"stroke":2593,"style":11105},"70","68",[2000,38751,38523],{"x":26345,"y":38752,"fill":11166,"style":38718},"93",[2000,38754,38756],{"x":26345,"y":38755,"fill":2583,"style":2605},"111","line detection",[2585,38758],{"x":38759,"y":38749,"width":26402,"height":2590,"rx":1503,"fill":11165,"stroke":2593,"style":11105},"202",[2000,38761,38523],{"x":38762,"y":38752,"fill":11166,"style":38718},"264",[2000,38764,38765],{"x":38762,"y":38755,"fill":2583,"style":2605},"text 
strategy",[2585,38767],{"x":38768,"y":38749,"width":26402,"height":2590,"rx":1503,"fill":38769,"stroke":2593,"style":11105},"346","#f1f5f9",[2000,38771,571],{"x":38772,"y":38752,"fill":38773,"style":38718},"408","#94a3b8",[2000,38775,38776],{"x":38772,"y":38755,"fill":38773,"style":2605},"OCR needed",[2585,38778],{"x":16990,"y":38749,"width":26402,"height":2590,"rx":1503,"fill":38779,"stroke":2593,"style":11105},"#dcfce7",[2000,38781,38586],{"x":38782,"y":38752,"fill":17010,"style":38718},"552",[2000,38784,38785],{"x":38782,"y":38755,"fill":2583,"style":2605},"no JVM warmup",[2585,38787],{"x":38788,"y":38749,"width":11095,"height":2590,"rx":1503,"fill":38779,"stroke":2593,"style":11105},"634",[2000,38790,571],{"x":38791,"y":38752,"fill":17010,"style":38718},"693",[2000,38793,38794],{"x":38791,"y":38755,"fill":2583,"style":2605},"pip only",[2585,38796],{"x":2591,"y":11194,"width":38713,"height":2597,"rx":1503,"fill":2592,"stroke":2593,"style":11105},[2000,38798,16139],{"x":38740,"y":2643,"fill":2599,"style":16979},[2000,38800,38801],{"x":38740,"y":38759,"fill":2583,"style":2605},"MIT · needs gs+cv",[2585,38803],{"x":38748,"y":26360,"width":26402,"height":2590,"rx":1503,"fill":11166,"stroke":2593,"style":11105},[2000,38805,38526],{"x":26345,"y":38806,"fill":2592,"style":38718},"181",[2000,38808,38810],{"x":26345,"y":17036,"fill":38809,"style":2605},"#bfdbfe","accuracy score",[2585,38812],{"x":38759,"y":26360,"width":26402,"height":2590,"rx":1503,"fill":11165,"stroke":2593,"style":11105},[2000,38814,38523],{"x":38762,"y":38806,"fill":11166,"style":38718},[2000,38816,38817],{"x":38762,"y":17036,"fill":2583,"style":2605},"stream 
mode",[2585,38819],{"x":38768,"y":26360,"width":26402,"height":2590,"rx":1503,"fill":38769,"stroke":2593,"style":11105},[2000,38821,571],{"x":38772,"y":38806,"fill":38773,"style":38718},[2000,38823,38776],{"x":38772,"y":17036,"fill":38773,"style":2605},[2585,38825],{"x":16990,"y":26360,"width":26402,"height":2590,"rx":1503,"fill":38826,"stroke":2593,"style":11105},"#fef9c3",[2000,38828,38589],{"x":38782,"y":38806,"fill":38829,"style":38718},"#ca8a04",[2000,38831,38832],{"x":38782,"y":17036,"fill":2583,"style":2605},"OpenCV overhead",[2585,38834],{"x":38788,"y":26360,"width":11095,"height":2590,"rx":1503,"fill":38835,"stroke":2593,"style":11105},"#fef3c7",[2000,38837,38840],{"x":38791,"y":16982,"fill":38838,"style":38839},"#b45309","text-anchor:middle;font-size:12px;font-weight:bold","Ghostscript",[2000,38842,38844],{"x":38791,"y":38843,"fill":38838,"style":38839},"196","+ OpenCV",[2585,38846],{"x":2591,"y":38847,"width":38713,"height":2597,"rx":1503,"fill":2615,"stroke":2593,"style":11105},"236",[2000,38849,36791],{"x":38740,"y":38850,"fill":2599,"style":16979},"272",[2000,38852,38853],{"x":38740,"y":11231,"fill":2583,"style":2605},"MIT · needs JVM",[2585,38855],{"x":38748,"y":38856,"width":26402,"height":2590,"rx":1503,"fill":11166,"stroke":2593,"style":11105},"244",[2000,38858,38526],{"x":26345,"y":38859,"fill":2592,"style":38718},"269",[2000,38861,38863],{"x":26345,"y":38862,"fill":38809,"style":2605},"287","Java Tabula 
lib",[2585,38865],{"x":38759,"y":38856,"width":26402,"height":2590,"rx":1503,"fill":38826,"stroke":2593,"style":11105},[2000,38867,38540],{"x":38762,"y":38859,"fill":38829,"style":38718},[2000,38869,38691],{"x":38762,"y":38862,"fill":2583,"style":2605},[2585,38871],{"x":38768,"y":38856,"width":26402,"height":2590,"rx":1503,"fill":38769,"stroke":2593,"style":11105},[2000,38873,571],{"x":38772,"y":38859,"fill":38773,"style":38718},[2000,38875,38776],{"x":38772,"y":38862,"fill":38773,"style":2605},[2585,38877],{"x":16990,"y":38856,"width":26402,"height":2590,"rx":1503,"fill":38779,"stroke":2593,"style":11105},[2000,38879,38880],{"x":38782,"y":38859,"fill":17010,"style":38718},"Fast (batch)",[2000,38882,38883],{"x":38782,"y":38862,"fill":2583,"style":2605},"JVM amortized",[2585,38885],{"x":38788,"y":38856,"width":11095,"height":2590,"rx":1503,"fill":38835,"stroke":2593,"style":11105},[2000,38887,38888],{"x":38791,"y":38850,"fill":38838,"style":38718},"Java JRE",[2000,38890,38891],{"x":38791,"y":11231,"fill":2583,"style":2605},"on PATH",[18,38893,38895],{"id":38894},"_6-edge-cases-and-variants","6. Edge Cases and Variants",[424,38897,38899],{"id":38898},"variant-a-rotated-tables","Variant A: Rotated Tables",[14,38901,38902,38903,38906],{},"pdfplumber handles rotated pages via ",[30,38904,38905],{},".rotate",", but its table extractor assumes upright coordinate axes. 
For pages rotated 90° or 270°, normalize first:",[23,38908,38910],{"className":126,"code":38909,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Frotated_report.pdf\")\n\ndef extract_rotated(pdf_path: Path) -> list[pd.DataFrame]:\n    dfs = []\n    try:\n        with pdfplumber.open(pdf_path) as pdf:\n            for page in pdf.pages:\n                # Rotate back to upright before extracting\n                upright = page.rotate(page.rotation * -1) if page.rotation else page\n                for tbl in upright.extract_tables():\n                    if tbl and len(tbl) > 1:\n                        dfs.append(pd.DataFrame(tbl[1:], columns=tbl[0]))\n    except Exception as e:\n        print(f\"Error: {e}\")\n    return dfs\n\nif __name__ == \"__main__\":\n    tables = extract_rotated(PDF_PATH)\n    print(f\"Extracted {len(tables)} table(s) from rotated PDF\")\n",[30,38911,38912,38916,38926,38932,38942,38946,38959,38963,38972,38980,38986,38996,39006,39011,39040,39051,39069,39088,39098,39119,39125,39129,39141,39154],{"__ignoreMap":28},[33,38913,38914],{"class":35,"line":36},[33,38915,7041],{"class":39},[33,38917,38918,38920,38922,38924],{"class":35,"line":43},[33,38919,190],{"class":163},[33,38921,193],{"class":167},[33,38923,164],{"class":163},[33,38925,198],{"class":167},[33,38927,38928,38930],{"class":35,"line":61},[33,38929,164],{"class":163},[33,38931,485],{"class":167},[33,38933,38934,38936,38938,38940],{"class":35,"line":73},[33,38935,164],{"class":163},[33,38937,492],{"class":167},[33,38939,495],{"class":163},[33,38941,498],{"class":167},[33,38943,38944],{"class":35,"line":88},[33,38945,92],{"emptyLinePlaceholder":91},[33,38947,38948,38950,38952,38954,38957],{"class":35,"line":95},[33,38949,7076],{"class":50},[33,38951,212],{"class":163},[33,38953,215],{"class":167},[33,38955,38956],{"class":54},"\"data\u002Frotated_report.pdf\"",[33,38958,221]
,{"class":167},[33,38960,38961],{"class":35,"line":101},[33,38962,92],{"emptyLinePlaceholder":91},[33,38964,38965,38967,38970],{"class":35,"line":171},[33,38966,562],{"class":163},[33,38968,38969],{"class":46}," extract_rotated",[33,38971,5947],{"class":167},[33,38973,38974,38976,38978],{"class":35,"line":179},[33,38975,37500],{"class":167},[33,38977,242],{"class":163},[33,38979,589],{"class":167},[33,38981,38982,38984],{"class":35,"line":187},[33,38983,2424],{"class":163},[33,38985,574],{"class":167},[33,38987,38988,38990,38992,38994],{"class":35,"line":201},[33,38989,2191],{"class":163},[33,38991,681],{"class":167},[33,38993,495],{"class":163},[33,38995,686],{"class":167},[33,38997,38998,39000,39002,39004],{"class":35,"line":206},[33,38999,1793],{"class":163},[33,39001,695],{"class":167},[33,39003,662],{"class":163},[33,39005,700],{"class":167},[33,39007,39008],{"class":35,"line":224},[33,39009,39010],{"class":39},"                # Rotate back to upright before extracting\n",[33,39012,39013,39016,39018,39021,39023,39026,39028,39030,39032,39035,39037],{"class":35,"line":229},[33,39014,39015],{"class":167},"                upright ",[33,39017,242],{"class":163},[33,39019,39020],{"class":167}," page.rotate(page.rotation ",[33,39022,1769],{"class":163},[33,39024,39025],{"class":163}," -",[33,39027,734],{"class":50},[33,39029,1649],{"class":167},[33,39031,2491],{"class":163},[33,39033,39034],{"class":167}," page.rotation ",[33,39036,7489],{"class":163},[33,39038,39039],{"class":167}," page\n",[33,39041,39042,39044,39046,39048],{"class":35,"line":235},[33,39043,692],{"class":163},[33,39045,7154],{"class":167},[33,39047,662],{"class":163},[33,39049,39050],{"class":167}," 
upright.extract_tables():\n",[33,39052,39053,39055,39057,39059,39061,39063,39065,39067],{"class":35,"line":250},[33,39054,717],{"class":163},[33,39056,7154],{"class":167},[33,39058,6001],{"class":163},[33,39060,4037],{"class":50},[33,39062,7179],{"class":167},[33,39064,6009],{"class":163},[33,39066,1814],{"class":50},[33,39068,574],{"class":167},[33,39070,39071,39074,39076,39078,39080,39082,39084,39086],{"class":35,"line":266},[33,39072,39073],{"class":167},"                        dfs.append(pd.DataFrame(tbl[",[33,39075,734],{"class":50},[33,39077,737],{"class":167},[33,39079,740],{"class":238},[33,39081,242],{"class":163},[33,39083,7206],{"class":167},[33,39085,748],{"class":50},[33,39087,7211],{"class":167},[33,39089,39090,39092,39094,39096],{"class":35,"line":290},[33,39091,2449],{"class":163},[33,39093,783],{"class":50},[33,39095,1852],{"class":163},[33,39097,7583],{"class":167},[33,39099,39100,39102,39104,39106,39109,39111,39113,39115,39117],{"class":35,"line":295},[33,39101,9414],{"class":50},[33,39103,602],{"class":167},[33,39105,4059],{"class":163},[33,39107,39108],{"class":54},"\"Error: ",[33,39110,1115],{"class":50},[33,39112,7602],{"class":167},[33,39114,1121],{"class":50},[33,39116,274],{"class":54},[33,39118,221],{"class":167},[33,39120,39121,39123],{"class":35,"line":300},[33,39122,1332],{"class":163},[33,39124,37688],{"class":167},[33,39126,39127],{"class":35,"line":317},[33,39128,92],{"emptyLinePlaceholder":91},[33,39130,39131,39133,39135,39137,39139],{"class":35,"line":332},[33,39132,2491],{"class":163},[33,39134,2494],{"class":50},[33,39136,2497],{"class":163},[33,39138,2500],{"class":54},[33,39140,574],{"class":167},[33,39142,39143,39145,39147,39150,39152],{"class":35,"line":347},[33,39144,37709],{"class":167},[33,39146,242],{"class":163},[33,39148,39149],{"class":167}," 
extract_rotated(",[33,39151,7076],{"class":50},[33,39153,221],{"class":167},[33,39155,39156,39158,39160,39162,39164,39166,39169,39171,39174],{"class":35,"line":374},[33,39157,7268],{"class":50},[33,39159,602],{"class":167},[33,39161,4059],{"class":163},[33,39163,8142],{"class":54},[33,39165,4065],{"class":50},[33,39167,39168],{"class":167},"(tables)",[33,39170,1121],{"class":50},[33,39172,39173],{"class":54}," table(s) from rotated PDF\"",[33,39175,221],{"class":167},[424,39177,39179],{"id":39178},"variant-b-multi-page-tables-spanning-page-breaks","Variant B: Multi-Page Tables Spanning Page Breaks",[14,39181,39182],{},"Tables split across pages arrive as separate DataFrames. Detect repeating header rows and concatenate:",[23,39184,39186],{"className":126,"code":39185,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Fmulti_page_table.pdf\")\n\ndef extract_multipage_table(pdf_path: Path) -> pd.DataFrame:\n    \"\"\"Concatenate tables across pages, stripping repeated headers.\"\"\"\n    all_dfs: list[pd.DataFrame] = []\n    header: list | None = None\n    try:\n        with pdfplumber.open(pdf_path) as pdf:\n            for page in pdf.pages:\n                for tbl in page.extract_tables():\n                    if not tbl:\n                        continue\n                    if header is None:\n                        header = tbl[0]\n                        rows = tbl[1:]\n                    else:\n                        # Skip row if it repeats the header (page-break artefact)\n                        rows = tbl[1:] if tbl[0] == header else tbl\n                    if rows:\n                        all_dfs.append(pd.DataFrame(rows, columns=header))\n    except Exception as e:\n        print(f\"Error reading {pdf_path}: {e}\")\n        return pd.DataFrame()\n    return pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()\n\nif 
__name__ == \"__main__\":\n    df = extract_multipage_table(PDF_PATH)\n    print(df.shape)\n    df.to_csv(\"output\u002Fmerged_table.csv\", index=False)\n",[30,39187,39188,39192,39202,39208,39218,39222,39235,39239,39248,39253,39262,39277,39283,39293,39303,39313,39321,39325,39337,39351,39365,39372,39377,39407,39413,39425,39435,39464,39470,39494,39498,39510,39523,39530],{"__ignoreMap":28},[33,39189,39190],{"class":35,"line":36},[33,39191,7041],{"class":39},[33,39193,39194,39196,39198,39200],{"class":35,"line":43},[33,39195,190],{"class":163},[33,39197,193],{"class":167},[33,39199,164],{"class":163},[33,39201,198],{"class":167},[33,39203,39204,39206],{"class":35,"line":61},[33,39205,164],{"class":163},[33,39207,485],{"class":167},[33,39209,39210,39212,39214,39216],{"class":35,"line":73},[33,39211,164],{"class":163},[33,39213,492],{"class":167},[33,39215,495],{"class":163},[33,39217,498],{"class":167},[33,39219,39220],{"class":35,"line":88},[33,39221,92],{"emptyLinePlaceholder":91},[33,39223,39224,39226,39228,39230,39233],{"class":35,"line":95},[33,39225,7076],{"class":50},[33,39227,212],{"class":163},[33,39229,215],{"class":167},[33,39231,39232],{"class":54},"\"data\u002Fmulti_page_table.pdf\"",[33,39234,221],{"class":167},[33,39236,39237],{"class":35,"line":101},[33,39238,92],{"emptyLinePlaceholder":91},[33,39240,39241,39243,39246],{"class":35,"line":171},[33,39242,562],{"class":163},[33,39244,39245],{"class":46}," extract_multipage_table",[33,39247,27038],{"class":167},[33,39249,39250],{"class":35,"line":179},[33,39251,39252],{"class":54},"    \"\"\"Concatenate tables across pages, stripping repeated headers.\"\"\"\n",[33,39254,39255,39258,39260],{"class":35,"line":187},[33,39256,39257],{"class":167},"    all_dfs: list[pd.DataFrame] ",[33,39259,242],{"class":163},[33,39261,589],{"class":167},[33,39263,39264,39267,39269,39271,39273,39275],{"class":35,"line":201},[33,39265,39266],{"class":167},"    header: 
",[33,39268,25066],{"class":50},[33,39270,2850],{"class":163},[33,39272,7657],{"class":50},[33,39274,212],{"class":163},[33,39276,3852],{"class":50},[33,39278,39279,39281],{"class":35,"line":206},[33,39280,2424],{"class":163},[33,39282,574],{"class":167},[33,39284,39285,39287,39289,39291],{"class":35,"line":224},[33,39286,2191],{"class":163},[33,39288,681],{"class":167},[33,39290,495],{"class":163},[33,39292,686],{"class":167},[33,39294,39295,39297,39299,39301],{"class":35,"line":229},[33,39296,1793],{"class":163},[33,39298,695],{"class":167},[33,39300,662],{"class":163},[33,39302,700],{"class":167},[33,39304,39305,39307,39309,39311],{"class":35,"line":235},[33,39306,692],{"class":163},[33,39308,7154],{"class":167},[33,39310,662],{"class":163},[33,39312,27124],{"class":167},[33,39314,39315,39317,39319],{"class":35,"line":250},[33,39316,717],{"class":163},[33,39318,620],{"class":163},[33,39320,14920],{"class":167},[33,39322,39323],{"class":35,"line":266},[33,39324,7458],{"class":163},[33,39326,39327,39329,39331,39333,39335],{"class":35,"line":290},[33,39328,717],{"class":163},[33,39330,17788],{"class":167},[33,39332,3847],{"class":163},[33,39334,7657],{"class":50},[33,39336,574],{"class":167},[33,39338,39339,39342,39344,39347,39349],{"class":35,"line":295},[33,39340,39341],{"class":167},"                        header ",[33,39343,242],{"class":163},[33,39345,39346],{"class":167}," tbl[",[33,39348,748],{"class":50},[33,39350,9202],{"class":167},[33,39352,39353,39356,39358,39360,39362],{"class":35,"line":300},[33,39354,39355],{"class":167},"                        rows ",[33,39357,242],{"class":163},[33,39359,39346],{"class":167},[33,39361,734],{"class":50},[33,39363,39364],{"class":167},":]\n",[33,39366,39367,39370],{"class":35,"line":317},[33,39368,39369],{"class":163},"                    else",[33,39371,574],{"class":167},[33,39373,39374],{"class":35,"line":332},[33,39375,39376],{"class":39},"                        # Skip row if it repeats the header (page-break 
artefact)\n",[33,39378,39379,39381,39383,39385,39387,39390,39392,39394,39396,39398,39400,39402,39404],{"class":35,"line":347},[33,39380,39355],{"class":167},[33,39382,242],{"class":163},[33,39384,39346],{"class":167},[33,39386,734],{"class":50},[33,39388,39389],{"class":167},":] ",[33,39391,2491],{"class":163},[33,39393,39346],{"class":167},[33,39395,748],{"class":50},[33,39397,763],{"class":167},[33,39399,1865],{"class":163},[33,39401,17788],{"class":167},[33,39403,7489],{"class":163},[33,39405,39406],{"class":167}," tbl\n",[33,39408,39409,39411],{"class":35,"line":374},[33,39410,717],{"class":163},[33,39412,8723],{"class":167},[33,39414,39415,39418,39420,39422],{"class":35,"line":397},[33,39416,39417],{"class":167},"                        all_dfs.append(pd.DataFrame(rows, ",[33,39419,740],{"class":238},[33,39421,242],{"class":163},[33,39423,39424],{"class":167},"header))\n",[33,39426,39427,39429,39431,39433],{"class":35,"line":653},[33,39428,2449],{"class":163},[33,39430,783],{"class":50},[33,39432,1852],{"class":163},[33,39434,7583],{"class":167},[33,39436,39437,39439,39441,39443,39446,39448,39450,39452,39454,39456,39458,39460,39462],{"class":35,"line":667},[33,39438,9414],{"class":50},[33,39440,602],{"class":167},[33,39442,4059],{"class":163},[33,39444,39445],{"class":54},"\"Error reading ",[33,39447,1115],{"class":50},[33,39449,27069],{"class":167},[33,39451,1121],{"class":50},[33,39453,2079],{"class":54},[33,39455,1115],{"class":50},[33,39457,7602],{"class":167},[33,39459,1121],{"class":50},[33,39461,274],{"class":54},[33,39463,221],{"class":167},[33,39465,39466,39468],{"class":35,"line":675},[33,39467,1659],{"class":163},[33,39469,7721],{"class":167},[33,39471,39472,39474,39477,39479,39481,39483,39485,39487,39490,39492],{"class":35,"line":689},[33,39473,1332],{"class":163},[33,39475,39476],{"class":167}," pd.concat(all_dfs, 
",[33,39478,850],{"class":238},[33,39480,242],{"class":163},[33,39482,855],{"class":50},[33,39484,1649],{"class":167},[33,39486,2491],{"class":163},[33,39488,39489],{"class":167}," all_dfs ",[33,39491,7489],{"class":163},[33,39493,7721],{"class":167},[33,39495,39496],{"class":35,"line":703},[33,39497,92],{"emptyLinePlaceholder":91},[33,39499,39500,39502,39504,39506,39508],{"class":35,"line":714},[33,39501,2491],{"class":163},[33,39503,2494],{"class":50},[33,39505,2497],{"class":163},[33,39507,2500],{"class":54},[33,39509,574],{"class":167},[33,39511,39512,39514,39516,39519,39521],{"class":35,"line":723},[33,39513,4025],{"class":167},[33,39515,242],{"class":163},[33,39517,39518],{"class":167}," extract_multipage_table(",[33,39520,7076],{"class":50},[33,39522,221],{"class":167},[33,39524,39525,39527],{"class":35,"line":754},[33,39526,7268],{"class":50},[33,39528,39529],{"class":167},"(df.shape)\n",[33,39531,39532,39535,39538,39540,39542,39544,39546],{"class":35,"line":771},[33,39533,39534],{"class":167},"    df.to_csv(",[33,39536,39537],{"class":54},"\"output\u002Fmerged_table.csv\"",[33,39539,365],{"class":167},[33,39541,897],{"class":238},[33,39543,242],{"class":163},[33,39545,902],{"class":50},[33,39547,221],{"class":167},[14,39549,39550,39551,39553],{},"The ",[940,39552,9592],{"href":942}," guide covers deduplication patterns in more detail.",[424,39555,39557],{"id":39556},"variant-c-password-protected-pdfs","Variant C: Password-Protected PDFs",[14,39559,39560,39561,39564],{},"All three libraries accept a ",[30,39562,39563],{},"password"," parameter. 
Load it from the environment rather than hardcoding:",[23,39566,39568],{"className":126,"code":39567,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nimport os\nfrom pathlib import Path\nimport pdfplumber\n\nPDF_PATH = Path(\"data\u002Fprotected.pdf\")\nPDF_PASSWORD = os.environ.get(\"PDF_PASSWORD\", \"\")\n\nwith pdfplumber.open(PDF_PATH, password=PDF_PASSWORD) as pdf:\n    page = pdf.pages[0]\n    print(page.extract_text()[:200])\n",[30,39569,39570,39574,39580,39590,39596,39600,39613,39631,39635,39658,39671],{"__ignoreMap":28},[33,39571,39572],{"class":35,"line":36},[33,39573,9763],{"class":39},[33,39575,39576,39578],{"class":35,"line":43},[33,39577,164],{"class":163},[33,39579,176],{"class":167},[33,39581,39582,39584,39586,39588],{"class":35,"line":61},[33,39583,190],{"class":163},[33,39585,193],{"class":167},[33,39587,164],{"class":163},[33,39589,198],{"class":167},[33,39591,39592,39594],{"class":35,"line":73},[33,39593,164],{"class":163},[33,39595,485],{"class":167},[33,39597,39598],{"class":35,"line":88},[33,39599,92],{"emptyLinePlaceholder":91},[33,39601,39602,39604,39606,39608,39611],{"class":35,"line":95},[33,39603,7076],{"class":50},[33,39605,212],{"class":163},[33,39607,215],{"class":167},[33,39609,39610],{"class":54},"\"data\u002Fprotected.pdf\"",[33,39612,221],{"class":167},[33,39614,39615,39618,39620,39622,39625,39627,39629],{"class":35,"line":101},[33,39616,39617],{"class":50},"PDF_PASSWORD",[33,39619,212],{"class":163},[33,39621,3129],{"class":167},[33,39623,39624],{"class":54},"\"PDF_PASSWORD\"",[33,39626,365],{"class":167},[33,39628,3198],{"class":54},[33,39630,221],{"class":167},[33,39632,39633],{"class":35,"line":171},[33,39634,92],{"emptyLinePlaceholder":91},[33,39636,39637,39639,39642,39644,39646,39648,39650,39652,39654,39656],{"class":35,"line":179},[33,39638,22271],{"class":163},[33,39640,39641],{"class":167}," 
pdfplumber.open(",[33,39643,7076],{"class":50},[33,39645,365],{"class":167},[33,39647,39563],{"class":238},[33,39649,242],{"class":163},[33,39651,39617],{"class":50},[33,39653,1649],{"class":167},[33,39655,495],{"class":163},[33,39657,686],{"class":167},[33,39659,39660,39663,39665,39667,39669],{"class":35,"line":187},[33,39661,39662],{"class":167},"    page ",[33,39664,242],{"class":163},[33,39666,9870],{"class":167},[33,39668,748],{"class":50},[33,39670,9202],{"class":167},[33,39672,39673,39675,39678,39680],{"class":35,"line":201},[33,39674,7268],{"class":50},[33,39676,39677],{"class":167},"(page.extract_text()[:",[33,39679,2611],{"class":50},[33,39681,751],{"class":167},[14,39683,39684,39685,36661,39688,39690,39691,36661,39693,3035],{},"For tabula-py, pass ",[30,39686,39687],{},"password=PDF_PASSWORD",[30,39689,35183],{},". For camelot, pass ",[30,39692,39687],{},[30,39694,39695],{},"camelot.read_pdf()",[18,39697,39699],{"id":39698},"_7-validation","7. Validation",[14,39701,39702],{},"After extraction, verify correctness before downstream processing. 
Do not trust shape alone — check a known cell value:",[23,39704,39706],{"className":126,"code":39705,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\ndef validate_extraction(df: pd.DataFrame, expected_cols: int, sample_cell: str | None = None) -> bool:\n    \"\"\"Assert basic structural correctness of an extracted DataFrame.\"\"\"\n    assert not df.empty, \"Extracted DataFrame is empty\"\n    assert df.shape[1] == expected_cols, (\n        f\"Column count mismatch: got {df.shape[1]}, expected {expected_cols}\"\n    )\n    if sample_cell:\n        found = df.apply(lambda col: col.astype(str).str.contains(sample_cell, na=False)).any().any()\n        assert found, f\"Expected sample value '{sample_cell}' not found in DataFrame\"\n    print(f\"Validation passed: {df.shape[0]} rows x {df.shape[1]} cols\")\n    return True\n",[30,39707,39708,39712,39722,39726,39757,39762,39774,39789,39816,39820,39827,39858,39880,39919],{"__ignoreMap":28},[33,39709,39710],{"class":35,"line":36},[33,39711,8895],{"class":39},[33,39713,39714,39716,39718,39720],{"class":35,"line":43},[33,39715,164],{"class":163},[33,39717,492],{"class":167},[33,39719,495],{"class":163},[33,39721,498],{"class":167},[33,39723,39724],{"class":35,"line":61},[33,39725,92],{"emptyLinePlaceholder":91},[33,39727,39728,39730,39733,39736,39738,39741,39743,39745,39747,39749,39751,39753,39755],{"class":35,"line":73},[33,39729,562],{"class":163},[33,39731,39732],{"class":46}," validate_extraction",[33,39734,39735],{"class":167},"(df: pd.DataFrame, expected_cols: ",[33,39737,1059],{"class":50},[33,39739,39740],{"class":167},", sample_cell: ",[33,39742,1053],{"class":50},[33,39744,2850],{"class":163},[33,39746,7657],{"class":50},[33,39748,212],{"class":163},[33,39750,7657],{"class":50},[33,39752,1617],{"class":167},[33,39754,2821],{"class":50},[33,39756,574],{"class":167},[33,39758,39759],{"class":35,"line":88},[33,39760,39761],{"class":54},"    \"\"\"Assert basic structural correctness of 
an extracted DataFrame.\"\"\"\n",[33,39763,39764,39766,39768,39771],{"class":35,"line":95},[33,39765,9228],{"class":163},[33,39767,620],{"class":163},[33,39769,39770],{"class":167}," df.empty, ",[33,39772,39773],{"class":54},"\"Extracted DataFrame is empty\"\n",[33,39775,39776,39778,39780,39782,39784,39786],{"class":35,"line":101},[33,39777,9228],{"class":163},[33,39779,9516],{"class":167},[33,39781,734],{"class":50},[33,39783,763],{"class":167},[33,39785,1865],{"class":163},[33,39787,39788],{"class":167}," expected_cols, (\n",[33,39790,39791,39793,39796,39798,39800,39802,39804,39806,39808,39810,39812,39814],{"class":35,"line":171},[33,39792,9533],{"class":163},[33,39794,39795],{"class":54},"\"Column count mismatch: got ",[33,39797,1115],{"class":50},[33,39799,9541],{"class":167},[33,39801,734],{"class":50},[33,39803,9546],{"class":167},[33,39805,1121],{"class":50},[33,39807,9551],{"class":54},[33,39809,1115],{"class":50},[33,39811,12914],{"class":167},[33,39813,1121],{"class":50},[33,39815,7504],{"class":54},[33,39817,39818],{"class":35,"line":179},[33,39819,1202],{"class":167},[33,39821,39822,39824],{"class":35,"line":187},[33,39823,617],{"class":163},[33,39825,39826],{"class":167}," sample_cell:\n",[33,39828,39829,39832,39834,39837,39840,39843,39845,39848,39851,39853,39855],{"class":35,"line":201},[33,39830,39831],{"class":167},"        found ",[33,39833,242],{"class":163},[33,39835,39836],{"class":167}," df.apply(",[33,39838,39839],{"class":163},"lambda",[33,39841,39842],{"class":167}," col: col.astype(",[33,39844,1053],{"class":50},[33,39846,39847],{"class":167},").str.contains(sample_cell, ",[33,39849,39850],{"class":238},"na",[33,39852,242],{"class":163},[33,39854,902],{"class":50},[33,39856,39857],{"class":167},")).any().any()\n",[33,39859,39860,39862,39865,39867,39870,39872,39875,39877],{"class":35,"line":206},[33,39861,21485],{"class":163},[33,39863,39864],{"class":167}," found, ",[33,39866,4059],{"class":163},[33,39868,39869],{"class":54},"\"Expected 
sample value '",[33,39871,1115],{"class":50},[33,39873,39874],{"class":167},"sample_cell",[33,39876,1121],{"class":50},[33,39878,39879],{"class":54},"' not found in DataFrame\"\n",[33,39881,39882,39884,39886,39888,39891,39893,39895,39897,39899,39901,39904,39906,39908,39910,39912,39914,39917],{"class":35,"line":224},[33,39883,7268],{"class":50},[33,39885,602],{"class":167},[33,39887,4059],{"class":163},[33,39889,39890],{"class":54},"\"Validation passed: ",[33,39892,1115],{"class":50},[33,39894,9541],{"class":167},[33,39896,748],{"class":50},[33,39898,9546],{"class":167},[33,39900,1121],{"class":50},[33,39902,39903],{"class":54}," rows x ",[33,39905,1115],{"class":50},[33,39907,9541],{"class":167},[33,39909,734],{"class":50},[33,39911,9546],{"class":167},[33,39913,1121],{"class":50},[33,39915,39916],{"class":54}," cols\"",[33,39918,221],{"class":167},[33,39920,39921,39923],{"class":35,"line":229},[33,39922,1332],{"class":163},[33,39924,2887],{"class":50},[14,39926,39927,39928,39930],{},"For data that feeds reporting pipelines — see ",[940,39929,26258],{"href":26257}," — also check numeric column dtypes after coercion.",[18,39932,39934],{"id":39933},"_8-performance-and-scale-notes","8. Performance and Scale Notes",[14,39936,39937,39940,39941,39943,39944,39947],{},[1974,39938,39939],{},"JVM startup cost for tabula-py:"," The Java process starts fresh per Python session. Amortize it by calling ",[30,39942,35183],{}," in bulk rather than spawning multiple subprocesses. For batch processing 500+ PDFs, consider ",[30,39945,39946],{},"tabula.convert_into_by_batch()"," which passes a directory to the Java JAR directly.",[14,39949,39950,39953,39954,39956],{},[1974,39951,39952],{},"camelot memory use:"," camelot loads each page as an OpenCV image matrix. High-DPI PDFs or documents with many pages can exhaust RAM. 
Process in chunks and delete intermediate ",[30,39955,38568],{}," objects explicitly.",[14,39958,39959,39962],{},[1974,39960,39961],{},"pdfplumber chunking:"," For very large files, iterate pages lazily and write each table to CSV immediately rather than holding all DataFrames in memory:",[23,39964,39966],{"className":126,"code":39965,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Flarge_report.pdf\")\nOUT_DIR  = Path(\"output\u002Ftables\")\nOUT_DIR.mkdir(parents=True, exist_ok=True)\n\nwith pdfplumber.open(PDF_PATH) as pdf:\n    for page_num, page in enumerate(pdf.pages):\n        for tbl_num, tbl in enumerate(page.extract_tables()):\n            if tbl and len(tbl) > 1:\n                df = pd.DataFrame(tbl[1:], columns=tbl[0])\n                path = OUT_DIR \u002F f\"page{page_num+1}_tbl{tbl_num+1}.csv\"\n                df.to_csv(path, index=False)\n",[30,39967,39968,39972,39982,39988,39998,40002,40014,40028,40050,40054,40068,40081,40095,40113,40135,40177],{"__ignoreMap":28},[33,39969,39970],{"class":35,"line":36},[33,39971,7041],{"class":39},[33,39973,39974,39976,39978,39980],{"class":35,"line":43},[33,39975,190],{"class":163},[33,39977,193],{"class":167},[33,39979,164],{"class":163},[33,39981,198],{"class":167},[33,39983,39984,39986],{"class":35,"line":61},[33,39985,164],{"class":163},[33,39987,485],{"class":167},[33,39989,39990,39992,39994,39996],{"class":35,"line":73},[33,39991,164],{"class":163},[33,39993,492],{"class":167},[33,39995,495],{"class":163},[33,39997,498],{"class":167},[33,39999,40000],{"class":35,"line":88},[33,40001,92],{"emptyLinePlaceholder":91},[33,40003,40004,40006,40008,40010,40012],{"class":35,"line":95},[33,40005,7076],{"class":50},[33,40007,212],{"class":163},[33,40009,215],{"class":167},[33,40011,36194],{"class":54},[33,40013,221],{"class":167},[33,40015,40016,40019,40021,40023,40026],{"class":35,"line":101},[33,400
17,40018],{"class":50},"OUT_DIR",[33,40020,17208],{"class":163},[33,40022,215],{"class":167},[33,40024,40025],{"class":54},"\"output\u002Ftables\"",[33,40027,221],{"class":167},[33,40029,40030,40032,40034,40036,40038,40040,40042,40044,40046,40048],{"class":35,"line":171},[33,40031,40018],{"class":50},[33,40033,1078],{"class":167},[33,40035,869],{"class":238},[33,40037,242],{"class":163},[33,40039,855],{"class":50},[33,40041,365],{"class":167},[33,40043,878],{"class":238},[33,40045,242],{"class":163},[33,40047,855],{"class":50},[33,40049,221],{"class":167},[33,40051,40052],{"class":35,"line":179},[33,40053,92],{"emptyLinePlaceholder":91},[33,40055,40056,40058,40060,40062,40064,40066],{"class":35,"line":187},[33,40057,22271],{"class":163},[33,40059,39641],{"class":167},[33,40061,7076],{"class":50},[33,40063,1649],{"class":167},[33,40065,495],{"class":163},[33,40067,686],{"class":167},[33,40069,40070,40072,40074,40076,40078],{"class":35,"line":201},[33,40071,656],{"class":163},[33,40073,7398],{"class":167},[33,40075,662],{"class":163},[33,40077,7403],{"class":50},[33,40079,40080],{"class":167},"(pdf.pages):\n",[33,40082,40083,40085,40088,40090,40092],{"class":35,"line":206},[33,40084,5973],{"class":163},[33,40086,40087],{"class":167}," tbl_num, tbl 
",[33,40089,662],{"class":163},[33,40091,7403],{"class":50},[33,40093,40094],{"class":167},"(page.extract_tables()):\n",[33,40096,40097,40099,40101,40103,40105,40107,40109,40111],{"class":35,"line":224},[33,40098,5995],{"class":163},[33,40100,7154],{"class":167},[33,40102,6001],{"class":163},[33,40104,4037],{"class":50},[33,40106,7179],{"class":167},[33,40108,6009],{"class":163},[33,40110,1814],{"class":50},[33,40112,574],{"class":167},[33,40114,40115,40117,40119,40121,40123,40125,40127,40129,40131,40133],{"class":35,"line":229},[33,40116,6019],{"class":167},[33,40118,242],{"class":163},[33,40120,37631],{"class":167},[33,40122,734],{"class":50},[33,40124,737],{"class":167},[33,40126,740],{"class":238},[33,40128,242],{"class":163},[33,40130,7206],{"class":167},[33,40132,748],{"class":50},[33,40134,751],{"class":167},[33,40136,40137,40140,40142,40145,40147,40149,40152,40154,40157,40159,40162,40165,40167,40170,40172,40174],{"class":35,"line":235},[33,40138,40139],{"class":167},"                path ",[33,40141,242],{"class":163},[33,40143,40144],{"class":50}," OUT_DIR",[33,40146,1107],{"class":163},[33,40148,1110],{"class":163},[33,40150,40151],{"class":54},"\"page",[33,40153,1115],{"class":50},[33,40155,40156],{"class":167},"page_num",[33,40158,1811],{"class":163},[33,40160,40161],{"class":50},"1}",[33,40163,40164],{"class":54},"_tbl",[33,40166,1115],{"class":50},[33,40168,40169],{"class":167},"tbl_num",[33,40171,1811],{"class":163},[33,40173,40161],{"class":50},[33,40175,40176],{"class":54},".csv\"\n",[33,40178,40179,40182,40184,40186,40188],{"class":35,"line":250},[33,40180,40181],{"class":167},"                df.to_csv(path, ",[33,40183,897],{"class":238},[33,40185,242],{"class":163},[33,40187,902],{"class":50},[33,40189,221],{"class":167},[18,40191,40193],{"id":40192},"_9-troubleshooting","9. 
Troubleshooting",[4273,40195,40196,40206],{},[4276,40197,40198],{},[4279,40199,40200,40202,40204],{},[4282,40201,14317],{},[4282,40203,4287],{},[4282,40205,4290],{},[4292,40207,40208,40232,40249,40267,40280,40296],{},[4279,40209,40210,40214,40221],{},[4297,40211,40212],{},[30,40213,37024],{},[4297,40215,40216,40217,40220],{},"camelot needs ",[30,40218,40219],{},"gs"," on PATH",[4297,40222,40223,2012,40226,40229,40230],{},[30,40224,40225],{},"apt install ghostscript",[30,40227,40228],{},"brew install ghostscript","; see ",[940,40231,9739],{"href":9738},[4279,40233,40234,40241,40244],{},[4297,40235,40236,36608,40238],{},[30,40237,34964],{},[30,40239,40240],{},"java not found",[4297,40242,40243],{},"tabula-py cannot find JRE",[4297,40245,40246,40247],{},"Install JDK, add to PATH; see ",[940,40248,34953],{"href":37035},[4279,40250,40251,40256,40259],{},[4297,40252,40253],{},[30,40254,40255],{},"camelot returns 0 tables",[4297,40257,40258],{},"Wrong flavor for table type",[4297,40260,40261,40262,40264,40265],{},"Try ",[30,40263,11068],{}," if no visible borders; inspect ",[30,40266,38617],{},[4279,40268,40269,40274,40277],{},[4297,40270,40271],{},[30,40272,40273],{},"tabula returns garbled Unicode",[4297,40275,40276],{},"PDF uses CIDFont or custom encoding",[4297,40278,40279],{},"Switch to pdfplumber; tabula's Java layer does not handle all font encodings",[4279,40281,40282,40287,40290],{},[4297,40283,40284],{},[30,40285,40286],{},"pdfplumber returns None cells",[4297,40288,40289],{},"Table has merged\u002Fspanning cells",[4297,40291,17059,40292,40295],{},[30,40293,40294],{},".extract_table(table_settings={\"snap_tolerance\": 5})"," and forward-fill",[4279,40297,40298,40304,40307],{},[4297,40299,40300,40303],{},[30,40301,40302],{},"tabula.read_pdf"," hangs",[4297,40305,40306],{},"JVM OOM on large PDF",[4297,40308,4358,40309,40312],{},[30,40310,40311],{},"java_options=[\"-Xmx512m\"]","; split the PDF first",[18,40314,40316],{"id":40315},"_10-decision-guide","10. 
Decision Guide",[14,40318,40319],{},"Pick your library with three questions:",[35387,40321,40322,40335,40350],{},[4214,40323,40324,40327],{},[1974,40325,40326],{},"Can you install system dependencies?",[4211,40328,40329,40332],{},[4214,40330,40331],{},"No → use pdfplumber (pure Python, no system deps).",[4214,40333,40334],{},"Yes → continue.",[4214,40336,40337,40340],{},[1974,40338,40339],{},"Does your table have visible borders?",[4211,40341,40342,40345],{},[4214,40343,40344],{},"Yes, clean grid lines → try camelot lattice first (best accuracy signal); tabula is a solid alternative.",[4214,40346,40347,40348,3035],{},"No border lines → camelot stream or pdfplumber with ",[30,40349,37405],{},[4214,40351,40352,40355],{},[1974,40353,40354],{},"Is this a scanned image PDF?",[4211,40356,40357],{},[4214,40358,40359,40360,40362],{},"Yes → OCR it first with Tesseract (see ",[940,40361,36756],{"href":26957},"), then re-run extraction on the text-layer output.",[18,40364,40366],{"id":40365},"_11-complete-script-camelot-with-pdfplumber-fallback","11. Complete Script: camelot with pdfplumber Fallback",[14,40368,40369,40370,40373],{},"This production-ready script tries camelot lattice, falls back to camelot stream, then falls back to pdfplumber. It accepts a file path via ",[30,40371,40372],{},"argparse"," and writes one CSV per extracted table.",[23,40375,40377],{"className":126,"code":40376,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n# pip install camelot-py[cv] pdfplumber pandas opencv-python-headless\n# System deps: ghostscript (for camelot)\n\"\"\"\nextract_tables.py — extract all tables from a PDF, camelot → pdfplumber fallback.\nUsage: python extract_tables.py report.pdf --out output\u002F\n\"\"\"\nimport argparse\nimport sys\nfrom pathlib import Path\n\nimport pandas as pd\n\ndef _try_camelot(pdf_path: Path, out_dir: Path) -> bool:\n    \"\"\"Attempt extraction with camelot lattice then stream. 
Return True on success.\"\"\"\n    try:\n        import camelot\n    except ImportError:\n        print(\"camelot not installed, skipping\")\n        return False\n\n    for flavor in (\"lattice\", \"stream\"):\n        try:\n            tables = camelot.read_pdf(str(pdf_path), pages=\"all\", flavor=flavor)\n            if tables.n == 0:\n                continue\n            for i, tbl in enumerate(tables):\n                df = tbl.df.copy()\n                # Promote first row to header if it looks like one\n                if df.shape[0] > 1:\n                    df.columns = df.iloc[0].tolist()\n                    df = df.iloc[1:].reset_index(drop=True)\n                out_path = out_dir \u002F f\"camelot_{flavor}_tbl{i+1}.csv\"\n                df.to_csv(out_path, index=False)\n                print(f\"Saved {out_path}  (accuracy {tbl.parsing_report['accuracy']:.1f}%)\")\n            return True\n        except Exception as exc:\n            print(f\"camelot {flavor} failed: {exc}\")\n    return False\n\n\ndef _try_pdfplumber(pdf_path: Path, out_dir: Path) -> bool:\n    \"\"\"Fallback extraction with pdfplumber. 
Return True on success.\"\"\"\n    try:\n        import pdfplumber\n    except ImportError:\n        print(\"pdfplumber not installed, cannot fall back\")\n        return False\n\n    count = 0\n    try:\n        with pdfplumber.open(pdf_path) as pdf:\n            for page_num, page in enumerate(pdf.pages):\n                for tbl_num, raw in enumerate(page.extract_tables()):\n                    if not raw or len(raw) \u003C 2:\n                        continue\n                    df = pd.DataFrame(raw[1:], columns=raw[0])\n                    out_path = out_dir \u002F f\"pdfplumber_p{page_num+1}_tbl{tbl_num+1}.csv\"\n                    df.to_csv(out_path, index=False)\n                    print(f\"Saved {out_path}\")\n                    count += 1\n    except Exception as exc:\n        print(f\"pdfplumber failed: {exc}\")\n        return False\n    return count > 0\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Extract tables from a PDF file.\")\n    parser.add_argument(\"pdf\", type=Path, help=\"Path to input PDF\")\n    parser.add_argument(\"--out\", type=Path, default=Path(\"output\"), help=\"Output directory\")\n    args = parser.parse_args()\n\n    if not args.pdf.exists():\n        sys.exit(f\"File not found: {args.pdf}\")\n\n    args.out.mkdir(parents=True, exist_ok=True)\n\n    if not _try_camelot(args.pdf, args.out):\n        print(\"camelot produced no tables, falling back to pdfplumber\")\n        if not _try_pdfplumber(args.pdf, args.out):\n            sys.exit(\"All extraction methods failed. 
Check that the PDF contains selectable text.\")\n        else:\n            print(\"Extraction complete via pdfplumber fallback.\")\n    else:\n        print(\"Extraction complete via camelot.\")\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,40378,40379,40383,40388,40393,40397,40402,40407,40411,40417,40423,40433,40437,40447,40451,40465,40470,40476,40482,40491,40502,40508,40512,40530,40536,40566,40579,40583,40596,40605,40610,40626,40640,40660,40695,40708,40745,40751,40761,40790,40796,40800,40804,40817,40822,40828,40834,40842,40853,40859,40863,40872,40878,40888,40900,40913,40933,40937,40959,40995,41008,41029,41038,41048,41069,41075,41086,41090,41094,41106,41123,41146,41180,41188,41192,41200,41218,41222,41243,41247,41256,41267,41276,41286,41293,41304,41310,41321,41325,41329,41341],{"__ignoreMap":28},[33,40380,40381],{"class":35,"line":36},[33,40382,14447],{"class":39},[33,40384,40385],{"class":35,"line":43},[33,40386,40387],{"class":39},"# pip install camelot-py[cv] pdfplumber pandas opencv-python-headless\n",[33,40389,40390],{"class":35,"line":61},[33,40391,40392],{"class":39},"# System deps: ghostscript (for camelot)\n",[33,40394,40395],{"class":35,"line":73},[33,40396,139],{"class":54},[33,40398,40399],{"class":35,"line":88},[33,40400,40401],{"class":54},"extract_tables.py — extract all tables from a PDF, camelot → pdfplumber fallback.\n",[33,40403,40404],{"class":35,"line":95},[33,40405,40406],{"class":54},"Usage: python extract_tables.py report.pdf --out 
output\u002F\n",[33,40408,40409],{"class":35,"line":101},[33,40410,139],{"class":54},[33,40412,40413,40415],{"class":35,"line":171},[33,40414,164],{"class":163},[33,40416,4461],{"class":167},[33,40418,40419,40421],{"class":35,"line":179},[33,40420,164],{"class":163},[33,40422,168],{"class":167},[33,40424,40425,40427,40429,40431],{"class":35,"line":187},[33,40426,190],{"class":163},[33,40428,193],{"class":167},[33,40430,164],{"class":163},[33,40432,198],{"class":167},[33,40434,40435],{"class":35,"line":201},[33,40436,92],{"emptyLinePlaceholder":91},[33,40438,40439,40441,40443,40445],{"class":35,"line":206},[33,40440,164],{"class":163},[33,40442,492],{"class":167},[33,40444,495],{"class":163},[33,40446,498],{"class":167},[33,40448,40449],{"class":35,"line":224},[33,40450,92],{"emptyLinePlaceholder":91},[33,40452,40453,40455,40458,40461,40463],{"class":35,"line":229},[33,40454,562],{"class":163},[33,40456,40457],{"class":46}," _try_camelot",[33,40459,40460],{"class":167},"(pdf_path: Path, out_dir: Path) -> ",[33,40462,2821],{"class":50},[33,40464,574],{"class":167},[33,40466,40467],{"class":35,"line":235},[33,40468,40469],{"class":54},"    \"\"\"Attempt extraction with camelot lattice then stream. 
Return True on success.\"\"\"\n",[33,40471,40472,40474],{"class":35,"line":250},[33,40473,2424],{"class":163},[33,40475,574],{"class":167},[33,40477,40478,40480],{"class":35,"line":266},[33,40479,3388],{"class":163},[33,40481,10567],{"class":167},[33,40483,40484,40486,40489],{"class":35,"line":290},[33,40485,2449],{"class":163},[33,40487,40488],{"class":50}," ImportError",[33,40490,574],{"class":167},[33,40492,40493,40495,40497,40500],{"class":35,"line":295},[33,40494,9414],{"class":50},[33,40496,602],{"class":167},[33,40498,40499],{"class":54},"\"camelot not installed, skipping\"",[33,40501,221],{"class":167},[33,40503,40504,40506],{"class":35,"line":300},[33,40505,1659],{"class":163},[33,40507,2903],{"class":50},[33,40509,40510],{"class":35,"line":317},[33,40511,92],{"emptyLinePlaceholder":91},[33,40513,40514,40516,40518,40520,40522,40524,40526,40528],{"class":35,"line":332},[33,40515,656],{"class":163},[33,40517,15750],{"class":167},[33,40519,662],{"class":163},[33,40521,17583],{"class":167},[33,40523,10985],{"class":54},[33,40525,365],{"class":167},[33,40527,13407],{"class":54},[33,40529,1737],{"class":167},[33,40531,40532,40534],{"class":35,"line":347},[33,40533,670],{"class":163},[33,40535,574],{"class":167},[33,40537,40538,40541,40543,40546,40548,40551,40553,40555,40557,40559,40561,40563],{"class":35,"line":374},[33,40539,40540],{"class":167},"            tables ",[33,40542,242],{"class":163},[33,40544,40545],{"class":167}," camelot.read_pdf(",[33,40547,1053],{"class":50},[33,40549,40550],{"class":167},"(pdf_path), ",[33,40552,10971],{"class":238},[33,40554,242],{"class":163},[33,40556,35616],{"class":54},[33,40558,365],{"class":167},[33,40560,10748],{"class":238},[33,40562,242],{"class":163},[33,40564,40565],{"class":167},"flavor)\n",[33,40567,40568,40570,40573,40575,40577],{"class":35,"line":397},[33,40569,5995],{"class":163},[33,40571,40572],{"class":167}," tables.n 
",[33,40574,1865],{"class":163},[33,40576,10791],{"class":50},[33,40578,574],{"class":167},[33,40580,40581],{"class":35,"line":653},[33,40582,12315],{"class":163},[33,40584,40585,40587,40590,40592,40594],{"class":35,"line":667},[33,40586,1793],{"class":163},[33,40588,40589],{"class":167}," i, tbl ",[33,40591,662],{"class":163},[33,40593,7403],{"class":50},[33,40595,37731],{"class":167},[33,40597,40598,40600,40602],{"class":35,"line":675},[33,40599,6019],{"class":167},[33,40601,242],{"class":163},[33,40603,40604],{"class":167}," tbl.df.copy()\n",[33,40606,40607],{"class":35,"line":689},[33,40608,40609],{"class":39},"                # Promote first row to header if it looks like one\n",[33,40611,40612,40614,40616,40618,40620,40622,40624],{"class":35,"line":703},[33,40613,7170],{"class":163},[33,40615,9516],{"class":167},[33,40617,748],{"class":50},[33,40619,763],{"class":167},[33,40621,6009],{"class":163},[33,40623,1814],{"class":50},[33,40625,574],{"class":167},[33,40627,40628,40631,40633,40635,40637],{"class":35,"line":714},[33,40629,40630],{"class":167},"                    df.columns ",[33,40632,242],{"class":163},[33,40634,10847],{"class":167},[33,40636,748],{"class":50},[33,40638,40639],{"class":167},"].tolist()\n",[33,40641,40642,40644,40646,40648,40650,40652,40654,40656,40658],{"class":35,"line":723},[33,40643,7533],{"class":167},[33,40645,242],{"class":163},[33,40647,10847],{"class":167},[33,40649,734],{"class":50},[33,40651,10865],{"class":167},[33,40653,10868],{"class":238},[33,40655,242],{"class":163},[33,40657,855],{"class":50},[33,40659,221],{"class":167},[33,40661,40662,40665,40667,40670,40672,40674,40677,40679,40681,40683,40685,40687,40689,40691,40693],{"class":35,"line":754},[33,40663,40664],{"class":167},"                out_path ",[33,40666,242],{"class":163},[33,40668,40669],{"class":167}," out_dir 
",[33,40671,1351],{"class":163},[33,40673,1110],{"class":163},[33,40675,40676],{"class":54},"\"camelot_",[33,40678,1115],{"class":50},[33,40680,10748],{"class":167},[33,40682,1121],{"class":50},[33,40684,40164],{"class":54},[33,40686,1115],{"class":50},[33,40688,7499],{"class":167},[33,40690,1811],{"class":163},[33,40692,40161],{"class":50},[33,40694,40176],{"class":54},[33,40696,40697,40700,40702,40704,40706],{"class":35,"line":771},[33,40698,40699],{"class":167},"                df.to_csv(out_path, ",[33,40701,897],{"class":238},[33,40703,242],{"class":163},[33,40705,902],{"class":50},[33,40707,221],{"class":167},[33,40709,40710,40712,40714,40716,40718,40720,40723,40725,40728,40730,40732,40734,40736,40738,40740,40743],{"class":35,"line":777},[33,40711,8264],{"class":50},[33,40713,602],{"class":167},[33,40715,4059],{"class":163},[33,40717,16008],{"class":54},[33,40719,1115],{"class":50},[33,40721,40722],{"class":167},"out_path",[33,40724,1121],{"class":50},[33,40726,40727],{"class":54},"  (accuracy ",[33,40729,1115],{"class":50},[33,40731,38022],{"class":167},[33,40733,38025],{"class":54},[33,40735,9546],{"class":167},[33,40737,18438],{"class":163},[33,40739,1121],{"class":50},[33,40741,40742],{"class":54},"%)\"",[33,40744,221],{"class":167},[33,40746,40747,40749],{"class":35,"line":788},[33,40748,28782],{"class":163},[33,40750,2887],{"class":50},[33,40752,40753,40755,40757,40759],{"class":35,"line":804},[33,40754,780],{"class":163},[33,40756,783],{"class":50},[33,40758,1852],{"class":163},[33,40760,1855],{"class":167},[33,40762,40763,40765,40767,40769,40772,40774,40776,40778,40780,40782,40784,40786,40788],{"class":35,"line":809},[33,40764,9364],{"class":50},[33,40766,602],{"class":167},[33,40768,4059],{"class":163},[33,40770,40771],{"class":54},"\"camelot 
",[33,40773,1115],{"class":50},[33,40775,10748],{"class":167},[33,40777,1121],{"class":50},[33,40779,1899],{"class":54},[33,40781,1115],{"class":50},[33,40783,6565],{"class":167},[33,40785,1121],{"class":50},[33,40787,274],{"class":54},[33,40789,221],{"class":167},[33,40791,40792,40794],{"class":35,"line":819},[33,40793,1332],{"class":163},[33,40795,2903],{"class":50},[33,40797,40798],{"class":35,"line":829},[33,40799,92],{"emptyLinePlaceholder":91},[33,40801,40802],{"class":35,"line":834},[33,40803,92],{"emptyLinePlaceholder":91},[33,40805,40806,40808,40811,40813,40815],{"class":35,"line":839},[33,40807,562],{"class":163},[33,40809,40810],{"class":46}," _try_pdfplumber",[33,40812,40460],{"class":167},[33,40814,2821],{"class":50},[33,40816,574],{"class":167},[33,40818,40819],{"class":35,"line":860},[33,40820,40821],{"class":54},"    \"\"\"Fallback extraction with pdfplumber. Return True on success.\"\"\"\n",[33,40823,40824,40826],{"class":35,"line":887},[33,40825,2424],{"class":163},[33,40827,574],{"class":167},[33,40829,40830,40832],{"class":35,"line":907},[33,40831,3388],{"class":163},[33,40833,485],{"class":167},[33,40835,40836,40838,40840],{"class":35,"line":1826},[33,40837,2449],{"class":163},[33,40839,40488],{"class":50},[33,40841,574],{"class":167},[33,40843,40844,40846,40848,40851],{"class":35,"line":1844},[33,40845,9414],{"class":50},[33,40847,602],{"class":167},[33,40849,40850],{"class":54},"\"pdfplumber not installed, cannot fall back\"",[33,40852,221],{"class":167},[33,40854,40855,40857],{"class":35,"line":1858},[33,40856,1659],{"class":163},[33,40858,2903],{"class":50},[33,40860,40861],{"class":35,"line":1871},[33,40862,92],{"emptyLinePlaceholder":91},[33,40864,40865,40868,40870],{"class":35,"line":1877},[33,40866,40867],{"class":167},"    count 
",[33,40869,242],{"class":163},[33,40871,28914],{"class":50},[33,40873,40874,40876],{"class":35,"line":1883},[33,40875,2424],{"class":163},[33,40877,574],{"class":167},[33,40879,40880,40882,40884,40886],{"class":35,"line":1915},[33,40881,2191],{"class":163},[33,40883,681],{"class":167},[33,40885,495],{"class":163},[33,40887,686],{"class":167},[33,40889,40890,40892,40894,40896,40898],{"class":35,"line":1926},[33,40891,1793],{"class":163},[33,40893,7398],{"class":167},[33,40895,662],{"class":163},[33,40897,7403],{"class":50},[33,40899,40080],{"class":167},[33,40901,40902,40904,40907,40909,40911],{"class":35,"line":1932},[33,40903,692],{"class":163},[33,40905,40906],{"class":167}," tbl_num, raw ",[33,40908,662],{"class":163},[33,40910,7403],{"class":50},[33,40912,40094],{"class":167},[33,40914,40915,40917,40919,40921,40923,40925,40927,40929,40931],{"class":35,"line":1938},[33,40916,717],{"class":163},[33,40918,620],{"class":163},[33,40920,7422],{"class":167},[33,40922,7162],{"class":163},[33,40924,4037],{"class":50},[33,40926,7446],{"class":167},[33,40928,4043],{"class":163},[33,40930,7451],{"class":50},[33,40932,574],{"class":167},[33,40934,40935],{"class":35,"line":1950},[33,40936,7458],{"class":163},[33,40938,40939,40941,40943,40945,40947,40949,40951,40953,40955,40957],{"class":35,"line":1958},[33,40940,7533],{"class":167},[33,40942,242],{"class":163},[33,40944,7538],{"class":167},[33,40946,734],{"class":50},[33,40948,737],{"class":167},[33,40950,740],{"class":238},[33,40952,242],{"class":163},[33,40954,13789],{"class":167},[33,40956,748],{"class":50},[33,40958,751],{"class":167},[33,40960,40961,40964,40966,40968,40970,40972,40975,40977,40979,40981,40983,40985,40987,40989,40991,40993],{"class":35,"line":4904},[33,40962,40963],{"class":167},"                    out_path 
",[33,40965,242],{"class":163},[33,40967,40669],{"class":167},[33,40969,1351],{"class":163},[33,40971,1110],{"class":163},[33,40973,40974],{"class":54},"\"pdfplumber_p",[33,40976,1115],{"class":50},[33,40978,40156],{"class":167},[33,40980,1811],{"class":163},[33,40982,40161],{"class":50},[33,40984,40164],{"class":54},[33,40986,1115],{"class":50},[33,40988,40169],{"class":167},[33,40990,1811],{"class":163},[33,40992,40161],{"class":50},[33,40994,40176],{"class":54},[33,40996,40997,41000,41002,41004,41006],{"class":35,"line":4909},[33,40998,40999],{"class":167},"                    df.to_csv(out_path, ",[33,41001,897],{"class":238},[33,41003,242],{"class":163},[33,41005,902],{"class":50},[33,41007,221],{"class":167},[33,41009,41010,41013,41015,41017,41019,41021,41023,41025,41027],{"class":35,"line":4915},[33,41011,41012],{"class":50},"                    print",[33,41014,602],{"class":167},[33,41016,4059],{"class":163},[33,41018,16008],{"class":54},[33,41020,1115],{"class":50},[33,41022,40722],{"class":167},[33,41024,1121],{"class":50},[33,41026,274],{"class":54},[33,41028,221],{"class":167},[33,41030,41031,41034,41036],{"class":35,"line":4925},[33,41032,41033],{"class":167},"                    count ",[33,41035,28976],{"class":163},[33,41037,17709],{"class":50},[33,41039,41040,41042,41044,41046],{"class":35,"line":4935},[33,41041,2449],{"class":163},[33,41043,783],{"class":50},[33,41045,1852],{"class":163},[33,41047,1855],{"class":167},[33,41049,41050,41052,41054,41056,41059,41061,41063,41065,41067],{"class":35,"line":4941},[33,41051,9414],{"class":50},[33,41053,602],{"class":167},[33,41055,4059],{"class":163},[33,41057,41058],{"class":54},"\"pdfplumber failed: 
",[33,41060,1115],{"class":50},[33,41062,6565],{"class":167},[33,41064,1121],{"class":50},[33,41066,274],{"class":54},[33,41068,221],{"class":167},[33,41070,41071,41073],{"class":35,"line":4950},[33,41072,1659],{"class":163},[33,41074,2903],{"class":50},[33,41076,41077,41079,41082,41084],{"class":35,"line":4960},[33,41078,1332],{"class":163},[33,41080,41081],{"class":167}," count ",[33,41083,6009],{"class":163},[33,41085,28914],{"class":50},[33,41087,41088],{"class":35,"line":4965},[33,41089,92],{"emptyLinePlaceholder":91},[33,41091,41092],{"class":35,"line":4971},[33,41093,92],{"emptyLinePlaceholder":91},[33,41095,41096,41098,41100,41102,41104],{"class":35,"line":4983},[33,41097,562],{"class":163},[33,41099,6636],{"class":46},[33,41101,568],{"class":167},[33,41103,571],{"class":50},[33,41105,574],{"class":167},[33,41107,41108,41110,41112,41114,41116,41118,41121],{"class":35,"line":4988},[33,41109,6648],{"class":167},[33,41111,242],{"class":163},[33,41113,6653],{"class":167},[33,41115,6656],{"class":238},[33,41117,242],{"class":163},[33,41119,41120],{"class":54},"\"Extract tables from a PDF file.\"",[33,41122,221],{"class":167},[33,41124,41125,41127,41129,41131,41133,41135,41137,41139,41141,41144],{"class":35,"line":4993},[33,41126,6669],{"class":167},[33,41128,15519],{"class":54},[33,41130,365],{"class":167},[33,41132,6677],{"class":238},[33,41134,242],{"class":163},[33,41136,6682],{"class":167},[33,41138,25463],{"class":238},[33,41140,242],{"class":163},[33,41142,41143],{"class":54},"\"Path to input 
PDF\"",[33,41145,221],{"class":167},[33,41147,41148,41150,41153,41155,41157,41159,41161,41163,41165,41167,41170,41172,41174,41176,41178],{"class":35,"line":5003},[33,41149,6669],{"class":167},[33,41151,41152],{"class":54},"\"--out\"",[33,41154,365],{"class":167},[33,41156,6677],{"class":238},[33,41158,242],{"class":163},[33,41160,6682],{"class":167},[33,41162,6685],{"class":238},[33,41164,242],{"class":163},[33,41166,15641],{"class":167},[33,41168,41169],{"class":54},"\"output\"",[33,41171,18525],{"class":167},[33,41173,25463],{"class":238},[33,41175,242],{"class":163},[33,41177,25501],{"class":54},[33,41179,221],{"class":167},[33,41181,41182,41184,41186],{"class":35,"line":5008},[33,41183,6766],{"class":167},[33,41185,242],{"class":163},[33,41187,6771],{"class":167},[33,41189,41190],{"class":35,"line":5014},[33,41191,92],{"emptyLinePlaceholder":91},[33,41193,41194,41196,41198],{"class":35,"line":5019},[33,41195,617],{"class":163},[33,41197,620],{"class":163},[33,41199,15668],{"class":167},[33,41201,41202,41204,41206,41208,41210,41212,41214,41216],{"class":35,"line":5032},[33,41203,2995],{"class":167},[33,41205,4059],{"class":163},[33,41207,15677],{"class":54},[33,41209,1115],{"class":50},[33,41211,15682],{"class":167},[33,41213,1121],{"class":50},[33,41215,274],{"class":54},[33,41217,221],{"class":167},[33,41219,41220],{"class":35,"line":5039},[33,41221,92],{"emptyLinePlaceholder":91},[33,41223,41224,41227,41229,41231,41233,41235,41237,41239,41241],{"class":35,"line":5068},[33,41225,41226],{"class":167},"    
args.out.mkdir(",[33,41228,869],{"class":238},[33,41230,242],{"class":163},[33,41232,855],{"class":50},[33,41234,365],{"class":167},[33,41236,878],{"class":238},[33,41238,242],{"class":163},[33,41240,855],{"class":50},[33,41242,221],{"class":167},[33,41244,41245],{"class":35,"line":5077},[33,41246,92],{"emptyLinePlaceholder":91},[33,41248,41249,41251,41253],{"class":35,"line":5082},[33,41250,617],{"class":163},[33,41252,620],{"class":163},[33,41254,41255],{"class":167}," _try_camelot(args.pdf, args.out):\n",[33,41257,41258,41260,41262,41265],{"class":35,"line":5089},[33,41259,9414],{"class":50},[33,41261,602],{"class":167},[33,41263,41264],{"class":54},"\"camelot produced no tables, falling back to pdfplumber\"",[33,41266,221],{"class":167},[33,41268,41269,41271,41273],{"class":35,"line":5098},[33,41270,8221],{"class":163},[33,41272,620],{"class":163},[33,41274,41275],{"class":167}," _try_pdfplumber(args.pdf, args.out):\n",[33,41277,41278,41281,41284],{"class":35,"line":5105},[33,41279,41280],{"class":167},"            sys.exit(",[33,41282,41283],{"class":54},"\"All extraction methods failed. 
Check that the PDF contains selectable text.\"",[33,41285,221],{"class":167},[33,41287,41288,41291],{"class":35,"line":5110},[33,41289,41290],{"class":163},"        else",[33,41292,574],{"class":167},[33,41294,41295,41297,41299,41302],{"class":35,"line":5115},[33,41296,9364],{"class":50},[33,41298,602],{"class":167},[33,41300,41301],{"class":54},"\"Extraction complete via pdfplumber fallback.\"",[33,41303,221],{"class":167},[33,41305,41306,41308],{"class":35,"line":5128},[33,41307,6864],{"class":163},[33,41309,574],{"class":167},[33,41311,41312,41314,41316,41319],{"class":35,"line":5135},[33,41313,9414],{"class":50},[33,41315,602],{"class":167},[33,41317,41318],{"class":54},"\"Extraction complete via camelot.\"",[33,41320,221],{"class":167},[33,41322,41323],{"class":35,"line":5142},[33,41324,92],{"emptyLinePlaceholder":91},[33,41326,41327],{"class":35,"line":5151},[33,41328,92],{"emptyLinePlaceholder":91},[33,41330,41331,41333,41335,41337,41339],{"class":35,"line":5156},[33,41332,2491],{"class":163},[33,41334,2494],{"class":50},[33,41336,2497],{"class":163},[33,41338,2500],{"class":54},[33,41340,574],{"class":167},[33,41342,41343],{"class":35,"line":5161},[33,41344,6914],{"class":167},[14,41346,41347],{},"Run it:",[23,41349,41351],{"className":25,"code":41350,"language":27,"meta":28,"style":28},"python extract_tables.py report.pdf --out output\u002Ftables\u002F\n",[30,41352,41353],{"__ignoreMap":28},[33,41354,41355,41357,41360,41363,41366],{"class":35,"line":36},[33,41356,47],{"class":46},[33,41358,41359],{"class":54}," extract_tables.py",[33,41361,41362],{"class":54}," report.pdf",[33,41364,41365],{"class":50}," --out",[33,41367,41368],{"class":54}," output\u002Ftables\u002F\n",[18,41370,6918],{"id":6917},[4211,41372,41373,41378,41383,41388,41393],{},[4214,41374,41375,41377],{},[940,41376,9592],{"href":942}," — coordinate mapping, multi-page iteration, and structured CSV export",[4214,41379,41380,41382],{},[940,41381,9739],{"href":9738}," — Ghostscript and OpenCV 
install issues on Debian\u002FUbuntu",[4214,41384,41385,41387],{},[940,41386,34953],{"href":37035}," — JRE install, PATH config, and Docker variants",[4214,41389,41390,41392],{},[940,41391,36756],{"href":26957}," — pre-process scanned PDFs before running any of these libraries",[4214,41394,41395,41397],{},[940,41396,26258],{"href":26257}," — downstream pandas workflows for the DataFrames you extract",[14,41399,6947,41400,3035],{},[940,41401,6943],{"href":6942},[6953,41403,41404],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki 
.sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":41406},[41407,41408,41409,41410,41415,41416,41421,41422,41423,41424,41425,41426],{"id":36811,"depth":43,"text":36812},{"id":36849,"depth":43,"text":36850},{"id":37038,"depth":43,"text":37039},{"id":37413,"depth":43,"text":37414,"children":41411},[41412,41413,41414],{"id":37417,"depth":61,"text":37418},{"id":37774,"depth":61,"text":37775},{"id":38176,"depth":61,"text":38177},{"id":38497,"depth":43,"text":38498},{"id":38894,"depth":43,"text":38895,"children":41417},[41418,41419,41420],{"id":38898,"depth":61,"text":38899},{"id":39178,"depth":61,"text":39179},{"id":39556,"depth":61,"text":39557},{"id":39698,"depth":43,"text":39699},{"id":39933,"depth":43,"text":39934},{"id":40192,"depth":43,"text":40193},{"id":40315,"depth":43,"text":40316},{"id":40365,"depth":43,"text":40366},{"id":6917,"depth":43,"text":6918},"Compare PDF Table Libraries","Side-by-side comparison of pdfplumber, camelot, and tabula-py for PDF table extraction. 
Covers install deps, lattice vs stream, accuracy, speed, and a fallback script.",{},"\u002Fautomating-pdf-extraction-generation\u002Fcomparing-pdf-table-extraction-libraries",{"title":9606,"description":41428},"pdfplumber vs camelot vs tabula-py Comparison","automating-pdf-extraction-generation\u002Fcomparing-pdf-table-extraction-libraries\u002Findex",[47,9631,943,16139,36791],"8k2TinUaLlvgus6VSPXz0StyJzljZE2q8HMzsGzgY_s",{"id":41437,"title":9739,"body":41438,"breadcrumbTitle":43136,"canonical":6977,"date":6978,"description":43137,"draft":6980,"extension":6981,"image":6977,"meta":43138,"navigation":91,"path":43139,"robots":6977,"seo":43140,"seoTitle":43141,"stem":43142,"tags":43143,"updatedAt":6978,"__hash__":43145},"content\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Ffix-camelot-import-error-on-linux\u002Findex.md",{"type":7,"value":41439,"toc":43114},[41440,41443,41449,41455,41467,41469,41475,41516,41522,41524,41527,41790,41793,41799,41809,41816,41819,41905,41915,41922,41930,41961,41967,41992,42006,42057,42064,42074,42080,42083,42162,42173,42186,42189,42231,42243,42252,42258,42264,42287,42290,42316,42320,42323,42329,42332,42358,42361,42376,42383,42406,42410,42414,42426,42470,42479,42483,42486,42537,42541,42550,42579,42589,42593,42596,42717,42721,42732,42808,42814,42816,42819,43071,43083,43085,43107,43111],[10,41441,9739],{"id":41442},"fix-camelot-import-error-on-linux",[14,41444,41445,41446,41448],{},"Importing ",[940,41447,16139],{"href":942}," on a fresh Linux environment raises one of three errors. 
The exact message depends on which system dependency is missing:",[23,41450,41453],{"className":41451,"code":41452,"language":2000},[1998],"ImportError: No module named 'camelot'\nModuleNotFoundError: No module named 'cv2'\nOSError: ghostscript not found on PATH\n",[30,41454,41452],{"__ignoreMap":28},[14,41456,41457,41458,10065,41460,41463,41464,3035],{},"All three errors come from the same root cause: camelot's lattice mode requires two system-level binaries — ",[1974,41459,38840],{},[1974,41461,41462],{},"OpenCV"," — that are not installed by a plain ",[30,41465,41466],{},"pip install camelot-py",[18,41468,7021],{"id":7020},[14,41470,41471,41474],{},[30,41472,41473],{},"camelot-py"," is published in two variants on PyPI:",[4273,41476,41477,41490],{},[4276,41478,41479],{},[4279,41480,41481,41484,41487],{},[4282,41482,41483],{},"Package",[4282,41485,41486],{},"What it includes",[4282,41488,41489],{},"When it fails",[4292,41491,41492,41504],{},[4279,41493,41494,41498,41501],{},[4297,41495,41496],{},[30,41497,41473],{},[4297,41499,41500],{},"Core library only",[4297,41502,41503],{},"Lattice mode crashes — no OpenCV",[4279,41505,41506,41510,41513],{},[4297,41507,41508],{},[30,41509,38669],{},[4297,41511,41512],{},"Core + OpenCV bindings",[4297,41514,41515],{},"Works if Ghostscript binary is also installed",[14,41517,39550,41518,41521],{},[30,41519,41520],{},"cv2"," (OpenCV) module handles image processing for lattice-mode table detection. Ghostscript is called as a subprocess to convert PDFs to intermediate images before processing. 
Neither ships inside a wheel — they are system-level dependencies.",[18,41523,35017],{"id":35016},[14,41525,41526],{},"Run this before attempting any fix to confirm which component is missing:",[23,41528,41530],{"className":126,"code":41529,"language":47,"meta":28,"style":28},"# No pip install needed — this only tests imports\ndef diagnose_camelot() -> dict:\n    \"\"\"Check which camelot dependencies are present.\"\"\"\n    status = {}\n\n    try:\n        import camelot\n        status[\"camelot\"] = camelot.__version__\n    except ImportError as e:\n        status[\"camelot\"] = f\"MISSING: {e}\"\n\n    try:\n        import cv2\n        status[\"cv2\"] = cv2.__version__\n    except ImportError as e:\n        status[\"cv2\"] = f\"MISSING: {e}\"\n\n    import shutil\n    gs = shutil.which(\"ghostscript\") or shutil.which(\"gs\")\n    status[\"ghostscript_binary\"] = gs or \"NOT FOUND\"\n\n    return status\n\nif __name__ == \"__main__\":\n    import pprint\n    pprint.pprint(diagnose_camelot())\n",[30,41531,41532,41537,41550,41555,41564,41568,41574,41580,41598,41608,41631,41635,41641,41648,41664,41674,41696,41700,41707,41731,41751,41755,41762,41766,41778,41785],{"__ignoreMap":28},[33,41533,41534],{"class":35,"line":36},[33,41535,41536],{"class":39},"# No pip install needed — this only tests imports\n",[33,41538,41539,41541,41544,41546,41548],{"class":35,"line":43},[33,41540,562],{"class":163},[33,41542,41543],{"class":46}," diagnose_camelot",[33,41545,568],{"class":167},[33,41547,37100],{"class":50},[33,41549,574],{"class":167},[33,41551,41552],{"class":35,"line":61},[33,41553,41554],{"class":54},"    \"\"\"Check which camelot dependencies are present.\"\"\"\n",[33,41556,41557,41560,41562],{"class":35,"line":73},[33,41558,41559],{"class":167},"    status 
",[33,41561,242],{"class":163},[33,41563,14093],{"class":167},[33,41565,41566],{"class":35,"line":88},[33,41567,92],{"emptyLinePlaceholder":91},[33,41569,41570,41572],{"class":35,"line":95},[33,41571,2424],{"class":163},[33,41573,574],{"class":167},[33,41575,41576,41578],{"class":35,"line":101},[33,41577,3388],{"class":163},[33,41579,10567],{"class":167},[33,41581,41582,41585,41588,41590,41592,41595],{"class":35,"line":171},[33,41583,41584],{"class":167},"        status[",[33,41586,41587],{"class":54},"\"camelot\"",[33,41589,763],{"class":167},[33,41591,242],{"class":163},[33,41593,41594],{"class":167}," camelot.",[33,41596,41597],{"class":50},"__version__\n",[33,41599,41600,41602,41604,41606],{"class":35,"line":179},[33,41601,2449],{"class":163},[33,41603,40488],{"class":50},[33,41605,1852],{"class":163},[33,41607,7583],{"class":167},[33,41609,41610,41612,41614,41616,41618,41620,41623,41625,41627,41629],{"class":35,"line":187},[33,41611,41584],{"class":167},[33,41613,41587],{"class":54},[33,41615,763],{"class":167},[33,41617,242],{"class":163},[33,41619,1110],{"class":163},[33,41621,41622],{"class":54},"\"MISSING: ",[33,41624,1115],{"class":50},[33,41626,7602],{"class":167},[33,41628,1121],{"class":50},[33,41630,7504],{"class":54},[33,41632,41633],{"class":35,"line":201},[33,41634,92],{"emptyLinePlaceholder":91},[33,41636,41637,41639],{"class":35,"line":206},[33,41638,2424],{"class":163},[33,41640,574],{"class":167},[33,41642,41643,41645],{"class":35,"line":224},[33,41644,3388],{"class":163},[33,41646,41647],{"class":167}," cv2\n",[33,41649,41650,41652,41655,41657,41659,41662],{"class":35,"line":229},[33,41651,41584],{"class":167},[33,41653,41654],{"class":54},"\"cv2\"",[33,41656,763],{"class":167},[33,41658,242],{"class":163},[33,41660,41661],{"class":167}," 
cv2.",[33,41663,41597],{"class":50},[33,41665,41666,41668,41670,41672],{"class":35,"line":235},[33,41667,2449],{"class":163},[33,41669,40488],{"class":50},[33,41671,1852],{"class":163},[33,41673,7583],{"class":167},[33,41675,41676,41678,41680,41682,41684,41686,41688,41690,41692,41694],{"class":35,"line":250},[33,41677,41584],{"class":167},[33,41679,41654],{"class":54},[33,41681,763],{"class":167},[33,41683,242],{"class":163},[33,41685,1110],{"class":163},[33,41687,41622],{"class":54},[33,41689,1115],{"class":50},[33,41691,7602],{"class":167},[33,41693,1121],{"class":50},[33,41695,7504],{"class":54},[33,41697,41698],{"class":35,"line":266},[33,41699,92],{"emptyLinePlaceholder":91},[33,41701,41702,41704],{"class":35,"line":290},[33,41703,1627],{"class":163},[33,41705,41706],{"class":167}," shutil\n",[33,41708,41709,41712,41714,41717,41720,41722,41724,41726,41729],{"class":35,"line":295},[33,41710,41711],{"class":167},"    gs ",[33,41713,242],{"class":163},[33,41715,41716],{"class":167}," shutil.which(",[33,41718,41719],{"class":54},"\"ghostscript\"",[33,41721,1649],{"class":167},[33,41723,7162],{"class":163},[33,41725,41716],{"class":167},[33,41727,41728],{"class":54},"\"gs\"",[33,41730,221],{"class":167},[33,41732,41733,41736,41739,41741,41743,41746,41748],{"class":35,"line":300},[33,41734,41735],{"class":167},"    status[",[33,41737,41738],{"class":54},"\"ghostscript_binary\"",[33,41740,763],{"class":167},[33,41742,242],{"class":163},[33,41744,41745],{"class":167}," gs ",[33,41747,7162],{"class":163},[33,41749,41750],{"class":54}," \"NOT FOUND\"\n",[33,41752,41753],{"class":35,"line":317},[33,41754,92],{"emptyLinePlaceholder":91},[33,41756,41757,41759],{"class":35,"line":332},[33,41758,1332],{"class":163},[33,41760,41761],{"class":167}," 
status\n",[33,41763,41764],{"class":35,"line":347},[33,41765,92],{"emptyLinePlaceholder":91},[33,41767,41768,41770,41772,41774,41776],{"class":35,"line":374},[33,41769,2491],{"class":163},[33,41771,2494],{"class":50},[33,41773,2497],{"class":163},[33,41775,2500],{"class":54},[33,41777,574],{"class":167},[33,41779,41780,41782],{"class":35,"line":397},[33,41781,1627],{"class":163},[33,41783,41784],{"class":167}," pprint\n",[33,41786,41787],{"class":35,"line":653},[33,41788,41789],{"class":167},"    pprint.pprint(diagnose_camelot())\n",[14,41791,41792],{},"Expected healthy output:",[23,41794,41797],{"className":41795,"code":41796,"language":2000},[1998],"{'camelot': '0.11.0',\n 'cv2': '4.9.0',\n 'ghostscript_binary': '\u002Fusr\u002Fbin\u002Fghostscript'}\n",[30,41798,41796],{"__ignoreMap":28},[14,41800,41801,41802,2012,41805,41808],{},"Any ",[30,41803,41804],{},"MISSING",[30,41806,41807],{},"NOT FOUND"," value maps directly to the fix below.",[18,41810,41812,41813],{"id":41811},"fix-install-ghostscript-and-camelot-pycv","Fix: Install Ghostscript and camelot-py",[33,41814,41815],{},"cv",[14,41817,41818],{},"Run the following in order. 
Do not skip steps — Ghostscript must be present before the Python install or camelot's post-install check will fail.",[23,41820,41822],{"className":25,"code":41821,"language":27,"meta":28,"style":28},"# Step 1: Install Ghostscript system binary\nsudo apt-get update && sudo apt-get install -y ghostscript\n\n# Step 2: Verify the binary is accessible\ngs --version\n# Expected: 9.x or 10.x\n\n# Step 3: Install camelot with the cv extra (includes OpenCV)\npip install \"camelot-py[cv]\"\n\n# Step 4: Confirm no import errors\npython -c \"import camelot; print(camelot.__version__)\"\n",[30,41823,41824,41829,41849,41853,41858,41865,41870,41874,41879,41888,41892,41897],{"__ignoreMap":28},[33,41825,41826],{"class":35,"line":36},[33,41827,41828],{"class":39},"# Step 1: Install Ghostscript system binary\n",[33,41830,41831,41833,41835,41837,41839,41841,41843,41845,41847],{"class":35,"line":43},[33,41832,9669],{"class":46},[33,41834,9672],{"class":54},[33,41836,35211],{"class":54},[33,41838,35214],{"class":167},[33,41840,9669],{"class":46},[33,41842,9672],{"class":54},[33,41844,79],{"class":54},[33,41846,20912],{"class":50},[33,41848,36900],{"class":54},[33,41850,41851],{"class":35,"line":61},[33,41852,92],{"emptyLinePlaceholder":91},[33,41854,41855],{"class":35,"line":73},[33,41856,41857],{"class":39},"# Step 2: Verify the binary is accessible\n",[33,41859,41860,41862],{"class":35,"line":88},[33,41861,40219],{"class":46},[33,41863,41864],{"class":50}," --version\n",[33,41866,41867],{"class":35,"line":95},[33,41868,41869],{"class":39},"# Expected: 9.x or 10.x\n",[33,41871,41872],{"class":35,"line":101},[33,41873,92],{"emptyLinePlaceholder":91},[33,41875,41876],{"class":35,"line":171},[33,41877,41878],{"class":39},"# Step 3: Install camelot with the cv extra (includes OpenCV)\n",[33,41880,41881,41883,41885],{"class":35,"line":179},[33,41882,76],{"class":46},[33,41884,79],{"class":54},[33,41886,41887],{"class":54}," 
\"camelot-py[cv]\"\n",[33,41889,41890],{"class":35,"line":187},[33,41891,92],{"emptyLinePlaceholder":91},[33,41893,41894],{"class":35,"line":201},[33,41895,41896],{"class":39},"# Step 4: Confirm no import errors\n",[33,41898,41899,41901,41903],{"class":35,"line":206},[33,41900,47],{"class":46},[33,41902,106],{"class":50},[33,41904,9725],{"class":54},[14,41906,39550,41907,41910,41911,41914],{},[30,41908,41909],{},"[cv]"," extra installs ",[30,41912,41913],{},"opencv-python-headless"," as a dependency. This is the headless variant — it has no GUI dependencies, which is correct for server environments.",[18,41916,41918,41919],{"id":41917},"variant-fix-1-modulenotfounderror-no-module-named-cv2","Variant Fix 1: ",[30,41920,41921],{},"ModuleNotFoundError: No module named 'cv2'",[14,41923,41924,41925,41927,41928,20891],{},"This error means camelot is installed but the ",[30,41926,41909],{}," extra was not included. The package was installed as bare ",[30,41929,41473],{},[23,41931,41933],{"className":25,"code":41932,"language":27,"meta":28,"style":28},"# Check what is installed\npip show camelot-py | grep -i requires\n",[30,41934,41935,41940],{"__ignoreMap":28},[33,41936,41937],{"class":35,"line":36},[33,41938,41939],{"class":39},"# Check what is installed\n",[33,41941,41942,41944,41947,41950,41952,41955,41958],{"class":35,"line":43},[33,41943,76],{"class":46},[33,41945,41946],{"class":54}," show",[33,41948,41949],{"class":54}," camelot-py",[33,41951,2850],{"class":163},[33,41953,41954],{"class":46}," grep",[33,41956,41957],{"class":50}," -i",[33,41959,41960],{"class":54}," requires\n",[14,41962,41963,41964,41966],{},"If ",[30,41965,41520],{}," is not in the requirements list, reinstall with the extra:",[23,41968,41970],{"className":25,"code":41969,"language":27,"meta":28,"style":28},"pip uninstall camelot-py -y\npip install 
\"camelot-py[cv]\"\n",[30,41971,41972,41984],{"__ignoreMap":28},[33,41973,41974,41976,41979,41981],{"class":35,"line":36},[33,41975,76],{"class":46},[33,41977,41978],{"class":54}," uninstall",[33,41980,41949],{"class":54},[33,41982,41983],{"class":50}," -y\n",[33,41985,41986,41988,41990],{"class":35,"line":43},[33,41987,76],{"class":46},[33,41989,79],{"class":54},[33,41991,41887],{"class":54},[14,41993,41994,41995,41998,41999,42001,42002,42005],{},"If you are in a virtualenv where ",[30,41996,41997],{},"opencv-python"," is already installed (not the headless variant), the ",[30,42000,41520],{}," import may still fail due to missing GUI libraries (",[30,42003,42004],{},"libGL.so.1","):",[23,42007,42009],{"className":25,"code":42008,"language":27,"meta":28,"style":28},"# Install the headless variant to avoid X11 \u002F display dependencies\npip uninstall opencv-python -y\npip install opencv-python-headless\n\n# Or install the missing system lib directly:\nsudo apt-get install -y libgl1\n",[30,42010,42011,42016,42027,42035,42039,42044],{"__ignoreMap":28},[33,42012,42013],{"class":35,"line":36},[33,42014,42015],{"class":39},"# Install the headless variant to avoid X11 \u002F display dependencies\n",[33,42017,42018,42020,42022,42025],{"class":35,"line":43},[33,42019,76],{"class":46},[33,42021,41978],{"class":54},[33,42023,42024],{"class":54}," opencv-python",[33,42026,41983],{"class":50},[33,42028,42029,42031,42033],{"class":35,"line":61},[33,42030,76],{"class":46},[33,42032,79],{"class":54},[33,42034,36967],{"class":54},[33,42036,42037],{"class":35,"line":73},[33,42038,92],{"emptyLinePlaceholder":91},[33,42040,42041],{"class":35,"line":88},[33,42042,42043],{"class":39},"# Or install the missing system lib directly:\n",[33,42045,42046,42048,42050,42052,42054],{"class":35,"line":95},[33,42047,9669],{"class":46},[33,42049,9672],{"class":54},[33,42051,79],{"class":54},[33,42053,20912],{"class":50},[33,42055,42056],{"class":54}," 
libgl1\n",[18,42058,42060,42061,42063],{"id":42059},"variant-fix-2-oserror-ghostscript-not-found-at-runtime","Variant Fix 2: ",[30,42062,9731],{}," at Runtime",[14,42065,42066,42067,42070,42071,42073],{},"This error appears after a successful ",[30,42068,42069],{},"import camelot"," — it surfaces when ",[30,42072,39695],{}," is first called:",[23,42075,42078],{"className":42076,"code":42077,"language":2000},[1998],"OSError: ghostscript not found on PATH. Please install ghostscript.\n",[30,42079,42077],{"__ignoreMap":28},[14,42081,42082],{},"The binary is either not installed or not on the PATH seen by the Python process:",[23,42084,42086],{"className":25,"code":42085,"language":27,"meta":28,"style":28},"# Confirm the binary name — Debian\u002FUbuntu uses 'ghostscript', others use 'gs'\nwhich ghostscript || which gs\n\n# If missing:\nsudo apt-get install -y ghostscript\n\n# If present but not on PATH (e.g., in a restricted shell):\nexport PATH=\"$PATH:\u002Fusr\u002Fbin\"\npython -c \"import camelot; camelot.read_pdf('test.pdf', pages='1', flavor='lattice')\"\n",[30,42087,42088,42093,42108,42112,42117,42129,42133,42138,42153],{"__ignoreMap":28},[33,42089,42090],{"class":35,"line":36},[33,42091,42092],{"class":39},"# Confirm the binary name — Debian\u002FUbuntu uses 'ghostscript', others use 'gs'\n",[33,42094,42095,42097,42099,42102,42105],{"class":35,"line":43},[33,42096,35269],{"class":50},[33,42098,9677],{"class":54},[33,42100,42101],{"class":163}," ||",[33,42103,42104],{"class":50}," which",[33,42106,42107],{"class":54}," gs\n",[33,42109,42110],{"class":35,"line":61},[33,42111,92],{"emptyLinePlaceholder":91},[33,42113,42114],{"class":35,"line":73},[33,42115,42116],{"class":39},"# If 
missing:\n",[33,42118,42119,42121,42123,42125,42127],{"class":35,"line":88},[33,42120,9669],{"class":46},[33,42122,9672],{"class":54},[33,42124,79],{"class":54},[33,42126,20912],{"class":50},[33,42128,36900],{"class":54},[33,42130,42131],{"class":35,"line":95},[33,42132,92],{"emptyLinePlaceholder":91},[33,42134,42135],{"class":35,"line":101},[33,42136,42137],{"class":39},"# If present but not on PATH (e.g., in a restricted shell):\n",[33,42139,42140,42142,42144,42146,42148,42150],{"class":35,"line":171},[33,42141,35332],{"class":163},[33,42143,35335],{"class":167},[33,42145,242],{"class":163},[33,42147,274],{"class":54},[33,42149,35343],{"class":167},[33,42151,42152],{"class":54},":\u002Fusr\u002Fbin\"\n",[33,42154,42155,42157,42159],{"class":35,"line":179},[33,42156,47],{"class":46},[33,42158,106],{"class":50},[33,42160,42161],{"class":54}," \"import camelot; camelot.read_pdf('test.pdf', pages='1', flavor='lattice')\"\n",[14,42163,42164,42165,42168,42169,42172],{},"If running inside a Docker container or CI environment without ",[30,42166,42167],{},"apt"," access, use the ",[30,42170,42171],{},"ghostscript"," Python binding as a fallback — but note this is slower than the system binary:",[23,42174,42176],{"className":25,"code":42175,"language":27,"meta":28,"style":28},"pip install ghostscript\n",[30,42177,42178],{"__ignoreMap":28},[33,42179,42180,42182,42184],{"class":35,"line":36},[33,42181,76],{"class":46},[33,42183,79],{"class":54},[33,42185,36900],{"class":54},[14,42187,42188],{},"Then verify camelot can find it:",[23,42190,42192],{"className":126,"code":42191,"language":47,"meta":28,"style":28},"# pip install ghostscript camelot-py[cv]\nimport ghostscript   # exercises the binding\nimport camelot\nprint(\"All dependencies resolved:\", camelot.__version__)\n",[30,42193,42194,42199,42209,42215],{"__ignoreMap":28},[33,42195,42196],{"class":35,"line":36},[33,42197,42198],{"class":39},"# pip install ghostscript 
camelot-py[cv]\n",[33,42200,42201,42203,42206],{"class":35,"line":43},[33,42202,164],{"class":163},[33,42204,42205],{"class":167}," ghostscript   ",[33,42207,42208],{"class":39},"# exercises the binding\n",[33,42210,42211,42213],{"class":35,"line":61},[33,42212,164],{"class":163},[33,42214,10567],{"class":167},[33,42216,42217,42219,42221,42224,42227,42229],{"class":35,"line":73},[33,42218,13474],{"class":50},[33,42220,602],{"class":167},[33,42222,42223],{"class":54},"\"All dependencies resolved:\"",[33,42225,42226],{"class":167},", camelot.",[33,42228,37016],{"class":50},[33,42230,221],{"class":167},[18,42232,42234,42235,42238,42239,42242],{"id":42233},"variant-fix-3-importerror-for-tkinter-on-headless-servers","Variant Fix 3: ",[30,42236,42237],{},"ImportError"," for ",[30,42240,42241],{},"tkinter"," on Headless Servers",[14,42244,42245,42246,42248,42249,42251],{},"Some older ",[30,42247,41473],{}," versions imported ",[30,42250,42241],{}," at module load time. On headless servers (no display), this raises:",[23,42253,42256],{"className":42254,"code":42255,"language":2000},[1998],"_tkinter.TclError: no display name and no $DISPLAY environment variable\n",[30,42257,42255],{"__ignoreMap":28},[14,42259,42260,42261,42263],{},"Fix: upgrade camelot to 0.10.1 or later, which dropped the hard ",[30,42262,42241],{}," dependency:",[23,42265,42267],{"className":25,"code":42266,"language":27,"meta":28,"style":28},"pip install --upgrade \"camelot-py[cv]\"\npython -c \"import camelot; print(camelot.__version__)\"\n",[30,42268,42269,42279],{"__ignoreMap":28},[33,42270,42271,42273,42275,42277],{"class":35,"line":36},[33,42272,76],{"class":46},[33,42274,79],{"class":54},[33,42276,26668],{"class":50},[33,42278,41887],{"class":54},[33,42280,42281,42283,42285],{"class":35,"line":43},[33,42282,47],{"class":46},[33,42284,106],{"class":50},[33,42286,9725],{"class":54},[14,42288,42289],{},"If upgrading is not possible, set a dummy display before 
importing:",[23,42291,42293],{"className":25,"code":42292,"language":27,"meta":28,"style":28},"# Temporary workaround only — upgrade is the proper fix\nDISPLAY=:99 python your_script.py\n",[30,42294,42295,42300],{"__ignoreMap":28},[33,42296,42297],{"class":35,"line":36},[33,42298,42299],{"class":39},"# Temporary workaround only — upgrade is the proper fix\n",[33,42301,42302,42305,42307,42310,42313],{"class":35,"line":43},[33,42303,42304],{"class":167},"DISPLAY",[33,42306,242],{"class":163},[33,42308,42309],{"class":54},":99",[33,42311,42312],{"class":46}," python",[33,42314,42315],{"class":54}," your_script.py\n",[18,42317,42319],{"id":42318},"variant-fix-4-ghostscript-version-conflict","Variant Fix 4: Ghostscript Version Conflict",[14,42321,42322],{},"Camelot calls Ghostscript via the command-line API using version-specific flag sets. If you have a very new Ghostscript (11.x) installed and a pinned older camelot, the subprocess call may fail with:",[23,42324,42327],{"className":42325,"code":42326,"language":2000},[1998],"subprocess.CalledProcessError: Command '['gs', ...]' returned non-zero exit status 1\n",[30,42328,42326],{"__ignoreMap":28},[14,42330,42331],{},"Check the installed version and whether camelot supports it:",[23,42333,42335],{"className":25,"code":42334,"language":27,"meta":28,"style":28},"gs --version\npip show camelot-py | grep Version\n",[30,42336,42337,42343],{"__ignoreMap":28},[33,42338,42339,42341],{"class":35,"line":36},[33,42340,40219],{"class":46},[33,42342,41864],{"class":50},[33,42344,42345,42347,42349,42351,42353,42355],{"class":35,"line":43},[33,42346,76],{"class":46},[33,42348,41946],{"class":54},[33,42350,41949],{"class":54},[33,42352,2850],{"class":163},[33,42354,41954],{"class":46},[33,42356,42357],{"class":54}," Version\n",[14,42359,42360],{},"If Ghostscript is 11.x and camelot is below 0.11, upgrade camelot first:",[23,42362,42364],{"className":25,"code":42363,"language":27,"meta":28,"style":28},"pip install --upgrade 
\"camelot-py[cv]\"\n",[30,42365,42366],{"__ignoreMap":28},[33,42367,42368,42370,42372,42374],{"class":35,"line":36},[33,42369,76],{"class":46},[33,42371,79],{"class":54},[33,42373,26668],{"class":50},[33,42375,41887],{"class":54},[14,42377,42378,42379,42382],{},"If you cannot upgrade camelot (pinned by another dependency), install an older Ghostscript alongside the current one using the Ghostscript release tarballs at ghostscript.com, then point camelot at it by setting the ",[30,42380,42381],{},"GS_PROG"," environment variable:",[23,42384,42386],{"className":25,"code":42385,"language":27,"meta":28,"style":28},"export GS_PROG=\u002Fusr\u002Flocal\u002Fbin\u002Fgs-9.56\npython your_script.py\n",[30,42387,42388,42400],{"__ignoreMap":28},[33,42389,42390,42392,42395,42397],{"class":35,"line":36},[33,42391,35332],{"class":163},[33,42393,42394],{"class":167}," GS_PROG",[33,42396,242],{"class":163},[33,42398,42399],{"class":167},"\u002Fusr\u002Flocal\u002Fbin\u002Fgs-9.56\n",[33,42401,42402,42404],{"class":35,"line":43},[33,42403,47],{"class":46},[33,42405,42315],{"class":54},[18,42407,42409],{"id":42408},"environment-specific-notes","Environment-Specific Notes",[424,42411,42413],{"id":42412},"docker","Docker",[14,42415,42416,42417,42419,42420,42422,42423,42425],{},"Minimal Docker images (Alpine, ",[30,42418,36707],{},") do not include Ghostscript. 
Add the following to your ",[30,42421,36045],{}," before the ",[30,42424,36846],{}," step:",[23,42427,42429],{"className":36048,"code":42428,"language":36050,"meta":28,"style":28},"# Debian-based image\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n    ghostscript \\\n    libsm6 \\\n    libxext6 \\\n    && rm -rf \u002Fvar\u002Flib\u002Fapt\u002Flists\u002F*\n\nRUN pip install \"camelot-py[cv]\"\n",[30,42430,42431,42436,42441,42446,42451,42456,42461,42465],{"__ignoreMap":28},[33,42432,42433],{"class":35,"line":36},[33,42434,42435],{},"# Debian-based image\n",[33,42437,42438],{"class":35,"line":43},[33,42439,42440],{},"RUN apt-get update && apt-get install -y --no-install-recommends \\\n",[33,42442,42443],{"class":35,"line":61},[33,42444,42445],{},"    ghostscript \\\n",[33,42447,42448],{"class":35,"line":73},[33,42449,42450],{},"    libsm6 \\\n",[33,42452,42453],{"class":35,"line":88},[33,42454,42455],{},"    libxext6 \\\n",[33,42457,42458],{"class":35,"line":95},[33,42459,42460],{},"    && rm -rf \u002Fvar\u002Flib\u002Fapt\u002Flists\u002F*\n",[33,42462,42463],{"class":35,"line":101},[33,42464,92],{"emptyLinePlaceholder":91},[33,42466,42467],{"class":35,"line":171},[33,42468,42469],{},"RUN pip install \"camelot-py[cv]\"\n",[14,42471,42472,10065,42475,42478],{},[30,42473,42474],{},"libsm6",[30,42476,42477],{},"libxext6"," are required by OpenCV on Debian even in headless mode.",[424,42480,42482],{"id":42481},"github-actions","GitHub Actions",[14,42484,42485],{},"Add the system install step before the Python setup action:",[23,42487,42489],{"className":2062,"code":42488,"language":2064,"meta":28,"style":28},"- name: Install system dependencies\n  run: sudo apt-get update && sudo apt-get install -y ghostscript libsm6 libxext6\n\n- name: Install Python dependencies\n  run: pip install \"camelot-py[cv]\" 
pandas\n",[30,42490,42491,42503,42513,42517,42528],{"__ignoreMap":28},[33,42492,42493,42496,42498,42500],{"class":35,"line":36},[33,42494,42495],{"class":167},"- ",[33,42497,1118],{"class":2076},[33,42499,2079],{"class":167},[33,42501,42502],{"class":54},"Install system dependencies\n",[33,42504,42505,42508,42510],{"class":35,"line":43},[33,42506,42507],{"class":2076},"  run",[33,42509,2079],{"class":167},[33,42511,42512],{"class":54},"sudo apt-get update && sudo apt-get install -y ghostscript libsm6 libxext6\n",[33,42514,42515],{"class":35,"line":61},[33,42516,92],{"emptyLinePlaceholder":91},[33,42518,42519,42521,42523,42525],{"class":35,"line":73},[33,42520,42495],{"class":167},[33,42522,1118],{"class":2076},[33,42524,2079],{"class":167},[33,42526,42527],{"class":54},"Install Python dependencies\n",[33,42529,42530,42532,42534],{"class":35,"line":88},[33,42531,42507],{"class":2076},[33,42533,2079],{"class":167},[33,42535,42536],{"class":54},"pip install \"camelot-py[cv]\" pandas\n",[424,42538,42540],{"id":42539},"red-hat-centos-fedora","Red Hat \u002F CentOS \u002F Fedora",[14,42542,42543,42544,8877,42547,20891],{},"Replace ",[30,42545,42546],{},"apt-get",[30,42548,42549],{},"dnf",[23,42551,42553],{"className":25,"code":42552,"language":27,"meta":28,"style":28},"sudo dnf install -y ghostscript opencv\npip install \"camelot-py[cv]\"\n",[30,42554,42555,42571],{"__ignoreMap":28},[33,42556,42557,42559,42562,42564,42566,42568],{"class":35,"line":36},[33,42558,9669],{"class":46},[33,42560,42561],{"class":54}," dnf",[33,42563,79],{"class":54},[33,42565,20912],{"class":50},[33,42567,9677],{"class":54},[33,42569,42570],{"class":54}," opencv\n",[33,42572,42573,42575,42577],{"class":35,"line":43},[33,42574,76],{"class":46},[33,42576,79],{"class":54},[33,42578,41887],{"class":54},[14,42580,42581,42582,42585,42586,42588],{},"Note that ",[30,42583,42584],{},"opencv"," on dnf installs the C++ library but not the Python binding — the ",[30,42587,41909],{}," extra still handles 
the Python side.",[18,42590,42592],{"id":42591},"troubleshooting-table","Troubleshooting Table",[14,42594,42595],{},"A quick reference for matching the exact error text to the right fix:",[4273,42597,42598,42611],{},[4276,42599,42600],{},[4279,42601,42602,42605,42608],{},[4282,42603,42604],{},"Error message",[4282,42606,42607],{},"Component missing",[4282,42609,42610],{},"Fix command",[4292,42612,42613,42628,42642,42656,42670,42685,42699],{},[4279,42614,42615,42620,42623],{},[4297,42616,42617],{},[30,42618,42619],{},"ImportError: No module named 'camelot'",[4297,42621,42622],{},"camelot not installed",[4297,42624,42625],{},[30,42626,42627],{},"pip install \"camelot-py[cv]\"",[4279,42629,42630,42634,42637],{},[4297,42631,42632],{},[30,42633,41921],{},[4297,42635,42636],{},"OpenCV not installed",[4297,42638,42639],{},[30,42640,42641],{},"pip install opencv-python-headless",[4279,42643,42644,42649,42652],{},[4297,42645,42646],{},[30,42647,42648],{},"OSError: ghostscript not found on PATH",[4297,42650,42651],{},"Ghostscript binary missing",[4297,42653,42654],{},[30,42655,14432],{},[4279,42657,42658,42662,42665],{},[4297,42659,42660],{},[30,42661,42648],{},[4297,42663,42664],{},"Binary not on Python's PATH",[4297,42666,42667],{},[30,42668,42669],{},"export PATH=\"$PATH:\u002Fusr\u002Fbin\"",[4279,42671,42672,42677,42680],{},[4297,42673,42674],{},[30,42675,42676],{},"_tkinter.TclError: no display name",[4297,42678,42679],{},"Old camelot with tkinter dep",[4297,42681,42682],{},[30,42683,42684],{},"pip install --upgrade \"camelot-py[cv]\"",[4279,42686,42687,42692,42695],{},[4297,42688,42689],{},[30,42690,42691],{},"libGL.so.1: cannot open shared object file",[4297,42693,42694],{},"Non-headless OpenCV on server",[4297,42696,42697],{},[30,42698,42641],{},[4279,42700,42701,42710,42713],{},[4297,42702,42703,42706,42707,42709],{},[30,42704,42705],{},"subprocess.CalledProcessError"," on ",[30,42708,40219],{}," call",[4297,42711,42712],{},"Ghostscript version 
mismatch",[4297,42714,42715],{},[30,42716,42684],{},[18,42718,42720],{"id":42719},"confirming-the-correct-camelot-package-is-installed","Confirming the Correct camelot Package Is Installed",[14,42722,42723,42724,365,42726,42728,42729,42731],{},"There are three similarly named packages on PyPI: ",[30,42725,41473],{},[30,42727,16139],{}," (an unrelated project), and ",[30,42730,38669],{},". Installing the wrong one is a common mistake:",[23,42733,42735],{"className":25,"code":42734,"language":27,"meta":28,"style":28},"# Confirm camelot-py is installed, not the unrelated 'camelot' package\npip show camelot-py\n\n# The output should include:\n# Name: camelot-py\n# ...\n# Requires: chardet, click, numpy, openpyxl, pandas, pdfminer.six, pypdf2, tabulate\n# (plus opencv-python-headless when installed with [cv])\n\n# If you see 'Name: camelot' without 'py', you have the wrong package:\npip uninstall camelot -y\npip install \"camelot-py[cv]\"\n",[30,42736,42737,42742,42751,42755,42760,42765,42770,42775,42780,42784,42789,42800],{"__ignoreMap":28},[33,42738,42739],{"class":35,"line":36},[33,42740,42741],{"class":39},"# Confirm camelot-py is installed, not the unrelated 'camelot' package\n",[33,42743,42744,42746,42748],{"class":35,"line":43},[33,42745,76],{"class":46},[33,42747,41946],{"class":54},[33,42749,42750],{"class":54}," camelot-py\n",[33,42752,42753],{"class":35,"line":61},[33,42754,92],{"emptyLinePlaceholder":91},[33,42756,42757],{"class":35,"line":73},[33,42758,42759],{"class":39},"# The output should include:\n",[33,42761,42762],{"class":35,"line":88},[33,42763,42764],{"class":39},"# Name: camelot-py\n",[33,42766,42767],{"class":35,"line":95},[33,42768,42769],{"class":39},"# ...\n",[33,42771,42772],{"class":35,"line":101},[33,42773,42774],{"class":39},"# Requires: chardet, click, numpy, openpyxl, pandas, pdfminer.six, pypdf2, tabulate\n",[33,42776,42777],{"class":35,"line":171},[33,42778,42779],{"class":39},"# (plus opencv-python-headless when installed with 
[cv])\n",[33,42781,42782],{"class":35,"line":179},[33,42783,92],{"emptyLinePlaceholder":91},[33,42785,42786],{"class":35,"line":187},[33,42787,42788],{"class":39},"# If you see 'Name: camelot' without 'py', you have the wrong package:\n",[33,42790,42791,42793,42795,42798],{"class":35,"line":201},[33,42792,76],{"class":46},[33,42794,41978],{"class":54},[33,42796,42797],{"class":54}," camelot",[33,42799,41983],{"class":50},[33,42801,42802,42804,42806],{"class":35,"line":206},[33,42803,76],{"class":46},[33,42805,79],{"class":54},[33,42807,41887],{"class":54},[14,42809,42810,42811,42813],{},"After a correct install, the import path is always ",[30,42812,42069],{}," — both packages use the same name, which is why the error message alone is not enough to distinguish them.",[18,42815,9247],{"id":9246},[14,42817,42818],{},"After applying all fixes, run the full smoke test:",[23,42820,42822],{"className":126,"code":42821,"language":47,"meta":28,"style":28},"# pip install \"camelot-py[cv]\" pandas\nfrom pathlib import Path\nimport camelot\nimport pandas as pd\n\ndef smoke_test(pdf_path: Path) -> None:\n    \"\"\"Verify camelot can open a PDF and return at least one table.\"\"\"\n    if not pdf_path.exists():\n        raise FileNotFoundError(f\"Test PDF not found: {pdf_path}\")\n    try:\n        tables = camelot.read_pdf(\n            str(pdf_path),\n            pages=\"1\",\n            flavor=\"lattice\",\n        )\n    except Exception as e:\n        raise RuntimeError(f\"camelot.read_pdf failed: {e}\") from e\n\n    assert tables.n >= 0, \"camelot.read_pdf ran without error\"\n    print(f\"camelot {camelot.__version__} OK — found {tables.n} table(s) on page 1\")\n\nif __name__ == \"__main__\":\n    # Use any single-page PDF with a bordered table, or any PDF to test import\n    
smoke_test(Path(\"data\u002Fsample.pdf\"))\n",[30,42823,42824,42828,42838,42844,42854,42858,42871,42876,42884,42907,42913,42921,42927,42937,42947,42951,42961,42988,42992,43008,43040,43044,43056,43061],{"__ignoreMap":28},[33,42825,42826],{"class":35,"line":36},[33,42827,10550],{"class":39},[33,42829,42830,42832,42834,42836],{"class":35,"line":43},[33,42831,190],{"class":163},[33,42833,193],{"class":167},[33,42835,164],{"class":163},[33,42837,198],{"class":167},[33,42839,42840,42842],{"class":35,"line":61},[33,42841,164],{"class":163},[33,42843,10567],{"class":167},[33,42845,42846,42848,42850,42852],{"class":35,"line":73},[33,42847,164],{"class":163},[33,42849,492],{"class":167},[33,42851,495],{"class":163},[33,42853,498],{"class":167},[33,42855,42856],{"class":35,"line":88},[33,42857,92],{"emptyLinePlaceholder":91},[33,42859,42860,42862,42865,42867,42869],{"class":35,"line":95},[33,42861,562],{"class":163},[33,42863,42864],{"class":46}," smoke_test",[33,42866,37097],{"class":167},[33,42868,571],{"class":50},[33,42870,574],{"class":167},[33,42872,42873],{"class":35,"line":101},[33,42874,42875],{"class":54},"    \"\"\"Verify camelot can open a PDF and return at least one table.\"\"\"\n",[33,42877,42878,42880,42882],{"class":35,"line":171},[33,42879,617],{"class":163},[33,42881,620],{"class":163},[33,42883,21595],{"class":167},[33,42885,42886,42888,42890,42892,42894,42897,42899,42901,42903,42905],{"class":35,"line":179},[33,42887,4051],{"class":163},[33,42889,2945],{"class":50},[33,42891,602],{"class":167},[33,42893,4059],{"class":163},[33,42895,42896],{"class":54},"\"Test PDF not found: 
",[33,42898,1115],{"class":50},[33,42900,27069],{"class":167},[33,42902,1121],{"class":50},[33,42904,274],{"class":54},[33,42906,221],{"class":167},[33,42908,42909,42911],{"class":35,"line":187},[33,42910,2424],{"class":163},[33,42912,574],{"class":167},[33,42914,42915,42917,42919],{"class":35,"line":201},[33,42916,37895],{"class":167},[33,42918,242],{"class":163},[33,42920,10668],{"class":167},[33,42922,42923,42925],{"class":35,"line":206},[33,42924,10673],{"class":50},[33,42926,37906],{"class":167},[33,42928,42929,42931,42933,42935],{"class":35,"line":224},[33,42930,10681],{"class":238},[33,42932,242],{"class":163},[33,42934,35984],{"class":54},[33,42936,247],{"class":167},[33,42938,42939,42941,42943,42945],{"class":35,"line":229},[33,42940,10691],{"class":238},[33,42942,242],{"class":163},[33,42944,10985],{"class":54},[33,42946,247],{"class":167},[33,42948,42949],{"class":35,"line":235},[33,42950,5867],{"class":167},[33,42952,42953,42955,42957,42959],{"class":35,"line":250},[33,42954,2449],{"class":163},[33,42956,783],{"class":50},[33,42958,1852],{"class":163},[33,42960,7583],{"class":167},[33,42962,42963,42965,42967,42969,42971,42974,42976,42978,42980,42982,42984,42986],{"class":35,"line":266},[33,42964,4051],{"class":163},[33,42966,7590],{"class":50},[33,42968,602],{"class":167},[33,42970,4059],{"class":163},[33,42972,42973],{"class":54},"\"camelot.read_pdf failed: ",[33,42975,1115],{"class":50},[33,42977,7602],{"class":167},[33,42979,1121],{"class":50},[33,42981,274],{"class":54},[33,42983,1649],{"class":167},[33,42985,190],{"class":163},[33,42987,7613],{"class":167},[33,42989,42990],{"class":35,"line":290},[33,42991,92],{"emptyLinePlaceholder":91},[33,42993,42994,42996,42998,43001,43003,43005],{"class":35,"line":295},[33,42995,9228],{"class":163},[33,42997,40572],{"class":167},[33,42999,43000],{"class":163},">=",[33,43002,10791],{"class":50},[33,43004,365],{"class":167},[33,43006,43007],{"class":54},"\"camelot.read_pdf ran without 
error\"\n",[33,43009,43010,43012,43014,43016,43018,43020,43023,43026,43029,43031,43033,43035,43038],{"class":35,"line":300},[33,43011,7268],{"class":50},[33,43013,602],{"class":167},[33,43015,4059],{"class":163},[33,43017,40771],{"class":54},[33,43019,1115],{"class":50},[33,43021,43022],{"class":167},"camelot.",[33,43024,43025],{"class":50},"__version__}",[33,43027,43028],{"class":54}," OK — found ",[33,43030,1115],{"class":50},[33,43032,37984],{"class":167},[33,43034,1121],{"class":50},[33,43036,43037],{"class":54}," table(s) on page 1\"",[33,43039,221],{"class":167},[33,43041,43042],{"class":35,"line":317},[33,43043,92],{"emptyLinePlaceholder":91},[33,43045,43046,43048,43050,43052,43054],{"class":35,"line":332},[33,43047,2491],{"class":163},[33,43049,2494],{"class":50},[33,43051,2497],{"class":163},[33,43053,2500],{"class":54},[33,43055,574],{"class":167},[33,43057,43058],{"class":35,"line":347},[33,43059,43060],{"class":39},"    # Use any single-page PDF with a bordered table, or any PDF to test import\n",[33,43062,43063,43066,43069],{"class":35,"line":374},[33,43064,43065],{"class":167},"    smoke_test(Path(",[33,43067,43068],{"class":54},"\"data\u002Fsample.pdf\"",[33,43070,371],{"class":167},[14,43072,43073,43074,43076,43077,43080,43081,3035],{},"A clean run with no ",[30,43075,42237],{},", no ",[30,43078,43079],{},"OSError",", and a printed version confirms all dependencies are wired correctly. 
For the full extraction workflow once camelot is working, continue with ",[940,43082,9592],{"href":942},[18,43084,6918],{"id":6917},[4211,43086,43087,43092,43097,43102],{},[4214,43088,43089,43091],{},[940,43090,9592],{"href":942}," — lattice vs stream extraction once camelot is installed",[4214,43093,43094,43096],{},[940,43095,10535],{"href":10534}," — coordinate-sorting fix when extraction runs but output is misaligned",[4214,43098,43099,43101],{},[940,43100,10077],{"href":10076}," — OCR alternative when camelot cannot find tables",[4214,43103,43104,43106],{},[940,43105,36756],{"href":26957}," — system dependency setup for Tesseract on the same Linux environment",[14,43108,6947,43109,3035],{},[940,43110,9592],{"href":942},[6953,43112,43113],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .shJU0, html code.shiki .shJU0{--shiki-default:#22863A}html pre.shiki code .sqxcx, html code.shiki 
.sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":43115},[43116,43117,43118,43120,43122,43124,43126,43127,43132,43133,43134,43135],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":41811,"depth":43,"text":43119},"Fix: Install Ghostscript and camelot-pycv",{"id":41917,"depth":43,"text":43121},"Variant Fix 1: ModuleNotFoundError: No module named 'cv2'",{"id":42059,"depth":43,"text":43123},"Variant Fix 2: OSError: ghostscript not found at Runtime",{"id":42233,"depth":43,"text":43125},"Variant Fix 3: ImportError for tkinter on Headless Servers",{"id":42318,"depth":43,"text":42319},{"id":42408,"depth":43,"text":42409,"children":43128},[43129,43130,43131],{"id":42412,"depth":61,"text":42413},{"id":42481,"depth":61,"text":42482},{"id":42539,"depth":61,"text":42540},{"id":42591,"depth":43,"text":42592},{"id":42719,"depth":43,"text":42720},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fix Camelot Import Error","Fix ImportError, ModuleNotFoundError, and OSError ghostscript not found when importing camelot-py on Linux. 
Install Ghostscript, camelot-py[cv], and resolve cv2 errors.",{},"\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Ffix-camelot-import-error-on-linux",{"title":9739,"description":43137},"Fix camelot ImportError and OSError ghostscript on Linux","automating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Ffix-camelot-import-error-on-linux\u002Findex",[9631,47,16139,43144,42171],"linux","6Gp5-v1nL-TcZpp_1dcM4_nxy317EjGQ2XOmJS7dJN8",{"id":43147,"title":10535,"body":43148,"breadcrumbTitle":46386,"canonical":6977,"date":46387,"description":46388,"draft":6980,"extension":6981,"image":6977,"meta":46389,"navigation":91,"path":46390,"robots":6977,"seo":46391,"seoTitle":46392,"stem":46393,"tags":46394,"updatedAt":6978,"__hash__":46397},"content\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Ffix-pdf-text-extraction-alignment-issues\u002Findex.md",{"type":7,"value":43149,"toc":46373},[43150,43153,43163,43165,43172,43185,43187,43190,43737,43746,43750,43755,44461,44471,44479,44488,44675,44680,44687,44693,45196,45200,45203,46019,46023,46125,46127,46130,46145,46316,46343,46345,46367,46371],[10,43151,10535],{"id":43152},"fix-pdf-text-extraction-alignment-issues",[14,43154,43155,43156,43158,43159,43162],{},"Calling ",[30,43157,12958],{}," or splitting on newlines returns jumbled output when the PDF contains multi-column layouts. The symptom is merged numeric values like ",[30,43160,43161],{},"\"12,450.001,200.50\"",", column headers appearing mid-row, or a complete reversal of reading order. The cause is not a bug in the library — it is a fundamental property of the PDF format.",[18,43164,7021],{"id":7020},[14,43166,43167,43168,43171],{},"PDFs store each glyph as an independent object with an absolute x\u002Fy coordinate. There is no concept of \"row\" or \"column\" in the format. 
When ",[30,43169,43170],{},"extract_text()"," reassembles glyphs, it sorts by vertical position and then reads left-to-right — but two text blocks from different columns that share a similar y-coordinate land on the same output line, concatenated without a separator.",[14,43173,43174,43175,43177,43178,43180,43181,43184],{},"The fix: bypass ",[30,43176,43170],{}," entirely and work with ",[940,43179,943],{"href":942},"'s ",[30,43182,43183],{},"extract_words()",", which returns each word with its bounding box. Cluster words into rows using a y-tolerance, then sort each row by x-position.",[18,43186,35017],{"id":35016},[14,43188,43189],{},"Confirm the root cause before investing in a full fix:",[23,43191,43193],{"className":126,"code":43192,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nfrom pathlib import Path\nimport pdfplumber\nimport statistics\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ndef diagnose_alignment(path: Path, page_idx: int = 0) -> dict:\n    \"\"\"Show coordinate overlap statistics for a single PDF page.\"\"\"\n    try:\n        with pdfplumber.open(path) as pdf:\n            page = pdf.pages[page_idx]\n            words = page.extract_words(x_tolerance=2)\n    except Exception as e:\n        raise RuntimeError(f\"Could not open {path}: {e}\") from e\n\n    if not words:\n        return {\"status\": \"empty\", \"word_count\": 0}\n\n    tops = [w[\"top\"] for w in words]\n    bottoms = [w[\"bottom\"] for w in words]\n    heights = [b - t for t, b in zip(tops, bottoms)]\n    median_h = statistics.median(heights)\n\n    # Count word pairs that share a y-band but have a large x-gap (separate columns)\n    overlaps = 0\n    for i, w1 in enumerate(words):\n        for w2 in words[i + 1:]:\n            same_row = abs(w1[\"top\"] - w2[\"top\"]) \u003C median_h * 0.5\n            wide_gap = (w2[\"x0\"] - w1[\"x1\"]) > 20  # >20pt gap → likely different column\n            if same_row and wide_gap:\n                overlaps += 1\n\n    
return {\n        \"word_count\": len(words),\n        \"median_line_height\": round(median_h, 2),\n        \"column_overlap_pairs\": overlaps,\n        \"recommended_y_tolerance\": round(median_h * 0.4, 2),\n    }\n\nif __name__ == \"__main__\":\n    result = diagnose_alignment(PDF_PATH)\n    print(result)\n    # column_overlap_pairs > 0 → coordinate sorting is needed\n",[30,43194,43195,43199,43209,43215,43222,43226,43238,43242,43264,43269,43275,43285,43294,43313,43323,43358,43362,43371,43396,43400,43425,43447,43473,43483,43487,43492,43501,43515,43534,43570,43603,43615,43624,43628,43634,43645,43662,43670,43693,43697,43701,43713,43726,43732],{"__ignoreMap":28},[33,43196,43197],{"class":35,"line":36},[33,43198,9763],{"class":39},[33,43200,43201,43203,43205,43207],{"class":35,"line":43},[33,43202,190],{"class":163},[33,43204,193],{"class":167},[33,43206,164],{"class":163},[33,43208,198],{"class":167},[33,43210,43211,43213],{"class":35,"line":61},[33,43212,164],{"class":163},[33,43214,485],{"class":167},[33,43216,43217,43219],{"class":35,"line":73},[33,43218,164],{"class":163},[33,43220,43221],{"class":167}," statistics\n",[33,43223,43224],{"class":35,"line":88},[33,43225,92],{"emptyLinePlaceholder":91},[33,43227,43228,43230,43232,43234,43236],{"class":35,"line":95},[33,43229,7076],{"class":50},[33,43231,212],{"class":163},[33,43233,215],{"class":167},[33,43235,7083],{"class":54},[33,43237,221],{"class":167},[33,43239,43240],{"class":35,"line":101},[33,43241,92],{"emptyLinePlaceholder":91},[33,43243,43244,43246,43249,43252,43254,43256,43258,43260,43262],{"class":35,"line":171},[33,43245,562],{"class":163},[33,43247,43248],{"class":46}," diagnose_alignment",[33,43250,43251],{"class":167},"(path: Path, page_idx: ",[33,43253,1059],{"class":50},[33,43255,212],{"class":163},[33,43257,10791],{"class":50},[33,43259,1617],{"class":167},[33,43261,37100],{"class":50},[33,43263,574],{"class":167},[33,43265,43266],{"class":35,"line":179},[33,43267,43268],{"class":54},"    
\"\"\"Show coordinate overlap statistics for a single PDF page.\"\"\"\n",[33,43270,43271,43273],{"class":35,"line":187},[33,43272,2424],{"class":163},[33,43274,574],{"class":167},[33,43276,43277,43279,43281,43283],{"class":35,"line":201},[33,43278,2191],{"class":163},[33,43280,7123],{"class":167},[33,43282,495],{"class":163},[33,43284,686],{"class":167},[33,43286,43287,43289,43291],{"class":35,"line":206},[33,43288,9865],{"class":167},[33,43290,242],{"class":163},[33,43292,43293],{"class":167}," pdf.pages[page_idx]\n",[33,43295,43296,43299,43301,43304,43307,43309,43311],{"class":35,"line":224},[33,43297,43298],{"class":167},"            words ",[33,43300,242],{"class":163},[33,43302,43303],{"class":167}," page.extract_words(",[33,43305,43306],{"class":238},"x_tolerance",[33,43308,242],{"class":163},[33,43310,1533],{"class":50},[33,43312,221],{"class":167},[33,43314,43315,43317,43319,43321],{"class":35,"line":229},[33,43316,2449],{"class":163},[33,43318,783],{"class":50},[33,43320,1852],{"class":163},[33,43322,7583],{"class":167},[33,43324,43325,43327,43329,43331,43333,43336,43338,43340,43342,43344,43346,43348,43350,43352,43354,43356],{"class":35,"line":235},[33,43326,4051],{"class":163},[33,43328,7590],{"class":50},[33,43330,602],{"class":167},[33,43332,4059],{"class":163},[33,43334,43335],{"class":54},"\"Could not open ",[33,43337,1115],{"class":50},[33,43339,2580],{"class":167},[33,43341,1121],{"class":50},[33,43343,2079],{"class":54},[33,43345,1115],{"class":50},[33,43347,7602],{"class":167},[33,43349,1121],{"class":50},[33,43351,274],{"class":54},[33,43353,1649],{"class":167},[33,43355,190],{"class":163},[33,43357,7613],{"class":167},[33,43359,43360],{"class":35,"line":250},[33,43361,92],{"emptyLinePlaceholder":91},[33,43363,43364,43366,43368],{"class":35,"line":266},[33,43365,617],{"class":163},[33,43367,620],{"class":163},[33,43369,43370],{"class":167}," 
words:\n",[33,43372,43373,43375,43377,43380,43382,43385,43387,43390,43392,43394],{"class":35,"line":290},[33,43374,1659],{"class":163},[33,43376,4098],{"class":167},[33,43378,43379],{"class":54},"\"status\"",[33,43381,2079],{"class":167},[33,43383,43384],{"class":54},"\"empty\"",[33,43386,365],{"class":167},[33,43388,43389],{"class":54},"\"word_count\"",[33,43391,2079],{"class":167},[33,43393,748],{"class":50},[33,43395,4113],{"class":167},[33,43397,43398],{"class":35,"line":295},[33,43399,92],{"emptyLinePlaceholder":91},[33,43401,43402,43405,43407,43410,43413,43415,43417,43420,43422],{"class":35,"line":300},[33,43403,43404],{"class":167},"    tops ",[33,43406,242],{"class":163},[33,43408,43409],{"class":167}," [w[",[33,43411,43412],{"class":54},"\"top\"",[33,43414,763],{"class":167},[33,43416,6124],{"class":163},[33,43418,43419],{"class":167}," w ",[33,43421,662],{"class":163},[33,43423,43424],{"class":167}," words]\n",[33,43426,43427,43430,43432,43434,43437,43439,43441,43443,43445],{"class":35,"line":317},[33,43428,43429],{"class":167},"    bottoms ",[33,43431,242],{"class":163},[33,43433,43409],{"class":167},[33,43435,43436],{"class":54},"\"bottom\"",[33,43438,763],{"class":167},[33,43440,6124],{"class":163},[33,43442,43419],{"class":167},[33,43444,662],{"class":163},[33,43446,43424],{"class":167},[33,43448,43449,43452,43454,43457,43459,43461,43463,43466,43468,43470],{"class":35,"line":332},[33,43450,43451],{"class":167},"    heights ",[33,43453,242],{"class":163},[33,43455,43456],{"class":167}," [b ",[33,43458,4126],{"class":163},[33,43460,10818],{"class":167},[33,43462,6124],{"class":163},[33,43464,43465],{"class":167}," t, b ",[33,43467,662],{"class":163},[33,43469,7902],{"class":50},[33,43471,43472],{"class":167},"(tops, bottoms)]\n",[33,43474,43475,43478,43480],{"class":35,"line":347},[33,43476,43477],{"class":167},"    median_h ",[33,43479,242],{"class":163},[33,43481,43482],{"class":167}," 
statistics.median(heights)\n",[33,43484,43485],{"class":35,"line":374},[33,43486,92],{"emptyLinePlaceholder":91},[33,43488,43489],{"class":35,"line":397},[33,43490,43491],{"class":39},"    # Count word pairs that share a y-band but have a large x-gap (separate columns)\n",[33,43493,43494,43497,43499],{"class":35,"line":653},[33,43495,43496],{"class":167},"    overlaps ",[33,43498,242],{"class":163},[33,43500,28914],{"class":50},[33,43502,43503,43505,43508,43510,43512],{"class":35,"line":667},[33,43504,656],{"class":163},[33,43506,43507],{"class":167}," i, w1 ",[33,43509,662],{"class":163},[33,43511,7403],{"class":50},[33,43513,43514],{"class":167},"(words):\n",[33,43516,43517,43519,43522,43524,43527,43529,43531],{"class":35,"line":675},[33,43518,5973],{"class":163},[33,43520,43521],{"class":167}," w2 ",[33,43523,662],{"class":163},[33,43525,43526],{"class":167}," words[i ",[33,43528,1811],{"class":163},[33,43530,1814],{"class":50},[33,43532,43533],{"class":167},":]:\n",[33,43535,43536,43539,43541,43544,43547,43549,43551,43553,43556,43558,43560,43562,43565,43567],{"class":35,"line":689},[33,43537,43538],{"class":167},"            same_row ",[33,43540,242],{"class":163},[33,43542,43543],{"class":50}," abs",[33,43545,43546],{"class":167},"(w1[",[33,43548,43412],{"class":54},[33,43550,763],{"class":167},[33,43552,4126],{"class":163},[33,43554,43555],{"class":167}," w2[",[33,43557,43412],{"class":54},[33,43559,8675],{"class":167},[33,43561,4043],{"class":163},[33,43563,43564],{"class":167}," median_h ",[33,43566,1769],{"class":163},[33,43568,43569],{"class":50}," 0.5\n",[33,43571,43572,43575,43577,43580,43583,43585,43587,43590,43593,43595,43597,43600],{"class":35,"line":703},[33,43573,43574],{"class":167},"            wide_gap ",[33,43576,242],{"class":163},[33,43578,43579],{"class":167}," (w2[",[33,43581,43582],{"class":54},"\"x0\"",[33,43584,763],{"class":167},[33,43586,4126],{"class":163},[33,43588,43589],{"class":167}," 
w1[",[33,43591,43592],{"class":54},"\"x1\"",[33,43594,8675],{"class":167},[33,43596,6009],{"class":163},[33,43598,43599],{"class":50}," 20",[33,43601,43602],{"class":39},"  # >20pt gap → likely different column\n",[33,43604,43605,43607,43610,43612],{"class":35,"line":714},[33,43606,5995],{"class":163},[33,43608,43609],{"class":167}," same_row ",[33,43611,6001],{"class":163},[33,43613,43614],{"class":167}," wide_gap:\n",[33,43616,43617,43620,43622],{"class":35,"line":723},[33,43618,43619],{"class":167},"                overlaps ",[33,43621,28976],{"class":163},[33,43623,17709],{"class":50},[33,43625,43626],{"class":35,"line":754},[33,43627,92],{"emptyLinePlaceholder":91},[33,43629,43630,43632],{"class":35,"line":771},[33,43631,1332],{"class":163},[33,43633,16265],{"class":167},[33,43635,43636,43639,43641,43643],{"class":35,"line":777},[33,43637,43638],{"class":54},"        \"word_count\"",[33,43640,2079],{"class":167},[33,43642,928],{"class":50},[33,43644,37239],{"class":167},[33,43646,43647,43650,43652,43655,43658,43660],{"class":35,"line":788},[33,43648,43649],{"class":54},"        \"median_line_height\"",[33,43651,2079],{"class":167},[33,43653,43654],{"class":50},"round",[33,43656,43657],{"class":167},"(median_h, ",[33,43659,1533],{"class":50},[33,43661,1506],{"class":167},[33,43663,43664,43667],{"class":35,"line":804},[33,43665,43666],{"class":54},"        \"column_overlap_pairs\"",[33,43668,43669],{"class":167},": overlaps,\n",[33,43671,43672,43675,43677,43679,43682,43684,43687,43689,43691],{"class":35,"line":809},[33,43673,43674],{"class":54},"        \"recommended_y_tolerance\"",[33,43676,2079],{"class":167},[33,43678,43654],{"class":50},[33,43680,43681],{"class":167},"(median_h ",[33,43683,1769],{"class":163},[33,43685,43686],{"class":50}," 
0.4",[33,43688,365],{"class":167},[33,43690,1533],{"class":50},[33,43692,1506],{"class":167},[33,43694,43695],{"class":35,"line":819},[33,43696,20781],{"class":167},[33,43698,43699],{"class":35,"line":829},[33,43700,92],{"emptyLinePlaceholder":91},[33,43702,43703,43705,43707,43709,43711],{"class":35,"line":834},[33,43704,2491],{"class":163},[33,43706,2494],{"class":50},[33,43708,2497],{"class":163},[33,43710,2500],{"class":54},[33,43712,574],{"class":167},[33,43714,43715,43717,43719,43722,43724],{"class":35,"line":839},[33,43716,8842],{"class":167},[33,43718,242],{"class":163},[33,43720,43721],{"class":167}," diagnose_alignment(",[33,43723,7076],{"class":50},[33,43725,221],{"class":167},[33,43727,43728,43730],{"class":35,"line":860},[33,43729,7268],{"class":50},[33,43731,8864],{"class":167},[33,43733,43734],{"class":35,"line":887},[33,43735,43736],{"class":39},"    # column_overlap_pairs > 0 → coordinate sorting is needed\n",[14,43738,41963,43739,43742,43743,43745],{},[30,43740,43741],{},"column_overlap_pairs"," is zero, the file does not have multi-column misalignment — check instead for encoding issues (see ",[940,43744,27254],{"href":27253}," if the garbling appears after a CSV export step).",[18,43747,43749],{"id":43748},"fix-coordinate-sorted-row-reconstruction","Fix: Coordinate-Sorted Row Reconstruction",[14,43751,42543,43752,43754],{},[30,43753,43170],{}," with a word-level reconstruction pipeline. 
The key changes are on every modified line:",[23,43756,43758],{"className":126,"code":43757,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\nimport statistics\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\nOUTPUT_PATH = Path(\"output\u002Faligned.csv\")\n\ndef extract_aligned_rows(\n    path: Path,\n    page_idx: int = 0,\n    y_tolerance: float | None = None,   # None → auto-detect from median line height\n    x_tolerance: int = 2,               # merge kerned\u002Fhyphenated glyphs within 2pt\n) -> list[list[str]]:\n    \"\"\"\n    Extract text rows from a PDF page using x\u002Fy coordinate sorting.\n    Returns a list of rows, where each row is a list of word strings.\n    \"\"\"\n    try:\n        with pdfplumber.open(path) as pdf:\n            page = pdf.pages[page_idx]\n            # extract_words returns dicts with x0, x1, top, bottom, text\n            words = page.extract_words(x_tolerance=x_tolerance)  # changed: was extract_text()\n    except Exception as e:\n        raise RuntimeError(f\"Failed to read {path}: {e}\") from e\n\n    if not words:\n        return []\n\n    # Auto-detect y_tolerance from median glyph height\n    if y_tolerance is None:\n        heights = [w[\"bottom\"] - w[\"top\"] for w in words]\n        y_tolerance = statistics.median(heights) * 0.4  # changed: was hardcoded 3.0\n\n    # Sort words top-to-bottom\n    words.sort(key=lambda w: w[\"top\"])  # changed: was not sorted\n\n    rows: list[list[dict]] = []\n    current_row: list[dict] = [words[0]]\n    current_top: float = words[0][\"top\"]\n\n    for word in words[1:]:\n        if abs(word[\"top\"] - current_top) \u003C= y_tolerance:  # same visual row\n            current_row.append(word)\n        else:\n            current_row.sort(key=lambda w: w[\"x0\"])        # changed: sort left-to-right\n            rows.append([w[\"text\"] for w in current_row])\n            current_row = [word]\n      
      current_top = word[\"top\"]\n\n    if current_row:\n        current_row.sort(key=lambda w: w[\"x0\"])\n        rows.append([w[\"text\"] for w in current_row])\n\n    return rows\n\nif __name__ == \"__main__\":\n    OUTPUT_PATH.parent.mkdir(exist_ok=True)\n    rows = extract_aligned_rows(PDF_PATH)\n    df = pd.DataFrame(rows)\n    df.to_csv(OUTPUT_PATH, index=False, header=False)\n    print(f\"Exported {len(rows)} rows to {OUTPUT_PATH}\")\n",[30,43759,43760,43764,43774,43780,43790,43796,43800,43812,43825,43829,43838,43842,43855,43875,43891,43901,43905,43910,43915,43919,43925,43935,43943,43948,43966,43976,44011,44015,44023,44029,44033,44038,44051,44081,44098,44102,44107,44128,44132,44144,44163,44184,44188,44203,44230,44235,44241,44260,44278,44288,44302,44306,44313,44328,44345,44349,44356,44360,44372,44386,44400,44409,44434],{"__ignoreMap":28},[33,43761,43762],{"class":35,"line":36},[33,43763,7041],{"class":39},[33,43765,43766,43768,43770,43772],{"class":35,"line":43},[33,43767,190],{"class":163},[33,43769,193],{"class":167},[33,43771,164],{"class":163},[33,43773,198],{"class":167},[33,43775,43776,43778],{"class":35,"line":61},[33,43777,164],{"class":163},[33,43779,485],{"class":167},[33,43781,43782,43784,43786,43788],{"class":35,"line":73},[33,43783,164],{"class":163},[33,43785,492],{"class":167},[33,43787,495],{"class":163},[33,43789,498],{"class":167},[33,43791,43792,43794],{"class":35,"line":88},[33,43793,164],{"class":163},[33,43795,43221],{"class":167},[33,43797,43798],{"class":35,"line":95},[33,43799,92],{"emptyLinePlaceholder":91},[33,43801,43802,43804,43806,43808,43810],{"class":35,"line":101},[33,43803,7076],{"class":50},[33,43805,212],{"class":163},[33,43807,215],{"class":167},[33,43809,7083],{"class":54},[33,43811,221],{"class":167},[33,43813,43814,43816,43818,43820,43823],{"class":35,"line":171},[33,43815,521],{"class":50},[33,43817,212],{"class":163},[33,43819,215],{"class":167},[33,43821,43822],{"class":54},"\"output\u002Faligned.csv\"",[33,43824,2
21],{"class":167},[33,43826,43827],{"class":35,"line":179},[33,43828,92],{"emptyLinePlaceholder":91},[33,43830,43831,43833,43836],{"class":35,"line":187},[33,43832,562],{"class":163},[33,43834,43835],{"class":46}," extract_aligned_rows",[33,43837,7637],{"class":167},[33,43839,43840],{"class":35,"line":201},[33,43841,10615],{"class":167},[33,43843,43844,43847,43849,43851,43853],{"class":35,"line":206},[33,43845,43846],{"class":167},"    page_idx: ",[33,43848,1059],{"class":50},[33,43850,212],{"class":163},[33,43852,10791],{"class":50},[33,43854,247],{"class":167},[33,43856,43857,43860,43862,43864,43866,43868,43870,43872],{"class":35,"line":224},[33,43858,43859],{"class":167},"    y_tolerance: ",[33,43861,1720],{"class":50},[33,43863,2850],{"class":163},[33,43865,7657],{"class":50},[33,43867,212],{"class":163},[33,43869,7657],{"class":50},[33,43871,1166],{"class":167},[33,43873,43874],{"class":39},"# None → auto-detect from median line height\n",[33,43876,43877,43880,43882,43884,43886,43888],{"class":35,"line":229},[33,43878,43879],{"class":167},"    x_tolerance: ",[33,43881,1059],{"class":50},[33,43883,212],{"class":163},[33,43885,7451],{"class":50},[33,43887,1182],{"class":167},[33,43889,43890],{"class":39},"# merge kerned\u002Fhyphenated glyphs within 2pt\n",[33,43892,43893,43896,43898],{"class":35,"line":235},[33,43894,43895],{"class":167},") -> list[list[",[33,43897,1053],{"class":50},[33,43899,43900],{"class":167},"]]:\n",[33,43902,43903],{"class":35,"line":250},[33,43904,7673],{"class":54},[33,43906,43907],{"class":35,"line":266},[33,43908,43909],{"class":54},"    Extract text rows from a PDF page using x\u002Fy coordinate sorting.\n",[33,43911,43912],{"class":35,"line":290},[33,43913,43914],{"class":54},"    Returns a list of rows, where each row is a list of word 
strings.\n",[33,43916,43917],{"class":35,"line":295},[33,43918,7673],{"class":54},[33,43920,43921,43923],{"class":35,"line":300},[33,43922,2424],{"class":163},[33,43924,574],{"class":167},[33,43926,43927,43929,43931,43933],{"class":35,"line":317},[33,43928,2191],{"class":163},[33,43930,7123],{"class":167},[33,43932,495],{"class":163},[33,43934,686],{"class":167},[33,43936,43937,43939,43941],{"class":35,"line":332},[33,43938,9865],{"class":167},[33,43940,242],{"class":163},[33,43942,43293],{"class":167},[33,43944,43945],{"class":35,"line":347},[33,43946,43947],{"class":39},"            # extract_words returns dicts with x0, x1, top, bottom, text\n",[33,43949,43950,43952,43954,43956,43958,43960,43963],{"class":35,"line":374},[33,43951,43298],{"class":167},[33,43953,242],{"class":163},[33,43955,43303],{"class":167},[33,43957,43306],{"class":238},[33,43959,242],{"class":163},[33,43961,43962],{"class":167},"x_tolerance)  ",[33,43964,43965],{"class":39},"# changed: was extract_text()\n",[33,43967,43968,43970,43972,43974],{"class":35,"line":397},[33,43969,2449],{"class":163},[33,43971,783],{"class":50},[33,43973,1852],{"class":163},[33,43975,7583],{"class":167},[33,43977,43978,43980,43982,43984,43986,43989,43991,43993,43995,43997,43999,44001,44003,44005,44007,44009],{"class":35,"line":653},[33,43979,4051],{"class":163},[33,43981,7590],{"class":50},[33,43983,602],{"class":167},[33,43985,4059],{"class":163},[33,43987,43988],{"class":54},"\"Failed to read 
",[33,43990,1115],{"class":50},[33,43992,2580],{"class":167},[33,43994,1121],{"class":50},[33,43996,2079],{"class":54},[33,43998,1115],{"class":50},[33,44000,7602],{"class":167},[33,44002,1121],{"class":50},[33,44004,274],{"class":54},[33,44006,1649],{"class":167},[33,44008,190],{"class":163},[33,44010,7613],{"class":167},[33,44012,44013],{"class":35,"line":667},[33,44014,92],{"emptyLinePlaceholder":91},[33,44016,44017,44019,44021],{"class":35,"line":675},[33,44018,617],{"class":163},[33,44020,620],{"class":163},[33,44022,43370],{"class":167},[33,44024,44025,44027],{"class":35,"line":689},[33,44026,1659],{"class":163},[33,44028,589],{"class":167},[33,44030,44031],{"class":35,"line":703},[33,44032,92],{"emptyLinePlaceholder":91},[33,44034,44035],{"class":35,"line":714},[33,44036,44037],{"class":39},"    # Auto-detect y_tolerance from median glyph height\n",[33,44039,44040,44042,44045,44047,44049],{"class":35,"line":723},[33,44041,617],{"class":163},[33,44043,44044],{"class":167}," y_tolerance ",[33,44046,3847],{"class":163},[33,44048,7657],{"class":50},[33,44050,574],{"class":167},[33,44052,44053,44056,44058,44060,44062,44064,44066,44069,44071,44073,44075,44077,44079],{"class":35,"line":754},[33,44054,44055],{"class":167},"        heights ",[33,44057,242],{"class":163},[33,44059,43409],{"class":167},[33,44061,43436],{"class":54},[33,44063,763],{"class":167},[33,44065,4126],{"class":163},[33,44067,44068],{"class":167}," w[",[33,44070,43412],{"class":54},[33,44072,763],{"class":167},[33,44074,6124],{"class":163},[33,44076,43419],{"class":167},[33,44078,662],{"class":163},[33,44080,43424],{"class":167},[33,44082,44083,44086,44088,44091,44093,44095],{"class":35,"line":771},[33,44084,44085],{"class":167},"        y_tolerance ",[33,44087,242],{"class":163},[33,44089,44090],{"class":167}," statistics.median(heights) ",[33,44092,1769],{"class":163},[33,44094,43686],{"class":50},[33,44096,44097],{"class":39},"  # changed: was hardcoded 
3.0\n",[33,44099,44100],{"class":35,"line":777},[33,44101,92],{"emptyLinePlaceholder":91},[33,44103,44104],{"class":35,"line":788},[33,44105,44106],{"class":39},"    # Sort words top-to-bottom\n",[33,44108,44109,44112,44115,44118,44121,44123,44125],{"class":35,"line":804},[33,44110,44111],{"class":167},"    words.sort(",[33,44113,44114],{"class":238},"key",[33,44116,44117],{"class":163},"=lambda",[33,44119,44120],{"class":167}," w: w[",[33,44122,43412],{"class":54},[33,44124,27240],{"class":167},[33,44126,44127],{"class":39},"# changed: was not sorted\n",[33,44129,44130],{"class":35,"line":809},[33,44131,92],{"emptyLinePlaceholder":91},[33,44133,44134,44136,44138,44140,44142],{"class":35,"line":819},[33,44135,13076],{"class":167},[33,44137,37100],{"class":50},[33,44139,13081],{"class":167},[33,44141,242],{"class":163},[33,44143,589],{"class":167},[33,44145,44146,44149,44151,44153,44155,44158,44160],{"class":35,"line":829},[33,44147,44148],{"class":167},"    current_row: list[",[33,44150,37100],{"class":50},[33,44152,763],{"class":167},[33,44154,242],{"class":163},[33,44156,44157],{"class":167}," [words[",[33,44159,748],{"class":50},[33,44161,44162],{"class":167},"]]\n",[33,44164,44165,44168,44170,44172,44175,44177,44180,44182],{"class":35,"line":834},[33,44166,44167],{"class":167},"    current_top: ",[33,44169,1720],{"class":50},[33,44171,212],{"class":163},[33,44173,44174],{"class":167}," words[",[33,44176,748],{"class":50},[33,44178,44179],{"class":167},"][",[33,44181,43412],{"class":54},[33,44183,9202],{"class":167},[33,44185,44186],{"class":35,"line":839},[33,44187,92],{"emptyLinePlaceholder":91},[33,44189,44190,44192,44195,44197,44199,44201],{"class":35,"line":860},[33,44191,656],{"class":163},[33,44193,44194],{"class":167}," word 
",[33,44196,662],{"class":163},[33,44198,44174],{"class":167},[33,44200,734],{"class":50},[33,44202,43533],{"class":167},[33,44204,44205,44207,44209,44212,44214,44216,44218,44221,44224,44227],{"class":35,"line":887},[33,44206,8221],{"class":163},[33,44208,43543],{"class":50},[33,44210,44211],{"class":167},"(word[",[33,44213,43412],{"class":54},[33,44215,763],{"class":167},[33,44217,4126],{"class":163},[33,44219,44220],{"class":167}," current_top) ",[33,44222,44223],{"class":163},"\u003C=",[33,44225,44226],{"class":167}," y_tolerance:  ",[33,44228,44229],{"class":39},"# same visual row\n",[33,44231,44232],{"class":35,"line":907},[33,44233,44234],{"class":167},"            current_row.append(word)\n",[33,44236,44237,44239],{"class":35,"line":1826},[33,44238,41290],{"class":163},[33,44240,574],{"class":167},[33,44242,44243,44246,44248,44250,44252,44254,44257],{"class":35,"line":1844},[33,44244,44245],{"class":167},"            current_row.sort(",[33,44247,44114],{"class":238},[33,44249,44117],{"class":163},[33,44251,44120],{"class":167},[33,44253,43582],{"class":54},[33,44255,44256],{"class":167},"])        ",[33,44258,44259],{"class":39},"# changed: sort left-to-right\n",[33,44261,44262,44265,44267,44269,44271,44273,44275],{"class":35,"line":1858},[33,44263,44264],{"class":167},"            rows.append([w[",[33,44266,3459],{"class":54},[33,44268,763],{"class":167},[33,44270,6124],{"class":163},[33,44272,43419],{"class":167},[33,44274,662],{"class":163},[33,44276,44277],{"class":167}," current_row])\n",[33,44279,44280,44283,44285],{"class":35,"line":1871},[33,44281,44282],{"class":167},"            current_row ",[33,44284,242],{"class":163},[33,44286,44287],{"class":167}," [word]\n",[33,44289,44290,44293,44295,44298,44300],{"class":35,"line":1877},[33,44291,44292],{"class":167},"            current_top ",[33,44294,242],{"class":163},[33,44296,44297],{"class":167}," 
word[",[33,44299,43412],{"class":54},[33,44301,9202],{"class":167},[33,44303,44304],{"class":35,"line":1883},[33,44305,92],{"emptyLinePlaceholder":91},[33,44307,44308,44310],{"class":35,"line":1915},[33,44309,617],{"class":163},[33,44311,44312],{"class":167}," current_row:\n",[33,44314,44315,44318,44320,44322,44324,44326],{"class":35,"line":1926},[33,44316,44317],{"class":167},"        current_row.sort(",[33,44319,44114],{"class":238},[33,44321,44117],{"class":163},[33,44323,44120],{"class":167},[33,44325,43582],{"class":54},[33,44327,751],{"class":167},[33,44329,44330,44333,44335,44337,44339,44341,44343],{"class":35,"line":1932},[33,44331,44332],{"class":167},"        rows.append([w[",[33,44334,3459],{"class":54},[33,44336,763],{"class":167},[33,44338,6124],{"class":163},[33,44340,43419],{"class":167},[33,44342,662],{"class":163},[33,44344,44277],{"class":167},[33,44346,44347],{"class":35,"line":1938},[33,44348,92],{"emptyLinePlaceholder":91},[33,44350,44351,44353],{"class":35,"line":1950},[33,44352,1332],{"class":163},[33,44354,44355],{"class":167}," rows\n",[33,44357,44358],{"class":35,"line":1958},[33,44359,92],{"emptyLinePlaceholder":91},[33,44361,44362,44364,44366,44368,44370],{"class":35,"line":4904},[33,44363,2491],{"class":163},[33,44365,2494],{"class":50},[33,44367,2497],{"class":163},[33,44369,2500],{"class":54},[33,44371,574],{"class":167},[33,44373,44374,44376,44378,44380,44382,44384],{"class":35,"line":4909},[33,44375,863],{"class":50},[33,44377,866],{"class":167},[33,44379,878],{"class":238},[33,44381,242],{"class":163},[33,44383,855],{"class":50},[33,44385,221],{"class":167},[33,44387,44388,44391,44393,44396,44398],{"class":35,"line":4915},[33,44389,44390],{"class":167},"    rows ",[33,44392,242],{"class":163},[33,44394,44395],{"class":167}," 
extract_aligned_rows(",[33,44397,7076],{"class":50},[33,44399,221],{"class":167},[33,44401,44402,44404,44406],{"class":35,"line":4925},[33,44403,4025],{"class":167},[33,44405,242],{"class":163},[33,44407,44408],{"class":167}," pd.DataFrame(rows)\n",[33,44410,44411,44413,44415,44417,44419,44421,44423,44425,44428,44430,44432],{"class":35,"line":4935},[33,44412,39534],{"class":167},[33,44414,521],{"class":50},[33,44416,365],{"class":167},[33,44418,897],{"class":238},[33,44420,242],{"class":163},[33,44422,902],{"class":50},[33,44424,365],{"class":167},[33,44426,44427],{"class":238},"header",[33,44429,242],{"class":163},[33,44431,902],{"class":50},[33,44433,221],{"class":167},[33,44435,44436,44438,44440,44442,44445,44447,44450,44452,44454,44457,44459],{"class":35,"line":4941},[33,44437,7268],{"class":50},[33,44439,602],{"class":167},[33,44441,4059],{"class":163},[33,44443,44444],{"class":54},"\"Exported ",[33,44446,4065],{"class":50},[33,44448,44449],{"class":167},"(rows)",[33,44451,1121],{"class":50},[33,44453,919],{"class":54},[33,44455,44456],{"class":50},"{OUTPUT_PATH}",[33,44458,274],{"class":54},[33,44460,221],{"class":167},[14,44462,44463,44464,44467,44468,44470],{},"The four ",[30,44465,44466],{},"# changed:"," comments mark every line that differs from the naive ",[30,44469,43170],{}," approach.",[18,44472,44474,44475,44478],{"id":44473},"variant-fix-1-use-layouttrue-for-simple-single-column-misalignment","Variant Fix 1: Use ",[30,44476,44477],{},"layout=True"," for Simple Single-Column Misalignment",[14,44480,44481,44482,44484,44485,44487],{},"For documents where text flows in a single column but ",[30,44483,43170],{}," still scrambles order, the ",[30,44486,44477],{}," parameter re-sorts glyphs spatially before assembling output:",[23,44489,44491],{"className":126,"code":44490,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nfrom pathlib import Path\nimport pdfplumber\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ndef extract_with_layout(path: 
Path) -> str:\n    \"\"\"Use layout=True to restore reading order without manual coordinate work.\"\"\"\n    with pdfplumber.open(path) as pdf:\n        pages_text = []\n        for page in pdf.pages:\n            # layout=True uses pdfminer's LAParams to sort text spatially\n            text = page.extract_text(layout=True)   # changed: added layout=True\n            if text:\n                pages_text.append(text)\n    return \"\\n\\n\".join(pages_text)\n\nif __name__ == \"__main__\":\n    text = extract_with_layout(PDF_PATH)\n    print(text[:500])\n",[30,44492,44493,44497,44507,44513,44517,44529,44533,44546,44551,44561,44570,44580,44585,44607,44614,44619,44634,44638,44650,44664],{"__ignoreMap":28},[33,44494,44495],{"class":35,"line":36},[33,44496,9763],{"class":39},[33,44498,44499,44501,44503,44505],{"class":35,"line":43},[33,44500,190],{"class":163},[33,44502,193],{"class":167},[33,44504,164],{"class":163},[33,44506,198],{"class":167},[33,44508,44509,44511],{"class":35,"line":61},[33,44510,164],{"class":163},[33,44512,485],{"class":167},[33,44514,44515],{"class":35,"line":73},[33,44516,92],{"emptyLinePlaceholder":91},[33,44518,44519,44521,44523,44525,44527],{"class":35,"line":88},[33,44520,7076],{"class":50},[33,44522,212],{"class":163},[33,44524,215],{"class":167},[33,44526,7083],{"class":54},[33,44528,221],{"class":167},[33,44530,44531],{"class":35,"line":95},[33,44532,92],{"emptyLinePlaceholder":91},[33,44534,44535,44537,44540,44542,44544],{"class":35,"line":101},[33,44536,562],{"class":163},[33,44538,44539],{"class":46}," extract_with_layout",[33,44541,3743],{"class":167},[33,44543,1053],{"class":50},[33,44545,574],{"class":167},[33,44547,44548],{"class":35,"line":171},[33,44549,44550],{"class":54},"    \"\"\"Use layout=True to restore reading order without manual coordinate 
work.\"\"\"\n",[33,44552,44553,44555,44557,44559],{"class":35,"line":179},[33,44554,1635],{"class":163},[33,44556,7123],{"class":167},[33,44558,495],{"class":163},[33,44560,686],{"class":167},[33,44562,44563,44566,44568],{"class":35,"line":187},[33,44564,44565],{"class":167},"        pages_text ",[33,44567,242],{"class":163},[33,44569,589],{"class":167},[33,44571,44572,44574,44576,44578],{"class":35,"line":201},[33,44573,5973],{"class":163},[33,44575,695],{"class":167},[33,44577,662],{"class":163},[33,44579,700],{"class":167},[33,44581,44582],{"class":35,"line":206},[33,44583,44584],{"class":39},"            # layout=True uses pdfminer's LAParams to sort text spatially\n",[33,44586,44587,44590,44592,44595,44598,44600,44602,44604],{"class":35,"line":224},[33,44588,44589],{"class":167},"            text ",[33,44591,242],{"class":163},[33,44593,44594],{"class":167}," page.extract_text(",[33,44596,44597],{"class":238},"layout",[33,44599,242],{"class":163},[33,44601,855],{"class":50},[33,44603,12000],{"class":167},[33,44605,44606],{"class":39},"# changed: added layout=True\n",[33,44608,44609,44611],{"class":35,"line":229},[33,44610,5995],{"class":163},[33,44612,44613],{"class":167}," text:\n",[33,44615,44616],{"class":35,"line":235},[33,44617,44618],{"class":167},"                pages_text.append(text)\n",[33,44620,44621,44623,44626,44629,44631],{"class":35,"line":250},[33,44622,1332],{"class":163},[33,44624,44625],{"class":54}," \"",[33,44627,44628],{"class":50},"\\n\\n",[33,44630,274],{"class":54},[33,44632,44633],{"class":167},".join(pages_text)\n",[33,44635,44636],{"class":35,"line":266},[33,44637,92],{"emptyLinePlaceholder":91},[33,44639,44640,44642,44644,44646,44648],{"class":35,"line":290},[33,44641,2491],{"class":163},[33,44643,2494],{"class":50},[33,44645,2497],{"class":163},[33,44647,2500],{"class":54},[33,44649,574],{"class":167},[33,44651,44652,44655,44657,44660,44662],{"class":35,"line":295},[33,44653,44654],{"class":167},"    text 
",[33,44656,242],{"class":163},[33,44658,44659],{"class":167}," extract_with_layout(",[33,44661,7076],{"class":50},[33,44663,221],{"class":167},[33,44665,44666,44668,44671,44673],{"class":35,"line":300},[33,44667,7268],{"class":50},[33,44669,44670],{"class":167},"(text[:",[33,44672,13437],{"class":50},[33,44674,751],{"class":167},[14,44676,44677,44679],{},[30,44678,44477],{}," is simpler but less precise than the word-level approach — it works for single-column text but still merges columns in multi-column PDFs.",[18,44681,44683,44684],{"id":44682},"variant-fix-2-char-level-clustering-with-chars","Variant Fix 2: Char-Level Clustering with ",[30,44685,44686],{},"chars",[14,44688,44689,44690,44692],{},"When even ",[30,44691,43183],{}," splits tokens incorrectly (common with PDFs generated from LaTeX or some CAD tools), drop to character-level objects and cluster manually:",[23,44694,44696],{"className":126,"code":44695,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nfrom pathlib import Path\nimport pdfplumber\n\nPDF_PATH = Path(\"data\u002Fcad_export.pdf\")\n\ndef cluster_chars(path: Path, page_idx: int = 0, y_tol: float = 2.0, x_merge: float = 1.5) -> list[str]:\n    \"\"\"Reconstruct text from individual characters for heavily fragmented PDFs.\"\"\"\n    with pdfplumber.open(path) as pdf:\n        chars = pdf.pages[page_idx].chars  # list of dicts with x0, top, text, etc.\n\n    if not chars:\n        return []\n\n    chars = sorted(chars, key=lambda c: (round(c[\"top\"] \u002F y_tol), c[\"x0\"]))\n\n    lines: list[str] = []\n    current_line: list[dict] = [chars[0]]\n    for ch in chars[1:]:\n        prev = current_line[-1]\n        same_row = abs(ch[\"top\"] - prev[\"top\"]) \u003C y_tol\n        adjacent = (ch[\"x0\"] - prev[\"x1\"]) \u003C x_merge\n        if same_row:\n            # Insert space if there is a gap wider than x_merge\n            if not adjacent:\n                current_line.append({\"text\": \" \", \"x0\": prev[\"x1\"], \"x1\": 
prev[\"x1\"], \"top\": prev[\"top\"]})\n            current_line.append(ch)\n        else:\n            lines.append(\"\".join(c[\"text\"] for c in current_line))\n            current_line = [ch]\n\n    if current_line:\n        lines.append(\"\".join(c[\"text\"] for c in current_line))\n\n    return lines\n\nif __name__ == \"__main__\":\n    lines = cluster_chars(PDF_PATH)\n    for line in lines[:10]:\n        print(repr(line))\n",[30,44697,44698,44702,44712,44718,44722,44735,44739,44780,44785,44795,44808,44812,44821,44827,44831,44868,44872,44885,44903,44919,44935,44965,44992,44999,45004,45013,45052,45057,45063,45086,45096,45100,45107,45128,45132,45139,45143,45155,45169,45184],{"__ignoreMap":28},[33,44699,44700],{"class":35,"line":36},[33,44701,9763],{"class":39},[33,44703,44704,44706,44708,44710],{"class":35,"line":43},[33,44705,190],{"class":163},[33,44707,193],{"class":167},[33,44709,164],{"class":163},[33,44711,198],{"class":167},[33,44713,44714,44716],{"class":35,"line":61},[33,44715,164],{"class":163},[33,44717,485],{"class":167},[33,44719,44720],{"class":35,"line":73},[33,44721,92],{"emptyLinePlaceholder":91},[33,44723,44724,44726,44728,44730,44733],{"class":35,"line":88},[33,44725,7076],{"class":50},[33,44727,212],{"class":163},[33,44729,215],{"class":167},[33,44731,44732],{"class":54},"\"data\u002Fcad_export.pdf\"",[33,44734,221],{"class":167},[33,44736,44737],{"class":35,"line":95},[33,44738,92],{"emptyLinePlaceholder":91},[33,44740,44741,44743,44746,44748,44750,44752,44754,44757,44759,44761,44763,44766,44768,44770,44773,44776,44778],{"class":35,"line":101},[33,44742,562],{"class":163},[33,44744,44745],{"class":46}," cluster_chars",[33,44747,43251],{"class":167},[33,44749,1059],{"class":50},[33,44751,212],{"class":163},[33,44753,10791],{"class":50},[33,44755,44756],{"class":167},", y_tol: ",[33,44758,1720],{"class":50},[33,44760,212],{"class":163},[33,44762,1725],{"class":50},[33,44764,44765],{"class":167},", x_merge: 
",[33,44767,1720],{"class":50},[33,44769,212],{"class":163},[33,44771,44772],{"class":50}," 1.5",[33,44774,44775],{"class":167},") -> list[",[33,44777,1053],{"class":50},[33,44779,17477],{"class":167},[33,44781,44782],{"class":35,"line":171},[33,44783,44784],{"class":54},"    \"\"\"Reconstruct text from individual characters for heavily fragmented PDFs.\"\"\"\n",[33,44786,44787,44789,44791,44793],{"class":35,"line":179},[33,44788,1635],{"class":163},[33,44790,7123],{"class":167},[33,44792,495],{"class":163},[33,44794,686],{"class":167},[33,44796,44797,44800,44802,44805],{"class":35,"line":187},[33,44798,44799],{"class":167},"        chars ",[33,44801,242],{"class":163},[33,44803,44804],{"class":167}," pdf.pages[page_idx].chars  ",[33,44806,44807],{"class":39},"# list of dicts with x0, top, text, etc.\n",[33,44809,44810],{"class":35,"line":201},[33,44811,92],{"emptyLinePlaceholder":91},[33,44813,44814,44816,44818],{"class":35,"line":206},[33,44815,617],{"class":163},[33,44817,620],{"class":163},[33,44819,44820],{"class":167}," chars:\n",[33,44822,44823,44825],{"class":35,"line":224},[33,44824,1659],{"class":163},[33,44826,589],{"class":167},[33,44828,44829],{"class":35,"line":229},[33,44830,92],{"emptyLinePlaceholder":91},[33,44832,44833,44836,44838,44840,44843,44845,44847,44850,44852,44855,44857,44859,44861,44864,44866],{"class":35,"line":235},[33,44834,44835],{"class":167},"    chars ",[33,44837,242],{"class":163},[33,44839,28924],{"class":50},[33,44841,44842],{"class":167},"(chars, ",[33,44844,44114],{"class":238},[33,44846,44117],{"class":163},[33,44848,44849],{"class":167}," c: (",[33,44851,43654],{"class":50},[33,44853,44854],{"class":167},"(c[",[33,44856,43412],{"class":54},[33,44858,763],{"class":167},[33,44860,1351],{"class":163},[33,44862,44863],{"class":167}," y_tol), 
c[",[33,44865,43582],{"class":54},[33,44867,7211],{"class":167},[33,44869,44870],{"class":35,"line":250},[33,44871,92],{"emptyLinePlaceholder":91},[33,44873,44874,44877,44879,44881,44883],{"class":35,"line":266},[33,44875,44876],{"class":167},"    lines: list[",[33,44878,1053],{"class":50},[33,44880,763],{"class":167},[33,44882,242],{"class":163},[33,44884,589],{"class":167},[33,44886,44887,44890,44892,44894,44896,44899,44901],{"class":35,"line":290},[33,44888,44889],{"class":167},"    current_line: list[",[33,44891,37100],{"class":50},[33,44893,763],{"class":167},[33,44895,242],{"class":163},[33,44897,44898],{"class":167}," [chars[",[33,44900,748],{"class":50},[33,44902,44162],{"class":167},[33,44904,44905,44907,44910,44912,44915,44917],{"class":35,"line":295},[33,44906,656],{"class":163},[33,44908,44909],{"class":167}," ch ",[33,44911,662],{"class":163},[33,44913,44914],{"class":167}," chars[",[33,44916,734],{"class":50},[33,44918,43533],{"class":167},[33,44920,44921,44924,44926,44929,44931,44933],{"class":35,"line":300},[33,44922,44923],{"class":167},"        prev ",[33,44925,242],{"class":163},[33,44927,44928],{"class":167}," current_line[",[33,44930,4126],{"class":163},[33,44932,734],{"class":50},[33,44934,9202],{"class":167},[33,44936,44937,44940,44942,44944,44947,44949,44951,44953,44956,44958,44960,44962],{"class":35,"line":317},[33,44938,44939],{"class":167},"        same_row ",[33,44941,242],{"class":163},[33,44943,43543],{"class":50},[33,44945,44946],{"class":167},"(ch[",[33,44948,43412],{"class":54},[33,44950,763],{"class":167},[33,44952,4126],{"class":163},[33,44954,44955],{"class":167}," prev[",[33,44957,43412],{"class":54},[33,44959,8675],{"class":167},[33,44961,4043],{"class":163},[33,44963,44964],{"class":167}," y_tol\n",[33,44966,44967,44970,44972,44975,44977,44979,44981,44983,44985,44987,44989],{"class":35,"line":332},[33,44968,44969],{"class":167},"        adjacent ",[33,44971,242],{"class":163},[33,44973,44974],{"class":167}," 
(ch[",[33,44976,43582],{"class":54},[33,44978,763],{"class":167},[33,44980,4126],{"class":163},[33,44982,44955],{"class":167},[33,44984,43592],{"class":54},[33,44986,8675],{"class":167},[33,44988,4043],{"class":163},[33,44990,44991],{"class":167}," x_merge\n",[33,44993,44994,44996],{"class":35,"line":347},[33,44995,8221],{"class":163},[33,44997,44998],{"class":167}," same_row:\n",[33,45000,45001],{"class":35,"line":374},[33,45002,45003],{"class":39},"            # Insert space if there is a gap wider than x_merge\n",[33,45005,45006,45008,45010],{"class":35,"line":397},[33,45007,5995],{"class":163},[33,45009,620],{"class":163},[33,45011,45012],{"class":167}," adjacent:\n",[33,45014,45015,45018,45020,45022,45024,45026,45028,45031,45033,45035,45037,45039,45041,45043,45045,45047,45049],{"class":35,"line":653},[33,45016,45017],{"class":167},"                current_line.append({",[33,45019,3459],{"class":54},[33,45021,2079],{"class":167},[33,45023,17294],{"class":54},[33,45025,365],{"class":167},[33,45027,43582],{"class":54},[33,45029,45030],{"class":167},": prev[",[33,45032,43592],{"class":54},[33,45034,8314],{"class":167},[33,45036,43592],{"class":54},[33,45038,45030],{"class":167},[33,45040,43592],{"class":54},[33,45042,8314],{"class":167},[33,45044,43412],{"class":54},[33,45046,45030],{"class":167},[33,45048,43412],{"class":54},[33,45050,45051],{"class":167},"]})\n",[33,45053,45054],{"class":35,"line":667},[33,45055,45056],{"class":167},"            current_line.append(ch)\n",[33,45058,45059,45061],{"class":35,"line":675},[33,45060,41290],{"class":163},[33,45062,574],{"class":167},[33,45064,45065,45068,45070,45073,45075,45077,45079,45081,45083],{"class":35,"line":689},[33,45066,45067],{"class":167},"            
lines.append(",[33,45069,3198],{"class":54},[33,45071,45072],{"class":167},".join(c[",[33,45074,3459],{"class":54},[33,45076,763],{"class":167},[33,45078,6124],{"class":163},[33,45080,7486],{"class":167},[33,45082,662],{"class":163},[33,45084,45085],{"class":167}," current_line))\n",[33,45087,45088,45091,45093],{"class":35,"line":703},[33,45089,45090],{"class":167},"            current_line ",[33,45092,242],{"class":163},[33,45094,45095],{"class":167}," [ch]\n",[33,45097,45098],{"class":35,"line":714},[33,45099,92],{"emptyLinePlaceholder":91},[33,45101,45102,45104],{"class":35,"line":723},[33,45103,617],{"class":163},[33,45105,45106],{"class":167}," current_line:\n",[33,45108,45109,45112,45114,45116,45118,45120,45122,45124,45126],{"class":35,"line":754},[33,45110,45111],{"class":167},"        lines.append(",[33,45113,3198],{"class":54},[33,45115,45072],{"class":167},[33,45117,3459],{"class":54},[33,45119,763],{"class":167},[33,45121,6124],{"class":163},[33,45123,7486],{"class":167},[33,45125,662],{"class":163},[33,45127,45085],{"class":167},[33,45129,45130],{"class":35,"line":771},[33,45131,92],{"emptyLinePlaceholder":91},[33,45133,45134,45136],{"class":35,"line":777},[33,45135,1332],{"class":163},[33,45137,45138],{"class":167}," lines\n",[33,45140,45141],{"class":35,"line":788},[33,45142,92],{"emptyLinePlaceholder":91},[33,45144,45145,45147,45149,45151,45153],{"class":35,"line":804},[33,45146,2491],{"class":163},[33,45148,2494],{"class":50},[33,45150,2497],{"class":163},[33,45152,2500],{"class":54},[33,45154,574],{"class":167},[33,45156,45157,45160,45162,45165,45167],{"class":35,"line":809},[33,45158,45159],{"class":167},"    lines ",[33,45161,242],{"class":163},[33,45163,45164],{"class":167}," cluster_chars(",[33,45166,7076],{"class":50},[33,45168,221],{"class":167},[33,45170,45171,45173,45175,45177,45180,45182],{"class":35,"line":819},[33,45172,656],{"class":163},[33,45174,13133],{"class":167},[33,45176,662],{"class":163},[33,45178,45179],{"class":167}," 
lines[:",[33,45181,3545],{"class":50},[33,45183,17477],{"class":167},[33,45185,45186,45188,45190,45193],{"class":35,"line":829},[33,45187,9414],{"class":50},[33,45189,602],{"class":167},[33,45191,45192],{"class":50},"repr",[33,45194,45195],{"class":167},"(line))\n",[18,45197,45199],{"id":45198},"handling-multi-page-documents","Handling Multi-Page Documents",[14,45201,45202],{},"When extracting from a document with several pages, iterate and keep the first row of the first page as the canonical header. Do not concatenate raw word lists before promoting a header — the header will appear duplicated on every page if the PDF was generated with page-level table repeats.",[23,45204,45206],{"className":126,"code":45205,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\nimport statistics\n\nPDF_PATH = Path(\"data\u002Fmulti_page_report.pdf\")\nOUTPUT_PATH = Path(\"output\u002Fmulti_page_aligned.csv\")\n\ndef extract_all_pages(path: Path, x_tolerance: int = 2) -> pd.DataFrame:\n    \"\"\"\n    Extract aligned rows from every page of a PDF and return a single DataFrame.\n    Promotes the first row of the first page as column headers.\n    Drops repeated header rows on subsequent pages.\n    \"\"\"\n    all_rows: list[list[str]] = []\n    header: list[str] | None = None\n\n    try:\n        with pdfplumber.open(path) as pdf:\n            for page in pdf.pages:\n                words = page.extract_words(x_tolerance=x_tolerance)\n                if not words:\n                    continue\n\n                heights = [w[\"bottom\"] - w[\"top\"] for w in words]\n                y_tol = statistics.median(heights) * 0.4\n\n                words.sort(key=lambda w: w[\"top\"])\n                rows: list[list[str]] = []\n                cur_row = [words[0]]\n                cur_top = words[0][\"top\"]\n\n                for word in words[1:]:\n                    if abs(word[\"top\"] - cur_top) 
\u003C= y_tol:\n                        cur_row.append(word)\n                    else:\n                        cur_row.sort(key=lambda w: w[\"x0\"])\n                        rows.append([w[\"text\"] for w in cur_row])\n                        cur_row = [word]\n                        cur_top = word[\"top\"]\n                if cur_row:\n                    cur_row.sort(key=lambda w: w[\"x0\"])\n                    rows.append([w[\"text\"] for w in cur_row])\n\n                if header is None and rows:\n                    header = rows[0]\n                    rows = rows[1:]\n\n                # Drop rows that match the canonical header (page-break repeats)\n                all_rows.extend(r for r in rows if r != header)\n\n    except Exception as e:\n        raise RuntimeError(f\"Extraction failed on {path}: {e}\") from e\n\n    if not all_rows:\n        return pd.DataFrame()\n\n    max_cols = max(len(r) for r in all_rows)\n    padded = [r + [\"\"] * (max_cols - len(r)) for r in all_rows]\n    df = pd.DataFrame(padded, columns=(header or list(range(max_cols))))\n    df.replace(\"\", pd.NA, inplace=True)\n    return df\n\nif __name__ == \"__main__\":\n    OUTPUT_PATH.parent.mkdir(exist_ok=True)\n    df = extract_all_pages(PDF_PATH)\n    df.to_csv(OUTPUT_PATH, index=False)\n    print(f\"Exported {len(df)} rows × {df.shape[1]} 
cols\")\n",[30,45207,45208,45212,45222,45228,45238,45244,45248,45261,45274,45278,45296,45300,45305,45310,45315,45319,45332,45349,45353,45359,45369,45379,45394,45402,45406,45410,45439,45453,45457,45472,45485,45498,45515,45519,45533,45555,45560,45566,45581,45599,45608,45621,45628,45643,45660,45664,45678,45690,45703,45707,45712,45736,45740,45750,45785,45789,45798,45804,45808,45834,45873,45901,45922,45928,45932,45944,45958,45971,45987],{"__ignoreMap":28},[33,45209,45210],{"class":35,"line":36},[33,45211,7041],{"class":39},[33,45213,45214,45216,45218,45220],{"class":35,"line":43},[33,45215,190],{"class":163},[33,45217,193],{"class":167},[33,45219,164],{"class":163},[33,45221,198],{"class":167},[33,45223,45224,45226],{"class":35,"line":61},[33,45225,164],{"class":163},[33,45227,485],{"class":167},[33,45229,45230,45232,45234,45236],{"class":35,"line":73},[33,45231,164],{"class":163},[33,45233,492],{"class":167},[33,45235,495],{"class":163},[33,45237,498],{"class":167},[33,45239,45240,45242],{"class":35,"line":88},[33,45241,164],{"class":163},[33,45243,43221],{"class":167},[33,45245,45246],{"class":35,"line":95},[33,45247,92],{"emptyLinePlaceholder":91},[33,45249,45250,45252,45254,45256,45259],{"class":35,"line":101},[33,45251,7076],{"class":50},[33,45253,212],{"class":163},[33,45255,215],{"class":167},[33,45257,45258],{"class":54},"\"data\u002Fmulti_page_report.pdf\"",[33,45260,221],{"class":167},[33,45262,45263,45265,45267,45269,45272],{"class":35,"line":171},[33,45264,521],{"class":50},[33,45266,212],{"class":163},[33,45268,215],{"class":167},[33,45270,45271],{"class":54},"\"output\u002Fmulti_page_aligned.csv\"",[33,45273,221],{"class":167},[33,45275,45276],{"class":35,"line":179},[33,45277,92],{"emptyLinePlaceholder":91},[33,45279,45280,45282,45285,45288,45290,45292,45294],{"class":35,"line":187},[33,45281,562],{"class":163},[33,45283,45284],{"class":46}," extract_all_pages",[33,45286,45287],{"class":167},"(path: Path, x_tolerance: 
",[33,45289,1059],{"class":50},[33,45291,212],{"class":163},[33,45293,7451],{"class":50},[33,45295,7668],{"class":167},[33,45297,45298],{"class":35,"line":201},[33,45299,7673],{"class":54},[33,45301,45302],{"class":35,"line":206},[33,45303,45304],{"class":54},"    Extract aligned rows from every page of a PDF and return a single DataFrame.\n",[33,45306,45307],{"class":35,"line":224},[33,45308,45309],{"class":54},"    Promotes the first row of the first page as column headers.\n",[33,45311,45312],{"class":35,"line":229},[33,45313,45314],{"class":54},"    Drops repeated header rows on subsequent pages.\n",[33,45316,45317],{"class":35,"line":235},[33,45318,7673],{"class":54},[33,45320,45321,45324,45326,45328,45330],{"class":35,"line":250},[33,45322,45323],{"class":167},"    all_rows: list[list[",[33,45325,1053],{"class":50},[33,45327,13081],{"class":167},[33,45329,242],{"class":163},[33,45331,589],{"class":167},[33,45333,45334,45337,45339,45341,45343,45345,45347],{"class":35,"line":266},[33,45335,45336],{"class":167},"    header: 
list[",[33,45338,1053],{"class":50},[33,45340,763],{"class":167},[33,45342,7654],{"class":163},[33,45344,7657],{"class":50},[33,45346,212],{"class":163},[33,45348,3852],{"class":50},[33,45350,45351],{"class":35,"line":290},[33,45352,92],{"emptyLinePlaceholder":91},[33,45354,45355,45357],{"class":35,"line":295},[33,45356,2424],{"class":163},[33,45358,574],{"class":167},[33,45360,45361,45363,45365,45367],{"class":35,"line":300},[33,45362,2191],{"class":163},[33,45364,7123],{"class":167},[33,45366,495],{"class":163},[33,45368,686],{"class":167},[33,45370,45371,45373,45375,45377],{"class":35,"line":317},[33,45372,1793],{"class":163},[33,45374,695],{"class":167},[33,45376,662],{"class":163},[33,45378,700],{"class":167},[33,45380,45381,45383,45385,45387,45389,45391],{"class":35,"line":332},[33,45382,37179],{"class":167},[33,45384,242],{"class":163},[33,45386,43303],{"class":167},[33,45388,43306],{"class":238},[33,45390,242],{"class":163},[33,45392,45393],{"class":167},"x_tolerance)\n",[33,45395,45396,45398,45400],{"class":35,"line":347},[33,45397,7170],{"class":163},[33,45399,620],{"class":163},[33,45401,43370],{"class":167},[33,45403,45404],{"class":35,"line":374},[33,45405,8629],{"class":163},[33,45407,45408],{"class":35,"line":397},[33,45409,92],{"emptyLinePlaceholder":91},[33,45411,45412,45415,45417,45419,45421,45423,45425,45427,45429,45431,45433,45435,45437],{"class":35,"line":653},[33,45413,45414],{"class":167},"                heights ",[33,45416,242],{"class":163},[33,45418,43409],{"class":167},[33,45420,43436],{"class":54},[33,45422,763],{"class":167},[33,45424,4126],{"class":163},[33,45426,44068],{"class":167},[33,45428,43412],{"class":54},[33,45430,763],{"class":167},[33,45432,6124],{"class":163},[33,45434,43419],{"class":167},[33,45436,662],{"class":163},[33,45438,43424],{"class":167},[33,45440,45441,45444,45446,45448,45450],{"class":35,"line":667},[33,45442,45443],{"class":167},"                y_tol 
",[33,45445,242],{"class":163},[33,45447,44090],{"class":167},[33,45449,1769],{"class":163},[33,45451,45452],{"class":50}," 0.4\n",[33,45454,45455],{"class":35,"line":675},[33,45456,92],{"emptyLinePlaceholder":91},[33,45458,45459,45462,45464,45466,45468,45470],{"class":35,"line":689},[33,45460,45461],{"class":167},"                words.sort(",[33,45463,44114],{"class":238},[33,45465,44117],{"class":163},[33,45467,44120],{"class":167},[33,45469,43412],{"class":54},[33,45471,751],{"class":167},[33,45473,45474,45477,45479,45481,45483],{"class":35,"line":703},[33,45475,45476],{"class":167},"                rows: list[list[",[33,45478,1053],{"class":50},[33,45480,13081],{"class":167},[33,45482,242],{"class":163},[33,45484,589],{"class":167},[33,45486,45487,45490,45492,45494,45496],{"class":35,"line":714},[33,45488,45489],{"class":167},"                cur_row ",[33,45491,242],{"class":163},[33,45493,44157],{"class":167},[33,45495,748],{"class":50},[33,45497,44162],{"class":167},[33,45499,45500,45503,45505,45507,45509,45511,45513],{"class":35,"line":723},[33,45501,45502],{"class":167},"                cur_top ",[33,45504,242],{"class":163},[33,45506,44174],{"class":167},[33,45508,748],{"class":50},[33,45510,44179],{"class":167},[33,45512,43412],{"class":54},[33,45514,9202],{"class":167},[33,45516,45517],{"class":35,"line":754},[33,45518,92],{"emptyLinePlaceholder":91},[33,45520,45521,45523,45525,45527,45529,45531],{"class":35,"line":771},[33,45522,692],{"class":163},[33,45524,44194],{"class":167},[33,45526,662],{"class":163},[33,45528,44174],{"class":167},[33,45530,734],{"class":50},[33,45532,43533],{"class":167},[33,45534,45535,45537,45539,45541,45543,45545,45547,45550,45552],{"class":35,"line":777},[33,45536,717],{"class":163},[33,45538,43543],{"class":50},[33,45540,44211],{"class":167},[33,45542,43412],{"class":54},[33,45544,763],{"class":167},[33,45546,4126],{"class":163},[33,45548,45549],{"class":167}," cur_top) 
",[33,45551,44223],{"class":163},[33,45553,45554],{"class":167}," y_tol:\n",[33,45556,45557],{"class":35,"line":788},[33,45558,45559],{"class":167},"                        cur_row.append(word)\n",[33,45561,45562,45564],{"class":35,"line":804},[33,45563,39369],{"class":163},[33,45565,574],{"class":167},[33,45567,45568,45571,45573,45575,45577,45579],{"class":35,"line":809},[33,45569,45570],{"class":167},"                        cur_row.sort(",[33,45572,44114],{"class":238},[33,45574,44117],{"class":163},[33,45576,44120],{"class":167},[33,45578,43582],{"class":54},[33,45580,751],{"class":167},[33,45582,45583,45586,45588,45590,45592,45594,45596],{"class":35,"line":819},[33,45584,45585],{"class":167},"                        rows.append([w[",[33,45587,3459],{"class":54},[33,45589,763],{"class":167},[33,45591,6124],{"class":163},[33,45593,43419],{"class":167},[33,45595,662],{"class":163},[33,45597,45598],{"class":167}," cur_row])\n",[33,45600,45601,45604,45606],{"class":35,"line":829},[33,45602,45603],{"class":167},"                        cur_row ",[33,45605,242],{"class":163},[33,45607,44287],{"class":167},[33,45609,45610,45613,45615,45617,45619],{"class":35,"line":834},[33,45611,45612],{"class":167},"                        cur_top ",[33,45614,242],{"class":163},[33,45616,44297],{"class":167},[33,45618,43412],{"class":54},[33,45620,9202],{"class":167},[33,45622,45623,45625],{"class":35,"line":839},[33,45624,7170],{"class":163},[33,45626,45627],{"class":167}," cur_row:\n",[33,45629,45630,45633,45635,45637,45639,45641],{"class":35,"line":860},[33,45631,45632],{"class":167},"                    cur_row.sort(",[33,45634,44114],{"class":238},[33,45636,44117],{"class":163},[33,45638,44120],{"class":167},[33,45640,43582],{"class":54},[33,45642,751],{"class":167},[33,45644,45645,45648,45650,45652,45654,45656,45658],{"class":35,"line":887},[33,45646,45647],{"class":167},"                    
rows.append([w[",[33,45649,3459],{"class":54},[33,45651,763],{"class":167},[33,45653,6124],{"class":163},[33,45655,43419],{"class":167},[33,45657,662],{"class":163},[33,45659,45598],{"class":167},[33,45661,45662],{"class":35,"line":907},[33,45663,92],{"emptyLinePlaceholder":91},[33,45665,45666,45668,45670,45672,45674,45676],{"class":35,"line":1826},[33,45667,7170],{"class":163},[33,45669,17788],{"class":167},[33,45671,3847],{"class":163},[33,45673,7657],{"class":50},[33,45675,5615],{"class":163},[33,45677,8723],{"class":167},[33,45679,45680,45682,45684,45686,45688],{"class":35,"line":1844},[33,45681,7468],{"class":167},[33,45683,242],{"class":163},[33,45685,13250],{"class":167},[33,45687,748],{"class":50},[33,45689,9202],{"class":167},[33,45691,45692,45695,45697,45699,45701],{"class":35,"line":1858},[33,45693,45694],{"class":167},"                    rows ",[33,45696,242],{"class":163},[33,45698,13250],{"class":167},[33,45700,734],{"class":50},[33,45702,39364],{"class":167},[33,45704,45705],{"class":35,"line":1871},[33,45706,92],{"emptyLinePlaceholder":91},[33,45708,45709],{"class":35,"line":1877},[33,45710,45711],{"class":39},"                # Drop rows that match the canonical header (page-break repeats)\n",[33,45713,45714,45717,45719,45722,45724,45727,45729,45731,45733],{"class":35,"line":1883},[33,45715,45716],{"class":167},"                all_rows.extend(r ",[33,45718,6124],{"class":163},[33,45720,45721],{"class":167}," r ",[33,45723,662],{"class":163},[33,45725,45726],{"class":167}," rows ",[33,45728,2491],{"class":163},[33,45730,45721],{"class":167},[33,45732,17877],{"class":163},[33,45734,45735],{"class":167}," 
header)\n",[33,45737,45738],{"class":35,"line":1915},[33,45739,92],{"emptyLinePlaceholder":91},[33,45741,45742,45744,45746,45748],{"class":35,"line":1926},[33,45743,2449],{"class":163},[33,45745,783],{"class":50},[33,45747,1852],{"class":163},[33,45749,7583],{"class":167},[33,45751,45752,45754,45756,45758,45760,45763,45765,45767,45769,45771,45773,45775,45777,45779,45781,45783],{"class":35,"line":1932},[33,45753,4051],{"class":163},[33,45755,7590],{"class":50},[33,45757,602],{"class":167},[33,45759,4059],{"class":163},[33,45761,45762],{"class":54},"\"Extraction failed on ",[33,45764,1115],{"class":50},[33,45766,2580],{"class":167},[33,45768,1121],{"class":50},[33,45770,2079],{"class":54},[33,45772,1115],{"class":50},[33,45774,7602],{"class":167},[33,45776,1121],{"class":50},[33,45778,274],{"class":54},[33,45780,1649],{"class":167},[33,45782,190],{"class":163},[33,45784,7613],{"class":167},[33,45786,45787],{"class":35,"line":1938},[33,45788,92],{"emptyLinePlaceholder":91},[33,45790,45791,45793,45795],{"class":35,"line":1950},[33,45792,617],{"class":163},[33,45794,620],{"class":163},[33,45796,45797],{"class":167}," all_rows:\n",[33,45799,45800,45802],{"class":35,"line":1958},[33,45801,1659],{"class":163},[33,45803,7721],{"class":167},[33,45805,45806],{"class":35,"line":4904},[33,45807,92],{"emptyLinePlaceholder":91},[33,45809,45810,45813,45815,45818,45820,45822,45825,45827,45829,45831],{"class":35,"line":4909},[33,45811,45812],{"class":167},"    max_cols ",[33,45814,242],{"class":163},[33,45816,45817],{"class":50}," max",[33,45819,602],{"class":167},[33,45821,928],{"class":50},[33,45823,45824],{"class":167},"(r) ",[33,45826,6124],{"class":163},[33,45828,45721],{"class":167},[33,45830,662],{"class":163},[33,45832,45833],{"class":167}," all_rows)\n",[33,45835,45836,45839,45841,45844,45846,45848,45850,45852,45854,45857,45859,45861,45864,45866,45868,45870],{"class":35,"line":4915},[33,45837,45838],{"class":167},"    padded 
",[33,45840,242],{"class":163},[33,45842,45843],{"class":167}," [r ",[33,45845,1811],{"class":163},[33,45847,9178],{"class":167},[33,45849,3198],{"class":54},[33,45851,763],{"class":167},[33,45853,1769],{"class":163},[33,45855,45856],{"class":167}," (max_cols ",[33,45858,4126],{"class":163},[33,45860,4037],{"class":50},[33,45862,45863],{"class":167},"(r)) ",[33,45865,6124],{"class":163},[33,45867,45721],{"class":167},[33,45869,662],{"class":163},[33,45871,45872],{"class":167}," all_rows]\n",[33,45874,45875,45877,45879,45882,45884,45886,45889,45891,45893,45895,45898],{"class":35,"line":4925},[33,45876,4025],{"class":167},[33,45878,242],{"class":163},[33,45880,45881],{"class":167}," pd.DataFrame(padded, ",[33,45883,740],{"class":238},[33,45885,242],{"class":163},[33,45887,45888],{"class":167},"(header ",[33,45890,7162],{"class":163},[33,45892,599],{"class":50},[33,45894,602],{"class":167},[33,45896,45897],{"class":50},"range",[33,45899,45900],{"class":167},"(max_cols))))\n",[33,45902,45903,45906,45908,45910,45912,45914,45916,45918,45920],{"class":35,"line":4935},[33,45904,45905],{"class":167},"    
df.replace(",[33,45907,3198],{"class":54},[33,45909,10884],{"class":167},[33,45911,8018],{"class":50},[33,45913,365],{"class":167},[33,45915,10891],{"class":238},[33,45917,242],{"class":163},[33,45919,855],{"class":50},[33,45921,221],{"class":167},[33,45923,45924,45926],{"class":35,"line":4941},[33,45925,1332],{"class":163},[33,45927,11719],{"class":167},[33,45929,45930],{"class":35,"line":4950},[33,45931,92],{"emptyLinePlaceholder":91},[33,45933,45934,45936,45938,45940,45942],{"class":35,"line":4960},[33,45935,2491],{"class":163},[33,45937,2494],{"class":50},[33,45939,2497],{"class":163},[33,45941,2500],{"class":54},[33,45943,574],{"class":167},[33,45945,45946,45948,45950,45952,45954,45956],{"class":35,"line":4965},[33,45947,863],{"class":50},[33,45949,866],{"class":167},[33,45951,878],{"class":238},[33,45953,242],{"class":163},[33,45955,855],{"class":50},[33,45957,221],{"class":167},[33,45959,45960,45962,45964,45967,45969],{"class":35,"line":4971},[33,45961,4025],{"class":167},[33,45963,242],{"class":163},[33,45965,45966],{"class":167}," 
extract_all_pages(",[33,45968,7076],{"class":50},[33,45970,221],{"class":167},[33,45972,45973,45975,45977,45979,45981,45983,45985],{"class":35,"line":4983},[33,45974,39534],{"class":167},[33,45976,521],{"class":50},[33,45978,365],{"class":167},[33,45980,897],{"class":238},[33,45982,242],{"class":163},[33,45984,902],{"class":50},[33,45986,221],{"class":167},[33,45988,45989,45991,45993,45995,45997,45999,46001,46003,46005,46007,46009,46011,46013,46015,46017],{"class":35,"line":4988},[33,45990,7268],{"class":50},[33,45992,602],{"class":167},[33,45994,4059],{"class":163},[33,45996,44444],{"class":54},[33,45998,4065],{"class":50},[33,46000,4068],{"class":167},[33,46002,1121],{"class":50},[33,46004,16022],{"class":54},[33,46006,1115],{"class":50},[33,46008,9541],{"class":167},[33,46010,734],{"class":50},[33,46012,9546],{"class":167},[33,46014,1121],{"class":50},[33,46016,39916],{"class":54},[33,46018,221],{"class":167},[18,46020,46022],{"id":46021},"troubleshooting-common-symptoms","Troubleshooting Common Symptoms",[4273,46024,46025,46035],{},[4276,46026,46027],{},[4279,46028,46029,46031,46033],{},[4282,46030,4284],{},[4282,46032,4287],{},[4282,46034,4290],{},[4292,46036,46037,46054,46069,46091,46108],{},[4279,46038,46039,46044,46049],{},[4297,46040,46041,46042,12027],{},"Two columns merged into one (",[30,46043,43161],{},[4297,46045,46046,46048],{},[30,46047,43170],{}," reads same-y glyphs left-to-right without column awareness",[4297,46050,14337,46051,46053],{},[30,46052,43183],{}," + coordinate sorting",[4279,46055,46056,46059,46062],{},[4297,46057,46058],{},"Row count doubles on second and later pages",[4297,46060,46061],{},"Header row not being detected and dropped",[4297,46063,46064,46065,46068],{},"Compare each row against ",[30,46066,46067],{},"canonical_header","; drop matches",[4279,46070,46071,46074,46080],{},[4297,46072,46073],{},"Single words split across multiple output rows",[4297,46075,46076,46079],{},[30,46077,46078],{},"y_tolerance"," too small — 
sub-pixel font variations cause word fragmentation",[4297,46081,46082,46083,36661,46085,2012,46088],{},"Increase ",[30,46084,46078],{},[30,46086,46087],{},"median_h * 0.5",[30,46089,46090],{},"0.6",[4279,46092,46093,46096,46101],{},[4297,46094,46095],{},"Entire page text in a single line",[4297,46097,46098,46100],{},[30,46099,46078],{}," too large — all words collapse into one row",[4297,46102,46103,46104,46107],{},"Reduce to ",[30,46105,46106],{},"median_h * 0.25","; check DPI if it is a scanned image",[4279,46109,46110,46113,46116],{},[4297,46111,46112],{},"Latin-1 special characters garbled",[4297,46114,46115],{},"Encoding mismatch when writing CSV",[4297,46117,4358,46118,36661,46121,46124],{},[30,46119,46120],{},"encoding=\"utf-8\"",[30,46122,46123],{},"to_csv()","; check source PDF font encoding",[18,46126,9247],{"id":9246},[14,46128,46129],{},"Confirm the fix worked by checking four things:",[35387,46131,46132],{},[4214,46133,46134,46137,46138,46141,46142,46144],{},[1974,46135,46136],{},"No merged numerics."," Parse the output CSV with ",[30,46139,46140],{},"pd.to_numeric(..., errors=\"coerce\")"," and assert ",[30,46143,8884],{}," count is low:",[23,46146,46148],{"className":126,"code":46147,"language":47,"meta":28,"style":28},"import pandas as pd\nfrom pathlib import Path\n\ndf = pd.read_csv(Path(\"output\u002Faligned.csv\"), header=None)\nfor col in df.columns:\n    numeric = pd.to_numeric(df[col], errors=\"coerce\")\n    nan_rate = numeric.isna().mean()\n    if nan_rate \u003C 0.2:\n        print(f\"Col {col}: {nan_rate:.0%} non-numeric — looks clean\")\n    else:\n        print(f\"Col {col}: {nan_rate:.0%} non-numeric — check 
alignment\")\n",[30,46149,46150,46160,46170,46174,46195,46205,46222,46232,46246,46279,46285],{"__ignoreMap":28},[33,46151,46152,46154,46156,46158],{"class":35,"line":36},[33,46153,164],{"class":163},[33,46155,492],{"class":167},[33,46157,495],{"class":163},[33,46159,498],{"class":167},[33,46161,46162,46164,46166,46168],{"class":35,"line":43},[33,46163,190],{"class":163},[33,46165,193],{"class":167},[33,46167,164],{"class":163},[33,46169,198],{"class":167},[33,46171,46172],{"class":35,"line":61},[33,46173,92],{"emptyLinePlaceholder":91},[33,46175,46176,46178,46180,46183,46185,46187,46189,46191,46193],{"class":35,"line":73},[33,46177,13459],{"class":167},[33,46179,242],{"class":163},[33,46181,46182],{"class":167}," pd.read_csv(Path(",[33,46184,43822],{"class":54},[33,46186,18525],{"class":167},[33,46188,44427],{"class":238},[33,46190,242],{"class":163},[33,46192,571],{"class":50},[33,46194,221],{"class":167},[33,46196,46197,46199,46201,46203],{"class":35,"line":88},[33,46198,6124],{"class":163},[33,46200,7985],{"class":167},[33,46202,662],{"class":163},[33,46204,8005],{"class":167},[33,46206,46207,46210,46212,46214,46216,46218,46220],{"class":35,"line":95},[33,46208,46209],{"class":167},"    numeric ",[33,46211,242],{"class":163},[33,46213,16774],{"class":167},[33,46215,8317],{"class":238},[33,46217,242],{"class":163},[33,46219,12107],{"class":54},[33,46221,221],{"class":167},[33,46223,46224,46227,46229],{"class":35,"line":101},[33,46225,46226],{"class":167},"    nan_rate ",[33,46228,242],{"class":163},[33,46230,46231],{"class":167}," numeric.isna().mean()\n",[33,46233,46234,46236,46239,46241,46244],{"class":35,"line":171},[33,46235,617],{"class":163},[33,46237,46238],{"class":167}," nan_rate ",[33,46240,4043],{"class":163},[33,46242,46243],{"class":50}," 
0.2",[33,46245,574],{"class":167},[33,46247,46248,46250,46252,46254,46257,46259,46261,46263,46265,46267,46270,46272,46274,46277],{"class":35,"line":179},[33,46249,9414],{"class":50},[33,46251,602],{"class":167},[33,46253,4059],{"class":163},[33,46255,46256],{"class":54},"\"Col ",[33,46258,1115],{"class":50},[33,46260,8276],{"class":167},[33,46262,1121],{"class":50},[33,46264,2079],{"class":54},[33,46266,1115],{"class":50},[33,46268,46269],{"class":167},"nan_rate",[33,46271,12775],{"class":163},[33,46273,1121],{"class":50},[33,46275,46276],{"class":54}," non-numeric — looks clean\"",[33,46278,221],{"class":167},[33,46280,46281,46283],{"class":35,"line":187},[33,46282,6864],{"class":163},[33,46284,574],{"class":167},[33,46286,46287,46289,46291,46293,46295,46297,46299,46301,46303,46305,46307,46309,46311,46314],{"class":35,"line":201},[33,46288,9414],{"class":50},[33,46290,602],{"class":167},[33,46292,4059],{"class":163},[33,46294,46256],{"class":54},[33,46296,1115],{"class":50},[33,46298,8276],{"class":167},[33,46300,1121],{"class":50},[33,46302,2079],{"class":54},[33,46304,1115],{"class":50},[33,46306,46269],{"class":167},[33,46308,12775],{"class":163},[33,46310,1121],{"class":50},[33,46312,46313],{"class":54}," non-numeric — check alignment\"",[33,46315,221],{"class":167},[35387,46317,46318,46327,46337],{"start":43},[4214,46319,46320,46323,46324,3035],{},[1974,46321,46322],{},"Row count matches source."," Open the PDF in a viewer, count rows in one table manually, and assert ",[30,46325,46326],{},"len(rows) == expected",[4214,46328,46329,46332,46333,46336],{},[1974,46330,46331],{},"Column count is consistent."," ",[30,46334,46335],{},"pd.Series([len(r) for r in rows]).value_counts()"," should show a single dominant column width; multiple widths indicate rows that were split or merged incorrectly.",[4214,46338,46339,46342],{},[1974,46340,46341],{},"Spot-check known values."," If the PDF is a financial report, pick three cells with known values (e.g., a specific 
total) and assert they appear in the correct column after extraction.",[18,46344,6918],{"id":6917},[4211,46346,46347,46352,46357,46362],{},[4214,46348,46349,46351],{},[940,46350,9592],{"href":942}," — full camelot and pdfplumber pipeline including lattice vs stream selection",[4214,46353,46354,46356],{},[940,46355,10077],{"href":10076}," — when there is no text layer at all",[4214,46358,46359,46361],{},[940,46360,9599],{"href":9598}," — post-extraction data cleaning and type coercion",[4214,46363,46364,46366],{},[940,46365,27254],{"href":27253}," — separate issue when garbling comes from character encoding, not coordinates",[14,46368,6947,46369,3035],{},[940,46370,9592],{"href":942},[6953,46372,9614],{},{"title":28,"searchDepth":43,"depth":43,"links":46374},[46375,46376,46377,46378,46380,46382,46383,46384,46385],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":43748,"depth":43,"text":43749},{"id":44473,"depth":43,"text":46379},"Variant Fix 1: Use layout=True for Simple Single-Column Misalignment",{"id":44682,"depth":43,"text":46381},"Variant Fix 2: Char-Level Clustering with chars",{"id":45198,"depth":43,"text":45199},{"id":46021,"depth":43,"text":46022},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fix Alignment Issues","2026-05-05","Fix jumbled or misaligned columns from PDF text extraction by sorting x\u002Fy coordinates with pdfplumber extract_words and char clustering. 
Includes layout=True tip.",{},"\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Ffix-pdf-text-extraction-alignment-issues",{"title":10535,"description":46388},"Fix PDF Text Extraction Alignment Issues in Python","automating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Ffix-pdf-text-extraction-alignment-issues\u002Findex",[9631,47,943,46395,46396],"text extraction","alignment","LBCQrnk1W4_jPrHpDObfvpIHEXKPAgzzFJQrphYCSQc",{"id":46399,"title":10077,"body":46400,"breadcrumbTitle":49135,"canonical":6977,"date":46387,"description":49136,"draft":6980,"extension":6981,"image":6977,"meta":49137,"navigation":91,"path":49138,"robots":6977,"seo":49139,"seoTitle":49140,"stem":49141,"tags":49142,"updatedAt":6978,"__hash__":49145},"content\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Fhow-to-extract-tables-from-scanned-pdfs\u002Findex.md",{"type":7,"value":46401,"toc":49121},[46402,46405,46410,46412,46426,46475,46516,46518,46521,46749,46755,46757,46760,46808,46811,46828,46837,46841,46851,47092,47097,47141,47145,47151,47356,47362,47366,47373,47579,47587,47591,48023,48027,48033,48351,48355,48366,48764,48774,48776,48779,48982,48992,48995,49090,49092,49114,49118],[10,46403,10077],{"id":46404},"how-to-extract-tables-from-scanned-pdfs",[14,46406,46407,46409],{},[940,46408,943],{"href":942}," and camelot return empty DataFrames on scanned documents because they parse the PDF content stream — which contains no text objects when a document was photocopied or printed-then-scanned. The fix is an OCR pipeline: render each page to a high-DPI image, extract text with spatial coordinates via Tesseract, then reconstruct row structure by clustering y-coordinates.",[18,46411,7021],{"id":7020},[14,46413,46414,46415,46418,46419,365,46421,365,46423,46425],{},"A scanned PDF is a wrapper around one or more raster images. 
It has no ",[30,46416,46417],{},"\u002FText"," stream, no font dictionaries, and no vector line objects. Any library that reads those structures (",[30,46420,943],{},[30,46422,16139],{},[30,46424,36791],{},") silently returns empty results — not an error, just nothing. The symptom:",[23,46427,46429],{"className":126,"code":46428,"language":47,"meta":28,"style":28},"import pdfplumber\nwith pdfplumber.open(\"scanned_report.pdf\") as pdf:\n    tables = pdf.pages[0].extract_tables()\nprint(tables)   # → []\n",[30,46430,46431,46437,46452,46465],{"__ignoreMap":28},[33,46432,46433,46435],{"class":35,"line":36},[33,46434,164],{"class":163},[33,46436,485],{"class":167},[33,46438,46439,46441,46443,46446,46448,46450],{"class":35,"line":43},[33,46440,22271],{"class":163},[33,46442,39641],{"class":167},[33,46444,46445],{"class":54},"\"scanned_report.pdf\"",[33,46447,1649],{"class":167},[33,46449,495],{"class":163},[33,46451,686],{"class":167},[33,46453,46454,46456,46458,46460,46462],{"class":35,"line":61},[33,46455,37709],{"class":167},[33,46457,242],{"class":163},[33,46459,9870],{"class":167},[33,46461,748],{"class":50},[33,46463,46464],{"class":167},"].extract_tables()\n",[33,46466,46467,46469,46472],{"class":35,"line":73},[33,46468,13474],{"class":50},[33,46470,46471],{"class":167},"(tables)   ",[33,46473,46474],{"class":39},"# → []\n",[23,46476,46478],{"className":126,"code":46477,"language":47,"meta":28,"style":28},"import camelot\ntables = camelot.read_pdf(\"scanned_report.pdf\", flavor=\"lattice\")\nprint(tables.n)  # → 
0\n",[30,46479,46480,46486,46506],{"__ignoreMap":28},[33,46481,46482,46484],{"class":35,"line":36},[33,46483,164],{"class":163},[33,46485,10567],{"class":167},[33,46487,46488,46490,46492,46494,46496,46498,46500,46502,46504],{"class":35,"line":43},[33,46489,13370],{"class":167},[33,46491,242],{"class":163},[33,46493,40545],{"class":167},[33,46495,46445],{"class":54},[33,46497,365],{"class":167},[33,46499,10748],{"class":238},[33,46501,242],{"class":163},[33,46503,10985],{"class":54},[33,46505,221],{"class":167},[33,46507,46508,46510,46513],{"class":35,"line":61},[33,46509,13474],{"class":50},[33,46511,46512],{"class":167},"(tables.n)  ",[33,46514,46515],{"class":39},"# → 0\n",[18,46517,35017],{"id":35016},[14,46519,46520],{},"Before building a pipeline, confirm you are actually dealing with a scanned PDF:",[23,46522,46524],{"className":126,"code":46523,"language":47,"meta":28,"style":28},"# pip install pymupdf\nfrom pathlib import Path\nimport fitz  # PyMuPDF\n\ndef is_scanned(path: Path) -> bool:\n    \"\"\"Return True if the PDF has no selectable text on any page.\"\"\"\n    try:\n        doc = fitz.open(str(path))\n        total_chars = sum(len(page.get_text(\"text\").strip()) for page in doc)\n        doc.close()\n        return total_chars == 0\n    except Exception as e:\n        raise RuntimeError(f\"Could not inspect {path}: {e}\") from e\n\nif __name__ == \"__main__\":\n    pdf = Path(\"data\u002Fscanned_report.pdf\")\n    if is_scanned(pdf):\n        print(\"No text layer — use OCR pipeline\")\n    else:\n        print(\"Text layer present — use pdfplumber or camelot\")\n",[30,46525,46526,46531,46541,46551,46555,46568,46573,46579,46592,46623,46628,46639,46649,46684,46688,46700,46714,46721,46732,46738],{"__ignoreMap":28},[33,46527,46528],{"class":35,"line":36},[33,46529,46530],{"class":39},"# pip install 
pymupdf\n",[33,46532,46533,46535,46537,46539],{"class":35,"line":43},[33,46534,190],{"class":163},[33,46536,193],{"class":167},[33,46538,164],{"class":163},[33,46540,198],{"class":167},[33,46542,46543,46545,46548],{"class":35,"line":61},[33,46544,164],{"class":163},[33,46546,46547],{"class":167}," fitz  ",[33,46549,46550],{"class":39},"# PyMuPDF\n",[33,46552,46553],{"class":35,"line":73},[33,46554,92],{"emptyLinePlaceholder":91},[33,46556,46557,46559,46562,46564,46566],{"class":35,"line":88},[33,46558,562],{"class":163},[33,46560,46561],{"class":46}," is_scanned",[33,46563,3743],{"class":167},[33,46565,2821],{"class":50},[33,46567,574],{"class":167},[33,46569,46570],{"class":35,"line":95},[33,46571,46572],{"class":54},"    \"\"\"Return True if the PDF has no selectable text on any page.\"\"\"\n",[33,46574,46575,46577],{"class":35,"line":101},[33,46576,2424],{"class":163},[33,46578,574],{"class":167},[33,46580,46581,46583,46585,46588,46590],{"class":35,"line":171},[33,46582,20077],{"class":167},[33,46584,242],{"class":163},[33,46586,46587],{"class":167}," fitz.open(",[33,46589,1053],{"class":50},[33,46591,21248],{"class":167},[33,46593,46594,46597,46599,46602,46604,46606,46609,46611,46614,46616,46618,46620],{"class":35,"line":179},[33,46595,46596],{"class":167},"        total_chars ",[33,46598,242],{"class":163},[33,46600,46601],{"class":50}," sum",[33,46603,602],{"class":167},[33,46605,928],{"class":50},[33,46607,46608],{"class":167},"(page.get_text(",[33,46610,3459],{"class":54},[33,46612,46613],{"class":167},").strip()) ",[33,46615,6124],{"class":163},[33,46617,695],{"class":167},[33,46619,662],{"class":163},[33,46621,46622],{"class":167}," doc)\n",[33,46624,46625],{"class":35,"line":187},[33,46626,46627],{"class":167},"        doc.close()\n",[33,46629,46630,46632,46635,46637],{"class":35,"line":201},[33,46631,1659],{"class":163},[33,46633,46634],{"class":167}," total_chars 
",[33,46636,1865],{"class":163},[33,46638,28914],{"class":50},[33,46640,46641,46643,46645,46647],{"class":35,"line":206},[33,46642,2449],{"class":163},[33,46644,783],{"class":50},[33,46646,1852],{"class":163},[33,46648,7583],{"class":167},[33,46650,46651,46653,46655,46657,46659,46662,46664,46666,46668,46670,46672,46674,46676,46678,46680,46682],{"class":35,"line":224},[33,46652,4051],{"class":163},[33,46654,7590],{"class":50},[33,46656,602],{"class":167},[33,46658,4059],{"class":163},[33,46660,46661],{"class":54},"\"Could not inspect ",[33,46663,1115],{"class":50},[33,46665,2580],{"class":167},[33,46667,1121],{"class":50},[33,46669,2079],{"class":54},[33,46671,1115],{"class":50},[33,46673,7602],{"class":167},[33,46675,1121],{"class":50},[33,46677,274],{"class":54},[33,46679,1649],{"class":167},[33,46681,190],{"class":163},[33,46683,7613],{"class":167},[33,46685,46686],{"class":35,"line":229},[33,46687,92],{"emptyLinePlaceholder":91},[33,46689,46690,46692,46694,46696,46698],{"class":35,"line":235},[33,46691,2491],{"class":163},[33,46693,2494],{"class":50},[33,46695,2497],{"class":163},[33,46697,2500],{"class":54},[33,46699,574],{"class":167},[33,46701,46702,46705,46707,46709,46712],{"class":35,"line":250},[33,46703,46704],{"class":167},"    pdf ",[33,46706,242],{"class":163},[33,46708,215],{"class":167},[33,46710,46711],{"class":54},"\"data\u002Fscanned_report.pdf\"",[33,46713,221],{"class":167},[33,46715,46716,46718],{"class":35,"line":266},[33,46717,617],{"class":163},[33,46719,46720],{"class":167}," is_scanned(pdf):\n",[33,46722,46723,46725,46727,46730],{"class":35,"line":290},[33,46724,9414],{"class":50},[33,46726,602],{"class":167},[33,46728,46729],{"class":54},"\"No text layer — use OCR 
pipeline\"",[33,46731,221],{"class":167},[33,46733,46734,46736],{"class":35,"line":295},[33,46735,6864],{"class":163},[33,46737,574],{"class":167},[33,46739,46740,46742,46744,46747],{"class":35,"line":300},[33,46741,9414],{"class":50},[33,46743,602],{"class":167},[33,46745,46746],{"class":54},"\"Text layer present — use pdfplumber or camelot\"",[33,46748,221],{"class":167},[14,46750,46751,46752,46754],{},"A result of ",[30,46753,855],{}," with zero characters confirms the OCR path is required. A small character count (under 50) often indicates a partially OCR'd scan where Tesseract was run at low quality — treat it the same as a full scan.",[18,46756,21],{"id":20},[14,46758,46759],{},"Install system binaries and Python packages before running the pipeline:",[23,46761,46763],{"className":25,"code":46762,"language":27,"meta":28,"style":28},"# System dependencies (Ubuntu\u002FDebian)\nsudo apt-get install tesseract-ocr poppler-utils\n\n# Python packages\npip install pdf2image pytesseract pandas pymupdf\n",[30,46764,46765,46770,46783,46787,46791],{"__ignoreMap":28},[33,46766,46767],{"class":35,"line":36},[33,46768,46769],{"class":39},"# System dependencies (Ubuntu\u002FDebian)\n",[33,46771,46772,46774,46776,46778,46780],{"class":35,"line":43},[33,46773,9669],{"class":46},[33,46775,9672],{"class":54},[33,46777,79],{"class":54},[33,46779,26693],{"class":54},[33,46781,46782],{"class":54}," poppler-utils\n",[33,46784,46785],{"class":35,"line":61},[33,46786,92],{"emptyLinePlaceholder":91},[33,46788,46789],{"class":35,"line":73},[33,46790,9692],{"class":39},[33,46792,46793,46795,46797,46800,46803,46805],{"class":35,"line":88},[33,46794,76],{"class":46},[33,46796,79],{"class":54},[33,46798,46799],{"class":54}," pdf2image",[33,46801,46802],{"class":54}," pytesseract",[33,46804,16183],{"class":54},[33,46806,46807],{"class":54}," pymupdf\n",[14,46809,46810],{},"Verify Tesseract is on the 
PATH:",[23,46812,46814],{"className":25,"code":46813,"language":27,"meta":28,"style":28},"tesseract --version\n# Expected: tesseract 4.x or 5.x\n",[30,46815,46816,46823],{"__ignoreMap":28},[33,46817,46818,46821],{"class":35,"line":36},[33,46819,46820],{"class":46},"tesseract",[33,46822,41864],{"class":50},[33,46824,46825],{"class":35,"line":43},[33,46826,46827],{"class":39},"# Expected: tesseract 4.x or 5.x\n",[14,46829,46830,46831,46834,46835,3035],{},"If Tesseract is not found, set the path in code: ",[30,46832,46833],{},"pytesseract.pytesseract.tesseract_cmd = \"\u002Fusr\u002Fbin\u002Ftesseract\"",". For the full Tesseract not-found error on Linux, see ",[940,46836,36756],{"href":26957},[18,46838,46840],{"id":46839},"step-1-render-pages-to-images","Step 1: Render Pages to Images",[14,46842,46843,46846,46847,46850],{},[30,46844,46845],{},"pdf2image.convert_from_path"," calls Poppler's ",[30,46848,46849],{},"pdftoppm"," under the hood. Use at least 300 DPI — lower resolutions blur character edges and cause Tesseract to merge adjacent cell text.",[23,46852,46854],{"className":126,"code":46853,"language":47,"meta":28,"style":28},"# pip install pdf2image\nfrom pathlib import Path\nfrom pdf2image import convert_from_path\nfrom PIL import Image  # installed with pdf2image\n\nPDF_PATH = Path(\"data\u002Fscanned_report.pdf\")\n\ndef render_pages(path: Path, dpi: int = 300) -> list[Image.Image]:\n    \"\"\"Render all PDF pages to PIL Image objects at the specified DPI.\"\"\"\n    try:\n        images = convert_from_path(str(path), dpi=dpi)\n    except Exception as e:\n        raise RuntimeError(\n            f\"pdf2image failed on {path}. 
Ensure poppler-utils is installed: {e}\"\n        ) from e\n    print(f\"Rendered {len(images)} page(s) at {dpi} DPI\")\n    return images\n\nif __name__ == \"__main__\":\n    pages = render_pages(PDF_PATH)\n",[30,46855,46856,46861,46871,46883,46899,46903,46915,46919,46939,46944,46950,46972,46982,46990,47014,47023,47055,47062,47066,47078],{"__ignoreMap":28},[33,46857,46858],{"class":35,"line":36},[33,46859,46860],{"class":39},"# pip install pdf2image\n",[33,46862,46863,46865,46867,46869],{"class":35,"line":43},[33,46864,190],{"class":163},[33,46866,193],{"class":167},[33,46868,164],{"class":163},[33,46870,198],{"class":167},[33,46872,46873,46875,46878,46880],{"class":35,"line":61},[33,46874,190],{"class":163},[33,46876,46877],{"class":167}," pdf2image ",[33,46879,164],{"class":163},[33,46881,46882],{"class":167}," convert_from_path\n",[33,46884,46885,46887,46890,46893,46896],{"class":35,"line":73},[33,46886,190],{"class":163},[33,46888,46889],{"class":50}," PIL",[33,46891,46892],{"class":163}," import",[33,46894,46895],{"class":167}," Image  ",[33,46897,46898],{"class":39},"# installed with pdf2image\n",[33,46900,46901],{"class":35,"line":88},[33,46902,92],{"emptyLinePlaceholder":91},[33,46904,46905,46907,46909,46911,46913],{"class":35,"line":95},[33,46906,7076],{"class":50},[33,46908,212],{"class":163},[33,46910,215],{"class":167},[33,46912,46711],{"class":54},[33,46914,221],{"class":167},[33,46916,46917],{"class":35,"line":101},[33,46918,92],{"emptyLinePlaceholder":91},[33,46920,46921,46923,46926,46929,46931,46933,46936],{"class":35,"line":171},[33,46922,562],{"class":163},[33,46924,46925],{"class":46}," render_pages",[33,46927,46928],{"class":167},"(path: Path, dpi: ",[33,46930,1059],{"class":50},[33,46932,212],{"class":163},[33,46934,46935],{"class":50}," 300",[33,46937,46938],{"class":167},") -> list[Image.Image]:\n",[33,46940,46941],{"class":35,"line":179},[33,46942,46943],{"class":54},"    \"\"\"Render all PDF pages to PIL Image objects at the specified 
DPI.\"\"\"\n",[33,46945,46946,46948],{"class":35,"line":187},[33,46947,2424],{"class":163},[33,46949,574],{"class":167},[33,46951,46952,46955,46957,46960,46962,46964,46967,46969],{"class":35,"line":201},[33,46953,46954],{"class":167},"        images ",[33,46956,242],{"class":163},[33,46958,46959],{"class":167}," convert_from_path(",[33,46961,1053],{"class":50},[33,46963,13643],{"class":167},[33,46965,46966],{"class":238},"dpi",[33,46968,242],{"class":163},[33,46970,46971],{"class":167},"dpi)\n",[33,46973,46974,46976,46978,46980],{"class":35,"line":206},[33,46975,2449],{"class":163},[33,46977,783],{"class":50},[33,46979,1852],{"class":163},[33,46981,7583],{"class":167},[33,46983,46984,46986,46988],{"class":35,"line":224},[33,46985,4051],{"class":163},[33,46987,7590],{"class":50},[33,46989,7637],{"class":167},[33,46991,46992,46994,46997,46999,47001,47003,47006,47008,47010,47012],{"class":35,"line":229},[33,46993,12744],{"class":163},[33,46995,46996],{"class":54},"\"pdf2image failed on ",[33,46998,1115],{"class":50},[33,47000,2580],{"class":167},[33,47002,1121],{"class":50},[33,47004,47005],{"class":54},". 
Ensure poppler-utils is installed: ",[33,47007,1115],{"class":50},[33,47009,7602],{"class":167},[33,47011,1121],{"class":50},[33,47013,7504],{"class":54},[33,47015,47016,47019,47021],{"class":35,"line":235},[33,47017,47018],{"class":167},"        ) ",[33,47020,190],{"class":163},[33,47022,7613],{"class":167},[33,47024,47025,47027,47029,47031,47034,47036,47039,47041,47044,47046,47048,47050,47053],{"class":35,"line":250},[33,47026,7268],{"class":50},[33,47028,602],{"class":167},[33,47030,4059],{"class":163},[33,47032,47033],{"class":54},"\"Rendered ",[33,47035,4065],{"class":50},[33,47037,47038],{"class":167},"(images)",[33,47040,1121],{"class":50},[33,47042,47043],{"class":54}," page(s) at ",[33,47045,1115],{"class":50},[33,47047,46966],{"class":167},[33,47049,1121],{"class":50},[33,47051,47052],{"class":54}," DPI\"",[33,47054,221],{"class":167},[33,47056,47057,47059],{"class":35,"line":266},[33,47058,1332],{"class":163},[33,47060,47061],{"class":167}," images\n",[33,47063,47064],{"class":35,"line":290},[33,47065,92],{"emptyLinePlaceholder":91},[33,47067,47068,47070,47072,47074,47076],{"class":35,"line":295},[33,47069,2491],{"class":163},[33,47071,2494],{"class":50},[33,47073,2497],{"class":163},[33,47075,2500],{"class":54},[33,47077,574],{"class":167},[33,47079,47080,47083,47085,47088,47090],{"class":35,"line":300},[33,47081,47082],{"class":167},"    pages ",[33,47084,242],{"class":163},[33,47086,47087],{"class":167}," render_pages(",[33,47089,7076],{"class":50},[33,47091,221],{"class":167},[14,47093,47094],{},[1974,47095,47096],{},"DPI guidance:",[4273,47098,47099,47109],{},[4276,47100,47101],{},[4279,47102,47103,47106],{},[4282,47104,47105],{},"Scan quality",[4282,47107,47108],{},"Recommended DPI",[4292,47110,47111,47118,47125,47133],{},[4279,47112,47113,47116],{},[4297,47114,47115],{},"Clean laser print",[4297,47117,2611],{},[4279,47119,47120,47123],{},[4297,47121,47122],{},"Typical office 
scan",[4297,47124,26433],{},[4279,47126,47127,47130],{},[4297,47128,47129],{},"Old or faint document",[4297,47131,47132],{},"400–600",[4279,47134,47135,47138],{},[4297,47136,47137],{},"Mixed content with small text",[4297,47139,47140],{},"400",[18,47142,47144],{"id":47143},"step-2-extract-ocr-data-with-spatial-coordinates","Step 2: Extract OCR Data with Spatial Coordinates",[14,47146,47147,47150],{},[30,47148,47149],{},"pytesseract.image_to_data()"," returns word-level bounding boxes alongside recognized text. This spatial data is essential for reconstructing rows — without it you only get a flat string.",[23,47152,47154],{"className":126,"code":47153,"language":47,"meta":28,"style":28},"# pip install pytesseract\nfrom PIL import Image\nimport pytesseract\n\ndef ocr_page(image: Image.Image, min_confidence: int = 60) -> list[tuple[str, int, int]]:\n    \"\"\"\n    Run Tesseract on a single page image.\n    Returns a list of (text, x_left, y_top) tuples for words above the confidence threshold.\n    \"\"\"\n    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)\n    tokens = []\n    for i in range(len(data[\"text\"])):\n        text = data[\"text\"][i].strip()\n        conf = int(data[\"conf\"][i])\n        if text and conf >= min_confidence:\n            tokens.append((text, data[\"left\"][i], data[\"top\"][i]))\n    return tokens\n",[30,47155,47156,47161,47172,47179,47183,47214,47218,47223,47228,47232,47254,47263,47285,47300,47317,47334,47349],{"__ignoreMap":28},[33,47157,47158],{"class":35,"line":36},[33,47159,47160],{"class":39},"# pip install pytesseract\n",[33,47162,47163,47165,47167,47169],{"class":35,"line":43},[33,47164,190],{"class":163},[33,47166,46889],{"class":50},[33,47168,46892],{"class":163},[33,47170,47171],{"class":167}," Image\n",[33,47173,47174,47176],{"class":35,"line":61},[33,47175,164],{"class":163},[33,47177,47178],{"class":167}," 
pytesseract\n",[33,47180,47181],{"class":35,"line":73},[33,47182,92],{"emptyLinePlaceholder":91},[33,47184,47185,47187,47190,47193,47195,47197,47199,47202,47204,47206,47208,47210,47212],{"class":35,"line":88},[33,47186,562],{"class":163},[33,47188,47189],{"class":46}," ocr_page",[33,47191,47192],{"class":167},"(image: Image.Image, min_confidence: ",[33,47194,1059],{"class":50},[33,47196,212],{"class":163},[33,47198,28533],{"class":50},[33,47200,47201],{"class":167},") -> list[tuple[",[33,47203,1053],{"class":50},[33,47205,365],{"class":167},[33,47207,1059],{"class":50},[33,47209,365],{"class":167},[33,47211,1059],{"class":50},[33,47213,43900],{"class":167},[33,47215,47216],{"class":35,"line":95},[33,47217,7673],{"class":54},[33,47219,47220],{"class":35,"line":101},[33,47221,47222],{"class":54},"    Run Tesseract on a single page image.\n",[33,47224,47225],{"class":35,"line":171},[33,47226,47227],{"class":54},"    Returns a list of (text, x_left, y_top) tuples for words above the confidence threshold.\n",[33,47229,47230],{"class":35,"line":179},[33,47231,7673],{"class":54},[33,47233,47234,47236,47238,47241,47244,47246,47249,47252],{"class":35,"line":187},[33,47235,24507],{"class":167},[33,47237,242],{"class":163},[33,47239,47240],{"class":167}," pytesseract.image_to_data(image, ",[33,47242,47243],{"class":238},"output_type",[33,47245,242],{"class":163},[33,47247,47248],{"class":167},"pytesseract.Output.",[33,47250,47251],{"class":50},"DICT",[33,47253,221],{"class":167},[33,47255,47256,47259,47261],{"class":35,"line":201},[33,47257,47258],{"class":167},"    tokens ",[33,47260,242],{"class":163},[33,47262,589],{"class":167},[33,47264,47265,47267,47270,47272,47274,47276,47278,47280,47282],{"class":35,"line":206},[33,47266,656],{"class":163},[33,47268,47269],{"class":167}," i 
",[33,47271,662],{"class":163},[33,47273,1801],{"class":50},[33,47275,602],{"class":167},[33,47277,928],{"class":50},[33,47279,20361],{"class":167},[33,47281,3459],{"class":54},[33,47283,47284],{"class":167},"])):\n",[33,47286,47287,47290,47292,47295,47297],{"class":35,"line":224},[33,47288,47289],{"class":167},"        text ",[33,47291,242],{"class":163},[33,47293,47294],{"class":167}," data[",[33,47296,3459],{"class":54},[33,47298,47299],{"class":167},"][i].strip()\n",[33,47301,47302,47305,47307,47309,47311,47314],{"class":35,"line":229},[33,47303,47304],{"class":167},"        conf ",[33,47306,242],{"class":163},[33,47308,3149],{"class":50},[33,47310,20361],{"class":167},[33,47312,47313],{"class":54},"\"conf\"",[33,47315,47316],{"class":167},"][i])\n",[33,47318,47319,47321,47324,47326,47329,47331],{"class":35,"line":235},[33,47320,8221],{"class":163},[33,47322,47323],{"class":167}," text ",[33,47325,6001],{"class":163},[33,47327,47328],{"class":167}," conf ",[33,47330,43000],{"class":163},[33,47332,47333],{"class":167}," min_confidence:\n",[33,47335,47336,47339,47341,47344,47346],{"class":35,"line":250},[33,47337,47338],{"class":167},"            tokens.append((text, data[",[33,47340,28050],{"class":54},[33,47342,47343],{"class":167},"][i], data[",[33,47345,43412],{"class":54},[33,47347,47348],{"class":167},"][i]))\n",[33,47350,47351,47353],{"class":35,"line":266},[33,47352,1332],{"class":163},[33,47354,47355],{"class":167}," tokens\n",[14,47357,47358,47361],{},[30,47359,47360],{},"min_confidence=60"," filters out noise (speckles, bleed-through) that Tesseract assigns low scores. Raise to 70–75 for cleaner scans; lower to 50 for degraded documents.",[18,47363,47365],{"id":47364},"step-3-reconstruct-table-rows-by-y-clustering","Step 3: Reconstruct Table Rows by Y-Clustering",[14,47367,47368,47369,47372],{},"Group tokens into rows by rounding their y-coordinate to a bucket width (",[30,47370,47371],{},"row_tolerance","), then sort each bucket by x-coordinate. 
This replaces the missing vector-line metadata.",[23,47374,47376],{"className":126,"code":47375,"language":47,"meta":28,"style":28},"def tokens_to_rows(\n    tokens: list[tuple[str, int, int]],\n    row_tolerance: int = 15,\n) -> list[list[str]]:\n    \"\"\"\n    Cluster OCR tokens into table rows using y-coordinate bucketing.\n    row_tolerance: pixel height of one row bucket (tune per document font size).\n    \"\"\"\n    row_map: dict[int, list[tuple[int, str]]] = {}\n    for text, x, y in tokens:\n        bucket = round(y \u002F row_tolerance)   # integer bucket key\n        row_map.setdefault(bucket, []).append((x, text))\n\n    rows = []\n    for bucket_key in sorted(row_map):\n        row_map[bucket_key].sort(key=lambda item: item[0])   # sort left-to-right\n        rows.append([cell[1] for cell in row_map[bucket_key]])\n\n    return rows\n",[30,47377,47378,47387,47405,47419,47427,47431,47436,47441,47445,47468,47480,47501,47506,47510,47518,47532,47551,47569,47573],{"__ignoreMap":28},[33,47379,47380,47382,47385],{"class":35,"line":36},[33,47381,562],{"class":163},[33,47383,47384],{"class":46}," tokens_to_rows",[33,47386,7637],{"class":167},[33,47388,47389,47392,47394,47396,47398,47400,47402],{"class":35,"line":43},[33,47390,47391],{"class":167},"    tokens: list[tuple[",[33,47393,1053],{"class":50},[33,47395,365],{"class":167},[33,47397,1059],{"class":50},[33,47399,365],{"class":167},[33,47401,1059],{"class":50},[33,47403,47404],{"class":167},"]],\n",[33,47406,47407,47410,47412,47414,47417],{"class":35,"line":61},[33,47408,47409],{"class":167},"    row_tolerance: ",[33,47411,1059],{"class":50},[33,47413,212],{"class":163},[33,47415,47416],{"class":50}," 
15",[33,47418,247],{"class":167},[33,47420,47421,47423,47425],{"class":35,"line":73},[33,47422,43895],{"class":167},[33,47424,1053],{"class":50},[33,47426,43900],{"class":167},[33,47428,47429],{"class":35,"line":88},[33,47430,7673],{"class":54},[33,47432,47433],{"class":35,"line":95},[33,47434,47435],{"class":54},"    Cluster OCR tokens into table rows using y-coordinate bucketing.\n",[33,47437,47438],{"class":35,"line":101},[33,47439,47440],{"class":54},"    row_tolerance: pixel height of one row bucket (tune per document font size).\n",[33,47442,47443],{"class":35,"line":171},[33,47444,7673],{"class":54},[33,47446,47447,47450,47452,47455,47457,47459,47461,47464,47466],{"class":35,"line":179},[33,47448,47449],{"class":167},"    row_map: dict[",[33,47451,1059],{"class":50},[33,47453,47454],{"class":167},", list[tuple[",[33,47456,1059],{"class":50},[33,47458,365],{"class":167},[33,47460,1053],{"class":50},[33,47462,47463],{"class":167},"]]] ",[33,47465,242],{"class":163},[33,47467,14093],{"class":167},[33,47469,47470,47472,47475,47477],{"class":35,"line":187},[33,47471,656],{"class":163},[33,47473,47474],{"class":167}," text, x, y ",[33,47476,662],{"class":163},[33,47478,47479],{"class":167}," tokens:\n",[33,47481,47482,47485,47487,47490,47493,47495,47498],{"class":35,"line":201},[33,47483,47484],{"class":167},"        bucket ",[33,47486,242],{"class":163},[33,47488,47489],{"class":50}," round",[33,47491,47492],{"class":167},"(y ",[33,47494,1351],{"class":163},[33,47496,47497],{"class":167}," row_tolerance)   ",[33,47499,47500],{"class":39},"# integer bucket key\n",[33,47502,47503],{"class":35,"line":206},[33,47504,47505],{"class":167},"        row_map.setdefault(bucket, []).append((x, 
text))\n",[33,47507,47508],{"class":35,"line":224},[33,47509,92],{"emptyLinePlaceholder":91},[33,47511,47512,47514,47516],{"class":35,"line":229},[33,47513,44390],{"class":167},[33,47515,242],{"class":163},[33,47517,589],{"class":167},[33,47519,47520,47522,47525,47527,47529],{"class":35,"line":235},[33,47521,656],{"class":163},[33,47523,47524],{"class":167}," bucket_key ",[33,47526,662],{"class":163},[33,47528,28924],{"class":50},[33,47530,47531],{"class":167},"(row_map):\n",[33,47533,47534,47537,47539,47541,47544,47546,47548],{"class":35,"line":250},[33,47535,47536],{"class":167},"        row_map[bucket_key].sort(",[33,47538,44114],{"class":238},[33,47540,44117],{"class":163},[33,47542,47543],{"class":167}," item: item[",[33,47545,748],{"class":50},[33,47547,7283],{"class":167},[33,47549,47550],{"class":39},"# sort left-to-right\n",[33,47552,47553,47556,47558,47560,47562,47564,47566],{"class":35,"line":266},[33,47554,47555],{"class":167},"        rows.append([cell[",[33,47557,734],{"class":50},[33,47559,763],{"class":167},[33,47561,6124],{"class":163},[33,47563,17467],{"class":167},[33,47565,662],{"class":163},[33,47567,47568],{"class":167}," row_map[bucket_key]])\n",[33,47570,47571],{"class":35,"line":290},[33,47572,92],{"emptyLinePlaceholder":91},[33,47574,47575,47577],{"class":35,"line":295},[33,47576,1332],{"class":163},[33,47578,44355],{"class":167},[14,47580,47581,47586],{},[1974,47582,47583,47584,20891],{},"Tuning ",[30,47585,47371],{}," Start with 15 pixels at 300 DPI. If rows merge, lower it to 10. If single rows split across two buckets, raise it to 20. 
For variable-height rows (bold headers vs body text), post-process by merging adjacent short rows.",[18,47588,47590],{"id":47589},"step-4-export-to-dataframe","Step 4: Export to DataFrame",[23,47592,47594],{"className":126,"code":47593,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\nOUTPUT_PATH = Path(\"output\u002Fscanned_table.csv\")\n\ndef rows_to_dataframe(rows: list[list[str]], header_row: int = 0) -> pd.DataFrame:\n    \"\"\"Convert row-list to DataFrame, using the first row as column names.\"\"\"\n    if not rows:\n        return pd.DataFrame()\n\n    # Pad all rows to the same width\n    max_cols = max(len(r) for r in rows)\n    padded = [r + [\"\"] * (max_cols - len(r)) for r in rows]\n\n    header = padded[header_row]\n    data = padded[header_row + 1:]\n    df = pd.DataFrame(data, columns=header)\n    df.replace(\"\", pd.NA, inplace=True)\n    return df\n\nif __name__ == \"__main__\":\n    OUTPUT_PATH.parent.mkdir(exist_ok=True)\n    # (assumes pages and OCR steps already run)\n    from pdf2image import convert_from_path\n    import pytesseract\n\n    PDF_PATH = Path(\"data\u002Fscanned_report.pdf\")\n    images = convert_from_path(str(PDF_PATH), dpi=300)\n    all_rows: list[list[str]] = []\n    for img in images:\n        tokens = ocr_page(img)\n        all_rows.extend(tokens_to_rows(tokens))\n\n    df = rows_to_dataframe(all_rows)\n    df.to_csv(OUTPUT_PATH, index=False)\n    print(f\"Exported {len(df)} rows × {df.shape[1]} cols to {OUTPUT_PATH}\")\n    
print(df.head())\n",[30,47595,47596,47600,47610,47620,47624,47637,47641,47664,47669,47677,47683,47687,47692,47715,47750,47754,47763,47778,47793,47813,47819,47823,47835,47849,47854,47864,47870,47874,47887,47912,47924,47936,47946,47951,47955,47964,47980,48017],{"__ignoreMap":28},[33,47597,47598],{"class":35,"line":36},[33,47599,8895],{"class":39},[33,47601,47602,47604,47606,47608],{"class":35,"line":43},[33,47603,190],{"class":163},[33,47605,193],{"class":167},[33,47607,164],{"class":163},[33,47609,198],{"class":167},[33,47611,47612,47614,47616,47618],{"class":35,"line":61},[33,47613,164],{"class":163},[33,47615,492],{"class":167},[33,47617,495],{"class":163},[33,47619,498],{"class":167},[33,47621,47622],{"class":35,"line":73},[33,47623,92],{"emptyLinePlaceholder":91},[33,47625,47626,47628,47630,47632,47635],{"class":35,"line":88},[33,47627,521],{"class":50},[33,47629,212],{"class":163},[33,47631,215],{"class":167},[33,47633,47634],{"class":54},"\"output\u002Fscanned_table.csv\"",[33,47636,221],{"class":167},[33,47638,47639],{"class":35,"line":95},[33,47640,92],{"emptyLinePlaceholder":91},[33,47642,47643,47645,47648,47651,47653,47656,47658,47660,47662],{"class":35,"line":101},[33,47644,562],{"class":163},[33,47646,47647],{"class":46}," rows_to_dataframe",[33,47649,47650],{"class":167},"(rows: list[list[",[33,47652,1053],{"class":50},[33,47654,47655],{"class":167},"]], header_row: ",[33,47657,1059],{"class":50},[33,47659,212],{"class":163},[33,47661,10791],{"class":50},[33,47663,7668],{"class":167},[33,47665,47666],{"class":35,"line":171},[33,47667,47668],{"class":54},"    \"\"\"Convert row-list to DataFrame, using the first row as column 
names.\"\"\"\n",[33,47670,47671,47673,47675],{"class":35,"line":179},[33,47672,617],{"class":163},[33,47674,620],{"class":163},[33,47676,8723],{"class":167},[33,47678,47679,47681],{"class":35,"line":187},[33,47680,1659],{"class":163},[33,47682,7721],{"class":167},[33,47684,47685],{"class":35,"line":201},[33,47686,92],{"emptyLinePlaceholder":91},[33,47688,47689],{"class":35,"line":206},[33,47690,47691],{"class":39},"    # Pad all rows to the same width\n",[33,47693,47694,47696,47698,47700,47702,47704,47706,47708,47710,47712],{"class":35,"line":224},[33,47695,45812],{"class":167},[33,47697,242],{"class":163},[33,47699,45817],{"class":50},[33,47701,602],{"class":167},[33,47703,928],{"class":50},[33,47705,45824],{"class":167},[33,47707,6124],{"class":163},[33,47709,45721],{"class":167},[33,47711,662],{"class":163},[33,47713,47714],{"class":167}," rows)\n",[33,47716,47717,47719,47721,47723,47725,47727,47729,47731,47733,47735,47737,47739,47741,47743,47745,47747],{"class":35,"line":229},[33,47718,45838],{"class":167},[33,47720,242],{"class":163},[33,47722,45843],{"class":167},[33,47724,1811],{"class":163},[33,47726,9178],{"class":167},[33,47728,3198],{"class":54},[33,47730,763],{"class":167},[33,47732,1769],{"class":163},[33,47734,45856],{"class":167},[33,47736,4126],{"class":163},[33,47738,4037],{"class":50},[33,47740,45863],{"class":167},[33,47742,6124],{"class":163},[33,47744,45721],{"class":167},[33,47746,662],{"class":163},[33,47748,47749],{"class":167}," rows]\n",[33,47751,47752],{"class":35,"line":235},[33,47753,92],{"emptyLinePlaceholder":91},[33,47755,47756,47758,47760],{"class":35,"line":250},[33,47757,13245],{"class":167},[33,47759,242],{"class":163},[33,47761,47762],{"class":167}," padded[header_row]\n",[33,47764,47765,47767,47769,47772,47774,47776],{"class":35,"line":266},[33,47766,24507],{"class":167},[33,47768,242],{"class":163},[33,47770,47771],{"class":167}," padded[header_row 
",[33,47773,1811],{"class":163},[33,47775,1814],{"class":50},[33,47777,39364],{"class":167},[33,47779,47780,47782,47784,47787,47789,47791],{"class":35,"line":290},[33,47781,4025],{"class":167},[33,47783,242],{"class":163},[33,47785,47786],{"class":167}," pd.DataFrame(data, ",[33,47788,740],{"class":238},[33,47790,242],{"class":163},[33,47792,7549],{"class":167},[33,47794,47795,47797,47799,47801,47803,47805,47807,47809,47811],{"class":35,"line":295},[33,47796,45905],{"class":167},[33,47798,3198],{"class":54},[33,47800,10884],{"class":167},[33,47802,8018],{"class":50},[33,47804,365],{"class":167},[33,47806,10891],{"class":238},[33,47808,242],{"class":163},[33,47810,855],{"class":50},[33,47812,221],{"class":167},[33,47814,47815,47817],{"class":35,"line":300},[33,47816,1332],{"class":163},[33,47818,11719],{"class":167},[33,47820,47821],{"class":35,"line":317},[33,47822,92],{"emptyLinePlaceholder":91},[33,47824,47825,47827,47829,47831,47833],{"class":35,"line":332},[33,47826,2491],{"class":163},[33,47828,2494],{"class":50},[33,47830,2497],{"class":163},[33,47832,2500],{"class":54},[33,47834,574],{"class":167},[33,47836,47837,47839,47841,47843,47845,47847],{"class":35,"line":347},[33,47838,863],{"class":50},[33,47840,866],{"class":167},[33,47842,878],{"class":238},[33,47844,242],{"class":163},[33,47846,855],{"class":50},[33,47848,221],{"class":167},[33,47850,47851],{"class":35,"line":374},[33,47852,47853],{"class":39},"    # (assumes pages and OCR steps already run)\n",[33,47855,47856,47858,47860,47862],{"class":35,"line":397},[33,47857,3878],{"class":163},[33,47859,46877],{"class":167},[33,47861,164],{"class":163},[33,47863,46882],{"class":167},[33,47865,47866,47868],{"class":35,"line":653},[33,47867,1627],{"class":163},[33,47869,47178],{"class":167},[33,47871,47872],{"class":35,"line":667},[33,47873,92],{"emptyLinePlaceholder":91},[33,47875,47876,47879,47881,47883,47885],{"class":35,"line":675},[33,47877,47878],{"class":50},"    
PDF_PATH",[33,47880,212],{"class":163},[33,47882,215],{"class":167},[33,47884,46711],{"class":54},[33,47886,221],{"class":167},[33,47888,47889,47892,47894,47896,47898,47900,47902,47904,47906,47908,47910],{"class":35,"line":689},[33,47890,47891],{"class":167},"    images ",[33,47893,242],{"class":163},[33,47895,46959],{"class":167},[33,47897,1053],{"class":50},[33,47899,602],{"class":167},[33,47901,7076],{"class":50},[33,47903,18525],{"class":167},[33,47905,46966],{"class":238},[33,47907,242],{"class":163},[33,47909,26433],{"class":50},[33,47911,221],{"class":167},[33,47913,47914,47916,47918,47920,47922],{"class":35,"line":703},[33,47915,45323],{"class":167},[33,47917,1053],{"class":50},[33,47919,13081],{"class":167},[33,47921,242],{"class":163},[33,47923,589],{"class":167},[33,47925,47926,47928,47931,47933],{"class":35,"line":714},[33,47927,656],{"class":163},[33,47929,47930],{"class":167}," img ",[33,47932,662],{"class":163},[33,47934,47935],{"class":167}," images:\n",[33,47937,47938,47941,47943],{"class":35,"line":723},[33,47939,47940],{"class":167},"        tokens ",[33,47942,242],{"class":163},[33,47944,47945],{"class":167}," ocr_page(img)\n",[33,47947,47948],{"class":35,"line":754},[33,47949,47950],{"class":167},"        all_rows.extend(tokens_to_rows(tokens))\n",[33,47952,47953],{"class":35,"line":771},[33,47954,92],{"emptyLinePlaceholder":91},[33,47956,47957,47959,47961],{"class":35,"line":777},[33,47958,4025],{"class":167},[33,47960,242],{"class":163},[33,47962,47963],{"class":167}," 
rows_to_dataframe(all_rows)\n",[33,47965,47966,47968,47970,47972,47974,47976,47978],{"class":35,"line":788},[33,47967,39534],{"class":167},[33,47969,521],{"class":50},[33,47971,365],{"class":167},[33,47973,897],{"class":238},[33,47975,242],{"class":163},[33,47977,902],{"class":50},[33,47979,221],{"class":167},[33,47981,47982,47984,47986,47988,47990,47992,47994,47996,47998,48000,48002,48004,48006,48008,48011,48013,48015],{"class":35,"line":804},[33,47983,7268],{"class":50},[33,47985,602],{"class":167},[33,47987,4059],{"class":163},[33,47989,44444],{"class":54},[33,47991,4065],{"class":50},[33,47993,4068],{"class":167},[33,47995,1121],{"class":50},[33,47997,16022],{"class":54},[33,47999,1115],{"class":50},[33,48001,9541],{"class":167},[33,48003,734],{"class":50},[33,48005,9546],{"class":167},[33,48007,1121],{"class":50},[33,48009,48010],{"class":54}," cols to ",[33,48012,44456],{"class":50},[33,48014,274],{"class":54},[33,48016,221],{"class":167},[33,48018,48019,48021],{"class":35,"line":809},[33,48020,7268],{"class":50},[33,48022,13311],{"class":167},[18,48024,48026],{"id":48025},"variant-fix-multi-page-tables-with-consistent-headers","Variant Fix: Multi-Page Tables with Consistent Headers",[14,48028,48029,48030,48032],{},"When the scanned document spans many pages and each page repeats the table header, deduplicate before concatenating — the same technique used in ",[940,48031,9592],{"href":942}," for native PDFs applies here:",[23,48034,48036],{"className":126,"code":48035,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\ndef merge_scanned_pages(page_rows: list[list[list[str]]]) -> pd.DataFrame:\n    \"\"\"\n    Merge rows from multiple scanned pages, removing repeated header rows.\n    page_rows: list of row-lists, one per page.\n    \"\"\"\n    if not page_rows or not page_rows[0]:\n        return pd.DataFrame()\n\n    header = page_rows[0][0]   # canonical header from first page\n    all_data: list[list[str]] = []\n\n    for rows 
in page_rows:\n        for row in rows:\n            if row == header:\n                continue   # skip repeated headers\n            all_data.append(row)\n\n    max_cols = max((len(r) for r in all_data), default=0)\n    if max_cols == 0:\n        return pd.DataFrame()\n\n    padded = [r + [\"\"] * (max_cols - len(r)) for r in all_data]\n    df = pd.DataFrame(padded, columns=header + [f\"extra_{i}\" for i in range(max_cols - len(header))])\n    return df\n",[30,48037,48038,48042,48052,48056,48071,48075,48080,48085,48089,48109,48115,48119,48139,48152,48156,48167,48177,48188,48196,48201,48205,48237,48250,48256,48260,48295,48345],{"__ignoreMap":28},[33,48039,48040],{"class":35,"line":36},[33,48041,8895],{"class":39},[33,48043,48044,48046,48048,48050],{"class":35,"line":43},[33,48045,164],{"class":163},[33,48047,492],{"class":167},[33,48049,495],{"class":163},[33,48051,498],{"class":167},[33,48053,48054],{"class":35,"line":61},[33,48055,92],{"emptyLinePlaceholder":91},[33,48057,48058,48060,48063,48066,48068],{"class":35,"line":73},[33,48059,562],{"class":163},[33,48061,48062],{"class":46}," merge_scanned_pages",[33,48064,48065],{"class":167},"(page_rows: list[list[list[",[33,48067,1053],{"class":50},[33,48069,48070],{"class":167},"]]]) -> pd.DataFrame:\n",[33,48072,48073],{"class":35,"line":88},[33,48074,7673],{"class":54},[33,48076,48077],{"class":35,"line":95},[33,48078,48079],{"class":54},"    Merge rows from multiple scanned pages, removing repeated header rows.\n",[33,48081,48082],{"class":35,"line":101},[33,48083,48084],{"class":54},"    page_rows: list of row-lists, one per page.\n",[33,48086,48087],{"class":35,"line":171},[33,48088,7673],{"class":54},[33,48090,48091,48093,48095,48098,48100,48102,48105,48107],{"class":35,"line":179},[33,48092,617],{"class":163},[33,48094,620],{"class":163},[33,48096,48097],{"class":167}," page_rows ",[33,48099,7162],{"class":163},[33,48101,620],{"class":163},[33,48103,48104],{"class":167}," 
page_rows[",[33,48106,748],{"class":50},[33,48108,17477],{"class":167},[33,48110,48111,48113],{"class":35,"line":187},[33,48112,1659],{"class":163},[33,48114,7721],{"class":167},[33,48116,48117],{"class":35,"line":201},[33,48118,92],{"emptyLinePlaceholder":91},[33,48120,48121,48123,48125,48127,48129,48131,48133,48136],{"class":35,"line":206},[33,48122,13245],{"class":167},[33,48124,242],{"class":163},[33,48126,48104],{"class":167},[33,48128,748],{"class":50},[33,48130,44179],{"class":167},[33,48132,748],{"class":50},[33,48134,48135],{"class":167},"]   ",[33,48137,48138],{"class":39},"# canonical header from first page\n",[33,48140,48141,48144,48146,48148,48150],{"class":35,"line":224},[33,48142,48143],{"class":167},"    all_data: list[list[",[33,48145,1053],{"class":50},[33,48147,13081],{"class":167},[33,48149,242],{"class":163},[33,48151,589],{"class":167},[33,48153,48154],{"class":35,"line":229},[33,48155,92],{"emptyLinePlaceholder":91},[33,48157,48158,48160,48162,48164],{"class":35,"line":235},[33,48159,656],{"class":163},[33,48161,45726],{"class":167},[33,48163,662],{"class":163},[33,48165,48166],{"class":167}," page_rows:\n",[33,48168,48169,48171,48173,48175],{"class":35,"line":250},[33,48170,5973],{"class":163},[33,48172,3844],{"class":167},[33,48174,662],{"class":163},[33,48176,8723],{"class":167},[33,48178,48179,48181,48183,48185],{"class":35,"line":266},[33,48180,5995],{"class":163},[33,48182,3844],{"class":167},[33,48184,1865],{"class":163},[33,48186,48187],{"class":167}," header:\n",[33,48189,48190,48193],{"class":35,"line":290},[33,48191,48192],{"class":163},"                continue",[33,48194,48195],{"class":39},"   # skip repeated headers\n",[33,48197,48198],{"class":35,"line":295},[33,48199,48200],{"class":167},"            
all_data.append(row)\n",[33,48202,48203],{"class":35,"line":300},[33,48204,92],{"emptyLinePlaceholder":91},[33,48206,48207,48209,48211,48213,48216,48218,48220,48222,48224,48226,48229,48231,48233,48235],{"class":35,"line":317},[33,48208,45812],{"class":167},[33,48210,242],{"class":163},[33,48212,45817],{"class":50},[33,48214,48215],{"class":167},"((",[33,48217,928],{"class":50},[33,48219,45824],{"class":167},[33,48221,6124],{"class":163},[33,48223,45721],{"class":167},[33,48225,662],{"class":163},[33,48227,48228],{"class":167}," all_data), ",[33,48230,6685],{"class":238},[33,48232,242],{"class":163},[33,48234,748],{"class":50},[33,48236,221],{"class":167},[33,48238,48239,48241,48244,48246,48248],{"class":35,"line":332},[33,48240,617],{"class":163},[33,48242,48243],{"class":167}," max_cols ",[33,48245,1865],{"class":163},[33,48247,10791],{"class":50},[33,48249,574],{"class":167},[33,48251,48252,48254],{"class":35,"line":347},[33,48253,1659],{"class":163},[33,48255,7721],{"class":167},[33,48257,48258],{"class":35,"line":374},[33,48259,92],{"emptyLinePlaceholder":91},[33,48261,48262,48264,48266,48268,48270,48272,48274,48276,48278,48280,48282,48284,48286,48288,48290,48292],{"class":35,"line":397},[33,48263,45838],{"class":167},[33,48265,242],{"class":163},[33,48267,45843],{"class":167},[33,48269,1811],{"class":163},[33,48271,9178],{"class":167},[33,48273,3198],{"class":54},[33,48275,763],{"class":167},[33,48277,1769],{"class":163},[33,48279,45856],{"class":167},[33,48281,4126],{"class":163},[33,48283,4037],{"class":50},[33,48285,45863],{"class":167},[33,48287,6124],{"class":163},[33,48289,45721],{"class":167},[33,48291,662],{"class":163},[33,48293,48294],{"class":167}," 
all_data]\n",[33,48296,48297,48299,48301,48303,48305,48307,48310,48312,48314,48316,48319,48321,48323,48325,48327,48329,48331,48333,48335,48338,48340,48342],{"class":35,"line":653},[33,48298,4025],{"class":167},[33,48300,242],{"class":163},[33,48302,45881],{"class":167},[33,48304,740],{"class":238},[33,48306,242],{"class":163},[33,48308,48309],{"class":167},"header ",[33,48311,1811],{"class":163},[33,48313,9178],{"class":167},[33,48315,4059],{"class":163},[33,48317,48318],{"class":54},"\"extra_",[33,48320,1115],{"class":50},[33,48322,7499],{"class":167},[33,48324,1121],{"class":50},[33,48326,274],{"class":54},[33,48328,14766],{"class":163},[33,48330,47269],{"class":167},[33,48332,662],{"class":163},[33,48334,1801],{"class":50},[33,48336,48337],{"class":167},"(max_cols ",[33,48339,4126],{"class":163},[33,48341,4037],{"class":50},[33,48343,48344],{"class":167},"(header))])\n",[33,48346,48347,48349],{"class":35,"line":667},[33,48348,1332],{"class":163},[33,48350,11719],{"class":167},[18,48352,48354],{"id":48353},"image-pre-processing-for-low-quality-scans","Image Pre-Processing for Low-Quality Scans",[14,48356,48357,48358,48361,48362,48365],{},"If Tesseract accuracy is poor (many garbage tokens even at ",[30,48359,48360],{},"min_confidence=50","), pre-process the image before calling ",[30,48363,48364],{},"image_to_data",". 
Improving contrast and binarising the image gives Tesseract cleaner character boundaries.",[23,48367,48369],{"className":126,"code":48368,"language":47,"meta":28,"style":28},"# pip install pytesseract pillow opencv-python-headless\nfrom PIL import Image, ImageFilter, ImageOps\nimport cv2\nimport numpy as np\nimport pytesseract\n\ndef preprocess_for_ocr(image: Image.Image) -> Image.Image:\n    \"\"\"\n    Sharpen, binarize, and denoise a scan image before OCR.\n    Returns a processed PIL Image.\n    \"\"\"\n    # Convert to greyscale\n    grey = ImageOps.grayscale(image)\n\n    # Convert to numpy for OpenCV processing\n    arr = np.array(grey)\n\n    # Adaptive threshold — handles uneven lighting across the scan\n    binary = cv2.adaptiveThreshold(\n        arr, 255,\n        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\n        cv2.THRESH_BINARY,\n        blockSize=15,   # neighbourhood size; raise for larger text\n        C=8,            # constant subtracted from mean; raise to remove light background\n    )\n\n    # Morphological opening to remove small noise spots\n    kernel = np.ones((1, 1), np.uint8)\n    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)\n\n    return Image.fromarray(cleaned)\n\ndef ocr_page_enhanced(image: Image.Image, min_confidence: int = 60) -> list[tuple[str, int, int]]:\n    \"\"\"OCR pipeline with image pre-processing.\"\"\"\n    processed = preprocess_for_ocr(image)\n    data = pytesseract.image_to_data(processed, output_type=pytesseract.Output.DICT)\n    tokens = []\n    for i in range(len(data[\"text\"])):\n        text = data[\"text\"][i].strip()\n        conf = int(data[\"conf\"][i])\n        if text and conf >= min_confidence:\n            tokens.append((text, data[\"left\"][i], data[\"top\"][i]))\n    return 
tokens\n",[30,48370,48371,48376,48387,48393,48405,48411,48415,48425,48429,48434,48439,48443,48448,48458,48462,48467,48477,48481,48486,48496,48505,48515,48524,48538,48553,48557,48561,48566,48585,48600,48604,48611,48615,48644,48649,48659,48678,48686,48706,48718,48732,48746,48758],{"__ignoreMap":28},[33,48372,48373],{"class":35,"line":36},[33,48374,48375],{"class":39},"# pip install pytesseract pillow opencv-python-headless\n",[33,48377,48378,48380,48382,48384],{"class":35,"line":43},[33,48379,190],{"class":163},[33,48381,46889],{"class":50},[33,48383,46892],{"class":163},[33,48385,48386],{"class":167}," Image, ImageFilter, ImageOps\n",[33,48388,48389,48391],{"class":35,"line":61},[33,48390,164],{"class":163},[33,48392,41647],{"class":167},[33,48394,48395,48397,48400,48402],{"class":35,"line":73},[33,48396,164],{"class":163},[33,48398,48399],{"class":167}," numpy ",[33,48401,495],{"class":163},[33,48403,48404],{"class":167}," np\n",[33,48406,48407,48409],{"class":35,"line":88},[33,48408,164],{"class":163},[33,48410,47178],{"class":167},[33,48412,48413],{"class":35,"line":95},[33,48414,92],{"emptyLinePlaceholder":91},[33,48416,48417,48419,48422],{"class":35,"line":101},[33,48418,562],{"class":163},[33,48420,48421],{"class":46}," preprocess_for_ocr",[33,48423,48424],{"class":167},"(image: Image.Image) -> Image.Image:\n",[33,48426,48427],{"class":35,"line":171},[33,48428,7673],{"class":54},[33,48430,48431],{"class":35,"line":179},[33,48432,48433],{"class":54},"    Sharpen, binarize, and denoise a scan image before OCR.\n",[33,48435,48436],{"class":35,"line":187},[33,48437,48438],{"class":54},"    Returns a processed PIL Image.\n",[33,48440,48441],{"class":35,"line":201},[33,48442,7673],{"class":54},[33,48444,48445],{"class":35,"line":206},[33,48446,48447],{"class":39},"    # Convert to greyscale\n",[33,48449,48450,48453,48455],{"class":35,"line":224},[33,48451,48452],{"class":167},"    grey ",[33,48454,242],{"class":163},[33,48456,48457],{"class":167}," 
ImageOps.grayscale(image)\n",[33,48459,48460],{"class":35,"line":229},[33,48461,92],{"emptyLinePlaceholder":91},[33,48463,48464],{"class":35,"line":235},[33,48465,48466],{"class":39},"    # Convert to numpy for OpenCV processing\n",[33,48468,48469,48472,48474],{"class":35,"line":250},[33,48470,48471],{"class":167},"    arr ",[33,48473,242],{"class":163},[33,48475,48476],{"class":167}," np.array(grey)\n",[33,48478,48479],{"class":35,"line":266},[33,48480,92],{"emptyLinePlaceholder":91},[33,48482,48483],{"class":35,"line":290},[33,48484,48485],{"class":39},"    # Adaptive threshold — handles uneven lighting across the scan\n",[33,48487,48488,48491,48493],{"class":35,"line":295},[33,48489,48490],{"class":167},"    binary ",[33,48492,242],{"class":163},[33,48494,48495],{"class":167}," cv2.adaptiveThreshold(\n",[33,48497,48498,48501,48503],{"class":35,"line":300},[33,48499,48500],{"class":167},"        arr, ",[33,48502,2678],{"class":50},[33,48504,247],{"class":167},[33,48506,48507,48510,48513],{"class":35,"line":317},[33,48508,48509],{"class":167},"        cv2.",[33,48511,48512],{"class":50},"ADAPTIVE_THRESH_GAUSSIAN_C",[33,48514,247],{"class":167},[33,48516,48517,48519,48522],{"class":35,"line":332},[33,48518,48509],{"class":167},[33,48520,48521],{"class":50},"THRESH_BINARY",[33,48523,247],{"class":167},[33,48525,48526,48529,48531,48533,48535],{"class":35,"line":347},[33,48527,48528],{"class":238},"        blockSize",[33,48530,242],{"class":163},[33,48532,1646],{"class":50},[33,48534,1166],{"class":167},[33,48536,48537],{"class":39},"# neighbourhood size; raise for larger text\n",[33,48539,48540,48543,48545,48547,48550],{"class":35,"line":374},[33,48541,48542],{"class":238},"        C",[33,48544,242],{"class":163},[33,48546,2591],{"class":50},[33,48548,48549],{"class":167},",            ",[33,48551,48552],{"class":39},"# constant subtracted from mean; raise to remove light 
background\n",[33,48554,48555],{"class":35,"line":397},[33,48556,1202],{"class":167},[33,48558,48559],{"class":35,"line":653},[33,48560,92],{"emptyLinePlaceholder":91},[33,48562,48563],{"class":35,"line":667},[33,48564,48565],{"class":39},"    # Morphological opening to remove small noise spots\n",[33,48567,48568,48571,48573,48576,48578,48580,48582],{"class":35,"line":675},[33,48569,48570],{"class":167},"    kernel ",[33,48572,242],{"class":163},[33,48574,48575],{"class":167}," np.ones((",[33,48577,734],{"class":50},[33,48579,365],{"class":167},[33,48581,734],{"class":50},[33,48583,48584],{"class":167},"), np.uint8)\n",[33,48586,48587,48589,48591,48594,48597],{"class":35,"line":689},[33,48588,12471],{"class":167},[33,48590,242],{"class":163},[33,48592,48593],{"class":167}," cv2.morphologyEx(binary, cv2.",[33,48595,48596],{"class":50},"MORPH_OPEN",[33,48598,48599],{"class":167},", kernel)\n",[33,48601,48602],{"class":35,"line":703},[33,48603,92],{"emptyLinePlaceholder":91},[33,48605,48606,48608],{"class":35,"line":714},[33,48607,1332],{"class":163},[33,48609,48610],{"class":167}," Image.fromarray(cleaned)\n",[33,48612,48613],{"class":35,"line":723},[33,48614,92],{"emptyLinePlaceholder":91},[33,48616,48617,48619,48622,48624,48626,48628,48630,48632,48634,48636,48638,48640,48642],{"class":35,"line":754},[33,48618,562],{"class":163},[33,48620,48621],{"class":46}," ocr_page_enhanced",[33,48623,47192],{"class":167},[33,48625,1059],{"class":50},[33,48627,212],{"class":163},[33,48629,28533],{"class":50},[33,48631,47201],{"class":167},[33,48633,1053],{"class":50},[33,48635,365],{"class":167},[33,48637,1059],{"class":50},[33,48639,365],{"class":167},[33,48641,1059],{"class":50},[33,48643,43900],{"class":167},[33,48645,48646],{"class":35,"line":771},[33,48647,48648],{"class":54},"    \"\"\"OCR pipeline with image pre-processing.\"\"\"\n",[33,48650,48651,48654,48656],{"class":35,"line":777},[33,48652,48653],{"class":167},"    processed 
",[33,48655,242],{"class":163},[33,48657,48658],{"class":167}," preprocess_for_ocr(image)\n",[33,48660,48661,48663,48665,48668,48670,48672,48674,48676],{"class":35,"line":788},[33,48662,24507],{"class":167},[33,48664,242],{"class":163},[33,48666,48667],{"class":167}," pytesseract.image_to_data(processed, ",[33,48669,47243],{"class":238},[33,48671,242],{"class":163},[33,48673,47248],{"class":167},[33,48675,47251],{"class":50},[33,48677,221],{"class":167},[33,48679,48680,48682,48684],{"class":35,"line":804},[33,48681,47258],{"class":167},[33,48683,242],{"class":163},[33,48685,589],{"class":167},[33,48687,48688,48690,48692,48694,48696,48698,48700,48702,48704],{"class":35,"line":809},[33,48689,656],{"class":163},[33,48691,47269],{"class":167},[33,48693,662],{"class":163},[33,48695,1801],{"class":50},[33,48697,602],{"class":167},[33,48699,928],{"class":50},[33,48701,20361],{"class":167},[33,48703,3459],{"class":54},[33,48705,47284],{"class":167},[33,48707,48708,48710,48712,48714,48716],{"class":35,"line":819},[33,48709,47289],{"class":167},[33,48711,242],{"class":163},[33,48713,47294],{"class":167},[33,48715,3459],{"class":54},[33,48717,47299],{"class":167},[33,48719,48720,48722,48724,48726,48728,48730],{"class":35,"line":829},[33,48721,47304],{"class":167},[33,48723,242],{"class":163},[33,48725,3149],{"class":50},[33,48727,20361],{"class":167},[33,48729,47313],{"class":54},[33,48731,47316],{"class":167},[33,48733,48734,48736,48738,48740,48742,48744],{"class":35,"line":834},[33,48735,8221],{"class":163},[33,48737,47323],{"class":167},[33,48739,6001],{"class":163},[33,48741,47328],{"class":167},[33,48743,43000],{"class":163},[33,48745,47333],{"class":167},[33,48747,48748,48750,48752,48754,48756],{"class":35,"line":839},[33,48749,47338],{"class":167},[33,48751,28050],{"class":54},[33,48753,47343],{"class":167},[33,48755,43412],{"class":54},[33,48757,47348],{"class":167},[33,48759,48760,48762],{"class":35,"line":860},[33,48761,1332],{"class":163},[33,48763,47355],{"class":1
67},[14,48765,9574,48766,48769,48770,48773],{},[30,48767,48768],{},"ocr_page_enhanced"," in place of ",[30,48771,48772],{},"ocr_page"," for any scan with visible background noise, shadow at page edges, or inconsistent ink coverage. The adaptive threshold is particularly effective for documents that were scanned under uneven lighting.",[18,48775,9247],{"id":9246},[14,48777,48778],{},"Check extraction quality before trusting the output:",[23,48780,48782],{"className":126,"code":48781,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.read_csv(Path(\"output\u002Fscanned_table.csv\"))\n\n# 1. Row count — compare against a manual count from the source PDF\nprint(f\"Rows: {len(df)}\")\n\n# 2. Null ratio — high nulls indicate column reconstruction problems\nnull_rate = df.isnull().mean().mean()\nprint(f\"Null rate: {null_rate:.1%}  (>30% → tune row_tolerance or raise DPI)\")\n\n# 3. Numeric column check — merged cells show as parse failures\nfor col in df.columns:\n    n = pd.to_numeric(df[col], errors=\"coerce\").notna().mean()\n    if n > 0.7:\n        print(f\"  {col}: numeric ({n:.0%} 
parseable)\")\n",[30,48783,48784,48788,48798,48808,48812,48824,48828,48833,48854,48858,48863,48873,48898,48902,48907,48917,48935,48949],{"__ignoreMap":28},[33,48785,48786],{"class":35,"line":36},[33,48787,8895],{"class":39},[33,48789,48790,48792,48794,48796],{"class":35,"line":43},[33,48791,164],{"class":163},[33,48793,492],{"class":167},[33,48795,495],{"class":163},[33,48797,498],{"class":167},[33,48799,48800,48802,48804,48806],{"class":35,"line":61},[33,48801,190],{"class":163},[33,48803,193],{"class":167},[33,48805,164],{"class":163},[33,48807,198],{"class":167},[33,48809,48810],{"class":35,"line":73},[33,48811,92],{"emptyLinePlaceholder":91},[33,48813,48814,48816,48818,48820,48822],{"class":35,"line":88},[33,48815,13459],{"class":167},[33,48817,242],{"class":163},[33,48819,46182],{"class":167},[33,48821,47634],{"class":54},[33,48823,371],{"class":167},[33,48825,48826],{"class":35,"line":95},[33,48827,92],{"emptyLinePlaceholder":91},[33,48829,48830],{"class":35,"line":101},[33,48831,48832],{"class":39},"# 1. Row count — compare against a manual count from the source PDF\n",[33,48834,48835,48837,48839,48841,48844,48846,48848,48850,48852],{"class":35,"line":171},[33,48836,13474],{"class":50},[33,48838,602],{"class":167},[33,48840,4059],{"class":163},[33,48842,48843],{"class":54},"\"Rows: ",[33,48845,4065],{"class":50},[33,48847,4068],{"class":167},[33,48849,1121],{"class":50},[33,48851,274],{"class":54},[33,48853,221],{"class":167},[33,48855,48856],{"class":35,"line":179},[33,48857,92],{"emptyLinePlaceholder":91},[33,48859,48860],{"class":35,"line":187},[33,48861,48862],{"class":39},"# 2. 
Null ratio — high nulls indicate column reconstruction problems\n",[33,48864,48865,48868,48870],{"class":35,"line":201},[33,48866,48867],{"class":167},"null_rate ",[33,48869,242],{"class":163},[33,48871,48872],{"class":167}," df.isnull().mean().mean()\n",[33,48874,48875,48877,48879,48881,48884,48886,48889,48891,48893,48896],{"class":35,"line":206},[33,48876,13474],{"class":50},[33,48878,602],{"class":167},[33,48880,4059],{"class":163},[33,48882,48883],{"class":54},"\"Null rate: ",[33,48885,1115],{"class":50},[33,48887,48888],{"class":167},"null_rate",[33,48890,12755],{"class":163},[33,48892,1121],{"class":50},[33,48894,48895],{"class":54},"  (>30% → tune row_tolerance or raise DPI)\"",[33,48897,221],{"class":167},[33,48899,48900],{"class":35,"line":224},[33,48901,92],{"emptyLinePlaceholder":91},[33,48903,48904],{"class":35,"line":229},[33,48905,48906],{"class":39},"# 3. Numeric column check — merged cells show as parse failures\n",[33,48908,48909,48911,48913,48915],{"class":35,"line":235},[33,48910,6124],{"class":163},[33,48912,7985],{"class":167},[33,48914,662],{"class":163},[33,48916,8005],{"class":167},[33,48918,48919,48922,48924,48926,48928,48930,48932],{"class":35,"line":250},[33,48920,48921],{"class":167},"    n ",[33,48923,242],{"class":163},[33,48925,16774],{"class":167},[33,48927,8317],{"class":238},[33,48929,242],{"class":163},[33,48931,12107],{"class":54},[33,48933,48934],{"class":167},").notna().mean()\n",[33,48936,48937,48939,48942,48944,48947],{"class":35,"line":266},[33,48938,617],{"class":163},[33,48940,48941],{"class":167}," n ",[33,48943,6009],{"class":163},[33,48945,48946],{"class":50}," 0.7",[33,48948,574],{"class":167},[33,48950,48951,48953,48955,48957,48960,48962,48964,48966,48969,48971,48973,48975,48977,48980],{"class":35,"line":290},[33,48952,9414],{"class":50},[33,48954,602],{"class":167},[33,48956,4059],{"class":163},[33,48958,48959],{"class":54},"\"  
",[33,48961,1115],{"class":50},[33,48963,8276],{"class":167},[33,48965,1121],{"class":50},[33,48967,48968],{"class":54},": numeric (",[33,48970,1115],{"class":50},[33,48972,22354],{"class":167},[33,48974,12775],{"class":163},[33,48976,1121],{"class":50},[33,48978,48979],{"class":54}," parseable)\"",[33,48981,221],{"class":167},[14,48983,48984,48985,48988,48989,48991],{},"If null rate exceeds 30%, try increasing DPI to 400 or lowering ",[30,48986,48987],{},"min_confidence"," to 50. If numerics fail to parse, the column was merged — lower ",[30,48990,47371],{}," to split the rows more finely.",[18,48993,48994],{"id":29070},"Common Mistakes",[4273,48996,48997,49007],{},[4276,48998,48999],{},[4279,49000,49001,49003,49005],{},[4282,49002,29080],{},[4282,49004,4287],{},[4282,49006,4290],{},[4292,49008,49009,49023,49039,49059,49072],{},[4279,49010,49011,49014,49017],{},[4297,49012,49013],{},"Empty CSV output",[4297,49015,49016],{},"No text layer — ran pdfplumber directly on scan",[4297,49018,17059,49019,49022],{},[30,49020,49021],{},"is_scanned()"," first; route to this OCR pipeline",[4279,49024,49025,49028,49033],{},[4297,49026,49027],{},"Rows merged together",[4297,49029,49030,49032],{},[30,49031,47371],{}," too large for the font size",[4297,49034,49035,49036,49038],{},"Lower ",[30,49037,47371],{},"; start at 10px for small fonts",[4279,49040,49041,49050,49053],{},[4297,49042,49043,49044,49047,49048,12027],{},"Numeric values misread (",[30,49045,49046],{},"l"," instead of ",[30,49049,734],{},[4297,49051,49052],{},"DPI too low — character edges blur",[4297,49054,17059,49055,49058],{},[30,49056,49057],{},"dpi=300","; raise to 400 for old documents",[4279,49060,49061,49064,49069],{},[4297,49062,49063],{},"Garbage tokens in every cell",[4297,49065,49066,49068],{},[30,49067,48987],{}," too low",[4297,49070,49071],{},"Raise to 65–70; pre-process image contrast first",[4279,49073,49074,49082,49085],{},[4297,49075,49076,42706,49079],{},[30,49077,49078],{},"poppler not 
found",[30,49080,49081],{},"convert_from_path",[4297,49083,49084],{},"Poppler system binaries missing",[4297,49086,49087],{},[30,49088,49089],{},"sudo apt-get install poppler-utils",[18,49091,6918],{"id":6917},[4211,49093,49094,49099,49104,49109],{},[4214,49095,49096,49098],{},[940,49097,9592],{"href":942}," — lattice and stream extraction for native PDFs with text layers",[4214,49100,49101,49103],{},[940,49102,10535],{"href":10534}," — coordinate-sorting fix for partially-OCR'd PDFs that still misalign",[4214,49105,49106,49108],{},[940,49107,36756],{"href":26957}," — full OCR preprocessing guide including image enhancement",[4214,49110,49111,49113],{},[940,49112,9599],{"href":9598}," — clean and type-coerce the extracted DataFrame",[14,49115,6947,49116,3035],{},[940,49117,9592],{"href":942},[6953,49119,49120],{},"html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .s7eDp, html code.shiki 
.s7eDp{--shiki-default:#6F42C1}",{"title":28,"searchDepth":43,"depth":43,"links":49122},[49123,49124,49125,49126,49127,49128,49129,49130,49131,49132,49133,49134],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":20,"depth":43,"text":21},{"id":46839,"depth":43,"text":46840},{"id":47143,"depth":43,"text":47144},{"id":47364,"depth":43,"text":47365},{"id":47589,"depth":43,"text":47590},{"id":48025,"depth":43,"text":48026},{"id":48353,"depth":43,"text":48354},{"id":9246,"depth":43,"text":9247},{"id":29070,"depth":43,"text":48994},{"id":6917,"depth":43,"text":6918},"Scanned PDF Tables","Extract tables from scanned PDFs using pdf2image and Tesseract OCR. Diagnose missing text layers, preprocess images, reconstruct tabular rows, and export to CSV.",{},"\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Fhow-to-extract-tables-from-scanned-pdfs",{"title":10077,"description":49136},"Extract Tables from Scanned PDFs with Python OCR","automating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Fhow-to-extract-tables-from-scanned-pdfs\u002Findex",[9631,47,49143,46820,49144],"ocr","pdf2image","0zabP60LQMA09ry475SG9m0yZ3uWpeQc7xpes1AXqhM",{"id":49147,"title":9592,"body":49148,"breadcrumbTitle":53840,"canonical":6977,"date":46387,"description":53841,"draft":6980,"extension":6981,"image":6977,"meta":53842,"navigation":91,"path":53843,"robots":6977,"seo":53844,"seoTitle":53845,"stem":53846,"tags":53847,"updatedAt":6978,"__hash__":53849},"content\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Findex.md",{"type":7,"value":49149,"toc":53822},[49150,49153,49162,49164,49167,49231,49246,49253,49257,49260,49734,49739,49794,49909,49913,49916,50434,50447,50451,50454,50848,50857,50861,50868,51274,51283,51287,51290,51783,51787,51791,51794,51998,52002,52013,52017,52027,52031,52034,52341,52344,52346,52356,52365,52676,52684,52697,52699,52795,52797,53787,53789,53816,53820],[10,49151,9592],{"id":491
52},"extracting-tables-from-pdfs",[14,49154,49155,49156,49158,49159,3035],{},"PDFs store table data in three fundamentally different ways — bordered grids with vector lines, whitespace-aligned columns, and rasterized images — and each requires a different extraction path. Generic text parsers collapse all three into a single misaligned blob. This guide implements a decision-driven pipeline: diagnose the PDF structure first, then route to ",[940,49157,943],{"href":942}," for whitespace tables, camelot lattice for bordered grids, or the ",[940,49160,49161],{"href":10076},"OCR pipeline for scanned documents",[18,49163,21],{"id":20},[14,49165,49166],{},"Install system and Python dependencies before any extraction attempt:",[23,49168,49170],{"className":25,"code":49169,"language":27,"meta":28,"style":28},"# System dependencies (Ubuntu\u002FDebian)\nsudo apt-get install ghostscript libsm6 libxext6\n\n# Python packages\npip install pdfplumber \"camelot-py[cv]\" pandas pdf2image pytesseract\n\n# Optional: verify camelot system deps are present\npython -c \"import camelot; 
print(camelot.__version__)\"\n",[30,49171,49172,49176,49190,49194,49198,49214,49218,49223],{"__ignoreMap":28},[33,49173,49174],{"class":35,"line":36},[33,49175,46769],{"class":39},[33,49177,49178,49180,49182,49184,49186,49188],{"class":35,"line":43},[33,49179,9669],{"class":46},[33,49181,9672],{"class":54},[33,49183,79],{"class":54},[33,49185,9677],{"class":54},[33,49187,9680],{"class":54},[33,49189,9683],{"class":54},[33,49191,49192],{"class":35,"line":61},[33,49193,92],{"emptyLinePlaceholder":91},[33,49195,49196],{"class":35,"line":73},[33,49197,9692],{"class":39},[33,49199,49200,49202,49204,49206,49208,49210,49212],{"class":35,"line":88},[33,49201,76],{"class":46},[33,49203,79],{"class":54},[33,49205,9701],{"class":54},[33,49207,9704],{"class":54},[33,49209,16183],{"class":54},[33,49211,46799],{"class":54},[33,49213,47178],{"class":54},[33,49215,49216],{"class":35,"line":95},[33,49217,92],{"emptyLinePlaceholder":91},[33,49219,49220],{"class":35,"line":101},[33,49221,49222],{"class":39},"# Optional: verify camelot system deps are present\n",[33,49224,49225,49227,49229],{"class":35,"line":171},[33,49226,47],{"class":46},[33,49228,106],{"class":50},[33,49230,9725],{"class":54},[14,49232,49233,49234,49236,49237,49239,49240,49242,49243,49245],{},"If the ",[30,49235,16139],{}," import raises ",[30,49238,9731],{}," or a ",[30,49241,41520],{}," import error, see ",[940,49244,9739],{"href":9738}," before continuing.",[14,49247,49248,49249,49252],{},"Place a representative sample PDF at ",[30,49250,49251],{},"data\u002Finput.pdf"," to follow along. A one-page PDF with a visible-border invoice table works well for lattice mode; a financial statement with whitespace-separated columns suits stream mode.",[18,49254,49256],{"id":49255},"step-1-diagnostic-classify-the-pdf-before-choosing-a-parser","Step 1: Diagnostic — Classify the PDF Before Choosing a Parser",[14,49258,49259],{},"Run this snippet on any unknown PDF before touching camelot or pdfplumber. 
It detects text layer presence and line density to determine which extraction path applies.",[23,49261,49263],{"className":126,"code":49262,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nfrom pathlib import Path\nimport pdfplumber\n\nPDF_PATH = Path(\"data\u002Finput.pdf\")\n\ndef classify_pdf(path: Path) -> dict:\n    \"\"\"Return page-level metadata to choose the right extraction strategy.\"\"\"\n    result = {\"pages\": [], \"has_text\": False, \"has_lines\": False, \"likely_scanned\": False}\n    try:\n        with pdfplumber.open(path) as pdf:\n            for i, page in enumerate(pdf.pages):\n                text = page.extract_text() or \"\"\n                lines = page.lines or []\n                rects = page.rects or []\n                page_info = {\n                    \"page\": i + 1,\n                    \"text_chars\": len(text.strip()),\n                    \"vector_lines\": len(lines),\n                    \"vector_rects\": len(rects),\n                }\n                result[\"pages\"].append(page_info)\n                if text.strip():\n                    result[\"has_text\"] = True\n                if lines or rects:\n                    result[\"has_lines\"] = True\n    except Exception as e:\n        raise RuntimeError(f\"Could not open {path}: {e}\") from e\n\n    # Heuristic: no text and no lines → rasterized scan\n    result[\"likely_scanned\"] = not result[\"has_text\"] and not result[\"has_lines\"]\n    return result\n\nif __name__ == \"__main__\":\n    info = classify_pdf(PDF_PATH)\n    for p in info[\"pages\"]:\n        print(p)\n    print(f\"Recommended path: {'OCR' if info['likely_scanned'] else 'lattice' if info['has_lines'] else 
'stream'}\")\n",[30,49264,49265,49269,49279,49285,49289,49302,49306,49318,49323,49364,49370,49380,49392,49404,49417,49430,49439,49451,49463,49473,49484,49489,49499,49506,49519,49531,49543,49553,49587,49591,49596,49626,49633,49637,49649,49662,49677,49684],{"__ignoreMap":28},[33,49266,49267],{"class":35,"line":36},[33,49268,9763],{"class":39},[33,49270,49271,49273,49275,49277],{"class":35,"line":43},[33,49272,190],{"class":163},[33,49274,193],{"class":167},[33,49276,164],{"class":163},[33,49278,198],{"class":167},[33,49280,49281,49283],{"class":35,"line":61},[33,49282,164],{"class":163},[33,49284,485],{"class":167},[33,49286,49287],{"class":35,"line":73},[33,49288,92],{"emptyLinePlaceholder":91},[33,49290,49291,49293,49295,49297,49300],{"class":35,"line":88},[33,49292,7076],{"class":50},[33,49294,212],{"class":163},[33,49296,215],{"class":167},[33,49298,49299],{"class":54},"\"data\u002Finput.pdf\"",[33,49301,221],{"class":167},[33,49303,49304],{"class":35,"line":95},[33,49305,92],{"emptyLinePlaceholder":91},[33,49307,49308,49310,49312,49314,49316],{"class":35,"line":101},[33,49309,562],{"class":163},[33,49311,9810],{"class":46},[33,49313,3743],{"class":167},[33,49315,37100],{"class":50},[33,49317,574],{"class":167},[33,49319,49320],{"class":35,"line":171},[33,49321,49322],{"class":54},"    \"\"\"Return page-level metadata to choose the right extraction strategy.\"\"\"\n",[33,49324,49325,49327,49329,49331,49334,49337,49340,49342,49344,49346,49349,49351,49353,49355,49358,49360,49362],{"class":35,"line":179},[33,49326,8842],{"class":167},[33,49328,242],{"class":163},[33,49330,4098],{"class":167},[33,49332,49333],{"class":54},"\"pages\"",[33,49335,49336],{"class":167},": [], 
",[33,49338,49339],{"class":54},"\"has_text\"",[33,49341,2079],{"class":167},[33,49343,902],{"class":50},[33,49345,365],{"class":167},[33,49347,49348],{"class":54},"\"has_lines\"",[33,49350,2079],{"class":167},[33,49352,902],{"class":50},[33,49354,365],{"class":167},[33,49356,49357],{"class":54},"\"likely_scanned\"",[33,49359,2079],{"class":167},[33,49361,902],{"class":50},[33,49363,4113],{"class":167},[33,49365,49366,49368],{"class":35,"line":187},[33,49367,2424],{"class":163},[33,49369,574],{"class":167},[33,49371,49372,49374,49376,49378],{"class":35,"line":201},[33,49373,2191],{"class":163},[33,49375,7123],{"class":167},[33,49377,495],{"class":163},[33,49379,686],{"class":167},[33,49381,49382,49384,49386,49388,49390],{"class":35,"line":206},[33,49383,1793],{"class":163},[33,49385,37139],{"class":167},[33,49387,662],{"class":163},[33,49389,7403],{"class":50},[33,49391,40080],{"class":167},[33,49393,49394,49396,49398,49400,49402],{"class":35,"line":224},[33,49395,13116],{"class":167},[33,49397,242],{"class":163},[33,49399,13121],{"class":167},[33,49401,7162],{"class":163},[33,49403,13126],{"class":54},[33,49405,49406,49408,49410,49413,49415],{"class":35,"line":229},[33,49407,37159],{"class":167},[33,49409,242],{"class":163},[33,49411,49412],{"class":167}," page.lines ",[33,49414,7162],{"class":163},[33,49416,589],{"class":167},[33,49418,49419,49421,49423,49426,49428],{"class":35,"line":235},[33,49420,37169],{"class":167},[33,49422,242],{"class":163},[33,49424,49425],{"class":167}," page.rects ",[33,49427,7162],{"class":163},[33,49429,589],{"class":167},[33,49431,49432,49435,49437],{"class":35,"line":250},[33,49433,49434],{"class":167},"                page_info 
",[33,49436,242],{"class":163},[33,49438,16265],{"class":167},[33,49440,49441,49443,49445,49447,49449],{"class":35,"line":266},[33,49442,37194],{"class":54},[33,49444,37197],{"class":167},[33,49446,1811],{"class":163},[33,49448,1814],{"class":50},[33,49450,247],{"class":167},[33,49452,49453,49456,49458,49460],{"class":35,"line":290},[33,49454,49455],{"class":54},"                    \"text_chars\"",[33,49457,2079],{"class":167},[33,49459,928],{"class":50},[33,49461,49462],{"class":167},"(text.strip()),\n",[33,49464,49465,49467,49469,49471],{"class":35,"line":295},[33,49466,37208],{"class":54},[33,49468,2079],{"class":167},[33,49470,928],{"class":50},[33,49472,37215],{"class":167},[33,49474,49475,49478,49480,49482],{"class":35,"line":300},[33,49476,49477],{"class":54},"                    \"vector_rects\"",[33,49479,2079],{"class":167},[33,49481,928],{"class":50},[33,49483,37227],{"class":167},[33,49485,49486],{"class":35,"line":317},[33,49487,49488],{"class":167},"                }\n",[33,49490,49491,49494,49496],{"class":35,"line":332},[33,49492,49493],{"class":167},"                result[",[33,49495,49333],{"class":54},[33,49497,49498],{"class":167},"].append(page_info)\n",[33,49500,49501,49503],{"class":35,"line":347},[33,49502,7170],{"class":163},[33,49504,49505],{"class":167}," text.strip():\n",[33,49507,49508,49511,49513,49515,49517],{"class":35,"line":374},[33,49509,49510],{"class":167},"                    result[",[33,49512,49339],{"class":54},[33,49514,763],{"class":167},[33,49516,242],{"class":163},[33,49518,2887],{"class":50},[33,49520,49521,49523,49526,49528],{"class":35,"line":397},[33,49522,7170],{"class":163},[33,49524,49525],{"class":167}," lines ",[33,49527,7162],{"class":163},[33,49529,49530],{"class":167}," 
rects:\n",[33,49532,49533,49535,49537,49539,49541],{"class":35,"line":653},[33,49534,49510],{"class":167},[33,49536,49348],{"class":54},[33,49538,763],{"class":167},[33,49540,242],{"class":163},[33,49542,2887],{"class":50},[33,49544,49545,49547,49549,49551],{"class":35,"line":667},[33,49546,2449],{"class":163},[33,49548,783],{"class":50},[33,49550,1852],{"class":163},[33,49552,7583],{"class":167},[33,49554,49555,49557,49559,49561,49563,49565,49567,49569,49571,49573,49575,49577,49579,49581,49583,49585],{"class":35,"line":675},[33,49556,4051],{"class":163},[33,49558,7590],{"class":50},[33,49560,602],{"class":167},[33,49562,4059],{"class":163},[33,49564,43335],{"class":54},[33,49566,1115],{"class":50},[33,49568,2580],{"class":167},[33,49570,1121],{"class":50},[33,49572,2079],{"class":54},[33,49574,1115],{"class":50},[33,49576,7602],{"class":167},[33,49578,1121],{"class":50},[33,49580,274],{"class":54},[33,49582,1649],{"class":167},[33,49584,190],{"class":163},[33,49586,7613],{"class":167},[33,49588,49589],{"class":35,"line":689},[33,49590,92],{"emptyLinePlaceholder":91},[33,49592,49593],{"class":35,"line":703},[33,49594,49595],{"class":39},"    # Heuristic: no text and no lines → rasterized scan\n",[33,49597,49598,49601,49603,49605,49607,49609,49612,49614,49616,49618,49620,49622,49624],{"class":35,"line":714},[33,49599,49600],{"class":167},"    result[",[33,49602,49357],{"class":54},[33,49604,763],{"class":167},[33,49606,242],{"class":163},[33,49608,620],{"class":163},[33,49610,49611],{"class":167}," result[",[33,49613,49339],{"class":54},[33,49615,763],{"class":167},[33,49617,6001],{"class":163},[33,49619,620],{"class":163},[33,49621,49611],{"class":167},[33,49623,49348],{"class":54},[33,49625,9202],{"class":167},[33,49627,49628,49630],{"class":35,"line":723},[33,49629,1332],{"class":163},[33,49631,49632],{"class":167}," 
result\n",[33,49634,49635],{"class":35,"line":754},[33,49636,92],{"emptyLinePlaceholder":91},[33,49638,49639,49641,49643,49645,49647],{"class":35,"line":771},[33,49640,2491],{"class":163},[33,49642,2494],{"class":50},[33,49644,2497],{"class":163},[33,49646,2500],{"class":54},[33,49648,574],{"class":167},[33,49650,49651,49654,49656,49658,49660],{"class":35,"line":777},[33,49652,49653],{"class":167},"    info ",[33,49655,242],{"class":163},[33,49657,10032],{"class":167},[33,49659,7076],{"class":50},[33,49661,221],{"class":167},[33,49663,49664,49666,49668,49670,49673,49675],{"class":35,"line":788},[33,49665,656],{"class":163},[33,49667,6127],{"class":167},[33,49669,662],{"class":163},[33,49671,49672],{"class":167}," info[",[33,49674,49333],{"class":54},[33,49676,17477],{"class":167},[33,49678,49679,49681],{"class":35,"line":804},[33,49680,9414],{"class":50},[33,49682,49683],{"class":167},"(p)\n",[33,49685,49686,49688,49690,49692,49695,49697,49700,49702,49704,49707,49709,49711,49714,49716,49718,49721,49723,49725,49728,49730,49732],{"class":35,"line":809},[33,49687,7268],{"class":50},[33,49689,602],{"class":167},[33,49691,4059],{"class":163},[33,49693,49694],{"class":54},"\"Recommended path: ",[33,49696,1115],{"class":50},[33,49698,49699],{"class":54},"'OCR'",[33,49701,9994],{"class":163},[33,49703,49672],{"class":167},[33,49705,49706],{"class":54},"'likely_scanned'",[33,49708,763],{"class":167},[33,49710,7489],{"class":163},[33,49712,49713],{"class":54}," 'lattice'",[33,49715,9994],{"class":163},[33,49717,49672],{"class":167},[33,49719,49720],{"class":54},"'has_lines'",[33,49722,763],{"class":167},[33,49724,7489],{"class":163},[33,49726,49727],{"class":54}," 'stream'",[33,49729,1121],{"class":50},[33,49731,274],{"class":54},[33,49733,221],{"class":167},[14,49735,49736],{},[1974,49737,49738],{},"Reading the output:",[4273,49740,49741,49751],{},[4276,49742,49743],{},[4279,49744,49745,49748],{},[4282,49746,49747],{},"Condition",[4282,49749,49750],{},"Recommended 
parser",[4292,49752,49753,49768,49782],{},[4279,49754,49755,49763],{},[4297,49756,49757,365,49760],{},[30,49758,49759],{},"has_lines=True",[30,49761,49762],{},"has_text=True",[4297,49764,49765,49766],{},"camelot ",[30,49767,36830],{},[4279,49769,49770,49777],{},[4297,49771,49772,365,49774],{},[30,49773,49762],{},[30,49775,49776],{},"has_lines=False",[4297,49778,49765,49779,49781],{},[30,49780,36834],{}," or pdfplumber",[4279,49783,49784,49791],{},[4297,49785,49786,365,49789],{},[30,49787,49788],{},"has_text=False",[30,49790,49776],{},[4297,49792,49793],{},"OCR pipeline (pdf2image + Tesseract)",[2540,49795,2547,49798,2547,49801,2547,49804,2547,2547,49811,2547,49814,2547,49818,2547,2547,49821,2547,2547,49825,2547,49829,2547,49833,2547,2547,49837,2547,49840,2547,49844,2547,49846,2547,49848,2547,2547,49850,2547,49854,2547,49858,2547,49861,2547,49865,2547,2547,49867,2547,49870,2547,49874,2547,49876,2547,49878,2547,2547,49880,2547,49884,2547,49886,2547,49889,2547,49892,2547,49897,2547,49901,2547,49905],{"viewBox":49796,"role":2543,"ariaLabel":49797,"xmlns":2545,"style":2546},"0 0 760 380","Decision tree for choosing a PDF table extraction method",[2549,49799,49800],{},"PDF Table Extraction Decision Tree",[2553,49802,49803],{},"A decision tree showing how to route a PDF to the correct extraction method: bordered grid to camelot lattice, whitespace columns to stream mode, and scanned image to OCR.",[2557,49805,2559,49806,2547],{},[2573,49807,2564,49809,2559],{"id":49808,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"extract-tables-arrow",[2580,49810],{"d":2582,"fill":2583},[2585,49812],{"x":26446,"y":2587,"width":2701,"height":49813,"rx":2591,"fill":2592,"stroke":11166,"style":11210},"52",[2000,49815,49817],{"x":2626,"y":49816,"fill":2599,"style":2600},"42","Input PDF",[2000,49819,49820],{"x":2626,"y":2590,"fill":2583,"style":2685},"run 
classify_pdf()",[35,49822],{"x1":2626,"y1":49823,"x2":2626,"y2":2679,"stroke":2583,"markerEnd":49824,"style":2594},"72","url(#extract-tables-arrow)",[49826,49827],"polygon",{"points":49828,"fill":2615,"stroke":2593,"style":2594},"380,110 480,148 380,186 280,148",[2000,49830,49832],{"x":2626,"y":49831,"fill":2599,"style":2685},"143","Vector lines",[2000,49834,49836],{"x":2626,"y":49835,"fill":2599,"style":2685},"161","present?",[35,49838],{"x1":49839,"y1":11194,"x2":2589,"y2":11194,"stroke":2583,"markerEnd":49824,"style":2594},"280",[2000,49841,49843],{"x":2611,"y":49842,"fill":11166,"style":2605},"138","YES",[2585,49845],{"x":2587,"y":11095,"width":2650,"height":2590,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,49847,16139],{"x":38748,"y":26406,"fill":2599,"style":2600},[2000,49849,36830],{"x":38748,"y":11173,"fill":11166,"style":2685},[35,49851],{"x1":49852,"y1":11194,"x2":49853,"y2":11194,"stroke":2583,"markerEnd":49824,"style":2594},"480","560",[2000,49855,49857],{"x":49856,"y":49842,"fill":11166,"style":2605},"520","NO",[49826,49859],{"points":49860,"fill":2615,"stroke":2593,"style":2594},"620,110 720,148 620,186 520,148",[2000,49862,49864],{"x":49863,"y":49831,"fill":2599,"style":2685},"620","Text 
layer",[2000,49866,49836],{"x":49863,"y":49835,"fill":2599,"style":2685},[35,49868],{"x1":49863,"y1":11119,"x2":49863,"y2":49869,"stroke":2583,"markerEnd":49824,"style":2594},"250",[2000,49871,49843],{"x":26421,"y":49872,"fill":11166,"style":49873},"222","font-size:11px",[2585,49875],{"x":49853,"y":49869,"width":2589,"height":2590,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,49877,16139],{"x":49863,"y":11148,"fill":2599,"style":2600},[2000,49879,36834],{"x":49863,"y":2689,"fill":11166,"style":2685},[35,49881],{"x1":49882,"y1":11194,"x2":49883,"y2":11194,"stroke":2583,"style":2594},"720","740",[35,49885],{"x1":49883,"y1":11194,"x2":49883,"y2":2698,"stroke":2583,"style":2594},[35,49887],{"x1":49883,"y1":2698,"x2":49888,"y2":2698,"stroke":2583,"markerEnd":49824,"style":2594},"710",[2000,49890,49857],{"x":49891,"y":49872,"fill":11166,"style":49873},"722",[2585,49893],{"x":49894,"y":49839,"width":2588,"height":2590,"rx":2591,"fill":49895,"stroke":49896,"style":2594},"580","#fff7ed","#f59e0b",[2000,49898,49144],{"x":49899,"y":49900,"fill":2599,"style":2600},"645","305",[2000,49902,49904],{"x":49899,"y":49903,"fill":49896,"style":2685},"323","+ Tesseract OCR",[2000,49906,49908],{"x":2626,"y":49907,"fill":2583,"style":2605},"355","\nScanned pages → OCR pipeline; vector lines → lattice; whitespace columns → stream\n",[18,49910,49912],{"id":49911},"step-2-extract-bordered-tables-with-camelot-lattice","Step 2: Extract Bordered Tables with camelot Lattice",[14,49914,49915],{},"Lattice mode traces the physical grid lines in the PDF. 
Use it whenever a table has visible borders — invoices, financial statements, structured reports.",[23,49917,49919],{"className":126,"code":49918,"language":47,"meta":28,"style":28},"# pip install \"camelot-py[cv]\" pandas\nfrom pathlib import Path\nimport camelot\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Ffinancial_statement.pdf\")\nOUTPUT_DIR = Path(\"output\")\nOUTPUT_DIR.mkdir(exist_ok=True)\n\ndef extract_lattice(path: Path, pages: str = \"1-end\") -> list[pd.DataFrame]:\n    \"\"\"Extract bordered tables using camelot lattice mode.\"\"\"\n    try:\n        tables = camelot.read_pdf(\n            str(path),\n            pages=pages,\n            flavor=\"lattice\",\n            process_background=True,   # detect lines on coloured backgrounds\n            line_scale=40,             # raise to 50-60 for thin\u002Ffaint lines\n        )\n    except Exception as e:\n        raise RuntimeError(f\"camelot lattice failed on {path}: {e}\") from e\n\n    if tables.n == 0:\n        raise ValueError(f\"No tables found in {path} — check if borders are truly vector lines\")\n\n    dfs = []\n    for t in tables:\n        df = t.df.copy()\n        df.replace(\"\", pd.NA, inplace=True)\n        df.dropna(how=\"all\", inplace=True)\n        df.dropna(axis=1, how=\"all\", inplace=True)\n        dfs.append(df)\n\n    return dfs\n\nif __name__ == \"__main__\":\n    results = extract_lattice(PDF_PATH, pages=\"1-3\")\n    for i, df in enumerate(results):\n        out = OUTPUT_DIR \u002F f\"table_{i+1}.csv\"\n        df.to_csv(out, index=False)\n        print(f\"Table {i+1}: {df.shape[0]} rows × {df.shape[1]} cols → 
{out}\")\n",[30,49920,49921,49925,49935,49941,49951,49955,49968,49980,49994,49998,50015,50020,50026,50034,50040,50048,50058,50071,50086,50090,50100,50135,50139,50151,50175,50179,50187,50197,50205,50225,50246,50274,50279,50283,50289,50293,50305,50327,50340,50367,50380],{"__ignoreMap":28},[33,49922,49923],{"class":35,"line":36},[33,49924,10550],{"class":39},[33,49926,49927,49929,49931,49933],{"class":35,"line":43},[33,49928,190],{"class":163},[33,49930,193],{"class":167},[33,49932,164],{"class":163},[33,49934,198],{"class":167},[33,49936,49937,49939],{"class":35,"line":61},[33,49938,164],{"class":163},[33,49940,10567],{"class":167},[33,49942,49943,49945,49947,49949],{"class":35,"line":73},[33,49944,164],{"class":163},[33,49946,492],{"class":167},[33,49948,495],{"class":163},[33,49950,498],{"class":167},[33,49952,49953],{"class":35,"line":88},[33,49954,92],{"emptyLinePlaceholder":91},[33,49956,49957,49959,49961,49963,49966],{"class":35,"line":95},[33,49958,7076],{"class":50},[33,49960,212],{"class":163},[33,49962,215],{"class":167},[33,49964,49965],{"class":54},"\"data\u002Ffinancial_statement.pdf\"",[33,49967,221],{"class":167},[33,49969,49970,49972,49974,49976,49978],{"class":35,"line":101},[33,49971,4615],{"class":50},[33,49973,212],{"class":163},[33,49975,215],{"class":167},[33,49977,41169],{"class":54},[33,49979,221],{"class":167},[33,49981,49982,49984,49986,49988,49990,49992],{"class":35,"line":171},[33,49983,4615],{"class":50},[33,49985,1078],{"class":167},[33,49987,878],{"class":238},[33,49989,242],{"class":163},[33,49991,855],{"class":50},[33,49993,221],{"class":167},[33,49995,49996],{"class":35,"line":179},[33,49997,92],{"emptyLinePlaceholder":91},[33,49999,50000,50002,50005,50007,50009,50011,50013],{"class":35,"line":187},[33,50001,562],{"class":163},[33,50003,50004],{"class":46}," 
extract_lattice",[33,50006,14838],{"class":167},[33,50008,1053],{"class":50},[33,50010,212],{"class":163},[33,50012,10627],{"class":54},[33,50014,10647],{"class":167},[33,50016,50017],{"class":35,"line":201},[33,50018,50019],{"class":54},"    \"\"\"Extract bordered tables using camelot lattice mode.\"\"\"\n",[33,50021,50022,50024],{"class":35,"line":206},[33,50023,2424],{"class":163},[33,50025,574],{"class":167},[33,50027,50028,50030,50032],{"class":35,"line":224},[33,50029,37895],{"class":167},[33,50031,242],{"class":163},[33,50033,10668],{"class":167},[33,50035,50036,50038],{"class":35,"line":229},[33,50037,10673],{"class":50},[33,50039,10676],{"class":167},[33,50041,50042,50044,50046],{"class":35,"line":235},[33,50043,10681],{"class":238},[33,50045,242],{"class":163},[33,50047,10686],{"class":167},[33,50049,50050,50052,50054,50056],{"class":35,"line":250},[33,50051,10691],{"class":238},[33,50053,242],{"class":163},[33,50055,10985],{"class":54},[33,50057,247],{"class":167},[33,50059,50060,50062,50064,50066,50068],{"class":35,"line":266},[33,50061,10701],{"class":238},[33,50063,242],{"class":163},[33,50065,855],{"class":50},[33,50067,1166],{"class":167},[33,50069,50070],{"class":39},"# detect lines on coloured backgrounds\n",[33,50072,50073,50076,50078,50080,50083],{"class":35,"line":290},[33,50074,50075],{"class":238},"            line_scale",[33,50077,242],{"class":163},[33,50079,26323],{"class":50},[33,50081,50082],{"class":167},",             ",[33,50084,50085],{"class":39},"# raise to 50-60 for thin\u002Ffaint 
lines\n",[33,50087,50088],{"class":35,"line":295},[33,50089,5867],{"class":167},[33,50091,50092,50094,50096,50098],{"class":35,"line":300},[33,50093,2449],{"class":163},[33,50095,783],{"class":50},[33,50097,1852],{"class":163},[33,50099,7583],{"class":167},[33,50101,50102,50104,50106,50108,50110,50113,50115,50117,50119,50121,50123,50125,50127,50129,50131,50133],{"class":35,"line":317},[33,50103,4051],{"class":163},[33,50105,7590],{"class":50},[33,50107,602],{"class":167},[33,50109,4059],{"class":163},[33,50111,50112],{"class":54},"\"camelot lattice failed on ",[33,50114,1115],{"class":50},[33,50116,2580],{"class":167},[33,50118,1121],{"class":50},[33,50120,2079],{"class":54},[33,50122,1115],{"class":50},[33,50124,7602],{"class":167},[33,50126,1121],{"class":50},[33,50128,274],{"class":54},[33,50130,1649],{"class":167},[33,50132,190],{"class":163},[33,50134,7613],{"class":167},[33,50136,50137],{"class":35,"line":332},[33,50138,92],{"emptyLinePlaceholder":91},[33,50140,50141,50143,50145,50147,50149],{"class":35,"line":347},[33,50142,617],{"class":163},[33,50144,40572],{"class":167},[33,50146,1865],{"class":163},[33,50148,10791],{"class":50},[33,50150,574],{"class":167},[33,50152,50153,50155,50157,50159,50161,50164,50166,50168,50170,50173],{"class":35,"line":374},[33,50154,4051],{"class":163},[33,50156,4054],{"class":50},[33,50158,602],{"class":167},[33,50160,4059],{"class":163},[33,50162,50163],{"class":54},"\"No tables found in ",[33,50165,1115],{"class":50},[33,50167,2580],{"class":167},[33,50169,1121],{"class":50},[33,50171,50172],{"class":54}," — check if borders are truly vector 
lines\"",[33,50174,221],{"class":167},[33,50176,50177],{"class":35,"line":397},[33,50178,92],{"emptyLinePlaceholder":91},[33,50180,50181,50183,50185],{"class":35,"line":653},[33,50182,37500],{"class":167},[33,50184,242],{"class":163},[33,50186,589],{"class":167},[33,50188,50189,50191,50193,50195],{"class":35,"line":667},[33,50190,656],{"class":163},[33,50192,10818],{"class":167},[33,50194,662],{"class":163},[33,50196,38001],{"class":167},[33,50198,50199,50201,50203],{"class":35,"line":675},[33,50200,7930],{"class":167},[33,50202,242],{"class":163},[33,50204,10832],{"class":167},[33,50206,50207,50209,50211,50213,50215,50217,50219,50221,50223],{"class":35,"line":689},[33,50208,10879],{"class":167},[33,50210,3198],{"class":54},[33,50212,10884],{"class":167},[33,50214,8018],{"class":50},[33,50216,365],{"class":167},[33,50218,10891],{"class":238},[33,50220,242],{"class":163},[33,50222,855],{"class":50},[33,50224,221],{"class":167},[33,50226,50227,50230,50232,50234,50236,50238,50240,50242,50244],{"class":35,"line":703},[33,50228,50229],{"class":167},"        df.dropna(",[33,50231,28045],{"class":238},[33,50233,242],{"class":163},[33,50235,35616],{"class":54},[33,50237,365],{"class":167},[33,50239,10891],{"class":238},[33,50241,242],{"class":163},[33,50243,855],{"class":50},[33,50245,221],{"class":167},[33,50247,50248,50250,50252,50254,50256,50258,50260,50262,50264,50266,50268,50270,50272],{"class":35,"line":714},[33,50249,50229],{"class":167},[33,50251,4177],{"class":238},[33,50253,242],{"class":163},[33,50255,734],{"class":50},[33,50257,365],{"class":167},[33,50259,28045],{"class":238},[33,50261,242],{"class":163},[33,50263,35616],{"class":54},[33,50265,365],{"class":167},[33,50267,10891],{"class":238},[33,50269,242],{"class":163},[33,50271,855],{"class":50},[33,50273,221],{"class":167},[33,50275,50276],{"class":35,"line":723},[33,50277,50278],{"class":167},"        
dfs.append(df)\n",[33,50280,50281],{"class":35,"line":754},[33,50282,92],{"emptyLinePlaceholder":91},[33,50284,50285,50287],{"class":35,"line":771},[33,50286,1332],{"class":163},[33,50288,37688],{"class":167},[33,50290,50291],{"class":35,"line":777},[33,50292,92],{"emptyLinePlaceholder":91},[33,50294,50295,50297,50299,50301,50303],{"class":35,"line":788},[33,50296,2491],{"class":163},[33,50298,2494],{"class":50},[33,50300,2497],{"class":163},[33,50302,2500],{"class":54},[33,50304,574],{"class":167},[33,50306,50307,50309,50311,50314,50316,50318,50320,50322,50325],{"class":35,"line":804},[33,50308,37112],{"class":167},[33,50310,242],{"class":163},[33,50312,50313],{"class":167}," extract_lattice(",[33,50315,7076],{"class":50},[33,50317,365],{"class":167},[33,50319,10971],{"class":238},[33,50321,242],{"class":163},[33,50323,50324],{"class":54},"\"1-3\"",[33,50326,221],{"class":167},[33,50328,50329,50331,50333,50335,50337],{"class":35,"line":809},[33,50330,656],{"class":163},[33,50332,10994],{"class":167},[33,50334,662],{"class":163},[33,50336,7403],{"class":50},[33,50338,50339],{"class":167},"(results):\n",[33,50341,50342,50345,50347,50350,50352,50354,50357,50359,50361,50363,50365],{"class":35,"line":819},[33,50343,50344],{"class":167},"        out ",[33,50346,242],{"class":163},[33,50348,50349],{"class":50}," OUTPUT_DIR",[33,50351,1107],{"class":163},[33,50353,1110],{"class":163},[33,50355,50356],{"class":54},"\"table_",[33,50358,1115],{"class":50},[33,50360,7499],{"class":167},[33,50362,1811],{"class":163},[33,50364,40161],{"class":50},[33,50366,40176],{"class":54},[33,50368,50369,50372,50374,50376,50378],{"class":35,"line":829},[33,50370,50371],{"class":167},"        df.to_csv(out, 
",[33,50373,897],{"class":238},[33,50375,242],{"class":163},[33,50377,902],{"class":50},[33,50379,221],{"class":167},[33,50381,50382,50384,50386,50388,50390,50392,50394,50396,50398,50400,50402,50404,50406,50408,50410,50412,50414,50416,50418,50420,50422,50424,50426,50428,50430,50432],{"class":35,"line":834},[33,50383,9414],{"class":50},[33,50385,602],{"class":167},[33,50387,4059],{"class":163},[33,50389,11012],{"class":54},[33,50391,1115],{"class":50},[33,50393,7499],{"class":167},[33,50395,1811],{"class":163},[33,50397,40161],{"class":50},[33,50399,2079],{"class":54},[33,50401,1115],{"class":50},[33,50403,9541],{"class":167},[33,50405,748],{"class":50},[33,50407,9546],{"class":167},[33,50409,1121],{"class":50},[33,50411,16022],{"class":54},[33,50413,1115],{"class":50},[33,50415,9541],{"class":167},[33,50417,734],{"class":50},[33,50419,9546],{"class":167},[33,50421,1121],{"class":50},[33,50423,16035],{"class":54},[33,50425,1115],{"class":50},[33,50427,18014],{"class":167},[33,50429,1121],{"class":50},[33,50431,274],{"class":54},[33,50433,221],{"class":167},[14,50435,50436,50439,50440,50443,50444,50446],{},[1974,50437,50438],{},"Accuracy score:"," camelot exposes ",[30,50441,50442],{},"t.parsing_report[\"accuracy\"]"," (0–100). Scores below 80 indicate line detection problems — try adjusting ",[30,50445,11064],{}," or switching to stream mode.",[18,50448,50450],{"id":50449},"step-3-extract-whitespace-tables-with-camelot-stream","Step 3: Extract Whitespace Tables with camelot Stream",[14,50452,50453],{},"Stream mode infers column boundaries from the whitespace gaps between text runs. 
Use it for financial PDFs and reports that use fixed-width fonts and spaced columns instead of drawn borders.",[23,50455,50457],{"className":126,"code":50456,"language":47,"meta":28,"style":28},"# pip install \"camelot-py[cv]\" pandas\nfrom pathlib import Path\nimport camelot\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Fannual_report.pdf\")\n\ndef extract_stream(path: Path, pages: str = \"1\", col_sep_width: int = 10) -> list[pd.DataFrame]:\n    \"\"\"Extract whitespace-delimited tables using camelot stream mode.\"\"\"\n    try:\n        tables = camelot.read_pdf(\n            str(path),\n            pages=pages,\n            flavor=\"stream\",\n            # edge_tol controls how close text must be to a column edge\n            edge_tol=500,\n            # row_tol: vertical tolerance for grouping text into the same row\n            row_tol=2,\n        )\n    except Exception as e:\n        raise RuntimeError(f\"camelot stream failed: {e}\") from e\n\n    dfs = []\n    for t in tables:\n        df = t.df.copy()\n        # First row is usually the header; promote it\n        df.columns = df.iloc[0].str.strip()\n        df = df.iloc[1:].reset_index(drop=True)\n        df.replace(\"\", pd.NA, inplace=True)\n        dfs.append(df)\n    return dfs\n\nif __name__ == \"__main__\":\n    results = extract_stream(PDF_PATH, pages=\"2-5\")\n    for i, df in enumerate(results):\n        print(f\"Table {i+1}: {df.shape}\")\n        print(df.head(3), 
\"\\n\")\n",[30,50458,50459,50463,50473,50479,50489,50493,50506,50510,50537,50542,50548,50556,50562,50570,50580,50585,50596,50601,50612,50616,50626,50653,50657,50665,50675,50683,50688,50700,50720,50740,50744,50750,50754,50766,50788,50800,50830],{"__ignoreMap":28},[33,50460,50461],{"class":35,"line":36},[33,50462,10550],{"class":39},[33,50464,50465,50467,50469,50471],{"class":35,"line":43},[33,50466,190],{"class":163},[33,50468,193],{"class":167},[33,50470,164],{"class":163},[33,50472,198],{"class":167},[33,50474,50475,50477],{"class":35,"line":61},[33,50476,164],{"class":163},[33,50478,10567],{"class":167},[33,50480,50481,50483,50485,50487],{"class":35,"line":73},[33,50482,164],{"class":163},[33,50484,492],{"class":167},[33,50486,495],{"class":163},[33,50488,498],{"class":167},[33,50490,50491],{"class":35,"line":88},[33,50492,92],{"emptyLinePlaceholder":91},[33,50494,50495,50497,50499,50501,50504],{"class":35,"line":95},[33,50496,7076],{"class":50},[33,50498,212],{"class":163},[33,50500,215],{"class":167},[33,50502,50503],{"class":54},"\"data\u002Fannual_report.pdf\"",[33,50505,221],{"class":167},[33,50507,50508],{"class":35,"line":101},[33,50509,92],{"emptyLinePlaceholder":91},[33,50511,50512,50514,50517,50519,50521,50523,50526,50529,50531,50533,50535],{"class":35,"line":171},[33,50513,562],{"class":163},[33,50515,50516],{"class":46}," extract_stream",[33,50518,14838],{"class":167},[33,50520,1053],{"class":50},[33,50522,212],{"class":163},[33,50524,50525],{"class":54}," \"1\"",[33,50527,50528],{"class":167},", col_sep_width: ",[33,50530,1059],{"class":50},[33,50532,212],{"class":163},[33,50534,37265],{"class":50},[33,50536,10647],{"class":167},[33,50538,50539],{"class":35,"line":179},[33,50540,50541],{"class":54},"    \"\"\"Extract whitespace-delimited tables using camelot stream 
mode.\"\"\"\n",[33,50543,50544,50546],{"class":35,"line":187},[33,50545,2424],{"class":163},[33,50547,574],{"class":167},[33,50549,50550,50552,50554],{"class":35,"line":201},[33,50551,37895],{"class":167},[33,50553,242],{"class":163},[33,50555,10668],{"class":167},[33,50557,50558,50560],{"class":35,"line":206},[33,50559,10673],{"class":50},[33,50561,10676],{"class":167},[33,50563,50564,50566,50568],{"class":35,"line":224},[33,50565,10681],{"class":238},[33,50567,242],{"class":163},[33,50569,10686],{"class":167},[33,50571,50572,50574,50576,50578],{"class":35,"line":229},[33,50573,10691],{"class":238},[33,50575,242],{"class":163},[33,50577,13407],{"class":54},[33,50579,247],{"class":167},[33,50581,50582],{"class":35,"line":235},[33,50583,50584],{"class":39},"            # edge_tol controls how close text must be to a column edge\n",[33,50586,50587,50590,50592,50594],{"class":35,"line":250},[33,50588,50589],{"class":238},"            edge_tol",[33,50591,242],{"class":163},[33,50593,13437],{"class":50},[33,50595,247],{"class":167},[33,50597,50598],{"class":35,"line":266},[33,50599,50600],{"class":39},"            # row_tol: vertical tolerance for grouping text into the same row\n",[33,50602,50603,50606,50608,50610],{"class":35,"line":290},[33,50604,50605],{"class":238},"            row_tol",[33,50607,242],{"class":163},[33,50609,1533],{"class":50},[33,50611,247],{"class":167},[33,50613,50614],{"class":35,"line":295},[33,50615,5867],{"class":167},[33,50617,50618,50620,50622,50624],{"class":35,"line":300},[33,50619,2449],{"class":163},[33,50621,783],{"class":50},[33,50623,1852],{"class":163},[33,50625,7583],{"class":167},[33,50627,50628,50630,50632,50634,50636,50639,50641,50643,50645,50647,50649,50651],{"class":35,"line":317},[33,50629,4051],{"class":163},[33,50631,7590],{"class":50},[33,50633,602],{"class":167},[33,50635,4059],{"class":163},[33,50637,50638],{"class":54},"\"camelot stream failed: 
",[33,50640,1115],{"class":50},[33,50642,7602],{"class":167},[33,50644,1121],{"class":50},[33,50646,274],{"class":54},[33,50648,1649],{"class":167},[33,50650,190],{"class":163},[33,50652,7613],{"class":167},[33,50654,50655],{"class":35,"line":332},[33,50656,92],{"emptyLinePlaceholder":91},[33,50658,50659,50661,50663],{"class":35,"line":347},[33,50660,37500],{"class":167},[33,50662,242],{"class":163},[33,50664,589],{"class":167},[33,50666,50667,50669,50671,50673],{"class":35,"line":374},[33,50668,656],{"class":163},[33,50670,10818],{"class":167},[33,50672,662],{"class":163},[33,50674,38001],{"class":167},[33,50676,50677,50679,50681],{"class":35,"line":397},[33,50678,7930],{"class":167},[33,50680,242],{"class":163},[33,50682,10832],{"class":167},[33,50684,50685],{"class":35,"line":653},[33,50686,50687],{"class":39},"        # First row is usually the header; promote it\n",[33,50689,50690,50692,50694,50696,50698],{"class":35,"line":667},[33,50691,10842],{"class":167},[33,50693,242],{"class":163},[33,50695,10847],{"class":167},[33,50697,748],{"class":50},[33,50699,10852],{"class":167},[33,50701,50702,50704,50706,50708,50710,50712,50714,50716,50718],{"class":35,"line":675},[33,50703,7930],{"class":167},[33,50705,242],{"class":163},[33,50707,10847],{"class":167},[33,50709,734],{"class":50},[33,50711,10865],{"class":167},[33,50713,10868],{"class":238},[33,50715,242],{"class":163},[33,50717,855],{"class":50},[33,50719,221],{"class":167},[33,50721,50722,50724,50726,50728,50730,50732,50734,50736,50738],{"class":35,"line":689},[33,50723,10879],{"class":167},[33,50725,3198],{"class":54},[33,50727,10884],{"class":167},[33,50729,8018],{"class":50},[33,50731,365],{"class":167},[33,50733,10891],{"class":238},[33,50735,242],{"class":163},[33,50737,855],{"class":50},[33,50739,221],{"class":167},[33,50741,50742],{"class":35,"line":703},[33,50743,50278],{"class":167},[33,50745,50746,50748],{"class":35,"line":714},[33,50747,1332],{"class":163},[33,50749,37688],{"class":167},[33,50751,50
752],{"class":35,"line":723},[33,50753,92],{"emptyLinePlaceholder":91},[33,50755,50756,50758,50760,50762,50764],{"class":35,"line":754},[33,50757,2491],{"class":163},[33,50759,2494],{"class":50},[33,50761,2497],{"class":163},[33,50763,2500],{"class":54},[33,50765,574],{"class":167},[33,50767,50768,50770,50772,50775,50777,50779,50781,50783,50786],{"class":35,"line":771},[33,50769,37112],{"class":167},[33,50771,242],{"class":163},[33,50773,50774],{"class":167}," extract_stream(",[33,50776,7076],{"class":50},[33,50778,365],{"class":167},[33,50780,10971],{"class":238},[33,50782,242],{"class":163},[33,50784,50785],{"class":54},"\"2-5\"",[33,50787,221],{"class":167},[33,50789,50790,50792,50794,50796,50798],{"class":35,"line":777},[33,50791,656],{"class":163},[33,50793,10994],{"class":167},[33,50795,662],{"class":163},[33,50797,7403],{"class":50},[33,50799,50339],{"class":167},[33,50801,50802,50804,50806,50808,50810,50812,50814,50816,50818,50820,50822,50824,50826,50828],{"class":35,"line":788},[33,50803,9414],{"class":50},[33,50805,602],{"class":167},[33,50807,4059],{"class":163},[33,50809,11012],{"class":54},[33,50811,1115],{"class":50},[33,50813,7499],{"class":167},[33,50815,1811],{"class":163},[33,50817,40161],{"class":50},[33,50819,2079],{"class":54},[33,50821,1115],{"class":50},[33,50823,9426],{"class":167},[33,50825,1121],{"class":50},[33,50827,274],{"class":54},[33,50829,221],{"class":167},[33,50831,50832,50834,50836,50838,50840,50842,50844,50846],{"class":35,"line":804},[33,50833,9414],{"class":50},[33,50835,35717],{"class":167},[33,50837,10258],{"class":50},[33,50839,18525],{"class":167},[33,50841,274],{"class":54},[33,50843,25830],{"class":50},[33,50845,274],{"class":54},[33,50847,221],{"class":167},[14,50849,50850,50853,50854,3035],{},[1974,50851,50852],{},"When stream fails:"," if columns merge, open the PDF in a viewer, note the x-coordinates of column separators, and pass them explicitly: ",[30,50855,50856],{},"camelot.read_pdf(..., 
columns=[\"72,144,288,432\"])",[18,50858,50860],{"id":50859},"step-4-pdfplumber-fallback-for-sparse-or-irregular-grids","Step 4: pdfplumber Fallback for Sparse or Irregular Grids",[14,50862,50863,50867],{},[940,50864,943],{"href":50865,"rel":50866},"https:\u002F\u002Fgithub.com\u002Fjsvine\u002Fpdfplumber",[1367]," works well when camelot finds no tables — particularly for loosely-bordered tables or when the PDF is generated by tools that draw border-like rectangles rather than actual PDF line objects.",[23,50869,50871],{"className":126,"code":50870,"language":47,"meta":28,"style":28},"# pip install pdfplumber pandas\nfrom pathlib import Path\nimport pdfplumber\nimport pandas as pd\n\nPDF_PATH = Path(\"data\u002Freport.pdf\")\n\ndef extract_with_pdfplumber(path: Path) -> list[pd.DataFrame]:\n    \"\"\"Extract tables page-by-page using pdfplumber's built-in table finder.\"\"\"\n    dfs = []\n    try:\n        with pdfplumber.open(path) as pdf:\n            for page_num, page in enumerate(pdf.pages, start=1):\n                raw_tables = page.extract_tables(table_settings={\n                    \"vertical_strategy\": \"lines_strict\",\n                    \"horizontal_strategy\": \"lines_strict\",\n                    \"snap_tolerance\": 3,\n                })\n                for raw in raw_tables:\n                    if not raw or len(raw) \u003C 2:\n                        continue\n                    # First row as header\n                    header = [str(c).strip() if c else f\"col_{i}\" for i, c in enumerate(raw[0])]\n                    rows = raw[1:]\n                    df = pd.DataFrame(rows, columns=header)\n                    df.replace(\"\", pd.NA, inplace=True)\n                    dfs.append(df)\n    except Exception as e:\n        raise RuntimeError(f\"pdfplumber extraction failed: {e}\") from e\n    return dfs\n\nif __name__ == \"__main__\":\n    tables = extract_with_pdfplumber(PDF_PATH)\n    print(f\"Found {len(tables)} table(s)\")\n    for df 
in tables:\n        print(df.head())\n",[30,50872,50873,50877,50887,50893,50903,50907,50919,50923,50932,50937,50945,50951,50961,50981,50995,51005,51015,51025,51029,51039,51059,51063,51068,51112,51125,51140,51161,51165,51175,51202,51208,51212,51224,51237,51258,51268],{"__ignoreMap":28},[33,50874,50875],{"class":35,"line":36},[33,50876,7041],{"class":39},[33,50878,50879,50881,50883,50885],{"class":35,"line":43},[33,50880,190],{"class":163},[33,50882,193],{"class":167},[33,50884,164],{"class":163},[33,50886,198],{"class":167},[33,50888,50889,50891],{"class":35,"line":61},[33,50890,164],{"class":163},[33,50892,485],{"class":167},[33,50894,50895,50897,50899,50901],{"class":35,"line":73},[33,50896,164],{"class":163},[33,50898,492],{"class":167},[33,50900,495],{"class":163},[33,50902,498],{"class":167},[33,50904,50905],{"class":35,"line":88},[33,50906,92],{"emptyLinePlaceholder":91},[33,50908,50909,50911,50913,50915,50917],{"class":35,"line":95},[33,50910,7076],{"class":50},[33,50912,212],{"class":163},[33,50914,215],{"class":167},[33,50916,7083],{"class":54},[33,50918,221],{"class":167},[33,50920,50921],{"class":35,"line":101},[33,50922,92],{"emptyLinePlaceholder":91},[33,50924,50925,50927,50930],{"class":35,"line":171},[33,50926,562],{"class":163},[33,50928,50929],{"class":46}," extract_with_pdfplumber",[33,50931,7362],{"class":167},[33,50933,50934],{"class":35,"line":179},[33,50935,50936],{"class":54},"    \"\"\"Extract tables page-by-page using pdfplumber's built-in table 
finder.\"\"\"\n",[33,50938,50939,50941,50943],{"class":35,"line":187},[33,50940,37500],{"class":167},[33,50942,242],{"class":163},[33,50944,589],{"class":167},[33,50946,50947,50949],{"class":35,"line":201},[33,50948,2424],{"class":163},[33,50950,574],{"class":167},[33,50952,50953,50955,50957,50959],{"class":35,"line":206},[33,50954,2191],{"class":163},[33,50956,7123],{"class":167},[33,50958,495],{"class":163},[33,50960,686],{"class":167},[33,50962,50963,50965,50967,50969,50971,50973,50975,50977,50979],{"class":35,"line":224},[33,50964,1793],{"class":163},[33,50966,7398],{"class":167},[33,50968,662],{"class":163},[33,50970,7403],{"class":50},[33,50972,7406],{"class":167},[33,50974,7409],{"class":238},[33,50976,242],{"class":163},[33,50978,734],{"class":50},[33,50980,1737],{"class":167},[33,50982,50983,50985,50987,50989,50991,50993],{"class":35,"line":229},[33,50984,10212],{"class":167},[33,50986,242],{"class":163},[33,50988,10217],{"class":167},[33,50990,10220],{"class":238},[33,50992,242],{"class":163},[33,50994,10225],{"class":167},[33,50996,50997,50999,51001,51003],{"class":35,"line":235},[33,50998,10230],{"class":54},[33,51000,2079],{"class":167},[33,51002,10235],{"class":54},[33,51004,247],{"class":167},[33,51006,51007,51009,51011,51013],{"class":35,"line":250},[33,51008,10242],{"class":54},[33,51010,2079],{"class":167},[33,51012,10235],{"class":54},[33,51014,247],{"class":167},[33,51016,51017,51019,51021,51023],{"class":35,"line":266},[33,51018,10253],{"class":54},[33,51020,2079],{"class":167},[33,51022,10258],{"class":50},[33,51024,247],{"class":167},[33,51026,51027],{"class":35,"line":290},[33,51028,37314],{"class":167},[33,51030,51031,51033,51035,51037],{"class":35,"line":295},[33,51032,692],{"class":163},[33,51034,7422],{"class":167},[33,51036,662],{"class":163},[33,51038,10280],{"class":167},[33,51040,51041,51043,51045,51047,51049,51051,51053,51055,51057],{"class":35,"line":300},[33,51042,717],{"class":163},[33,51044,620],{"class":163},[33,51046,7422],{"cl
ass":167},[33,51048,7162],{"class":163},[33,51050,4037],{"class":50},[33,51052,7446],{"class":167},[33,51054,4043],{"class":163},[33,51056,7451],{"class":50},[33,51058,574],{"class":167},[33,51060,51061],{"class":35,"line":317},[33,51062,7458],{"class":163},[33,51064,51065],{"class":35,"line":332},[33,51066,51067],{"class":39},"                    # First row as header\n",[33,51069,51070,51072,51074,51076,51078,51080,51082,51084,51086,51088,51090,51092,51094,51096,51098,51100,51102,51104,51106,51108,51110],{"class":35,"line":347},[33,51071,7468],{"class":167},[33,51073,242],{"class":163},[33,51075,9178],{"class":167},[33,51077,1053],{"class":50},[33,51079,7481],{"class":167},[33,51081,2491],{"class":163},[33,51083,7486],{"class":167},[33,51085,7489],{"class":163},[33,51087,1110],{"class":163},[33,51089,7494],{"class":54},[33,51091,1115],{"class":50},[33,51093,7499],{"class":167},[33,51095,1121],{"class":50},[33,51097,274],{"class":54},[33,51099,14766],{"class":163},[33,51101,7512],{"class":167},[33,51103,662],{"class":163},[33,51105,7403],{"class":50},[33,51107,7519],{"class":167},[33,51109,748],{"class":50},[33,51111,14779],{"class":167},[33,51113,51114,51116,51118,51121,51123],{"class":35,"line":374},[33,51115,45694],{"class":167},[33,51117,242],{"class":163},[33,51119,51120],{"class":167}," raw[",[33,51122,734],{"class":50},[33,51124,39364],{"class":167},[33,51126,51127,51129,51131,51134,51136,51138],{"class":35,"line":397},[33,51128,7533],{"class":167},[33,51130,242],{"class":163},[33,51132,51133],{"class":167}," pd.DataFrame(rows, ",[33,51135,740],{"class":238},[33,51137,242],{"class":163},[33,51139,7549],{"class":167},[33,51141,51142,51145,51147,51149,51151,51153,51155,51157,51159],{"class":35,"line":653},[33,51143,51144],{"class":167},"                    
df.replace(",[33,51146,3198],{"class":54},[33,51148,10884],{"class":167},[33,51150,8018],{"class":50},[33,51152,365],{"class":167},[33,51154,10891],{"class":238},[33,51156,242],{"class":163},[33,51158,855],{"class":50},[33,51160,221],{"class":167},[33,51162,51163],{"class":35,"line":667},[33,51164,37650],{"class":167},[33,51166,51167,51169,51171,51173],{"class":35,"line":675},[33,51168,2449],{"class":163},[33,51170,783],{"class":50},[33,51172,1852],{"class":163},[33,51174,7583],{"class":167},[33,51176,51177,51179,51181,51183,51185,51188,51190,51192,51194,51196,51198,51200],{"class":35,"line":689},[33,51178,4051],{"class":163},[33,51180,7590],{"class":50},[33,51182,602],{"class":167},[33,51184,4059],{"class":163},[33,51186,51187],{"class":54},"\"pdfplumber extraction failed: ",[33,51189,1115],{"class":50},[33,51191,7602],{"class":167},[33,51193,1121],{"class":50},[33,51195,274],{"class":54},[33,51197,1649],{"class":167},[33,51199,190],{"class":163},[33,51201,7613],{"class":167},[33,51203,51204,51206],{"class":35,"line":703},[33,51205,1332],{"class":163},[33,51207,37688],{"class":167},[33,51209,51210],{"class":35,"line":714},[33,51211,92],{"emptyLinePlaceholder":91},[33,51213,51214,51216,51218,51220,51222],{"class":35,"line":723},[33,51215,2491],{"class":163},[33,51217,2494],{"class":50},[33,51219,2497],{"class":163},[33,51221,2500],{"class":54},[33,51223,574],{"class":167},[33,51225,51226,51228,51230,51233,51235],{"class":35,"line":754},[33,51227,37709],{"class":167},[33,51229,242],{"class":163},[33,51231,51232],{"class":167}," extract_with_pdfplumber(",[33,51234,7076],{"class":50},[33,51236,221],{"class":167},[33,51238,51239,51241,51243,51245,51248,51250,51252,51254,51256],{"class":35,"line":771},[33,51240,7268],{"class":50},[33,51242,602],{"class":167},[33,51244,4059],{"class":163},[33,51246,51247],{"class":54},"\"Found 
",[33,51249,4065],{"class":50},[33,51251,39168],{"class":167},[33,51253,1121],{"class":50},[33,51255,6247],{"class":54},[33,51257,221],{"class":167},[33,51259,51260,51262,51264,51266],{"class":35,"line":777},[33,51261,656],{"class":163},[33,51263,7810],{"class":167},[33,51265,662],{"class":163},[33,51267,38001],{"class":167},[33,51269,51270,51272],{"class":35,"line":788},[33,51271,9414],{"class":50},[33,51273,13311],{"class":167},[14,51275,51276,51277,51279,51280,51282],{},"If columns are still misaligned after pdfplumber extraction, the root cause is usually coordinate drift in multi-column layouts — see ",[940,51278,10535],{"href":10534}," for the ",[30,51281,10530],{}," + coordinate-sorting approach.",[18,51284,51286],{"id":51285},"step-5-multi-page-header-deduplication","Step 5: Multi-Page Header Deduplication",[14,51288,51289],{},"Paginated PDFs repeat the header row at each page break. Concatenating tables naively produces a DataFrame with hundreds of mid-data header rows.",[23,51291,51293],{"className":126,"code":51292,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\ndef dedup_and_concat(tables: list[pd.DataFrame]) -> pd.DataFrame:\n    \"\"\"Merge tables from multiple pages, removing repeated header rows.\"\"\"\n    if not tables:\n        return pd.DataFrame()\n\n    # Use first row of first table as the canonical header\n    first = tables[0]\n    if first.iloc[0].tolist() == first.columns.tolist():\n        # DataFrame was built with header already promoted; skip first data row check\n        canonical_header = first.columns.tolist()\n    else:\n        canonical_header = first.iloc[0].tolist()\n        first = first.iloc[1:].copy()\n        first.columns = canonical_header\n\n    cleaned = [first]\n    for df in tables[1:]:\n        # Detect and drop repeated header rows by comparing to canonical\n        if df.shape[1] == len(canonical_header):\n            mask = df.apply(lambda row: 
row.tolist() == canonical_header, axis=1)\n            df = df[~mask].copy()\n        df.columns = canonical_header\n        cleaned.append(df)\n\n    combined = pd.concat(cleaned, ignore_index=True)\n\n    # Forward-fill merged\u002Fspanned cells (common in PDF tables)\n    combined = combined.ffill()\n\n    # Coerce numeric columns\n    for col in combined.columns:\n        numeric = pd.to_numeric(combined[col], errors=\"coerce\")\n        if numeric.notna().mean() > 0.8:   # >80% parseable → treat as numeric\n            combined[col] = numeric\n\n    return combined\n\nif __name__ == \"__main__\":\n    sample = [\n        pd.DataFrame([[\"ID\", \"Amount\", \"Date\"], [\"1\", \"500.00\", \"2026-01-01\"]]),\n        pd.DataFrame([[\"ID\", \"Amount\", \"Date\"], [\"2\", \"750.50\", \"2026-01-15\"]]),\n    ]\n    result = dedup_and_concat(sample)\n    print(result)\n    result.to_csv(\"output\u002Fmerged_tables.csv\", index=False)\n",[30,51294,51295,51299,51309,51319,51323,51333,51338,51346,51352,51356,51361,51374,51391,51396,51406,51412,51424,51438,51448,51452,51461,51475,51480,51497,51524,51538,51546,51550,51554,51570,51574,51579,51587,51591,51596,51606,51624,51641,51650,51654,51660,51664,51676,51684,51717,51746,51750,51759,51765],{"__ignoreMap":28},[33,51296,51297],{"class":35,"line":36},[33,51298,8895],{"class":39},[33,51300,51301,51303,51305,51307],{"class":35,"line":43},[33,51302,190],{"class":163},[33,51304,193],{"class":167},[33,51306,164],{"class":163},[33,51308,198],{"class":167},[33,51310,51311,51313,51315,51317],{"class":35,"line":61},[33,51312,164],{"class":163},[33,51314,492],{"class":167},[33,51316,495],{"class":163},[33,51318,498],{"class":167},[33,51320,51321],{"class":35,"line":73},[33,51322,92],{"emptyLinePlaceholder":91},[33,51324,51325,51327,51330],{"class":35,"line":88},[33,51326,562],{"class":163},[33,51328,51329],{"class":46}," dedup_and_concat",[33,51331,51332],{"class":167},"(tables: list[pd.DataFrame]) -> 
pd.DataFrame:\n",[33,51334,51335],{"class":35,"line":95},[33,51336,51337],{"class":54},"    \"\"\"Merge tables from multiple pages, removing repeated header rows.\"\"\"\n",[33,51339,51340,51342,51344],{"class":35,"line":101},[33,51341,617],{"class":163},[33,51343,620],{"class":163},[33,51345,38001],{"class":167},[33,51347,51348,51350],{"class":35,"line":171},[33,51349,1659],{"class":163},[33,51351,7721],{"class":167},[33,51353,51354],{"class":35,"line":179},[33,51355,92],{"emptyLinePlaceholder":91},[33,51357,51358],{"class":35,"line":187},[33,51359,51360],{"class":39},"    # Use first row of first table as the canonical header\n",[33,51362,51363,51366,51368,51370,51372],{"class":35,"line":201},[33,51364,51365],{"class":167},"    first ",[33,51367,242],{"class":163},[33,51369,13464],{"class":167},[33,51371,748],{"class":50},[33,51373,9202],{"class":167},[33,51375,51376,51378,51381,51383,51386,51388],{"class":35,"line":206},[33,51377,617],{"class":163},[33,51379,51380],{"class":167}," first.iloc[",[33,51382,748],{"class":50},[33,51384,51385],{"class":167},"].tolist() ",[33,51387,1865],{"class":163},[33,51389,51390],{"class":167}," first.columns.tolist():\n",[33,51392,51393],{"class":35,"line":224},[33,51394,51395],{"class":39},"        # DataFrame was built with header already promoted; skip first data row check\n",[33,51397,51398,51401,51403],{"class":35,"line":229},[33,51399,51400],{"class":167},"        canonical_header ",[33,51402,242],{"class":163},[33,51404,51405],{"class":167}," first.columns.tolist()\n",[33,51407,51408,51410],{"class":35,"line":235},[33,51409,6864],{"class":163},[33,51411,574],{"class":167},[33,51413,51414,51416,51418,51420,51422],{"class":35,"line":250},[33,51415,51400],{"class":167},[33,51417,242],{"class":163},[33,51419,51380],{"class":167},[33,51421,748],{"class":50},[33,51423,40639],{"class":167},[33,51425,51426,51429,51431,51433,51435],{"class":35,"line":266},[33,51427,51428],{"class":167},"        first 
",[33,51430,242],{"class":163},[33,51432,51380],{"class":167},[33,51434,734],{"class":50},[33,51436,51437],{"class":167},":].copy()\n",[33,51439,51440,51443,51445],{"class":35,"line":290},[33,51441,51442],{"class":167},"        first.columns ",[33,51444,242],{"class":163},[33,51446,51447],{"class":167}," canonical_header\n",[33,51449,51450],{"class":35,"line":295},[33,51451,92],{"emptyLinePlaceholder":91},[33,51453,51454,51456,51458],{"class":35,"line":300},[33,51455,12471],{"class":167},[33,51457,242],{"class":163},[33,51459,51460],{"class":167}," [first]\n",[33,51462,51463,51465,51467,51469,51471,51473],{"class":35,"line":317},[33,51464,656],{"class":163},[33,51466,7810],{"class":167},[33,51468,662],{"class":163},[33,51470,13464],{"class":167},[33,51472,734],{"class":50},[33,51474,43533],{"class":167},[33,51476,51477],{"class":35,"line":332},[33,51478,51479],{"class":39},"        # Detect and drop repeated header rows by comparing to canonical\n",[33,51481,51482,51484,51486,51488,51490,51492,51494],{"class":35,"line":347},[33,51483,8221],{"class":163},[33,51485,9516],{"class":167},[33,51487,734],{"class":50},[33,51489,763],{"class":167},[33,51491,1865],{"class":163},[33,51493,4037],{"class":50},[33,51495,51496],{"class":167},"(canonical_header):\n",[33,51498,51499,51502,51504,51506,51508,51511,51513,51516,51518,51520,51522],{"class":35,"line":374},[33,51500,51501],{"class":167},"            mask ",[33,51503,242],{"class":163},[33,51505,39836],{"class":167},[33,51507,39839],{"class":163},[33,51509,51510],{"class":167}," row: row.tolist() ",[33,51512,1865],{"class":163},[33,51514,51515],{"class":167}," canonical_header, ",[33,51517,4177],{"class":238},[33,51519,242],{"class":163},[33,51521,734],{"class":50},[33,51523,221],{"class":167},[33,51525,51526,51529,51531,51533,51535],{"class":35,"line":397},[33,51527,51528],{"class":167},"            df 
",[33,51530,242],{"class":163},[33,51532,7935],{"class":167},[33,51534,7938],{"class":163},[33,51536,51537],{"class":167},"mask].copy()\n",[33,51539,51540,51542,51544],{"class":35,"line":653},[33,51541,10842],{"class":167},[33,51543,242],{"class":163},[33,51545,51447],{"class":167},[33,51547,51548],{"class":35,"line":667},[33,51549,8043],{"class":167},[33,51551,51552],{"class":35,"line":675},[33,51553,92],{"emptyLinePlaceholder":91},[33,51555,51556,51558,51560,51562,51564,51566,51568],{"class":35,"line":689},[33,51557,842],{"class":167},[33,51559,242],{"class":163},[33,51561,8061],{"class":167},[33,51563,850],{"class":238},[33,51565,242],{"class":163},[33,51567,855],{"class":50},[33,51569,221],{"class":167},[33,51571,51572],{"class":35,"line":703},[33,51573,92],{"emptyLinePlaceholder":91},[33,51575,51576],{"class":35,"line":714},[33,51577,51578],{"class":39},"    # Forward-fill merged\u002Fspanned cells (common in PDF tables)\n",[33,51580,51581,51583,51585],{"class":35,"line":723},[33,51582,842],{"class":167},[33,51584,242],{"class":163},[33,51586,8087],{"class":167},[33,51588,51589],{"class":35,"line":754},[33,51590,92],{"emptyLinePlaceholder":91},[33,51592,51593],{"class":35,"line":771},[33,51594,51595],{"class":39},"    # Coerce numeric columns\n",[33,51597,51598,51600,51602,51604],{"class":35,"line":777},[33,51599,656],{"class":163},[33,51601,7985],{"class":167},[33,51603,662],{"class":163},[33,51605,8216],{"class":167},[33,51607,51608,51611,51613,51616,51618,51620,51622],{"class":35,"line":788},[33,51609,51610],{"class":167},"        numeric ",[33,51612,242],{"class":163},[33,51614,51615],{"class":167}," pd.to_numeric(combined[col], ",[33,51617,8317],{"class":238},[33,51619,242],{"class":163},[33,51621,12107],{"class":54},[33,51623,221],{"class":167},[33,51625,51626,51628,51631,51633,51636,51638],{"class":35,"line":804},[33,51627,8221],{"class":163},[33,51629,51630],{"class":167}," numeric.notna().mean() 
",[33,51632,6009],{"class":163},[33,51634,51635],{"class":50}," 0.8",[33,51637,20656],{"class":167},[33,51639,51640],{"class":39},"# >80% parseable → treat as numeric\n",[33,51642,51643,51645,51647],{"class":35,"line":809},[33,51644,15932],{"class":167},[33,51646,242],{"class":163},[33,51648,51649],{"class":167}," numeric\n",[33,51651,51652],{"class":35,"line":819},[33,51653,92],{"emptyLinePlaceholder":91},[33,51655,51656,51658],{"class":35,"line":829},[33,51657,1332],{"class":163},[33,51659,8098],{"class":167},[33,51661,51662],{"class":35,"line":834},[33,51663,92],{"emptyLinePlaceholder":91},[33,51665,51666,51668,51670,51672,51674],{"class":35,"line":839},[33,51667,2491],{"class":163},[33,51669,2494],{"class":50},[33,51671,2497],{"class":163},[33,51673,2500],{"class":54},[33,51675,574],{"class":167},[33,51677,51678,51680,51682],{"class":35,"line":860},[33,51679,11744],{"class":167},[33,51681,242],{"class":163},[33,51683,7473],{"class":167},[33,51685,51686,51689,51691,51693,51695,51697,51699,51702,51704,51706,51709,51711,51714],{"class":35,"line":887},[33,51687,51688],{"class":167},"        pd.DataFrame([[",[33,51690,8855],{"class":54},[33,51692,365],{"class":167},[33,51694,7030],{"class":54},[33,51696,365],{"class":167},[33,51698,7027],{"class":54},[33,51700,51701],{"class":167},"], 
[",[33,51703,35984],{"class":54},[33,51705,365],{"class":167},[33,51707,51708],{"class":54},"\"500.00\"",[33,51710,365],{"class":167},[33,51712,51713],{"class":54},"\"2026-01-01\"",[33,51715,51716],{"class":167},"]]),\n",[33,51718,51719,51721,51723,51725,51727,51729,51731,51733,51735,51737,51740,51742,51744],{"class":35,"line":907},[33,51720,51688],{"class":167},[33,51722,8855],{"class":54},[33,51724,365],{"class":167},[33,51726,7030],{"class":54},[33,51728,365],{"class":167},[33,51730,7027],{"class":54},[33,51732,51701],{"class":167},[33,51734,13395],{"class":54},[33,51736,365],{"class":167},[33,51738,51739],{"class":54},"\"750.50\"",[33,51741,365],{"class":167},[33,51743,12407],{"class":54},[33,51745,51716],{"class":167},[33,51747,51748],{"class":35,"line":1826},[33,51749,19559],{"class":167},[33,51751,51752,51754,51756],{"class":35,"line":1844},[33,51753,8842],{"class":167},[33,51755,242],{"class":163},[33,51757,51758],{"class":167}," dedup_and_concat(sample)\n",[33,51760,51761,51763],{"class":35,"line":1858},[33,51762,7268],{"class":50},[33,51764,8864],{"class":167},[33,51766,51767,51770,51773,51775,51777,51779,51781],{"class":35,"line":1871},[33,51768,51769],{"class":167},"    result.to_csv(",[33,51771,51772],{"class":54},"\"output\u002Fmerged_tables.csv\"",[33,51774,365],{"class":167},[33,51776,897],{"class":238},[33,51778,242],{"class":163},[33,51780,902],{"class":50},[33,51782,221],{"class":167},[18,51784,51786],{"id":51785},"edge-cases","Edge Cases",[424,51788,51790],{"id":51789},"tables-spanning-two-pages-with-no-repeat-header","Tables Spanning Two Pages with No Repeat Header",[14,51792,51793],{},"Some PDF generators split a table mid-row at a page boundary, producing a bottom fragment and a top fragment on successive pages.",[23,51795,51797],{"className":126,"code":51796,"language":47,"meta":28,"style":28},"# pip install camelot-py[cv] pandas\nfrom pathlib import Path\nimport camelot, pandas as pd\n\ndef stitch_split_table(path: Path, pages: str = 
\"3,4\") -> pd.DataFrame:\n    \"\"\"Join tables split across adjacent pages when no header repeats.\"\"\"\n    tables = camelot.read_pdf(str(path), pages=pages, flavor=\"lattice\")\n    dfs = [t.df for t in tables]\n    if not dfs:\n        return pd.DataFrame()\n    header = dfs[0].iloc[0].tolist()\n    frames = []\n    for df in dfs:\n        if df.iloc[0].tolist() == header:\n            df = df.iloc[1:]\n        df.columns = header\n        frames.append(df)\n    return pd.concat(frames, ignore_index=True)\n",[30,51798,51799,51803,51813,51824,51828,51846,51851,51877,51895,51904,51910,51927,51935,51945,51959,51971,51980,51984],{"__ignoreMap":28},[33,51800,51801],{"class":35,"line":36},[33,51802,37785],{"class":39},[33,51804,51805,51807,51809,51811],{"class":35,"line":43},[33,51806,190],{"class":163},[33,51808,193],{"class":167},[33,51810,164],{"class":163},[33,51812,198],{"class":167},[33,51814,51815,51817,51820,51822],{"class":35,"line":61},[33,51816,164],{"class":163},[33,51818,51819],{"class":167}," camelot, pandas ",[33,51821,495],{"class":163},[33,51823,498],{"class":167},[33,51825,51826],{"class":35,"line":73},[33,51827,92],{"emptyLinePlaceholder":91},[33,51829,51830,51832,51835,51837,51839,51841,51844],{"class":35,"line":88},[33,51831,562],{"class":163},[33,51833,51834],{"class":46}," stitch_split_table",[33,51836,14838],{"class":167},[33,51838,1053],{"class":50},[33,51840,212],{"class":163},[33,51842,51843],{"class":54}," \"3,4\"",[33,51845,7668],{"class":167},[33,51847,51848],{"class":35,"line":95},[33,51849,51850],{"class":54},"    \"\"\"Join tables split across adjacent pages when no header 
repeats.\"\"\"\n",[33,51852,51853,51855,51857,51859,51861,51863,51865,51867,51869,51871,51873,51875],{"class":35,"line":101},[33,51854,37709],{"class":167},[33,51856,242],{"class":163},[33,51858,40545],{"class":167},[33,51860,1053],{"class":50},[33,51862,13643],{"class":167},[33,51864,10971],{"class":238},[33,51866,242],{"class":163},[33,51868,14880],{"class":167},[33,51870,10748],{"class":238},[33,51872,242],{"class":163},[33,51874,10985],{"class":54},[33,51876,221],{"class":167},[33,51878,51879,51881,51883,51886,51888,51890,51892],{"class":35,"line":171},[33,51880,37500],{"class":167},[33,51882,242],{"class":163},[33,51884,51885],{"class":167}," [t.df ",[33,51887,6124],{"class":163},[33,51889,10818],{"class":167},[33,51891,662],{"class":163},[33,51893,51894],{"class":167}," tables]\n",[33,51896,51897,51899,51901],{"class":35,"line":179},[33,51898,617],{"class":163},[33,51900,620],{"class":163},[33,51902,51903],{"class":167}," dfs:\n",[33,51905,51906,51908],{"class":35,"line":187},[33,51907,1659],{"class":163},[33,51909,7721],{"class":167},[33,51911,51912,51914,51916,51919,51921,51923,51925],{"class":35,"line":201},[33,51913,13245],{"class":167},[33,51915,242],{"class":163},[33,51917,51918],{"class":167}," 
dfs[",[33,51920,748],{"class":50},[33,51922,11044],{"class":167},[33,51924,748],{"class":50},[33,51926,40639],{"class":167},[33,51928,51929,51931,51933],{"class":35,"line":206},[33,51930,584],{"class":167},[33,51932,242],{"class":163},[33,51934,589],{"class":167},[33,51936,51937,51939,51941,51943],{"class":35,"line":224},[33,51938,656],{"class":163},[33,51940,7810],{"class":167},[33,51942,662],{"class":163},[33,51944,51903],{"class":167},[33,51946,51947,51949,51951,51953,51955,51957],{"class":35,"line":229},[33,51948,8221],{"class":163},[33,51950,10847],{"class":167},[33,51952,748],{"class":50},[33,51954,51385],{"class":167},[33,51956,1865],{"class":163},[33,51958,48187],{"class":167},[33,51960,51961,51963,51965,51967,51969],{"class":35,"line":235},[33,51962,51528],{"class":167},[33,51964,242],{"class":163},[33,51966,10847],{"class":167},[33,51968,734],{"class":50},[33,51970,39364],{"class":167},[33,51972,51973,51975,51977],{"class":35,"line":250},[33,51974,10842],{"class":167},[33,51976,242],{"class":163},[33,51978,51979],{"class":167}," header\n",[33,51981,51982],{"class":35,"line":266},[33,51983,10929],{"class":167},[33,51985,51986,51988,51990,51992,51994,51996],{"class":35,"line":290},[33,51987,1332],{"class":163},[33,51989,847],{"class":167},[33,51991,850],{"class":238},[33,51993,242],{"class":163},[33,51995,855],{"class":50},[33,51997,221],{"class":167},[424,51999,52001],{"id":52000},"rotated-tables-landscape-pages","Rotated Tables (Landscape Pages)",[14,52003,52004,52005,52008,52009,52012],{},"pdfplumber respects page rotation automatically. For camelot, crop the page manually using ",[30,52006,52007],{},"camelot.read_pdf(..., layout_kwargs={\"char_margin\": 2.0})",". 
If rotation is inconsistent, pre-rotate with ",[30,52010,52011],{},"PyMuPDF"," before extraction.",[424,52014,52016],{"id":52015},"tables-with-background-colour-fills","Tables with Background Colour Fills",[14,52018,11057,52019,52022,52023,52026],{},[30,52020,52021],{},"process_background=True"," flag handles most cases. If lines are still missed, raise camelot's line_scale so that shorter line segments are detected. For merged cells, add ",[30,52024,52025],{},"copy_text=[\"h\", \"v\"]"," to camelot's settings so that text in a spanning cell is copied into every cell it covers.",[18,52028,52030],{"id":52029},"validation","Validation",[14,52032,52033],{},"Always verify shape and dtypes before writing downstream outputs:",[23,52035,52037],{"className":126,"code":52036,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\ndef validate_table(df: pd.DataFrame, expected_cols: int, min_rows: int = 1) -> None:\n    \"\"\"Assert basic structural integrity of an extracted table.\"\"\"\n    assert df.shape[1] == expected_cols, (\n        f\"Column count mismatch: expected {expected_cols}, got {df.shape[1]}. 
\"\n        \"Check for header dedup or coordinate drift.\"\n    )\n    assert df.shape[0] >= min_rows, (\n        f\"Too few rows: expected >= {min_rows}, got {df.shape[0]}.\"\n    )\n    null_ratio = df.isnull().mean().mean()\n    if null_ratio > 0.3:\n        print(f\"Warning: {null_ratio:.0%} of cells are null — check forward-fill or merged cells.\")\n\n    numeric_cols = df.select_dtypes(include=\"number\").columns.tolist()\n    print(f\"Shape: {df.shape}, Numeric cols: {numeric_cols}, Null ratio: {null_ratio:.1%}\")\n\nif __name__ == \"__main__\":\n    df = pd.read_csv(\"output\u002Fmerged_tables.csv\")\n    validate_table(df, expected_cols=3, min_rows=5)\n",[30,52038,52039,52043,52053,52057,52083,52088,52102,52130,52135,52139,52154,52182,52186,52194,52207,52230,52234,52251,52292,52296,52308,52320],{"__ignoreMap":28},[33,52040,52041],{"class":35,"line":36},[33,52042,8895],{"class":39},[33,52044,52045,52047,52049,52051],{"class":35,"line":43},[33,52046,164],{"class":163},[33,52048,492],{"class":167},[33,52050,495],{"class":163},[33,52052,498],{"class":167},[33,52054,52055],{"class":35,"line":61},[33,52056,92],{"emptyLinePlaceholder":91},[33,52058,52059,52061,52064,52066,52068,52071,52073,52075,52077,52079,52081],{"class":35,"line":73},[33,52060,562],{"class":163},[33,52062,52063],{"class":46}," validate_table",[33,52065,39735],{"class":167},[33,52067,1059],{"class":50},[33,52069,52070],{"class":167},", min_rows: ",[33,52072,1059],{"class":50},[33,52074,212],{"class":163},[33,52076,1814],{"class":50},[33,52078,1617],{"class":167},[33,52080,571],{"class":50},[33,52082,574],{"class":167},[33,52084,52085],{"class":35,"line":88},[33,52086,52087],{"class":54},"    \"\"\"Assert basic structural integrity of an extracted 
table.\"\"\"\n",[33,52089,52090,52092,52094,52096,52098,52100],{"class":35,"line":95},[33,52091,9228],{"class":163},[33,52093,9516],{"class":167},[33,52095,734],{"class":50},[33,52097,763],{"class":167},[33,52099,1865],{"class":163},[33,52101,39788],{"class":167},[33,52103,52104,52106,52109,52111,52113,52115,52117,52119,52121,52123,52125,52127],{"class":35,"line":101},[33,52105,9533],{"class":163},[33,52107,52108],{"class":54},"\"Column count mismatch: expected ",[33,52110,1115],{"class":50},[33,52112,12914],{"class":167},[33,52114,1121],{"class":50},[33,52116,21519],{"class":54},[33,52118,1115],{"class":50},[33,52120,9541],{"class":167},[33,52122,734],{"class":50},[33,52124,9546],{"class":167},[33,52126,1121],{"class":50},[33,52128,52129],{"class":54},". \"\n",[33,52131,52132],{"class":35,"line":171},[33,52133,52134],{"class":54},"        \"Check for header dedup or coordinate drift.\"\n",[33,52136,52137],{"class":35,"line":179},[33,52138,1202],{"class":167},[33,52140,52141,52143,52145,52147,52149,52151],{"class":35,"line":187},[33,52142,9228],{"class":163},[33,52144,9516],{"class":167},[33,52146,748],{"class":50},[33,52148,763],{"class":167},[33,52150,43000],{"class":163},[33,52152,52153],{"class":167}," min_rows, (\n",[33,52155,52156,52158,52161,52163,52165,52167,52169,52171,52173,52175,52177,52179],{"class":35,"line":201},[33,52157,9533],{"class":163},[33,52159,52160],{"class":54},"\"Too few rows: expected >= 
",[33,52162,1115],{"class":50},[33,52164,4078],{"class":167},[33,52166,1121],{"class":50},[33,52168,21519],{"class":54},[33,52170,1115],{"class":50},[33,52172,9541],{"class":167},[33,52174,748],{"class":50},[33,52176,9546],{"class":167},[33,52178,1121],{"class":50},[33,52180,52181],{"class":54},".\"\n",[33,52183,52184],{"class":35,"line":206},[33,52185,1202],{"class":167},[33,52187,52188,52190,52192],{"class":35,"line":224},[33,52189,12716],{"class":167},[33,52191,242],{"class":163},[33,52193,48872],{"class":167},[33,52195,52196,52198,52200,52202,52205],{"class":35,"line":229},[33,52197,617],{"class":163},[33,52199,12728],{"class":167},[33,52201,6009],{"class":163},[33,52203,52204],{"class":50}," 0.3",[33,52206,574],{"class":167},[33,52208,52209,52211,52213,52215,52217,52219,52221,52223,52225,52228],{"class":35,"line":235},[33,52210,9414],{"class":50},[33,52212,602],{"class":167},[33,52214,4059],{"class":163},[33,52216,12747],{"class":54},[33,52218,1115],{"class":50},[33,52220,12752],{"class":167},[33,52222,12775],{"class":163},[33,52224,1121],{"class":50},[33,52226,52227],{"class":54}," of cells are null — check forward-fill or merged cells.\"",[33,52229,221],{"class":167},[33,52231,52232],{"class":35,"line":250},[33,52233,92],{"emptyLinePlaceholder":91},[33,52235,52236,52238,52240,52242,52244,52246,52248],{"class":35,"line":266},[33,52237,16725],{"class":167},[33,52239,242],{"class":163},[33,52241,23604],{"class":167},[33,52243,23607],{"class":238},[33,52245,242],{"class":163},[33,52247,23612],{"class":54},[33,52249,52250],{"class":167},").columns.tolist()\n",[33,52252,52253,52255,52257,52259,52261,52263,52265,52267,52270,52272,52275,52277,52280,52282,52284,52286,52288,52290],{"class":35,"line":290},[33,52254,7268],{"class":50},[33,52256,602],{"class":167},[33,52258,4059],{"class":163},[33,52260,16863],{"class":54},[33,52262,1115],{"class":50},[33,52264,9426],{"class":167},[33,52266,1121],{"class":50},[33,52268,52269],{"class":54},", Numeric cols: 
",[33,52271,1115],{"class":50},[33,52273,52274],{"class":167},"numeric_cols",[33,52276,1121],{"class":50},[33,52278,52279],{"class":54},", Null ratio: ",[33,52281,1115],{"class":50},[33,52283,12752],{"class":167},[33,52285,12755],{"class":163},[33,52287,1121],{"class":50},[33,52289,274],{"class":54},[33,52291,221],{"class":167},[33,52293,52294],{"class":35,"line":295},[33,52295,92],{"emptyLinePlaceholder":91},[33,52297,52298,52300,52302,52304,52306],{"class":35,"line":300},[33,52299,2491],{"class":163},[33,52301,2494],{"class":50},[33,52303,2497],{"class":163},[33,52305,2500],{"class":54},[33,52307,574],{"class":167},[33,52309,52310,52312,52314,52316,52318],{"class":35,"line":317},[33,52311,4025],{"class":167},[33,52313,242],{"class":163},[33,52315,9481],{"class":167},[33,52317,51772],{"class":54},[33,52319,221],{"class":167},[33,52321,52322,52325,52327,52329,52331,52333,52335,52337,52339],{"class":35,"line":332},[33,52323,52324],{"class":167},"    validate_table(df, ",[33,52326,12914],{"class":238},[33,52328,242],{"class":163},[33,52330,10258],{"class":50},[33,52332,365],{"class":167},[33,52334,4078],{"class":238},[33,52336,242],{"class":163},[33,52338,1153],{"class":50},[33,52340,221],{"class":167},[14,52342,52343],{},"Run this after every extraction and compare against a manually counted row total from the source PDF.",[18,52345,13845],{"id":13844},[14,52347,52348,52351,52352,52355],{},[1974,52349,52350],{},"Memory limits:"," pdfplumber loads the entire PDF into memory. 
For files above ~100 MB, open only specific page ranges: ",[30,52353,52354],{},"pdf.pages[start:end]"," to avoid OOM errors.",[14,52357,52358,52361,52362,52364],{},[1974,52359,52360],{},"Batch processing:"," use ",[30,52363,4240],{}," — PDF parsing is CPU-bound, so threading gains nothing.",[23,52366,52368],{"className":126,"code":52367,"language":47,"meta":28,"style":28},"from concurrent.futures import ProcessPoolExecutor, as_completed\nfrom pathlib import Path\nimport pandas as pd\n\ndef process_one(path: Path) -> pd.DataFrame:\n    # Import inside function so each worker gets its own import state\n    import pdfplumber\n    dfs = []\n    with pdfplumber.open(path) as pdf:\n        for page in pdf.pages:\n            raw = page.extract_tables()\n            for t in (raw or []):\n                if t and len(t) > 1:\n                    dfs.append(pd.DataFrame(t[1:], columns=t[0]))\n    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()\n\ndef batch_extract(pdf_dir: Path) -> dict[Path, pd.DataFrame]:\n    paths = list(pdf_dir.glob(\"*.pdf\"))\n    results = {}\n    with ProcessPoolExecutor() as pool:\n        futures = {pool.submit(process_one, p): p for p in paths}\n        for fut in as_completed(futures):\n            p = futures[fut]\n            try:\n                results[p] = fut.result()\n            except Exception as e:\n                print(f\"Failed {p}: {e}\")\n    return 
results\n",[30,52369,52370,52380,52390,52400,52404,52413,52418,52424,52432,52442,52452,52460,52474,52493,52513,52537,52541,52550,52564,52572,52582,52599,52609,52617,52623,52632,52642,52670],{"__ignoreMap":28},[33,52371,52372,52374,52376,52378],{"class":35,"line":36},[33,52373,190],{"class":163},[33,52375,13880],{"class":167},[33,52377,164],{"class":163},[33,52379,13885],{"class":167},[33,52381,52382,52384,52386,52388],{"class":35,"line":43},[33,52383,190],{"class":163},[33,52385,193],{"class":167},[33,52387,164],{"class":163},[33,52389,198],{"class":167},[33,52391,52392,52394,52396,52398],{"class":35,"line":61},[33,52393,164],{"class":163},[33,52395,492],{"class":167},[33,52397,495],{"class":163},[33,52399,498],{"class":167},[33,52401,52402],{"class":35,"line":73},[33,52403,92],{"emptyLinePlaceholder":91},[33,52405,52406,52408,52411],{"class":35,"line":88},[33,52407,562],{"class":163},[33,52409,52410],{"class":46}," process_one",[33,52412,7103],{"class":167},[33,52414,52415],{"class":35,"line":95},[33,52416,52417],{"class":39},"    # Import inside function so each worker gets its own import 
state\n",[33,52419,52420,52422],{"class":35,"line":101},[33,52421,1627],{"class":163},[33,52423,485],{"class":167},[33,52425,52426,52428,52430],{"class":35,"line":171},[33,52427,37500],{"class":167},[33,52429,242],{"class":163},[33,52431,589],{"class":167},[33,52433,52434,52436,52438,52440],{"class":35,"line":179},[33,52435,1635],{"class":163},[33,52437,7123],{"class":167},[33,52439,495],{"class":163},[33,52441,686],{"class":167},[33,52443,52444,52446,52448,52450],{"class":35,"line":187},[33,52445,5973],{"class":163},[33,52447,695],{"class":167},[33,52449,662],{"class":163},[33,52451,700],{"class":167},[33,52453,52454,52456,52458],{"class":35,"line":201},[33,52455,7142],{"class":167},[33,52457,242],{"class":163},[33,52459,7147],{"class":167},[33,52461,52462,52464,52466,52468,52470,52472],{"class":35,"line":206},[33,52463,1793],{"class":163},[33,52465,10818],{"class":167},[33,52467,662],{"class":163},[33,52469,7159],{"class":167},[33,52471,7162],{"class":163},[33,52473,7165],{"class":167},[33,52475,52476,52478,52480,52482,52484,52487,52489,52491],{"class":35,"line":224},[33,52477,7170],{"class":163},[33,52479,10818],{"class":167},[33,52481,6001],{"class":163},[33,52483,4037],{"class":50},[33,52485,52486],{"class":167},"(t) ",[33,52488,6009],{"class":163},[33,52490,1814],{"class":50},[33,52492,574],{"class":167},[33,52494,52495,52498,52500,52502,52504,52506,52509,52511],{"class":35,"line":229},[33,52496,52497],{"class":167},"                    dfs.append(pd.DataFrame(t[",[33,52499,734],{"class":50},[33,52501,737],{"class":167},[33,52503,740],{"class":238},[33,52505,242],{"class":163},[33,52507,52508],{"class":167},"t[",[33,52510,748],{"class":50},[33,52512,7211],{"class":167},[33,52514,52515,52517,52520,52522,52524,52526,52528,52530,52533,52535],{"class":35,"line":235},[33,52516,1332],{"class":163},[33,52518,52519],{"class":167}," pd.concat(dfs, 
",[33,52521,850],{"class":238},[33,52523,242],{"class":163},[33,52525,855],{"class":50},[33,52527,1649],{"class":167},[33,52529,2491],{"class":163},[33,52531,52532],{"class":167}," dfs ",[33,52534,7489],{"class":163},[33,52536,7721],{"class":167},[33,52538,52539],{"class":35,"line":250},[33,52540,92],{"emptyLinePlaceholder":91},[33,52542,52543,52545,52547],{"class":35,"line":266},[33,52544,562],{"class":163},[33,52546,14054],{"class":46},[33,52548,52549],{"class":167},"(pdf_dir: Path) -> dict[Path, pd.DataFrame]:\n",[33,52551,52552,52554,52556,52558,52560,52562],{"class":35,"line":290},[33,52553,14067],{"class":167},[33,52555,242],{"class":163},[33,52557,599],{"class":50},[33,52559,14074],{"class":167},[33,52561,610],{"class":54},[33,52563,371],{"class":167},[33,52565,52566,52568,52570],{"class":35,"line":295},[33,52567,37112],{"class":167},[33,52569,242],{"class":163},[33,52571,14093],{"class":167},[33,52573,52574,52576,52578,52580],{"class":35,"line":300},[33,52575,1635],{"class":163},[33,52577,14100],{"class":167},[33,52579,495],{"class":163},[33,52581,14105],{"class":167},[33,52583,52584,52586,52588,52591,52593,52595,52597],{"class":35,"line":317},[33,52585,14110],{"class":167},[33,52587,242],{"class":163},[33,52589,52590],{"class":167}," {pool.submit(process_one, p): p ",[33,52592,6124],{"class":163},[33,52594,6127],{"class":167},[33,52596,662],{"class":163},[33,52598,14124],{"class":167},[33,52600,52601,52603,52605,52607],{"class":35,"line":332},[33,52602,5973],{"class":163},[33,52604,14131],{"class":167},[33,52606,662],{"class":163},[33,52608,14136],{"class":167},[33,52610,52611,52613,52615],{"class":35,"line":347},[33,52612,14141],{"class":167},[33,52614,242],{"class":163},[33,52616,14146],{"class":167},[33,52618,52619,52621],{"class":35,"line":374},[33,52620,14151],{"class":163},[33,52622,574],{"class":167},[33,52624,52625,52628,52630],{"class":35,"line":397},[33,52626,52627],{"class":167},"                results[p] 
",[33,52629,242],{"class":163},[33,52631,14163],{"class":167},[33,52633,52634,52636,52638,52640],{"class":35,"line":653},[33,52635,14168],{"class":163},[33,52637,783],{"class":50},[33,52639,1852],{"class":163},[33,52641,7583],{"class":167},[33,52643,52644,52646,52648,52650,52652,52654,52656,52658,52660,52662,52664,52666,52668],{"class":35,"line":667},[33,52645,8264],{"class":50},[33,52647,602],{"class":167},[33,52649,4059],{"class":163},[33,52651,14185],{"class":54},[33,52653,1115],{"class":50},[33,52655,14],{"class":167},[33,52657,1121],{"class":50},[33,52659,2079],{"class":54},[33,52661,1115],{"class":50},[33,52663,7602],{"class":167},[33,52665,1121],{"class":50},[33,52667,274],{"class":54},[33,52669,221],{"class":167},[33,52671,52672,52674],{"class":35,"line":675},[33,52673,1332],{"class":163},[33,52675,14211],{"class":167},[14,52677,52678,52679,52683],{},"For very large PDFs (500+ pages), consider splitting them first — see ",[940,52680,52682],{"href":52681},"\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002F","Merging and Splitting PDF Documents"," for pypdf-based splitting.",[14,52685,52686,52689,52690,52693,52694,52696],{},[1974,52687,52688],{},"Out-of-core alternative:"," pipe camelot output directly to ",[30,52691,52692],{},"pandas.DataFrame.to_parquet()"," in append mode using ",[30,52695,14295],{}," for datasets that exceed RAM.",[18,52698,4271],{"id":4270},[4273,52700,52701,52711],{},[4276,52702,52703],{},[4279,52704,52705,52707,52709],{},[4282,52706,14317],{},[4282,52708,4287],{},[4282,52710,4290],{},[4292,52712,52713,52728,52744,52762,52782],{},[4279,52714,52715,52719,52722],{},[4297,52716,52717],{},[30,52718,9731],{},[4297,52720,52721],{},"camelot's lattice mode requires the Ghostscript system binary",[4297,52723,52724,40229,52726],{},[30,52725,14432],{},[940,52727,9739],{"href":9738},[4279,52729,52730,52736,52739],{},[4297,52731,52732,52735],{},[30,52733,52734],{},"camelot.TableList"," returns 0 
tables",[4297,52737,52738],{},"No vector lines detected on target pages",[4297,52740,14337,52741,52743],{},[30,52742,11068],{}," or use pdfplumber",[4279,52745,52746,52749,52752],{},[4297,52747,52748],{},"Columns merge or shift mid-table",[4297,52750,52751],{},"Coordinate drift across pages; y-tolerance mismatch",[4297,52753,52754,52755,52758,52759],{},"Use camelot's ",[30,52756,52757],{},"copy_text"," parameter or pdfplumber with ",[30,52760,52761],{},"snap_tolerance=3",[4279,52763,52764,52770,52775],{},[4297,52765,52766,52769],{},[30,52767,52768],{},"ValueError: could not convert string"," on numeric cols",[4297,52771,52772,52773,12027],{},"Merged cells contain concatenated values (e.g., ",[30,52774,43161],{},[4297,52776,11870,52777,52779,52780],{},[30,52778,10530],{},"-based reconstruction; see ",[940,52781,10535],{"href":10534},[4279,52783,52784,52787,52790],{},[4297,52785,52786],{},"Empty DataFrame from scanned PDF",[4297,52788,52789],{},"No text layer present",[4297,52791,52792,52793],{},"Route to OCR pipeline via ",[940,52794,10077],{"href":10076},[18,52796,14437],{"id":14436},[23,52798,52800],{"className":126,"code":52799,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n\"\"\"\nextract_tables.py — Extract tables from a PDF using the best available method.\n\nUsage:\n    python extract_tables.py input.pdf --pages 1-5 --flavor auto --output output\u002F\n    python extract_tables.py input.pdf --flavor stream --output output\u002F\n\npip install pdfplumber \"camelot-py[cv]\" pandas pdf2image pytesseract\n\"\"\"\nimport argparse\nimport sys\nfrom pathlib import Path\nimport pdfplumber\nimport camelot\nimport pandas as pd\n\n\ndef classify(path: Path) -> str:\n    \"\"\"Return 'lattice', 'stream', or 'ocr' based on PDF content.\"\"\"\n    with pdfplumber.open(path) as pdf:\n        page = pdf.pages[0]\n        has_text = bool((page.extract_text() or \"\").strip())\n        has_lines = bool(page.lines or page.rects)\n    if not has_text 
and not has_lines:\n        return \"ocr\"\n    return \"lattice\" if has_lines else \"stream\"\n\n\ndef extract(path: Path, pages: str, flavor: str) -> list[pd.DataFrame]:\n    if flavor == \"ocr\":\n        raise SystemExit(\n            \"OCR pipeline required — run: python -m ocr_extract \"\n            \"(see how-to-extract-tables-from-scanned-pdfs\u002F)\"\n        )\n    if flavor in (\"lattice\", \"stream\"):\n        tables = camelot.read_pdf(str(path), pages=pages, flavor=flavor,\n                                  process_background=(flavor == \"lattice\"))\n        return [t.df for t in tables]\n    # pdfplumber fallback\n    dfs = []\n    with pdfplumber.open(path) as pdf:\n        for page in pdf.pages:\n            for raw in (page.extract_tables() or []):\n                if raw and len(raw) > 1:\n                    dfs.append(pd.DataFrame(raw[1:], columns=raw[0]))\n    return dfs\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Extract tables from a PDF\")\n    parser.add_argument(\"pdf\", type=Path, help=\"Path to input PDF\")\n    parser.add_argument(\"--pages\", default=\"1\", help=\"Pages to parse, e.g. 1-5 or 1,3,5\")\n    parser.add_argument(\n        \"--flavor\", choices=[\"auto\", \"lattice\", \"stream\", \"pdfplumber\"],\n        default=\"auto\", help=\"Extraction method\"\n    )\n    parser.add_argument(\"--output\", type=Path, default=Path(\"output\"),\n                        help=\"Output directory for CSV files\")\n    args = parser.parse_args()\n\n    if not args.pdf.exists():\n        sys.exit(f\"File not found: {args.pdf}\")\n\n    args.output.mkdir(parents=True, exist_ok=True)\n\n    flavor = classify(args.pdf) if args.flavor == \"auto\" else args.flavor\n    print(f\"Using flavor: {flavor}\")\n\n    try:\n        tables = extract(args.pdf, args.pages, flavor)\n    except Exception as e:\n        sys.exit(f\"Extraction failed: {e}\")\n\n    if not tables:\n        sys.exit(\"No tables found. 
Try --flavor stream or --flavor pdfplumber.\")\n\n    for i, df in enumerate(tables, start=1):\n        df.replace(\"\", pd.NA, inplace=True)\n        df.dropna(how=\"all\", inplace=True)\n        out = args.output \u002F f\"table_{i:03d}.csv\"\n        df.to_csv(out, index=False)\n        print(f\"  [{i}] {df.shape[0]} rows × {df.shape[1]} cols → {out}\")\n\n    print(f\"Done. {len(tables)} table(s) exported to {args.output}\u002F\")\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,52801,52802,52806,52810,52815,52819,52823,52828,52833,52837,52842,52846,52852,52858,52868,52874,52880,52890,52894,52898,52910,52915,52925,52937,52953,52967,52981,52987,53001,53005,53009,53026,53038,53046,53051,53056,53060,53078,53102,53117,53131,53136,53144,53154,53164,53178,53196,53215,53221,53225,53229,53241,53258,53280,53305,53309,53338,53356,53360,53384,53396,53404,53408,53416,53434,53438,53459,53463,53483,53504,53508,53514,53523,53533,53551,53555,53563,53572,53576,53597,53617,53637,53663,53675,53728,53732,53763,53767,53771,53783],{"__ignoreMap":28},[33,52803,52804],{"class":35,"line":36},[33,52805,14447],{"class":39},[33,52807,52808],{"class":35,"line":43},[33,52809,139],{"class":54},[33,52811,52812],{"class":35,"line":61},[33,52813,52814],{"class":54},"extract_tables.py — Extract tables from a PDF using the best available method.\n",[33,52816,52817],{"class":35,"line":73},[33,52818,92],{"emptyLinePlaceholder":91},[33,52820,52821],{"class":35,"line":88},[33,52822,4435],{"class":54},[33,52824,52825],{"class":35,"line":95},[33,52826,52827],{"class":54},"    python extract_tables.py input.pdf --pages 1-5 --flavor auto --output output\u002F\n",[33,52829,52830],{"class":35,"line":101},[33,52831,52832],{"class":54},"    python extract_tables.py input.pdf --flavor stream --output output\u002F\n",[33,52834,52835],{"class":35,"line":171},[33,52836,92],{"emptyLinePlaceholder":91},[33,52838,52839],{"class":35,"line":179},[33,52840,52841],{"class":54},"pip install pdfplumber 
\"camelot-py[cv]\" pandas pdf2image pytesseract\n",[33,52843,52844],{"class":35,"line":187},[33,52845,139],{"class":54},[33,52847,52848,52850],{"class":35,"line":201},[33,52849,164],{"class":163},[33,52851,4461],{"class":167},[33,52853,52854,52856],{"class":35,"line":206},[33,52855,164],{"class":163},[33,52857,168],{"class":167},[33,52859,52860,52862,52864,52866],{"class":35,"line":224},[33,52861,190],{"class":163},[33,52863,193],{"class":167},[33,52865,164],{"class":163},[33,52867,198],{"class":167},[33,52869,52870,52872],{"class":35,"line":229},[33,52871,164],{"class":163},[33,52873,485],{"class":167},[33,52875,52876,52878],{"class":35,"line":235},[33,52877,164],{"class":163},[33,52879,10567],{"class":167},[33,52881,52882,52884,52886,52888],{"class":35,"line":250},[33,52883,164],{"class":163},[33,52885,492],{"class":167},[33,52887,495],{"class":163},[33,52889,498],{"class":167},[33,52891,52892],{"class":35,"line":266},[33,52893,92],{"emptyLinePlaceholder":91},[33,52895,52896],{"class":35,"line":290},[33,52897,92],{"emptyLinePlaceholder":91},[33,52899,52900,52902,52904,52906,52908],{"class":35,"line":295},[33,52901,562],{"class":163},[33,52903,14550],{"class":46},[33,52905,3743],{"class":167},[33,52907,1053],{"class":50},[33,52909,574],{"class":167},[33,52911,52912],{"class":35,"line":300},[33,52913,52914],{"class":54},"    \"\"\"Return 'lattice', 'stream', or 'ocr' based on PDF 
content.\"\"\"\n",[33,52916,52917,52919,52921,52923],{"class":35,"line":317},[33,52918,1635],{"class":163},[33,52920,7123],{"class":167},[33,52922,495],{"class":163},[33,52924,686],{"class":167},[33,52926,52927,52929,52931,52933,52935],{"class":35,"line":332},[33,52928,14571],{"class":167},[33,52930,242],{"class":163},[33,52932,9870],{"class":167},[33,52934,748],{"class":50},[33,52936,9202],{"class":167},[33,52938,52939,52941,52943,52945,52947,52949,52951],{"class":35,"line":347},[33,52940,14584],{"class":167},[33,52942,242],{"class":163},[33,52944,9884],{"class":50},[33,52946,9887],{"class":167},[33,52948,7162],{"class":163},[33,52950,9892],{"class":54},[33,52952,9895],{"class":167},[33,52954,52955,52957,52959,52961,52963,52965],{"class":35,"line":374},[33,52956,14601],{"class":167},[33,52958,242],{"class":163},[33,52960,9884],{"class":50},[33,52962,9907],{"class":167},[33,52964,7162],{"class":163},[33,52966,9912],{"class":167},[33,52968,52969,52971,52973,52975,52977,52979],{"class":35,"line":397},[33,52970,617],{"class":163},[33,52972,620],{"class":163},[33,52974,9970],{"class":167},[33,52976,6001],{"class":163},[33,52978,620],{"class":163},[33,52980,9977],{"class":167},[33,52982,52983,52985],{"class":35,"line":653},[33,52984,1659],{"class":163},[33,52986,9984],{"class":54},[33,52988,52989,52991,52993,52995,52997,52999],{"class":35,"line":667},[33,52990,1332],{"class":163},[33,52992,9991],{"class":54},[33,52994,9994],{"class":163},[33,52996,9997],{"class":167},[33,52998,7489],{"class":163},[33,53000,10002],{"class":54},[33,53002,53003],{"class":35,"line":675},[33,53004,92],{"emptyLinePlaceholder":91},[33,53006,53007],{"class":35,"line":689},[33,53008,92],{"emptyLinePlaceholder":91},[33,53010,53011,53013,53016,53018,53020,53022,53024],{"class":35,"line":703},[33,53012,562],{"class":163},[33,53014,53015],{"class":46}," 
extract",[33,53017,14838],{"class":167},[33,53019,1053],{"class":50},[33,53021,14843],{"class":167},[33,53023,1053],{"class":50},[33,53025,10647],{"class":167},[33,53027,53028,53030,53032,53034,53036],{"class":35,"line":714},[33,53029,617],{"class":163},[33,53031,15750],{"class":167},[33,53033,1865],{"class":163},[33,53035,15755],{"class":54},[33,53037,574],{"class":167},[33,53039,53040,53042,53044],{"class":35,"line":723},[33,53041,4051],{"class":163},[33,53043,16617],{"class":50},[33,53045,7637],{"class":167},[33,53047,53048],{"class":35,"line":754},[33,53049,53050],{"class":54},"            \"OCR pipeline required — run: python -m ocr_extract \"\n",[33,53052,53053],{"class":35,"line":771},[33,53054,53055],{"class":54},"            \"(see how-to-extract-tables-from-scanned-pdfs\u002F)\"\n",[33,53057,53058],{"class":35,"line":777},[33,53059,5867],{"class":167},[33,53061,53062,53064,53066,53068,53070,53072,53074,53076],{"class":35,"line":788},[33,53063,617],{"class":163},[33,53065,15750],{"class":167},[33,53067,662],{"class":163},[33,53069,17583],{"class":167},[33,53071,10985],{"class":54},[33,53073,365],{"class":167},[33,53075,13407],{"class":54},[33,53077,1737],{"class":167},[33,53079,53080,53082,53084,53086,53088,53090,53092,53094,53096,53098,53100],{"class":35,"line":804},[33,53081,37895],{"class":167},[33,53083,242],{"class":163},[33,53085,40545],{"class":167},[33,53087,1053],{"class":50},[33,53089,13643],{"class":167},[33,53091,10971],{"class":238},[33,53093,242],{"class":163},[33,53095,14880],{"class":167},[33,53097,10748],{"class":238},[33,53099,242],{"class":163},[33,53101,10696],{"class":167},[33,53103,53104,53107,53109,53111,53113,53115],{"class":35,"line":809},[33,53105,53106],{"class":238},"                                  
process_background",[33,53108,242],{"class":163},[33,53110,10706],{"class":167},[33,53112,1865],{"class":163},[33,53114,9991],{"class":54},[33,53116,371],{"class":167},[33,53118,53119,53121,53123,53125,53127,53129],{"class":35,"line":819},[33,53120,1659],{"class":163},[33,53122,51885],{"class":167},[33,53124,6124],{"class":163},[33,53126,10818],{"class":167},[33,53128,662],{"class":163},[33,53130,51894],{"class":167},[33,53132,53133],{"class":35,"line":829},[33,53134,53135],{"class":39},"    # pdfplumber fallback\n",[33,53137,53138,53140,53142],{"class":35,"line":834},[33,53139,37500],{"class":167},[33,53141,242],{"class":163},[33,53143,589],{"class":167},[33,53145,53146,53148,53150,53152],{"class":35,"line":839},[33,53147,1635],{"class":163},[33,53149,7123],{"class":167},[33,53151,495],{"class":163},[33,53153,686],{"class":167},[33,53155,53156,53158,53160,53162],{"class":35,"line":860},[33,53157,5973],{"class":163},[33,53159,695],{"class":167},[33,53161,662],{"class":163},[33,53163,700],{"class":167},[33,53165,53166,53168,53170,53172,53174,53176],{"class":35,"line":887},[33,53167,1793],{"class":163},[33,53169,7422],{"class":167},[33,53171,662],{"class":163},[33,53173,7427],{"class":167},[33,53175,7162],{"class":163},[33,53177,7165],{"class":167},[33,53179,53180,53182,53184,53186,53188,53190,53192,53194],{"class":35,"line":907},[33,53181,7170],{"class":163},[33,53183,7422],{"class":167},[33,53185,6001],{"class":163},[33,53187,4037],{"class":50},[33,53189,7446],{"class":167},[33,53191,6009],{"class":163},[33,53193,1814],{"class":50},[33,53195,574],{"class":167},[33,53197,53198,53201,53203,53205,53207,53209,53211,53213],{"class":35,"line":1826},[33,53199,53200],{"class":167},"                    
dfs.append(pd.DataFrame(raw[",[33,53202,734],{"class":50},[33,53204,737],{"class":167},[33,53206,740],{"class":238},[33,53208,242],{"class":163},[33,53210,13789],{"class":167},[33,53212,748],{"class":50},[33,53214,7211],{"class":167},[33,53216,53217,53219],{"class":35,"line":1844},[33,53218,1332],{"class":163},[33,53220,37688],{"class":167},[33,53222,53223],{"class":35,"line":1858},[33,53224,92],{"emptyLinePlaceholder":91},[33,53226,53227],{"class":35,"line":1871},[33,53228,92],{"emptyLinePlaceholder":91},[33,53230,53231,53233,53235,53237,53239],{"class":35,"line":1877},[33,53232,562],{"class":163},[33,53234,6636],{"class":46},[33,53236,568],{"class":167},[33,53238,571],{"class":50},[33,53240,574],{"class":167},[33,53242,53243,53245,53247,53249,53251,53253,53256],{"class":35,"line":1883},[33,53244,6648],{"class":167},[33,53246,242],{"class":163},[33,53248,6653],{"class":167},[33,53250,6656],{"class":238},[33,53252,242],{"class":163},[33,53254,53255],{"class":54},"\"Extract tables from a PDF\"",[33,53257,221],{"class":167},[33,53259,53260,53262,53264,53266,53268,53270,53272,53274,53276,53278],{"class":35,"line":1915},[33,53261,6669],{"class":167},[33,53263,15519],{"class":54},[33,53265,365],{"class":167},[33,53267,6677],{"class":238},[33,53269,242],{"class":163},[33,53271,6682],{"class":167},[33,53273,25463],{"class":238},[33,53275,242],{"class":163},[33,53277,41143],{"class":54},[33,53279,221],{"class":167},[33,53281,53282,53284,53286,53288,53290,53292,53294,53296,53298,53300,53303],{"class":35,"line":1926},[33,53283,6669],{"class":167},[33,53285,15535],{"class":54},[33,53287,365],{"class":167},[33,53289,6685],{"class":238},[33,53291,242],{"class":163},[33,53293,35984],{"class":54},[33,53295,365],{"class":167},[33,53297,25463],{"class":238},[33,53299,242],{"class":163},[33,53301,53302],{"class":54},"\"Pages to parse, e.g. 
1-5 or 1,3,5\"",[33,53304,221],{"class":167},[33,53306,53307],{"class":35,"line":1932},[33,53308,6721],{"class":167},[33,53310,53311,53314,53316,53318,53320,53322,53324,53326,53328,53330,53332,53334,53336],{"class":35,"line":1938},[33,53312,53313],{"class":54},"        \"--flavor\"",[33,53315,365],{"class":167},[33,53317,15558],{"class":238},[33,53319,242],{"class":163},[33,53321,8309],{"class":167},[33,53323,15565],{"class":54},[33,53325,365],{"class":167},[33,53327,10985],{"class":54},[33,53329,365],{"class":167},[33,53331,13407],{"class":54},[33,53333,365],{"class":167},[33,53335,15578],{"class":54},[33,53337,8935],{"class":167},[33,53339,53340,53343,53345,53347,53349,53351,53353],{"class":35,"line":1950},[33,53341,53342],{"class":238},"        default",[33,53344,242],{"class":163},[33,53346,15565],{"class":54},[33,53348,365],{"class":167},[33,53350,25463],{"class":238},[33,53352,242],{"class":163},[33,53354,53355],{"class":54},"\"Extraction method\"\n",[33,53357,53358],{"class":35,"line":1958},[33,53359,1202],{"class":167},[33,53361,53362,53364,53366,53368,53370,53372,53374,53376,53378,53380,53382],{"class":35,"line":4904},[33,53363,6669],{"class":167},[33,53365,6699],{"class":54},[33,53367,365],{"class":167},[33,53369,6677],{"class":238},[33,53371,242],{"class":163},[33,53373,6682],{"class":167},[33,53375,6685],{"class":238},[33,53377,242],{"class":163},[33,53379,15641],{"class":167},[33,53381,41169],{"class":54},[33,53383,1506],{"class":167},[33,53385,53386,53389,53391,53394],{"class":35,"line":4909},[33,53387,53388],{"class":238},"                        help",[33,53390,242],{"class":163},[33,53392,53393],{"class":54},"\"Output directory for CSV 
files\"",[33,53395,221],{"class":167},[33,53397,53398,53400,53402],{"class":35,"line":4915},[33,53399,6766],{"class":167},[33,53401,242],{"class":163},[33,53403,6771],{"class":167},[33,53405,53406],{"class":35,"line":4925},[33,53407,92],{"emptyLinePlaceholder":91},[33,53409,53410,53412,53414],{"class":35,"line":4935},[33,53411,617],{"class":163},[33,53413,620],{"class":163},[33,53415,15668],{"class":167},[33,53417,53418,53420,53422,53424,53426,53428,53430,53432],{"class":35,"line":4941},[33,53419,2995],{"class":167},[33,53421,4059],{"class":163},[33,53423,15677],{"class":54},[33,53425,1115],{"class":50},[33,53427,15682],{"class":167},[33,53429,1121],{"class":50},[33,53431,274],{"class":54},[33,53433,221],{"class":167},[33,53435,53436],{"class":35,"line":4950},[33,53437,92],{"emptyLinePlaceholder":91},[33,53439,53440,53443,53445,53447,53449,53451,53453,53455,53457],{"class":35,"line":4960},[33,53441,53442],{"class":167},"    args.output.mkdir(",[33,53444,869],{"class":238},[33,53446,242],{"class":163},[33,53448,855],{"class":50},[33,53450,365],{"class":167},[33,53452,878],{"class":238},[33,53454,242],{"class":163},[33,53456,855],{"class":50},[33,53458,221],{"class":167},[33,53460,53461],{"class":35,"line":4965},[33,53462,92],{"emptyLinePlaceholder":91},[33,53464,53465,53467,53469,53471,53473,53475,53477,53479,53481],{"class":35,"line":4971},[33,53466,15697],{"class":167},[33,53468,242],{"class":163},[33,53470,15702],{"class":167},[33,53472,2491],{"class":163},[33,53474,15707],{"class":167},[33,53476,1865],{"class":163},[33,53478,15712],{"class":54},[33,53480,15715],{"class":163},[33,53482,15718],{"class":167},[33,53484,53485,53487,53489,53491,53494,53496,53498,53500,53502],{"class":35,"line":4983},[33,53486,7268],{"class":50},[33,53488,602],{"class":167},[33,53490,4059],{"class":163},[33,53492,53493],{"class":54},"\"Using flavor: 
",[33,53495,1115],{"class":50},[33,53497,10748],{"class":167},[33,53499,1121],{"class":50},[33,53501,274],{"class":54},[33,53503,221],{"class":167},[33,53505,53506],{"class":35,"line":4988},[33,53507,92],{"emptyLinePlaceholder":91},[33,53509,53510,53512],{"class":35,"line":4993},[33,53511,2424],{"class":163},[33,53513,574],{"class":167},[33,53515,53516,53518,53520],{"class":35,"line":5003},[33,53517,37895],{"class":167},[33,53519,242],{"class":163},[33,53521,53522],{"class":167}," extract(args.pdf, args.pages, flavor)\n",[33,53524,53525,53527,53529,53531],{"class":35,"line":5008},[33,53526,2449],{"class":163},[33,53528,783],{"class":50},[33,53530,1852],{"class":163},[33,53532,7583],{"class":167},[33,53534,53535,53537,53539,53541,53543,53545,53547,53549],{"class":35,"line":5014},[33,53536,2995],{"class":167},[33,53538,4059],{"class":163},[33,53540,7597],{"class":54},[33,53542,1115],{"class":50},[33,53544,7602],{"class":167},[33,53546,1121],{"class":50},[33,53548,274],{"class":54},[33,53550,221],{"class":167},[33,53552,53553],{"class":35,"line":5019},[33,53554,92],{"emptyLinePlaceholder":91},[33,53556,53557,53559,53561],{"class":35,"line":5032},[33,53558,617],{"class":163},[33,53560,620],{"class":163},[33,53562,38001],{"class":167},[33,53564,53565,53567,53570],{"class":35,"line":5039},[33,53566,2995],{"class":167},[33,53568,53569],{"class":54},"\"No tables found. 
Try --flavor stream or --flavor pdfplumber.\"",[33,53571,221],{"class":167},[33,53573,53574],{"class":35,"line":5068},[33,53575,92],{"emptyLinePlaceholder":91},[33,53577,53578,53580,53582,53584,53586,53589,53591,53593,53595],{"class":35,"line":5077},[33,53579,656],{"class":163},[33,53581,10994],{"class":167},[33,53583,662],{"class":163},[33,53585,7403],{"class":50},[33,53587,53588],{"class":167},"(tables, ",[33,53590,7409],{"class":238},[33,53592,242],{"class":163},[33,53594,734],{"class":50},[33,53596,1737],{"class":167},[33,53598,53599,53601,53603,53605,53607,53609,53611,53613,53615],{"class":35,"line":5082},[33,53600,10879],{"class":167},[33,53602,3198],{"class":54},[33,53604,10884],{"class":167},[33,53606,8018],{"class":50},[33,53608,365],{"class":167},[33,53610,10891],{"class":238},[33,53612,242],{"class":163},[33,53614,855],{"class":50},[33,53616,221],{"class":167},[33,53618,53619,53621,53623,53625,53627,53629,53631,53633,53635],{"class":35,"line":5089},[33,53620,50229],{"class":167},[33,53622,28045],{"class":238},[33,53624,242],{"class":163},[33,53626,35616],{"class":54},[33,53628,365],{"class":167},[33,53630,10891],{"class":238},[33,53632,242],{"class":163},[33,53634,855],{"class":50},[33,53636,221],{"class":167},[33,53638,53639,53641,53643,53646,53648,53650,53652,53654,53656,53659,53661],{"class":35,"line":5098},[33,53640,50344],{"class":167},[33,53642,242],{"class":163},[33,53644,53645],{"class":167}," args.output 
",[33,53647,1351],{"class":163},[33,53649,1110],{"class":163},[33,53651,50356],{"class":54},[33,53653,1115],{"class":50},[33,53655,7499],{"class":167},[33,53657,53658],{"class":163},":03d",[33,53660,1121],{"class":50},[33,53662,40176],{"class":54},[33,53664,53665,53667,53669,53671,53673],{"class":35,"line":5105},[33,53666,50371],{"class":167},[33,53668,897],{"class":238},[33,53670,242],{"class":163},[33,53672,902],{"class":50},[33,53674,221],{"class":167},[33,53676,53677,53679,53681,53683,53686,53688,53690,53692,53694,53696,53698,53700,53702,53704,53706,53708,53710,53712,53714,53716,53718,53720,53722,53724,53726],{"class":35,"line":5110},[33,53678,9414],{"class":50},[33,53680,602],{"class":167},[33,53682,4059],{"class":163},[33,53684,53685],{"class":54},"\"  [",[33,53687,1115],{"class":50},[33,53689,7499],{"class":167},[33,53691,1121],{"class":50},[33,53693,763],{"class":54},[33,53695,1115],{"class":50},[33,53697,9541],{"class":167},[33,53699,748],{"class":50},[33,53701,9546],{"class":167},[33,53703,1121],{"class":50},[33,53705,16022],{"class":54},[33,53707,1115],{"class":50},[33,53709,9541],{"class":167},[33,53711,734],{"class":50},[33,53713,9546],{"class":167},[33,53715,1121],{"class":50},[33,53717,16035],{"class":54},[33,53719,1115],{"class":50},[33,53721,18014],{"class":167},[33,53723,1121],{"class":50},[33,53725,274],{"class":54},[33,53727,221],{"class":167},[33,53729,53730],{"class":35,"line":5115},[33,53731,92],{"emptyLinePlaceholder":91},[33,53733,53734,53736,53738,53740,53743,53745,53747,53749,53752,53754,53756,53758,53761],{"class":35,"line":5128},[33,53735,7268],{"class":50},[33,53737,602],{"class":167},[33,53739,4059],{"class":163},[33,53741,53742],{"class":54},"\"Done. 
",[33,53744,4065],{"class":50},[33,53746,39168],{"class":167},[33,53748,1121],{"class":50},[33,53750,53751],{"class":54}," table(s) exported to ",[33,53753,1115],{"class":50},[33,53755,16040],{"class":167},[33,53757,1121],{"class":50},[33,53759,53760],{"class":54},"\u002F\"",[33,53762,221],{"class":167},[33,53764,53765],{"class":35,"line":5135},[33,53766,92],{"emptyLinePlaceholder":91},[33,53768,53769],{"class":35,"line":5142},[33,53770,92],{"emptyLinePlaceholder":91},[33,53772,53773,53775,53777,53779,53781],{"class":35,"line":5151},[33,53774,2491],{"class":163},[33,53776,2494],{"class":50},[33,53778,2497],{"class":163},[33,53780,2500],{"class":54},[33,53782,574],{"class":167},[33,53784,53785],{"class":35,"line":5156},[33,53786,6914],{"class":167},[18,53788,6918],{"id":6917},[4211,53790,53791,53796,53801,53806,53811],{},[4214,53792,53793,53795],{},[940,53794,10535],{"href":10534}," — resolve jumbled columns from coordinate drift",[4214,53797,53798,53800],{},[940,53799,10077],{"href":10076}," — OCR pipeline for rasterized documents",[4214,53802,53803,53805],{},[940,53804,9739],{"href":9738}," — resolve Ghostscript and cv2 import failures",[4214,53807,53808,53810],{},[940,53809,36756],{"href":26957}," — broader OCR preprocessing techniques",[4214,53812,53813,53815],{},[940,53814,9599],{"href":9598}," — clean extracted tables after 
export",[14,53817,6947,53818,3035],{},[940,53819,6943],{"href":6942},[6953,53821,41404],{},{"title":28,"searchDepth":43,"depth":43,"links":53823},[53824,53825,53826,53827,53828,53829,53830,53835,53836,53837,53838,53839],{"id":20,"depth":43,"text":21},{"id":49255,"depth":43,"text":49256},{"id":49911,"depth":43,"text":49912},{"id":50449,"depth":43,"text":50450},{"id":50859,"depth":43,"text":50860},{"id":51285,"depth":43,"text":51286},{"id":51785,"depth":43,"text":51786,"children":53831},[53832,53833,53834],{"id":51789,"depth":61,"text":51790},{"id":52000,"depth":61,"text":52001},{"id":52015,"depth":61,"text":52016},{"id":52029,"depth":43,"text":52030},{"id":13844,"depth":43,"text":13845},{"id":4270,"depth":43,"text":4271},{"id":14436,"depth":43,"text":14437},{"id":6917,"depth":43,"text":6918},"Extracting Tables","Extract tabular data from PDFs using pdfplumber and camelot. Covers lattice vs stream mode, coordinate mapping, multi-page header dedup, and DataFrame export.",{},"\u002Fautomating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs",{"title":9592,"description":53841},"Extracting Tables from PDFs with Python","automating-pdf-extraction-generation\u002Fextracting-tables-from-pdfs\u002Findex",[9631,47,943,16139,53848],"data extraction","FKEQSMzOqOxJeVhLdYg8jo5sgNrioTLGXXmfI2TQmHQ",{"id":53851,"title":53852,"body":53853,"breadcrumbTitle":57578,"canonical":6977,"date":46387,"description":57579,"draft":6980,"extension":6981,"image":6977,"meta":57580,"navigation":91,"path":57581,"robots":6977,"seo":57582,"seoTitle":57583,"stem":57584,"tags":57585,"updatedAt":6978,"__hash__":57588},"content\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Fcreate-dynamic-invoice-pdfs-automatically\u002Findex.md","Create Dynamic Invoice PDFs 
Automatically",{"type":7,"value":53854,"toc":57563},[53855,53858,53861,53877,53881,53884,53919,53921,53941,53944,54020,54024,54034,54443,54447,54450,54592,54595,54599,54606,55244,55248,55251,56873,56876,56880,56883,56972,56982,56986,56989,57219,57224,57228,57241,57305,57307,57524,57526,57556,57560],[10,53856,53852],{"id":53857},"create-dynamic-invoice-pdfs-automatically",[14,53859,53860],{},"Per-customer invoice generation means turning one data row into a complete, formatted PDF — line items, subtotals, tax, a due date, and a footer with payment terms — without touching a template manually. This guide builds that pipeline with both ReportLab (canvas control, good for custom fonts and symbols) and Jinja2+WeasyPrint (HTML\u002FCSS layout, easier to style).",[14,53862,53863,53864,53866,53867,53870,53871,53874,53875,3035],{},"This is one of the concrete output patterns described in ",[940,53865,26191],{"href":19001},". If you encounter garbled boxes or a ",[30,53868,53869],{},"UnicodeEncodeError"," for currency symbols like ",[30,53872,53873],{},"€"," when using ReportLab, see ",[940,53876,28608],{"href":28607},[18,53878,53880],{"id":53879},"root-cause-of-common-invoice-failures","Root cause of common invoice failures",[14,53882,53883],{},"Two failures account for most broken invoice scripts:",[35387,53885,53886,53899],{},[4214,53887,53888,53891,53892,42706,53895,53898],{},[1974,53889,53890],{},"Table overflow \u002F silent row truncation."," WeasyPrint calculates page breaks synchronously. Without ",[30,53893,53894],{},"page-break-inside: avoid",[30,53896,53897],{},"\u003Ctr>",", the renderer may split a row across pages and silently discard the overflow. 
The symptom is a PDF that ends mid-table with no error raised.",[4214,53900,53901,53904,53905,53908,53909,53912,53913,365,53915,53918],{},[1974,53902,53903],{},"Unicode encoding crash before render."," When a CSV is opened without ",[30,53906,53907],{},"encoding='utf-8'",", Python raises ",[30,53910,53911],{},"UnicodeDecodeError"," on the first non-ASCII character (accented names, ",[30,53914,53873],{},[30,53916,53917],{},"£","). This happens in the data layer, before HTML or PDF rendering even starts.",[18,53920,21],{"id":20},[23,53922,53924],{"className":25,"code":53923,"language":27,"meta":28,"style":28},"pip install reportlab weasyprint jinja2 pandas\n",[30,53925,53926],{"__ignoreMap":28},[33,53927,53928,53930,53932,53934,53936,53939],{"class":35,"line":36},[33,53929,76],{"class":46},[33,53931,79],{"class":54},[33,53933,16198],{"class":54},[33,53935,20930],{"class":54},[33,53937,53938],{"class":54}," jinja2",[33,53940,9707],{"class":54},[14,53942,53943],{},"Create a sample invoice CSV:",[23,53945,53947],{"className":25,"code":53946,"language":27,"meta":28,"style":28},"python - \u003C\u003C'EOF'\nimport csv\nrows = [\n    {\"invoice_id\":\"INV-001\",\"customer\":\"Acme Corp\",\"email\":\"ap@acme.example\",\n     \"due_date\":\"2026-07-18\",\"tax_rate\":0.20,\n     \"items\":'[{\"desc\":\"Cloud setup\",\"qty\":10,\"rate\":150},{\"desc\":\"API integration\",\"qty\":25,\"rate\":120}]'},\n    {\"invoice_id\":\"INV-002\",\"customer\":\"Béta SARL\",\"email\":\"compta@beta.example\",\n     \"due_date\":\"2026-07-25\",\"tax_rate\":0.19,\n     \"items\":'[{\"desc\":\"Consulting\",\"qty\":8,\"rate\":200},{\"desc\":\"Support retainer\",\"qty\":1,\"rate\":500}]'},\n]\nwith open(\"invoices.csv\",\"w\",newline=\"\",encoding=\"utf-8\") as f:\n    w = csv.DictWriter(f, fieldnames=rows[0].keys()); w.writeheader(); 
w.writerows(rows)\nEOF\n",[30,53948,53949,53961,53966,53971,53976,53981,53986,53991,53996,54001,54005,54010,54015],{"__ignoreMap":28},[33,53950,53951,53953,53955,53958],{"class":35,"line":36},[33,53952,47],{"class":46},[33,53954,39025],{"class":54},[33,53956,53957],{"class":163}," \u003C\u003C",[33,53959,53960],{"class":54},"'EOF'\n",[33,53962,53963],{"class":35,"line":43},[33,53964,53965],{"class":54},"import csv\n",[33,53967,53968],{"class":35,"line":61},[33,53969,53970],{"class":54},"rows = [\n",[33,53972,53973],{"class":35,"line":73},[33,53974,53975],{"class":54},"    {\"invoice_id\":\"INV-001\",\"customer\":\"Acme Corp\",\"email\":\"ap@acme.example\",\n",[33,53977,53978],{"class":35,"line":88},[33,53979,53980],{"class":54},"     \"due_date\":\"2026-07-18\",\"tax_rate\":0.20,\n",[33,53982,53983],{"class":35,"line":95},[33,53984,53985],{"class":54},"     \"items\":'[{\"desc\":\"Cloud setup\",\"qty\":10,\"rate\":150},{\"desc\":\"API integration\",\"qty\":25,\"rate\":120}]'},\n",[33,53987,53988],{"class":35,"line":101},[33,53989,53990],{"class":54},"    {\"invoice_id\":\"INV-002\",\"customer\":\"Béta SARL\",\"email\":\"compta@beta.example\",\n",[33,53992,53993],{"class":35,"line":171},[33,53994,53995],{"class":54},"     \"due_date\":\"2026-07-25\",\"tax_rate\":0.19,\n",[33,53997,53998],{"class":35,"line":179},[33,53999,54000],{"class":54},"     \"items\":'[{\"desc\":\"Consulting\",\"qty\":8,\"rate\":200},{\"desc\":\"Support retainer\",\"qty\":1,\"rate\":500}]'},\n",[33,54002,54003],{"class":35,"line":187},[33,54004,9202],{"class":54},[33,54006,54007],{"class":35,"line":201},[33,54008,54009],{"class":54},"with open(\"invoices.csv\",\"w\",newline=\"\",encoding=\"utf-8\") as f:\n",[33,54011,54012],{"class":35,"line":206},[33,54013,54014],{"class":54},"    w = csv.DictWriter(f, fieldnames=rows[0].keys()); w.writeheader(); 
w.writerows(rows)\n",[33,54016,54017],{"class":35,"line":224},[33,54018,54019],{"class":54},"EOF\n",[18,54021,54023],{"id":54022},"step-1-load-and-parse-invoice-data","Step 1 — Load and parse invoice data",[14,54025,54026,54027,54029,54030,54033],{},"Always open with ",[30,54028,53907],{},". Parse the JSON ",[30,54031,54032],{},"items"," column before handing it to any renderer.",[23,54035,54037],{"className":126,"code":54036,"language":47,"meta":28,"style":28},"# pip install pandas\nimport json\nfrom pathlib import Path\nimport pandas as pd\n\ndef load_invoices(path: Path) -> list[dict]:\n    try:\n        df = pd.read_csv(path, encoding=\"utf-8\")      # explicit encoding prevents UnicodeDecodeError\n        df.columns = df.columns.str.strip().str.lower()\n        records = df.to_dict(\"records\")\n        for rec in records:\n            raw = rec.get(\"items\", \"[]\")\n            rec[\"items\"] = json.loads(raw) if isinstance(raw, str) else raw\n            for item in rec[\"items\"]:\n                item[\"total\"] = round(item[\"qty\"] * item[\"rate\"], 2)\n            subtotal = sum(i[\"total\"] for i in rec[\"items\"])\n            rec[\"subtotal\"] = subtotal\n            rec[\"tax_amount\"] = round(subtotal * float(rec[\"tax_rate\"]), 2)\n            rec[\"grand_total\"] = round(subtotal + rec[\"tax_amount\"], 2)\n        return records\n    except FileNotFoundError:\n        raise SystemExit(f\"Invoice file not found: {path}\")\n    except json.JSONDecodeError as exc:\n        raise SystemExit(f\"Malformed items JSON: {exc}\")\n\ninvoices = 
load_invoices(Path(\"invoices.csv\"))\n",[30,54038,54039,54043,54049,54059,54069,54073,54087,54093,54113,54122,54136,54148,54167,54197,54213,54249,54277,54291,54325,54352,54359,54367,54390,54401,54424,54428],{"__ignoreMap":28},[33,54040,54041],{"class":35,"line":36},[33,54042,8895],{"class":39},[33,54044,54045,54047],{"class":35,"line":43},[33,54046,164],{"class":163},[33,54048,3081],{"class":167},[33,54050,54051,54053,54055,54057],{"class":35,"line":61},[33,54052,190],{"class":163},[33,54054,193],{"class":167},[33,54056,164],{"class":163},[33,54058,198],{"class":167},[33,54060,54061,54063,54065,54067],{"class":35,"line":73},[33,54062,164],{"class":163},[33,54064,492],{"class":167},[33,54066,495],{"class":163},[33,54068,498],{"class":167},[33,54070,54071],{"class":35,"line":88},[33,54072,92],{"emptyLinePlaceholder":91},[33,54074,54075,54077,54080,54083,54085],{"class":35,"line":95},[33,54076,562],{"class":163},[33,54078,54079],{"class":46}," load_invoices",[33,54081,54082],{"class":167},"(path: Path) -> list[",[33,54084,37100],{"class":50},[33,54086,17477],{"class":167},[33,54088,54089,54091],{"class":35,"line":101},[33,54090,2424],{"class":163},[33,54092,574],{"class":167},[33,54094,54095,54097,54099,54101,54103,54105,54107,54110],{"class":35,"line":171},[33,54096,7930],{"class":167},[33,54098,242],{"class":163},[33,54100,27411],{"class":167},[33,54102,27249],{"class":238},[33,54104,242],{"class":163},[33,54106,1195],{"class":54},[33,54108,54109],{"class":167},")      ",[33,54111,54112],{"class":39},"# explicit encoding prevents UnicodeDecodeError\n",[33,54114,54115,54117,54119],{"class":35,"line":179},[33,54116,10842],{"class":167},[33,54118,242],{"class":163},[33,54120,54121],{"class":167}," df.columns.str.strip().str.lower()\n",[33,54123,54124,54127,54129,54132,54134],{"class":35,"line":187},[33,54125,54126],{"class":167},"        records ",[33,54128,242],{"class":163},[33,54130,54131],{"class":167}," 
df.to_dict(",[33,54133,21222],{"class":54},[33,54135,221],{"class":167},[33,54137,54138,54140,54143,54145],{"class":35,"line":201},[33,54139,5973],{"class":163},[33,54141,54142],{"class":167}," rec ",[33,54144,662],{"class":163},[33,54146,54147],{"class":167}," records:\n",[33,54149,54150,54152,54154,54157,54160,54162,54165],{"class":35,"line":206},[33,54151,7142],{"class":167},[33,54153,242],{"class":163},[33,54155,54156],{"class":167}," rec.get(",[33,54158,54159],{"class":54},"\"items\"",[33,54161,365],{"class":167},[33,54163,54164],{"class":54},"\"[]\"",[33,54166,221],{"class":167},[33,54168,54169,54172,54174,54176,54178,54181,54183,54185,54188,54190,54192,54194],{"class":35,"line":224},[33,54170,54171],{"class":167},"            rec[",[33,54173,54159],{"class":54},[33,54175,763],{"class":167},[33,54177,242],{"class":163},[33,54179,54180],{"class":167}," json.loads(raw) ",[33,54182,2491],{"class":163},[33,54184,36538],{"class":50},[33,54186,54187],{"class":167},"(raw, ",[33,54189,1053],{"class":50},[33,54191,1649],{"class":167},[33,54193,7489],{"class":163},[33,54195,54196],{"class":167}," raw\n",[33,54198,54199,54201,54204,54206,54209,54211],{"class":35,"line":229},[33,54200,1793],{"class":163},[33,54202,54203],{"class":167}," item ",[33,54205,662],{"class":163},[33,54207,54208],{"class":167}," rec[",[33,54210,54159],{"class":54},[33,54212,17477],{"class":167},[33,54214,54215,54218,54221,54223,54225,54227,54230,54233,54235,54237,54240,54243,54245,54247],{"class":35,"line":235},[33,54216,54217],{"class":167},"                item[",[33,54219,54220],{"class":54},"\"total\"",[33,54222,763],{"class":167},[33,54224,242],{"class":163},[33,54226,47489],{"class":50},[33,54228,54229],{"class":167},"(item[",[33,54231,54232],{"class":54},"\"qty\"",[33,54234,763],{"class":167},[33,54236,1769],{"class":163},[33,54238,54239],{"class":167}," 
item[",[33,54241,54242],{"class":54},"\"rate\"",[33,54244,8314],{"class":167},[33,54246,1533],{"class":50},[33,54248,221],{"class":167},[33,54250,54251,54254,54256,54258,54261,54263,54265,54267,54269,54271,54273,54275],{"class":35,"line":250},[33,54252,54253],{"class":167},"            subtotal ",[33,54255,242],{"class":163},[33,54257,46601],{"class":50},[33,54259,54260],{"class":167},"(i[",[33,54262,54220],{"class":54},[33,54264,763],{"class":167},[33,54266,6124],{"class":163},[33,54268,47269],{"class":167},[33,54270,662],{"class":163},[33,54272,54208],{"class":167},[33,54274,54159],{"class":54},[33,54276,751],{"class":167},[33,54278,54279,54281,54284,54286,54288],{"class":35,"line":266},[33,54280,54171],{"class":167},[33,54282,54283],{"class":54},"\"subtotal\"",[33,54285,763],{"class":167},[33,54287,242],{"class":163},[33,54289,54290],{"class":167}," subtotal\n",[33,54292,54293,54295,54298,54300,54302,54304,54307,54309,54312,54315,54318,54321,54323],{"class":35,"line":290},[33,54294,54171],{"class":167},[33,54296,54297],{"class":54},"\"tax_amount\"",[33,54299,763],{"class":167},[33,54301,242],{"class":163},[33,54303,47489],{"class":50},[33,54305,54306],{"class":167},"(subtotal ",[33,54308,1769],{"class":163},[33,54310,54311],{"class":50}," float",[33,54313,54314],{"class":167},"(rec[",[33,54316,54317],{"class":54},"\"tax_rate\"",[33,54319,54320],{"class":167},"]), 
",[33,54322,1533],{"class":50},[33,54324,221],{"class":167},[33,54326,54327,54329,54332,54334,54336,54338,54340,54342,54344,54346,54348,54350],{"class":35,"line":295},[33,54328,54171],{"class":167},[33,54330,54331],{"class":54},"\"grand_total\"",[33,54333,763],{"class":167},[33,54335,242],{"class":163},[33,54337,47489],{"class":50},[33,54339,54306],{"class":167},[33,54341,1811],{"class":163},[33,54343,54208],{"class":167},[33,54345,54297],{"class":54},[33,54347,8314],{"class":167},[33,54349,1533],{"class":50},[33,54351,221],{"class":167},[33,54353,54354,54356],{"class":35,"line":300},[33,54355,1659],{"class":163},[33,54357,54358],{"class":167}," records\n",[33,54360,54361,54363,54365],{"class":35,"line":317},[33,54362,2449],{"class":163},[33,54364,2945],{"class":50},[33,54366,574],{"class":167},[33,54368,54369,54371,54373,54375,54377,54380,54382,54384,54386,54388],{"class":35,"line":332},[33,54370,4051],{"class":163},[33,54372,16617],{"class":50},[33,54374,602],{"class":167},[33,54376,4059],{"class":163},[33,54378,54379],{"class":54},"\"Invoice file not found: ",[33,54381,1115],{"class":50},[33,54383,2580],{"class":167},[33,54385,1121],{"class":50},[33,54387,274],{"class":54},[33,54389,221],{"class":167},[33,54391,54392,54394,54397,54399],{"class":35,"line":347},[33,54393,2449],{"class":163},[33,54395,54396],{"class":167}," json.JSONDecodeError ",[33,54398,495],{"class":163},[33,54400,1855],{"class":167},[33,54402,54403,54405,54407,54409,54411,54414,54416,54418,54420,54422],{"class":35,"line":374},[33,54404,4051],{"class":163},[33,54406,16617],{"class":50},[33,54408,602],{"class":167},[33,54410,4059],{"class":163},[33,54412,54413],{"class":54},"\"Malformed items JSON: 
",[33,54415,1115],{"class":50},[33,54417,6565],{"class":167},[33,54419,1121],{"class":50},[33,54421,274],{"class":54},[33,54423,221],{"class":167},[33,54425,54426],{"class":35,"line":397},[33,54427,92],{"emptyLinePlaceholder":91},[33,54429,54430,54433,54435,54438,54441],{"class":35,"line":653},[33,54431,54432],{"class":167},"invoices ",[33,54434,242],{"class":163},[33,54436,54437],{"class":167}," load_invoices(Path(",[33,54439,54440],{"class":54},"\"invoices.csv\"",[33,54442,371],{"class":167},[18,54444,54446],{"id":54445},"minimal-reproducible-diagnostic","Minimal reproducible diagnostic",[14,54448,54449],{},"Before building the full pipeline, confirm parsing is correct:",[23,54451,54453],{"className":126,"code":54452,"language":47,"meta":28,"style":28},"# pip install pandas\nimport json, pprint\nfrom pathlib import Path\nimport pandas as pd\n\ndf = pd.read_csv(Path(\"invoices.csv\"), encoding=\"utf-8\")\nfirst = df.iloc[0].to_dict()\nitems = json.loads(first[\"items\"])\nprint(f\"Customer: {first['customer']}\")\nprint(f\"Line items: {len(items)}\")\npprint.pprint(items)\n",[30,54454,54455,54459,54466,54476,54486,54490,54510,54524,54538,54565,54587],{"__ignoreMap":28},[33,54456,54457],{"class":35,"line":36},[33,54458,8895],{"class":39},[33,54460,54461,54463],{"class":35,"line":43},[33,54462,164],{"class":163},[33,54464,54465],{"class":167}," json, 
pprint\n",[33,54467,54468,54470,54472,54474],{"class":35,"line":61},[33,54469,190],{"class":163},[33,54471,193],{"class":167},[33,54473,164],{"class":163},[33,54475,198],{"class":167},[33,54477,54478,54480,54482,54484],{"class":35,"line":73},[33,54479,164],{"class":163},[33,54481,492],{"class":167},[33,54483,495],{"class":163},[33,54485,498],{"class":167},[33,54487,54488],{"class":35,"line":88},[33,54489,92],{"emptyLinePlaceholder":91},[33,54491,54492,54494,54496,54498,54500,54502,54504,54506,54508],{"class":35,"line":95},[33,54493,13459],{"class":167},[33,54495,242],{"class":163},[33,54497,46182],{"class":167},[33,54499,54440],{"class":54},[33,54501,18525],{"class":167},[33,54503,27249],{"class":238},[33,54505,242],{"class":163},[33,54507,1195],{"class":54},[33,54509,221],{"class":167},[33,54511,54512,54515,54517,54519,54521],{"class":35,"line":101},[33,54513,54514],{"class":167},"first ",[33,54516,242],{"class":163},[33,54518,10847],{"class":167},[33,54520,748],{"class":50},[33,54522,54523],{"class":167},"].to_dict()\n",[33,54525,54526,54529,54531,54534,54536],{"class":35,"line":171},[33,54527,54528],{"class":167},"items ",[33,54530,242],{"class":163},[33,54532,54533],{"class":167}," json.loads(first[",[33,54535,54159],{"class":54},[33,54537,751],{"class":167},[33,54539,54540,54542,54544,54546,54549,54551,54554,54557,54559,54561,54563],{"class":35,"line":179},[33,54541,13474],{"class":50},[33,54543,602],{"class":167},[33,54545,4059],{"class":163},[33,54547,54548],{"class":54},"\"Customer: ",[33,54550,1115],{"class":50},[33,54552,54553],{"class":167},"first[",[33,54555,54556],{"class":54},"'customer'",[33,54558,9546],{"class":167},[33,54560,1121],{"class":50},[33,54562,274],{"class":54},[33,54564,221],{"class":167},[33,54566,54567,54569,54571,54573,54576,54578,54581,54583,54585],{"class":35,"line":187},[33,54568,13474],{"class":50},[33,54570,602],{"class":167},[33,54572,4059],{"class":163},[33,54574,54575],{"class":54},"\"Line items: 
",[33,54577,4065],{"class":50},[33,54579,54580],{"class":167},"(items)",[33,54582,1121],{"class":50},[33,54584,274],{"class":54},[33,54586,221],{"class":167},[33,54588,54589],{"class":35,"line":201},[33,54590,54591],{"class":167},"pprint.pprint(items)\n",[14,54593,54594],{},"Expected output shows the customer name (including accented characters) and all line items with no decode error.",[18,54596,54598],{"id":54597},"step-2-generate-with-jinja2-weasyprint","Step 2 — Generate with Jinja2 + WeasyPrint",[14,54600,54601,54602,54605],{},"Good default choice: CSS handles layout, ",[30,54603,54604],{},"@page"," rules inject page numbers and a persistent header.",[23,54607,54609],{"className":126,"code":54608,"language":47,"meta":28,"style":28},"# pip install weasyprint jinja2\nfrom pathlib import Path\nfrom jinja2 import Environment, BaseLoader\nfrom weasyprint import HTML\n\nINVOICE_TEMPLATE = \"\"\"\u003C!DOCTYPE html>\n\u003Chtml>\u003Chead>\u003Cmeta charset=\"utf-8\">\n\u003Cstyle>\n@page {\n  size: A4;\n  margin: 18mm 15mm 22mm;\n  @top-left   { content: \"INVOICE\"; font-size: 8pt; color: #475569; }\n  @top-right  { content: \"{{ inv.invoice_id }}\"; font-size: 8pt; color: #475569; }\n  @bottom-center { content: \"Page \" counter(page) \" of \" counter(pages); font-size: 7pt; color: #475569; }\n}\nbody   { font-family: sans-serif; font-size: 10pt; color: #0f172a; margin: 0; }\nh1     { font-size: 22pt; margin-bottom: 2mm; }\n.meta  { font-size: 9pt; color: #475569; margin-bottom: 8mm; }\ntable  { width: 100%; border-collapse: collapse; margin-top: 6mm; }\nthead tr { background: #2563eb; color: #fff; }\nth, td   { padding: 5px 8px; font-size: 9pt; }\nth       { text-align: left; }\n.right   { text-align: right; }\ntbody tr { page-break-inside: avoid; }              \u002F* prevents row splitting *\u002F\ntbody tr:nth-child(even) { background: #f6f8fb; }\n.totals  { margin-top: 6mm; text-align: right; font-size: 10pt; }\n.totals td { padding: 2px 8px; }\n.grand   { 
font-weight: bold; font-size: 12pt; color: #2563eb; }\n.footer  { margin-top: 12mm; font-size: 8pt; color: #475569;\n           border-top: 1px solid #e2e8f0; padding-top: 4mm; }\n\u003C\u002Fstyle>\u003C\u002Fhead>\n\u003Cbody>\n\u003Ch1>Invoice\u003C\u002Fh1>\n\u003Cdiv class=\"meta\">\n  \u003Cstrong>{{ inv.customer }}\u003C\u002Fstrong>\u003Cbr>\n  Invoice: {{ inv.invoice_id }} &nbsp;|&nbsp; Due: {{ inv.due_date }}\u003Cbr>\n  Contact: {{ inv.email }}\n\u003C\u002Fdiv>\n\n\u003Ctable>\n  \u003Cthead>\u003Ctr>\n    \u003Cth>Description\u003C\u002Fth>\n    \u003Cth class=\"right\">Qty\u003C\u002Fth>\n    \u003Cth class=\"right\">Rate\u003C\u002Fth>\n    \u003Cth class=\"right\">Total\u003C\u002Fth>\n  \u003C\u002Ftr>\u003C\u002Fthead>\n  \u003Ctbody>\n  {% for item in inv.items %}\n  \u003Ctr>\n    \u003Ctd>{{ item.desc }}\u003C\u002Ftd>\n    \u003Ctd class=\"right\">{{ item.qty }}\u003C\u002Ftd>\n    \u003Ctd class=\"right\">${{ \"%.2f\"|format(item.rate) }}\u003C\u002Ftd>\n    \u003Ctd class=\"right\">${{ \"%.2f\"|format(item.total) }}\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  {% endfor %}\n  \u003C\u002Ftbody>\n\u003C\u002Ftable>\n\n\u003Ctable class=\"totals\">\n  \u003Ctr>\u003Ctd>Subtotal\u003C\u002Ftd>\u003Ctd>${{ \"%.2f\"|format(inv.subtotal) }}\u003C\u002Ftd>\u003C\u002Ftr>\n  \u003Ctr>\u003Ctd>Tax ({{ (inv.tax_rate * 100)|int }}%)\u003C\u002Ftd>\u003Ctd>${{ \"%.2f\"|format(inv.tax_amount) }}\u003C\u002Ftd>\u003C\u002Ftr>\n  \u003Ctr class=\"grand\">\u003Ctd>Total Due\u003C\u002Ftd>\u003Ctd>${{ \"%.2f\"|format(inv.grand_total) }}\u003C\u002Ftd>\u003C\u002Ftr>\n\u003C\u002Ftable>\n\n\u003Cdiv class=\"footer\">Payment terms: 30 days. 
Bank transfer preferred.\u003Cbr>\nThank you for your business.\u003C\u002Fdiv>\n\u003C\u002Fbody>\u003C\u002Fhtml>\"\"\"\n\n\ndef render_invoice_weasyprint(inv: dict, out_dir: Path) -> Path:\n    out_dir.mkdir(parents=True, exist_ok=True)\n    out = out_dir \u002F f\"{inv['invoice_id']}.pdf\"\n    env = Environment(loader=BaseLoader())\n    tmpl = env.from_string(INVOICE_TEMPLATE)\n    html_str = tmpl.render(inv=inv)\n    try:\n        HTML(string=html_str).write_pdf(str(out))\n        return out\n    except Exception as exc:\n        raise RuntimeError(f\"WeasyPrint failed for {inv['invoice_id']}: {exc}\") from exc\n\n\nfor inv in invoices:\n    path = render_invoice_weasyprint(inv, Path(\"invoices\"))\n    print(f\"Generated: {path}\")\n",[30,54610,54611,54615,54625,54636,54646,54650,54660,54665,54670,54675,54680,54685,54690,54695,54700,54704,54709,54714,54719,54724,54729,54734,54739,54744,54749,54754,54759,54764,54769,54774,54779,54784,54789,54794,54799,54804,54809,54814,54819,54823,54828,54833,54838,54843,54848,54853,54858,54863,54874,54879,54884,54889,54900,54909,54914,54924,54929,54934,54938,54943,54953,54963,54973,54977,54981,54986,54991,54996,55000,55004,55019,55039,55067,55083,55097,55113,55119,55134,55141,55151,55189,55193,55197,55209,55223],{"__ignoreMap":28},[33,54612,54613],{"class":35,"line":36},[33,54614,20943],{"class":39},[33,54616,54617,54619,54621,54623],{"class":35,"line":43},[33,54618,190],{"class":163},[33,54620,193],{"class":167},[33,54622,164],{"class":163},[33,54624,198],{"class":167},[33,54626,54627,54629,54631,54633],{"class":35,"line":61},[33,54628,190],{"class":163},[33,54630,20970],{"class":167},[33,54632,164],{"class":163},[33,54634,54635],{"class":167}," Environment, 
BaseLoader\n",[33,54637,54638,54640,54642,54644],{"class":35,"line":73},[33,54639,190],{"class":163},[33,54641,20982],{"class":167},[33,54643,164],{"class":163},[33,54645,20987],{"class":50},[33,54647,54648],{"class":35,"line":88},[33,54649,92],{"emptyLinePlaceholder":91},[33,54651,54652,54655,54657],{"class":35,"line":95},[33,54653,54654],{"class":50},"INVOICE_TEMPLATE",[33,54656,212],{"class":163},[33,54658,54659],{"class":54}," \"\"\"\u003C!DOCTYPE html>\n",[33,54661,54662],{"class":35,"line":101},[33,54663,54664],{"class":54},"\u003Chtml>\u003Chead>\u003Cmeta charset=\"utf-8\">\n",[33,54666,54667],{"class":35,"line":171},[33,54668,54669],{"class":54},"\u003Cstyle>\n",[33,54671,54672],{"class":35,"line":179},[33,54673,54674],{"class":54},"@page {\n",[33,54676,54677],{"class":35,"line":187},[33,54678,54679],{"class":54},"  size: A4;\n",[33,54681,54682],{"class":35,"line":201},[33,54683,54684],{"class":54},"  margin: 18mm 15mm 22mm;\n",[33,54686,54687],{"class":35,"line":206},[33,54688,54689],{"class":54},"  @top-left   { content: \"INVOICE\"; font-size: 8pt; color: #475569; }\n",[33,54691,54692],{"class":35,"line":224},[33,54693,54694],{"class":54},"  @top-right  { content: \"{{ inv.invoice_id }}\"; font-size: 8pt; color: #475569; }\n",[33,54696,54697],{"class":35,"line":229},[33,54698,54699],{"class":54},"  @bottom-center { content: \"Page \" counter(page) \" of \" counter(pages); font-size: 7pt; color: #475569; }\n",[33,54701,54702],{"class":35,"line":235},[33,54703,4113],{"class":54},[33,54705,54706],{"class":35,"line":250},[33,54707,54708],{"class":54},"body   { font-family: sans-serif; font-size: 10pt; color: #0f172a; margin: 0; }\n",[33,54710,54711],{"class":35,"line":266},[33,54712,54713],{"class":54},"h1     { font-size: 22pt; margin-bottom: 2mm; }\n",[33,54715,54716],{"class":35,"line":290},[33,54717,54718],{"class":54},".meta  { font-size: 9pt; color: #475569; margin-bottom: 8mm; 
}\n",[33,54720,54721],{"class":35,"line":295},[33,54722,54723],{"class":54},"table  { width: 100%; border-collapse: collapse; margin-top: 6mm; }\n",[33,54725,54726],{"class":35,"line":300},[33,54727,54728],{"class":54},"thead tr { background: #2563eb; color: #fff; }\n",[33,54730,54731],{"class":35,"line":317},[33,54732,54733],{"class":54},"th, td   { padding: 5px 8px; font-size: 9pt; }\n",[33,54735,54736],{"class":35,"line":332},[33,54737,54738],{"class":54},"th       { text-align: left; }\n",[33,54740,54741],{"class":35,"line":347},[33,54742,54743],{"class":54},".right   { text-align: right; }\n",[33,54745,54746],{"class":35,"line":374},[33,54747,54748],{"class":54},"tbody tr { page-break-inside: avoid; }              \u002F* prevents row splitting *\u002F\n",[33,54750,54751],{"class":35,"line":397},[33,54752,54753],{"class":54},"tbody tr:nth-child(even) { background: #f6f8fb; }\n",[33,54755,54756],{"class":35,"line":653},[33,54757,54758],{"class":54},".totals  { margin-top: 6mm; text-align: right; font-size: 10pt; }\n",[33,54760,54761],{"class":35,"line":667},[33,54762,54763],{"class":54},".totals td { padding: 2px 8px; }\n",[33,54765,54766],{"class":35,"line":675},[33,54767,54768],{"class":54},".grand   { font-weight: bold; font-size: 12pt; color: #2563eb; }\n",[33,54770,54771],{"class":35,"line":689},[33,54772,54773],{"class":54},".footer  { margin-top: 12mm; font-size: 8pt; color: #475569;\n",[33,54775,54776],{"class":35,"line":703},[33,54777,54778],{"class":54},"           border-top: 1px solid #e2e8f0; padding-top: 4mm; }\n",[33,54780,54781],{"class":35,"line":714},[33,54782,54783],{"class":54},"\u003C\u002Fstyle>\u003C\u002Fhead>\n",[33,54785,54786],{"class":35,"line":723},[33,54787,54788],{"class":54},"\u003Cbody>\n",[33,54790,54791],{"class":35,"line":754},[33,54792,54793],{"class":54},"\u003Ch1>Invoice\u003C\u002Fh1>\n",[33,54795,54796],{"class":35,"line":771},[33,54797,54798],{"class":54},"\u003Cdiv 
class=\"meta\">\n",[33,54800,54801],{"class":35,"line":777},[33,54802,54803],{"class":54},"  \u003Cstrong>{{ inv.customer }}\u003C\u002Fstrong>\u003Cbr>\n",[33,54805,54806],{"class":35,"line":788},[33,54807,54808],{"class":54},"  Invoice: {{ inv.invoice_id }} &nbsp;|&nbsp; Due: {{ inv.due_date }}\u003Cbr>\n",[33,54810,54811],{"class":35,"line":804},[33,54812,54813],{"class":54},"  Contact: {{ inv.email }}\n",[33,54815,54816],{"class":35,"line":809},[33,54817,54818],{"class":54},"\u003C\u002Fdiv>\n",[33,54820,54821],{"class":35,"line":819},[33,54822,92],{"emptyLinePlaceholder":91},[33,54824,54825],{"class":35,"line":829},[33,54826,54827],{"class":54},"\u003Ctable>\n",[33,54829,54830],{"class":35,"line":834},[33,54831,54832],{"class":54},"  \u003Cthead>\u003Ctr>\n",[33,54834,54835],{"class":35,"line":839},[33,54836,54837],{"class":54},"    \u003Cth>Description\u003C\u002Fth>\n",[33,54839,54840],{"class":35,"line":860},[33,54841,54842],{"class":54},"    \u003Cth class=\"right\">Qty\u003C\u002Fth>\n",[33,54844,54845],{"class":35,"line":887},[33,54846,54847],{"class":54},"    \u003Cth class=\"right\">Rate\u003C\u002Fth>\n",[33,54849,54850],{"class":35,"line":907},[33,54851,54852],{"class":54},"    \u003Cth class=\"right\">Total\u003C\u002Fth>\n",[33,54854,54855],{"class":35,"line":1826},[33,54856,54857],{"class":54},"  \u003C\u002Ftr>\u003C\u002Fthead>\n",[33,54859,54860],{"class":35,"line":1844},[33,54861,54862],{"class":54},"  \u003Ctbody>\n",[33,54864,54865,54868,54871],{"class":35,"line":1858},[33,54866,54867],{"class":54},"  {",[33,54869,54870],{"class":50},"% f",[33,54872,54873],{"class":54},"or item in inv.items %}\n",[33,54875,54876],{"class":35,"line":1871},[33,54877,54878],{"class":54},"  \u003Ctr>\n",[33,54880,54881],{"class":35,"line":1877},[33,54882,54883],{"class":54},"    \u003Ctd>{{ item.desc }}\u003C\u002Ftd>\n",[33,54885,54886],{"class":35,"line":1883},[33,54887,54888],{"class":54},"    \u003Ctd class=\"right\">{{ item.qty 
}}\u003C\u002Ftd>\n",[33,54890,54891,54894,54897],{"class":35,"line":1915},[33,54892,54893],{"class":54},"    \u003Ctd class=\"right\">${{ \"",[33,54895,54896],{"class":50},"%.2f",[33,54898,54899],{"class":54},"\"|format(item.rate) }}\u003C\u002Ftd>\n",[33,54901,54902,54904,54906],{"class":35,"line":1926},[33,54903,54893],{"class":54},[33,54905,54896],{"class":50},[33,54907,54908],{"class":54},"\"|format(item.total) }}\u003C\u002Ftd>\n",[33,54910,54911],{"class":35,"line":1932},[33,54912,54913],{"class":54},"  \u003C\u002Ftr>\n",[33,54915,54916,54918,54921],{"class":35,"line":1938},[33,54917,54867],{"class":54},[33,54919,54920],{"class":50},"% e",[33,54922,54923],{"class":54},"ndfor %}\n",[33,54925,54926],{"class":35,"line":1950},[33,54927,54928],{"class":54},"  \u003C\u002Ftbody>\n",[33,54930,54931],{"class":35,"line":1958},[33,54932,54933],{"class":54},"\u003C\u002Ftable>\n",[33,54935,54936],{"class":35,"line":4904},[33,54937,92],{"emptyLinePlaceholder":91},[33,54939,54940],{"class":35,"line":4909},[33,54941,54942],{"class":54},"\u003Ctable class=\"totals\">\n",[33,54944,54945,54948,54950],{"class":35,"line":4915},[33,54946,54947],{"class":54},"  \u003Ctr>\u003Ctd>Subtotal\u003C\u002Ftd>\u003Ctd>${{ \"",[33,54949,54896],{"class":50},[33,54951,54952],{"class":54},"\"|format(inv.subtotal) }}\u003C\u002Ftd>\u003C\u002Ftr>\n",[33,54954,54955,54958,54960],{"class":35,"line":4925},[33,54956,54957],{"class":54},"  \u003Ctr>\u003Ctd>Tax ({{ (inv.tax_rate * 100)|int }}%)\u003C\u002Ftd>\u003Ctd>${{ \"",[33,54959,54896],{"class":50},[33,54961,54962],{"class":54},"\"|format(inv.tax_amount) }}\u003C\u002Ftd>\u003C\u002Ftr>\n",[33,54964,54965,54968,54970],{"class":35,"line":4935},[33,54966,54967],{"class":54},"  \u003Ctr class=\"grand\">\u003Ctd>Total Due\u003C\u002Ftd>\u003Ctd>${{ \"",[33,54969,54896],{"class":50},[33,54971,54972],{"class":54},"\"|format(inv.grand_total) 
}}\u003C\u002Ftd>\u003C\u002Ftr>\n",[33,54974,54975],{"class":35,"line":4941},[33,54976,54933],{"class":54},[33,54978,54979],{"class":35,"line":4950},[33,54980,92],{"emptyLinePlaceholder":91},[33,54982,54983],{"class":35,"line":4960},[33,54984,54985],{"class":54},"\u003Cdiv class=\"footer\">Payment terms: 30 days. Bank transfer preferred.\u003Cbr>\n",[33,54987,54988],{"class":35,"line":4965},[33,54989,54990],{"class":54},"Thank you for your business.\u003C\u002Fdiv>\n",[33,54992,54993],{"class":35,"line":4971},[33,54994,54995],{"class":54},"\u003C\u002Fbody>\u003C\u002Fhtml>\"\"\"\n",[33,54997,54998],{"class":35,"line":4983},[33,54999,92],{"emptyLinePlaceholder":91},[33,55001,55002],{"class":35,"line":4988},[33,55003,92],{"emptyLinePlaceholder":91},[33,55005,55006,55008,55011,55014,55016],{"class":35,"line":4993},[33,55007,562],{"class":163},[33,55009,55010],{"class":46}," render_invoice_weasyprint",[33,55012,55013],{"class":167},"(inv: ",[33,55015,37100],{"class":50},[33,55017,55018],{"class":167},", out_dir: Path) -> Path:\n",[33,55020,55021,55023,55025,55027,55029,55031,55033,55035,55037],{"class":35,"line":5003},[33,55022,28258],{"class":167},[33,55024,869],{"class":238},[33,55026,242],{"class":163},[33,55028,855],{"class":50},[33,55030,365],{"class":167},[33,55032,878],{"class":238},[33,55034,242],{"class":163},[33,55036,855],{"class":50},[33,55038,221],{"class":167},[33,55040,55041,55043,55045,55047,55049,55051,55053,55055,55058,55061,55063,55065],{"class":35,"line":5008},[33,55042,17989],{"class":167},[33,55044,242],{"class":163},[33,55046,40669],{"class":167},[33,55048,1351],{"class":163},[33,55050,1110],{"class":163},[33,55052,274],{"class":54},[33,55054,1115],{"class":50},[33,55056,55057],{"class":167},"inv[",[33,55059,55060],{"class":54},"'invoice_id'",[33,55062,9546],{"class":167},[33,55064,1121],{"class":50},[33,55066,19246],{"class":54},[33,55068,55069,55072,55074,55076,55078,55080],{"class":35,"line":5014},[33,55070,55071],{"class":167},"    env 
",[33,55073,242],{"class":163},[33,55075,21111],{"class":167},[33,55077,21114],{"class":238},[33,55079,242],{"class":163},[33,55081,55082],{"class":167},"BaseLoader())\n",[33,55084,55085,55088,55090,55093,55095],{"class":35,"line":5019},[33,55086,55087],{"class":167},"    tmpl ",[33,55089,242],{"class":163},[33,55091,55092],{"class":167}," env.from_string(",[33,55094,54654],{"class":50},[33,55096,221],{"class":167},[33,55098,55099,55101,55103,55105,55108,55110],{"class":35,"line":5032},[33,55100,21200],{"class":167},[33,55102,242],{"class":163},[33,55104,21205],{"class":167},[33,55106,55107],{"class":238},"inv",[33,55109,242],{"class":163},[33,55111,55112],{"class":167},"inv)\n",[33,55114,55115,55117],{"class":35,"line":5039},[33,55116,2424],{"class":163},[33,55118,574],{"class":167},[33,55120,55121,55123,55125,55127,55129,55131],{"class":35,"line":5068},[33,55122,21235],{"class":167},[33,55124,21238],{"class":238},[33,55126,242],{"class":163},[33,55128,21243],{"class":167},[33,55130,1053],{"class":50},[33,55132,55133],{"class":167},"(out))\n",[33,55135,55136,55138],{"class":35,"line":5077},[33,55137,1659],{"class":163},[33,55139,55140],{"class":167}," 
out\n",[33,55142,55143,55145,55147,55149],{"class":35,"line":5082},[33,55144,2449],{"class":163},[33,55146,783],{"class":50},[33,55148,1852],{"class":163},[33,55150,1855],{"class":167},[33,55152,55153,55155,55157,55159,55161,55163,55165,55167,55169,55171,55173,55175,55177,55179,55181,55183,55185,55187],{"class":35,"line":5089},[33,55154,4051],{"class":163},[33,55156,7590],{"class":50},[33,55158,602],{"class":167},[33,55160,4059],{"class":163},[33,55162,21271],{"class":54},[33,55164,1115],{"class":50},[33,55166,55057],{"class":167},[33,55168,55060],{"class":54},[33,55170,9546],{"class":167},[33,55172,1121],{"class":50},[33,55174,2079],{"class":54},[33,55176,1115],{"class":50},[33,55178,6565],{"class":167},[33,55180,1121],{"class":50},[33,55182,274],{"class":54},[33,55184,1649],{"class":167},[33,55186,190],{"class":163},[33,55188,20843],{"class":167},[33,55190,55191],{"class":35,"line":5098},[33,55192,92],{"emptyLinePlaceholder":91},[33,55194,55195],{"class":35,"line":5105},[33,55196,92],{"emptyLinePlaceholder":91},[33,55198,55199,55201,55204,55206],{"class":35,"line":5110},[33,55200,6124],{"class":163},[33,55202,55203],{"class":167}," inv ",[33,55205,662],{"class":163},[33,55207,55208],{"class":167}," invoices:\n",[33,55210,55211,55213,55215,55218,55221],{"class":35,"line":5115},[33,55212,17306],{"class":167},[33,55214,242],{"class":163},[33,55216,55217],{"class":167}," render_invoice_weasyprint(inv, Path(",[33,55219,55220],{"class":54},"\"invoices\"",[33,55222,371],{"class":167},[33,55224,55225,55227,55229,55231,55234,55236,55238,55240,55242],{"class":35,"line":5128},[33,55226,7268],{"class":50},[33,55228,602],{"class":167},[33,55230,4059],{"class":163},[33,55232,55233],{"class":54},"\"Generated: ",[33,55235,1115],{"class":50},[33,55237,2580],{"class":167},[33,55239,1121],{"class":50},[33,55241,274],{"class":54},[33,55243,221],{"class":167},[18,55245,55247],{"id":55246},"step-3-generate-with-reportlab","Step 3 — Generate with ReportLab",[14,55249,55250],{},"Use 
ReportLab when you need exact coordinate placement, a company logo, or non-Latin fonts. The canvas approach gives explicit control over every element.",[23,55252,55254],{"className":126,"code":55253,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom pathlib import Path\nfrom reportlab.lib.pagesizes import A4\nfrom reportlab.lib.units import mm\nfrom reportlab.lib import colors\nfrom reportlab.platypus import (\n    SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, HRFlowable\n)\nfrom reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle\nfrom reportlab.lib.enums import TA_RIGHT, TA_CENTER\n\nBLUE = colors.HexColor(\"#2563eb\")\nMUTED = colors.HexColor(\"#475569\")\nBORDER = colors.HexColor(\"#e2e8f0\")\nSOFT = colors.HexColor(\"#f6f8fb\")\n\ndef render_invoice_reportlab(inv: dict, out_dir: Path) -> Path:\n    out_dir.mkdir(parents=True, exist_ok=True)\n    out = out_dir \u002F f\"{inv['invoice_id']}_rl.pdf\"\n    styles = getSampleStyleSheet()\n    right_s = ParagraphStyle(\"right_s\", parent=styles[\"Normal\"], alignment=TA_RIGHT)\n\n    doc = SimpleDocTemplate(\n        str(out), pagesize=A4,\n        leftMargin=15*mm, rightMargin=15*mm,\n        topMargin=20*mm, bottomMargin=20*mm,\n    )\n\n    def _header_footer(canvas, doc):\n        canvas.saveState()\n        canvas.setFont(\"Helvetica\", 7)\n        canvas.setFillColor(MUTED)\n        canvas.drawString(15*mm, A4[1]-12*mm, \"INVOICE\")\n        canvas.drawRightString(A4[0]-15*mm, A4[1]-12*mm, inv[\"invoice_id\"])\n        canvas.drawCentredString(A4[0]\u002F2, 10*mm, f\"Page {doc.page}\")\n        canvas.restoreState()\n\n    # Line-item table\n    header_row = [[\"Description\", \"Qty\", \"Rate\", \"Total\"]]\n    item_rows = [\n        [item[\"desc\"], str(item[\"qty\"]),\n         f\"${item['rate']:.2f}\", f\"${item['total']:.2f}\"]\n        for item in inv[\"items\"]\n    ]\n    tbl = Table(\n        header_row + item_rows,\n        colWidths=[95*mm, 20*mm, 30*mm, 
30*mm],\n        repeatRows=1,\n    )\n    tbl.setStyle(TableStyle([\n        (\"BACKGROUND\", (0, 0), (-1, 0), BLUE),\n        (\"TEXTCOLOR\", (0, 0), (-1, 0), colors.white),\n        (\"FONTNAME\", (0, 0), (-1, 0), \"Helvetica-Bold\"),\n        (\"FONTSIZE\", (0, 0), (-1, -1), 9),\n        (\"ALIGN\", (1, 0), (-1, -1), \"RIGHT\"),\n        (\"ROWBACKGROUNDS\", (0, 1), (-1, -1), [colors.white, SOFT]),\n        (\"GRID\", (0, 0), (-1, -1), 0.5, BORDER),\n    ]))\n\n    # Totals block as a narrow right-aligned table\n    totals_data = [\n        [\"Subtotal\", f\"${inv['subtotal']:.2f}\"],\n        [f\"Tax ({int(inv['tax_rate']*100)}%)\", f\"${inv['tax_amount']:.2f}\"],\n        [\"Total Due\", f\"${inv['grand_total']:.2f}\"],\n    ]\n    totals_tbl = Table(totals_data, colWidths=[40*mm, 35*mm], hAlign=\"RIGHT\")\n    totals_tbl.setStyle(TableStyle([\n        (\"FONTSIZE\", (0, 0), (-1, -1), 9),\n        (\"ALIGN\", (1, 0), (-1, -1), \"RIGHT\"),\n        (\"FONTNAME\", (0, 2), (-1, 2), \"Helvetica-Bold\"),\n        (\"TEXTCOLOR\", (0, 2), (-1, 2), BLUE),\n        (\"LINEABOVE\", (0, 2), (-1, 2), 0.5, BORDER),\n    ]))\n\n    story = [\n        Paragraph(\"Invoice\", styles[\"h1\"]),\n        Spacer(1, 2*mm),\n        Paragraph(f\"\u003Cb>{inv['customer']}\u003C\u002Fb>\", styles[\"Normal\"]),\n        Paragraph(f\"{inv['invoice_id']} · Due {inv['due_date']}\", styles[\"Normal\"]),\n        Paragraph(inv[\"email\"], styles[\"Normal\"]),\n        Spacer(1, 6*mm),\n        tbl,\n        Spacer(1, 4*mm),\n        totals_tbl,\n        Spacer(1, 8*mm),\n        HRFlowable(width=\"100%\", thickness=0.5, color=BORDER),\n        Spacer(1, 2*mm),\n        Paragraph(\"Payment terms: 30 days. 
Bank transfer preferred.\", styles[\"Normal\"]),\n    ]\n    try:\n        doc.build(story, onFirstPage=_header_footer, onLaterPages=_header_footer)\n        return out\n    except Exception as exc:\n        raise RuntimeError(f\"ReportLab failed for {inv['invoice_id']}: {exc}\") from exc\n\n\nfor inv in invoices:\n    path = render_invoice_reportlab(inv, Path(\"invoices\"))\n    print(f\"Generated: {path}\")\n",[30,55255,55256,55260,55270,55280,55291,55301,55311,55316,55320,55330,55347,55351,55365,55379,55392,55406,55410,55423,55443,55470,55478,55512,55516,55524,55537,55562,55585,55589,55593,55603,55608,55621,55630,55660,55694,55731,55736,55740,55745,55773,55782,55800,55847,55862,55866,55875,55885,55919,55930,55934,55938,55968,55994,56024,56056,56088,56120,56156,56160,56164,56169,56178,56209,56261,56291,56295,56334,56339,56371,56403,56433,56463,56498,56502,56506,56514,56528,56543,56570,56610,56625,56639,56644,56658,56663,56677,56709,56723,56736,56740,56746,56767,56773,56783,56822,56826,56830,56840,56853],{"__ignoreMap":28},[33,55257,55258],{"class":35,"line":36},[33,55259,20289],{"class":39},[33,55261,55262,55264,55266,55268],{"class":35,"line":43},[33,55263,190],{"class":163},[33,55265,193],{"class":167},[33,55267,164],{"class":163},[33,55269,198],{"class":167},[33,55271,55272,55274,55276,55278],{"class":35,"line":61},[33,55273,190],{"class":163},[33,55275,19044],{"class":167},[33,55277,164],{"class":163},[33,55279,19049],{"class":167},[33,55281,55282,55284,55286,55288],{"class":35,"line":73},[33,55283,190],{"class":163},[33,55285,19080],{"class":167},[33,55287,164],{"class":163},[33,55289,55290],{"class":167}," 
mm\n",[33,55292,55293,55295,55297,55299],{"class":35,"line":88},[33,55294,190],{"class":163},[33,55296,19056],{"class":167},[33,55298,164],{"class":163},[33,55300,19061],{"class":167},[33,55302,55303,55305,55307,55309],{"class":35,"line":95},[33,55304,190],{"class":163},[33,55306,19092],{"class":167},[33,55308,164],{"class":163},[33,55310,1415],{"class":167},[33,55312,55313],{"class":35,"line":101},[33,55314,55315],{"class":167},"    SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, HRFlowable\n",[33,55317,55318],{"class":35,"line":171},[33,55319,221],{"class":167},[33,55321,55322,55324,55326,55328],{"class":35,"line":179},[33,55323,190],{"class":163},[33,55325,19068],{"class":167},[33,55327,164],{"class":163},[33,55329,19073],{"class":167},[33,55331,55332,55334,55337,55339,55342,55344],{"class":35,"line":187},[33,55333,190],{"class":163},[33,55335,55336],{"class":167}," reportlab.lib.enums ",[33,55338,164],{"class":163},[33,55340,55341],{"class":50}," TA_RIGHT",[33,55343,365],{"class":167},[33,55345,55346],{"class":50},"TA_CENTER\n",[33,55348,55349],{"class":35,"line":201},[33,55350,92],{"emptyLinePlaceholder":91},[33,55352,55353,55356,55358,55360,55363],{"class":35,"line":206},[33,55354,55355],{"class":50},"BLUE",[33,55357,212],{"class":163},[33,55359,19157],{"class":167},[33,55361,55362],{"class":54},"\"#2563eb\"",[33,55364,221],{"class":167},[33,55366,55367,55370,55372,55374,55377],{"class":35,"line":224},[33,55368,55369],{"class":50},"MUTED",[33,55371,212],{"class":163},[33,55373,19157],{"class":167},[33,55375,55376],{"class":54},"\"#475569\"",[33,55378,221],{"class":167},[33,55380,55381,55383,55385,55387,55390],{"class":35,"line":229},[33,55382,19181],{"class":50},[33,55384,212],{"class":163},[33,55386,19157],{"class":167},[33,55388,55389],{"class":54},"\"#e2e8f0\"",[33,55391,221],{"class":167},[33,55393,55394,55397,55399,55401,55404],{"class":35,"line":235},[33,55395,55396],{"class":50},"SOFT",[33,55398,212],{"class":163},[33,55400,19157],{"class":167}
,[33,55402,55403],{"class":54},"\"#f6f8fb\"",[33,55405,221],{"class":167},[33,55407,55408],{"class":35,"line":250},[33,55409,92],{"emptyLinePlaceholder":91},[33,55411,55412,55414,55417,55419,55421],{"class":35,"line":266},[33,55413,562],{"class":163},[33,55415,55416],{"class":46}," render_invoice_reportlab",[33,55418,55013],{"class":167},[33,55420,37100],{"class":50},[33,55422,55018],{"class":167},[33,55424,55425,55427,55429,55431,55433,55435,55437,55439,55441],{"class":35,"line":290},[33,55426,28258],{"class":167},[33,55428,869],{"class":238},[33,55430,242],{"class":163},[33,55432,855],{"class":50},[33,55434,365],{"class":167},[33,55436,878],{"class":238},[33,55438,242],{"class":163},[33,55440,855],{"class":50},[33,55442,221],{"class":167},[33,55444,55445,55447,55449,55451,55453,55455,55457,55459,55461,55463,55465,55467],{"class":35,"line":295},[33,55446,17989],{"class":167},[33,55448,242],{"class":163},[33,55450,40669],{"class":167},[33,55452,1351],{"class":163},[33,55454,1110],{"class":163},[33,55456,274],{"class":54},[33,55458,1115],{"class":50},[33,55460,55057],{"class":167},[33,55462,55060],{"class":54},[33,55464,9546],{"class":167},[33,55466,1121],{"class":50},[33,55468,55469],{"class":54},"_rl.pdf\"\n",[33,55471,55472,55474,55476],{"class":35,"line":300},[33,55473,19255],{"class":167},[33,55475,242],{"class":163},[33,55477,19260],{"class":167},[33,55479,55480,55483,55485,55488,55491,55493,55495,55497,55499,55501,55503,55505,55507,55510],{"class":35,"line":317},[33,55481,55482],{"class":167},"    right_s ",[33,55484,242],{"class":163},[33,55486,55487],{"class":167}," 
ParagraphStyle(",[33,55489,55490],{"class":54},"\"right_s\"",[33,55492,365],{"class":167},[33,55494,19280],{"class":238},[33,55496,242],{"class":163},[33,55498,19285],{"class":167},[33,55500,19348],{"class":54},[33,55502,8314],{"class":167},[33,55504,46396],{"class":238},[33,55506,242],{"class":163},[33,55508,55509],{"class":50},"TA_RIGHT",[33,55511,221],{"class":167},[33,55513,55514],{"class":35,"line":332},[33,55515,92],{"emptyLinePlaceholder":91},[33,55517,55518,55520,55522],{"class":35,"line":347},[33,55519,18224],{"class":167},[33,55521,242],{"class":163},[33,55523,20082],{"class":167},[33,55525,55526,55528,55531,55533,55535],{"class":35,"line":374},[33,55527,35596],{"class":50},[33,55529,55530],{"class":167},"(out), ",[33,55532,20091],{"class":238},[33,55534,242],{"class":163},[33,55536,20096],{"class":167},[33,55538,55539,55542,55544,55546,55548,55551,55553,55555,55557,55559],{"class":35,"line":397},[33,55540,55541],{"class":238},"        leftMargin",[33,55543,242],{"class":163},[33,55545,1646],{"class":50},[33,55547,1769],{"class":163},[33,55549,55550],{"class":167},"mm, ",[33,55552,20112],{"class":238},[33,55554,242],{"class":163},[33,55556,1646],{"class":50},[33,55558,1769],{"class":163},[33,55560,55561],{"class":167},"mm,\n",[33,55563,55564,55567,55569,55571,55573,55575,55577,55579,55581,55583],{"class":35,"line":653},[33,55565,55566],{"class":238},"        topMargin",[33,55568,242],{"class":163},[33,55570,2587],{"class":50},[33,55572,1769],{"class":163},[33,55574,55550],{"class":167},[33,55576,20137],{"class":238},[33,55578,242],{"class":163},[33,55580,2587],{"class":50},[33,55582,1769],{"class":163},[33,55584,55561],{"class":167},[33,55586,55587],{"class":35,"line":667},[33,55588,1202],{"class":167},[33,55590,55591],{"class":35,"line":675},[33,55592,92],{"emptyLinePlaceholder":91},[33,55594,55595,55597,55600],{"class":35,"line":689},[33,55596,1742],{"class":163},[33,55598,55599],{"class":46}," _header_footer",[33,55601,55602],{"class":167},"(canvas, 
doc):\n",[33,55604,55605],{"class":35,"line":703},[33,55606,55607],{"class":167},"        canvas.saveState()\n",[33,55609,55610,55613,55615,55617,55619],{"class":35,"line":714},[33,55611,55612],{"class":167},"        canvas.setFont(",[33,55614,28546],{"class":54},[33,55616,365],{"class":167},[33,55618,1179],{"class":50},[33,55620,221],{"class":167},[33,55622,55623,55626,55628],{"class":35,"line":723},[33,55624,55625],{"class":167},"        canvas.setFillColor(",[33,55627,55369],{"class":50},[33,55629,221],{"class":167},[33,55631,55632,55635,55637,55639,55642,55644,55646,55648,55651,55653,55655,55658],{"class":35,"line":754},[33,55633,55634],{"class":167},"        canvas.drawString(",[33,55636,1646],{"class":50},[33,55638,1769],{"class":163},[33,55640,55641],{"class":167},"mm, A4[",[33,55643,734],{"class":50},[33,55645,9546],{"class":167},[33,55647,4126],{"class":163},[33,55649,55650],{"class":50},"12",[33,55652,1769],{"class":163},[33,55654,55550],{"class":167},[33,55656,55657],{"class":54},"\"INVOICE\"",[33,55659,221],{"class":167},[33,55661,55662,55665,55667,55669,55671,55673,55675,55677,55679,55681,55683,55685,55687,55690,55692],{"class":35,"line":771},[33,55663,55664],{"class":167},"        canvas.drawRightString(A4[",[33,55666,748],{"class":50},[33,55668,9546],{"class":167},[33,55670,4126],{"class":163},[33,55672,1646],{"class":50},[33,55674,1769],{"class":163},[33,55676,55641],{"class":167},[33,55678,734],{"class":50},[33,55680,9546],{"class":167},[33,55682,4126],{"class":163},[33,55684,55650],{"class":50},[33,55686,1769],{"class":163},[33,55688,55689],{"class":167},"mm, inv[",[33,55691,27353],{"class":54},[33,55693,751],{"class":167},[33,55695,55696,55699,55701,55703,55705,55707,55709,55711,55713,55715,55717,55720,55722,55725,55727,55729],{"class":35,"line":777},[33,55697,55698],{"class":167},"        
canvas.drawCentredString(A4[",[33,55700,748],{"class":50},[33,55702,9546],{"class":167},[33,55704,1351],{"class":163},[33,55706,1533],{"class":50},[33,55708,365],{"class":167},[33,55710,3545],{"class":50},[33,55712,1769],{"class":163},[33,55714,55550],{"class":167},[33,55716,4059],{"class":163},[33,55718,55719],{"class":54},"\"Page ",[33,55721,1115],{"class":50},[33,55723,55724],{"class":167},"doc.page",[33,55726,1121],{"class":50},[33,55728,274],{"class":54},[33,55730,221],{"class":167},[33,55732,55733],{"class":35,"line":788},[33,55734,55735],{"class":167},"        canvas.restoreState()\n",[33,55737,55738],{"class":35,"line":804},[33,55739,92],{"emptyLinePlaceholder":91},[33,55741,55742],{"class":35,"line":809},[33,55743,55744],{"class":39},"    # Line-item table\n",[33,55746,55747,55749,55751,55753,55756,55758,55761,55763,55766,55768,55771],{"class":35,"line":819},[33,55748,24488],{"class":167},[33,55750,242],{"class":163},[33,55752,20349],{"class":167},[33,55754,55755],{"class":54},"\"Description\"",[33,55757,365],{"class":167},[33,55759,55760],{"class":54},"\"Qty\"",[33,55762,365],{"class":167},[33,55764,55765],{"class":54},"\"Rate\"",[33,55767,365],{"class":167},[33,55769,55770],{"class":54},"\"Total\"",[33,55772,44162],{"class":167},[33,55774,55775,55778,55780],{"class":35,"line":829},[33,55776,55777],{"class":167},"    item_rows ",[33,55779,242],{"class":163},[33,55781,7473],{"class":167},[33,55783,55784,55787,55790,55792,55794,55796,55798],{"class":35,"line":834},[33,55785,55786],{"class":167},"        [item[",[33,55788,55789],{"class":54},"\"desc\"",[33,55791,8314],{"class":167},[33,55793,1053],{"class":50},[33,55795,54229],{"class":167},[33,55797,54232],{"class":54},[33,55799,12871],{"class":167},[33,55801,55802,55805,55807,55809,55812,55815,55817,55820,55822,55824,55826,55828,55830,55832,55834,55837,55839,55841,55843,55845],{"class":35,"line":839},[33,55803,55804],{"class":163},"         
f",[33,55806,18820],{"class":54},[33,55808,1115],{"class":50},[33,55810,55811],{"class":167},"item[",[33,55813,55814],{"class":54},"'rate'",[33,55816,9546],{"class":167},[33,55818,55819],{"class":163},":.2f",[33,55821,1121],{"class":50},[33,55823,274],{"class":54},[33,55825,365],{"class":167},[33,55827,4059],{"class":163},[33,55829,18820],{"class":54},[33,55831,1115],{"class":50},[33,55833,55811],{"class":167},[33,55835,55836],{"class":54},"'total'",[33,55838,9546],{"class":167},[33,55840,55819],{"class":163},[33,55842,1121],{"class":50},[33,55844,274],{"class":54},[33,55846,9202],{"class":167},[33,55848,55849,55851,55853,55855,55858,55860],{"class":35,"line":860},[33,55850,5973],{"class":163},[33,55852,54203],{"class":167},[33,55854,662],{"class":163},[33,55856,55857],{"class":167}," inv[",[33,55859,54159],{"class":54},[33,55861,9202],{"class":167},[33,55863,55864],{"class":35,"line":887},[33,55865,19559],{"class":167},[33,55867,55868,55870,55872],{"class":35,"line":907},[33,55869,14864],{"class":167},[33,55871,242],{"class":163},[33,55873,55874],{"class":167}," Table(\n",[33,55876,55877,55880,55882],{"class":35,"line":1826},[33,55878,55879],{"class":167},"        header_row ",[33,55881,1811],{"class":163},[33,55883,55884],{"class":167}," item_rows,\n",[33,55886,55887,55890,55892,55894,55896,55898,55900,55902,55904,55906,55908,55910,55912,55914,55916],{"class":35,"line":1844},[33,55888,55889],{"class":238},"        colWidths",[33,55891,242],{"class":163},[33,55893,8309],{"class":167},[33,55895,16357],{"class":50},[33,55897,1769],{"class":163},[33,55899,55550],{"class":167},[33,55901,2587],{"class":50},[33,55903,1769],{"class":163},[33,55905,55550],{"class":167},[33,55907,1543],{"class":50},[33,55909,1769],{"class":163},[33,55911,55550],{"class":167},[33,55913,1543],{"class":50},[33,55915,1769],{"class":163},[33,55917,55918],{"class":167},"mm],\n",[33,55920,55921,55924,55926,55928],{"class":35,"line":1858},[33,55922,55923],{"class":238},"        
repeatRows",[33,55925,242],{"class":163},[33,55927,734],{"class":50},[33,55929,247],{"class":167},[33,55931,55932],{"class":35,"line":1871},[33,55933,1202],{"class":167},[33,55935,55936],{"class":35,"line":1877},[33,55937,19814],{"class":167},[33,55939,55940,55942,55944,55946,55948,55950,55952,55954,55956,55958,55960,55962,55964,55966],{"class":35,"line":1883},[33,55941,19819],{"class":167},[33,55943,19822],{"class":54},[33,55945,19953],{"class":167},[33,55947,748],{"class":50},[33,55949,365],{"class":167},[33,55951,748],{"class":50},[33,55953,19834],{"class":167},[33,55955,4126],{"class":163},[33,55957,734],{"class":50},[33,55959,365],{"class":167},[33,55961,748],{"class":50},[33,55963,18525],{"class":167},[33,55965,55355],{"class":50},[33,55967,1506],{"class":167},[33,55969,55970,55972,55974,55976,55978,55980,55982,55984,55986,55988,55990,55992],{"class":35,"line":1915},[33,55971,19819],{"class":167},[33,55973,19855],{"class":54},[33,55975,19953],{"class":167},[33,55977,748],{"class":50},[33,55979,365],{"class":167},[33,55981,748],{"class":50},[33,55983,19834],{"class":167},[33,55985,4126],{"class":163},[33,55987,734],{"class":50},[33,55989,365],{"class":167},[33,55991,748],{"class":50},[33,55993,19877],{"class":167},[33,55995,55996,55998,56000,56002,56004,56006,56008,56010,56012,56014,56016,56018,56020,56022],{"class":35,"line":1926},[33,55997,19819],{"class":167},[33,55999,19884],{"class":54},[33,56001,19953],{"class":167},[33,56003,748],{"class":50},[33,56005,365],{"class":167},[33,56007,748],{"class":50},[33,56009,19834],{"class":167},[33,56011,4126],{"class":163},[33,56013,734],{"class":50},[33,56015,365],{"class":167},[33,56017,748],{"class":50},[33,56019,18525],{"class":167},[33,56021,19908],{"class":54},[33,56023,1506],{"class":167},[33,56025,56026,56028,56030,56032,56034,56036,56038,56040,56042,56044,56046,56048,56050,56052,56054],{"class":35,"line":1932},[33,56027,19819],{"class":167},[33,56029,19917],{"class":54},[33,56031,19953],{"class":167},[33,56033
,748],{"class":50},[33,56035,365],{"class":167},[33,56037,748],{"class":50},[33,56039,19834],{"class":167},[33,56041,4126],{"class":163},[33,56043,734],{"class":50},[33,56045,365],{"class":167},[33,56047,4126],{"class":163},[33,56049,734],{"class":50},[33,56051,18525],{"class":167},[33,56053,2577],{"class":50},[33,56055,1506],{"class":167},[33,56057,56058,56060,56062,56064,56066,56068,56070,56072,56074,56076,56078,56080,56082,56084,56086],{"class":35,"line":1938},[33,56059,19819],{"class":167},[33,56061,20024],{"class":54},[33,56063,19953],{"class":167},[33,56065,734],{"class":50},[33,56067,365],{"class":167},[33,56069,748],{"class":50},[33,56071,19834],{"class":167},[33,56073,4126],{"class":163},[33,56075,734],{"class":50},[33,56077,365],{"class":167},[33,56079,4126],{"class":163},[33,56081,734],{"class":50},[33,56083,18525],{"class":167},[33,56085,20050],{"class":54},[33,56087,1506],{"class":167},[33,56089,56090,56092,56094,56096,56098,56100,56102,56104,56106,56108,56110,56112,56114,56116,56118],{"class":35,"line":1950},[33,56091,19819],{"class":167},[33,56093,19950],{"class":54},[33,56095,19953],{"class":167},[33,56097,748],{"class":50},[33,56099,365],{"class":167},[33,56101,734],{"class":50},[33,56103,19834],{"class":167},[33,56105,4126],{"class":163},[33,56107,734],{"class":50},[33,56109,365],{"class":167},[33,56111,4126],{"class":163},[33,56113,734],{"class":50},[33,56115,19974],{"class":167},[33,56117,55396],{"class":50},[33,56119,12871],{"class":167},[33,56121,56122,56124,56126,56128,56130,56132,56134,56136,56138,56140,56142,56144,56146,56148,56150,56152,56154],{"class":35,"line":1958},[33,56123,19819],{"class":167},[33,56125,19985],{"class":54},[33,56127,19953],{"class":167},[33,56129,748],{"class":50},[33,56131,365],{"class":167},[33,56133,748],{"class":50},[33,56135,19834],{"class":167},[33,56137,4126],{"class":163},[33,56139,734],{"class":50},[33,56141,365],{"class":167},[33,56143,4126],{"class":163},[33,56145,734],{"class":50},[33,56147,18525],{"class":
167},[33,56149,20011],{"class":50},[33,56151,365],{"class":167},[33,56153,19181],{"class":50},[33,56155,1506],{"class":167},[33,56157,56158],{"class":35,"line":4904},[33,56159,20057],{"class":167},[33,56161,56162],{"class":35,"line":4909},[33,56163,92],{"emptyLinePlaceholder":91},[33,56165,56166],{"class":35,"line":4915},[33,56167,56168],{"class":39},"    # Totals block as a narrow right-aligned table\n",[33,56170,56171,56174,56176],{"class":35,"line":4925},[33,56172,56173],{"class":167},"    totals_data ",[33,56175,242],{"class":163},[33,56177,7473],{"class":167},[33,56179,56180,56183,56186,56188,56190,56192,56194,56196,56199,56201,56203,56205,56207],{"class":35,"line":4935},[33,56181,56182],{"class":167},"        [",[33,56184,56185],{"class":54},"\"Subtotal\"",[33,56187,365],{"class":167},[33,56189,4059],{"class":163},[33,56191,18820],{"class":54},[33,56193,1115],{"class":50},[33,56195,55057],{"class":167},[33,56197,56198],{"class":54},"'subtotal'",[33,56200,9546],{"class":167},[33,56202,55819],{"class":163},[33,56204,1121],{"class":50},[33,56206,274],{"class":54},[33,56208,8935],{"class":167},[33,56210,56211,56213,56215,56218,56220,56223,56226,56228,56230,56232,56234,56236,56238,56240,56242,56244,56246,56248,56251,56253,56255,56257,56259],{"class":35,"line":4941},[33,56212,56182],{"class":167},[33,56214,4059],{"class":163},[33,56216,56217],{"class":54},"\"Tax 
(",[33,56219,18790],{"class":50},[33,56221,56222],{"class":167},"(inv[",[33,56224,56225],{"class":54},"'tax_rate'",[33,56227,9546],{"class":167},[33,56229,1769],{"class":163},[33,56231,2650],{"class":50},[33,56233,12027],{"class":167},[33,56235,1121],{"class":50},[33,56237,40742],{"class":54},[33,56239,365],{"class":167},[33,56241,4059],{"class":163},[33,56243,18820],{"class":54},[33,56245,1115],{"class":50},[33,56247,55057],{"class":167},[33,56249,56250],{"class":54},"'tax_amount'",[33,56252,9546],{"class":167},[33,56254,55819],{"class":163},[33,56256,1121],{"class":50},[33,56258,274],{"class":54},[33,56260,8935],{"class":167},[33,56262,56263,56265,56268,56270,56272,56274,56276,56278,56281,56283,56285,56287,56289],{"class":35,"line":4950},[33,56264,56182],{"class":167},[33,56266,56267],{"class":54},"\"Total Due\"",[33,56269,365],{"class":167},[33,56271,4059],{"class":163},[33,56273,18820],{"class":54},[33,56275,1115],{"class":50},[33,56277,55057],{"class":167},[33,56279,56280],{"class":54},"'grand_total'",[33,56282,9546],{"class":167},[33,56284,55819],{"class":163},[33,56286,1121],{"class":50},[33,56288,274],{"class":54},[33,56290,8935],{"class":167},[33,56292,56293],{"class":35,"line":4960},[33,56294,19559],{"class":167},[33,56296,56297,56300,56302,56305,56307,56309,56311,56313,56315,56317,56320,56322,56325,56328,56330,56332],{"class":35,"line":4965},[33,56298,56299],{"class":167},"    totals_tbl ",[33,56301,242],{"class":163},[33,56303,56304],{"class":167}," Table(totals_data, ",[33,56306,19795],{"class":238},[33,56308,242],{"class":163},[33,56310,8309],{"class":167},[33,56312,26323],{"class":50},[33,56314,1769],{"class":163},[33,56316,55550],{"class":167},[33,56318,56319],{"class":50},"35",[33,56321,1769],{"class":163},[33,56323,56324],{"class":167},"mm], ",[33,56326,56327],{"class":238},"hAlign",[33,56329,242],{"class":163},[33,56331,20050],{"class":54},[33,56333,221],{"class":167},[33,56335,56336],{"class":35,"line":4971},[33,56337,56338],{"class":167},"    
totals_tbl.setStyle(TableStyle([\n",[33,56340,56341,56343,56345,56347,56349,56351,56353,56355,56357,56359,56361,56363,56365,56367,56369],{"class":35,"line":4983},[33,56342,19819],{"class":167},[33,56344,19917],{"class":54},[33,56346,19953],{"class":167},[33,56348,748],{"class":50},[33,56350,365],{"class":167},[33,56352,748],{"class":50},[33,56354,19834],{"class":167},[33,56356,4126],{"class":163},[33,56358,734],{"class":50},[33,56360,365],{"class":167},[33,56362,4126],{"class":163},[33,56364,734],{"class":50},[33,56366,18525],{"class":167},[33,56368,2577],{"class":50},[33,56370,1506],{"class":167},[33,56372,56373,56375,56377,56379,56381,56383,56385,56387,56389,56391,56393,56395,56397,56399,56401],{"class":35,"line":4988},[33,56374,19819],{"class":167},[33,56376,20024],{"class":54},[33,56378,19953],{"class":167},[33,56380,734],{"class":50},[33,56382,365],{"class":167},[33,56384,748],{"class":50},[33,56386,19834],{"class":167},[33,56388,4126],{"class":163},[33,56390,734],{"class":50},[33,56392,365],{"class":167},[33,56394,4126],{"class":163},[33,56396,734],{"class":50},[33,56398,18525],{"class":167},[33,56400,20050],{"class":54},[33,56402,1506],{"class":167},[33,56404,56405,56407,56409,56411,56413,56415,56417,56419,56421,56423,56425,56427,56429,56431],{"class":35,"line":4993},[33,56406,19819],{"class":167},[33,56408,19884],{"class":54},[33,56410,19953],{"class":167},[33,56412,748],{"class":50},[33,56414,365],{"class":167},[33,56416,1533],{"class":50},[33,56418,19834],{"class":167},[33,56420,4126],{"class":163},[33,56422,734],{"class":50},[33,56424,365],{"class":167},[33,56426,1533],{"class":50},[33,56428,18525],{"class":167},[33,56430,19908],{"class":54},[33,56432,1506],{"class":167},[33,56434,56435,56437,56439,56441,56443,56445,56447,56449,56451,56453,56455,56457,56459,56461],{"class":35,"line":5003},[33,56436,19819],{"class":167},[33,56438,19855],{"class":54},[33,56440,19953],{"class":167},[33,56442,748],{"class":50},[33,56444,365],{"class":167},[33,56446,1533],{"cl
ass":50},[33,56448,19834],{"class":167},[33,56450,4126],{"class":163},[33,56452,734],{"class":50},[33,56454,365],{"class":167},[33,56456,1533],{"class":50},[33,56458,18525],{"class":167},[33,56460,55355],{"class":50},[33,56462,1506],{"class":167},[33,56464,56465,56467,56470,56472,56474,56476,56478,56480,56482,56484,56486,56488,56490,56492,56494,56496],{"class":35,"line":5008},[33,56466,19819],{"class":167},[33,56468,56469],{"class":54},"\"LINEABOVE\"",[33,56471,19953],{"class":167},[33,56473,748],{"class":50},[33,56475,365],{"class":167},[33,56477,1533],{"class":50},[33,56479,19834],{"class":167},[33,56481,4126],{"class":163},[33,56483,734],{"class":50},[33,56485,365],{"class":167},[33,56487,1533],{"class":50},[33,56489,18525],{"class":167},[33,56491,20011],{"class":50},[33,56493,365],{"class":167},[33,56495,19181],{"class":50},[33,56497,1506],{"class":167},[33,56499,56500],{"class":35,"line":5014},[33,56501,20057],{"class":167},[33,56503,56504],{"class":35,"line":5019},[33,56505,92],{"emptyLinePlaceholder":91},[33,56507,56508,56510,56512],{"class":35,"line":5032},[33,56509,19444],{"class":167},[33,56511,242],{"class":163},[33,56513,7473],{"class":167},[33,56515,56516,56518,56520,56523,56526],{"class":35,"line":5039},[33,56517,19453],{"class":167},[33,56519,9185],{"class":54},[33,56521,56522],{"class":167},", 
styles[",[33,56524,56525],{"class":54},"\"h1\"",[33,56527,12871],{"class":167},[33,56529,56530,56532,56534,56536,56538,56540],{"class":35,"line":5068},[33,56531,19542],{"class":167},[33,56533,734],{"class":50},[33,56535,365],{"class":167},[33,56537,1533],{"class":50},[33,56539,1769],{"class":163},[33,56541,56542],{"class":167},"mm),\n",[33,56544,56545,56547,56549,56552,56554,56556,56558,56560,56562,56564,56566,56568],{"class":35,"line":5077},[33,56546,19453],{"class":167},[33,56548,4059],{"class":163},[33,56550,56551],{"class":54},"\"\u003Cb>",[33,56553,1115],{"class":50},[33,56555,55057],{"class":167},[33,56557,54556],{"class":54},[33,56559,9546],{"class":167},[33,56561,1121],{"class":50},[33,56563,19525],{"class":54},[33,56565,56522],{"class":167},[33,56567,19348],{"class":54},[33,56569,12871],{"class":167},[33,56571,56572,56574,56576,56578,56580,56582,56584,56586,56588,56591,56593,56595,56598,56600,56602,56604,56606,56608],{"class":35,"line":5082},[33,56573,19453],{"class":167},[33,56575,4059],{"class":163},[33,56577,274],{"class":54},[33,56579,1115],{"class":50},[33,56581,55057],{"class":167},[33,56583,55060],{"class":54},[33,56585,9546],{"class":167},[33,56587,1121],{"class":50},[33,56589,56590],{"class":54}," · Due ",[33,56592,1115],{"class":50},[33,56594,55057],{"class":167},[33,56596,56597],{"class":54},"'due_date'",[33,56599,9546],{"class":167},[33,56601,1121],{"class":50},[33,56603,274],{"class":54},[33,56605,56522],{"class":167},[33,56607,19348],{"class":54},[33,56609,12871],{"class":167},[33,56611,56612,56615,56618,56621,56623],{"class":35,"line":5089},[33,56613,56614],{"class":167},"        Paragraph(inv[",[33,56616,56617],{"class":54},"\"email\"",[33,56619,56620],{"class":167},"], 
styles[",[33,56622,19348],{"class":54},[33,56624,12871],{"class":167},[33,56626,56627,56629,56631,56633,56635,56637],{"class":35,"line":5098},[33,56628,19542],{"class":167},[33,56630,734],{"class":50},[33,56632,365],{"class":167},[33,56634,2681],{"class":50},[33,56636,1769],{"class":163},[33,56638,56542],{"class":167},[33,56640,56641],{"class":35,"line":5105},[33,56642,56643],{"class":167},"        tbl,\n",[33,56645,56646,56648,56650,56652,56654,56656],{"class":35,"line":5110},[33,56647,19542],{"class":167},[33,56649,734],{"class":50},[33,56651,365],{"class":167},[33,56653,1503],{"class":50},[33,56655,1769],{"class":163},[33,56657,56542],{"class":167},[33,56659,56660],{"class":35,"line":5115},[33,56661,56662],{"class":167},"        totals_tbl,\n",[33,56664,56665,56667,56669,56671,56673,56675],{"class":35,"line":5128},[33,56666,19542],{"class":167},[33,56668,734],{"class":50},[33,56670,365],{"class":167},[33,56672,2591],{"class":50},[33,56674,1769],{"class":163},[33,56676,56542],{"class":167},[33,56678,56679,56682,56685,56687,56690,56692,56695,56697,56699,56701,56703,56705,56707],{"class":35,"line":5135},[33,56680,56681],{"class":167},"        HRFlowable(",[33,56683,56684],{"class":238},"width",[33,56686,242],{"class":163},[33,56688,56689],{"class":54},"\"100%\"",[33,56691,365],{"class":167},[33,56693,56694],{"class":238},"thickness",[33,56696,242],{"class":163},[33,56698,20011],{"class":50},[33,56700,365],{"class":167},[33,56702,17245],{"class":238},[33,56704,242],{"class":163},[33,56706,19181],{"class":50},[33,56708,1506],{"class":167},[33,56710,56711,56713,56715,56717,56719,56721],{"class":35,"line":5142},[33,56712,19542],{"class":167},[33,56714,734],{"class":50},[33,56716,365],{"class":167},[33,56718,1533],{"class":50},[33,56720,1769],{"class":163},[33,56722,56542],{"class":167},[33,56724,56725,56727,56730,56732,56734],{"class":35,"line":5151},[33,56726,19453],{"class":167},[33,56728,56729],{"class":54},"\"Payment terms: 30 days. 
Bank transfer preferred.\"",[33,56731,56522],{"class":167},[33,56733,19348],{"class":54},[33,56735,12871],{"class":167},[33,56737,56738],{"class":35,"line":5156},[33,56739,19559],{"class":167},[33,56741,56742,56744],{"class":35,"line":5161},[33,56743,2424],{"class":163},[33,56745,574],{"class":167},[33,56747,56748,56751,56754,56756,56759,56762,56764],{"class":35,"line":5167},[33,56749,56750],{"class":167},"        doc.build(story, ",[33,56752,56753],{"class":238},"onFirstPage",[33,56755,242],{"class":163},[33,56757,56758],{"class":167},"_header_footer, ",[33,56760,56761],{"class":238},"onLaterPages",[33,56763,242],{"class":163},[33,56765,56766],{"class":167},"_header_footer)\n",[33,56768,56769,56771],{"class":35,"line":5172},[33,56770,1659],{"class":163},[33,56772,55140],{"class":167},[33,56774,56775,56777,56779,56781],{"class":35,"line":5182},[33,56776,2449],{"class":163},[33,56778,783],{"class":50},[33,56780,1852],{"class":163},[33,56782,1855],{"class":167},[33,56784,56785,56787,56789,56791,56793,56796,56798,56800,56802,56804,56806,56808,56810,56812,56814,56816,56818,56820],{"class":35,"line":5195},[33,56786,4051],{"class":163},[33,56788,7590],{"class":50},[33,56790,602],{"class":167},[33,56792,4059],{"class":163},[33,56794,56795],{"class":54},"\"ReportLab failed for 
",[33,56797,1115],{"class":50},[33,56799,55057],{"class":167},[33,56801,55060],{"class":54},[33,56803,9546],{"class":167},[33,56805,1121],{"class":50},[33,56807,2079],{"class":54},[33,56809,1115],{"class":50},[33,56811,6565],{"class":167},[33,56813,1121],{"class":50},[33,56815,274],{"class":54},[33,56817,1649],{"class":167},[33,56819,190],{"class":163},[33,56821,20843],{"class":167},[33,56823,56824],{"class":35,"line":5200},[33,56825,92],{"emptyLinePlaceholder":91},[33,56827,56828],{"class":35,"line":5205},[33,56829,92],{"emptyLinePlaceholder":91},[33,56831,56832,56834,56836,56838],{"class":35,"line":5210},[33,56833,6124],{"class":163},[33,56835,55203],{"class":167},[33,56837,662],{"class":163},[33,56839,55208],{"class":167},[33,56841,56842,56844,56846,56849,56851],{"class":35,"line":5215},[33,56843,17306],{"class":167},[33,56845,242],{"class":163},[33,56847,56848],{"class":167}," render_invoice_reportlab(inv, Path(",[33,56850,55220],{"class":54},[33,56852,371],{"class":167},[33,56854,56855,56857,56859,56861,56863,56865,56867,56869,56871],{"class":35,"line":5220},[33,56856,7268],{"class":50},[33,56858,602],{"class":167},[33,56860,4059],{"class":163},[33,56862,55233],{"class":54},[33,56864,1115],{"class":50},[33,56866,2580],{"class":167},[33,56868,1121],{"class":50},[33,56870,274],{"class":54},[33,56872,221],{"class":167},[18,56874,56875],{"id":35801},"Variant fixes",[424,56877,56879],{"id":56878},"accented-names-and-currency-symbols-crash-reportlab","Accented names and currency symbols crash ReportLab",[14,56881,56882],{},"The default Helvetica core font is Type 1 and lacks many Unicode glyphs. 
Register a TrueType font before building the document:",[23,56884,56886],{"className":126,"code":56885,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom reportlab.pdfbase import pdfmetrics\nfrom reportlab.pdfbase.ttfonts import TTFont\nfrom pathlib import Path\n\n# Download: https:\u002F\u002Fgithub.com\u002Fdejavu-fonts\u002Fdejavu-fonts\u002Freleases\nfont_path = Path(\"\u002Fusr\u002Fshare\u002Ffonts\u002Ftruetype\u002Fdejavu\u002FDejaVuSans.ttf\")\npdfmetrics.registerFont(TTFont(\"DejaVuSans\", str(font_path)))   # register once at module level\n# Then use \"DejaVuSans\" wherever you previously used \"Helvetica\"\n",[30,56887,56888,56892,56904,56916,56926,56930,56935,56949,56967],{"__ignoreMap":28},[33,56889,56890],{"class":35,"line":36},[33,56891,20289],{"class":39},[33,56893,56894,56896,56899,56901],{"class":35,"line":43},[33,56895,190],{"class":163},[33,56897,56898],{"class":167}," reportlab.pdfbase ",[33,56900,164],{"class":163},[33,56902,56903],{"class":167}," pdfmetrics\n",[33,56905,56906,56908,56911,56913],{"class":35,"line":61},[33,56907,190],{"class":163},[33,56909,56910],{"class":167}," reportlab.pdfbase.ttfonts ",[33,56912,164],{"class":163},[33,56914,56915],{"class":167}," TTFont\n",[33,56917,56918,56920,56922,56924],{"class":35,"line":73},[33,56919,190],{"class":163},[33,56921,193],{"class":167},[33,56923,164],{"class":163},[33,56925,198],{"class":167},[33,56927,56928],{"class":35,"line":88},[33,56929,92],{"emptyLinePlaceholder":91},[33,56931,56932],{"class":35,"line":95},[33,56933,56934],{"class":39},"# Download: https:\u002F\u002Fgithub.com\u002Fdejavu-fonts\u002Fdejavu-fonts\u002Freleases\n",[33,56936,56937,56940,56942,56944,56947],{"class":35,"line":101},[33,56938,56939],{"class":167},"font_path 
",[33,56941,242],{"class":163},[33,56943,215],{"class":167},[33,56945,56946],{"class":54},"\"\u002Fusr\u002Fshare\u002Ffonts\u002Ftruetype\u002Fdejavu\u002FDejaVuSans.ttf\"",[33,56948,221],{"class":167},[33,56950,56951,56954,56957,56959,56961,56964],{"class":35,"line":171},[33,56952,56953],{"class":167},"pdfmetrics.registerFont(TTFont(",[33,56955,56956],{"class":54},"\"DejaVuSans\"",[33,56958,365],{"class":167},[33,56960,1053],{"class":50},[33,56962,56963],{"class":167},"(font_path)))   ",[33,56965,56966],{"class":39},"# register once at module level\n",[33,56968,56969],{"class":35,"line":179},[33,56970,56971],{"class":39},"# Then use \"DejaVuSans\" wherever you previously used \"Helvetica\"\n",[14,56973,6571,56974,56976,56977,42706,56979,3035],{},[940,56975,28608],{"href":28607}," for a full walkthrough including CID fonts and ",[30,56978,53907],{},[30,56980,56981],{},"drawString",[424,56983,56985],{"id":56984},"batch-generation-with-error-isolation","Batch generation with error isolation",[14,56987,56988],{},"Isolate per-invoice failures so one bad record does not abort the run:",[23,56990,56992],{"className":126,"code":56991,"language":47,"meta":28,"style":28},"# pip install reportlab weasyprint jinja2 pandas\nfrom pathlib import Path\n\nresults = {\"ok\": [], \"failed\": []}\n\nfor inv in invoices:\n    try:\n        path = render_invoice_weasyprint(inv, Path(\"invoices\"))\n        results[\"ok\"].append(inv[\"invoice_id\"])\n    except Exception as exc:\n        results[\"failed\"].append({\"id\": inv[\"invoice_id\"], \"error\": str(exc)})\n\nprint(f\"Generated {len(results['ok'])}, failed {len(results['failed'])}\")\nfor failure in results[\"failed\"]:\n    print(f\"  FAILED {failure['id']}: {failure['error']}\")\n",[30,56993,56994,56999,57009,57013,57033,57037,57047,57053,57066,57080,57090,57118,57122,57163,57179],{"__ignoreMap":28},[33,56995,56996],{"class":35,"line":36},[33,56997,56998],{"class":39},"# pip install reportlab weasyprint jinja2 
pandas\n",[33,57000,57001,57003,57005,57007],{"class":35,"line":43},[33,57002,190],{"class":163},[33,57004,193],{"class":167},[33,57006,164],{"class":163},[33,57008,198],{"class":167},[33,57010,57011],{"class":35,"line":61},[33,57012,92],{"emptyLinePlaceholder":91},[33,57014,57015,57018,57020,57022,57025,57027,57030],{"class":35,"line":73},[33,57016,57017],{"class":167},"results ",[33,57019,242],{"class":163},[33,57021,4098],{"class":167},[33,57023,57024],{"class":54},"\"ok\"",[33,57026,49336],{"class":167},[33,57028,57029],{"class":54},"\"failed\"",[33,57031,57032],{"class":167},": []}\n",[33,57034,57035],{"class":35,"line":88},[33,57036,92],{"emptyLinePlaceholder":91},[33,57038,57039,57041,57043,57045],{"class":35,"line":95},[33,57040,6124],{"class":163},[33,57042,55203],{"class":167},[33,57044,662],{"class":163},[33,57046,55208],{"class":167},[33,57048,57049,57051],{"class":35,"line":101},[33,57050,2424],{"class":163},[33,57052,574],{"class":167},[33,57054,57055,57058,57060,57062,57064],{"class":35,"line":171},[33,57056,57057],{"class":167},"        path ",[33,57059,242],{"class":163},[33,57061,55217],{"class":167},[33,57063,55220],{"class":54},[33,57065,371],{"class":167},[33,57067,57068,57071,57073,57076,57078],{"class":35,"line":179},[33,57069,57070],{"class":167},"        results[",[33,57072,57024],{"class":54},[33,57074,57075],{"class":167},"].append(inv[",[33,57077,27353],{"class":54},[33,57079,751],{"class":167},[33,57081,57082,57084,57086,57088],{"class":35,"line":187},[33,57083,2449],{"class":163},[33,57085,783],{"class":50},[33,57087,1852],{"class":163},[33,57089,1855],{"class":167},[33,57091,57092,57094,57096,57099,57102,57105,57107,57109,57111,57113,57115],{"class":35,"line":201},[33,57093,57070],{"class":167},[33,57095,57029],{"class":54},[33,57097,57098],{"class":167},"].append({",[33,57100,57101],{"class":54},"\"id\"",[33,57103,57104],{"class":167},": 
inv[",[33,57106,27353],{"class":54},[33,57108,8314],{"class":167},[33,57110,37333],{"class":54},[33,57112,2079],{"class":167},[33,57114,1053],{"class":50},[33,57116,57117],{"class":167},"(exc)})\n",[33,57119,57120],{"class":35,"line":206},[33,57121,92],{"emptyLinePlaceholder":91},[33,57123,57124,57126,57128,57130,57133,57135,57138,57141,57143,57145,57148,57150,57152,57155,57157,57159,57161],{"class":35,"line":224},[33,57125,13474],{"class":50},[33,57127,602],{"class":167},[33,57129,4059],{"class":163},[33,57131,57132],{"class":54},"\"Generated ",[33,57134,4065],{"class":50},[33,57136,57137],{"class":167},"(results[",[33,57139,57140],{"class":54},"'ok'",[33,57142,18798],{"class":167},[33,57144,1121],{"class":50},[33,57146,57147],{"class":54},", failed ",[33,57149,4065],{"class":50},[33,57151,57137],{"class":167},[33,57153,57154],{"class":54},"'failed'",[33,57156,18798],{"class":167},[33,57158,1121],{"class":50},[33,57160,274],{"class":54},[33,57162,221],{"class":167},[33,57164,57165,57167,57170,57172,57175,57177],{"class":35,"line":229},[33,57166,6124],{"class":163},[33,57168,57169],{"class":167}," failure ",[33,57171,662],{"class":163},[33,57173,57174],{"class":167}," results[",[33,57176,57029],{"class":54},[33,57178,17477],{"class":167},[33,57180,57181,57183,57185,57187,57190,57192,57195,57198,57200,57202,57204,57206,57208,57211,57213,57215,57217],{"class":35,"line":235},[33,57182,7268],{"class":50},[33,57184,602],{"class":167},[33,57186,4059],{"class":163},[33,57188,57189],{"class":54},"\"  FAILED ",[33,57191,1115],{"class":50},[33,57193,57194],{"class":167},"failure[",[33,57196,57197],{"class":54},"'id'",[33,57199,9546],{"class":167},[33,57201,1121],{"class":50},[33,57203,2079],{"class":54},[33,57205,1115],{"class":50},[33,57207,57194],{"class":167},[33,57209,57210],{"class":54},"'error'",[33,57212,9546],{"class":167},[33,57214,1121],{"class":50},[33,57216,274],{"class":54},[33,57218,221],{"class":167},[14,57220,57221,57222,3035],{},"Once per-customer PDFs are 
generated, assemble them into a single batch delivery file using ",[940,57223,52682],{"href":52681},[424,57225,57227],{"id":57226},"read-invoice-data-from-excel-instead-of-csv","Read invoice data from Excel instead of CSV",[14,57229,57230,57231,57234,57235,42238,57238,20891],{},"Data from ",[940,57232,57233],{"href":6935},"pandas-based Excel pipelines"," works identically — swap ",[30,57236,57237],{},"read_csv",[30,57239,57240],{},"read_excel",[23,57242,57244],{"className":126,"code":57243,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.read_excel(Path(\"invoices.xlsx\"), engine=\"openpyxl\")\ndf.columns = df.columns.str.strip().str.lower()\n",[30,57245,57246,57250,57260,57270,57274,57296],{"__ignoreMap":28},[33,57247,57248],{"class":35,"line":36},[33,57249,3952],{"class":39},[33,57251,57252,57254,57256,57258],{"class":35,"line":43},[33,57253,164],{"class":163},[33,57255,492],{"class":167},[33,57257,495],{"class":163},[33,57259,498],{"class":167},[33,57261,57262,57264,57266,57268],{"class":35,"line":61},[33,57263,190],{"class":163},[33,57265,193],{"class":167},[33,57267,164],{"class":163},[33,57269,198],{"class":167},[33,57271,57272],{"class":35,"line":73},[33,57273,92],{"emptyLinePlaceholder":91},[33,57275,57276,57278,57280,57283,57286,57288,57290,57292,57294],{"class":35,"line":88},[33,57277,13459],{"class":167},[33,57279,242],{"class":163},[33,57281,57282],{"class":167}," pd.read_excel(Path(",[33,57284,57285],{"class":54},"\"invoices.xlsx\"",[33,57287,18525],{"class":167},[33,57289,17351],{"class":238},[33,57291,242],{"class":163},[33,57293,17356],{"class":54},[33,57295,221],{"class":167},[33,57297,57298,57301,57303],{"class":35,"line":95},[33,57299,57300],{"class":167},"df.columns ",[33,57302,242],{"class":163},[33,57304,54121],{"class":167},[18,57306,9247],{"id":9246},[23,57308,57310],{"className":126,"code":57309,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib 
import Path\nfrom pypdf import PdfReader\n\ndef verify_invoice_pdf(path: Path, expected_customer: str) -> None:\n    reader = PdfReader(str(path))\n    assert len(reader.pages) >= 1, f\"{path.name}: no pages\"\n    text = \" \".join(p.extract_text() or \"\" for p in reader.pages)\n    assert expected_customer in text, (\n        f\"{path.name}: customer name '{expected_customer}' not found in PDF text\"\n    )\n    print(f\"OK: {path.name}\")\n\nverify_invoice_pdf(Path(\"invoices\u002FINV-001.pdf\"), \"Acme Corp\")\nverify_invoice_pdf(Path(\"invoices\u002FINV-002.pdf\"), \"Béta SARL\")\n",[30,57311,57312,57317,57327,57339,57343,57361,57375,57404,57429,57441,57466,57470,57491,57495,57510],{"__ignoreMap":28},[33,57313,57314],{"class":35,"line":36},[33,57315,57316],{"class":39},"# pip install pypdf\n",[33,57318,57319,57321,57323,57325],{"class":35,"line":43},[33,57320,190],{"class":163},[33,57322,193],{"class":167},[33,57324,164],{"class":163},[33,57326,198],{"class":167},[33,57328,57329,57331,57334,57336],{"class":35,"line":61},[33,57330,190],{"class":163},[33,57332,57333],{"class":167}," pypdf ",[33,57335,164],{"class":163},[33,57337,57338],{"class":167}," PdfReader\n",[33,57340,57341],{"class":35,"line":73},[33,57342,92],{"emptyLinePlaceholder":91},[33,57344,57345,57347,57350,57353,57355,57357,57359],{"class":35,"line":88},[33,57346,562],{"class":163},[33,57348,57349],{"class":46}," verify_invoice_pdf",[33,57351,57352],{"class":167},"(path: Path, expected_customer: ",[33,57354,1053],{"class":50},[33,57356,1617],{"class":167},[33,57358,571],{"class":50},[33,57360,574],{"class":167},[33,57362,57363,57366,57368,57371,57373],{"class":35,"line":95},[33,57364,57365],{"class":167},"    reader ",[33,57367,242],{"class":163},[33,57369,57370],{"class":167}," 
PdfReader(",[33,57372,1053],{"class":50},[33,57374,21248],{"class":167},[33,57376,57377,57379,57381,57384,57386,57388,57390,57392,57394,57396,57399,57401],{"class":35,"line":101},[33,57378,9228],{"class":163},[33,57380,4037],{"class":50},[33,57382,57383],{"class":167},"(reader.pages) ",[33,57385,43000],{"class":163},[33,57387,1814],{"class":50},[33,57389,365],{"class":167},[33,57391,4059],{"class":163},[33,57393,274],{"class":54},[33,57395,1115],{"class":50},[33,57397,57398],{"class":167},"path.name",[33,57400,1121],{"class":50},[33,57402,57403],{"class":54},": no pages\"\n",[33,57405,57406,57408,57410,57413,57416,57418,57420,57422,57424,57426],{"class":35,"line":171},[33,57407,44654],{"class":167},[33,57409,242],{"class":163},[33,57411,57412],{"class":54}," \" \"",[33,57414,57415],{"class":167},".join(p.extract_text() ",[33,57417,7162],{"class":163},[33,57419,9892],{"class":54},[33,57421,14766],{"class":163},[33,57423,6127],{"class":167},[33,57425,662],{"class":163},[33,57427,57428],{"class":167}," reader.pages)\n",[33,57430,57431,57433,57436,57438],{"class":35,"line":179},[33,57432,9228],{"class":163},[33,57434,57435],{"class":167}," expected_customer ",[33,57437,662],{"class":163},[33,57439,57440],{"class":167}," text, (\n",[33,57442,57443,57445,57447,57449,57451,57453,57456,57458,57461,57463],{"class":35,"line":187},[33,57444,9533],{"class":163},[33,57446,274],{"class":54},[33,57448,1115],{"class":50},[33,57450,57398],{"class":167},[33,57452,1121],{"class":50},[33,57454,57455],{"class":54},": customer name '",[33,57457,1115],{"class":50},[33,57459,57460],{"class":167},"expected_customer",[33,57462,1121],{"class":50},[33,57464,57465],{"class":54},"' not found in PDF text\"\n",[33,57467,57468],{"class":35,"line":201},[33,57469,1202],{"class":167},[33,57471,57472,57474,57476,57478,57481,57483,57485,57487,57489],{"class":35,"line":206},[33,57473,7268],{"class":50},[33,57475,602],{"class":167},[33,57477,4059],{"class":163},[33,57479,57480],{"class":54},"\"OK: 
",[33,57482,1115],{"class":50},[33,57484,57398],{"class":167},[33,57486,1121],{"class":50},[33,57488,274],{"class":54},[33,57490,221],{"class":167},[33,57492,57493],{"class":35,"line":224},[33,57494,92],{"emptyLinePlaceholder":91},[33,57496,57497,57500,57503,57505,57508],{"class":35,"line":229},[33,57498,57499],{"class":167},"verify_invoice_pdf(Path(",[33,57501,57502],{"class":54},"\"invoices\u002FINV-001.pdf\"",[33,57504,18525],{"class":167},[33,57506,57507],{"class":54},"\"Acme Corp\"",[33,57509,221],{"class":167},[33,57511,57512,57514,57517,57519,57522],{"class":35,"line":235},[33,57513,57499],{"class":167},[33,57515,57516],{"class":54},"\"invoices\u002FINV-002.pdf\"",[33,57518,18525],{"class":167},[33,57520,57521],{"class":54},"\"Béta SARL\"",[33,57523,221],{"class":167},[18,57525,6918],{"id":6917},[4211,57527,57528,57533,57546,57551],{},[4214,57529,57530,57532],{},[940,57531,26191],{"href":19001}," — parent guide covering WeasyPrint, ReportLab, charts, and pagination in depth",[4214,57534,57535,57537,57538,42238,57540,365,57542,57545],{},[940,57536,28608],{"href":28607}," — fix garbled boxes or ",[30,57539,53869],{},[30,57541,53873],{},[30,57543,57544],{},"™",", accented characters",[4214,57547,57548,57550],{},[940,57549,52682],{"href":52681}," — combine individual invoice PDFs into one batch delivery file",[4214,57552,57553,57555],{},[940,57554,6936],{"href":6935}," — same data sources can drive Excel-format invoices in parallel",[14,57557,6947,57558,3035],{},[940,57559,26191],{"href":19001},[6953,57561,57562],{},"html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: 
var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":57564},[57565,57566,57567,57568,57569,57570,57571,57576,57577],{"id":53879,"depth":43,"text":53880},{"id":20,"depth":43,"text":21},{"id":54022,"depth":43,"text":54023},{"id":54445,"depth":43,"text":54446},{"id":54597,"depth":43,"text":54598},{"id":55246,"depth":43,"text":55247},{"id":35801,"depth":43,"text":56875,"children":57572},[57573,57574,57575],{"id":56878,"depth":61,"text":56879},{"id":56984,"depth":61,"text":56985},{"id":57226,"depth":61,"text":57227},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Dynamic Invoice PDFs","Build per-customer invoice PDFs from a data row using ReportLab and Jinja2+WeasyPrint. 
Covers line-item loops, tax calculation, totals, and Unicode font handling.",{},"\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Fcreate-dynamic-invoice-pdfs-automatically",{"title":53852,"description":57579},"Create Dynamic Invoice PDFs Automatically in Python","automating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Fcreate-dynamic-invoice-pdfs-automatically\u002Findex",[47,9631,57586,26232,57587],"invoices","weasyprint","zuWwoFYeLZWsbHNwqL8V-BCAJX3OdvcDwmTlPEuZIoE",{"id":57590,"title":28608,"body":57591,"breadcrumbTitle":59417,"canonical":6977,"date":6978,"description":59418,"draft":6980,"extension":6981,"image":6977,"meta":59419,"navigation":91,"path":59420,"robots":6977,"seo":59421,"seoTitle":59422,"stem":59423,"tags":59424,"updatedAt":6978,"__hash__":59427},"content\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Ffix-reportlab-unicode-font-errors\u002Findex.md",{"type":7,"value":57592,"toc":59406},[57593,57596,57610,57617,57627,57629,57642,57648,57655,57658,57660,57663,57843,57849,57853,57862,57917,57923,58259,58262,58282,58286,58410,58414,58428,58761,58774,58778,58781,59011,59030,59034,59041,59116,59126,59128,59359,59372,59374,59399,59403],[10,57594,28608],{"id":57595},"fix-reportlab-unicode-font-errors",[14,57597,57598,57599,365,57601,365,57603,57606,57607,57609],{},"ReportLab's built-in Helvetica and Times fonts are Type 1 core fonts. They only cover the Latin-1 subset (roughly 256 code points). 
Any character outside that range — ",[30,57600,53873],{},[30,57602,57544],{},[30,57604,57605],{},"©",", accented letters beyond basic Latin-1, Arabic, Chinese — produces one of two failures: a ",[30,57608,53869],{}," that crashes the script, or a silent substitution that renders as a small empty box (the \"tofu\" glyph) in the PDF.",[14,57611,57612,57613,57616],{},"This page shows the root cause, a diagnostic snippet to reproduce it, and the fix: registering a TrueType font via ",[30,57614,57615],{},"pdfmetrics.registerFont(TTFont(...))"," so ReportLab can encode the full Unicode range.",[14,57618,57619,57620,57622,57623,57626],{},"This error is common in ",[940,57621,26191],{"href":19001}," pipelines and appears almost universally in invoice generation — see ",[940,57624,53852],{"href":57625},"\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Fcreate-dynamic-invoice-pdfs-automatically\u002F"," for the broader invoice pattern.",[18,57628,4287],{"id":7020},[14,57630,57631,57632,57635,57636,57638,57639,57641],{},"ReportLab maps Python strings to PDF glyph indices using an internal encoding table. For the built-in core fonts, that encoding is ",[30,57633,57634],{},"WinAnsiEncoding"," (a Windows-1252 superset of Latin-1). The euro sign ",[30,57637,53873],{}," is at code point U+20AC, which is outside the 0x00–0xFF window used by ",[30,57640,57634],{},". When ReportLab tries to encode it, Python raises:",[23,57643,57646],{"className":57644,"code":57645,"language":2000},[1998],"UnicodeEncodeError: 'latin-1' codec can't encode character '€' in position 3: ordinal not in range(256)\n",[30,57647,57645],{"__ignoreMap":28},[14,57649,57650,57651,57654],{},"When the string goes through a different internal path (e.g. 
inside ",[30,57652,57653],{},"Paragraph"," with an XML-escaped entity), the error is suppressed but the glyph is missing from the output stream — the PDF viewer substitutes an empty box.",[14,57656,57657],{},"Both symptoms share the same root: the active font has no glyph table entry for the requested code point.",[18,57659,54446],{"id":54445},[14,57661,57662],{},"Run this to reproduce the error before applying the fix:",[23,57664,57666],{"className":126,"code":57665,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom reportlab.pdfgen import canvas as rl_canvas\nfrom reportlab.lib.pagesizes import A4\nfrom pathlib import Path\n\nOUT = Path(\"\u002Ftmp\u002Frl_unicode_broken.pdf\")\n\ntry:\n    c = rl_canvas.Canvas(str(OUT), pagesize=A4)\n    c.setFont(\"Helvetica\", 12)               # built-in core font — no Unicode\n    c.drawString(50, 700, \"Price: €49.99\")   # U+20AC — outside WinAnsiEncoding\n    c.save()\n    print(\"Script completed — check the PDF for boxes or missing glyphs\")\nexcept UnicodeEncodeError as exc:\n    print(f\"Reproduced: {exc}\")\n",[30,57667,57668,57672,57688,57698,57708,57712,57726,57730,57736,57759,57775,57796,57800,57811,57822],{"__ignoreMap":28},[33,57669,57670],{"class":35,"line":36},[33,57671,20289],{"class":39},[33,57673,57674,57676,57678,57680,57683,57685],{"class":35,"line":43},[33,57675,190],{"class":163},[33,57677,28221],{"class":167},[33,57679,164],{"class":163},[33,57681,57682],{"class":167}," canvas ",[33,57684,495],{"class":163},[33,57686,57687],{"class":167}," 
rl_canvas\n",[33,57689,57690,57692,57694,57696],{"class":35,"line":61},[33,57691,190],{"class":163},[33,57693,19044],{"class":167},[33,57695,164],{"class":163},[33,57697,19049],{"class":167},[33,57699,57700,57702,57704,57706],{"class":35,"line":73},[33,57701,190],{"class":163},[33,57703,193],{"class":167},[33,57705,164],{"class":163},[33,57707,198],{"class":167},[33,57709,57710],{"class":35,"line":88},[33,57711,92],{"emptyLinePlaceholder":91},[33,57713,57714,57717,57719,57721,57724],{"class":35,"line":95},[33,57715,57716],{"class":50},"OUT",[33,57718,212],{"class":163},[33,57720,215],{"class":167},[33,57722,57723],{"class":54},"\"\u002Ftmp\u002Frl_unicode_broken.pdf\"",[33,57725,221],{"class":167},[33,57727,57728],{"class":35,"line":101},[33,57729,92],{"emptyLinePlaceholder":91},[33,57731,57732,57734],{"class":35,"line":171},[33,57733,35574],{"class":163},[33,57735,574],{"class":167},[33,57737,57738,57740,57742,57745,57747,57749,57751,57753,57755,57757],{"class":35,"line":179},[33,57739,28472],{"class":167},[33,57741,242],{"class":163},[33,57743,57744],{"class":167}," rl_canvas.Canvas(",[33,57746,1053],{"class":50},[33,57748,602],{"class":167},[33,57750,57716],{"class":50},[33,57752,18525],{"class":167},[33,57754,20091],{"class":238},[33,57756,242],{"class":163},[33,57758,28496],{"class":167},[33,57760,57761,57763,57765,57767,57769,57772],{"class":35,"line":187},[33,57762,28510],{"class":167},[33,57764,28546],{"class":54},[33,57766,365],{"class":167},[33,57768,55650],{"class":50},[33,57770,57771],{"class":167},")               ",[33,57773,57774],{"class":39},"# built-in core font — no Unicode\n",[33,57776,57777,57779,57781,57783,57786,57788,57791,57793],{"class":35,"line":201},[33,57778,28523],{"class":167},[33,57780,2680],{"class":50},[33,57782,365],{"class":167},[33,57784,57785],{"class":50},"700",[33,57787,365],{"class":167},[33,57789,57790],{"class":54},"\"Price: €49.99\"",[33,57792,12000],{"class":167},[33,57794,57795],{"class":39},"# U+20AC — outside 
WinAnsiEncoding\n",[33,57797,57798],{"class":35,"line":206},[33,57799,28601],{"class":167},[33,57801,57802,57804,57806,57809],{"class":35,"line":224},[33,57803,7268],{"class":50},[33,57805,602],{"class":167},[33,57807,57808],{"class":54},"\"Script completed — check the PDF for boxes or missing glyphs\"",[33,57810,221],{"class":167},[33,57812,57813,57815,57818,57820],{"class":35,"line":229},[33,57814,35726],{"class":163},[33,57816,57817],{"class":50}," UnicodeEncodeError",[33,57819,1852],{"class":163},[33,57821,1855],{"class":167},[33,57823,57824,57826,57828,57830,57833,57835,57837,57839,57841],{"class":35,"line":235},[33,57825,7268],{"class":50},[33,57827,602],{"class":167},[33,57829,4059],{"class":163},[33,57831,57832],{"class":54},"\"Reproduced: ",[33,57834,1115],{"class":50},[33,57836,6565],{"class":167},[33,57838,1121],{"class":50},[33,57840,274],{"class":54},[33,57842,221],{"class":167},[14,57844,57845,57846,57848],{},"On most ReportLab versions this raises ",[30,57847,53869],{},". On some it silently writes a box. Either outcome confirms the root cause.",[18,57850,57852],{"id":57851},"fix-register-a-truetype-font","Fix: register a TrueType font",[14,57854,57855,57856,57861],{},"Download a Unicode-complete TTF. 
",[940,57857,57860],{"href":57858,"rel":57859},"https:\u002F\u002Fdejavu-fonts.github.io\u002F",[1367],"DejaVu Sans"," is the most portable free option and ships with most Linux distributions.",[23,57863,57865],{"className":25,"code":57864,"language":27,"meta":28,"style":28},"# Linux (Debian\u002FUbuntu)\nsudo apt install fonts-dejavu-core\n# macOS\nbrew install font-dejavu\n\n# Or download manually:\n# https:\u002F\u002Fgithub.com\u002Fdejavu-fonts\u002Fdejavu-fonts\u002Freleases\n# Extract DejaVuSans.ttf from the archive.\n",[30,57866,57867,57872,57884,57889,57898,57902,57907,57912],{"__ignoreMap":28},[33,57868,57869],{"class":35,"line":36},[33,57870,57871],{"class":39},"# Linux (Debian\u002FUbuntu)\n",[33,57873,57874,57876,57879,57881],{"class":35,"line":43},[33,57875,9669],{"class":46},[33,57877,57878],{"class":54}," apt",[33,57880,79],{"class":54},[33,57882,57883],{"class":54}," fonts-dejavu-core\n",[33,57885,57886],{"class":35,"line":61},[33,57887,57888],{"class":39},"# macOS\n",[33,57890,57891,57893,57895],{"class":35,"line":73},[33,57892,35308],{"class":46},[33,57894,79],{"class":54},[33,57896,57897],{"class":54}," font-dejavu\n",[33,57899,57900],{"class":35,"line":88},[33,57901,92],{"emptyLinePlaceholder":91},[33,57903,57904],{"class":35,"line":95},[33,57905,57906],{"class":39},"# Or download manually:\n",[33,57908,57909],{"class":35,"line":101},[33,57910,57911],{"class":39},"# https:\u002F\u002Fgithub.com\u002Fdejavu-fonts\u002Fdejavu-fonts\u002Freleases\n",[33,57913,57914],{"class":35,"line":171},[33,57915,57916],{"class":39},"# Extract DejaVuSans.ttf from the archive.\n",[14,57918,57919,57920,57922],{},"Register and use the font before any canvas or ",[30,57921,57653],{}," call:",[23,57924,57926],{"className":126,"code":57925,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom pathlib import Path\nfrom reportlab.pdfgen import canvas as rl_canvas\nfrom reportlab.pdfbase import pdfmetrics          # font registry\nfrom 
reportlab.pdfbase.ttfonts import TTFont       # TrueType loader\nfrom reportlab.lib.pagesizes import A4\n\n# --- Register once at module \u002F script startup ---\nFONT_PATH = Path(\"\u002Fusr\u002Fshare\u002Ffonts\u002Ftruetype\u002Fdejavu\u002FDejaVuSans.ttf\")\nif not FONT_PATH.exists():\n    raise FileNotFoundError(\n        f\"Font not found at {FONT_PATH}. Install fonts-dejavu-core or adjust FONT_PATH.\"\n    )\npdfmetrics.registerFont(TTFont(\"DejaVuSans\", str(FONT_PATH)))  # name + path\n\nOUT = Path(\"\u002Ftmp\u002Frl_unicode_fixed.pdf\")\n\ntry:\n    c = rl_canvas.Canvas(str(OUT), pagesize=A4)\n    c.setFont(\"DejaVuSans\", 12)                    # use the registered name, not \"Helvetica\"\n    c.drawString(50, 750, \"Price: €49.99\")         # euro sign renders correctly\n    c.drawString(50, 730, \"Trademark: ReportLab™\") # trademark symbol\n    c.drawString(50, 710, \"Name: Ångström Müller\") # accented characters\n    c.save()\n    print(f\"Written: {OUT}\")\nexcept Exception as exc:\n    raise RuntimeError(f\"PDF generation failed: {exc}\") from exc\n",[30,57927,57928,57932,57942,57956,57970,57984,57994,57998,58003,58016,58028,58036,58049,58053,58073,58077,58090,58094,58100,58122,58138,58159,58180,58200,58204,58222,58232],{"__ignoreMap":28},[33,57929,57930],{"class":35,"line":36},[33,57931,20289],{"class":39},[33,57933,57934,57936,57938,57940],{"class":35,"line":43},[33,57935,190],{"class":163},[33,57937,193],{"class":167},[33,57939,164],{"class":163},[33,57941,198],{"class":167},[33,57943,57944,57946,57948,57950,57952,57954],{"class":35,"line":61},[33,57945,190],{"class":163},[33,57947,28221],{"class":167},[33,57949,164],{"class":163},[33,57951,57682],{"class":167},[33,57953,495],{"class":163},[33,57955,57687],{"class":167},[33,57957,57958,57960,57962,57964,57967],{"class":35,"line":73},[33,57959,190],{"class":163},[33,57961,56898],{"class":167},[33,57963,164],{"class":163},[33,57965,57966],{"class":167}," pdfmetrics          
",[33,57968,57969],{"class":39},"# font registry\n",[33,57971,57972,57974,57976,57978,57981],{"class":35,"line":88},[33,57973,190],{"class":163},[33,57975,56910],{"class":167},[33,57977,164],{"class":163},[33,57979,57980],{"class":167}," TTFont       ",[33,57982,57983],{"class":39},"# TrueType loader\n",[33,57985,57986,57988,57990,57992],{"class":35,"line":95},[33,57987,190],{"class":163},[33,57989,19044],{"class":167},[33,57991,164],{"class":163},[33,57993,19049],{"class":167},[33,57995,57996],{"class":35,"line":101},[33,57997,92],{"emptyLinePlaceholder":91},[33,57999,58000],{"class":35,"line":171},[33,58001,58002],{"class":39},"# --- Register once at module \u002F script startup ---\n",[33,58004,58005,58008,58010,58012,58014],{"class":35,"line":179},[33,58006,58007],{"class":50},"FONT_PATH",[33,58009,212],{"class":163},[33,58011,215],{"class":167},[33,58013,56946],{"class":54},[33,58015,221],{"class":167},[33,58017,58018,58020,58022,58025],{"class":35,"line":187},[33,58019,2491],{"class":163},[33,58021,620],{"class":163},[33,58023,58024],{"class":50}," FONT_PATH",[33,58026,58027],{"class":167},".exists():\n",[33,58029,58030,58032,58034],{"class":35,"line":201},[33,58031,35742],{"class":163},[33,58033,2945],{"class":50},[33,58035,7637],{"class":167},[33,58037,58038,58040,58043,58046],{"class":35,"line":206},[33,58039,9533],{"class":163},[33,58041,58042],{"class":54},"\"Font not found at ",[33,58044,58045],{"class":50},"{FONT_PATH}",[33,58047,58048],{"class":54},". 
Install fonts-dejavu-core or adjust FONT_PATH.\"\n",[33,58050,58051],{"class":35,"line":224},[33,58052,1202],{"class":167},[33,58054,58055,58057,58059,58061,58063,58065,58067,58070],{"class":35,"line":229},[33,58056,56953],{"class":167},[33,58058,56956],{"class":54},[33,58060,365],{"class":167},[33,58062,1053],{"class":50},[33,58064,602],{"class":167},[33,58066,58007],{"class":50},[33,58068,58069],{"class":167},")))  ",[33,58071,58072],{"class":39},"# name + path\n",[33,58074,58075],{"class":35,"line":235},[33,58076,92],{"emptyLinePlaceholder":91},[33,58078,58079,58081,58083,58085,58088],{"class":35,"line":250},[33,58080,57716],{"class":50},[33,58082,212],{"class":163},[33,58084,215],{"class":167},[33,58086,58087],{"class":54},"\"\u002Ftmp\u002Frl_unicode_fixed.pdf\"",[33,58089,221],{"class":167},[33,58091,58092],{"class":35,"line":266},[33,58093,92],{"emptyLinePlaceholder":91},[33,58095,58096,58098],{"class":35,"line":290},[33,58097,35574],{"class":163},[33,58099,574],{"class":167},[33,58101,58102,58104,58106,58108,58110,58112,58114,58116,58118,58120],{"class":35,"line":295},[33,58103,28472],{"class":167},[33,58105,242],{"class":163},[33,58107,57744],{"class":167},[33,58109,1053],{"class":50},[33,58111,602],{"class":167},[33,58113,57716],{"class":50},[33,58115,18525],{"class":167},[33,58117,20091],{"class":238},[33,58119,242],{"class":163},[33,58121,28496],{"class":167},[33,58123,58124,58126,58128,58130,58132,58135],{"class":35,"line":300},[33,58125,28510],{"class":167},[33,58127,56956],{"class":54},[33,58129,365],{"class":167},[33,58131,55650],{"class":50},[33,58133,58134],{"class":167},")                    ",[33,58136,58137],{"class":39},"# use the registered name, not 
\"Helvetica\"\n",[33,58139,58140,58142,58144,58146,58149,58151,58153,58156],{"class":35,"line":317},[33,58141,28523],{"class":167},[33,58143,2680],{"class":50},[33,58145,365],{"class":167},[33,58147,58148],{"class":50},"750",[33,58150,365],{"class":167},[33,58152,57790],{"class":54},[33,58154,58155],{"class":167},")         ",[33,58157,58158],{"class":39},"# euro sign renders correctly\n",[33,58160,58161,58163,58165,58167,58170,58172,58175,58177],{"class":35,"line":332},[33,58162,28523],{"class":167},[33,58164,2680],{"class":50},[33,58166,365],{"class":167},[33,58168,58169],{"class":50},"730",[33,58171,365],{"class":167},[33,58173,58174],{"class":54},"\"Trademark: ReportLab™\"",[33,58176,1649],{"class":167},[33,58178,58179],{"class":39},"# trademark symbol\n",[33,58181,58182,58184,58186,58188,58190,58192,58195,58197],{"class":35,"line":347},[33,58183,28523],{"class":167},[33,58185,2680],{"class":50},[33,58187,365],{"class":167},[33,58189,49888],{"class":50},[33,58191,365],{"class":167},[33,58193,58194],{"class":54},"\"Name: Ångström Müller\"",[33,58196,1649],{"class":167},[33,58198,58199],{"class":39},"# accented characters\n",[33,58201,58202],{"class":35,"line":374},[33,58203,28601],{"class":167},[33,58205,58206,58208,58210,58212,58215,58218,58220],{"class":35,"line":397},[33,58207,7268],{"class":50},[33,58209,602],{"class":167},[33,58211,4059],{"class":163},[33,58213,58214],{"class":54},"\"Written: ",[33,58216,58217],{"class":50},"{OUT}",[33,58219,274],{"class":54},[33,58221,221],{"class":167},[33,58223,58224,58226,58228,58230],{"class":35,"line":653},[33,58225,35726],{"class":163},[33,58227,783],{"class":50},[33,58229,1852],{"class":163},[33,58231,1855],{"class":167},[33,58233,58234,58236,58238,58240,58242,58245,58247,58249,58251,58253,58255,58257],{"class":35,"line":667},[33,58235,35742],{"class":163},[33,58237,7590],{"class":50},[33,58239,602],{"class":167},[33,58241,4059],{"class":163},[33,58243,58244],{"class":54},"\"PDF generation failed: 
",[33,58246,1115],{"class":50},[33,58248,6565],{"class":167},[33,58250,1121],{"class":50},[33,58252,274],{"class":54},[33,58254,1649],{"class":167},[33,58256,190],{"class":163},[33,58258,20843],{"class":167},[14,58260,58261],{},"Key changes on each modified line:",[4211,58263,58264,58270],{},[4214,58265,58266,58269],{},[30,58267,58268],{},"pdfmetrics.registerFont(TTFont(\"DejaVuSans\", str(FONT_PATH)))"," — loads the TTF glyph table into ReportLab's registry; do this once before any draw call.",[4214,58271,58272,58275,58276,58278,58279,3035],{},[30,58273,58274],{},"c.setFont(\"DejaVuSans\", 12)"," — switches the active font to the registered TrueType font; the string ",[30,58277,56956],{}," must match the first argument to ",[30,58280,58281],{},"TTFont(...)",[18,58283,58285],{"id":58284},"svg-core-font-vs-truetype-font-glyph-lookup","SVG: core font vs. TrueType font glyph lookup",[2540,58287,2547,58290,2547,58293,2547,58296,2547,2547,2547,58321,2547,58323,2547,58327,2547,2547,58331,2547,2547,58335,2547,58338,2547,58341,2547,2547,58343,2547,2547,58346,2547,58350,2547,58355,2547,58357,2547,2547,58363,2547,2547,2547,58365,2547,58367,2547,58369,2547,2547,58372,2547,2547,58374,2547,58378,2547,58382,2547,2547,58385,2547,2547,58387,2547,58389,2547,58392,2547,58395,2547,2547,58399,2547,58402,2547,58406],{"viewBox":58288,"role":2543,"ariaLabel":58289,"xmlns":2545,"style":2546},"0 0 760 260","Comparison of glyph lookup path for Helvetica core font versus a registered TrueType font in ReportLab",[2549,58291,58292],{},"Core font vs TrueType font glyph lookup",[2553,58294,58295],{},"Shows that Helvetica uses WinAnsiEncoding which covers only 256 code points and fails on € (U+20AC), while a registered TTFont uses a full Unicode cmap and 
succeeds.",[2557,58297,2559,58298,2559,58307,2559,58316,2547],{},[2561,58299,2564,58301,2564,58304,2559],{"id":58300,"x1":748,"y1":748,"x2":748,"y2":734},"fix-rl-grad-red",[2566,58302],{"offset":748,"style":58303},"stop-color:#fee2e2",[2566,58305],{"offset":734,"style":58306},"stop-color:#fecaca",[2561,58308,2564,58310,2564,58313,2559],{"id":58309,"x1":748,"y1":748,"x2":748,"y2":734},"fix-rl-grad-green",[2566,58311],{"offset":748,"style":58312},"stop-color:#dcfce7",[2566,58314],{"offset":734,"style":58315},"stop-color:#bbf7d0",[2573,58317,2564,58319,2559],{"id":58318,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"fix-rl-arrow",[2580,58320],{"d":2582,"fill":2583},[2585,58322],{"x":2587,"y":1543,"width":2679,"height":2680,"rx":1179,"fill":2592,"stroke":2593,"style":2594},[2000,58324,58326],{"x":16991,"y":49813,"fill":2599,"style":58325},"text-anchor:middle;font-size:12px;font-weight:600","String",[2000,58328,58330],{"x":16991,"y":58329,"fill":2583,"style":2605},"69","\"€49.99\"",[35,58332],{"x1":2588,"y1":58333,"x2":16982,"y2":58333,"stroke":2583,"markerEnd":58334,"style":2594},"55","url(#fix-rl-arrow)",[2585,58336],{"x":58337,"y":1543,"width":2610,"height":2680,"rx":1179,"fill":2592,"stroke":2593,"style":2594},"180",[2000,58339,58340],{"x":2618,"y":49813,"fill":2599,"style":58325},"Helvetica",[2000,58342,57634],{"x":2618,"y":58329,"fill":2583,"style":2605},[35,58344],{"x1":58345,"y1":58333,"x2":26369,"y2":58333,"stroke":2583,"markerEnd":58334,"style":2594},"340",[2585,58347],{"x":2677,"y":1543,"width":2610,"height":2680,"rx":1179,"fill":58348,"stroke":58349,"style":2594},"url(#fix-rl-grad-red)","#fca5a5",[2000,58351,58354],{"x":58352,"y":49813,"fill":58353,"style":58325},"470","#991b1b","€ not found",[2000,58356,53869],{"x":58352,"y":58329,"fill":58353,"style":2605},[2000,58358,58362],{"x":58359,"y":58360,"fill":58353,"style":58361},"600","58","text-anchor:start;font-size:11px","\n✗ 
fails\n",[35,58364],{"x1":2587,"y1":11099,"x2":49883,"y2":11099,"stroke":2593,"style":11105},[2585,58366],{"x":2587,"y":2588,"width":2679,"height":2680,"rx":1179,"fill":2592,"stroke":2593,"style":2594},[2000,58368,58326],{"x":16991,"y":11112,"fill":2599,"style":58325},[2000,58370,58330],{"x":16991,"y":58371,"fill":2583,"style":2605},"169",[35,58373],{"x1":2588,"y1":2598,"x2":16982,"y2":2598,"stroke":2583,"markerEnd":58334,"style":2594},[2585,58375],{"x":58337,"y":2588,"width":2610,"height":2680,"rx":1179,"fill":58376,"stroke":58377,"style":2594},"url(#fix-rl-grad-green)","#86efac",[2000,58379,58381],{"x":2618,"y":11112,"fill":58380,"style":58325},"#14532d","TTFont (DejaVuSans)",[2000,58383,58384],{"x":2618,"y":58371,"fill":58380,"style":2605},"Unicode cmap",[35,58386],{"x1":58345,"y1":2598,"x2":26369,"y2":2598,"stroke":2583,"markerEnd":58334,"style":2594},[2585,58388],{"x":2677,"y":2588,"width":2610,"height":2680,"rx":1179,"fill":58376,"stroke":58377,"style":2594},[2000,58390,58391],{"x":58352,"y":11112,"fill":58380,"style":58325},"€ → glyph 0x20AC",[2000,58393,58394],{"x":58352,"y":58371,"fill":58380,"style":2605},"renders correctly",[2000,58396,58398],{"x":58359,"y":11132,"fill":58397,"style":58361},"#15803d","\n✓ works\n",[2585,58400],{"x":2587,"y":58401,"width":49882,"height":56319,"rx":2681,"fill":2615,"stroke":2593,"style":11105},"210",[2000,58403,58405],{"x":58404,"y":26410,"fill":2583,"style":49873},"36","Fix: ",[2000,58407,58409],{"x":2590,"y":26410,"fill":2599,"style":58408},"font-size:11px;font-family:monospace","pdfmetrics.registerFont(TTFont(\"DejaVuSans\", path))  →  c.setFont(\"DejaVuSans\", 12)",[18,58411,58413],{"id":58412},"variant-fix-1-platypus-paragraph-styles","Variant fix 1 — Platypus Paragraph styles",[14,58415,58416,58417,58419,58420,58423,58424,58427],{},"When using ",[30,58418,57653],{}," flowables (the usual path in ",[30,58421,58422],{},"SimpleDocTemplate","), set the font name on the ",[30,58425,58426],{},"ParagraphStyle",", not on the 
canvas:",[23,58429,58431],{"className":126,"code":58430,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom pathlib import Path\nfrom reportlab.pdfbase import pdfmetrics\nfrom reportlab.pdfbase.ttfonts import TTFont\nfrom reportlab.platypus import SimpleDocTemplate, Paragraph\nfrom reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle\nfrom reportlab.lib.pagesizes import A4\n\nFONT_PATH = Path(\"\u002Fusr\u002Fshare\u002Ffonts\u002Ftruetype\u002Fdejavu\u002FDejaVuSans.ttf\")\npdfmetrics.registerFont(TTFont(\"DejaVuSans\", str(FONT_PATH)))\n\nstyles = getSampleStyleSheet()\n# Override the font on a custom style — do NOT mutate the built-in \"Normal\" style\nunicode_style = ParagraphStyle(\n    \"unicode_body\",\n    parent=styles[\"Normal\"],\n    fontName=\"DejaVuSans\",   # changed from \"Helvetica\"\n    fontSize=11,\n    leading=15,\n)\n\nOUT = Path(\"\u002Ftmp\u002Frl_para_unicode.pdf\")\ntry:\n    doc = SimpleDocTemplate(str(OUT), pagesize=A4)\n    story = [\n        Paragraph(\"Invoice total: €1,249.00\", unicode_style),\n        Paragraph(\"Trademark: Python™\", unicode_style),\n        Paragraph(\"Contact: Ångström Müller\", unicode_style),\n    ]\n    doc.build(story)\n    print(f\"Written: {OUT}\")\nexcept Exception as exc:\n    raise RuntimeError(f\"Build failed: {exc}\") from 
exc\n",[30,58432,58433,58437,58447,58457,58467,58478,58488,58498,58502,58514,58530,58534,58543,58548,58557,58564,58577,58591,58602,58613,58617,58621,58634,58640,58663,58671,58681,58690,58699,58703,58708,58724,58734],{"__ignoreMap":28},[33,58434,58435],{"class":35,"line":36},[33,58436,20289],{"class":39},[33,58438,58439,58441,58443,58445],{"class":35,"line":43},[33,58440,190],{"class":163},[33,58442,193],{"class":167},[33,58444,164],{"class":163},[33,58446,198],{"class":167},[33,58448,58449,58451,58453,58455],{"class":35,"line":61},[33,58450,190],{"class":163},[33,58452,56898],{"class":167},[33,58454,164],{"class":163},[33,58456,56903],{"class":167},[33,58458,58459,58461,58463,58465],{"class":35,"line":73},[33,58460,190],{"class":163},[33,58462,56910],{"class":167},[33,58464,164],{"class":163},[33,58466,56915],{"class":167},[33,58468,58469,58471,58473,58475],{"class":35,"line":88},[33,58470,190],{"class":163},[33,58472,19092],{"class":167},[33,58474,164],{"class":163},[33,58476,58477],{"class":167}," SimpleDocTemplate, 
Paragraph\n",[33,58479,58480,58482,58484,58486],{"class":35,"line":95},[33,58481,190],{"class":163},[33,58483,19068],{"class":167},[33,58485,164],{"class":163},[33,58487,19073],{"class":167},[33,58489,58490,58492,58494,58496],{"class":35,"line":101},[33,58491,190],{"class":163},[33,58493,19044],{"class":167},[33,58495,164],{"class":163},[33,58497,19049],{"class":167},[33,58499,58500],{"class":35,"line":171},[33,58501,92],{"emptyLinePlaceholder":91},[33,58503,58504,58506,58508,58510,58512],{"class":35,"line":179},[33,58505,58007],{"class":50},[33,58507,212],{"class":163},[33,58509,215],{"class":167},[33,58511,56946],{"class":54},[33,58513,221],{"class":167},[33,58515,58516,58518,58520,58522,58524,58526,58528],{"class":35,"line":187},[33,58517,56953],{"class":167},[33,58519,56956],{"class":54},[33,58521,365],{"class":167},[33,58523,1053],{"class":50},[33,58525,602],{"class":167},[33,58527,58007],{"class":50},[33,58529,23269],{"class":167},[33,58531,58532],{"class":35,"line":201},[33,58533,92],{"emptyLinePlaceholder":91},[33,58535,58536,58539,58541],{"class":35,"line":206},[33,58537,58538],{"class":167},"styles ",[33,58540,242],{"class":163},[33,58542,19260],{"class":167},[33,58544,58545],{"class":35,"line":224},[33,58546,58547],{"class":39},"# Override the font on a custom style — do NOT mutate the built-in \"Normal\" style\n",[33,58549,58550,58553,58555],{"class":35,"line":229},[33,58551,58552],{"class":167},"unicode_style ",[33,58554,242],{"class":163},[33,58556,19270],{"class":167},[33,58558,58559,58562],{"class":35,"line":235},[33,58560,58561],{"class":54},"    \"unicode_body\"",[33,58563,247],{"class":167},[33,58565,58566,58569,58571,58573,58575],{"class":35,"line":250},[33,58567,58568],{"class":238},"    parent",[33,58570,242],{"class":163},[33,58572,19285],{"class":167},[33,58574,19348],{"class":54},[33,58576,8935],{"class":167},[33,58578,58579,58582,58584,58586,58588],{"class":35,"line":266},[33,58580,58581],{"class":238},"    
fontName",[33,58583,242],{"class":163},[33,58585,56956],{"class":54},[33,58587,1166],{"class":167},[33,58589,58590],{"class":39},"# changed from \"Helvetica\"\n",[33,58592,58593,58596,58598,58600],{"class":35,"line":290},[33,58594,58595],{"class":238},"    fontSize",[33,58597,242],{"class":163},[33,58599,17260],{"class":50},[33,58601,247],{"class":167},[33,58603,58604,58607,58609,58611],{"class":35,"line":295},[33,58605,58606],{"class":238},"    leading",[33,58608,242],{"class":163},[33,58610,1646],{"class":50},[33,58612,247],{"class":167},[33,58614,58615],{"class":35,"line":300},[33,58616,221],{"class":167},[33,58618,58619],{"class":35,"line":317},[33,58620,92],{"emptyLinePlaceholder":91},[33,58622,58623,58625,58627,58629,58632],{"class":35,"line":332},[33,58624,57716],{"class":50},[33,58626,212],{"class":163},[33,58628,215],{"class":167},[33,58630,58631],{"class":54},"\"\u002Ftmp\u002Frl_para_unicode.pdf\"",[33,58633,221],{"class":167},[33,58635,58636,58638],{"class":35,"line":347},[33,58637,35574],{"class":163},[33,58639,574],{"class":167},[33,58641,58642,58644,58646,58649,58651,58653,58655,58657,58659,58661],{"class":35,"line":374},[33,58643,18224],{"class":167},[33,58645,242],{"class":163},[33,58647,58648],{"class":167}," SimpleDocTemplate(",[33,58650,1053],{"class":50},[33,58652,602],{"class":167},[33,58654,57716],{"class":50},[33,58656,18525],{"class":167},[33,58658,20091],{"class":238},[33,58660,242],{"class":163},[33,58662,28496],{"class":167},[33,58664,58665,58667,58669],{"class":35,"line":397},[33,58666,19444],{"class":167},[33,58668,242],{"class":163},[33,58670,7473],{"class":167},[33,58672,58673,58675,58678],{"class":35,"line":653},[33,58674,19453],{"class":167},[33,58676,58677],{"class":54},"\"Invoice total: €1,249.00\"",[33,58679,58680],{"class":167},", unicode_style),\n",[33,58682,58683,58685,58688],{"class":35,"line":667},[33,58684,19453],{"class":167},[33,58686,58687],{"class":54},"\"Trademark: 
Python™\"",[33,58689,58680],{"class":167},[33,58691,58692,58694,58697],{"class":35,"line":675},[33,58693,19453],{"class":167},[33,58695,58696],{"class":54},"\"Contact: Ångström Müller\"",[33,58698,58680],{"class":167},[33,58700,58701],{"class":35,"line":689},[33,58702,19559],{"class":167},[33,58704,58705],{"class":35,"line":703},[33,58706,58707],{"class":167},"    doc.build(story)\n",[33,58709,58710,58712,58714,58716,58718,58720,58722],{"class":35,"line":714},[33,58711,7268],{"class":50},[33,58713,602],{"class":167},[33,58715,4059],{"class":163},[33,58717,58214],{"class":54},[33,58719,58217],{"class":50},[33,58721,274],{"class":54},[33,58723,221],{"class":167},[33,58725,58726,58728,58730,58732],{"class":35,"line":723},[33,58727,35726],{"class":163},[33,58729,783],{"class":50},[33,58731,1852],{"class":163},[33,58733,1855],{"class":167},[33,58735,58736,58738,58740,58742,58744,58747,58749,58751,58753,58755,58757,58759],{"class":35,"line":754},[33,58737,35742],{"class":163},[33,58739,7590],{"class":50},[33,58741,602],{"class":167},[33,58743,4059],{"class":163},[33,58745,58746],{"class":54},"\"Build failed: ",[33,58748,1115],{"class":50},[33,58750,6565],{"class":167},[33,58752,1121],{"class":50},[33,58754,274],{"class":54},[33,58756,1649],{"class":167},[33,58758,190],{"class":163},[33,58760,20843],{"class":167},[14,58762,58763,58764,58767,58768,58770,58771,58773],{},"Changed line: ",[30,58765,58766],{},"fontName=\"DejaVuSans\""," in the ",[30,58769,58426],{}," constructor — this propagates to all ",[30,58772,57653],{}," flowables that use this style.",[18,58775,58777],{"id":58776},"variant-fix-2-cid-fonts-for-cjk-characters","Variant fix 2 — CID fonts for CJK characters",[14,58779,58780],{},"DejaVu Sans will not work for Chinese, Japanese, or Korean text because it contains no CJK glyphs. 
Use a CID (Composite) font instead:",[23,58782,58784],{"className":126,"code":58783,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom reportlab.pdfbase import pdfmetrics\nfrom reportlab.pdfbase.cidfonts import UnicodeCIDFont\n\n# ReportLab ships CID support for CJK fonts; no external file needed\npdfmetrics.registerFont(UnicodeCIDFont(\"HeiseiKakuGo-W5\"))  # Japanese sans-serif\n\nfrom reportlab.pdfgen import canvas as rl_canvas\nfrom reportlab.lib.pagesizes import A4\nfrom pathlib import Path\n\nOUT = Path(\"\u002Ftmp\u002Frl_cid_unicode.pdf\")\ntry:\n    c = rl_canvas.Canvas(str(OUT), pagesize=A4)\n    c.setFont(\"HeiseiKakuGo-W5\", 14)          # use the CID font name\n    c.drawString(50, 700, \"日本語テスト\")       # Japanese text\n    c.save()\n    print(f\"Written: {OUT}\")\nexcept Exception as exc:\n    raise RuntimeError(f\"CID font render failed: {exc}\") from exc\n",[30,58785,58786,58790,58800,58812,58816,58821,58835,58839,58853,58863,58873,58877,58890,58896,58918,58934,58954,58958,58974,58984],{"__ignoreMap":28},[33,58787,58788],{"class":35,"line":36},[33,58789,20289],{"class":39},[33,58791,58792,58794,58796,58798],{"class":35,"line":43},[33,58793,190],{"class":163},[33,58795,56898],{"class":167},[33,58797,164],{"class":163},[33,58799,56903],{"class":167},[33,58801,58802,58804,58807,58809],{"class":35,"line":61},[33,58803,190],{"class":163},[33,58805,58806],{"class":167}," reportlab.pdfbase.cidfonts ",[33,58808,164],{"class":163},[33,58810,58811],{"class":167}," UnicodeCIDFont\n",[33,58813,58814],{"class":35,"line":73},[33,58815,92],{"emptyLinePlaceholder":91},[33,58817,58818],{"class":35,"line":88},[33,58819,58820],{"class":39},"# ReportLab ships CID support for CJK fonts; no external file needed\n",[33,58822,58823,58826,58829,58832],{"class":35,"line":95},[33,58824,58825],{"class":167},"pdfmetrics.registerFont(UnicodeCIDFont(",[33,58827,58828],{"class":54},"\"HeiseiKakuGo-W5\"",[33,58830,58831],{"class":167},"))  
",[33,58833,58834],{"class":39},"# Japanese sans-serif\n",[33,58836,58837],{"class":35,"line":101},[33,58838,92],{"emptyLinePlaceholder":91},[33,58840,58841,58843,58845,58847,58849,58851],{"class":35,"line":171},[33,58842,190],{"class":163},[33,58844,28221],{"class":167},[33,58846,164],{"class":163},[33,58848,57682],{"class":167},[33,58850,495],{"class":163},[33,58852,57687],{"class":167},[33,58854,58855,58857,58859,58861],{"class":35,"line":179},[33,58856,190],{"class":163},[33,58858,19044],{"class":167},[33,58860,164],{"class":163},[33,58862,19049],{"class":167},[33,58864,58865,58867,58869,58871],{"class":35,"line":187},[33,58866,190],{"class":163},[33,58868,193],{"class":167},[33,58870,164],{"class":163},[33,58872,198],{"class":167},[33,58874,58875],{"class":35,"line":201},[33,58876,92],{"emptyLinePlaceholder":91},[33,58878,58879,58881,58883,58885,58888],{"class":35,"line":206},[33,58880,57716],{"class":50},[33,58882,212],{"class":163},[33,58884,215],{"class":167},[33,58886,58887],{"class":54},"\"\u002Ftmp\u002Frl_cid_unicode.pdf\"",[33,58889,221],{"class":167},[33,58891,58892,58894],{"class":35,"line":224},[33,58893,35574],{"class":163},[33,58895,574],{"class":167},[33,58897,58898,58900,58902,58904,58906,58908,58910,58912,58914,58916],{"class":35,"line":229},[33,58899,28472],{"class":167},[33,58901,242],{"class":163},[33,58903,57744],{"class":167},[33,58905,1053],{"class":50},[33,58907,602],{"class":167},[33,58909,57716],{"class":50},[33,58911,18525],{"class":167},[33,58913,20091],{"class":238},[33,58915,242],{"class":163},[33,58917,28496],{"class":167},[33,58919,58920,58922,58924,58926,58928,58931],{"class":35,"line":235},[33,58921,28510],{"class":167},[33,58923,58828],{"class":54},[33,58925,365],{"class":167},[33,58927,19368],{"class":50},[33,58929,58930],{"class":167},")          ",[33,58932,58933],{"class":39},"# use the CID font 
name\n",[33,58935,58936,58938,58940,58942,58944,58946,58949,58951],{"class":35,"line":250},[33,58937,28523],{"class":167},[33,58939,2680],{"class":50},[33,58941,365],{"class":167},[33,58943,57785],{"class":50},[33,58945,365],{"class":167},[33,58947,58948],{"class":54},"\"日本語テスト\"",[33,58950,8815],{"class":167},[33,58952,58953],{"class":39},"# Japanese text\n",[33,58955,58956],{"class":35,"line":266},[33,58957,28601],{"class":167},[33,58959,58960,58962,58964,58966,58968,58970,58972],{"class":35,"line":290},[33,58961,7268],{"class":50},[33,58963,602],{"class":167},[33,58965,4059],{"class":163},[33,58967,58214],{"class":54},[33,58969,58217],{"class":50},[33,58971,274],{"class":54},[33,58973,221],{"class":167},[33,58975,58976,58978,58980,58982],{"class":35,"line":295},[33,58977,35726],{"class":163},[33,58979,783],{"class":50},[33,58981,1852],{"class":163},[33,58983,1855],{"class":167},[33,58985,58986,58988,58990,58992,58994,58997,58999,59001,59003,59005,59007,59009],{"class":35,"line":300},[33,58987,35742],{"class":163},[33,58989,7590],{"class":50},[33,58991,602],{"class":167},[33,58993,4059],{"class":163},[33,58995,58996],{"class":54},"\"CID font render failed: ",[33,58998,1115],{"class":50},[33,59000,6565],{"class":167},[33,59002,1121],{"class":50},[33,59004,274],{"class":54},[33,59006,1649],{"class":167},[33,59008,190],{"class":163},[33,59010,20843],{"class":167},[14,59012,59013,59014,59017,59018,59021,59022,59025,59026,59029],{},"Available built-in CID fonts: ",[30,59015,59016],{},"HeiseiKakuGo-W5"," (Japanese sans-serif), ",[30,59019,59020],{},"HeiseiMin-W3"," (Japanese serif), ",[30,59023,59024],{},"HYSMyeongJo-Medium"," (Korean), ",[30,59027,59028],{},"STSong-Light"," (Simplified Chinese).",[18,59031,59033],{"id":59032},"variant-fix-3-encodingutf-8-on-data-sources","Variant fix 3 — encoding='utf-8' on data sources",[14,59035,59036,59037,59040],{},"If the crash happens ",[26245,59038,59039],{},"before"," any PDF call, the issue is in data loading, not in ReportLab. 
The fix is in the CSV\u002Ffile read:",[23,59042,59044],{"className":126,"code":59043,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\n# Wrong — omitting encoding lets Python pick the system default (often latin-1 on Windows)\n# df = pd.read_csv(Path(\"invoices.csv\"))\n\n# Correct — explicit UTF-8 prevents UnicodeDecodeError at load time\ndf = pd.read_csv(Path(\"invoices.csv\"), encoding=\"utf-8\")   # added encoding='utf-8'\n",[30,59045,59046,59050,59060,59070,59074,59079,59084,59088,59093],{"__ignoreMap":28},[33,59047,59048],{"class":35,"line":36},[33,59049,8895],{"class":39},[33,59051,59052,59054,59056,59058],{"class":35,"line":43},[33,59053,164],{"class":163},[33,59055,492],{"class":167},[33,59057,495],{"class":163},[33,59059,498],{"class":167},[33,59061,59062,59064,59066,59068],{"class":35,"line":61},[33,59063,190],{"class":163},[33,59065,193],{"class":167},[33,59067,164],{"class":163},[33,59069,198],{"class":167},[33,59071,59072],{"class":35,"line":73},[33,59073,92],{"emptyLinePlaceholder":91},[33,59075,59076],{"class":35,"line":88},[33,59077,59078],{"class":39},"# Wrong — omitting encoding lets Python pick the system default (often latin-1 on Windows)\n",[33,59080,59081],{"class":35,"line":95},[33,59082,59083],{"class":39},"# df = pd.read_csv(Path(\"invoices.csv\"))\n",[33,59085,59086],{"class":35,"line":101},[33,59087,92],{"emptyLinePlaceholder":91},[33,59089,59090],{"class":35,"line":171},[33,59091,59092],{"class":39},"# Correct — explicit UTF-8 prevents UnicodeDecodeError at load time\n",[33,59094,59095,59097,59099,59101,59103,59105,59107,59109,59111,59113],{"class":35,"line":179},[33,59096,13459],{"class":167},[33,59098,242],{"class":163},[33,59100,46182],{"class":167},[33,59102,54440],{"class":54},[33,59104,18525],{"class":167},[33,59106,27249],{"class":238},[33,59108,242],{"class":163},[33,59110,1195],{"class":54},[33,59112,12000],{"class":167},[33,59114,59115],{"class":39},"# added 
encoding='utf-8'\n",[14,59117,59118,59119,2012,59122,59125],{},"If the source file was saved in Windows-1252 (common from Excel), use ",[30,59120,59121],{},"encoding=\"cp1252\"",[30,59123,59124],{},"encoding=\"utf-8-sig\""," (for files with a BOM).",[18,59127,9247],{"id":9246},[23,59129,59131],{"className":126,"code":59130,"language":47,"meta":28,"style":28},"# pip install reportlab pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\ndef verify_unicode_in_pdf(path: Path, expected_chars: list[str]) -> None:\n    reader = PdfReader(str(path))\n    text = \" \".join(p.extract_text() or \"\" for p in reader.pages)\n    for char in expected_chars:\n        # Note: pypdf's text extraction may not round-trip all glyphs perfectly,\n        # but the absence of UnicodeEncodeError during build is the primary signal.\n        if char not in text:\n            print(f\"  Warning: '{char}' not found in extracted text (may be a pypdf limitation)\")\n        else:\n            print(f\"  OK: '{char}' present in extracted text\")\n    print(f\"PDF built successfully: {path.name} ({len(reader.pages)} page(s))\")\n\nverify_unicode_in_pdf(Path(\"\u002Ftmp\u002Frl_unicode_fixed.pdf\"), [\"€\", \"™\", \"Å\"])\n",[30,59132,59133,59138,59148,59158,59162,59180,59192,59214,59226,59231,59236,59248,59271,59277,59299,59330,59334],{"__ignoreMap":28},[33,59134,59135],{"class":35,"line":36},[33,59136,59137],{"class":39},"# pip install reportlab 
pypdf\n",[33,59139,59140,59142,59144,59146],{"class":35,"line":43},[33,59141,190],{"class":163},[33,59143,193],{"class":167},[33,59145,164],{"class":163},[33,59147,198],{"class":167},[33,59149,59150,59152,59154,59156],{"class":35,"line":61},[33,59151,190],{"class":163},[33,59153,57333],{"class":167},[33,59155,164],{"class":163},[33,59157,57338],{"class":167},[33,59159,59160],{"class":35,"line":73},[33,59161,92],{"emptyLinePlaceholder":91},[33,59163,59164,59166,59169,59172,59174,59176,59178],{"class":35,"line":88},[33,59165,562],{"class":163},[33,59167,59168],{"class":46}," verify_unicode_in_pdf",[33,59170,59171],{"class":167},"(path: Path, expected_chars: list[",[33,59173,1053],{"class":50},[33,59175,28895],{"class":167},[33,59177,571],{"class":50},[33,59179,574],{"class":167},[33,59181,59182,59184,59186,59188,59190],{"class":35,"line":95},[33,59183,57365],{"class":167},[33,59185,242],{"class":163},[33,59187,57370],{"class":167},[33,59189,1053],{"class":50},[33,59191,21248],{"class":167},[33,59193,59194,59196,59198,59200,59202,59204,59206,59208,59210,59212],{"class":35,"line":101},[33,59195,44654],{"class":167},[33,59197,242],{"class":163},[33,59199,57412],{"class":54},[33,59201,57415],{"class":167},[33,59203,7162],{"class":163},[33,59205,9892],{"class":54},[33,59207,14766],{"class":163},[33,59209,6127],{"class":167},[33,59211,662],{"class":163},[33,59213,57428],{"class":167},[33,59215,59216,59218,59221,59223],{"class":35,"line":171},[33,59217,656],{"class":163},[33,59219,59220],{"class":167}," char ",[33,59222,662],{"class":163},[33,59224,59225],{"class":167}," expected_chars:\n",[33,59227,59228],{"class":35,"line":179},[33,59229,59230],{"class":39},"        # Note: pypdf's text extraction may not round-trip all glyphs perfectly,\n",[33,59232,59233],{"class":35,"line":187},[33,59234,59235],{"class":39},"        # but the absence of UnicodeEncodeError during build is the primary 
signal.\n",[33,59237,59238,59240,59242,59244,59246],{"class":35,"line":201},[33,59239,8221],{"class":163},[33,59241,59220],{"class":167},[33,59243,7999],{"class":163},[33,59245,8002],{"class":163},[33,59247,44613],{"class":167},[33,59249,59250,59252,59254,59256,59259,59261,59264,59266,59269],{"class":35,"line":206},[33,59251,9364],{"class":50},[33,59253,602],{"class":167},[33,59255,4059],{"class":163},[33,59257,59258],{"class":54},"\"  Warning: '",[33,59260,1115],{"class":50},[33,59262,59263],{"class":167},"char",[33,59265,1121],{"class":50},[33,59267,59268],{"class":54},"' not found in extracted text (may be a pypdf limitation)\"",[33,59270,221],{"class":167},[33,59272,59273,59275],{"class":35,"line":224},[33,59274,41290],{"class":163},[33,59276,574],{"class":167},[33,59278,59279,59281,59283,59285,59288,59290,59292,59294,59297],{"class":35,"line":229},[33,59280,9364],{"class":50},[33,59282,602],{"class":167},[33,59284,4059],{"class":163},[33,59286,59287],{"class":54},"\"  OK: '",[33,59289,1115],{"class":50},[33,59291,59263],{"class":167},[33,59293,1121],{"class":50},[33,59295,59296],{"class":54},"' present in extracted text\"",[33,59298,221],{"class":167},[33,59300,59301,59303,59305,59307,59310,59312,59314,59316,59318,59320,59323,59325,59328],{"class":35,"line":235},[33,59302,7268],{"class":50},[33,59304,602],{"class":167},[33,59306,4059],{"class":163},[33,59308,59309],{"class":54},"\"PDF built successfully: ",[33,59311,1115],{"class":50},[33,59313,57398],{"class":167},[33,59315,1121],{"class":50},[33,59317,17583],{"class":54},[33,59319,4065],{"class":50},[33,59321,59322],{"class":167},"(reader.pages)",[33,59324,1121],{"class":50},[33,59326,59327],{"class":54}," 
page(s))\"",[33,59329,221],{"class":167},[33,59331,59332],{"class":35,"line":250},[33,59333,92],{"emptyLinePlaceholder":91},[33,59335,59336,59339,59341,59344,59347,59349,59352,59354,59357],{"class":35,"line":266},[33,59337,59338],{"class":167},"verify_unicode_in_pdf(Path(",[33,59340,58087],{"class":54},[33,59342,59343],{"class":167},"), [",[33,59345,59346],{"class":54},"\"€\"",[33,59348,365],{"class":167},[33,59350,59351],{"class":54},"\"™\"",[33,59353,365],{"class":167},[33,59355,59356],{"class":54},"\"Å\"",[33,59358,751],{"class":167},[14,59360,59361,59362,10065,59365,59368,59369,59371],{},"The primary verification signal is that ",[30,59363,59364],{},"pdfmetrics.registerFont",[30,59366,59367],{},"doc.build"," complete without raising ",[30,59370,53869],{},". Visual inspection in a PDF viewer confirms glyph rendering.",[18,59373,6918],{"id":6917},[4211,59375,59376,59381,59389,59394],{},[4214,59377,59378,59380],{},[940,59379,26191],{"href":19001}," — the parent guide where this error commonly appears; ReportLab canvas and Platypus patterns",[4214,59382,59383,59385,59386,59388],{},[940,59384,53852],{"href":57625}," — invoice pipeline where ",[30,59387,53873],{}," and accented customer names trigger this error",[4214,59390,59391,59393],{},[940,59392,27254],{"href":27253}," — fix encoding issues in the data source before they reach the PDF renderer",[4214,59395,59396,59398],{},[940,59397,6943],{"href":6942}," — full PDF automation overview",[14,59400,6947,59401,3035],{},[940,59402,26191],{"href":19001},[6953,59404,59405],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sqxcx, html code.shiki 
.sqxcx{--shiki-default:#E36209}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}",{"title":28,"searchDepth":43,"depth":43,"links":59407},[59408,59409,59410,59411,59412,59413,59414,59415,59416],{"id":7020,"depth":43,"text":4287},{"id":54445,"depth":43,"text":54446},{"id":57851,"depth":43,"text":57852},{"id":58284,"depth":43,"text":58285},{"id":58412,"depth":43,"text":58413},{"id":58776,"depth":43,"text":58777},{"id":59032,"depth":43,"text":59033},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Unicode Font Errors","Fix garbled boxes or UnicodeEncodeError in ReportLab output for €, ™, and accented characters by registering a TrueType font with pdfmetrics.registerFont.",{},"\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Ffix-reportlab-unicode-font-errors",{"title":28608,"description":59418},"Fix ReportLab Unicode Font Errors (€ ™ Accented 
Chars)","automating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Ffix-reportlab-unicode-font-errors\u002Findex",[47,9631,26232,59425,59426],"unicode","fonts","60PthVb_1i_PeLlYuJ1CtTHKLfSIl3k4C2dlH3u7WgU",{"id":59429,"title":26191,"body":59430,"breadcrumbTitle":64795,"canonical":6977,"date":46387,"description":64796,"draft":6980,"extension":6981,"image":6977,"meta":64797,"navigation":91,"path":64798,"robots":6977,"seo":64799,"seoTitle":64800,"stem":64801,"tags":64802,"updatedAt":6978,"__hash__":64804},"content\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Findex.md",{"type":7,"value":59431,"toc":64776},[59432,59435,59438,59446,59448,59489,59492,59561,59565,59568,59887,59893,59897,59994,59998,60007,60436,60440,60455,61495,61499,61514,61978,61981,61985,62005,62067,62071,62268,62272,62275,62417,62419,62710,62713,62742,62744,62846,62849,64723,64727,64745,64747,64769,64773],[10,59433,26191],{"id":59434},"generating-pdf-reports-dynamically",[14,59436,59437],{},"Manually assembling PDFs breaks the moment the data changes. Dynamic generation keeps the layout fixed and feeds fresh data in at render time — one script, many documents. This guide covers the two main Python stacks (ReportLab for canvas-level control, Jinja2+WeasyPrint for HTML-to-PDF), then the practical problems: headers\u002Ffooters, multi-page tables, embedded charts, and Unicode fonts.",[14,59439,59440,59441,59443,59444,3035],{},"For broader context see ",[940,59442,6943],{"href":6942},". 
If you also need to assemble separately generated PDFs into one deliverable, the merge step is in ",[940,59445,52682],{"href":52681},[18,59447,21],{"id":20},[23,59449,59451],{"className":25,"code":59450,"language":27,"meta":28,"style":28},"# System deps (WeasyPrint needs Cairo + Pango)\n# Debian\u002FUbuntu: sudo apt install libcairo2 libpango-1.0-0 libpangocairo-1.0-0\n# macOS: brew install cairo pango\n\npip install reportlab weasyprint jinja2 pandas matplotlib\n",[30,59452,59453,59458,59463,59468,59472],{"__ignoreMap":28},[33,59454,59455],{"class":35,"line":36},[33,59456,59457],{"class":39},"# System deps (WeasyPrint needs Cairo + Pango)\n",[33,59459,59460],{"class":35,"line":43},[33,59461,59462],{"class":39},"# Debian\u002FUbuntu: sudo apt install libcairo2 libpango-1.0-0 libpangocairo-1.0-0\n",[33,59464,59465],{"class":35,"line":61},[33,59466,59467],{"class":39},"# macOS: brew install cairo pango\n",[33,59469,59470],{"class":35,"line":73},[33,59471,92],{"emptyLinePlaceholder":91},[33,59473,59474,59476,59478,59480,59482,59484,59486],{"class":35,"line":88},[33,59475,76],{"class":46},[33,59477,79],{"class":54},[33,59479,16198],{"class":54},[33,59481,20930],{"class":54},[33,59483,53938],{"class":54},[33,59485,16183],{"class":54},[33,59487,59488],{"class":54}," matplotlib\n",[14,59490,59491],{},"Create a test data file:",[23,59493,59495],{"className":25,"code":59494,"language":27,"meta":28,"style":28},"mkdir -p reports data\npython - \u003C\u003C'EOF'\nimport csv\nrows = [\n    {\"customer\":\"Acme Corp\",\"region\":\"North\",\"revenue\":82000,\"costs\":54000},\n    {\"customer\":\"Beta Ltd\",\"region\":\"South\",\"revenue\":61000,\"costs\":41000},\n    {\"customer\":\"Gamma Inc\",\"region\":\"East\",\"revenue\":74000,\"costs\":49000},\n]\nwith open(\"data\u002Fsales.csv\",\"w\",newline=\"\") as f:\n    w = csv.DictWriter(f, fieldnames=rows[0].keys()); w.writeheader(); 
w.writerows(rows)\nEOF\n",[30,59496,59497,59511,59521,59525,59529,59534,59539,59544,59548,59553,59557],{"__ignoreMap":28},[33,59498,59499,59502,59505,59508],{"class":35,"line":36},[33,59500,59501],{"class":46},"mkdir",[33,59503,59504],{"class":50}," -p",[33,59506,59507],{"class":54}," reports",[33,59509,59510],{"class":54}," data\n",[33,59512,59513,59515,59517,59519],{"class":35,"line":43},[33,59514,47],{"class":46},[33,59516,39025],{"class":54},[33,59518,53957],{"class":163},[33,59520,53960],{"class":54},[33,59522,59523],{"class":35,"line":61},[33,59524,53965],{"class":54},[33,59526,59527],{"class":35,"line":73},[33,59528,53970],{"class":54},[33,59530,59531],{"class":35,"line":88},[33,59532,59533],{"class":54},"    {\"customer\":\"Acme Corp\",\"region\":\"North\",\"revenue\":82000,\"costs\":54000},\n",[33,59535,59536],{"class":35,"line":95},[33,59537,59538],{"class":54},"    {\"customer\":\"Beta Ltd\",\"region\":\"South\",\"revenue\":61000,\"costs\":41000},\n",[33,59540,59541],{"class":35,"line":101},[33,59542,59543],{"class":54},"    {\"customer\":\"Gamma Inc\",\"region\":\"East\",\"revenue\":74000,\"costs\":49000},\n",[33,59545,59546],{"class":35,"line":171},[33,59547,9202],{"class":54},[33,59549,59550],{"class":35,"line":179},[33,59551,59552],{"class":54},"with open(\"data\u002Fsales.csv\",\"w\",newline=\"\") as f:\n",[33,59554,59555],{"class":35,"line":187},[33,59556,54014],{"class":54},[33,59558,59559],{"class":35,"line":201},[33,59560,54019],{"class":54},[18,59562,59564],{"id":59563},"step-1-inspect-and-normalise-input-data","Step 1 — Inspect and normalise input data",[14,59566,59567],{},"Load and validate before passing to any renderer. 
Garbage in, garbage PDF out.",[23,59569,59571],{"className":126,"code":59570,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\nDATA = Path(\"data\u002Fsales.csv\")\n\ndef load_report_data(path: Path) -> list[dict]:\n    try:\n        df = pd.read_csv(path, encoding=\"utf-8\")\n        df.columns = df.columns.str.strip().str.lower()\n        required = {\"customer\", \"region\", \"revenue\", \"costs\"}\n        missing = required - set(df.columns)\n        if missing:\n            raise ValueError(f\"Missing columns: {missing}\")\n        df[\"profit\"] = df[\"revenue\"] - df[\"costs\"]\n        df[\"margin\"] = (df[\"profit\"] \u002F df[\"revenue\"] * 100).round(1)\n        return df.to_dict(\"records\")\n    except FileNotFoundError:\n        raise SystemExit(f\"Data file not found: {path}\")\n    except Exception as exc:\n        raise SystemExit(f\"Data load failed: {exc}\")\n\nrows = load_report_data(DATA)\n",[30,59572,59573,59577,59587,59597,59601,59615,59619,59632,59638,59654,59662,59689,59705,59711,59734,59759,59795,59805,59813,59836,59846,59869,59873],{"__ignoreMap":28},[33,59574,59575],{"class":35,"line":36},[33,59576,8895],{"class":39},[33,59578,59579,59581,59583,59585],{"class":35,"line":43},[33,59580,190],{"class":163},[33,59582,193],{"class":167},[33,59584,164],{"class":163},[33,59586,198],{"class":167},[33,59588,59589,59591,59593,59595],{"class":35,"line":61},[33,59590,164],{"class":163},[33,59592,492],{"class":167},[33,59594,495],{"class":163},[33,59596,498],{"class":167},[33,59598,59599],{"class":35,"line":73},[33,59600,92],{"emptyLinePlaceholder":91},[33,59602,59603,59606,59608,59610,59613],{"class":35,"line":88},[33,59604,59605],{"class":50},"DATA",[33,59607,212],{"class":163},[33,59609,215],{"class":167},[33,59611,59612],{"class":54},"\"data\u002Fsales.csv\"",[33,59614,221],{"class":167},[33,59616,59617],{"class":35,"line":95},[33,59618,92],{"emptyLinePlaceholder":91},[33,59620,59621,59623,
59626,59628,59630],{"class":35,"line":101},[33,59622,562],{"class":163},[33,59624,59625],{"class":46}," load_report_data",[33,59627,54082],{"class":167},[33,59629,37100],{"class":50},[33,59631,17477],{"class":167},[33,59633,59634,59636],{"class":35,"line":171},[33,59635,2424],{"class":163},[33,59637,574],{"class":167},[33,59639,59640,59642,59644,59646,59648,59650,59652],{"class":35,"line":179},[33,59641,7930],{"class":167},[33,59643,242],{"class":163},[33,59645,27411],{"class":167},[33,59647,27249],{"class":238},[33,59649,242],{"class":163},[33,59651,1195],{"class":54},[33,59653,221],{"class":167},[33,59655,59656,59658,59660],{"class":35,"line":187},[33,59657,10842],{"class":167},[33,59659,242],{"class":163},[33,59661,54121],{"class":167},[33,59663,59664,59667,59669,59671,59674,59676,59678,59680,59682,59684,59687],{"class":35,"line":201},[33,59665,59666],{"class":167},"        required ",[33,59668,242],{"class":163},[33,59670,4098],{"class":167},[33,59672,59673],{"class":54},"\"customer\"",[33,59675,365],{"class":167},[33,59677,16649],{"class":54},[33,59679,365],{"class":167},[33,59681,16465],{"class":54},[33,59683,365],{"class":167},[33,59685,59686],{"class":54},"\"costs\"",[33,59688,4113],{"class":167},[33,59690,59691,59694,59696,59699,59701,59703],{"class":35,"line":206},[33,59692,59693],{"class":167},"        missing ",[33,59695,242],{"class":163},[33,59697,59698],{"class":167}," required ",[33,59700,4126],{"class":163},[33,59702,4129],{"class":50},[33,59704,4132],{"class":167},[33,59706,59707,59709],{"class":35,"line":224},[33,59708,8221],{"class":163},[33,59710,4139],{"class":167},[33,59712,59713,59716,59718,59720,59722,59724,59726,59728,59730,59732],{"class":35,"line":229},[33,59714,59715],{"class":163},"            
raise",[33,59717,4054],{"class":50},[33,59719,602],{"class":167},[33,59721,4059],{"class":163},[33,59723,4152],{"class":54},[33,59725,1115],{"class":50},[33,59727,4157],{"class":167},[33,59729,1121],{"class":50},[33,59731,274],{"class":54},[33,59733,221],{"class":167},[33,59735,59736,59738,59741,59743,59745,59747,59749,59751,59753,59755,59757],{"class":35,"line":235},[33,59737,10902],{"class":167},[33,59739,59740],{"class":54},"\"profit\"",[33,59742,763],{"class":167},[33,59744,242],{"class":163},[33,59746,7935],{"class":167},[33,59748,16465],{"class":54},[33,59750,763],{"class":167},[33,59752,4126],{"class":163},[33,59754,7935],{"class":167},[33,59756,59686],{"class":54},[33,59758,9202],{"class":167},[33,59760,59761,59763,59765,59767,59769,59772,59774,59776,59778,59780,59782,59784,59786,59788,59791,59793],{"class":35,"line":250},[33,59762,10902],{"class":167},[33,59764,16456],{"class":54},[33,59766,763],{"class":167},[33,59768,242],{"class":163},[33,59770,59771],{"class":167}," (df[",[33,59773,59740],{"class":54},[33,59775,763],{"class":167},[33,59777,1351],{"class":163},[33,59779,7935],{"class":167},[33,59781,16465],{"class":54},[33,59783,763],{"class":167},[33,59785,1769],{"class":163},[33,59787,18366],{"class":50},[33,59789,59790],{"class":167},").round(",[33,59792,734],{"class":50},[33,59794,221],{"class":167},[33,59796,59797,59799,59801,59803],{"class":35,"line":266},[33,59798,1659],{"class":163},[33,59800,54131],{"class":167},[33,59802,21222],{"class":54},[33,59804,221],{"class":167},[33,59806,59807,59809,59811],{"class":35,"line":290},[33,59808,2449],{"class":163},[33,59810,2945],{"class":50},[33,59812,574],{"class":167},[33,59814,59815,59817,59819,59821,59823,59826,59828,59830,59832,59834],{"class":35,"line":295},[33,59816,4051],{"class":163},[33,59818,16617],{"class":50},[33,59820,602],{"class":167},[33,59822,4059],{"class":163},[33,59824,59825],{"class":54},"\"Data file not found: 
",[33,59827,1115],{"class":50},[33,59829,2580],{"class":167},[33,59831,1121],{"class":50},[33,59833,274],{"class":54},[33,59835,221],{"class":167},[33,59837,59838,59840,59842,59844],{"class":35,"line":300},[33,59839,2449],{"class":163},[33,59841,783],{"class":50},[33,59843,1852],{"class":163},[33,59845,1855],{"class":167},[33,59847,59848,59850,59852,59854,59856,59859,59861,59863,59865,59867],{"class":35,"line":317},[33,59849,4051],{"class":163},[33,59851,16617],{"class":50},[33,59853,602],{"class":167},[33,59855,4059],{"class":163},[33,59857,59858],{"class":54},"\"Data load failed: ",[33,59860,1115],{"class":50},[33,59862,6565],{"class":167},[33,59864,1121],{"class":50},[33,59866,274],{"class":54},[33,59868,221],{"class":167},[33,59870,59871],{"class":35,"line":332},[33,59872,92],{"emptyLinePlaceholder":91},[33,59874,59875,59878,59880,59883,59885],{"class":35,"line":347},[33,59876,59877],{"class":167},"rows ",[33,59879,242],{"class":163},[33,59881,59882],{"class":167}," load_report_data(",[33,59884,59605],{"class":50},[33,59886,221],{"class":167},[14,59888,57230,59889,59892],{},[940,59890,59891],{"href":9598},"pandas-based pipelines"," drops straight into this loader — the same normalisation pattern applies.",[18,59894,59896],{"id":59895},"the-template-to-pdf-flow","The template-to-PDF flow",[2540,59898,2547,59900,2547,59903,2547,59906,2547,2547,59928,2547,59930,2547,59934,2547,2547,59937,2547,2547,59941,2547,59944,2547,59947,2547,2547,59950,2547,2547,59954,2547,2547,59957,2547,59961,2547,59965,2547,2547,59968,2547,59970,2547,59973,2547,2547,59977,2547,2547,59979,2547,2547,59982,2547,59984,2547,59988,2547,59991],{"viewBox":2542,"role":2543,"ariaLabel":59899,"xmlns":2545,"style":2546},"Template to PDF rendering flow: data source feeds normalisation, which feeds both the Jinja2+WeasyPrint path and the ReportLab canvas path, both producing a PDF that is optionally merged into an assembly",[2549,59901,59902],{},"Template → Data → Render → Assemble 
flow",[2553,59904,59905],{},"Shows how raw data moves through normalisation into either a Jinja2+WeasyPrint HTML path or a ReportLab canvas path, then optionally into a merge\u002Fassemble step to produce the final PDF.",[2557,59907,2559,59908,2559,59915,2559,59923,2547],{},[2561,59909,2564,59911,2564,59913,2559],{"id":59910,"x1":748,"y1":748,"x2":734,"y2":748},"gen-reports-grad-blue",[2566,59912],{"offset":748,"style":2568},[2566,59914],{"offset":734,"style":2571},[2561,59916,2564,59918,2564,59920,2559],{"id":59917,"x1":748,"y1":748,"x2":734,"y2":748},"gen-reports-grad-soft",[2566,59919],{"offset":748,"style":2571},[2566,59921],{"offset":734,"style":59922},"stop-color:#f6f8fb",[2573,59924,2564,59926,2559],{"id":59925,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"gen-reports-arrow",[2580,59927],{"d":2582,"fill":2583},[2585,59929],{"x":2587,"y":2588,"width":2609,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,59931,59933],{"x":2630,"y":2598,"fill":2599,"style":59932},"text-anchor:middle;font-size:13px;font-weight:600","Data Source",[2000,59935,59936],{"x":2630,"y":2604,"fill":2583,"style":2605},"CSV \u002F DB \u002F API",[35,59938],{"x1":2610,"y1":2610,"x2":59939,"y2":2610,"stroke":2583,"markerEnd":59940,"style":2594},"218","url(#gen-reports-arrow)",[2585,59942],{"x":2701,"y":2588,"width":2609,"height":2590,"rx":2591,"fill":59943,"stroke":2593,"style":2594},"url(#gen-reports-grad-soft)",[2000,59945,59946],{"x":11231,"y":2598,"fill":2599,"style":59932},"Normalise",[2000,59948,59949],{"x":11231,"y":2604,"fill":2583,"style":2605},"pandas \u002F 
dicts",[35,59951],{"x1":59952,"y1":2598,"x2":59953,"y2":2630,"stroke":2583,"markerEnd":59940,"style":2594},"360","418",[35,59955],{"x1":59952,"y1":59956,"x2":59953,"y2":26410,"stroke":2583,"markerEnd":59940,"style":2594},"165",[2585,59958],{"x":59959,"y":2680,"width":2610,"height":38748,"rx":2591,"fill":59960,"stroke":2593,"style":2594},"420","url(#gen-reports-grad-blue)",[2000,59962,59964],{"x":13437,"y":59963,"fill":2599,"style":59932},"78","Jinja2 template",[2000,59966,59967],{"x":13437,"y":38741,"fill":2599,"style":2605},"→ HTML → WeasyPrint",[2585,59969],{"x":59959,"y":2611,"width":2610,"height":38748,"rx":2591,"fill":59960,"stroke":2593,"style":2594},[2000,59971,59972],{"x":13437,"y":11126,"fill":2599,"style":59932},"ReportLab canvas",[2000,59974,59976],{"x":13437,"y":59975,"fill":2599,"style":2605},"246","platypus \u002F drawString",[35,59978],{"x1":49894,"y1":12900,"x2":11207,"y2":2598,"stroke":2583,"markerEnd":59940,"style":2594},[35,59980],{"x1":49894,"y1":59981,"x2":11207,"y2":59956,"stroke":2583,"markerEnd":59940,"style":2594},"235",[2585,59983],{"x":49863,"y":2589,"width":2589,"height":2597,"rx":2591,"fill":2592,"stroke":11166,"style":11210},[2000,59985,59987],{"x":59986,"y":11112,"fill":11166,"style":59932},"680","PDF output",[2000,59989,59990],{"x":59986,"y":11115,"fill":2583,"style":2605},"merge \u002F split",[2000,59992,59993],{"x":59986,"y":11119,"fill":2583,"style":2605},"optional",[18,59995,59997],{"id":59996},"step-2-jinja2-weasyprint-html-to-pdf-path","Step 2 — Jinja2 + WeasyPrint: HTML-to-PDF path",[14,59999,60000,60001,60003,60004,3035],{},"Best when you want CSS layout, responsive tables, and page headers\u002Ffooters via ",[30,60002,54604],{}," rules. 
Unicode fonts work out of the box if you declare them in ",[30,60005,60006],{},"@font-face",[23,60008,60010],{"className":126,"code":60009,"language":47,"meta":28,"style":28},"# pip install weasyprint jinja2\nfrom pathlib import Path\nfrom jinja2 import Environment, BaseLoader\nfrom weasyprint import HTML, CSS\nimport io\n\nTEMPLATE_SRC = \"\"\"\u003C!DOCTYPE html>\n\u003Chtml>\u003Chead>\u003Cmeta charset=\"utf-8\">\n\u003Cstyle>\n@page {\n  size: A4;\n  margin: 20mm 15mm 25mm;\n  @top-center { content: \"Sales Report — {{ period }}\"; font-size: 9pt; color: #475569; }\n  @bottom-right { content: \"Page \" counter(page) \" of \" counter(pages); font-size: 8pt; color: #475569; }\n}\nbody { font-family: sans-serif; font-size: 10pt; color: #0f172a; }\nh1 { font-size: 18pt; margin-bottom: 4mm; }\ntable { width: 100%; border-collapse: collapse; margin-top: 6mm; }\nth { background: #2563eb; color: #fff; padding: 5px 8px; text-align: left; font-size: 9pt; }\ntd { padding: 4px 8px; border-bottom: 1px solid #e2e8f0; font-size: 9pt; }\ntr { page-break-inside: avoid; }\n.right { text-align: right; }\n.summary { margin-top: 8mm; font-weight: bold; }\n\u003C\u002Fstyle>\u003C\u002Fhead>\n\u003Cbody>\n\u003Ch1>Sales Performance\u003C\u002Fh1>\n\u003Cp>Period: {{ period }}\u003C\u002Fp>\n\u003Ctable>\n  \u003Cthead>\u003Ctr>\n    \u003Cth>Customer\u003C\u002Fth>\u003Cth>Region\u003C\u002Fth>\n    \u003Cth class=\"right\">Revenue\u003C\u002Fth>\u003Cth class=\"right\">Costs\u003C\u002Fth>\n    \u003Cth class=\"right\">Profit\u003C\u002Fth>\u003Cth class=\"right\">Margin %\u003C\u002Fth>\n  \u003C\u002Ftr>\u003C\u002Fthead>\n  \u003Ctbody>\n  {% for r in rows %}\n  \u003Ctr>\n    \u003Ctd>{{ r.customer }}\u003C\u002Ftd>\u003Ctd>{{ r.region }}\u003C\u002Ftd>\n    \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.revenue) }}\u003C\u002Ftd>\n    \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.costs) }}\u003C\u002Ftd>\n    \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.profit) 
}}\u003C\u002Ftd>\n    \u003Ctd class=\"right\">{{ r.margin }}%\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  {% endfor %}\n  \u003C\u002Ftbody>\n\u003C\u002Ftable>\n\u003Cp class=\"summary\">Total revenue: ${{ \"{:,.0f}\".format(rows|sum(attribute=\"revenue\")) }}\u003C\u002Fp>\n\u003C\u002Fbody>\u003C\u002Fhtml>\"\"\"\n\ndef render_weasyprint(rows: list[dict], period: str, out: Path) -> None:\n    env = Environment(loader=BaseLoader())\n    tmpl = env.from_string(TEMPLATE_SRC)\n    html_str = tmpl.render(rows=rows, period=period)\n    try:\n        HTML(string=html_str).write_pdf(str(out))\n        print(f\"Written: {out}\")\n    except Exception as exc:\n        raise RuntimeError(f\"WeasyPrint render failed: {exc}\") from exc\n\nrender_weasyprint(rows, \"Q3 2026\", Path(\"reports\u002Fsales_weasyprint.pdf\"))\n",[30,60011,60012,60016,60026,60036,60052,60059,60063,60072,60076,60080,60084,60088,60093,60098,60103,60107,60112,60117,60122,60127,60132,60137,60142,60147,60151,60155,60160,60165,60169,60173,60178,60183,60188,60192,60196,60205,60209,60214,60219,60224,60229,60234,60238,60246,60250,60254,60259,60263,60267,60291,60305,60317,60340,60346,60360,60380,60390,60417,60421],{"__ignoreMap":28},[33,60013,60014],{"class":35,"line":36},[33,60015,20943],{"class":39},[33,60017,60018,60020,60022,60024],{"class":35,"line":43},[33,60019,190],{"class":163},[33,60021,193],{"class":167},[33,60023,164],{"class":163},[33,60025,198],{"class":167},[33,60027,60028,60030,60032,60034],{"class":35,"line":61},[33,60029,190],{"class":163},[33,60031,20970],{"class":167},[33,60033,164],{"class":163},[33,60035,54635],{"class":167},[33,60037,60038,60040,60042,60044,60047,60049],{"class":35,"line":73},[33,60039,190],{"class":163},[33,60041,20982],{"class":167},[33,60043,164],{"class":163},[33,60045,60046],{"class":50}," HTML",[33,60048,365],{"class":167},[33,60050,60051],{"class":50},"CSS\n",[33,60053,60054,60056],{"class":35,"line":88},[33,60055,164],{"class":163},[33,60057,60058],{"class":167}," 
io\n",[33,60060,60061],{"class":35,"line":95},[33,60062,92],{"emptyLinePlaceholder":91},[33,60064,60065,60068,60070],{"class":35,"line":101},[33,60066,60067],{"class":50},"TEMPLATE_SRC",[33,60069,212],{"class":163},[33,60071,54659],{"class":54},[33,60073,60074],{"class":35,"line":171},[33,60075,54664],{"class":54},[33,60077,60078],{"class":35,"line":179},[33,60079,54669],{"class":54},[33,60081,60082],{"class":35,"line":187},[33,60083,54674],{"class":54},[33,60085,60086],{"class":35,"line":201},[33,60087,54679],{"class":54},[33,60089,60090],{"class":35,"line":206},[33,60091,60092],{"class":54},"  margin: 20mm 15mm 25mm;\n",[33,60094,60095],{"class":35,"line":224},[33,60096,60097],{"class":54},"  @top-center { content: \"Sales Report — {{ period }}\"; font-size: 9pt; color: #475569; }\n",[33,60099,60100],{"class":35,"line":229},[33,60101,60102],{"class":54},"  @bottom-right { content: \"Page \" counter(page) \" of \" counter(pages); font-size: 8pt; color: #475569; }\n",[33,60104,60105],{"class":35,"line":235},[33,60106,4113],{"class":54},[33,60108,60109],{"class":35,"line":250},[33,60110,60111],{"class":54},"body { font-family: sans-serif; font-size: 10pt; color: #0f172a; }\n",[33,60113,60114],{"class":35,"line":266},[33,60115,60116],{"class":54},"h1 { font-size: 18pt; margin-bottom: 4mm; }\n",[33,60118,60119],{"class":35,"line":290},[33,60120,60121],{"class":54},"table { width: 100%; border-collapse: collapse; margin-top: 6mm; }\n",[33,60123,60124],{"class":35,"line":295},[33,60125,60126],{"class":54},"th { background: #2563eb; color: #fff; padding: 5px 8px; text-align: left; font-size: 9pt; }\n",[33,60128,60129],{"class":35,"line":300},[33,60130,60131],{"class":54},"td { padding: 4px 8px; border-bottom: 1px solid #e2e8f0; font-size: 9pt; }\n",[33,60133,60134],{"class":35,"line":317},[33,60135,60136],{"class":54},"tr { page-break-inside: avoid; }\n",[33,60138,60139],{"class":35,"line":332},[33,60140,60141],{"class":54},".right { text-align: right; 
}\n",[33,60143,60144],{"class":35,"line":347},[33,60145,60146],{"class":54},".summary { margin-top: 8mm; font-weight: bold; }\n",[33,60148,60149],{"class":35,"line":374},[33,60150,54783],{"class":54},[33,60152,60153],{"class":35,"line":397},[33,60154,54788],{"class":54},[33,60156,60157],{"class":35,"line":653},[33,60158,60159],{"class":54},"\u003Ch1>Sales Performance\u003C\u002Fh1>\n",[33,60161,60162],{"class":35,"line":667},[33,60163,60164],{"class":54},"\u003Cp>Period: {{ period }}\u003C\u002Fp>\n",[33,60166,60167],{"class":35,"line":675},[33,60168,54827],{"class":54},[33,60170,60171],{"class":35,"line":689},[33,60172,54832],{"class":54},[33,60174,60175],{"class":35,"line":703},[33,60176,60177],{"class":54},"    \u003Cth>Customer\u003C\u002Fth>\u003Cth>Region\u003C\u002Fth>\n",[33,60179,60180],{"class":35,"line":714},[33,60181,60182],{"class":54},"    \u003Cth class=\"right\">Revenue\u003C\u002Fth>\u003Cth class=\"right\">Costs\u003C\u002Fth>\n",[33,60184,60185],{"class":35,"line":723},[33,60186,60187],{"class":54},"    \u003Cth class=\"right\">Profit\u003C\u002Fth>\u003Cth class=\"right\">Margin %\u003C\u002Fth>\n",[33,60189,60190],{"class":35,"line":754},[33,60191,54857],{"class":54},[33,60193,60194],{"class":35,"line":771},[33,60195,54862],{"class":54},[33,60197,60198,60200,60202],{"class":35,"line":777},[33,60199,54867],{"class":54},[33,60201,54870],{"class":50},[33,60203,60204],{"class":54},"or r in rows %}\n",[33,60206,60207],{"class":35,"line":788},[33,60208,54878],{"class":54},[33,60210,60211],{"class":35,"line":804},[33,60212,60213],{"class":54},"    \u003Ctd>{{ r.customer }}\u003C\u002Ftd>\u003Ctd>{{ r.region }}\u003C\u002Ftd>\n",[33,60215,60216],{"class":35,"line":809},[33,60217,60218],{"class":54},"    \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.revenue) }}\u003C\u002Ftd>\n",[33,60220,60221],{"class":35,"line":819},[33,60222,60223],{"class":54},"    \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.costs) 
}}\u003C\u002Ftd>\n",[33,60225,60226],{"class":35,"line":829},[33,60227,60228],{"class":54},"    \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.profit) }}\u003C\u002Ftd>\n",[33,60230,60231],{"class":35,"line":834},[33,60232,60233],{"class":54},"    \u003Ctd class=\"right\">{{ r.margin }}%\u003C\u002Ftd>\n",[33,60235,60236],{"class":35,"line":839},[33,60237,54913],{"class":54},[33,60239,60240,60242,60244],{"class":35,"line":860},[33,60241,54867],{"class":54},[33,60243,54920],{"class":50},[33,60245,54923],{"class":54},[33,60247,60248],{"class":35,"line":887},[33,60249,54928],{"class":54},[33,60251,60252],{"class":35,"line":907},[33,60253,54933],{"class":54},[33,60255,60256],{"class":35,"line":1826},[33,60257,60258],{"class":54},"\u003Cp class=\"summary\">Total revenue: ${{ \"{:,.0f}\".format(rows|sum(attribute=\"revenue\")) }}\u003C\u002Fp>\n",[33,60260,60261],{"class":35,"line":1844},[33,60262,54995],{"class":54},[33,60264,60265],{"class":35,"line":1858},[33,60266,92],{"emptyLinePlaceholder":91},[33,60268,60269,60271,60274,60277,60279,60282,60284,60287,60289],{"class":35,"line":1871},[33,60270,562],{"class":163},[33,60272,60273],{"class":46}," render_weasyprint",[33,60275,60276],{"class":167},"(rows: list[",[33,60278,37100],{"class":50},[33,60280,60281],{"class":167},"], period: ",[33,60283,1053],{"class":50},[33,60285,60286],{"class":167},", out: Path) -> 
",[33,60288,571],{"class":50},[33,60290,574],{"class":167},[33,60292,60293,60295,60297,60299,60301,60303],{"class":35,"line":1877},[33,60294,55071],{"class":167},[33,60296,242],{"class":163},[33,60298,21111],{"class":167},[33,60300,21114],{"class":238},[33,60302,242],{"class":163},[33,60304,55082],{"class":167},[33,60306,60307,60309,60311,60313,60315],{"class":35,"line":1883},[33,60308,55087],{"class":167},[33,60310,242],{"class":163},[33,60312,55092],{"class":167},[33,60314,60067],{"class":50},[33,60316,221],{"class":167},[33,60318,60319,60321,60323,60325,60327,60329,60332,60335,60337],{"class":35,"line":1915},[33,60320,21200],{"class":167},[33,60322,242],{"class":163},[33,60324,21205],{"class":167},[33,60326,18629],{"class":238},[33,60328,242],{"class":163},[33,60330,60331],{"class":167},"rows, ",[33,60333,60334],{"class":238},"period",[33,60336,242],{"class":163},[33,60338,60339],{"class":167},"period)\n",[33,60341,60342,60344],{"class":35,"line":1926},[33,60343,2424],{"class":163},[33,60345,574],{"class":167},[33,60347,60348,60350,60352,60354,60356,60358],{"class":35,"line":1932},[33,60349,21235],{"class":167},[33,60351,21238],{"class":238},[33,60353,242],{"class":163},[33,60355,21243],{"class":167},[33,60357,1053],{"class":50},[33,60359,55133],{"class":167},[33,60361,60362,60364,60366,60368,60370,60372,60374,60376,60378],{"class":35,"line":1938},[33,60363,9414],{"class":50},[33,60365,602],{"class":167},[33,60367,4059],{"class":163},[33,60369,58214],{"class":54},[33,60371,1115],{"class":50},[33,60373,18014],{"class":167},[33,60375,1121],{"class":50},[33,60377,274],{"class":54},[33,60379,221],{"class":167},[33,60381,60382,60384,60386,60388],{"class":35,"line":1950},[33,60383,2449],{"class":163},[33,60385,783],{"class":50},[33,60387,1852],{"class":163},[33,60389,1855],{"class":167},[33,60391,60392,60394,60396,60398,60400,60403,60405,60407,60409,60411,60413,60415],{"class":35,"line":1958},[33,60393,4051],{"class":163},[33,60395,7590],{"class":50},[33,60397,602],{"c
lass":167},[33,60399,4059],{"class":163},[33,60401,60402],{"class":54},"\"WeasyPrint render failed: ",[33,60404,1115],{"class":50},[33,60406,6565],{"class":167},[33,60408,1121],{"class":50},[33,60410,274],{"class":54},[33,60412,1649],{"class":167},[33,60414,190],{"class":163},[33,60416,20843],{"class":167},[33,60418,60419],{"class":35,"line":4904},[33,60420,92],{"emptyLinePlaceholder":91},[33,60422,60423,60426,60429,60431,60434],{"class":35,"line":4909},[33,60424,60425],{"class":167},"render_weasyprint(rows, ",[33,60427,60428],{"class":54},"\"Q3 2026\"",[33,60430,409],{"class":167},[33,60432,60433],{"class":54},"\"reports\u002Fsales_weasyprint.pdf\"",[33,60435,371],{"class":167},[18,60437,60439],{"id":60438},"step-3-reportlab-canvas-level-control","Step 3 — ReportLab: canvas-level control",[14,60441,60442,60443,2012,60445,60447,60448,37025,60450,60452,60453,3035],{},"Use ReportLab when you need exact coordinate placement, vector graphics, or custom fonts for symbols like ",[30,60444,53873],{},[30,60446,57544],{},". 
If you hit garbled boxes or a ",[30,60449,53869],{},[940,60451,28608],{"href":28607}," — the short answer is to register a TrueType font with ",[30,60454,59364],{},[23,60456,60458],{"className":126,"code":60457,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom pathlib import Path\nfrom reportlab.lib.pagesizes import A4\nfrom reportlab.lib.units import mm\nfrom reportlab.lib import colors\nfrom reportlab.platypus import (\n    SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer\n)\nfrom reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle\nfrom reportlab.lib.enums import TA_RIGHT\n\nOUT = Path(\"reports\u002Fsales_reportlab.pdf\")\n\ndef render_reportlab(rows: list[dict], period: str, out: Path) -> None:\n    out.parent.mkdir(parents=True, exist_ok=True)\n    styles = getSampleStyleSheet()\n    right_style = ParagraphStyle(\"right\", parent=styles[\"Normal\"], alignment=TA_RIGHT)\n\n    doc = SimpleDocTemplate(\n        str(out), pagesize=A4,\n        leftMargin=15*mm, rightMargin=15*mm,\n        topMargin=20*mm, bottomMargin=20*mm,\n    )\n\n    def _header_footer(canvas, doc):\n        canvas.saveState()\n        canvas.setFont(\"Helvetica\", 8)\n        canvas.setFillColor(colors.HexColor(\"#475569\"))\n        canvas.drawString(15*mm, A4[1] - 12*mm, f\"Sales Report — {period}\")\n        canvas.drawRightString(A4[0] - 15*mm, 10*mm, f\"Page {doc.page}\")\n        canvas.restoreState()\n\n    header = [[\"Customer\", \"Region\", \"Revenue\", \"Costs\", \"Profit\", \"Margin %\"]]\n    data_rows = [\n        [r[\"customer\"], r[\"region\"],\n         f\"${r['revenue']:,.0f}\", f\"${r['costs']:,.0f}\",\n         f\"${r['profit']:,.0f}\", f\"{r['margin']}%\"]\n        for r in rows\n    ]\n    table_data = header + data_rows\n\n    col_widths = [50*mm, 30*mm, 28*mm, 28*mm, 28*mm, 22*mm]\n    tbl = Table(table_data, colWidths=col_widths, repeatRows=1)\n    tbl.setStyle(TableStyle([\n        (\"BACKGROUND\", (0, 0), (-1, 0), 
colors.HexColor(\"#2563eb\")),\n        (\"TEXTCOLOR\", (0, 0), (-1, 0), colors.white),\n        (\"FONTNAME\", (0, 0), (-1, 0), \"Helvetica-Bold\"),\n        (\"FONTSIZE\", (0, 0), (-1, -1), 9),\n        (\"ROWBACKGROUNDS\", (0, 1), (-1, -1), [colors.white, colors.HexColor(\"#f6f8fb\")]),\n        (\"GRID\", (0, 0), (-1, -1), 0.5, colors.HexColor(\"#e2e8f0\")),\n        (\"ALIGN\", (2, 0), (-1, -1), \"RIGHT\"),\n        (\"ROWBACKGROUNDS\", (0, 1), (-1, -1), [colors.white, colors.HexColor(\"#f6f8fb\")]),\n    ]))\n\n    story = [\n        Paragraph(f\"Sales Performance — {period}\", styles[\"h1\"]),\n        Spacer(1, 6*mm),\n        tbl,\n    ]\n    try:\n        doc.build(story, onFirstPage=_header_footer, onLaterPages=_header_footer)\n        print(f\"Written: {out}\")\n    except Exception as exc:\n        raise RuntimeError(f\"ReportLab build failed: {exc}\") from exc\n\nrender_reportlab(rows, \"Q3 2026\", OUT)\n",[30,60459,60460,60464,60474,60484,60494,60504,60514,60519,60523,60533,60544,60548,60561,60565,60586,60607,60615,60647,60651,60659,60671,60693,60715,60719,60723,60731,60735,60747,60756,60794,60830,60834,60838,60874,60883,60897,60941,60982,60992,60996,61009,61013,61058,61080,61084,61115,61141,61171,61203,61237,61274,61306,61338,61342,61346,61354,61377,61391,61395,61399,61405,61421,61441,61451,61478,61482],{"__ignoreMap":28},[33,60461,60462],{"class":35,"line":36},[33,60463,20289],{"class":39},[33,60465,60466,60468,60470,60472],{"class":35,"line":43},[33,60467,190],{"class":163},[33,60469,193],{"class":167},[33,60471,164],{"class":163},[33,60473,198],{"class":167},[33,60475,60476,60478,60480,60482],{"class":35,"line":61},[33,60477,190],{"class":163},[33,60479,19044],{"class":167},[33,60481,164],{"class":163},[33,60483,19049],{"class":167},[33,60485,60486,60488,60490,60492],{"class":35,"line":73},[33,60487,190],{"class":163},[33,60489,19080],{"class":167},[33,60491,164],{"class":163},[33,60493,55290],{"class":167},[33,60495,60496,60498,60500,60502],{"cla
ss":35,"line":88},[33,60497,190],{"class":163},[33,60499,19056],{"class":167},[33,60501,164],{"class":163},[33,60503,19061],{"class":167},[33,60505,60506,60508,60510,60512],{"class":35,"line":95},[33,60507,190],{"class":163},[33,60509,19092],{"class":167},[33,60511,164],{"class":163},[33,60513,1415],{"class":167},[33,60515,60516],{"class":35,"line":101},[33,60517,60518],{"class":167},"    SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer\n",[33,60520,60521],{"class":35,"line":171},[33,60522,221],{"class":167},[33,60524,60525,60527,60529,60531],{"class":35,"line":179},[33,60526,190],{"class":163},[33,60528,19068],{"class":167},[33,60530,164],{"class":163},[33,60532,19073],{"class":167},[33,60534,60535,60537,60539,60541],{"class":35,"line":187},[33,60536,190],{"class":163},[33,60538,55336],{"class":167},[33,60540,164],{"class":163},[33,60542,60543],{"class":50}," TA_RIGHT\n",[33,60545,60546],{"class":35,"line":201},[33,60547,92],{"emptyLinePlaceholder":91},[33,60549,60550,60552,60554,60556,60559],{"class":35,"line":206},[33,60551,57716],{"class":50},[33,60553,212],{"class":163},[33,60555,215],{"class":167},[33,60557,60558],{"class":54},"\"reports\u002Fsales_reportlab.pdf\"",[33,60560,221],{"class":167},[33,60562,60563],{"class":35,"line":224},[33,60564,92],{"emptyLinePlaceholder":91},[33,60566,60567,60569,60572,60574,60576,60578,60580,60582,60584],{"class":35,"line":229},[33,60568,562],{"class":163},[33,60570,60571],{"class":46}," render_reportlab",[33,60573,60276],{"class":167},[33,60575,37100],{"class":50},[33,60577,60281],{"class":167},[33,60579,1053],{"class":50},[33,60581,60286],{"class":167},[33,60583,571],{"class":50},[33,60585,574],{"class":167},[33,60587,60588,60591,60593,60595,60597,60599,60601,60603,60605],{"class":35,"line":235},[33,60589,60590],{"class":167},"    
out.parent.mkdir(",[33,60592,869],{"class":238},[33,60594,242],{"class":163},[33,60596,855],{"class":50},[33,60598,365],{"class":167},[33,60600,878],{"class":238},[33,60602,242],{"class":163},[33,60604,855],{"class":50},[33,60606,221],{"class":167},[33,60608,60609,60611,60613],{"class":35,"line":250},[33,60610,19255],{"class":167},[33,60612,242],{"class":163},[33,60614,19260],{"class":167},[33,60616,60617,60620,60622,60624,60627,60629,60631,60633,60635,60637,60639,60641,60643,60645],{"class":35,"line":266},[33,60618,60619],{"class":167},"    right_style ",[33,60621,242],{"class":163},[33,60623,55487],{"class":167},[33,60625,60626],{"class":54},"\"right\"",[33,60628,365],{"class":167},[33,60630,19280],{"class":238},[33,60632,242],{"class":163},[33,60634,19285],{"class":167},[33,60636,19348],{"class":54},[33,60638,8314],{"class":167},[33,60640,46396],{"class":238},[33,60642,242],{"class":163},[33,60644,55509],{"class":50},[33,60646,221],{"class":167},[33,60648,60649],{"class":35,"line":290},[33,60650,92],{"emptyLinePlaceholder":91},[33,60652,60653,60655,60657],{"class":35,"line":295},[33,60654,18224],{"class":167},[33,60656,242],{"class":163},[33,60658,20082],{"class":167},[33,60660,60661,60663,60665,60667,60669],{"class":35,"line":300},[33,60662,35596],{"class":50},[33,60664,55530],{"class":167},[33,60666,20091],{"class":238},[33,60668,242],{"class":163},[33,60670,20096],{"class":167},[33,60672,60673,60675,60677,60679,60681,60683,60685,60687,60689,60691],{"class":35,"line":317},[33,60674,55541],{"class":238},[33,60676,242],{"class":163},[33,60678,1646],{"class":50},[33,60680,1769],{"class":163},[33,60682,55550],{"class":167},[33,60684,20112],{"class":238},[33,60686,242],{"class":163},[33,60688,1646],{"class":50},[33,60690,1769],{"class":163},[33,60692,55561],{"class":167},[33,60694,60695,60697,60699,60701,60703,60705,60707,60709,60711,60713],{"class":35,"line":332},[33,60696,55566],{"class":238},[33,60698,242],{"class":163},[33,60700,2587],{"class":50},[33,60702,1769
],{"class":163},[33,60704,55550],{"class":167},[33,60706,20137],{"class":238},[33,60708,242],{"class":163},[33,60710,2587],{"class":50},[33,60712,1769],{"class":163},[33,60714,55561],{"class":167},[33,60716,60717],{"class":35,"line":347},[33,60718,1202],{"class":167},[33,60720,60721],{"class":35,"line":374},[33,60722,92],{"emptyLinePlaceholder":91},[33,60724,60725,60727,60729],{"class":35,"line":397},[33,60726,1742],{"class":163},[33,60728,55599],{"class":46},[33,60730,55602],{"class":167},[33,60732,60733],{"class":35,"line":653},[33,60734,55607],{"class":167},[33,60736,60737,60739,60741,60743,60745],{"class":35,"line":667},[33,60738,55612],{"class":167},[33,60740,28546],{"class":54},[33,60742,365],{"class":167},[33,60744,2591],{"class":50},[33,60746,221],{"class":167},[33,60748,60749,60752,60754],{"class":35,"line":675},[33,60750,60751],{"class":167},"        canvas.setFillColor(colors.HexColor(",[33,60753,55376],{"class":54},[33,60755,371],{"class":167},[33,60757,60758,60760,60762,60764,60766,60768,60770,60772,60775,60777,60779,60781,60784,60786,60788,60790,60792],{"class":35,"line":689},[33,60759,55634],{"class":167},[33,60761,1646],{"class":50},[33,60763,1769],{"class":163},[33,60765,55641],{"class":167},[33,60767,734],{"class":50},[33,60769,763],{"class":167},[33,60771,4126],{"class":163},[33,60773,60774],{"class":50}," 12",[33,60776,1769],{"class":163},[33,60778,55550],{"class":167},[33,60780,4059],{"class":163},[33,60782,60783],{"class":54},"\"Sales Report — 
",[33,60785,1115],{"class":50},[33,60787,60334],{"class":167},[33,60789,1121],{"class":50},[33,60791,274],{"class":54},[33,60793,221],{"class":167},[33,60795,60796,60798,60800,60802,60804,60806,60808,60810,60812,60814,60816,60818,60820,60822,60824,60826,60828],{"class":35,"line":703},[33,60797,55664],{"class":167},[33,60799,748],{"class":50},[33,60801,763],{"class":167},[33,60803,4126],{"class":163},[33,60805,47416],{"class":50},[33,60807,1769],{"class":163},[33,60809,55550],{"class":167},[33,60811,3545],{"class":50},[33,60813,1769],{"class":163},[33,60815,55550],{"class":167},[33,60817,4059],{"class":163},[33,60819,55719],{"class":54},[33,60821,1115],{"class":50},[33,60823,55724],{"class":167},[33,60825,1121],{"class":50},[33,60827,274],{"class":54},[33,60829,221],{"class":167},[33,60831,60832],{"class":35,"line":714},[33,60833,55735],{"class":167},[33,60835,60836],{"class":35,"line":723},[33,60837,92],{"emptyLinePlaceholder":91},[33,60839,60840,60842,60844,60846,60849,60851,60853,60855,60857,60859,60862,60864,60867,60869,60872],{"class":35,"line":754},[33,60841,13245],{"class":167},[33,60843,242],{"class":163},[33,60845,20349],{"class":167},[33,60847,60848],{"class":54},"\"Customer\"",[33,60850,365],{"class":167},[33,60852,11865],{"class":54},[33,60854,365],{"class":167},[33,60856,12925],{"class":54},[33,60858,365],{"class":167},[33,60860,60861],{"class":54},"\"Costs\"",[33,60863,365],{"class":167},[33,60865,60866],{"class":54},"\"Profit\"",[33,60868,365],{"class":167},[33,60870,60871],{"class":54},"\"Margin %\"",[33,60873,44162],{"class":167},[33,60875,60876,60879,60881],{"class":35,"line":771},[33,60877,60878],{"class":167},"    data_rows ",[33,60880,242],{"class":163},[33,60882,7473],{"class":167},[33,60884,60885,60888,60890,60893,60895],{"class":35,"line":777},[33,60886,60887],{"class":167},"        [r[",[33,60889,59673],{"class":54},[33,60891,60892],{"class":167},"], 
r[",[33,60894,16649],{"class":54},[33,60896,8935],{"class":167},[33,60898,60899,60901,60903,60905,60908,60910,60912,60914,60916,60918,60920,60922,60924,60926,60928,60931,60933,60935,60937,60939],{"class":35,"line":788},[33,60900,55804],{"class":163},[33,60902,18820],{"class":54},[33,60904,1115],{"class":50},[33,60906,60907],{"class":167},"r[",[33,60909,18828],{"class":54},[33,60911,9546],{"class":167},[33,60913,18410],{"class":163},[33,60915,1121],{"class":50},[33,60917,274],{"class":54},[33,60919,365],{"class":167},[33,60921,4059],{"class":163},[33,60923,18820],{"class":54},[33,60925,1115],{"class":50},[33,60927,60907],{"class":167},[33,60929,60930],{"class":54},"'costs'",[33,60932,9546],{"class":167},[33,60934,18410],{"class":163},[33,60936,1121],{"class":50},[33,60938,274],{"class":54},[33,60940,247],{"class":167},[33,60942,60943,60945,60947,60949,60951,60954,60956,60958,60960,60962,60964,60966,60968,60970,60972,60974,60976,60978,60980],{"class":35,"line":804},[33,60944,55804],{"class":163},[33,60946,18820],{"class":54},[33,60948,1115],{"class":50},[33,60950,60907],{"class":167},[33,60952,60953],{"class":54},"'profit'",[33,60955,9546],{"class":167},[33,60957,18410],{"class":163},[33,60959,1121],{"class":50},[33,60961,274],{"class":54},[33,60963,365],{"class":167},[33,60965,4059],{"class":163},[33,60967,274],{"class":54},[33,60969,1115],{"class":50},[33,60971,60907],{"class":167},[33,60973,18857],{"class":54},[33,60975,9546],{"class":167},[33,60977,1121],{"class":50},[33,60979,38058],{"class":54},[33,60981,9202],{"class":167},[33,60983,60984,60986,60988,60990],{"class":35,"line":809},[33,60985,5973],{"class":163},[33,60987,45721],{"class":167},[33,60989,662],{"class":163},[33,60991,44355],{"class":167},[33,60993,60994],{"class":35,"line":819},[33,60995,19559],{"class":167},[33,60997,60998,61000,61002,61004,61006],{"class":35,"line":829},[33,60999,19605],{"class":167},[33,61001,242],{"class":163},[33,61003,17788],{"class":167},[33,61005,1811],{"class":163},[33,6100
7,61008],{"class":167}," data_rows\n",[33,61010,61011],{"class":35,"line":834},[33,61012,92],{"emptyLinePlaceholder":91},[33,61014,61015,61017,61019,61021,61023,61025,61027,61029,61031,61033,61035,61037,61039,61041,61043,61045,61047,61049,61051,61053,61055],{"class":35,"line":839},[33,61016,19746],{"class":167},[33,61018,242],{"class":163},[33,61020,9178],{"class":167},[33,61022,2680],{"class":50},[33,61024,1769],{"class":163},[33,61026,55550],{"class":167},[33,61028,1543],{"class":50},[33,61030,1769],{"class":163},[33,61032,55550],{"class":167},[33,61034,11104],{"class":50},[33,61036,1769],{"class":163},[33,61038,55550],{"class":167},[33,61040,11104],{"class":50},[33,61042,1769],{"class":163},[33,61044,55550],{"class":167},[33,61046,11104],{"class":50},[33,61048,1769],{"class":163},[33,61050,55550],{"class":167},[33,61052,11103],{"class":50},[33,61054,1769],{"class":163},[33,61056,61057],{"class":167},"mm]\n",[33,61059,61060,61062,61064,61066,61068,61070,61072,61074,61076,61078],{"class":35,"line":860},[33,61061,14864],{"class":167},[33,61063,242],{"class":163},[33,61065,19792],{"class":167},[33,61067,19795],{"class":238},[33,61069,242],{"class":163},[33,61071,19800],{"class":167},[33,61073,19803],{"class":238},[33,61075,242],{"class":163},[33,61077,734],{"class":50},[33,61079,221],{"class":167},[33,61081,61082],{"class":35,"line":887},[33,61083,19814],{"class":167},[33,61085,61086,61088,61090,61092,61094,61096,61098,61100,61102,61104,61106,61108,61111,61113],{"class":35,"line":907},[33,61087,19819],{"class":167},[33,61089,19822],{"class":54},[33,61091,19953],{"class":167},[33,61093,748],{"class":50},[33,61095,365],{"class":167},[33,61097,748],{"class":50},[33,61099,19834],{"class":167},[33,61101,4126],{"class":163},[33,61103,734],{"class":50},[33,61105,365],{"class":167},[33,61107,748],{"class":50},[33,61109,61110],{"class":167},"), 
colors.HexColor(",[33,61112,55362],{"class":54},[33,61114,1571],{"class":167},[33,61116,61117,61119,61121,61123,61125,61127,61129,61131,61133,61135,61137,61139],{"class":35,"line":1826},[33,61118,19819],{"class":167},[33,61120,19855],{"class":54},[33,61122,19953],{"class":167},[33,61124,748],{"class":50},[33,61126,365],{"class":167},[33,61128,748],{"class":50},[33,61130,19834],{"class":167},[33,61132,4126],{"class":163},[33,61134,734],{"class":50},[33,61136,365],{"class":167},[33,61138,748],{"class":50},[33,61140,19877],{"class":167},[33,61142,61143,61145,61147,61149,61151,61153,61155,61157,61159,61161,61163,61165,61167,61169],{"class":35,"line":1844},[33,61144,19819],{"class":167},[33,61146,19884],{"class":54},[33,61148,19953],{"class":167},[33,61150,748],{"class":50},[33,61152,365],{"class":167},[33,61154,748],{"class":50},[33,61156,19834],{"class":167},[33,61158,4126],{"class":163},[33,61160,734],{"class":50},[33,61162,365],{"class":167},[33,61164,748],{"class":50},[33,61166,18525],{"class":167},[33,61168,19908],{"class":54},[33,61170,1506],{"class":167},[33,61172,61173,61175,61177,61179,61181,61183,61185,61187,61189,61191,61193,61195,61197,61199,61201],{"class":35,"line":1858},[33,61174,19819],{"class":167},[33,61176,19917],{"class":54},[33,61178,19953],{"class":167},[33,61180,748],{"class":50},[33,61182,365],{"class":167},[33,61184,748],{"class":50},[33,61186,19834],{"class":167},[33,61188,4126],{"class":163},[33,61190,734],{"class":50},[33,61192,365],{"class":167},[33,61194,4126],{"class":163},[33,61196,734],{"class":50},[33,61198,18525],{"class":167},[33,61200,2577],{"class":50},[33,61202,1506],{"class":167},[33,61204,61205,61207,61209,61211,61213,61215,61217,61219,61221,61223,61225,61227,61229,61232,61234],{"class":35,"line":1871},[33,61206,19819],{"class":167},[33,61208,19950],{"class":54},[33,61210,19953],{"class":167},[33,61212,748],{"class":50},[33,61214,365],{"class":167},[33,61216,734],{"class":50},[33,61218,19834],{"class":167},[33,61220,4126],{"class
":163},[33,61222,734],{"class":50},[33,61224,365],{"class":167},[33,61226,4126],{"class":163},[33,61228,734],{"class":50},[33,61230,61231],{"class":167},"), [colors.white, colors.HexColor(",[33,61233,55403],{"class":54},[33,61235,61236],{"class":167},")]),\n",[33,61238,61239,61241,61243,61245,61247,61249,61251,61253,61255,61257,61259,61261,61263,61265,61267,61270,61272],{"class":35,"line":1877},[33,61240,19819],{"class":167},[33,61242,19985],{"class":54},[33,61244,19953],{"class":167},[33,61246,748],{"class":50},[33,61248,365],{"class":167},[33,61250,748],{"class":50},[33,61252,19834],{"class":167},[33,61254,4126],{"class":163},[33,61256,734],{"class":50},[33,61258,365],{"class":167},[33,61260,4126],{"class":163},[33,61262,734],{"class":50},[33,61264,18525],{"class":167},[33,61266,20011],{"class":50},[33,61268,61269],{"class":167},", colors.HexColor(",[33,61271,55389],{"class":54},[33,61273,1571],{"class":167},[33,61275,61276,61278,61280,61282,61284,61286,61288,61290,61292,61294,61296,61298,61300,61302,61304],{"class":35,"line":1883},[33,61277,19819],{"class":167},[33,61279,20024],{"class":54},[33,61281,19953],{"class":167},[33,61283,1533],{"class":50},[33,61285,365],{"class":167},[33,61287,748],{"class":50},[33,61289,19834],{"class":167},[33,61291,4126],{"class":163},[33,61293,734],{"class":50},[33,61295,365],{"class":167},[33,61297,4126],{"class":163},[33,61299,734],{"class":50},[33,61301,18525],{"class":167},[33,61303,20050],{"class":54},[33,61305,1506],{"class":167},[33,61307,61308,61310,61312,61314,61316,61318,61320,61322,61324,61326,61328,61330,61332,61334,61336],{"class":35,"line":1915},[33,61309,19819],{"class":167},[33,61311,19950],{"class":54},[33,61313,19953],{"class":167},[33,61315,748],{"class":50},[33,61317,365],{"class":167},[33,61319,734],{"class":50},[33,61321,19834],{"class":167},[33,61323,4126],{"class":163},[33,61325,734],{"class":50},[33,61327,365],{"class":167},[33,61329,4126],{"class":163},[33,61331,734],{"class":50},[33,61333,61231],{"class":
167},[33,61335,55403],{"class":54},[33,61337,61236],{"class":167},[33,61339,61340],{"class":35,"line":1926},[33,61341,20057],{"class":167},[33,61343,61344],{"class":35,"line":1932},[33,61345,92],{"emptyLinePlaceholder":91},[33,61347,61348,61350,61352],{"class":35,"line":1938},[33,61349,19444],{"class":167},[33,61351,242],{"class":163},[33,61353,7473],{"class":167},[33,61355,61356,61358,61360,61363,61365,61367,61369,61371,61373,61375],{"class":35,"line":1950},[33,61357,19453],{"class":167},[33,61359,4059],{"class":163},[33,61361,61362],{"class":54},"\"Sales Performance — ",[33,61364,1115],{"class":50},[33,61366,60334],{"class":167},[33,61368,1121],{"class":50},[33,61370,274],{"class":54},[33,61372,56522],{"class":167},[33,61374,56525],{"class":54},[33,61376,12871],{"class":167},[33,61378,61379,61381,61383,61385,61387,61389],{"class":35,"line":1958},[33,61380,19542],{"class":167},[33,61382,734],{"class":50},[33,61384,365],{"class":167},[33,61386,2681],{"class":50},[33,61388,1769],{"class":163},[33,61390,56542],{"class":167},[33,61392,61393],{"class":35,"line":4904},[33,61394,56643],{"class":167},[33,61396,61397],{"class":35,"line":4909},[33,61398,19559],{"class":167},[33,61400,61401,61403],{"class":35,"line":4915},[33,61402,2424],{"class":163},[33,61404,574],{"class":167},[33,61406,61407,61409,61411,61413,61415,61417,61419],{"class":35,"line":4925},[33,61408,56750],{"class":167},[33,61410,56753],{"class":238},[33,61412,242],{"class":163},[33,61414,56758],{"class":167},[33,61416,56761],{"class":238},[33,61418,242],{"class":163},[33,61420,56766],{"class":167},[33,61422,61423,61425,61427,61429,61431,61433,61435,61437,61439],{"class":35,"line":4935},[33,61424,9414],{"class":50},[33,61426,602],{"class":167},[33,61428,4059],{"class":163},[33,61430,58214],{"class":54},[33,61432,1115],{"class":50},[33,61434,18014],{"class":167},[33,61436,1121],{"class":50},[33,61438,274],{"class":54},[33,61440,221],{"class":167},[33,61442,61443,61445,61447,61449],{"class":35,"line":4941},[33,
61444,2449],{"class":163},[33,61446,783],{"class":50},[33,61448,1852],{"class":163},[33,61450,1855],{"class":167},[33,61452,61453,61455,61457,61459,61461,61464,61466,61468,61470,61472,61474,61476],{"class":35,"line":4950},[33,61454,4051],{"class":163},[33,61456,7590],{"class":50},[33,61458,602],{"class":167},[33,61460,4059],{"class":163},[33,61462,61463],{"class":54},"\"ReportLab build failed: ",[33,61465,1115],{"class":50},[33,61467,6565],{"class":167},[33,61469,1121],{"class":50},[33,61471,274],{"class":54},[33,61473,1649],{"class":167},[33,61475,190],{"class":163},[33,61477,20843],{"class":167},[33,61479,61480],{"class":35,"line":4960},[33,61481,92],{"emptyLinePlaceholder":91},[33,61483,61484,61487,61489,61491,61493],{"class":35,"line":4965},[33,61485,61486],{"class":167},"render_reportlab(rows, ",[33,61488,60428],{"class":54},[33,61490,365],{"class":167},[33,61492,57716],{"class":50},[33,61494,221],{"class":167},[18,61496,61498],{"id":61497},"step-4-embed-a-matplotlib-chart","Step 4 — Embed a Matplotlib chart",[14,61500,61501,61502,61505,61506,61509,61510,61513],{},"Charts go in as ",[30,61503,61504],{},"BytesIO"," images. 
In the ReportLab path use ",[30,61507,61508],{},"Image","; in the WeasyPrint path encode as base64 and inject into an ",[30,61511,61512],{},"\u003Cimg>"," tag.",[23,61515,61517],{"className":126,"code":61516,"language":47,"meta":28,"style":28},"# pip install reportlab matplotlib\nimport io\nimport matplotlib\nmatplotlib.use(\"Agg\")\nimport matplotlib.pyplot as plt\nfrom reportlab.platypus import Image\n\ndef bar_chart_image(rows: list[dict], width_pts: float = 400, height_pts: float = 180):\n    \"\"\"Return a ReportLab Image flowable containing a revenue bar chart.\"\"\"\n    labels = [r[\"customer\"] for r in rows]\n    revenue = [r[\"revenue\"] for r in rows]\n    costs = [r[\"costs\"] for r in rows]\n\n    fig, ax = plt.subplots(figsize=(width_pts \u002F 72, height_pts \u002F 72))\n    x = range(len(labels))\n    ax.bar([i - 0.2 for i in x], revenue, width=0.35, label=\"Revenue\", color=\"#2563eb\")\n    ax.bar([i + 0.2 for i in x], costs, width=0.35, label=\"Costs\", color=\"#dbeafe\")\n    ax.set_xticks(list(x))\n    ax.set_xticklabels(labels, fontsize=8)\n    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda v, _: f\"${v\u002F1000:.0f}k\"))\n    ax.legend(fontsize=8)\n    ax.spines[[\"top\", \"right\"]].set_visible(False)\n    plt.tight_layout()\n\n    buf = io.BytesIO()\n    fig.savefig(buf, format=\"png\", dpi=150)\n    plt.close(fig)\n    buf.seek(0)\n    return Image(buf, width=width_pts, height=height_pts)\n",[30,61518,61519,61524,61530,61536,61546,61558,61568,61572,61605,61610,61632,61653,61674,61678,61710,61726,61770,61812,61822,61836,61869,61882,61900,61905,61909,61919,61942,61947,61956],{"__ignoreMap":28},[33,61520,61521],{"class":35,"line":36},[33,61522,61523],{"class":39},"# pip install reportlab 
matplotlib\n",[33,61525,61526,61528],{"class":35,"line":43},[33,61527,164],{"class":163},[33,61529,60058],{"class":167},[33,61531,61532,61534],{"class":35,"line":61},[33,61533,164],{"class":163},[33,61535,59488],{"class":167},[33,61537,61538,61541,61544],{"class":35,"line":73},[33,61539,61540],{"class":167},"matplotlib.use(",[33,61542,61543],{"class":54},"\"Agg\"",[33,61545,221],{"class":167},[33,61547,61548,61550,61553,61555],{"class":35,"line":88},[33,61549,164],{"class":163},[33,61551,61552],{"class":167}," matplotlib.pyplot ",[33,61554,495],{"class":163},[33,61556,61557],{"class":167}," plt\n",[33,61559,61560,61562,61564,61566],{"class":35,"line":95},[33,61561,190],{"class":163},[33,61563,19092],{"class":167},[33,61565,164],{"class":163},[33,61567,47171],{"class":167},[33,61569,61570],{"class":35,"line":101},[33,61571,92],{"emptyLinePlaceholder":91},[33,61573,61574,61576,61579,61581,61583,61586,61588,61590,61593,61596,61598,61600,61603],{"class":35,"line":171},[33,61575,562],{"class":163},[33,61577,61578],{"class":46}," bar_chart_image",[33,61580,60276],{"class":167},[33,61582,37100],{"class":50},[33,61584,61585],{"class":167},"], width_pts: ",[33,61587,1720],{"class":50},[33,61589,212],{"class":163},[33,61591,61592],{"class":50}," 400",[33,61594,61595],{"class":167},", height_pts: ",[33,61597,1720],{"class":50},[33,61599,212],{"class":163},[33,61601,61602],{"class":50}," 180",[33,61604,1737],{"class":167},[33,61606,61607],{"class":35,"line":179},[33,61608,61609],{"class":54},"    \"\"\"Return a ReportLab Image flowable containing a revenue bar chart.\"\"\"\n",[33,61611,61612,61615,61617,61620,61622,61624,61626,61628,61630],{"class":35,"line":187},[33,61613,61614],{"class":167},"    labels ",[33,61616,242],{"class":163},[33,61618,61619],{"class":167}," 
[r[",[33,61621,59673],{"class":54},[33,61623,763],{"class":167},[33,61625,6124],{"class":163},[33,61627,45721],{"class":167},[33,61629,662],{"class":163},[33,61631,47749],{"class":167},[33,61633,61634,61637,61639,61641,61643,61645,61647,61649,61651],{"class":35,"line":201},[33,61635,61636],{"class":167},"    revenue ",[33,61638,242],{"class":163},[33,61640,61619],{"class":167},[33,61642,16465],{"class":54},[33,61644,763],{"class":167},[33,61646,6124],{"class":163},[33,61648,45721],{"class":167},[33,61650,662],{"class":163},[33,61652,47749],{"class":167},[33,61654,61655,61658,61660,61662,61664,61666,61668,61670,61672],{"class":35,"line":206},[33,61656,61657],{"class":167},"    costs ",[33,61659,242],{"class":163},[33,61661,61619],{"class":167},[33,61663,59686],{"class":54},[33,61665,763],{"class":167},[33,61667,6124],{"class":163},[33,61669,45721],{"class":167},[33,61671,662],{"class":163},[33,61673,47749],{"class":167},[33,61675,61676],{"class":35,"line":224},[33,61677,92],{"emptyLinePlaceholder":91},[33,61679,61680,61683,61685,61688,61691,61693,61696,61698,61701,61704,61706,61708],{"class":35,"line":229},[33,61681,61682],{"class":167},"    fig, ax ",[33,61684,242],{"class":163},[33,61686,61687],{"class":167}," plt.subplots(",[33,61689,61690],{"class":238},"figsize",[33,61692,242],{"class":163},[33,61694,61695],{"class":167},"(width_pts ",[33,61697,1351],{"class":163},[33,61699,61700],{"class":50}," 72",[33,61702,61703],{"class":167},", height_pts ",[33,61705,1351],{"class":163},[33,61707,61700],{"class":50},[33,61709,371],{"class":167},[33,61711,61712,61715,61717,61719,61721,61723],{"class":35,"line":235},[33,61713,61714],{"class":167},"    x 
",[33,61716,242],{"class":163},[33,61718,1801],{"class":50},[33,61720,602],{"class":167},[33,61722,928],{"class":50},[33,61724,61725],{"class":167},"(labels))\n",[33,61727,61728,61731,61733,61735,61737,61739,61741,61744,61746,61748,61751,61753,61756,61758,61760,61762,61764,61766,61768],{"class":35,"line":250},[33,61729,61730],{"class":167},"    ax.bar([i ",[33,61732,4126],{"class":163},[33,61734,46243],{"class":50},[33,61736,14766],{"class":163},[33,61738,47269],{"class":167},[33,61740,662],{"class":163},[33,61742,61743],{"class":167}," x], revenue, ",[33,61745,56684],{"class":238},[33,61747,242],{"class":163},[33,61749,61750],{"class":50},"0.35",[33,61752,365],{"class":167},[33,61754,61755],{"class":238},"label",[33,61757,242],{"class":163},[33,61759,12925],{"class":54},[33,61761,365],{"class":167},[33,61763,17245],{"class":238},[33,61765,242],{"class":163},[33,61767,55362],{"class":54},[33,61769,221],{"class":167},[33,61771,61772,61774,61776,61778,61780,61782,61784,61787,61789,61791,61793,61795,61797,61799,61801,61803,61805,61807,61810],{"class":35,"line":266},[33,61773,61730],{"class":167},[33,61775,1811],{"class":163},[33,61777,46243],{"class":50},[33,61779,14766],{"class":163},[33,61781,47269],{"class":167},[33,61783,662],{"class":163},[33,61785,61786],{"class":167}," x], costs, ",[33,61788,56684],{"class":238},[33,61790,242],{"class":163},[33,61792,61750],{"class":50},[33,61794,365],{"class":167},[33,61796,61755],{"class":238},[33,61798,242],{"class":163},[33,61800,60861],{"class":54},[33,61802,365],{"class":167},[33,61804,17245],{"class":238},[33,61806,242],{"class":163},[33,61808,61809],{"class":54},"\"#dbeafe\"",[33,61811,221],{"class":167},[33,61813,61814,61817,61819],{"class":35,"line":290},[33,61815,61816],{"class":167},"    ax.set_xticks(",[33,61818,25066],{"class":50},[33,61820,61821],{"class":167},"(x))\n",[33,61823,61824,61827,61830,61832,61834],{"class":35,"line":295},[33,61825,61826],{"class":167},"    ax.set_xticklabels(labels, 
",[33,61828,61829],{"class":238},"fontsize",[33,61831,242],{"class":163},[33,61833,2591],{"class":50},[33,61835,221],{"class":167},[33,61837,61838,61841,61843,61846,61848,61850,61852,61855,61857,61859,61862,61864,61867],{"class":35,"line":300},[33,61839,61840],{"class":167},"    ax.yaxis.set_major_formatter(plt.FuncFormatter(",[33,61842,39839],{"class":163},[33,61844,61845],{"class":167}," v, _: ",[33,61847,4059],{"class":163},[33,61849,18820],{"class":54},[33,61851,1115],{"class":50},[33,61853,61854],{"class":167},"v",[33,61856,1351],{"class":163},[33,61858,11821],{"class":50},[33,61860,61861],{"class":163},":.0f",[33,61863,1121],{"class":50},[33,61865,61866],{"class":54},"k\"",[33,61868,371],{"class":167},[33,61870,61871,61874,61876,61878,61880],{"class":35,"line":317},[33,61872,61873],{"class":167},"    ax.legend(",[33,61875,61829],{"class":238},[33,61877,242],{"class":163},[33,61879,2591],{"class":50},[33,61881,221],{"class":167},[33,61883,61884,61887,61889,61891,61893,61896,61898],{"class":35,"line":332},[33,61885,61886],{"class":167},"    ax.spines[[",[33,61888,43412],{"class":54},[33,61890,365],{"class":167},[33,61892,60626],{"class":54},[33,61894,61895],{"class":167},"]].set_visible(",[33,61897,902],{"class":50},[33,61899,221],{"class":167},[33,61901,61902],{"class":35,"line":347},[33,61903,61904],{"class":167},"    plt.tight_layout()\n",[33,61906,61907],{"class":35,"line":374},[33,61908,92],{"emptyLinePlaceholder":91},[33,61910,61911,61914,61916],{"class":35,"line":397},[33,61912,61913],{"class":167},"    buf ",[33,61915,242],{"class":163},[33,61917,61918],{"class":167}," io.BytesIO()\n",[33,61920,61921,61924,61927,61929,61932,61934,61936,61938,61940],{"class":35,"line":653},[33,61922,61923],{"class":167},"    fig.savefig(buf, 
",[33,61925,61926],{"class":238},"format",[33,61928,242],{"class":163},[33,61930,61931],{"class":54},"\"png\"",[33,61933,365],{"class":167},[33,61935,46966],{"class":238},[33,61937,242],{"class":163},[33,61939,2635],{"class":50},[33,61941,221],{"class":167},[33,61943,61944],{"class":35,"line":667},[33,61945,61946],{"class":167},"    plt.close(fig)\n",[33,61948,61949,61952,61954],{"class":35,"line":675},[33,61950,61951],{"class":167},"    buf.seek(",[33,61953,748],{"class":50},[33,61955,221],{"class":167},[33,61957,61958,61960,61963,61965,61967,61970,61973,61975],{"class":35,"line":689},[33,61959,1332],{"class":163},[33,61961,61962],{"class":167}," Image(buf, ",[33,61964,56684],{"class":238},[33,61966,242],{"class":163},[33,61968,61969],{"class":167},"width_pts, ",[33,61971,61972],{"class":238},"height",[33,61974,242],{"class":163},[33,61976,61977],{"class":167},"height_pts)\n",[18,61979,61980],{"id":2708},"Edge cases and variants",[424,61982,61984],{"id":61983},"dynamic-pagination-with-reportlab-platypus","Dynamic pagination with ReportLab Platypus",[14,61986,61987,61989,61990,61993,61994,61997,61998,62001,62002,3035],{},[30,61988,58422],{}," handles page breaks automatically. For tables that span many pages, set ",[30,61991,61992],{},"repeatRows=1"," on the ",[30,61995,61996],{},"Table"," to repeat the header row. If you need to split rows manually (e.g. 
to add a subtotal per page), subclass ",[30,61999,62000],{},"BaseDocTemplate"," and override ",[30,62003,62004],{},"afterPage",[23,62006,62008],{"className":126,"code":62007,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom reportlab.platypus import SimpleDocTemplate, Table\nfrom reportlab.lib.pagesizes import A4\n\n# repeatRows=1 keeps the header on every page\ntbl = Table(data, colWidths=col_widths, repeatRows=1)\n",[30,62009,62010,62014,62025,62035,62039,62044],{"__ignoreMap":28},[33,62011,62012],{"class":35,"line":36},[33,62013,20289],{"class":39},[33,62015,62016,62018,62020,62022],{"class":35,"line":43},[33,62017,190],{"class":163},[33,62019,19092],{"class":167},[33,62021,164],{"class":163},[33,62023,62024],{"class":167}," SimpleDocTemplate, Table\n",[33,62026,62027,62029,62031,62033],{"class":35,"line":61},[33,62028,190],{"class":163},[33,62030,19044],{"class":167},[33,62032,164],{"class":163},[33,62034,19049],{"class":167},[33,62036,62037],{"class":35,"line":73},[33,62038,92],{"emptyLinePlaceholder":91},[33,62040,62041],{"class":35,"line":88},[33,62042,62043],{"class":39},"# repeatRows=1 keeps the header on every page\n",[33,62045,62046,62049,62051,62053,62055,62057,62059,62061,62063,62065],{"class":35,"line":95},[33,62047,62048],{"class":167},"tbl ",[33,62050,242],{"class":163},[33,62052,20372],{"class":167},[33,62054,19795],{"class":238},[33,62056,242],{"class":163},[33,62058,19800],{"class":167},[33,62060,19803],{"class":238},[33,62062,242],{"class":163},[33,62064,734],{"class":50},[33,62066,221],{"class":167},[424,62068,62070],{"id":62069},"multi-section-report-with-cover-page","Multi-section report with cover page",[23,62072,62074],{"className":126,"code":62073,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom reportlab.platypus import SimpleDocTemplate, PageBreak, Paragraph\nfrom reportlab.lib.styles import getSampleStyleSheet\nfrom pathlib import Path\n\nstyles = getSampleStyleSheet()\n\ndef build_multi_section(sections: 
list[dict], out: Path) -> None:\n    \"\"\"sections = [{\"title\": str, \"rows\": [...]}]\"\"\"\n    doc = SimpleDocTemplate(str(out))\n    story = []\n    for i, sec in enumerate(sections):\n        if i > 0:\n            story.append(PageBreak())\n        story.append(Paragraph(sec[\"title\"], styles[\"h1\"]))\n        # ... build table from sec[\"rows\"]\n    try:\n        doc.build(story)\n    except Exception as exc:\n        raise RuntimeError(f\"Build failed: {exc}\") from exc\n",[30,62075,62076,62080,62091,62102,62112,62116,62124,62128,62147,62152,62164,62172,62186,62198,62203,62217,62222,62228,62232,62242],{"__ignoreMap":28},[33,62077,62078],{"class":35,"line":36},[33,62079,20289],{"class":39},[33,62081,62082,62084,62086,62088],{"class":35,"line":43},[33,62083,190],{"class":163},[33,62085,19092],{"class":167},[33,62087,164],{"class":163},[33,62089,62090],{"class":167}," SimpleDocTemplate, PageBreak, Paragraph\n",[33,62092,62093,62095,62097,62099],{"class":35,"line":61},[33,62094,190],{"class":163},[33,62096,19068],{"class":167},[33,62098,164],{"class":163},[33,62100,62101],{"class":167}," getSampleStyleSheet\n",[33,62103,62104,62106,62108,62110],{"class":35,"line":73},[33,62105,190],{"class":163},[33,62107,193],{"class":167},[33,62109,164],{"class":163},[33,62111,198],{"class":167},[33,62113,62114],{"class":35,"line":88},[33,62115,92],{"emptyLinePlaceholder":91},[33,62117,62118,62120,62122],{"class":35,"line":95},[33,62119,58538],{"class":167},[33,62121,242],{"class":163},[33,62123,19260],{"class":167},[33,62125,62126],{"class":35,"line":101},[33,62127,92],{"emptyLinePlaceholder":91},[33,62129,62130,62132,62135,62138,62140,62143,62145],{"class":35,"line":171},[33,62131,562],{"class":163},[33,62133,62134],{"class":46}," build_multi_section",[33,62136,62137],{"class":167},"(sections: list[",[33,62139,37100],{"class":50},[33,62141,62142],{"class":167},"], out: Path) -> 
",[33,62144,571],{"class":50},[33,62146,574],{"class":167},[33,62148,62149],{"class":35,"line":179},[33,62150,62151],{"class":54},"    \"\"\"sections = [{\"title\": str, \"rows\": [...]}]\"\"\"\n",[33,62153,62154,62156,62158,62160,62162],{"class":35,"line":187},[33,62155,18224],{"class":167},[33,62157,242],{"class":163},[33,62159,58648],{"class":167},[33,62161,1053],{"class":50},[33,62163,55133],{"class":167},[33,62165,62166,62168,62170],{"class":35,"line":201},[33,62167,19444],{"class":167},[33,62169,242],{"class":163},[33,62171,589],{"class":167},[33,62173,62174,62176,62179,62181,62183],{"class":35,"line":206},[33,62175,656],{"class":163},[33,62177,62178],{"class":167}," i, sec ",[33,62180,662],{"class":163},[33,62182,7403],{"class":50},[33,62184,62185],{"class":167},"(sections):\n",[33,62187,62188,62190,62192,62194,62196],{"class":35,"line":224},[33,62189,8221],{"class":163},[33,62191,47269],{"class":167},[33,62193,6009],{"class":163},[33,62195,10791],{"class":50},[33,62197,574],{"class":167},[33,62199,62200],{"class":35,"line":229},[33,62201,62202],{"class":167},"            story.append(PageBreak())\n",[33,62204,62205,62208,62211,62213,62215],{"class":35,"line":235},[33,62206,62207],{"class":167},"        story.append(Paragraph(sec[",[33,62209,62210],{"class":54},"\"title\"",[33,62212,56620],{"class":167},[33,62214,56525],{"class":54},[33,62216,7211],{"class":167},[33,62218,62219],{"class":35,"line":250},[33,62220,62221],{"class":39},"        # ... 
build table from sec[\"rows\"]\n",[33,62223,62224,62226],{"class":35,"line":266},[33,62225,2424],{"class":163},[33,62227,574],{"class":167},[33,62229,62230],{"class":35,"line":290},[33,62231,20154],{"class":167},[33,62233,62234,62236,62238,62240],{"class":35,"line":295},[33,62235,2449],{"class":163},[33,62237,783],{"class":50},[33,62239,1852],{"class":163},[33,62241,1855],{"class":167},[33,62243,62244,62246,62248,62250,62252,62254,62256,62258,62260,62262,62264,62266],{"class":35,"line":300},[33,62245,4051],{"class":163},[33,62247,7590],{"class":50},[33,62249,602],{"class":167},[33,62251,4059],{"class":163},[33,62253,58746],{"class":54},[33,62255,1115],{"class":50},[33,62257,6565],{"class":167},[33,62259,1121],{"class":50},[33,62261,274],{"class":54},[33,62263,1649],{"class":167},[33,62265,190],{"class":163},[33,62267,20843],{"class":167},[424,62269,62271],{"id":62270},"weasyprint-with-external-css-file","WeasyPrint with external CSS file",[14,62273,62274],{},"Keep CSS separate for maintainability:",[23,62276,62278],{"className":126,"code":62277,"language":47,"meta":28,"style":28},"# pip install weasyprint jinja2\nfrom weasyprint import HTML, CSS\nfrom pathlib import Path\n\nhtml_str = \"\u003Chtml>...\u003C\u002Fhtml>\"  # rendered Jinja2 output\ncss = CSS(filename=str(Path(\"templates\u002Freport.css\")))\ntry:\n    HTML(string=html_str).write_pdf(\"out.pdf\", stylesheets=[css])\nexcept Exception as exc:\n    raise RuntimeError(f\"Render failed: {exc}\") from 
exc\n",[30,62279,62280,62284,62298,62308,62312,62325,62350,62356,62380,62390],{"__ignoreMap":28},[33,62281,62282],{"class":35,"line":36},[33,62283,20943],{"class":39},[33,62285,62286,62288,62290,62292,62294,62296],{"class":35,"line":43},[33,62287,190],{"class":163},[33,62289,20982],{"class":167},[33,62291,164],{"class":163},[33,62293,60046],{"class":50},[33,62295,365],{"class":167},[33,62297,60051],{"class":50},[33,62299,62300,62302,62304,62306],{"class":35,"line":61},[33,62301,190],{"class":163},[33,62303,193],{"class":167},[33,62305,164],{"class":163},[33,62307,198],{"class":167},[33,62309,62310],{"class":35,"line":73},[33,62311,92],{"emptyLinePlaceholder":91},[33,62313,62314,62317,62319,62322],{"class":35,"line":88},[33,62315,62316],{"class":167},"html_str ",[33,62318,242],{"class":163},[33,62320,62321],{"class":54}," \"\u003Chtml>...\u003C\u002Fhtml>\"",[33,62323,62324],{"class":39},"  # rendered Jinja2 output\n",[33,62326,62327,62330,62332,62335,62338,62340,62342,62345,62348],{"class":35,"line":95},[33,62328,62329],{"class":167},"css ",[33,62331,242],{"class":163},[33,62333,62334],{"class":167}," CSS(",[33,62336,62337],{"class":238},"filename",[33,62339,242],{"class":163},[33,62341,1053],{"class":50},[33,62343,62344],{"class":167},"(Path(",[33,62346,62347],{"class":54},"\"templates\u002Freport.css\"",[33,62349,23269],{"class":167},[33,62351,62352,62354],{"class":35,"line":101},[33,62353,35574],{"class":163},[33,62355,574],{"class":167},[33,62357,62358,62361,62363,62365,62367,62370,62372,62375,62377],{"class":35,"line":171},[33,62359,62360],{"class":167},"    
HTML(",[33,62362,21238],{"class":238},[33,62364,242],{"class":163},[33,62366,21243],{"class":167},[33,62368,62369],{"class":54},"\"out.pdf\"",[33,62371,365],{"class":167},[33,62373,62374],{"class":238},"stylesheets",[33,62376,242],{"class":163},[33,62378,62379],{"class":167},"[css])\n",[33,62381,62382,62384,62386,62388],{"class":35,"line":179},[33,62383,35726],{"class":163},[33,62385,783],{"class":50},[33,62387,1852],{"class":163},[33,62389,1855],{"class":167},[33,62391,62392,62394,62396,62398,62400,62403,62405,62407,62409,62411,62413,62415],{"class":35,"line":187},[33,62393,35742],{"class":163},[33,62395,7590],{"class":50},[33,62397,602],{"class":167},[33,62399,4059],{"class":163},[33,62401,62402],{"class":54},"\"Render failed: ",[33,62404,1115],{"class":50},[33,62406,6565],{"class":167},[33,62408,1121],{"class":50},[33,62410,274],{"class":54},[33,62412,1649],{"class":167},[33,62414,190],{"class":163},[33,62416,20843],{"class":167},[18,62418,52030],{"id":52029},[23,62420,62422],{"className":126,"code":62421,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\ndef validate_pdf(path: Path, min_pages: int = 1) -> None:\n    try:\n        reader = PdfReader(str(path))\n        assert len(reader.pages) >= min_pages, f\"Expected >= {min_pages} pages, got {len(reader.pages)}\"\n        # Spot-check first page text is non-empty\n        text = reader.pages[0].extract_text() or \"\"\n        assert len(text.strip()) > 20, \"First page appears empty\"\n        print(f\"OK: {path.name} ({len(reader.pages)} pages)\")\n    except AssertionError as exc:\n        raise RuntimeError(f\"Validation failed for {path}: {exc}\") from exc\n    except Exception as exc:\n        raise RuntimeError(f\"Could not read {path}: {exc}\") from 
exc\n\nvalidate_pdf(Path(\"reports\u002Fsales_reportlab.pdf\"))\nvalidate_pdf(Path(\"reports\u002Fsales_weasyprint.pdf\"))\n",[30,62423,62424,62428,62438,62448,62452,62474,62480,62493,62529,62534,62552,62570,62599,62609,62644,62654,62689,62693,62702],{"__ignoreMap":28},[33,62425,62426],{"class":35,"line":36},[33,62427,57316],{"class":39},[33,62429,62430,62432,62434,62436],{"class":35,"line":43},[33,62431,190],{"class":163},[33,62433,193],{"class":167},[33,62435,164],{"class":163},[33,62437,198],{"class":167},[33,62439,62440,62442,62444,62446],{"class":35,"line":61},[33,62441,190],{"class":163},[33,62443,57333],{"class":167},[33,62445,164],{"class":163},[33,62447,57338],{"class":167},[33,62449,62450],{"class":35,"line":73},[33,62451,92],{"emptyLinePlaceholder":91},[33,62453,62454,62456,62459,62462,62464,62466,62468,62470,62472],{"class":35,"line":88},[33,62455,562],{"class":163},[33,62457,62458],{"class":46}," validate_pdf",[33,62460,62461],{"class":167},"(path: Path, min_pages: ",[33,62463,1059],{"class":50},[33,62465,212],{"class":163},[33,62467,1814],{"class":50},[33,62469,1617],{"class":167},[33,62471,571],{"class":50},[33,62473,574],{"class":167},[33,62475,62476,62478],{"class":35,"line":95},[33,62477,2424],{"class":163},[33,62479,574],{"class":167},[33,62481,62482,62485,62487,62489,62491],{"class":35,"line":101},[33,62483,62484],{"class":167},"        reader ",[33,62486,242],{"class":163},[33,62488,57370],{"class":167},[33,62490,1053],{"class":50},[33,62492,21248],{"class":167},[33,62494,62495,62497,62499,62501,62503,62506,62508,62511,62513,62516,62518,62521,62523,62525,62527],{"class":35,"line":171},[33,62496,21485],{"class":163},[33,62498,4037],{"class":50},[33,62500,57383],{"class":167},[33,62502,43000],{"class":163},[33,62504,62505],{"class":167}," min_pages, ",[33,62507,4059],{"class":163},[33,62509,62510],{"class":54},"\"Expected >= 
",[33,62512,1115],{"class":50},[33,62514,62515],{"class":167},"min_pages",[33,62517,1121],{"class":50},[33,62519,62520],{"class":54}," pages, got ",[33,62522,4065],{"class":50},[33,62524,59322],{"class":167},[33,62526,1121],{"class":50},[33,62528,7504],{"class":54},[33,62530,62531],{"class":35,"line":179},[33,62532,62533],{"class":39},"        # Spot-check first page text is non-empty\n",[33,62535,62536,62538,62540,62543,62545,62548,62550],{"class":35,"line":187},[33,62537,47289],{"class":167},[33,62539,242],{"class":163},[33,62541,62542],{"class":167}," reader.pages[",[33,62544,748],{"class":50},[33,62546,62547],{"class":167},"].extract_text() ",[33,62549,7162],{"class":163},[33,62551,13126],{"class":54},[33,62553,62554,62556,62558,62561,62563,62565,62567],{"class":35,"line":201},[33,62555,21485],{"class":163},[33,62557,4037],{"class":50},[33,62559,62560],{"class":167},"(text.strip()) ",[33,62562,6009],{"class":163},[33,62564,43599],{"class":50},[33,62566,365],{"class":167},[33,62568,62569],{"class":54},"\"First page appears empty\"\n",[33,62571,62572,62574,62576,62578,62580,62582,62584,62586,62588,62590,62592,62594,62597],{"class":35,"line":206},[33,62573,9414],{"class":50},[33,62575,602],{"class":167},[33,62577,4059],{"class":163},[33,62579,57480],{"class":54},[33,62581,1115],{"class":50},[33,62583,57398],{"class":167},[33,62585,1121],{"class":50},[33,62587,17583],{"class":54},[33,62589,4065],{"class":50},[33,62591,59322],{"class":167},[33,62593,1121],{"class":50},[33,62595,62596],{"class":54}," 
pages)\"",[33,62598,221],{"class":167},[33,62600,62601,62603,62605,62607],{"class":35,"line":224},[33,62602,2449],{"class":163},[33,62604,9445],{"class":50},[33,62606,1852],{"class":163},[33,62608,1855],{"class":167},[33,62610,62611,62613,62615,62617,62619,62622,62624,62626,62628,62630,62632,62634,62636,62638,62640,62642],{"class":35,"line":229},[33,62612,4051],{"class":163},[33,62614,7590],{"class":50},[33,62616,602],{"class":167},[33,62618,4059],{"class":163},[33,62620,62621],{"class":54},"\"Validation failed for ",[33,62623,1115],{"class":50},[33,62625,2580],{"class":167},[33,62627,1121],{"class":50},[33,62629,2079],{"class":54},[33,62631,1115],{"class":50},[33,62633,6565],{"class":167},[33,62635,1121],{"class":50},[33,62637,274],{"class":54},[33,62639,1649],{"class":167},[33,62641,190],{"class":163},[33,62643,20843],{"class":167},[33,62645,62646,62648,62650,62652],{"class":35,"line":235},[33,62647,2449],{"class":163},[33,62649,783],{"class":50},[33,62651,1852],{"class":163},[33,62653,1855],{"class":167},[33,62655,62656,62658,62660,62662,62664,62667,62669,62671,62673,62675,62677,62679,62681,62683,62685,62687],{"class":35,"line":250},[33,62657,4051],{"class":163},[33,62659,7590],{"class":50},[33,62661,602],{"class":167},[33,62663,4059],{"class":163},[33,62665,62666],{"class":54},"\"Could not read 
",[33,62668,1115],{"class":50},[33,62670,2580],{"class":167},[33,62672,1121],{"class":50},[33,62674,2079],{"class":54},[33,62676,1115],{"class":50},[33,62678,6565],{"class":167},[33,62680,1121],{"class":50},[33,62682,274],{"class":54},[33,62684,1649],{"class":167},[33,62686,190],{"class":163},[33,62688,20843],{"class":167},[33,62690,62691],{"class":35,"line":266},[33,62692,92],{"emptyLinePlaceholder":91},[33,62694,62695,62698,62700],{"class":35,"line":290},[33,62696,62697],{"class":167},"validate_pdf(Path(",[33,62699,60558],{"class":54},[33,62701,371],{"class":167},[33,62703,62704,62706,62708],{"class":35,"line":295},[33,62705,62697],{"class":167},[33,62707,60433],{"class":54},[33,62709,371],{"class":167},[18,62711,62712],{"id":4208},"Performance and scale notes",[4211,62714,62715,62723,62732,62735],{},[4214,62716,62717,62719,62720,62722],{},[1974,62718,20883],{}," builds an in-memory DOM before rendering. At >500 rows per page, memory climbs fast. Chunk data into page-sized batches and use ",[940,62721,52682],{"href":52681}," to combine the pieces.",[4214,62724,62725,62728,62729,62731],{},[1974,62726,62727],{},"ReportLab Platypus"," streams flowables and handles large reports better. 
Still, avoid loading entire DataFrames into ",[30,62730,61996],{}," objects — pre-serialise to plain Python lists first.",[4214,62733,62734],{},"For nightly batch runs, pre-render all Jinja2 templates to HTML strings in parallel (threads are fine — it is CPU-light), then render PDFs sequentially or in a process pool to avoid GIL contention on the rendering step.",[4214,62736,62737,62738,62741],{},"Matplotlib figure creation is not thread-safe; use ",[30,62739,62740],{},"matplotlib.use(\"Agg\")"," and create figures inside worker processes, not threads.",[18,62743,4271],{"id":4270},[4273,62745,62746,62756],{},[4276,62747,62748],{},[4279,62749,62750,62752,62754],{},[4282,62751,14317],{},[4282,62753,4287],{},[4282,62755,4290],{},[4292,62757,62758,62778,62793,62814,62828],{},[4279,62759,62760,62765,62768],{},[4297,62761,62762],{},[30,62763,62764],{},"OSError: no library called \"cairo-2\" was found",[4297,62766,62767],{},"WeasyPrint system dependency missing",[4297,62769,62770,62773,62774,62777],{},[30,62771,62772],{},"sudo apt install libcairo2"," (Linux) or ",[30,62775,62776],{},"brew install cairo"," (macOS)",[4279,62779,62780,62785,62788],{},[4297,62781,62782,62784],{},[30,62783,53869],{}," or garbled boxes in ReportLab",[4297,62786,62787],{},"Default Helvetica core font lacks the glyph",[4297,62789,62790,62791],{},"Register a TrueType font — see ",[940,62792,28608],{"href":28607},[4279,62794,62795,62798,62805],{},[4297,62796,62797],{},"Table rows split mid-cell in WeasyPrint",[4297,62799,62800,62801,42706,62803],{},"Missing ",[30,62802,53894],{},[30,62804,53897],{},[4297,62806,4358,62807,62810,62811,62813],{},[30,62808,62809],{},"tr { page-break-inside: avoid; }"," to the ",[30,62812,54604],{}," CSS block",[4279,62815,62816,62822,62825],{},[4297,62817,62818,62821],{},[30,62819,62820],{},"LayoutError: Flowable ... 
too large"," in ReportLab",[4297,62823,62824],{},"A single flowable taller than the page frame",[4297,62826,62827],{},"Split the flowable or reduce font size; for tables, chunk rows",[4279,62829,62830,62833,62838],{},[4297,62831,62832],{},"Headers\u002Ffooters bleed into content area",[4297,62834,62835,62837],{},[30,62836,54604],{}," margin smaller than header\u002Ffooter height",[4297,62839,46082,62840,1351,62843,62845],{},[30,62841,62842],{},"topMargin",[30,62844,20137],{}," to accommodate the fixed elements",[18,62847,62848],{"id":4401},"Complete working script",[23,62850,62852],{"className":126,"code":62851,"language":47,"meta":28,"style":28},"# pip install reportlab weasyprint jinja2 pandas matplotlib pypdf\n\"\"\"gen_reports.py — generate a sales PDF report from a CSV, with chart.\n\nUsage:\n  python gen_reports.py --data data\u002Fsales.csv --out reports\u002Fsales.pdf --engine reportlab\n  python gen_reports.py --data data\u002Fsales.csv --out reports\u002Fsales.pdf --engine weasyprint\n\"\"\"\nimport argparse\nimport io\nimport sys\nfrom pathlib import Path\n\nimport matplotlib\nmatplotlib.use(\"Agg\")\nimport matplotlib.pyplot as plt\nimport pandas as pd\nfrom jinja2 import Environment, BaseLoader\nfrom weasyprint import HTML\nfrom reportlab.lib.pagesizes import A4\nfrom reportlab.lib.units import mm\nfrom reportlab.lib import colors\nfrom reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, Image\nfrom reportlab.lib.styles import getSampleStyleSheet\nfrom pypdf import PdfReader\n\nWEASYPRINT_TEMPLATE = \"\"\"\u003C!DOCTYPE html>\n\u003Chtml>\u003Chead>\u003Cmeta charset=\"utf-8\">\u003Cstyle>\n@page { size: A4; margin: 20mm 15mm 25mm;\n  @top-center { content: \"{{ period }}\"; font-size: 9pt; color: #475569; }\n  @bottom-right { content: \"Page \" counter(page); font-size: 8pt; color: #475569; } }\nbody { font-family: sans-serif; font-size: 10pt; }\nh1 { font-size: 16pt; }\ntable { width: 100%; border-collapse: collapse; 
}\nth { background: #2563eb; color: #fff; padding: 4px 8px; }\ntd { padding: 4px 8px; border-bottom: 1px solid #e2e8f0; }\ntr { page-break-inside: avoid; }\n.right { text-align: right; }\n\u003C\u002Fstyle>\u003C\u002Fhead>\u003Cbody>\n\u003Ch1>Sales Performance — {{ period }}\u003C\u002Fh1>\n\u003Ctable>\u003Cthead>\u003Ctr>\n  \u003Cth>Customer\u003C\u002Fth>\u003Cth>Region\u003C\u002Fth>\u003Cth class=\"right\">Revenue\u003C\u002Fth>\n  \u003Cth class=\"right\">Costs\u003C\u002Fth>\u003Cth class=\"right\">Profit\u003C\u002Fth>\u003Cth class=\"right\">Margin %\u003C\u002Fth>\n\u003C\u002Ftr>\u003C\u002Fthead>\u003Ctbody>\n{% for r in rows %}\n\u003Ctr>\u003Ctd>{{ r.customer }}\u003C\u002Ftd>\u003Ctd>{{ r.region }}\u003C\u002Ftd>\n  \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.revenue) }}\u003C\u002Ftd>\n  \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.costs) }}\u003C\u002Ftd>\n  \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.profit) }}\u003C\u002Ftd>\n  \u003Ctd class=\"right\">{{ r.margin }}%\u003C\u002Ftd>\u003C\u002Ftr>\n{% endfor %}\n\u003C\u002Ftbody>\u003C\u002Ftable>\u003C\u002Fbody>\u003C\u002Fhtml>\"\"\"\n\n\ndef load_data(path: Path) -> list[dict]:\n    df = pd.read_csv(path, encoding=\"utf-8\")\n    df.columns = df.columns.str.strip().str.lower()\n    df[\"profit\"] = df[\"revenue\"] - df[\"costs\"]\n    df[\"margin\"] = (df[\"profit\"] \u002F df[\"revenue\"] * 100).round(1)\n    return df.to_dict(\"records\")\n\n\ndef make_chart(rows: list[dict]) -> io.BytesIO:\n    labels = [r[\"customer\"] for r in rows]\n    fig, ax = plt.subplots(figsize=(5, 2.5))\n    x = range(len(labels))\n    ax.bar([i - 0.2 for i in x], [r[\"revenue\"] for r in rows], 0.35,\n           label=\"Revenue\", color=\"#2563eb\")\n    ax.bar([i + 0.2 for i in x], [r[\"costs\"] for r in rows], 0.35,\n           label=\"Costs\", color=\"#dbeafe\")\n    ax.set_xticks(list(x)); ax.set_xticklabels(labels, fontsize=8)\n    ax.legend(fontsize=8); 
ax.spines[[\"top\",\"right\"]].set_visible(False)\n    plt.tight_layout()\n    buf = io.BytesIO(); fig.savefig(buf, format=\"png\", dpi=150); plt.close(fig)\n    buf.seek(0); return buf\n\n\ndef render_weasyprint(rows, period, out: Path) -> None:\n    tmpl = Environment(loader=BaseLoader()).from_string(WEASYPRINT_TEMPLATE)\n    HTML(string=tmpl.render(rows=rows, period=period)).write_pdf(str(out))\n\n\ndef render_reportlab(rows, period, out: Path) -> None:\n    styles = getSampleStyleSheet()\n    doc = SimpleDocTemplate(str(out), pagesize=A4,\n                            leftMargin=15*mm, rightMargin=15*mm,\n                            topMargin=20*mm, bottomMargin=20*mm)\n\n    def _hf(canvas, doc):\n        canvas.saveState()\n        canvas.setFont(\"Helvetica\", 8)\n        canvas.setFillColor(colors.HexColor(\"#475569\"))\n        canvas.drawString(15*mm, A4[1]-12*mm, f\"Sales Report — {period}\")\n        canvas.drawRightString(A4[0]-15*mm, 10*mm, f\"Page {doc.page}\")\n        canvas.restoreState()\n\n    header = [[\"Customer\",\"Region\",\"Revenue\",\"Costs\",\"Profit\",\"Margin %\"]]\n    data = [[r[\"customer\"], r[\"region\"], f\"${r['revenue']:,.0f}\", f\"${r['costs']:,.0f}\",\n             f\"${r['profit']:,.0f}\", f\"{r['margin']}%\"] for r in rows]\n    tbl = Table(header+data, colWidths=[50*mm,30*mm,28*mm,28*mm,28*mm,22*mm], repeatRows=1)\n    tbl.setStyle(TableStyle([\n        (\"BACKGROUND\",(0,0),(-1,0),colors.HexColor(\"#2563eb\")),\n        (\"TEXTCOLOR\",(0,0),(-1,0),colors.white),\n        (\"FONTNAME\",(0,0),(-1,0),\"Helvetica-Bold\"),\n        (\"FONTSIZE\",(0,0),(-1,-1),9),\n        (\"GRID\",(0,0),(-1,-1),0.5,colors.HexColor(\"#e2e8f0\")),\n        (\"ALIGN\",(2,0),(-1,-1),\"RIGHT\"),\n    ]))\n    chart_buf = make_chart(rows)\n    story = [Paragraph(f\"Sales Performance — {period}\", styles[\"h1\"]),\n             Spacer(1, 4*mm), Image(chart_buf, width=360, height=180),\n             Spacer(1, 4*mm), tbl]\n    doc.build(story, 
onFirstPage=_hf, onLaterPages=_hf)\n\n\ndef validate(path: Path) -> None:\n    r = PdfReader(str(path))\n    assert len(r.pages) >= 1, \"PDF has no pages\"\n    print(f\"OK: {path.name} ({len(r.pages)} page(s))\")\n\n\ndef main():\n    ap = argparse.ArgumentParser()\n    ap.add_argument(\"--data\", required=True)\n    ap.add_argument(\"--out\", required=True)\n    ap.add_argument(\"--engine\", choices=[\"reportlab\",\"weasyprint\"], default=\"reportlab\")\n    ap.add_argument(\"--period\", default=\"Q3 2026\")\n    args = ap.parse_args()\n\n    data_path = Path(args.data)\n    out_path = Path(args.out)\n    out_path.parent.mkdir(parents=True, exist_ok=True)\n\n    try:\n        rows = load_data(data_path)\n    except Exception as exc:\n        sys.exit(f\"Data load error: {exc}\")\n\n    try:\n        if args.engine == \"weasyprint\":\n            render_weasyprint(rows, args.period, out_path)\n        else:\n            render_reportlab(rows, args.period, out_path)\n        validate(out_path)\n    except Exception as exc:\n        sys.exit(f\"Render error: {exc}\")\n\nif __name__ == \"__main__\":\n    
main()\n",[30,62853,62854,62859,62864,62868,62872,62877,62882,62886,62892,62898,62904,62914,62918,62924,62932,62942,62952,62962,62972,62982,62992,63002,63013,63023,63033,63037,63046,63051,63056,63061,63066,63071,63076,63081,63086,63091,63095,63099,63104,63109,63114,63119,63124,63129,63137,63142,63147,63152,63157,63162,63170,63175,63179,63183,63196,63212,63220,63244,63278,63288,63292,63296,63310,63330,63352,63366,63400,63419,63451,63469,63486,63512,63516,63542,63557,63561,63565,63578,63597,63625,63629,63633,63645,63653,63671,63694,63718,63722,63731,63735,63747,63755,63791,63827,63831,63835,63867,63924,63973,64038,64042,64075,64102,64133,64165,64202,64234,64238,64248,64275,64307,64322,64341,64345,64349,64361,64374,64392,64421,64425,64429,64437,64446,64463,64479,64512,64529,64537,64541,64551,64560,64581,64585,64591,64600,64610,64629,64633,64639,64653,64658,64664,64669,64674,64684,64703,64707,64719],{"__ignoreMap":28},[33,62855,62856],{"class":35,"line":36},[33,62857,62858],{"class":39},"# pip install reportlab weasyprint jinja2 pandas matplotlib pypdf\n",[33,62860,62861],{"class":35,"line":43},[33,62862,62863],{"class":54},"\"\"\"gen_reports.py — generate a sales PDF report from a CSV, with chart.\n",[33,62865,62866],{"class":35,"line":61},[33,62867,92],{"emptyLinePlaceholder":91},[33,62869,62870],{"class":35,"line":73},[33,62871,4435],{"class":54},[33,62873,62874],{"class":35,"line":88},[33,62875,62876],{"class":54},"  python gen_reports.py --data data\u002Fsales.csv --out reports\u002Fsales.pdf --engine reportlab\n",[33,62878,62879],{"class":35,"line":95},[33,62880,62881],{"class":54},"  python gen_reports.py --data data\u002Fsales.csv --out reports\u002Fsales.pdf --engine 
weasyprint\n",[33,62883,62884],{"class":35,"line":101},[33,62885,139],{"class":54},[33,62887,62888,62890],{"class":35,"line":171},[33,62889,164],{"class":163},[33,62891,4461],{"class":167},[33,62893,62894,62896],{"class":35,"line":179},[33,62895,164],{"class":163},[33,62897,60058],{"class":167},[33,62899,62900,62902],{"class":35,"line":187},[33,62901,164],{"class":163},[33,62903,168],{"class":167},[33,62905,62906,62908,62910,62912],{"class":35,"line":201},[33,62907,190],{"class":163},[33,62909,193],{"class":167},[33,62911,164],{"class":163},[33,62913,198],{"class":167},[33,62915,62916],{"class":35,"line":206},[33,62917,92],{"emptyLinePlaceholder":91},[33,62919,62920,62922],{"class":35,"line":224},[33,62921,164],{"class":163},[33,62923,59488],{"class":167},[33,62925,62926,62928,62930],{"class":35,"line":229},[33,62927,61540],{"class":167},[33,62929,61543],{"class":54},[33,62931,221],{"class":167},[33,62933,62934,62936,62938,62940],{"class":35,"line":235},[33,62935,164],{"class":163},[33,62937,61552],{"class":167},[33,62939,495],{"class":163},[33,62941,61557],{"class":167},[33,62943,62944,62946,62948,62950],{"class":35,"line":250},[33,62945,164],{"class":163},[33,62947,492],{"class":167},[33,62949,495],{"class":163},[33,62951,498],{"class":167},[33,62953,62954,62956,62958,62960],{"class":35,"line":266},[33,62955,190],{"class":163},[33,62957,20970],{"class":167},[33,62959,164],{"class":163},[33,62961,54635],{"class":167},[33,62963,62964,62966,62968,62970],{"class":35,"line":290},[33,62965,190],{"class":163},[33,62967,20982],{"class":167},[33,62969,164],{"class":163},[33,62971,20987],{"class":50},[33,62973,62974,62976,62978,62980],{"class":35,"line":295},[33,62975,190],{"class":163},[33,62977,19044],{"class":167},[33,62979,164],{"class":163},[33,62981,19049],{"class":167},[33,62983,62984,62986,62988,62990],{"class":35,"line":300},[33,62985,190],{"class":163},[33,62987,19080],{"class":167},[33,62989,164],{"class":163},[33,62991,55290],{"class":167},[33,62993,62994,62996,
62998,63000],{"class":35,"line":317},[33,62995,190],{"class":163},[33,62997,19056],{"class":167},[33,62999,164],{"class":163},[33,63001,19061],{"class":167},[33,63003,63004,63006,63008,63010],{"class":35,"line":332},[33,63005,190],{"class":163},[33,63007,19092],{"class":167},[33,63009,164],{"class":163},[33,63011,63012],{"class":167}," SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, Image\n",[33,63014,63015,63017,63019,63021],{"class":35,"line":347},[33,63016,190],{"class":163},[33,63018,19068],{"class":167},[33,63020,164],{"class":163},[33,63022,62101],{"class":167},[33,63024,63025,63027,63029,63031],{"class":35,"line":374},[33,63026,190],{"class":163},[33,63028,57333],{"class":167},[33,63030,164],{"class":163},[33,63032,57338],{"class":167},[33,63034,63035],{"class":35,"line":397},[33,63036,92],{"emptyLinePlaceholder":91},[33,63038,63039,63042,63044],{"class":35,"line":653},[33,63040,63041],{"class":50},"WEASYPRINT_TEMPLATE",[33,63043,212],{"class":163},[33,63045,54659],{"class":54},[33,63047,63048],{"class":35,"line":667},[33,63049,63050],{"class":54},"\u003Chtml>\u003Chead>\u003Cmeta charset=\"utf-8\">\u003Cstyle>\n",[33,63052,63053],{"class":35,"line":675},[33,63054,63055],{"class":54},"@page { size: A4; margin: 20mm 15mm 25mm;\n",[33,63057,63058],{"class":35,"line":689},[33,63059,63060],{"class":54},"  @top-center { content: \"{{ period }}\"; font-size: 9pt; color: #475569; }\n",[33,63062,63063],{"class":35,"line":703},[33,63064,63065],{"class":54},"  @bottom-right { content: \"Page \" counter(page); font-size: 8pt; color: #475569; } }\n",[33,63067,63068],{"class":35,"line":714},[33,63069,63070],{"class":54},"body { font-family: sans-serif; font-size: 10pt; }\n",[33,63072,63073],{"class":35,"line":723},[33,63074,63075],{"class":54},"h1 { font-size: 16pt; }\n",[33,63077,63078],{"class":35,"line":754},[33,63079,63080],{"class":54},"table { width: 100%; border-collapse: collapse; 
}\n",[33,63082,63083],{"class":35,"line":771},[33,63084,63085],{"class":54},"th { background: #2563eb; color: #fff; padding: 4px 8px; }\n",[33,63087,63088],{"class":35,"line":777},[33,63089,63090],{"class":54},"td { padding: 4px 8px; border-bottom: 1px solid #e2e8f0; }\n",[33,63092,63093],{"class":35,"line":788},[33,63094,60136],{"class":54},[33,63096,63097],{"class":35,"line":804},[33,63098,60141],{"class":54},[33,63100,63101],{"class":35,"line":809},[33,63102,63103],{"class":54},"\u003C\u002Fstyle>\u003C\u002Fhead>\u003Cbody>\n",[33,63105,63106],{"class":35,"line":819},[33,63107,63108],{"class":54},"\u003Ch1>Sales Performance — {{ period }}\u003C\u002Fh1>\n",[33,63110,63111],{"class":35,"line":829},[33,63112,63113],{"class":54},"\u003Ctable>\u003Cthead>\u003Ctr>\n",[33,63115,63116],{"class":35,"line":834},[33,63117,63118],{"class":54},"  \u003Cth>Customer\u003C\u002Fth>\u003Cth>Region\u003C\u002Fth>\u003Cth class=\"right\">Revenue\u003C\u002Fth>\n",[33,63120,63121],{"class":35,"line":839},[33,63122,63123],{"class":54},"  \u003Cth class=\"right\">Costs\u003C\u002Fth>\u003Cth class=\"right\">Profit\u003C\u002Fth>\u003Cth class=\"right\">Margin %\u003C\u002Fth>\n",[33,63125,63126],{"class":35,"line":860},[33,63127,63128],{"class":54},"\u003C\u002Ftr>\u003C\u002Fthead>\u003Ctbody>\n",[33,63130,63131,63133,63135],{"class":35,"line":887},[33,63132,1115],{"class":54},[33,63134,54870],{"class":50},[33,63136,60204],{"class":54},[33,63138,63139],{"class":35,"line":907},[33,63140,63141],{"class":54},"\u003Ctr>\u003Ctd>{{ r.customer }}\u003C\u002Ftd>\u003Ctd>{{ r.region }}\u003C\u002Ftd>\n",[33,63143,63144],{"class":35,"line":1826},[33,63145,63146],{"class":54},"  \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.revenue) }}\u003C\u002Ftd>\n",[33,63148,63149],{"class":35,"line":1844},[33,63150,63151],{"class":54},"  \u003Ctd class=\"right\">${{ \"{:,.0f}\".format(r.costs) }}\u003C\u002Ftd>\n",[33,63153,63154],{"class":35,"line":1858},[33,63155,63156],{"class":54},"  \u003Ctd 
class=\"right\">${{ \"{:,.0f}\".format(r.profit) }}\u003C\u002Ftd>\n",[33,63158,63159],{"class":35,"line":1871},[33,63160,63161],{"class":54},"  \u003Ctd class=\"right\">{{ r.margin }}%\u003C\u002Ftd>\u003C\u002Ftr>\n",[33,63163,63164,63166,63168],{"class":35,"line":1877},[33,63165,1115],{"class":54},[33,63167,54920],{"class":50},[33,63169,54923],{"class":54},[33,63171,63172],{"class":35,"line":1883},[33,63173,63174],{"class":54},"\u003C\u002Ftbody>\u003C\u002Ftable>\u003C\u002Fbody>\u003C\u002Fhtml>\"\"\"\n",[33,63176,63177],{"class":35,"line":1915},[33,63178,92],{"emptyLinePlaceholder":91},[33,63180,63181],{"class":35,"line":1926},[33,63182,92],{"emptyLinePlaceholder":91},[33,63184,63185,63187,63190,63192,63194],{"class":35,"line":1932},[33,63186,562],{"class":163},[33,63188,63189],{"class":46}," load_data",[33,63191,54082],{"class":167},[33,63193,37100],{"class":50},[33,63195,17477],{"class":167},[33,63197,63198,63200,63202,63204,63206,63208,63210],{"class":35,"line":1938},[33,63199,4025],{"class":167},[33,63201,242],{"class":163},[33,63203,27411],{"class":167},[33,63205,27249],{"class":238},[33,63207,242],{"class":163},[33,63209,1195],{"class":54},[33,63211,221],{"class":167},[33,63213,63214,63216,63218],{"class":35,"line":1950},[33,63215,27546],{"class":167},[33,63217,242],{"class":163},[33,63219,54121],{"class":167},[33,63221,63222,63224,63226,63228,63230,63232,63234,63236,63238,63240,63242],{"class":35,"line":1958},[33,63223,27581],{"class":167},[33,63225,59740],{"class":54},[33,63227,763],{"class":167},[33,63229,242],{"class":163},[33,63231,7935],{"class":167},[33,63233,16465],{"class":54},[33,63235,763],{"class":167},[33,63237,4126],{"class":163},[33,63239,7935],{"class":167},[33,63241,59686],{"class":54},[33,63243,9202],{"class":167},[33,63245,63246,63248,63250,63252,63254,63256,63258,63260,63262,63264,63266,63268,63270,63272,63274,63276],{"class":35,"line":4904},[33,63247,27581],{"class":167},[33,63249,16456],{"class":54},[33,63251,763],{"class":167},[33,
63253,242],{"class":163},[33,63255,59771],{"class":167},[33,63257,59740],{"class":54},[33,63259,763],{"class":167},[33,63261,1351],{"class":163},[33,63263,7935],{"class":167},[33,63265,16465],{"class":54},[33,63267,763],{"class":167},[33,63269,1769],{"class":163},[33,63271,18366],{"class":50},[33,63273,59790],{"class":167},[33,63275,734],{"class":50},[33,63277,221],{"class":167},[33,63279,63280,63282,63284,63286],{"class":35,"line":4909},[33,63281,1332],{"class":163},[33,63283,54131],{"class":167},[33,63285,21222],{"class":54},[33,63287,221],{"class":167},[33,63289,63290],{"class":35,"line":4915},[33,63291,92],{"emptyLinePlaceholder":91},[33,63293,63294],{"class":35,"line":4925},[33,63295,92],{"emptyLinePlaceholder":91},[33,63297,63298,63300,63303,63305,63307],{"class":35,"line":4935},[33,63299,562],{"class":163},[33,63301,63302],{"class":46}," make_chart",[33,63304,60276],{"class":167},[33,63306,37100],{"class":50},[33,63308,63309],{"class":167},"]) -> io.BytesIO:\n",[33,63311,63312,63314,63316,63318,63320,63322,63324,63326,63328],{"class":35,"line":4941},[33,63313,61614],{"class":167},[33,63315,242],{"class":163},[33,63317,61619],{"class":167},[33,63319,59673],{"class":54},[33,63321,763],{"class":167},[33,63323,6124],{"class":163},[33,63325,45721],{"class":167},[33,63327,662],{"class":163},[33,63329,47749],{"class":167},[33,63331,63332,63334,63336,63338,63340,63342,63344,63346,63348,63350],{"class":35,"line":4950},[33,63333,61682],{"class":167},[33,63335,242],{"class":163},[33,63337,61687],{"class":167},[33,63339,61690],{"class":238},[33,63341,242],{"class":163},[33,63343,602],{"class":167},[33,63345,1153],{"class":50},[33,63347,365],{"class":167},[33,63349,19760],{"class":50},[33,63351,371],{"class":167},[33,63353,63354,63356,63358,63360,63362,63364],{"class":35,"line":4960},[33,63355,61714],{"class":167},[33,63357,242],{"class":163},[33,63359,1801],{"class":50},[33,63361,602],{"class":167},[33,63363,928],{"class":50},[33,63365,61725],{"class":167},[33,63367,6336
8,63370,63372,63374,63376,63378,63380,63383,63385,63387,63389,63391,63393,63396,63398],{"class":35,"line":4965},[33,63369,61730],{"class":167},[33,63371,4126],{"class":163},[33,63373,46243],{"class":50},[33,63375,14766],{"class":163},[33,63377,47269],{"class":167},[33,63379,662],{"class":163},[33,63381,63382],{"class":167}," x], [r[",[33,63384,16465],{"class":54},[33,63386,763],{"class":167},[33,63388,6124],{"class":163},[33,63390,45721],{"class":167},[33,63392,662],{"class":163},[33,63394,63395],{"class":167}," rows], ",[33,63397,61750],{"class":50},[33,63399,247],{"class":167},[33,63401,63402,63405,63407,63409,63411,63413,63415,63417],{"class":35,"line":4971},[33,63403,63404],{"class":238},"           label",[33,63406,242],{"class":163},[33,63408,12925],{"class":54},[33,63410,365],{"class":167},[33,63412,17245],{"class":238},[33,63414,242],{"class":163},[33,63416,55362],{"class":54},[33,63418,221],{"class":167},[33,63420,63421,63423,63425,63427,63429,63431,63433,63435,63437,63439,63441,63443,63445,63447,63449],{"class":35,"line":4983},[33,63422,61730],{"class":167},[33,63424,1811],{"class":163},[33,63426,46243],{"class":50},[33,63428,14766],{"class":163},[33,63430,47269],{"class":167},[33,63432,662],{"class":163},[33,63434,63382],{"class":167},[33,63436,59686],{"class":54},[33,63438,763],{"class":167},[33,63440,6124],{"class":163},[33,63442,45721],{"class":167},[33,63444,662],{"class":163},[33,63446,63395],{"class":167},[33,63448,61750],{"class":50},[33,63450,247],{"class":167},[33,63452,63453,63455,63457,63459,63461,63463,63465,63467],{"class":35,"line":4988},[33,63454,63404],{"class":238},[33,63456,242],{"class":163},[33,63458,60861],{"class":54},[33,63460,365],{"class":167},[33,63462,17245],{"class":238},[33,63464,242],{"class":163},[33,63466,61809],{"class":54},[33,63468,221],{"class":167},[33,63470,63471,63473,63475,63478,63480,63482,63484],{"class":35,"line":4993},[33,63472,61816],{"class":167},[33,63474,25066],{"class":50},[33,63476,63477],{"class":167},"(x
)); ax.set_xticklabels(labels, ",[33,63479,61829],{"class":238},[33,63481,242],{"class":163},[33,63483,2591],{"class":50},[33,63485,221],{"class":167},[33,63487,63488,63490,63492,63494,63496,63499,63501,63504,63506,63508,63510],{"class":35,"line":5003},[33,63489,61873],{"class":167},[33,63491,61829],{"class":238},[33,63493,242],{"class":163},[33,63495,2591],{"class":50},[33,63497,63498],{"class":167},"); ax.spines[[",[33,63500,43412],{"class":54},[33,63502,63503],{"class":167},",",[33,63505,60626],{"class":54},[33,63507,61895],{"class":167},[33,63509,902],{"class":50},[33,63511,221],{"class":167},[33,63513,63514],{"class":35,"line":5008},[33,63515,61904],{"class":167},[33,63517,63518,63520,63522,63525,63527,63529,63531,63533,63535,63537,63539],{"class":35,"line":5014},[33,63519,61913],{"class":167},[33,63521,242],{"class":163},[33,63523,63524],{"class":167}," io.BytesIO(); fig.savefig(buf, ",[33,63526,61926],{"class":238},[33,63528,242],{"class":163},[33,63530,61931],{"class":54},[33,63532,365],{"class":167},[33,63534,46966],{"class":238},[33,63536,242],{"class":163},[33,63538,2635],{"class":50},[33,63540,63541],{"class":167},"); plt.close(fig)\n",[33,63543,63544,63546,63548,63551,63554],{"class":35,"line":5019},[33,63545,61951],{"class":167},[33,63547,748],{"class":50},[33,63549,63550],{"class":167},"); ",[33,63552,63553],{"class":163},"return",[33,63555,63556],{"class":167}," buf\n",[33,63558,63559],{"class":35,"line":5032},[33,63560,92],{"emptyLinePlaceholder":91},[33,63562,63563],{"class":35,"line":5039},[33,63564,92],{"emptyLinePlaceholder":91},[33,63566,63567,63569,63571,63574,63576],{"class":35,"line":5068},[33,63568,562],{"class":163},[33,63570,60273],{"class":46},[33,63572,63573],{"class":167},"(rows, period, out: Path) -> 
",[33,63575,571],{"class":50},[33,63577,574],{"class":167},[33,63579,63580,63582,63584,63586,63588,63590,63593,63595],{"class":35,"line":5077},[33,63581,55087],{"class":167},[33,63583,242],{"class":163},[33,63585,21111],{"class":167},[33,63587,21114],{"class":238},[33,63589,242],{"class":163},[33,63591,63592],{"class":167},"BaseLoader()).from_string(",[33,63594,63041],{"class":50},[33,63596,221],{"class":167},[33,63598,63599,63601,63603,63605,63608,63610,63612,63614,63616,63618,63621,63623],{"class":35,"line":5082},[33,63600,62360],{"class":167},[33,63602,21238],{"class":238},[33,63604,242],{"class":163},[33,63606,63607],{"class":167},"tmpl.render(",[33,63609,18629],{"class":238},[33,63611,242],{"class":163},[33,63613,60331],{"class":167},[33,63615,60334],{"class":238},[33,63617,242],{"class":163},[33,63619,63620],{"class":167},"period)).write_pdf(",[33,63622,1053],{"class":50},[33,63624,55133],{"class":167},[33,63626,63627],{"class":35,"line":5089},[33,63628,92],{"emptyLinePlaceholder":91},[33,63630,63631],{"class":35,"line":5098},[33,63632,92],{"emptyLinePlaceholder":91},[33,63634,63635,63637,63639,63641,63643],{"class":35,"line":5105},[33,63636,562],{"class":163},[33,63638,60571],{"class":46},[33,63640,63573],{"class":167},[33,63642,571],{"class":50},[33,63644,574],{"class":167},[33,63646,63647,63649,63651],{"class":35,"line":5110},[33,63648,19255],{"class":167},[33,63650,242],{"class":163},[33,63652,19260],{"class":167},[33,63654,63655,63657,63659,63661,63663,63665,63667,63669],{"class":35,"line":5115},[33,63656,18224],{"class":167},[33,63658,242],{"class":163},[33,63660,58648],{"class":167},[33,63662,1053],{"class":50},[33,63664,55530],{"class":167},[33,63666,20091],{"class":238},[33,63668,242],{"class":163},[33,63670,20096],{"class":167},[33,63672,63673,63676,63678,63680,63682,63684,63686,63688,63690,63692],{"class":35,"line":5128},[33,63674,63675],{"class":238},"                            
leftMargin",[33,63677,242],{"class":163},[33,63679,1646],{"class":50},[33,63681,1769],{"class":163},[33,63683,55550],{"class":167},[33,63685,20112],{"class":238},[33,63687,242],{"class":163},[33,63689,1646],{"class":50},[33,63691,1769],{"class":163},[33,63693,55561],{"class":167},[33,63695,63696,63699,63701,63703,63705,63707,63709,63711,63713,63715],{"class":35,"line":5135},[33,63697,63698],{"class":238},"                            topMargin",[33,63700,242],{"class":163},[33,63702,2587],{"class":50},[33,63704,1769],{"class":163},[33,63706,55550],{"class":167},[33,63708,20137],{"class":238},[33,63710,242],{"class":163},[33,63712,2587],{"class":50},[33,63714,1769],{"class":163},[33,63716,63717],{"class":167},"mm)\n",[33,63719,63720],{"class":35,"line":5142},[33,63721,92],{"emptyLinePlaceholder":91},[33,63723,63724,63726,63729],{"class":35,"line":5151},[33,63725,1742],{"class":163},[33,63727,63728],{"class":46}," _hf",[33,63730,55602],{"class":167},[33,63732,63733],{"class":35,"line":5156},[33,63734,55607],{"class":167},[33,63736,63737,63739,63741,63743,63745],{"class":35,"line":5161},[33,63738,55612],{"class":167},[33,63740,28546],{"class":54},[33,63742,365],{"class":167},[33,63744,2591],{"class":50},[33,63746,221],{"class":167},[33,63748,63749,63751,63753],{"class":35,"line":5167},[33,63750,60751],{"class":167},[33,63752,55376],{"class":54},[33,63754,371],{"class":167},[33,63756,63757,63759,63761,63763,63765,63767,63769,63771,63773,63775,63777,63779,63781,63783,63785,63787,63789],{"class":35,"line":5172},[33,63758,55634],{"class":167},[33,63760,1646],{"class":50},[33,63762,1769],{"class":163},[33,63764,55641],{"class":167},[33,63766,734],{"class":50},[33,63768,9546],{"class":167},[33,63770,4126],{"class":163},[33,63772,55650],{"class":50},[33,63774,1769],{"class":163},[33,63776,55550],{"class":167},[33,63778,4059],{"class":163},[33,63780,60783],{"class":54},[33,63782,1115],{"class":50},[33,63784,60334],{"class":167},[33,63786,1121],{"class":50},[33,63788,274],{"clas
s":54},[33,63790,221],{"class":167},[33,63792,63793,63795,63797,63799,63801,63803,63805,63807,63809,63811,63813,63815,63817,63819,63821,63823,63825],{"class":35,"line":5182},[33,63794,55664],{"class":167},[33,63796,748],{"class":50},[33,63798,9546],{"class":167},[33,63800,4126],{"class":163},[33,63802,1646],{"class":50},[33,63804,1769],{"class":163},[33,63806,55550],{"class":167},[33,63808,3545],{"class":50},[33,63810,1769],{"class":163},[33,63812,55550],{"class":167},[33,63814,4059],{"class":163},[33,63816,55719],{"class":54},[33,63818,1115],{"class":50},[33,63820,55724],{"class":167},[33,63822,1121],{"class":50},[33,63824,274],{"class":54},[33,63826,221],{"class":167},[33,63828,63829],{"class":35,"line":5195},[33,63830,55735],{"class":167},[33,63832,63833],{"class":35,"line":5200},[33,63834,92],{"emptyLinePlaceholder":91},[33,63836,63837,63839,63841,63843,63845,63847,63849,63851,63853,63855,63857,63859,63861,63863,63865],{"class":35,"line":5205},[33,63838,13245],{"class":167},[33,63840,242],{"class":163},[33,63842,20349],{"class":167},[33,63844,60848],{"class":54},[33,63846,63503],{"class":167},[33,63848,11865],{"class":54},[33,63850,63503],{"class":167},[33,63852,12925],{"class":54},[33,63854,63503],{"class":167},[33,63856,60861],{"class":54},[33,63858,63503],{"class":167},[33,63860,60866],{"class":54},[33,63862,63503],{"class":167},[33,63864,60871],{"class":54},[33,63866,44162],{"class":167},[33,63868,63869,63871,63873,63876,63878,63880,63882,63884,63886,63888,63890,63892,63894,63896,63898,63900,63902,63904,63906,63908,63910,63912,63914,63916,63918,63920,63922],{"class":35,"line":5210},[33,63870,24507],{"class":167},[33,63872,242],{"class":163},[33,63874,63875],{"class":167}," 
[[r[",[33,63877,59673],{"class":54},[33,63879,60892],{"class":167},[33,63881,16649],{"class":54},[33,63883,8314],{"class":167},[33,63885,4059],{"class":163},[33,63887,18820],{"class":54},[33,63889,1115],{"class":50},[33,63891,60907],{"class":167},[33,63893,18828],{"class":54},[33,63895,9546],{"class":167},[33,63897,18410],{"class":163},[33,63899,1121],{"class":50},[33,63901,274],{"class":54},[33,63903,365],{"class":167},[33,63905,4059],{"class":163},[33,63907,18820],{"class":54},[33,63909,1115],{"class":50},[33,63911,60907],{"class":167},[33,63913,60930],{"class":54},[33,63915,9546],{"class":167},[33,63917,18410],{"class":163},[33,63919,1121],{"class":50},[33,63921,274],{"class":54},[33,63923,247],{"class":167},[33,63925,63926,63929,63931,63933,63935,63937,63939,63941,63943,63945,63947,63949,63951,63953,63955,63957,63959,63961,63963,63965,63967,63969,63971],{"class":35,"line":5215},[33,63927,63928],{"class":163},"             f",[33,63930,18820],{"class":54},[33,63932,1115],{"class":50},[33,63934,60907],{"class":167},[33,63936,60953],{"class":54},[33,63938,9546],{"class":167},[33,63940,18410],{"class":163},[33,63942,1121],{"class":50},[33,63944,274],{"class":54},[33,63946,365],{"class":167},[33,63948,4059],{"class":163},[33,63950,274],{"class":54},[33,63952,1115],{"class":50},[33,63954,60907],{"class":167},[33,63956,18857],{"class":54},[33,63958,9546],{"class":167},[33,63960,1121],{"class":50},[33,63962,38058],{"class":54},[33,63964,763],{"class":167},[33,63966,6124],{"class":163},[33,63968,45721],{"class":167},[33,63970,662],{"class":163},[33,63972,47749],{"class":167},[33,63974,63975,63977,63979,63982,63984,63987,63989,63991,63993,63995,63997,64000,64002,64004,64006,64008,64010,64012,64014,64016,64018,64020,64022,64024,64026,64028,64030,64032,64034,64036],{"class":35,"line":5220},[33,63976,14864],{"class":167},[33,63978,242],{"class":163},[33,63980,63981],{"class":167}," Table(header",[33,63983,1811],{"class":163},[33,63985,63986],{"class":167},"data, 
",[33,63988,19795],{"class":238},[33,63990,242],{"class":163},[33,63992,8309],{"class":167},[33,63994,2680],{"class":50},[33,63996,1769],{"class":163},[33,63998,63999],{"class":167},"mm,",[33,64001,1543],{"class":50},[33,64003,1769],{"class":163},[33,64005,63999],{"class":167},[33,64007,11104],{"class":50},[33,64009,1769],{"class":163},[33,64011,63999],{"class":167},[33,64013,11104],{"class":50},[33,64015,1769],{"class":163},[33,64017,63999],{"class":167},[33,64019,11104],{"class":50},[33,64021,1769],{"class":163},[33,64023,63999],{"class":167},[33,64025,11103],{"class":50},[33,64027,1769],{"class":163},[33,64029,56324],{"class":167},[33,64031,19803],{"class":238},[33,64033,242],{"class":163},[33,64035,734],{"class":50},[33,64037,221],{"class":167},[33,64039,64040],{"class":35,"line":5227},[33,64041,19814],{"class":167},[33,64043,64044,64046,64048,64051,64053,64055,64057,64060,64062,64064,64066,64068,64071,64073],{"class":35,"line":5232},[33,64045,19819],{"class":167},[33,64047,19822],{"class":54},[33,64049,64050],{"class":167},",(",[33,64052,748],{"class":50},[33,64054,63503],{"class":167},[33,64056,748],{"class":50},[33,64058,64059],{"class":167},"),(",[33,64061,4126],{"class":163},[33,64063,734],{"class":50},[33,64065,63503],{"class":167},[33,64067,748],{"class":50},[33,64069,64070],{"class":167},"),colors.HexColor(",[33,64072,55362],{"class":54},[33,64074,1571],{"class":167},[33,64076,64077,64079,64081,64083,64085,64087,64089,64091,64093,64095,64097,64099],{"class":35,"line":5237},[33,64078,19819],{"class":167},[33,64080,19855],{"class":54},[33,64082,64050],{"class":167},[33,64084,748],{"class":50},[33,64086,63503],{"class":167},[33,64088,748],{"class":50},[33,64090,64059],{"class":167},[33,64092,4126],{"class":163},[33,64094,734],{"class":50},[33,64096,63503],{"class":167},[33,64098,748],{"class":50},[33,64100,64101],{"class":167},"),colors.white),\n",[33,64103,64104,64106,64108,64110,64112,64114,64116,64118,64120,64122,64124,64126,64129,64131],{"class":35,"lin
e":5251},[33,64105,19819],{"class":167},[33,64107,19884],{"class":54},[33,64109,64050],{"class":167},[33,64111,748],{"class":50},[33,64113,63503],{"class":167},[33,64115,748],{"class":50},[33,64117,64059],{"class":167},[33,64119,4126],{"class":163},[33,64121,734],{"class":50},[33,64123,63503],{"class":167},[33,64125,748],{"class":50},[33,64127,64128],{"class":167},"),",[33,64130,19908],{"class":54},[33,64132,1506],{"class":167},[33,64134,64135,64137,64139,64141,64143,64145,64147,64149,64151,64153,64155,64157,64159,64161,64163],{"class":35,"line":5259},[33,64136,19819],{"class":167},[33,64138,19917],{"class":54},[33,64140,64050],{"class":167},[33,64142,748],{"class":50},[33,64144,63503],{"class":167},[33,64146,748],{"class":50},[33,64148,64059],{"class":167},[33,64150,4126],{"class":163},[33,64152,734],{"class":50},[33,64154,63503],{"class":167},[33,64156,4126],{"class":163},[33,64158,734],{"class":50},[33,64160,64128],{"class":167},[33,64162,2577],{"class":50},[33,64164,1506],{"class":167},[33,64166,64167,64169,64171,64173,64175,64177,64179,64181,64183,64185,64187,64189,64191,64193,64195,64198,64200],{"class":35,"line":5264},[33,64168,19819],{"class":167},[33,64170,19985],{"class":54},[33,64172,64050],{"class":167},[33,64174,748],{"class":50},[33,64176,63503],{"class":167},[33,64178,748],{"class":50},[33,64180,64059],{"class":167},[33,64182,4126],{"class":163},[33,64184,734],{"class":50},[33,64186,63503],{"class":167},[33,64188,4126],{"class":163},[33,64190,734],{"class":50},[33,64192,64128],{"class":167},[33,64194,20011],{"class":50},[33,64196,64197],{"class":167},",colors.HexColor(",[33,64199,55389],{"class":54},[33,64201,1571],{"class":167},[33,64203,64204,64206,64208,64210,64212,64214,64216,64218,64220,64222,64224,64226,64228,64230,64232],{"class":35,"line":5269},[33,64205,19819],{"class":167},[33,64207,20024],{"class":54},[33,64209,64050],{"class":167},[33,64211,1533],{"class":50},[33,64213,63503],{"class":167},[33,64215,748],{"class":50},[33,64217,64059],{"cla
ss":167},[33,64219,4126],{"class":163},[33,64221,734],{"class":50},[33,64223,63503],{"class":167},[33,64225,4126],{"class":163},[33,64227,734],{"class":50},[33,64229,64128],{"class":167},[33,64231,20050],{"class":54},[33,64233,1506],{"class":167},[33,64235,64236],{"class":35,"line":5283},[33,64237,20057],{"class":167},[33,64239,64240,64243,64245],{"class":35,"line":5293},[33,64241,64242],{"class":167},"    chart_buf ",[33,64244,242],{"class":163},[33,64246,64247],{"class":167}," make_chart(rows)\n",[33,64249,64250,64252,64254,64257,64259,64261,64263,64265,64267,64269,64271,64273],{"class":35,"line":5303},[33,64251,19444],{"class":167},[33,64253,242],{"class":163},[33,64255,64256],{"class":167}," [Paragraph(",[33,64258,4059],{"class":163},[33,64260,61362],{"class":54},[33,64262,1115],{"class":50},[33,64264,60334],{"class":167},[33,64266,1121],{"class":50},[33,64268,274],{"class":54},[33,64270,56522],{"class":167},[33,64272,56525],{"class":54},[33,64274,12871],{"class":167},[33,64276,64277,64280,64282,64284,64286,64288,64291,64293,64295,64297,64299,64301,64303,64305],{"class":35,"line":5313},[33,64278,64279],{"class":167},"             Spacer(",[33,64281,734],{"class":50},[33,64283,365],{"class":167},[33,64285,1503],{"class":50},[33,64287,1769],{"class":163},[33,64289,64290],{"class":167},"mm), Image(chart_buf, ",[33,64292,56684],{"class":238},[33,64294,242],{"class":163},[33,64296,59952],{"class":50},[33,64298,365],{"class":167},[33,64300,61972],{"class":238},[33,64302,242],{"class":163},[33,64304,58337],{"class":50},[33,64306,1506],{"class":167},[33,64308,64309,64311,64313,64315,64317,64319],{"class":35,"line":5320},[33,64310,64279],{"class":167},[33,64312,734],{"class":50},[33,64314,365],{"class":167},[33,64316,1503],{"class":50},[33,64318,1769],{"class":163},[33,64320,64321],{"class":167},"mm), tbl]\n",[33,64323,64324,64327,64329,64331,64334,64336,64338],{"class":35,"line":5325},[33,64325,64326],{"class":167},"    doc.build(story, 
",[33,64328,56753],{"class":238},[33,64330,242],{"class":163},[33,64332,64333],{"class":167},"_hf, ",[33,64335,56761],{"class":238},[33,64337,242],{"class":163},[33,64339,64340],{"class":167},"_hf)\n",[33,64342,64343],{"class":35,"line":5330},[33,64344,92],{"emptyLinePlaceholder":91},[33,64346,64347],{"class":35,"line":5344},[33,64348,92],{"emptyLinePlaceholder":91},[33,64350,64351,64353,64355,64357,64359],{"class":35,"line":5349},[33,64352,562],{"class":163},[33,64354,25052],{"class":46},[33,64356,3743],{"class":167},[33,64358,571],{"class":50},[33,64360,574],{"class":167},[33,64362,64363,64366,64368,64370,64372],{"class":35,"line":5354},[33,64364,64365],{"class":167},"    r ",[33,64367,242],{"class":163},[33,64369,57370],{"class":167},[33,64371,1053],{"class":50},[33,64373,21248],{"class":167},[33,64375,64376,64378,64380,64383,64385,64387,64389],{"class":35,"line":5368},[33,64377,9228],{"class":163},[33,64379,4037],{"class":50},[33,64381,64382],{"class":167},"(r.pages) ",[33,64384,43000],{"class":163},[33,64386,1814],{"class":50},[33,64388,365],{"class":167},[33,64390,64391],{"class":54},"\"PDF has no 
pages\"\n",[33,64393,64394,64396,64398,64400,64402,64404,64406,64408,64410,64412,64415,64417,64419],{"class":35,"line":5377},[33,64395,7268],{"class":50},[33,64397,602],{"class":167},[33,64399,4059],{"class":163},[33,64401,57480],{"class":54},[33,64403,1115],{"class":50},[33,64405,57398],{"class":167},[33,64407,1121],{"class":50},[33,64409,17583],{"class":54},[33,64411,4065],{"class":50},[33,64413,64414],{"class":167},"(r.pages)",[33,64416,1121],{"class":50},[33,64418,59327],{"class":54},[33,64420,221],{"class":167},[33,64422,64423],{"class":35,"line":5382},[33,64424,92],{"emptyLinePlaceholder":91},[33,64426,64427],{"class":35,"line":5389},[33,64428,92],{"emptyLinePlaceholder":91},[33,64430,64431,64433,64435],{"class":35,"line":5399},[33,64432,562],{"class":163},[33,64434,6636],{"class":46},[33,64436,25419],{"class":167},[33,64438,64439,64441,64443],{"class":35,"line":5404},[33,64440,15498],{"class":167},[33,64442,242],{"class":163},[33,64444,64445],{"class":167}," argparse.ArgumentParser()\n",[33,64447,64448,64450,64453,64455,64457,64459,64461],{"class":35,"line":5409},[33,64449,15516],{"class":167},[33,64451,64452],{"class":54},"\"--data\"",[33,64454,365],{"class":167},[33,64456,25448],{"class":238},[33,64458,242],{"class":163},[33,64460,855],{"class":50},[33,64462,221],{"class":167},[33,64464,64465,64467,64469,64471,64473,64475,64477],{"class":35,"line":5414},[33,64466,15516],{"class":167},[33,64468,41152],{"class":54},[33,64470,365],{"class":167},[33,64472,25448],{"class":238},[33,64474,242],{"class":163},[33,64476,855],{"class":50},[33,64478,221],{"class":167},[33,64480,64481,64483,64486,64488,64490,64492,64494,64497,64499,64502,64504,64506,64508,64510],{"class":35,"line":5419},[33,64482,15516],{"class":167},[33,64484,64485],{"class":54},"\"--engine\"",[33,64487,365],{"class":167},[33,64489,15558],{"class":238},[33,64491,242],{"class":163},[33,64493,8309],{"class":167},[33,64495,64496],{"class":54},"\"reportlab\"",[33,64498,63503],{"class":167},[33,64500,64501]
,{"class":54},"\"weasyprint\"",[33,64503,8314],{"class":167},[33,64505,6685],{"class":238},[33,64507,242],{"class":163},[33,64509,64496],{"class":54},[33,64511,221],{"class":167},[33,64513,64514,64516,64519,64521,64523,64525,64527],{"class":35,"line":5425},[33,64515,15516],{"class":167},[33,64517,64518],{"class":54},"\"--period\"",[33,64520,365],{"class":167},[33,64522,6685],{"class":238},[33,64524,242],{"class":163},[33,64526,60428],{"class":54},[33,64528,221],{"class":167},[33,64530,64531,64533,64535],{"class":35,"line":5430},[33,64532,6766],{"class":167},[33,64534,242],{"class":163},[33,64536,15655],{"class":167},[33,64538,64539],{"class":35,"line":5440},[33,64540,92],{"emptyLinePlaceholder":91},[33,64542,64543,64546,64548],{"class":35,"line":5451},[33,64544,64545],{"class":167},"    data_path ",[33,64547,242],{"class":163},[33,64549,64550],{"class":167}," Path(args.data)\n",[33,64552,64553,64555,64557],{"class":35,"line":5464},[33,64554,6388],{"class":167},[33,64556,242],{"class":163},[33,64558,64559],{"class":167}," Path(args.out)\n",[33,64561,64562,64565,64567,64569,64571,64573,64575,64577,64579],{"class":35,"line":5497},[33,64563,64564],{"class":167},"    out_path.parent.mkdir(",[33,64566,869],{"class":238},[33,64568,242],{"class":163},[33,64570,855],{"class":50},[33,64572,365],{"class":167},[33,64574,878],{"class":238},[33,64576,242],{"class":163},[33,64578,855],{"class":50},[33,64580,221],{"class":167},[33,64582,64583],{"class":35,"line":5514},[33,64584,92],{"emptyLinePlaceholder":91},[33,64586,64587,64589],{"class":35,"line":5527},[33,64588,2424],{"class":163},[33,64590,574],{"class":167},[33,64592,64593,64595,64597],{"class":35,"line":5532},[33,64594,8549],{"class":167},[33,64596,242],{"class":163},[33,64598,64599],{"class":167}," 
load_data(data_path)\n",[33,64601,64602,64604,64606,64608],{"class":35,"line":5537},[33,64603,2449],{"class":163},[33,64605,783],{"class":50},[33,64607,1852],{"class":163},[33,64609,1855],{"class":167},[33,64611,64612,64614,64616,64619,64621,64623,64625,64627],{"class":35,"line":5543},[33,64613,2995],{"class":167},[33,64615,4059],{"class":163},[33,64617,64618],{"class":54},"\"Data load error: ",[33,64620,1115],{"class":50},[33,64622,6565],{"class":167},[33,64624,1121],{"class":50},[33,64626,274],{"class":54},[33,64628,221],{"class":167},[33,64630,64631],{"class":35,"line":5548},[33,64632,92],{"emptyLinePlaceholder":91},[33,64634,64635,64637],{"class":35,"line":5570},[33,64636,2424],{"class":163},[33,64638,574],{"class":167},[33,64640,64641,64643,64646,64648,64651],{"class":35,"line":5577},[33,64642,8221],{"class":163},[33,64644,64645],{"class":167}," args.engine ",[33,64647,1865],{"class":163},[33,64649,64650],{"class":54}," \"weasyprint\"",[33,64652,574],{"class":167},[33,64654,64655],{"class":35,"line":5584},[33,64656,64657],{"class":167},"            render_weasyprint(rows, args.period, out_path)\n",[33,64659,64660,64662],{"class":35,"line":5591},[33,64661,41290],{"class":163},[33,64663,574],{"class":167},[33,64665,64666],{"class":35,"line":5602},[33,64667,64668],{"class":167},"            render_reportlab(rows, args.period, out_path)\n",[33,64670,64671],{"class":35,"line":5607},[33,64672,64673],{"class":167},"        validate(out_path)\n",[33,64675,64676,64678,64680,64682],{"class":35,"line":5623},[33,64677,2449],{"class":163},[33,64679,783],{"class":50},[33,64681,1852],{"class":163},[33,64683,1855],{"class":167},[33,64685,64686,64688,64690,64693,64695,64697,64699,64701],{"class":35,"line":5630},[33,64687,2995],{"class":167},[33,64689,4059],{"class":163},[33,64691,64692],{"class":54},"\"Render error: 
",[33,64694,1115],{"class":50},[33,64696,6565],{"class":167},[33,64698,1121],{"class":50},[33,64700,274],{"class":54},[33,64702,221],{"class":167},[33,64704,64705],{"class":35,"line":5640},[33,64706,92],{"emptyLinePlaceholder":91},[33,64708,64709,64711,64713,64715,64717],{"class":35,"line":5654},[33,64710,2491],{"class":163},[33,64712,2494],{"class":50},[33,64714,2497],{"class":163},[33,64716,2500],{"class":54},[33,64718,574],{"class":167},[33,64720,64721],{"class":35,"line":5667},[33,64722,6914],{"class":167},[18,64724,64726],{"id":64725},"pages-in-this-section","Pages in this section",[4211,64728,64729,64734],{},[4214,64730,64731,64733],{},[940,64732,53852],{"href":57625}," — per-customer invoices with line-item loops, tax calculation, and totals",[4214,64735,64736,64738,64739,42238,64741,365,64743,57545],{},[940,64737,28608],{"href":28607}," — garbled boxes or ",[30,64740,53869],{},[30,64742,53873],{},[30,64744,57544],{},[18,64746,6918],{"id":6917},[4211,64748,64749,64754,64759,64764],{},[4214,64750,64751,64753],{},[940,64752,6943],{"href":6942}," — parent guide covering the full PDF automation stack",[4214,64755,64756,64758],{},[940,64757,52682],{"href":52681}," — assemble per-customer PDFs into one batch deliverable",[4214,64760,64761,64763],{},[940,64762,6936],{"href":6935}," — same report logic when the output is an Excel workbook instead of a PDF",[4214,64765,64766,64768],{},[940,64767,9599],{"href":9598}," — prepare the data layer before feeding it to any renderer",[14,64770,6947,64771,3035],{},[940,64772,6943],{"href":6942},[6953,64774,64775],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: 
var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":64777},[64778,64779,64780,64781,64782,64783,64784,64789,64790,64791,64792,64793,64794],{"id":20,"depth":43,"text":21},{"id":59563,"depth":43,"text":59564},{"id":59895,"depth":43,"text":59896},{"id":59996,"depth":43,"text":59997},{"id":60438,"depth":43,"text":60439},{"id":61497,"depth":43,"text":61498},{"id":2708,"depth":43,"text":61980,"children":64785},[64786,64787,64788],{"id":61983,"depth":61,"text":61984},{"id":62069,"depth":61,"text":62070},{"id":62270,"depth":61,"text":62271},{"id":52029,"depth":43,"text":52030},{"id":4208,"depth":43,"text":62712},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":62848},{"id":64725,"depth":43,"text":64726},{"id":6917,"depth":43,"text":6918},"Dynamic PDF Reports","Build data-driven PDF reports in Python with ReportLab and Jinja2+WeasyPrint. 
Covers data binding, headers\u002Ffooters, pagination, charts, and batch delivery.",{},"\u002Fautomating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically",{"title":26191,"description":64796},"Generating PDF Reports Dynamically with Python","automating-pdf-extraction-generation\u002Fgenerating-pdf-reports-dynamically\u002Findex",[47,9631,26232,57587,64803],"jinja2","66m2fp-qEhvpNBsgBInp31ORk4hdzuoKLdqTu6JBQII",{"id":64806,"title":6943,"body":64807,"breadcrumbTitle":68007,"canonical":6977,"date":46387,"description":68008,"draft":6980,"extension":6981,"image":6977,"meta":68009,"navigation":91,"path":68010,"robots":6977,"seo":68011,"seoTitle":68012,"stem":68013,"tags":68014,"updatedAt":6978,"__hash__":68017},"content\u002Fautomating-pdf-extraction-generation\u002Findex.md",{"type":7,"value":64808,"toc":67995},[64809,64812,64822,64841,64949,64960,64962,64965,65094,65107,65109,65112,65190,65196,65202,65224,65227,65477,65479,65486,65730,65735,65951,65968,65970,65985,66403,66415,66417,66429,66680,66696,66698,66717,66845,66853,67221,67226,67228,67231,67234,67416,67421,67793,67800,67802,67898,67900,67908,67921,67927,67938,67955,67957,67989,67993],[10,64810,6943],{"id":64811},"automating-pdf-extraction-generation",[14,64813,64814,64815,2008,64818,64821],{},"A handful of PDFs is a copy-paste chore. Ten thousand of them is a data problem. The moment a finance team starts re-keying invoice totals, or an ops team manually stitches cover pages onto monthly statements, the workflow stops scaling: humans miss rows, transpose digits, and silently drop pages, and there is no audit trail when a number is wrong. The PDF format makes this worse than most — it encodes ",[26245,64816,64817],{},"visual position",[26245,64819,64820],{},"logical structure",", so a \"table\" is really a scatter of glyphs at x\u002Fy coordinates with no notion of rows or columns. 
Manual handling papers over that gap with human pattern-matching; automation has to reconstruct the structure explicitly. This guide lays out the full Python pipeline for doing that reliably: pull structured data out of existing PDFs, clean and normalize it, consolidate many files into one dataset, and generate new documents from data — plus the production hardening that keeps it running unattended.",[14,64823,64824,64825,64828,64829,64832,64833,64836,64837,64840],{},"The pipeline has four stages, and almost every PDF automation job is some arrangement of them: ",[1974,64826,64827],{},"extract"," raw text and tables, ",[1974,64830,64831],{},"transform"," them into typed, clean records, ",[1974,64834,64835],{},"consolidate"," many files into one dataset, and ",[1974,64838,64839],{},"generate"," new PDFs or reports from the result.",[2540,64842,2547,64844,2547,64847,2547,64850,2547,64864,2547,64866,2547,64869,2547,64872,2547,64875,2547,64878,2547,64881,2547,64883,2547,64886,2547,64889,2547,64891,2547,64893,2547,64895,2547,64898,2547,64902,2547,64904,2547,64907,2547,64910,2547,64913,2547,64917,2547,64919,2547,64922,2547,64924,2547,64926,2547,64931,2547,64934,2547,64937,2547,64940,2547,64944,2547,64947],{"viewBox":11071,"role":2543,"ariaLabel":64843,"xmlns":2545,"style":2546},"PDF automation data flow from input PDFs through extract, transform, consolidate, and generate stages",[2549,64845,64846],{},"PDF automation data flow",[2553,64848,64849],{},"Input PDFs flow through four stages — extract, transform, consolidate, generate — producing reports, a data warehouse, and secured 
archives.",[2557,64851,2559,64852,2559,64859,2547],{},[2561,64853,2564,64855,2564,64857,2559],{"id":64854,"x1":748,"y1":748,"x2":734,"y2":748},"pdf-pillar-grad",[2566,64856],{"offset":748,"style":2568},[2566,64858],{"offset":734,"style":2571},[2573,64860,2564,64862,2559],{"id":64861,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"pdf-pillar-arrow",[2580,64863],{"d":2582,"fill":2583},[2585,64865],{"x":24213,"y":2589,"width":2589,"height":49823,"rx":3545,"fill":2592,"stroke":2593,"style":2594},[2000,64867,64868],{"x":26350,"y":2635,"fill":2599,"style":16983},"Input PDFs",[2000,64870,64871],{"x":26350,"y":11115,"fill":2583,"style":2685},"scans · forms",[35,64873],{"x1":11108,"y1":26360,"x2":16982,"y2":26360,"stroke":2583,"markerEnd":64874,"style":2594},"url(#pdf-pillar-arrow)",[2585,64876],{"x":16982,"y":2589,"width":2589,"height":49823,"rx":3545,"fill":64877,"stroke":2593,"style":2594},"url(#pdf-pillar-grad)",[2000,64879,26355],{"x":64880,"y":11218,"fill":2599,"style":16983},"238",[2000,64882,943],{"x":64880,"y":11198,"fill":2583,"style":2685},[2000,64884,64885],{"x":64880,"y":11202,"fill":2583,"style":2685},"camelot · OCR",[35,64887],{"x1":64888,"y1":26360,"x2":58345,"y2":26360,"stroke":2583,"markerEnd":64874,"style":2594},"298",[2585,64890],{"x":58345,"y":2589,"width":2589,"height":49823,"rx":3545,"fill":64877,"stroke":2593,"style":2594},[2000,64892,26370],{"x":47140,"y":11218,"fill":2599,"style":16983},[2000,64894,26376],{"x":47140,"y":11198,"fill":2583,"style":2685},[2000,64896,64897],{"x":47140,"y":11202,"fill":2583,"style":2685},"normalize",[35,64899],{"x1":64900,"y1":26360,"x2":64901,"y2":26360,"stroke":2583,"markerEnd":64874,"style":2594},"460","502",[2585,64903],{"x":64901,"y":2589,"width":2589,"height":49823,"rx":3545,"fill":64877,"stroke":2593,"style":2594},[2000,64905,26386],{"x":64906,"y":11218,"fill":2599,"style":16983},"562",[2000,64908,64909],{"x":64906,"y":11198,"fill":2583,"style":2685},"concat · 
merge",[2000,64911,64912],{"x":64906,"y":11202,"fill":2583,"style":2685},"dedup",[35,64914],{"x1":64915,"y1":26360,"x2":64916,"y2":26360,"stroke":2583,"markerEnd":64874,"style":2594},"622","664",[2585,64918],{"x":64916,"y":2589,"width":17018,"height":49823,"rx":3545,"fill":2592,"stroke":2593,"style":2594},[2000,64920,26403],{"x":64921,"y":2635,"fill":2599,"style":16983},"706",[2000,64923,19002],{"x":64921,"y":11115,"fill":2583,"style":2685},[2585,64925],{"x":64901,"y":26410,"width":59975,"height":2590,"rx":3545,"fill":2615,"stroke":2593,"style":2594},[2000,64927,64930],{"x":64928,"y":64929,"fill":2599,"style":2600},"625","258","Outputs",[2000,64932,64933],{"x":58359,"y":2684,"fill":2583,"style":2605},"reports · CSV\u002FParquet · secured archive",[35,64935],{"x1":64921,"y1":64936,"x2":64921,"y2":26410,"stroke":2583,"markerEnd":64874,"style":2594},"192",[2585,64938],{"x":16982,"y":26323,"width":64939,"height":26341,"rx":3545,"fill":2615,"stroke":2593,"style":2594},"282",[2000,64941,64943],{"x":64942,"y":2590,"fill":2599,"style":2600},"319","Production harness",[2000,64945,64946],{"x":64942,"y":26350,"fill":2583,"style":2685},"cron \u002F GitHub Actions · logging · retries",[35,64948],{"x1":64942,"y1":2589,"x2":64942,"y2":17018,"stroke":2583,"markerEnd":64874,"style":2594},[14,64950,64951,64952,365,64954,365,64956,26462,64958,3035],{},"The rest of this overview walks each stage in order, with the libraries that own each one. If you already know which stage you're stuck on, jump straight to the relevant guide: ",[940,64953,9592],{"href":942},[940,64955,26191],{"href":19001},[940,64957,52682],{"href":52681},[940,64959,36756],{"href":26957},[18,64961,26469],{"id":26468},[14,64963,64964],{},"There is no single PDF library that does everything well. Each stage has a tool that owns it, and the most common cause of a brittle pipeline is reaching for the wrong one — using a text extractor on a scanned image, or a layout parser on a ruled financial table. 
Pick per task, not per project.",[4273,64966,64967,64980],{},[4276,64968,64969],{},[4279,64970,64971,64973,64976,64978],{},[4282,64972,26485],{},[4282,64974,64975],{},"Best for",[4282,64977,26491],{},[4282,64979,26494],{},[4292,64981,64982,64998,65018,65039,65057,65073],{},[4279,64983,64984,64988,64991,64995],{},[4297,64985,64986],{},[940,64987,943],{"href":942},[4297,64989,64990],{},"Layout-aware text + coordinate-precise tables on born-digital PDFs",[4297,64992,64993],{},[30,64994,26510],{},[4297,64996,64997],{},"Scanned\u002Fimage-only pages (no text layer to read)",[4279,64999,65000,65004,65010,65015],{},[4297,65001,65002],{},[940,65003,16139],{"href":9605},[4297,65005,65006,65007,65009],{},"Tables with visible ruling lines (",[30,65008,36830],{}," mode)",[4297,65011,65012],{},[30,65013,65014],{},"pip install \"camelot-py[base]\"",[4297,65016,65017],{},"Borderless tables or huge batches — it's slow",[4279,65019,65020,65028,65031,65036],{},[4297,65021,65022],{},[940,65023,65024,65025,12027],{"href":26957},"PyMuPDF (",[30,65026,65027],{},"fitz",[4297,65029,65030],{},"Fast rendering to images, metadata, page rasterization for OCR",[4297,65032,65033],{},[30,65034,65035],{},"pip install PyMuPDF",[4297,65037,65038],{},"Fine-grained table cell detection — use pdfplumber",[4279,65040,65041,65046,65049,65054],{},[4297,65042,65043],{},[940,65044,65045],{"href":52681},"pypdf",[4297,65047,65048],{},"Merge, split, rotate, encrypt, read form fields",[4297,65050,65051],{},[30,65052,65053],{},"pip install pypdf",[4297,65055,65056],{},"Extracting clean tabular text — it has no layout model",[4279,65058,65059,65063,65066,65070],{},[4297,65060,65061],{},[940,65062,19002],{"href":19001},[4297,65064,65065],{},"Programmatic generation of pixel-precise PDFs from data",[4297,65067,65068],{},[30,65069,26587],{},[4297,65071,65072],{},"HTML-first layouts — WeasyPrint fits those 
better",[4279,65074,65075,65079,65082,65091],{},[4297,65076,65077],{},[940,65078,26361],{"href":26957},[4297,65080,65081],{},"OCR of scanned\u002Fimage pages into text",[4297,65083,65084,65087,65088],{},[30,65085,65086],{},"apt install tesseract-ocr"," + ",[30,65089,65090],{},"pip install pytesseract",[4297,65092,65093],{},"Born-digital PDFs that already have a text layer",[14,65095,65096,65097,65100,65101,65104,65105,3035],{},"A quick rule of thumb: if you can select and copy text in a PDF viewer, the page is ",[26245,65098,65099],{},"born-digital"," and pdfplumber or camelot will read it directly. If selecting grabs the whole page as one block (or nothing), it's a ",[26245,65102,65103],{},"scan"," and you need the Tesseract path. The decision logic for tables specifically — lattice vs. stream, pdfplumber vs. camelot vs. tabula — is laid out in ",[940,65106,9606],{"href":9605},[18,65108,26619],{"id":26618},[14,65110,65111],{},"Isolate every automation project in a virtualenv. PDF libraries pull in heavy native dependencies (PyMuPDF ships MuPDF, camelot wants Ghostscript), and an unpinned install will drift the moment a transitive dependency releases. 
Pin everything.",[23,65113,65115],{"className":25,"code":65114,"language":27,"meta":28,"style":28},"# Create and activate an isolated environment\npython3 -m venv .venv\nsource .venv\u002Fbin\u002Factivate            # Windows: .venv\\Scripts\\activate\npython -m pip install --upgrade pip\n\n# System dependencies (Debian\u002FUbuntu) for OCR and camelot lattice mode\nsudo apt-get install -y tesseract-ocr ghostscript\n\npip install -r requirements.txt\n",[30,65116,65117,65121,65131,65139,65153,65157,65162,65176,65180],{"__ignoreMap":28},[33,65118,65119],{"class":35,"line":36},[33,65120,26635],{"class":39},[33,65122,65123,65125,65127,65129],{"class":35,"line":43},[33,65124,2011],{"class":46},[33,65126,51],{"class":50},[33,65128,55],{"class":54},[33,65130,58],{"class":54},[33,65132,65133,65135,65137],{"class":35,"line":61},[33,65134,64],{"class":50},[33,65136,67],{"class":54},[33,65138,26654],{"class":39},[33,65140,65141,65143,65145,65147,65149,65151],{"class":35,"line":73},[33,65142,47],{"class":46},[33,65144,51],{"class":50},[33,65146,26663],{"class":54},[33,65148,79],{"class":54},[33,65150,26668],{"class":50},[33,65152,26671],{"class":54},[33,65154,65155],{"class":35,"line":88},[33,65156,92],{"emptyLinePlaceholder":91},[33,65158,65159],{"class":35,"line":95},[33,65160,65161],{"class":39},"# System dependencies (Debian\u002FUbuntu) for OCR and camelot lattice mode\n",[33,65163,65164,65166,65168,65170,65172,65174],{"class":35,"line":101},[33,65165,9669],{"class":46},[33,65167,9672],{"class":54},[33,65169,79],{"class":54},[33,65171,20912],{"class":50},[33,65173,26693],{"class":54},[33,65175,36900],{"class":54},[33,65177,65178],{"class":35,"line":171},[33,65179,92],{"emptyLinePlaceholder":91},[33,65181,65182,65184,65186,65188],{"class":35,"line":179},[33,65183,76],{"class":46},[33,65185,79],{"class":54},[33,65187,26709],{"class":50},[33,65189,26712],{"class":54},[14,65191,65192,65193,65195],{},"Pin the versions in ",[30,65194,26625],{}," so a rebuild three months from 
now produces the same bytes. These are known-good as of this writing; bump deliberately, not accidentally.",[23,65197,65200],{"className":65198,"code":65199,"language":2000,"meta":28},[1998],"# requirements.txt\npdfplumber==0.11.4\ncamelot-py[base]==0.11.0\nPyMuPDF==1.24.10\npypdf==4.3.1\nreportlab==4.2.2\npytesseract==0.3.13\npandas==2.2.2\npdf2image==1.17.0\n",[30,65201,65199],{"__ignoreMap":28},[14,65203,65204,65205,65207,65208,65210,65211,65213,65214,65216,65217,65219,65220,3035],{},"A note on imports that trip people up: PyMuPDF installs as ",[30,65206,52011],{}," but imports as ",[30,65209,65027],{},", and camelot frequently fails its first import on Linux with a missing Ghostscript or ",[30,65212,41520],{}," dependency — that specific failure is solved in ",[940,65215,9739],{"href":9738},". Tesseract not being on ",[30,65218,122],{}," produces its own classic error, covered in ",[940,65221,65223],{"href":65222},"\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Ffix-tesseract-not-found-error\u002F","Fix \"TesseractNotFoundError\" in Python",[14,65225,65226],{},"Set up logging once, at module load, so every stage of the pipeline writes to the same stream. 
Unattended jobs are only debuggable if they leave a trail.",[23,65228,65230],{"className":126,"code":65229,"language":47,"meta":28,"style":28},"# pip install (stdlib only)\nimport logging\nfrom pathlib import Path\n\nLOG_DIR = Path(\"logs\")\nLOG_DIR.mkdir(exist_ok=True)\n\nlogging.basicConfig(\n    level=logging.INFO,\n    format=\"%(asctime)s | %(levelname)s | %(message)s\",\n    datefmt=\"%Y-%m-%d %H:%M:%S\",\n    handlers=[\n        logging.FileHandler(LOG_DIR \u002F \"pdf_pipeline.log\"),\n        logging.StreamHandler(),\n    ],\n)\nlog = logging.getLogger(\"pdf_pipeline\")\n\n\ndef workspace(pdf_dir: str) -> Path:\n    \"\"\"Resolve and validate an input directory with cross-platform paths.\"\"\"\n    target = Path(pdf_dir).expanduser().resolve()\n    if not target.is_dir():\n        log.error(\"Workspace directory does not exist: %s\", target)\n        raise FileNotFoundError(f\"Extraction directory not found: {target}\")\n    return target\n",[30,65231,65232,65236,65242,65252,65256,65268,65282,65286,65290,65302,65324,65338,65346,65359,65363,65367,65371,65384,65388,65392,65407,65412,65422,65431,65446,65470],{"__ignoreMap":28},[33,65233,65234],{"class":35,"line":36},[33,65235,26734],{"class":39},[33,65237,65238,65240],{"class":35,"line":43},[33,65239,164],{"class":163},[33,65241,184],{"class":167},[33,65243,65244,65246,65248,65250],{"class":35,"line":61},[33,65245,190],{"class":163},[33,65247,193],{"class":167},[33,65249,164],{"class":163},[33,65251,198],{"class":167},[33,65253,65254],{"class":35,"line":73},[33,65255,92],{"emptyLinePlaceholder":91},[33,65257,65258,65260,65262,65264,65266],{"class":35,"line":88},[33,65259,1023],{"class":50},[33,65261,212],{"class":163},[33,65263,215],{"class":167},[33,65265,26765],{"class":54},[33,65267,221],{"class":167},[33,65269,65270,65272,65274,65276,65278,65280],{"class":35,"line":95},[33,65271,1023],{"class":50},[33,65273,1078],{"class":167},[33,65275,878],{"class":238},[33,65277,242],{"class":163},[33,65279,855],{"clas
s":50},[33,65281,221],{"class":167},[33,65283,65284],{"class":35,"line":101},[33,65285,92],{"emptyLinePlaceholder":91},[33,65287,65288],{"class":35,"line":171},[33,65289,232],{"class":167},[33,65291,65292,65294,65296,65298,65300],{"class":35,"line":179},[33,65293,253],{"class":238},[33,65295,242],{"class":163},[33,65297,258],{"class":167},[33,65299,1067],{"class":50},[33,65301,247],{"class":167},[33,65303,65304,65306,65308,65310,65312,65314,65316,65318,65320,65322],{"class":35,"line":187},[33,65305,269],{"class":238},[33,65307,242],{"class":163},[33,65309,274],{"class":54},[33,65311,277],{"class":50},[33,65313,26814],{"class":54},[33,65315,26817],{"class":50},[33,65317,26814],{"class":54},[33,65319,26827],{"class":50},[33,65321,274],{"class":54},[33,65323,247],{"class":167},[33,65325,65326,65328,65330,65332,65334,65336],{"class":35,"line":201},[33,65327,26836],{"class":238},[33,65329,242],{"class":163},[33,65331,1244],{"class":54},[33,65333,916],{"class":50},[33,65335,26845],{"class":54},[33,65337,247],{"class":167},[33,65339,65340,65342,65344],{"class":35,"line":206},[33,65341,26852],{"class":238},[33,65343,242],{"class":163},[33,65345,26857],{"class":167},[33,65347,65348,65350,65352,65354,65357],{"class":35,"line":224},[33,65349,26862],{"class":167},[33,65351,1023],{"class":50},[33,65353,1107],{"class":163},[33,65355,65356],{"class":54}," 
\"pdf_pipeline.log\"",[33,65358,1506],{"class":167},[33,65360,65361],{"class":35,"line":229},[33,65362,26875],{"class":167},[33,65364,65365],{"class":35,"line":235},[33,65366,26880],{"class":167},[33,65368,65369],{"class":35,"line":250},[33,65370,221],{"class":167},[33,65372,65373,65375,65377,65379,65382],{"class":35,"line":266},[33,65374,28695],{"class":167},[33,65376,242],{"class":163},[33,65378,544],{"class":167},[33,65380,65381],{"class":54},"\"pdf_pipeline\"",[33,65383,221],{"class":167},[33,65385,65386],{"class":35,"line":290},[33,65387,92],{"emptyLinePlaceholder":91},[33,65389,65390],{"class":35,"line":295},[33,65391,92],{"emptyLinePlaceholder":91},[33,65393,65394,65396,65399,65402,65404],{"class":35,"line":300},[33,65395,562],{"class":163},[33,65397,65398],{"class":46}," workspace",[33,65400,65401],{"class":167},"(pdf_dir: ",[33,65403,1053],{"class":50},[33,65405,65406],{"class":167},") -> Path:\n",[33,65408,65409],{"class":35,"line":317},[33,65410,65411],{"class":54},"    \"\"\"Resolve and validate an input directory with cross-platform paths.\"\"\"\n",[33,65413,65414,65417,65419],{"class":35,"line":332},[33,65415,65416],{"class":167},"    target ",[33,65418,242],{"class":163},[33,65420,65421],{"class":167}," Path(pdf_dir).expanduser().resolve()\n",[33,65423,65424,65426,65428],{"class":35,"line":347},[33,65425,617],{"class":163},[33,65427,620],{"class":163},[33,65429,65430],{"class":167}," target.is_dir():\n",[33,65432,65433,65436,65439,65441,65443],{"class":35,"line":374},[33,65434,65435],{"class":167},"        log.error(",[33,65437,65438],{"class":54},"\"Workspace directory does not exist: ",[33,65440,309],{"class":50},[33,65442,274],{"class":54},[33,65444,65445],{"class":167},", target)\n",[33,65447,65448,65450,65452,65454,65456,65459,65461,65464,65466,65468],{"class":35,"line":397},[33,65449,4051],{"class":163},[33,65451,2945],{"class":50},[33,65453,602],{"class":167},[33,65455,4059],{"class":163},[33,65457,65458],{"class":54},"\"Extraction directory 
not found: ",[33,65460,1115],{"class":50},[33,65462,65463],{"class":167},"target",[33,65465,1121],{"class":50},[33,65467,274],{"class":54},[33,65469,221],{"class":167},[33,65471,65472,65474],{"class":35,"line":653},[33,65473,1332],{"class":163},[33,65475,65476],{"class":167}," target\n",[18,65478,26941],{"id":26940},[14,65480,65481,65482,65485],{},"Ingestion is where most pipelines silently lose data, because the failure mode is rarely an exception — it's wrong text with no error. The cardinal rule: ",[1974,65483,65484],{},"classify the page before you parse it."," A born-digital page has a text layer pdfplumber can read; a scanned page returns empty strings and needs OCR. Branch on that distinction up front rather than discovering it three stages downstream when totals don't reconcile.",[23,65487,65489],{"className":126,"code":65488,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nimport logging\nfrom pathlib import Path\n\nimport pdfplumber\n\nlog = logging.getLogger(\"pdf_pipeline\")\n\n\ndef classify_page_has_text(pdf_path: Path, min_chars: int = 20) -> bool:\n    \"\"\"Return True if the first page has a real text layer (born-digital),\n    False if it's likely a scan that needs the OCR path.\"\"\"\n    if not pdf_path.exists():\n        raise FileNotFoundError(f\"Source PDF missing: {pdf_path}\")\n    try:\n        with pdfplumber.open(pdf_path) as pdf:\n            text = pdf.pages[0].extract_text() or \"\"\n            has_text = len(text.strip()) >= min_chars\n            log.info(\"%s -> %s\", pdf_path.name, \"digital\" if has_text else \"scanned\")\n            return has_text\n    except Exception as exc:                      # corrupt\u002Fencrypted file\n        log.warning(\"Could not open %s: %s\", pdf_path.name, exc)\n        return 
False\n",[30,65490,65491,65495,65501,65511,65515,65521,65525,65537,65541,65545,65567,65572,65577,65585,65607,65613,65623,65639,65654,65686,65693,65707,65724],{"__ignoreMap":28},[33,65492,65493],{"class":35,"line":36},[33,65494,9763],{"class":39},[33,65496,65497,65499],{"class":35,"line":43},[33,65498,164],{"class":163},[33,65500,184],{"class":167},[33,65502,65503,65505,65507,65509],{"class":35,"line":61},[33,65504,190],{"class":163},[33,65506,193],{"class":167},[33,65508,164],{"class":163},[33,65510,198],{"class":167},[33,65512,65513],{"class":35,"line":73},[33,65514,92],{"emptyLinePlaceholder":91},[33,65516,65517,65519],{"class":35,"line":88},[33,65518,164],{"class":163},[33,65520,485],{"class":167},[33,65522,65523],{"class":35,"line":95},[33,65524,92],{"emptyLinePlaceholder":91},[33,65526,65527,65529,65531,65533,65535],{"class":35,"line":101},[33,65528,28695],{"class":167},[33,65530,242],{"class":163},[33,65532,544],{"class":167},[33,65534,65381],{"class":54},[33,65536,221],{"class":167},[33,65538,65539],{"class":35,"line":171},[33,65540,92],{"emptyLinePlaceholder":91},[33,65542,65543],{"class":35,"line":179},[33,65544,92],{"emptyLinePlaceholder":91},[33,65546,65547,65549,65552,65555,65557,65559,65561,65563,65565],{"class":35,"line":187},[33,65548,562],{"class":163},[33,65550,65551],{"class":46}," classify_page_has_text",[33,65553,65554],{"class":167},"(pdf_path: Path, min_chars: ",[33,65556,1059],{"class":50},[33,65558,212],{"class":163},[33,65560,43599],{"class":50},[33,65562,1617],{"class":167},[33,65564,2821],{"class":50},[33,65566,574],{"class":167},[33,65568,65569],{"class":35,"line":201},[33,65570,65571],{"class":54},"    \"\"\"Return True if the first page has a real text layer (born-digital),\n",[33,65573,65574],{"class":35,"line":206},[33,65575,65576],{"class":54},"    False if it's likely a scan that needs the OCR 
path.\"\"\"\n",[33,65578,65579,65581,65583],{"class":35,"line":224},[33,65580,617],{"class":163},[33,65582,620],{"class":163},[33,65584,21595],{"class":167},[33,65586,65587,65589,65591,65593,65595,65597,65599,65601,65603,65605],{"class":35,"line":229},[33,65588,4051],{"class":163},[33,65590,2945],{"class":50},[33,65592,602],{"class":167},[33,65594,4059],{"class":163},[33,65596,27064],{"class":54},[33,65598,1115],{"class":50},[33,65600,27069],{"class":167},[33,65602,1121],{"class":50},[33,65604,274],{"class":54},[33,65606,221],{"class":167},[33,65608,65609,65611],{"class":35,"line":235},[33,65610,2424],{"class":163},[33,65612,574],{"class":167},[33,65614,65615,65617,65619,65621],{"class":35,"line":250},[33,65616,2191],{"class":163},[33,65618,681],{"class":167},[33,65620,495],{"class":163},[33,65622,686],{"class":167},[33,65624,65625,65627,65629,65631,65633,65635,65637],{"class":35,"line":266},[33,65626,44589],{"class":167},[33,65628,242],{"class":163},[33,65630,9870],{"class":167},[33,65632,748],{"class":50},[33,65634,62547],{"class":167},[33,65636,7162],{"class":163},[33,65638,13126],{"class":54},[33,65640,65641,65643,65645,65647,65649,65651],{"class":35,"line":290},[33,65642,9879],{"class":167},[33,65644,242],{"class":163},[33,65646,4037],{"class":50},[33,65648,62560],{"class":167},[33,65650,43000],{"class":163},[33,65652,65653],{"class":167}," min_chars\n",[33,65655,65656,65659,65661,65663,65666,65668,65670,65672,65675,65677,65679,65681,65684],{"class":35,"line":295},[33,65657,65658],{"class":167},"            log.info(",[33,65660,274],{"class":54},[33,65662,309],{"class":50},[33,65664,65665],{"class":54}," -> ",[33,65667,309],{"class":50},[33,65669,274],{"class":54},[33,65671,6250],{"class":167},[33,65673,65674],{"class":54},"\"digital\"",[33,65676,9994],{"class":163},[33,65678,9970],{"class":167},[33,65680,7489],{"class":163},[33,65682,65683],{"class":54}," 
\"scanned\"",[33,65685,221],{"class":167},[33,65687,65688,65690],{"class":35,"line":300},[33,65689,28782],{"class":163},[33,65691,65692],{"class":167}," has_text\n",[33,65694,65695,65697,65699,65701,65704],{"class":35,"line":317},[33,65696,2449],{"class":163},[33,65698,783],{"class":50},[33,65700,1852],{"class":163},[33,65702,65703],{"class":167}," exc:                      ",[33,65705,65706],{"class":39},"# corrupt\u002Fencrypted file\n",[33,65708,65709,65711,65713,65715,65717,65719,65721],{"class":35,"line":332},[33,65710,29039],{"class":167},[33,65712,43335],{"class":54},[33,65714,309],{"class":50},[33,65716,2079],{"class":54},[33,65718,309],{"class":50},[33,65720,274],{"class":54},[33,65722,65723],{"class":167},", pdf_path.name, exc)\n",[33,65725,65726,65728],{"class":35,"line":347},[33,65727,1659],{"class":163},[33,65729,2903],{"class":50},[14,65731,65732,65733,3035],{},"For born-digital files, pdfplumber reads tables with their geometry intact. The most common ingestion bug — columns bleeding into one another or rows misaligning — is almost always a coordinate-detection issue, not a code bug; the diagnostic walkthrough lives in ",[940,65734,10535],{"href":10534},[23,65736,65738],{"className":126,"code":65737,"language":47,"meta":28,"style":28},"# pip install pdfplumber\nimport logging\nfrom pathlib import Path\n\nimport pdfplumber\n\nlog = logging.getLogger(\"pdf_pipeline\")\n\n\ndef extract_rows(pdf_path: Path) -> list[list[str]]:\n    \"\"\"Extract every table row across all pages, normalizing None cells.\"\"\"\n    rows: list[list[str]] = []\n    try:\n        with pdfplumber.open(pdf_path) as pdf:\n            for page in pdf.pages:\n                for table in page.extract_tables():\n                    for row in table:\n                        rows.append([(c or \"\").strip() for c in row])\n    except Exception as exc:\n        log.error(\"Extraction failed for %s: %s\", pdf_path.name, exc)\n        raise\n    log.info(\"%s -> %d rows\", 
pdf_path.name, len(rows))\n    return rows\n",[30,65739,65740,65744,65750,65760,65764,65770,65774,65786,65790,65794,65808,65813,65825,65831,65841,65851,65861,65871,65891,65901,65918,65923,65945],{"__ignoreMap":28},[33,65741,65742],{"class":35,"line":36},[33,65743,9763],{"class":39},[33,65745,65746,65748],{"class":35,"line":43},[33,65747,164],{"class":163},[33,65749,184],{"class":167},[33,65751,65752,65754,65756,65758],{"class":35,"line":61},[33,65753,190],{"class":163},[33,65755,193],{"class":167},[33,65757,164],{"class":163},[33,65759,198],{"class":167},[33,65761,65762],{"class":35,"line":73},[33,65763,92],{"emptyLinePlaceholder":91},[33,65765,65766,65768],{"class":35,"line":88},[33,65767,164],{"class":163},[33,65769,485],{"class":167},[33,65771,65772],{"class":35,"line":95},[33,65773,92],{"emptyLinePlaceholder":91},[33,65775,65776,65778,65780,65782,65784],{"class":35,"line":101},[33,65777,28695],{"class":167},[33,65779,242],{"class":163},[33,65781,544],{"class":167},[33,65783,65381],{"class":54},[33,65785,221],{"class":167},[33,65787,65788],{"class":35,"line":171},[33,65789,92],{"emptyLinePlaceholder":91},[33,65791,65792],{"class":35,"line":179},[33,65793,92],{"emptyLinePlaceholder":91},[33,65795,65796,65798,65801,65804,65806],{"class":35,"line":187},[33,65797,562],{"class":163},[33,65799,65800],{"class":46}," extract_rows",[33,65802,65803],{"class":167},"(pdf_path: Path) -> list[list[",[33,65805,1053],{"class":50},[33,65807,43900],{"class":167},[33,65809,65810],{"class":35,"line":201},[33,65811,65812],{"class":54},"    \"\"\"Extract every table row across all pages, normalizing None 
cells.\"\"\"\n",[33,65814,65815,65817,65819,65821,65823],{"class":35,"line":206},[33,65816,13076],{"class":167},[33,65818,1053],{"class":50},[33,65820,13081],{"class":167},[33,65822,242],{"class":163},[33,65824,589],{"class":167},[33,65826,65827,65829],{"class":35,"line":224},[33,65828,2424],{"class":163},[33,65830,574],{"class":167},[33,65832,65833,65835,65837,65839],{"class":35,"line":229},[33,65834,2191],{"class":163},[33,65836,681],{"class":167},[33,65838,495],{"class":163},[33,65840,686],{"class":167},[33,65842,65843,65845,65847,65849],{"class":35,"line":235},[33,65844,1793],{"class":163},[33,65846,695],{"class":167},[33,65848,662],{"class":163},[33,65850,700],{"class":167},[33,65852,65853,65855,65857,65859],{"class":35,"line":250},[33,65854,692],{"class":163},[33,65856,5998],{"class":167},[33,65858,662],{"class":163},[33,65860,27124],{"class":167},[33,65862,65863,65865,65867,65869],{"class":35,"line":266},[33,65864,13668],{"class":163},[33,65866,3844],{"class":167},[33,65868,662],{"class":163},[33,65870,720],{"class":167},[33,65872,65873,65876,65878,65880,65882,65884,65886,65888],{"class":35,"line":290},[33,65874,65875],{"class":167},"                        rows.append([(c ",[33,65877,7162],{"class":163},[33,65879,9892],{"class":54},[33,65881,27136],{"class":167},[33,65883,6124],{"class":163},[33,65885,7486],{"class":167},[33,65887,662],{"class":163},[33,65889,65890],{"class":167}," row])\n",[33,65892,65893,65895,65897,65899],{"class":35,"line":295},[33,65894,2449],{"class":163},[33,65896,783],{"class":50},[33,65898,1852],{"class":163},[33,65900,1855],{"class":167},[33,65902,65903,65905,65908,65910,65912,65914,65916],{"class":35,"line":300},[33,65904,65435],{"class":167},[33,65906,65907],{"class":54},"\"Extraction failed for ",[33,65909,309],{"class":50},[33,65911,2079],{"class":54},[33,65913,309],{"class":50},[33,65915,274],{"class":54},[33,65917,65723],{"class":167},[33,65919,65920],{"class":35,"line":317},[33,65921,65922],{"class":163},"        
raise\n",[33,65924,65925,65927,65929,65931,65933,65935,65938,65940,65942],{"class":35,"line":332},[33,65926,29002],{"class":167},[33,65928,274],{"class":54},[33,65930,309],{"class":50},[33,65932,65665],{"class":54},[33,65934,916],{"class":50},[33,65936,65937],{"class":54}," rows\"",[33,65939,6250],{"class":167},[33,65941,928],{"class":50},[33,65943,65944],{"class":167},"(rows))\n",[33,65946,65947,65949],{"class":35,"line":347},[33,65948,1332],{"class":163},[33,65950,44355],{"class":167},[14,65952,65953,65954,65956,65957,65959,65960,65963,65964,3035],{},"Scanned files take the OCR branch: rasterize each page with PyMuPDF, then hand the image to Tesseract. The full preprocessing recipe — deskew, threshold, upscale — is in ",[940,65955,10077],{"href":10076}," and the OCR-specific tuning in ",[940,65958,36756],{"href":26957},". Encrypted files are a third branch entirely: pdfplumber raises before it reads a byte, so check ",[30,65961,65962],{},"pypdf.PdfReader(path).is_encrypted"," and decrypt first — see ",[940,65965,65967],{"href":65966},"\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002F","Watermarking and Securing PDFs",[18,65969,27435],{"id":27434},[14,65971,65972,65973,365,65976,365,65979,65981,65982,65984],{},"Extraction gives you strings. Every cell is text — ",[30,65974,65975],{},"\"1,234.50\"",[30,65977,65978],{},"\"  $0.00 \"",[30,65980,27824],{}," — and none of it is typed. The transformation stage turns those strings into a clean, typed table with a stable schema, and it is where you enforce correctness with assertions rather than hope. 
Load the extracted rows into pandas (the same dataframe-centric approach used throughout ",[940,65983,26258],{"href":26257},"), coerce types explicitly, and fail loudly when a coercion can't be trusted.",[23,65986,65988],{"className":126,"code":65987,"language":47,"meta":28,"style":28},"# pip install pandas\nimport logging\n\nimport pandas as pd\n\nlog = logging.getLogger(\"pdf_pipeline\")\n\nEXPECTED = [\"invoice_id\", \"date\", \"amount\"]\n\n\ndef normalize(rows: list[list[str]]) -> pd.DataFrame:\n    \"\"\"Build a typed, schema-normalized frame from raw extracted rows.\"\"\"\n    if not rows:\n        return pd.DataFrame(columns=EXPECTED)\n\n    df = pd.DataFrame(rows[1:], columns=rows[0])      # first row is the header\n    df.columns = [c.strip().lower().replace(\" \", \"_\") for c in df.columns]\n\n    # Coerce types explicitly; errors='coerce' turns junk into NaN, not crashes\n    df[\"amount\"] = (\n        df[\"amount\"].str.replace(r\"[,$\\s]\", \"\", regex=True)\n    )\n    df[\"amount\"] = pd.to_numeric(df[\"amount\"], errors=\"coerce\")\n    df[\"date\"] = pd.to_datetime(df[\"date\"], errors=\"coerce\", dayfirst=False)\n\n    # Quarantine rows that failed coercion instead of silently keeping them\n    bad = df[df[\"amount\"].isna() | df[\"date\"].isna()]\n    if not bad.empty:\n        log.warning(\"Dropping %d rows that failed type coercion\", len(bad))\n        df = df.drop(bad.index)\n\n    missing = set(EXPECTED) - set(df.columns)\n    if missing:\n        raise ValueError(f\"Schema mismatch, missing columns: {missing}\")\n    return 
df[EXPECTED].reset_index(drop=True)\n",[30,65989,65990,65994,66000,66004,66014,66018,66030,66034,66054,66058,66062,66076,66081,66089,66103,66107,66132,66156,66160,66165,66177,66208,66212,66236,66268,66272,66277,66297,66305,66324,66333,66337,66357,66363,66385],{"__ignoreMap":28},[33,65991,65992],{"class":35,"line":36},[33,65993,8895],{"class":39},[33,65995,65996,65998],{"class":35,"line":43},[33,65997,164],{"class":163},[33,65999,184],{"class":167},[33,66001,66002],{"class":35,"line":61},[33,66003,92],{"emptyLinePlaceholder":91},[33,66005,66006,66008,66010,66012],{"class":35,"line":73},[33,66007,164],{"class":163},[33,66009,492],{"class":167},[33,66011,495],{"class":163},[33,66013,498],{"class":167},[33,66015,66016],{"class":35,"line":88},[33,66017,92],{"emptyLinePlaceholder":91},[33,66019,66020,66022,66024,66026,66028],{"class":35,"line":95},[33,66021,28695],{"class":167},[33,66023,242],{"class":163},[33,66025,544],{"class":167},[33,66027,65381],{"class":54},[33,66029,221],{"class":167},[33,66031,66032],{"class":35,"line":101},[33,66033,92],{"emptyLinePlaceholder":91},[33,66035,66036,66038,66040,66042,66044,66046,66048,66050,66052],{"class":35,"line":171},[33,66037,27469],{"class":50},[33,66039,212],{"class":163},[33,66041,9178],{"class":167},[33,66043,27353],{"class":54},[33,66045,365],{"class":167},[33,66047,4101],{"class":54},[33,66049,365],{"class":167},[33,66051,4106],{"class":54},[33,66053,9202],{"class":167},[33,66055,66056],{"class":35,"line":179},[33,66057,92],{"emptyLinePlaceholder":91},[33,66059,66060],{"class":35,"line":187},[33,66061,92],{"emptyLinePlaceholder":91},[33,66063,66064,66066,66069,66071,66073],{"class":35,"line":201},[33,66065,562],{"class":163},[33,66067,66068],{"class":46}," normalize",[33,66070,47650],{"class":167},[33,66072,1053],{"class":50},[33,66074,66075],{"class":167},"]]) -> pd.DataFrame:\n",[33,66077,66078],{"class":35,"line":206},[33,66079,66080],{"class":54},"    \"\"\"Build a typed, schema-normalized frame from raw extracted 
rows.\"\"\"\n",[33,66082,66083,66085,66087],{"class":35,"line":224},[33,66084,617],{"class":163},[33,66086,620],{"class":163},[33,66088,8723],{"class":167},[33,66090,66091,66093,66095,66097,66099,66101],{"class":35,"line":229},[33,66092,1659],{"class":163},[33,66094,27521],{"class":167},[33,66096,740],{"class":238},[33,66098,242],{"class":163},[33,66100,27469],{"class":50},[33,66102,221],{"class":167},[33,66104,66105],{"class":35,"line":235},[33,66106,92],{"emptyLinePlaceholder":91},[33,66108,66109,66111,66113,66115,66117,66119,66121,66123,66125,66127,66130],{"class":35,"line":250},[33,66110,4025],{"class":167},[33,66112,242],{"class":163},[33,66114,13261],{"class":167},[33,66116,734],{"class":50},[33,66118,737],{"class":167},[33,66120,740],{"class":238},[33,66122,242],{"class":163},[33,66124,27235],{"class":167},[33,66126,748],{"class":50},[33,66128,66129],{"class":167},"])      ",[33,66131,27243],{"class":39},[33,66133,66134,66136,66138,66140,66142,66144,66146,66148,66150,66152,66154],{"class":35,"line":266},[33,66135,27546],{"class":167},[33,66137,242],{"class":163},[33,66139,27551],{"class":167},[33,66141,17294],{"class":54},[33,66143,365],{"class":167},[33,66145,7764],{"class":54},[33,66147,1649],{"class":167},[33,66149,6124],{"class":163},[33,66151,7486],{"class":167},[33,66153,662],{"class":163},[33,66155,12624],{"class":167},[33,66157,66158],{"class":35,"line":290},[33,66159,92],{"emptyLinePlaceholder":91},[33,66161,66162],{"class":35,"line":295},[33,66163,66164],{"class":39},"    # Coerce types explicitly; errors='coerce' turns junk into NaN, not 
crashes\n",[33,66166,66167,66169,66171,66173,66175],{"class":35,"line":300},[33,66168,27581],{"class":167},[33,66170,4106],{"class":54},[33,66172,763],{"class":167},[33,66174,242],{"class":163},[33,66176,1415],{"class":167},[33,66178,66179,66181,66183,66186,66188,66190,66192,66194,66196,66198,66200,66202,66204,66206],{"class":35,"line":317},[33,66180,10902],{"class":167},[33,66182,4106],{"class":54},[33,66184,66185],{"class":167},"].str.replace(",[33,66187,11977],{"class":163},[33,66189,274],{"class":54},[33,66191,27610],{"class":50},[33,66193,274],{"class":54},[33,66195,365],{"class":167},[33,66197,3198],{"class":54},[33,66199,365],{"class":167},[33,66201,11993],{"class":238},[33,66203,242],{"class":163},[33,66205,855],{"class":50},[33,66207,221],{"class":167},[33,66209,66210],{"class":35,"line":332},[33,66211,1202],{"class":167},[33,66213,66214,66216,66218,66220,66222,66224,66226,66228,66230,66232,66234],{"class":35,"line":347},[33,66215,27581],{"class":167},[33,66217,4106],{"class":54},[33,66219,763],{"class":167},[33,66221,242],{"class":163},[33,66223,27643],{"class":167},[33,66225,4106],{"class":54},[33,66227,8314],{"class":167},[33,66229,8317],{"class":238},[33,66231,242],{"class":163},[33,66233,12107],{"class":54},[33,66235,221],{"class":167},[33,66237,66238,66240,66242,66244,66246,66248,66250,66252,66254,66256,66258,66260,66262,66264,66266],{"class":35,"line":374},[33,66239,27581],{"class":167},[33,66241,4101],{"class":54},[33,66243,763],{"class":167},[33,66245,242],{"class":163},[33,66247,27668],{"class":167},[33,66249,4101],{"class":54},[33,66251,8314],{"class":167},[33,66253,8317],{"class":238},[33,66255,242],{"class":163},[33,66257,12107],{"class":54},[33,66259,365],{"class":167},[33,66261,27683],{"class":238},[33,66263,242],{"class":163},[33,66265,902],{"class":50},[33,66267,221],{"class":167},[33,66269,66270],{"class":35,"line":397},[33,66271,92],{"emptyLinePlaceholder":91},[33,66273,66274],{"class":35,"line":653},[33,66275,66276],{"class":39},"    # 
Quarantine rows that failed coercion instead of silently keeping them\n",[33,66278,66279,66281,66283,66285,66287,66289,66291,66293,66295],{"class":35,"line":667},[33,66280,27698],{"class":167},[33,66282,242],{"class":163},[33,66284,27703],{"class":167},[33,66286,4106],{"class":54},[33,66288,27708],{"class":167},[33,66290,7654],{"class":163},[33,66292,7935],{"class":167},[33,66294,4101],{"class":54},[33,66296,27717],{"class":167},[33,66298,66299,66301,66303],{"class":35,"line":675},[33,66300,617],{"class":163},[33,66302,620],{"class":163},[33,66304,27726],{"class":167},[33,66306,66307,66309,66312,66314,66317,66319,66321],{"class":35,"line":689},[33,66308,29039],{"class":167},[33,66310,66311],{"class":54},"\"Dropping ",[33,66313,916],{"class":50},[33,66315,66316],{"class":54}," rows that failed type coercion\"",[33,66318,365],{"class":167},[33,66320,928],{"class":50},[33,66322,66323],{"class":167},"(bad))\n",[33,66325,66326,66328,66330],{"class":35,"line":703},[33,66327,7930],{"class":167},[33,66329,242],{"class":163},[33,66331,66332],{"class":167}," 
df.drop(bad.index)\n",[33,66334,66335],{"class":35,"line":714},[33,66336,92],{"emptyLinePlaceholder":91},[33,66338,66339,66341,66343,66345,66347,66349,66351,66353,66355],{"class":35,"line":723},[33,66340,4118],{"class":167},[33,66342,242],{"class":163},[33,66344,4129],{"class":50},[33,66346,602],{"class":167},[33,66348,27469],{"class":50},[33,66350,1649],{"class":167},[33,66352,4126],{"class":163},[33,66354,4129],{"class":50},[33,66356,4132],{"class":167},[33,66358,66359,66361],{"class":35,"line":754},[33,66360,617],{"class":163},[33,66362,4139],{"class":167},[33,66364,66365,66367,66369,66371,66373,66375,66377,66379,66381,66383],{"class":35,"line":771},[33,66366,4051],{"class":163},[33,66368,4054],{"class":50},[33,66370,602],{"class":167},[33,66372,4059],{"class":163},[33,66374,27781],{"class":54},[33,66376,1115],{"class":50},[33,66378,4157],{"class":167},[33,66380,1121],{"class":50},[33,66382,274],{"class":54},[33,66384,221],{"class":167},[33,66386,66387,66389,66391,66393,66395,66397,66399,66401],{"class":35,"line":777},[33,66388,1332],{"class":163},[33,66390,7935],{"class":167},[33,66392,27469],{"class":50},[33,66394,27802],{"class":167},[33,66396,10868],{"class":238},[33,66398,242],{"class":163},[33,66400,855],{"class":50},[33,66402,221],{"class":167},[14,66404,66405,66406,66408,66409,66411,66412,66414],{},"The pattern that matters here is ",[30,66407,27816],{}," plus a quarantine step: bad values become ",[30,66410,8884],{},", you log and drop them, and a downstream report never inherits a silently-wrong ",[30,66413,748],{},". Schema normalization — lowercasing and underscoring headers — means files from different sources line up for the next stage even when their column casing differs.",[18,66416,27835],{"id":27834},[14,66418,66419,66420,66422,66423,66425,66426,66428],{},"A single invoice is rarely the unit of work; a month of them is. Consolidation merges many per-file frames into one dataset, and the operation you choose changes the result. 
Use ",[30,66421,8366],{}," to stack rows from files that share a schema. Use ",[30,66424,27844],{}," (a SQL-style join) to enrich rows with a lookup table — say, attaching customer names from a master list keyed on ",[30,66427,28014],{},". Then dedup, because the same PDF reprocessed twice will otherwise double-count.",[23,66430,66432],{"className":126,"code":66431,"language":47,"meta":28,"style":28},"# pip install pandas\nimport logging\nfrom pathlib import Path\n\nimport pandas as pd\n\nlog = logging.getLogger(\"pdf_pipeline\")\n\n\ndef consolidate(frames: list[pd.DataFrame], lookup: pd.DataFrame | None = None) -> pd.DataFrame:\n    \"\"\"Stack per-file frames, optionally enrich via join, then dedup.\"\"\"\n    if not frames:\n        return pd.DataFrame()\n\n    combined = pd.concat(frames, ignore_index=True)          # stack same-schema rows\n    before = len(combined)\n\n    if lookup is not None:\n        combined = combined.merge(lookup, on=\"invoice_id\", how=\"left\")  # enrich\n\n    combined = combined.drop_duplicates(subset=[\"invoice_id\"], keep=\"first\")\n    log.info(\"Consolidated %d -> %d rows after dedup\", before, len(combined))\n    return 
combined.reset_index(drop=True)\n",[30,66433,66434,66438,66444,66454,66458,66468,66472,66484,66488,66492,66511,66516,66524,66530,66534,66552,66564,66568,66583,66611,66615,66641,66665],{"__ignoreMap":28},[33,66435,66436],{"class":35,"line":36},[33,66437,8895],{"class":39},[33,66439,66440,66442],{"class":35,"line":43},[33,66441,164],{"class":163},[33,66443,184],{"class":167},[33,66445,66446,66448,66450,66452],{"class":35,"line":61},[33,66447,190],{"class":163},[33,66449,193],{"class":167},[33,66451,164],{"class":163},[33,66453,198],{"class":167},[33,66455,66456],{"class":35,"line":73},[33,66457,92],{"emptyLinePlaceholder":91},[33,66459,66460,66462,66464,66466],{"class":35,"line":88},[33,66461,164],{"class":163},[33,66463,492],{"class":167},[33,66465,495],{"class":163},[33,66467,498],{"class":167},[33,66469,66470],{"class":35,"line":95},[33,66471,92],{"emptyLinePlaceholder":91},[33,66473,66474,66476,66478,66480,66482],{"class":35,"line":101},[33,66475,28695],{"class":167},[33,66477,242],{"class":163},[33,66479,544],{"class":167},[33,66481,65381],{"class":54},[33,66483,221],{"class":167},[33,66485,66486],{"class":35,"line":171},[33,66487,92],{"emptyLinePlaceholder":91},[33,66489,66490],{"class":35,"line":179},[33,66491,92],{"emptyLinePlaceholder":91},[33,66493,66494,66496,66498,66501,66503,66505,66507,66509],{"class":35,"line":187},[33,66495,562],{"class":163},[33,66497,27879],{"class":46},[33,66499,66500],{"class":167},"(frames: list[pd.DataFrame], lookup: pd.DataFrame ",[33,66502,7654],{"class":163},[33,66504,7657],{"class":50},[33,66506,212],{"class":163},[33,66508,7657],{"class":50},[33,66510,7668],{"class":167},[33,66512,66513],{"class":35,"line":201},[33,66514,66515],{"class":54},"    \"\"\"Stack per-file frames, optionally enrich via join, then 
dedup.\"\"\"\n",[33,66517,66518,66520,66522],{"class":35,"line":206},[33,66519,617],{"class":163},[33,66521,620],{"class":163},[33,66523,816],{"class":167},[33,66525,66526,66528],{"class":35,"line":224},[33,66527,1659],{"class":163},[33,66529,7721],{"class":167},[33,66531,66532],{"class":35,"line":229},[33,66533,92],{"emptyLinePlaceholder":91},[33,66535,66536,66538,66540,66542,66544,66546,66548,66550],{"class":35,"line":235},[33,66537,842],{"class":167},[33,66539,242],{"class":163},[33,66541,847],{"class":167},[33,66543,850],{"class":238},[33,66545,242],{"class":163},[33,66547,855],{"class":50},[33,66549,58930],{"class":167},[33,66551,27959],{"class":39},[33,66553,66554,66557,66559,66561],{"class":35,"line":250},[33,66555,66556],{"class":167},"    before ",[33,66558,242],{"class":163},[33,66560,4037],{"class":50},[33,66562,66563],{"class":167},"(combined)\n",[33,66565,66566],{"class":35,"line":266},[33,66567,92],{"emptyLinePlaceholder":91},[33,66569,66570,66572,66575,66577,66579,66581],{"class":35,"line":290},[33,66571,617],{"class":163},[33,66573,66574],{"class":167}," lookup ",[33,66576,3847],{"class":163},[33,66578,620],{"class":163},[33,66580,7657],{"class":50},[33,66582,574],{"class":167},[33,66584,66585,66587,66589,66592,66594,66596,66598,66600,66602,66604,66606,66608],{"class":35,"line":295},[33,66586,28029],{"class":167},[33,66588,242],{"class":163},[33,66590,66591],{"class":167}," combined.merge(lookup, ",[33,66593,2091],{"class":238},[33,66595,242],{"class":163},[33,66597,27353],{"class":54},[33,66599,365],{"class":167},[33,66601,28045],{"class":238},[33,66603,242],{"class":163},[33,66605,28050],{"class":54},[33,66607,10922],{"class":167},[33,66609,66610],{"class":39},"# 
enrich\n",[33,66612,66613],{"class":35,"line":300},[33,66614,92],{"emptyLinePlaceholder":91},[33,66616,66617,66619,66621,66623,66625,66627,66629,66631,66633,66635,66637,66639],{"class":35,"line":317},[33,66618,842],{"class":167},[33,66620,242],{"class":163},[33,66622,28063],{"class":167},[33,66624,28066],{"class":238},[33,66626,242],{"class":163},[33,66628,8309],{"class":167},[33,66630,27353],{"class":54},[33,66632,8314],{"class":167},[33,66634,28077],{"class":238},[33,66636,242],{"class":163},[33,66638,28082],{"class":54},[33,66640,221],{"class":167},[33,66642,66643,66645,66648,66650,66652,66654,66657,66660,66662],{"class":35,"line":332},[33,66644,29002],{"class":167},[33,66646,66647],{"class":54},"\"Consolidated ",[33,66649,916],{"class":50},[33,66651,65665],{"class":54},[33,66653,916],{"class":50},[33,66655,66656],{"class":54}," rows after dedup\"",[33,66658,66659],{"class":167},", before, ",[33,66661,928],{"class":50},[33,66663,66664],{"class":167},"(combined))\n",[33,66666,66667,66669,66672,66674,66676,66678],{"class":35,"line":347},[33,66668,1332],{"class":163},[33,66670,66671],{"class":167}," combined.reset_index(",[33,66673,10868],{"class":238},[33,66675,242],{"class":163},[33,66677,855],{"class":50},[33,66679,221],{"class":167},[14,66681,66682,66683,66685,66686,1351,66688,66690,66691,66693,66694,3035],{},"Two failure modes dominate here. First, joining on a key with inconsistent whitespace or casing silently produces all-",[30,66684,8884],{}," enrichment columns — normalize the join key on both sides first. Second, when two frames share a non-key column name, pandas appends ",[30,66687,28106],{},[30,66689,28109],{}," suffixes that quietly corrupt downstream column references; that exact problem and its fix are covered in ",[940,66692,28114],{"href":28113},". 
For pulling tabular PDF data into a dataframe specifically, the dedicated workflow is ",[940,66695,948],{"href":947},[18,66697,28123],{"id":28122},[14,66699,66700,66701,66703,66704,66707,66708,66710,66711,66713,66714,66716],{},"The consolidated frame branches two ways: a ",[26245,66702,28129],{}," artifact for downstream systems, and a ",[26245,66705,66706],{},"document"," artifact for humans. For the data path, write CSV or Parquet. The single most common gotcha is pandas writing a phantom index column into your CSV — pass ",[30,66709,28142],{},", the fix detailed in ",[940,66712,28147],{"href":28146},". Always set ",[30,66715,46120],{}," explicitly so accented names survive, and prefer Parquet when a BI tool will consume the output, since it preserves dtypes that CSV flattens to strings.",[23,66718,66720],{"className":126,"code":66719,"language":47,"meta":28,"style":28},"# pip install pandas pyarrow\nfrom pathlib import Path\n\nimport pandas as pd\n\n\ndef serialize(df: pd.DataFrame, out_dir: Path) -> None:\n    \"\"\"Write BI-ready CSV and Parquet artifacts with explicit options.\"\"\"\n    out_dir.mkdir(parents=True, exist_ok=True)\n    df.to_csv(out_dir \u002F \"invoices.csv\", index=False, encoding=\"utf-8\")   # no phantom index\n    df.to_parquet(out_dir \u002F \"invoices.parquet\", index=False)            # dtypes preserved\n",[30,66721,66722,66727,66737,66741,66751,66755,66759,66772,66777,66797,66825],{"__ignoreMap":28},[33,66723,66724],{"class":35,"line":36},[33,66725,66726],{"class":39},"# pip install pandas 
pyarrow\n",[33,66728,66729,66731,66733,66735],{"class":35,"line":43},[33,66730,190],{"class":163},[33,66732,193],{"class":167},[33,66734,164],{"class":163},[33,66736,198],{"class":167},[33,66738,66739],{"class":35,"line":61},[33,66740,92],{"emptyLinePlaceholder":91},[33,66742,66743,66745,66747,66749],{"class":35,"line":73},[33,66744,164],{"class":163},[33,66746,492],{"class":167},[33,66748,495],{"class":163},[33,66750,498],{"class":167},[33,66752,66753],{"class":35,"line":88},[33,66754,92],{"emptyLinePlaceholder":91},[33,66756,66757],{"class":35,"line":95},[33,66758,92],{"emptyLinePlaceholder":91},[33,66760,66761,66763,66766,66768,66770],{"class":35,"line":101},[33,66762,562],{"class":163},[33,66764,66765],{"class":46}," serialize",[33,66767,28244],{"class":167},[33,66769,571],{"class":50},[33,66771,574],{"class":167},[33,66773,66774],{"class":35,"line":171},[33,66775,66776],{"class":54},"    \"\"\"Write BI-ready CSV and Parquet artifacts with explicit options.\"\"\"\n",[33,66778,66779,66781,66783,66785,66787,66789,66791,66793,66795],{"class":35,"line":179},[33,66780,28258],{"class":167},[33,66782,869],{"class":238},[33,66784,242],{"class":163},[33,66786,855],{"class":50},[33,66788,365],{"class":167},[33,66790,878],{"class":238},[33,66792,242],{"class":163},[33,66794,855],{"class":50},[33,66796,221],{"class":167},[33,66798,66799,66801,66803,66805,66807,66809,66811,66813,66815,66817,66819,66821,66823],{"class":35,"line":187},[33,66800,28288],{"class":167},[33,66802,1351],{"class":163},[33,66804,28293],{"class":54},[33,66806,365],{"class":167},[33,66808,897],{"class":238},[33,66810,242],{"class":163},[33,66812,902],{"class":50},[33,66814,365],{"class":167},[33,66816,27249],{"class":238},[33,66818,242],{"class":163},[33,66820,1195],{"class":54},[33,66822,12000],{"class":167},[33,66824,28314],{"class":39},[33,66826,66827,66829,66831,66833,66835,66837,66839,66841,66843],{"class":35,"line":201},[33,66828,28319],{"class":167},[33,66830,1351],{"class":163},[33,66832,28324],
{"class":54},[33,66834,365],{"class":167},[33,66836,897],{"class":238},[33,66838,242],{"class":163},[33,66840,902],{"class":50},[33,66842,27956],{"class":167},[33,66844,28338],{"class":39},[14,66846,66847,66848,66850,66851,3035],{},"The document path generates a new PDF from the same data with ReportLab — a cover page, a summary table, and per-record detail. Generation is the inverse of extraction: you place text at explicit coordinates on a canvas, then assemble pages with pypdf. The full template-driven approach, including variable-length pagination, is in ",[940,66849,26191],{"href":19001},", and the invoice-specific version in ",[940,66852,53852],{"href":57625},[23,66854,66856],{"className":126,"code":66855,"language":47,"meta":28,"style":28},"# pip install reportlab pypdf\nimport logging\nfrom pathlib import Path\n\nfrom pypdf import PdfReader, PdfWriter\nfrom reportlab.lib.pagesizes import A4\nfrom reportlab.pdfgen import canvas\n\nlog = logging.getLogger(\"pdf_pipeline\")\n\n\ndef generate_report(title: str, summary: dict, out_path: Path) -> None:\n    \"\"\"Render a one-page summary PDF, then assemble it via pypdf.\"\"\"\n    tmp = out_path.with_suffix(\".tmp.pdf\")\n    try:\n        c = canvas.Canvas(str(tmp), pagesize=A4)\n        width, height = A4\n        c.setFont(\"Helvetica-Bold\", 16)\n        c.drawString(50, height - 60, title)\n        c.setFont(\"Helvetica\", 11)\n        y = height - 100\n        for key, value in summary.items():\n            c.drawString(50, y, f\"{key}: {value}\")\n            y -= 20\n        c.save()\n\n        reader = PdfReader(tmp)\n        writer = PdfWriter()\n        writer.append(reader)\n        with out_path.open(\"wb\") as fh:\n            writer.write(fh)\n        log.info(\"Report written to %s\", out_path)\n    finally:\n        tmp.unlink(missing_ok=True)              # clean up the temp file either 
way\n",[30,66857,66858,66862,66868,66878,66882,66893,66903,66913,66917,66929,66933,66937,66961,66966,66981,66987,67007,67016,67029,67045,67057,67072,67084,67117,67127,67132,67136,67145,67155,67160,67177,67182,67197,67203],{"__ignoreMap":28},[33,66859,66860],{"class":35,"line":36},[33,66861,59137],{"class":39},[33,66863,66864,66866],{"class":35,"line":43},[33,66865,164],{"class":163},[33,66867,184],{"class":167},[33,66869,66870,66872,66874,66876],{"class":35,"line":61},[33,66871,190],{"class":163},[33,66873,193],{"class":167},[33,66875,164],{"class":163},[33,66877,198],{"class":167},[33,66879,66880],{"class":35,"line":73},[33,66881,92],{"emptyLinePlaceholder":91},[33,66883,66884,66886,66888,66890],{"class":35,"line":88},[33,66885,190],{"class":163},[33,66887,57333],{"class":167},[33,66889,164],{"class":163},[33,66891,66892],{"class":167}," PdfReader, PdfWriter\n",[33,66894,66895,66897,66899,66901],{"class":35,"line":95},[33,66896,190],{"class":163},[33,66898,19044],{"class":167},[33,66900,164],{"class":163},[33,66902,19049],{"class":167},[33,66904,66905,66907,66909,66911],{"class":35,"line":101},[33,66906,190],{"class":163},[33,66908,28221],{"class":167},[33,66910,164],{"class":163},[33,66912,28226],{"class":167},[33,66914,66915],{"class":35,"line":171},[33,66916,92],{"emptyLinePlaceholder":91},[33,66918,66919,66921,66923,66925,66927],{"class":35,"line":179},[33,66920,28695],{"class":167},[33,66922,242],{"class":163},[33,66924,544],{"class":167},[33,66926,65381],{"class":54},[33,66928,221],{"class":167},[33,66930,66931],{"class":35,"line":187},[33,66932,92],{"emptyLinePlaceholder":91},[33,66934,66935],{"class":35,"line":201},[33,66936,92],{"emptyLinePlaceholder":91},[33,66938,66939,66941,66944,66947,66949,66952,66954,66957,66959],{"class":35,"line":206},[33,66940,562],{"class":163},[33,66942,66943],{"class":46}," generate_report",[33,66945,66946],{"class":167},"(title: ",[33,66948,1053],{"class":50},[33,66950,66951],{"class":167},", summary: 
",[33,66953,37100],{"class":50},[33,66955,66956],{"class":167},", out_path: Path) -> ",[33,66958,571],{"class":50},[33,66960,574],{"class":167},[33,66962,66963],{"class":35,"line":224},[33,66964,66965],{"class":54},"    \"\"\"Render a one-page summary PDF, then assemble it via pypdf.\"\"\"\n",[33,66967,66968,66971,66973,66976,66979],{"class":35,"line":229},[33,66969,66970],{"class":167},"    tmp ",[33,66972,242],{"class":163},[33,66974,66975],{"class":167}," out_path.with_suffix(",[33,66977,66978],{"class":54},"\".tmp.pdf\"",[33,66980,221],{"class":167},[33,66982,66983,66985],{"class":35,"line":235},[33,66984,2424],{"class":163},[33,66986,574],{"class":167},[33,66988,66989,66992,66994,66996,66998,67001,67003,67005],{"class":35,"line":250},[33,66990,66991],{"class":167},"        c ",[33,66993,242],{"class":163},[33,66995,28477],{"class":167},[33,66997,1053],{"class":50},[33,66999,67000],{"class":167},"(tmp), ",[33,67002,20091],{"class":238},[33,67004,242],{"class":163},[33,67006,28496],{"class":167},[33,67008,67009,67012,67014],{"class":35,"line":266},[33,67010,67011],{"class":167},"        width, height ",[33,67013,242],{"class":163},[33,67015,19049],{"class":167},[33,67017,67018,67021,67023,67025,67027],{"class":35,"line":290},[33,67019,67020],{"class":167},"        c.setFont(",[33,67022,19908],{"class":54},[33,67024,365],{"class":167},[33,67026,24213],{"class":50},[33,67028,221],{"class":167},[33,67030,67031,67034,67036,67038,67040,67042],{"class":35,"line":295},[33,67032,67033],{"class":167},"        c.drawString(",[33,67035,2680],{"class":50},[33,67037,28528],{"class":167},[33,67039,4126],{"class":163},[33,67041,28533],{"class":50},[33,67043,67044],{"class":167},", 
title)\n",[33,67046,67047,67049,67051,67053,67055],{"class":35,"line":300},[33,67048,67020],{"class":167},[33,67050,28546],{"class":54},[33,67052,365],{"class":167},[33,67054,17260],{"class":50},[33,67056,221],{"class":167},[33,67058,67059,67062,67064,67067,67069],{"class":35,"line":317},[33,67060,67061],{"class":167},"        y ",[33,67063,242],{"class":163},[33,67065,67066],{"class":167}," height ",[33,67068,4126],{"class":163},[33,67070,67071],{"class":50}," 100\n",[33,67073,67074,67076,67079,67081],{"class":35,"line":332},[33,67075,5973],{"class":163},[33,67077,67078],{"class":167}," key, value ",[33,67080,662],{"class":163},[33,67082,67083],{"class":167}," summary.items():\n",[33,67085,67086,67089,67091,67094,67096,67098,67100,67102,67104,67106,67108,67111,67113,67115],{"class":35,"line":347},[33,67087,67088],{"class":167},"            c.drawString(",[33,67090,2680],{"class":50},[33,67092,67093],{"class":167},", y, ",[33,67095,4059],{"class":163},[33,67097,274],{"class":54},[33,67099,1115],{"class":50},[33,67101,44114],{"class":167},[33,67103,1121],{"class":50},[33,67105,2079],{"class":54},[33,67107,1115],{"class":50},[33,67109,67110],{"class":167},"value",[33,67112,1121],{"class":50},[33,67114,274],{"class":54},[33,67116,221],{"class":167},[33,67118,67119,67122,67125],{"class":35,"line":374},[33,67120,67121],{"class":167},"            y ",[33,67123,67124],{"class":163},"-=",[33,67126,37298],{"class":50},[33,67128,67129],{"class":35,"line":397},[33,67130,67131],{"class":167},"        c.save()\n",[33,67133,67134],{"class":35,"line":653},[33,67135,92],{"emptyLinePlaceholder":91},[33,67137,67138,67140,67142],{"class":35,"line":667},[33,67139,62484],{"class":167},[33,67141,242],{"class":163},[33,67143,67144],{"class":167}," PdfReader(tmp)\n",[33,67146,67147,67150,67152],{"class":35,"line":675},[33,67148,67149],{"class":167},"        writer ",[33,67151,242],{"class":163},[33,67153,67154],{"class":167}," 
PdfWriter()\n",[33,67156,67157],{"class":35,"line":689},[33,67158,67159],{"class":167},"        writer.append(reader)\n",[33,67161,67162,67164,67167,67170,67172,67174],{"class":35,"line":703},[33,67163,2191],{"class":163},[33,67165,67166],{"class":167}," out_path.open(",[33,67168,67169],{"class":54},"\"wb\"",[33,67171,1649],{"class":167},[33,67173,495],{"class":163},[33,67175,67176],{"class":167}," fh:\n",[33,67178,67179],{"class":35,"line":714},[33,67180,67181],{"class":167},"            writer.write(fh)\n",[33,67183,67184,67187,67190,67192,67194],{"class":35,"line":723},[33,67185,67186],{"class":167},"        log.info(",[33,67188,67189],{"class":54},"\"Report written to ",[33,67191,309],{"class":50},[33,67193,274],{"class":54},[33,67195,67196],{"class":167},", out_path)\n",[33,67198,67199,67201],{"class":35,"line":754},[33,67200,3018],{"class":163},[33,67202,574],{"class":167},[33,67204,67205,67208,67211,67213,67215,67218],{"class":35,"line":771},[33,67206,67207],{"class":167},"        tmp.unlink(",[33,67209,67210],{"class":238},"missing_ok",[33,67212,242],{"class":163},[33,67214,855],{"class":50},[33,67216,67217],{"class":167},")              ",[33,67219,67220],{"class":39},"# clean up the temp file either way\n",[14,67222,67223,67224,3035],{},"If your generated PDFs need non-Latin text, ReportLab's default fonts will silently drop the glyphs — register a Unicode TTF, as covered in ",[940,67225,28608],{"href":28607},[18,67227,28616],{"id":28615},[14,67229,67230],{},"A pipeline that runs once on your laptop is a demo. Production means it runs unattended on a schedule, survives the one corrupt file in a batch of a thousand, retries transient failures, and tells you what happened. Three pieces make that real: scheduling, retries, and the logging you already wired up at setup.",[14,67232,67233],{},"Schedule with cron for a single host, or GitHub Actions when you want the run logged, versioned, and email-alerted for free. 
A nightly Actions workflow:",[23,67235,67237],{"className":2062,"code":67236,"language":2064,"meta":28,"style":28},"# .github\u002Fworkflows\u002Fpdf-pipeline.yml\nname: pdf-pipeline\non:\n  schedule:\n    - cron: \"0 3 * * *\"        # 03:00 UTC daily\n  workflow_dispatch:            # allow manual runs\njobs:\n  run:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions\u002Fcheckout@v4\n      - uses: actions\u002Fsetup-python@v5\n        with:\n          python-version: \"3.12\"\n      - run: sudo apt-get update && sudo apt-get install -y tesseract-ocr ghostscript\n      - run: pip install -r requirements.txt\n      - run: python run_pipeline.py --in .\u002Finbox --out .\u002Fout\n      - uses: actions\u002Fupload-artifact@v4\n        with:\n          name: pipeline-logs\n          path: logs\u002F\n",[30,67238,67239,67244,67253,67259,67265,67279,67289,67295,67301,67309,67315,67325,67335,67341,67349,67361,67371,67382,67392,67398,67407],{"__ignoreMap":28},[33,67240,67241],{"class":35,"line":36},[33,67242,67243],{"class":39},"# .github\u002Fworkflows\u002Fpdf-pipeline.yml\n",[33,67245,67246,67248,67250],{"class":35,"line":43},[33,67247,1118],{"class":2076},[33,67249,2079],{"class":167},[33,67251,67252],{"class":54},"pdf-pipeline\n",[33,67254,67255,67257],{"class":35,"line":61},[33,67256,2091],{"class":50},[33,67258,574],{"class":167},[33,67260,67261,67263],{"class":35,"line":73},[33,67262,2098],{"class":2076},[33,67264,574],{"class":167},[33,67266,67267,67269,67271,67273,67276],{"class":35,"line":88},[33,67268,2105],{"class":167},[33,67270,2108],{"class":2076},[33,67272,2079],{"class":167},[33,67274,67275],{"class":54},"\"0 3 * * *\"",[33,67277,67278],{"class":39},"        # 03:00 UTC daily\n",[33,67280,67281,67283,67286],{"class":35,"line":95},[33,67282,2121],{"class":2076},[33,67284,67285],{"class":167},":            ",[33,67287,67288],{"class":39},"# allow manual 
runs\n",[33,67290,67291,67293],{"class":35,"line":101},[33,67292,2136],{"class":2076},[33,67294,574],{"class":167},[33,67296,67297,67299],{"class":35,"line":171},[33,67298,42507],{"class":2076},[33,67300,574],{"class":167},[33,67302,67303,67305,67307],{"class":35,"line":179},[33,67304,2150],{"class":2076},[33,67306,2079],{"class":167},[33,67308,2155],{"class":54},[33,67310,67311,67313],{"class":35,"line":187},[33,67312,2160],{"class":2076},[33,67314,574],{"class":167},[33,67316,67317,67319,67321,67323],{"class":35,"line":201},[33,67318,2167],{"class":167},[33,67320,2170],{"class":2076},[33,67322,2079],{"class":167},[33,67324,2175],{"class":54},[33,67326,67327,67329,67331,67333],{"class":35,"line":206},[33,67328,2167],{"class":167},[33,67330,2170],{"class":2076},[33,67332,2079],{"class":167},[33,67334,2186],{"class":54},[33,67336,67337,67339],{"class":35,"line":224},[33,67338,2191],{"class":2076},[33,67340,574],{"class":167},[33,67342,67343,67345,67347],{"class":35,"line":229},[33,67344,2198],{"class":2076},[33,67346,2079],{"class":167},[33,67348,2203],{"class":54},[33,67350,67351,67353,67356,67358],{"class":35,"line":235},[33,67352,2167],{"class":167},[33,67354,67355],{"class":2076},"run",[33,67357,2079],{"class":167},[33,67359,67360],{"class":54},"sudo apt-get update && sudo apt-get install -y tesseract-ocr ghostscript\n",[33,67362,67363,67365,67367,67369],{"class":35,"line":250},[33,67364,2167],{"class":167},[33,67366,67355],{"class":2076},[33,67368,2079],{"class":167},[33,67370,2224],{"class":54},[33,67372,67373,67375,67377,67379],{"class":35,"line":266},[33,67374,2167],{"class":167},[33,67376,67355],{"class":2076},[33,67378,2079],{"class":167},[33,67380,67381],{"class":54},"python run_pipeline.py --in .\u002Finbox --out 
.\u002Fout\n",[33,67383,67384,67386,67388,67390],{"class":35,"line":290},[33,67385,2167],{"class":167},[33,67387,2170],{"class":2076},[33,67389,2079],{"class":167},[33,67391,2292],{"class":54},[33,67393,67394,67396],{"class":35,"line":295},[33,67395,2191],{"class":2076},[33,67397,574],{"class":167},[33,67399,67400,67402,67404],{"class":35,"line":300},[33,67401,2303],{"class":2076},[33,67403,2079],{"class":167},[33,67405,67406],{"class":54},"pipeline-logs\n",[33,67408,67409,67411,67413],{"class":35,"line":317},[33,67410,2313],{"class":2076},[33,67412,2079],{"class":167},[33,67414,67415],{"class":54},"logs\u002F\n",[14,67417,67418,67419,3035],{},"Wrap the per-file work so one bad PDF can't kill the batch, and retry I\u002FO-bound steps with backoff. The principle: isolate each file's failure, log it, and keep going — then report the failure count at the end so a partial run is visible, not silent. The same scheduling-and-logging discipline generalizes across document types in ",[940,67420,5],{"href":26465},[23,67422,67424],{"className":126,"code":67423,"language":47,"meta":28,"style":28},"# pip install (stdlib only)\nimport logging\nimport time\nfrom pathlib import Path\nfrom typing import Callable\n\nlog = logging.getLogger(\"pdf_pipeline\")\n\n\ndef with_retry(fn: Callable, *args, attempts: int = 3, base_delay: float = 1.0):\n    \"\"\"Retry a callable with exponential backoff; re-raise after the last try.\"\"\"\n    for attempt in range(1, attempts + 1):\n        try:\n            return fn(*args)\n        except Exception as exc:\n            if attempt == attempts:\n                log.error(\"Giving up after %d attempts: %s\", attempts, exc)\n                raise\n            delay = base_delay * (2 ** (attempt - 1))\n            log.warning(\"Attempt %d failed (%s); retrying in %.1fs\", attempt, exc, delay)\n            time.sleep(delay)\n\n\ndef run_batch(pdf_dir: Path, process: Callable[[Path], None]) -> None:\n    \"\"\"Process every PDF, isolating 
per-file failures so one bad file\n    can't abort the batch. Reports the failure count at the end.\"\"\"\n    failures = 0\n    pdfs = sorted(pdf_dir.glob(\"*.pdf\"))\n    for pdf in pdfs:\n        try:\n            with_retry(process, pdf)\n        except Exception:\n            failures += 1                         # already logged inside with_retry\n    log.info(\"Batch done: %d processed, %d failed\", len(pdfs) - failures, failures)\n    if failures:\n        log.warning(\"%d file(s) need manual review\", failures)\n",[30,67425,67426,67430,67436,67442,67452,67462,67466,67478,67482,67486,67514,67518,67540,67546,67556,67566,67576,67596,67600,67624,67644,67648,67652,67656,67673,67678,67683,67691,67706,67718,67724,67729,67737,67748,67774,67780],{"__ignoreMap":28},[33,67427,67428],{"class":35,"line":36},[33,67429,26734],{"class":39},[33,67431,67432,67434],{"class":35,"line":43},[33,67433,164],{"class":163},[33,67435,184],{"class":167},[33,67437,67438,67440],{"class":35,"line":61},[33,67439,164],{"class":163},[33,67441,1689],{"class":167},[33,67443,67444,67446,67448,67450],{"class":35,"line":73},[33,67445,190],{"class":163},[33,67447,193],{"class":167},[33,67449,164],{"class":163},[33,67451,198],{"class":167},[33,67453,67454,67456,67458,67460],{"class":35,"line":88},[33,67455,190],{"class":163},[33,67457,28681],{"class":167},[33,67459,164],{"class":163},[33,67461,28686],{"class":167},[33,67463,67464],{"class":35,"line":95},[33,67465,92],{"emptyLinePlaceholder":91},[33,67467,67468,67470,67472,67474,67476],{"class":35,"line":101},[33,67469,28695],{"class":167},[33,67471,242],{"class":163},[33,67473,544],{"class":167},[33,67475,65381],{"class":54},[33,67477,221],{"class":167},[33,67479,67480],{"class":35,"line":171},[33,67481,92],{"emptyLinePlaceholder":91},[33,67483,67484],{"class":35,"line":179},[33,67485,92],{"emptyLinePlaceholder":91},[33,67487,67488,67490,67492,67494,67496,67498,67500,67502,67504,67506,67508,67510,67512],{"class":35,"line":187},[33,67489,562],{"cla
ss":163},[33,67491,28718],{"class":46},[33,67493,28721],{"class":167},[33,67495,1769],{"class":163},[33,67497,28726],{"class":167},[33,67499,1059],{"class":50},[33,67501,212],{"class":163},[33,67503,1714],{"class":50},[33,67505,28735],{"class":167},[33,67507,1720],{"class":50},[33,67509,212],{"class":163},[33,67511,28742],{"class":50},[33,67513,1737],{"class":167},[33,67515,67516],{"class":35,"line":201},[33,67517,28749],{"class":54},[33,67519,67520,67522,67524,67526,67528,67530,67532,67534,67536,67538],{"class":35,"line":206},[33,67521,656],{"class":163},[33,67523,1796],{"class":167},[33,67525,662],{"class":163},[33,67527,1801],{"class":50},[33,67529,602],{"class":167},[33,67531,734],{"class":50},[33,67533,1808],{"class":167},[33,67535,1811],{"class":163},[33,67537,1814],{"class":50},[33,67539,1737],{"class":167},[33,67541,67542,67544],{"class":35,"line":224},[33,67543,670],{"class":163},[33,67545,574],{"class":167},[33,67547,67548,67550,67552,67554],{"class":35,"line":229},[33,67549,28782],{"class":163},[33,67551,1832],{"class":167},[33,67553,1769],{"class":163},[33,67555,28789],{"class":167},[33,67557,67558,67560,67562,67564],{"class":35,"line":235},[33,67559,780],{"class":163},[33,67561,783],{"class":50},[33,67563,1852],{"class":163},[33,67565,1855],{"class":167},[33,67567,67568,67570,67572,67574],{"class":35,"line":250},[33,67569,5995],{"class":163},[33,67571,1796],{"class":167},[33,67573,1865],{"class":163},[33,67575,1868],{"class":167},[33,67577,67578,67581,67584,67586,67589,67591,67593],{"class":35,"line":266},[33,67579,67580],{"class":167},"                log.error(",[33,67582,67583],{"class":54},"\"Giving up after ",[33,67585,916],{"class":50},[33,67587,67588],{"class":54}," attempts: ",[33,67590,309],{"class":50},[33,67592,274],{"class":54},[33,67594,67595],{"class":167},", attempts, 
exc)\n",[33,67597,67598],{"class":35,"line":290},[33,67599,28814],{"class":163},[33,67601,67602,67604,67606,67608,67610,67612,67614,67616,67618,67620,67622],{"class":35,"line":295},[33,67603,28819],{"class":167},[33,67605,242],{"class":163},[33,67607,28824],{"class":167},[33,67609,1769],{"class":163},[33,67611,17583],{"class":167},[33,67613,1533],{"class":50},[33,67615,28833],{"class":163},[33,67617,28836],{"class":167},[33,67619,4126],{"class":163},[33,67621,1814],{"class":50},[33,67623,371],{"class":167},[33,67625,67626,67628,67630,67632,67634,67636,67638,67640,67642],{"class":35,"line":300},[33,67627,28847],{"class":167},[33,67629,28850],{"class":54},[33,67631,916],{"class":50},[33,67633,28855],{"class":54},[33,67635,309],{"class":50},[33,67637,28860],{"class":54},[33,67639,1907],{"class":50},[33,67641,1910],{"class":54},[33,67643,28867],{"class":167},[33,67645,67646],{"class":35,"line":317},[33,67647,28872],{"class":167},[33,67649,67650],{"class":35,"line":332},[33,67651,92],{"emptyLinePlaceholder":91},[33,67653,67654],{"class":35,"line":347},[33,67655,92],{"emptyLinePlaceholder":91},[33,67657,67658,67660,67662,67665,67667,67669,67671],{"class":35,"line":374},[33,67659,562],{"class":163},[33,67661,28887],{"class":46},[33,67663,67664],{"class":167},"(pdf_dir: Path, process: Callable[[Path], ",[33,67666,571],{"class":50},[33,67668,28895],{"class":167},[33,67670,571],{"class":50},[33,67672,574],{"class":167},[33,67674,67675],{"class":35,"line":397},[33,67676,67677],{"class":54},"    \"\"\"Process every PDF, isolating per-file failures so one bad file\n",[33,67679,67680],{"class":35,"line":653},[33,67681,67682],{"class":54},"    can't abort the batch. 
Reports the failure count at the end.\"\"\"\n",[33,67684,67685,67687,67689],{"class":35,"line":667},[33,67686,28909],{"class":167},[33,67688,242],{"class":163},[33,67690,28914],{"class":50},[33,67692,67693,67696,67698,67700,67702,67704],{"class":35,"line":675},[33,67694,67695],{"class":167},"    pdfs ",[33,67697,242],{"class":163},[33,67699,28924],{"class":50},[33,67701,14074],{"class":167},[33,67703,610],{"class":54},[33,67705,371],{"class":167},[33,67707,67708,67710,67713,67715],{"class":35,"line":689},[33,67709,656],{"class":163},[33,67711,67712],{"class":167}," pdf ",[33,67714,662],{"class":163},[33,67716,67717],{"class":167}," pdfs:\n",[33,67719,67720,67722],{"class":35,"line":703},[33,67721,670],{"class":163},[33,67723,574],{"class":167},[33,67725,67726],{"class":35,"line":714},[33,67727,67728],{"class":167},"            with_retry(process, pdf)\n",[33,67730,67731,67733,67735],{"class":35,"line":723},[33,67732,780],{"class":163},[33,67734,783],{"class":50},[33,67736,574],{"class":167},[33,67738,67739,67741,67743,67745],{"class":35,"line":754},[33,67740,28973],{"class":167},[33,67742,28976],{"class":163},[33,67744,1814],{"class":50},[33,67746,67747],{"class":39},"                         # already logged inside with_retry\n",[33,67749,67750,67752,67754,67756,67759,67761,67763,67765,67767,67770,67772],{"class":35,"line":771},[33,67751,29002],{"class":167},[33,67753,29005],{"class":54},[33,67755,916],{"class":50},[33,67757,67758],{"class":54}," processed, ",[33,67760,916],{"class":50},[33,67762,29015],{"class":54},[33,67764,365],{"class":167},[33,67766,928],{"class":50},[33,67768,67769],{"class":167},"(pdfs) 
",[33,67771,4126],{"class":163},[33,67773,29027],{"class":167},[33,67775,67776,67778],{"class":35,"line":777},[33,67777,617],{"class":163},[33,67779,29034],{"class":167},[33,67781,67782,67784,67786,67788,67790],{"class":35,"line":788},[33,67783,29039],{"class":167},[33,67785,274],{"class":54},[33,67787,916],{"class":50},[33,67789,29047],{"class":54},[33,67791,67792],{"class":167},", failures)\n",[14,67794,67795,67796,67799],{},"Make processing ",[1974,67797,67798],{},"idempotent"," — key outputs on a stable identifier so a re-run overwrites rather than duplicates — so a failed nightly job can simply be re-run without double-counting.",[18,67801,29071],{"id":29070},[4273,67803,67804,67814],{},[4276,67805,67806],{},[4279,67807,67808,67810,67812],{},[4282,67809,29080],{},[4282,67811,4287],{},[4282,67813,4290],{},[4292,67815,67816,67827,67838,67853,67873,67884],{},[4279,67817,67818,67821,67824],{},[4297,67819,67820],{},"Extraction returns empty strings",[4297,67822,67823],{},"Scanned\u002Fimage-only page with no text layer",[4297,67825,67826],{},"Classify first; route scans to the Tesseract OCR path",[4279,67828,67829,67832,67835],{},[4297,67830,67831],{},"Columns bleed together or rows misalign",[4297,67833,67834],{},"pdfplumber's coordinate detection misreads the layout",[4297,67836,67837],{},"Tune table settings; see the alignment-fix guide",[4279,67839,67840,67843,67846],{},[4297,67841,67842],{},"Silent extraction failure on some files",[4297,67844,67845],{},"PDF is encrypted; the reader fails before reading",[4297,67847,67848,67849,67852],{},"Check ",[30,67850,67851],{},"PdfReader.is_encrypted"," and decrypt first",[4279,67854,67855,67858,67865],{},[4297,67856,67857],{},"Totals are wrong but no error raised",[4297,67859,29166,67860,67862,67863],{},[30,67861,27820],{}," left untyped or coerced to ",[30,67864,748],{},[4297,67866,67867,67868,67870,67871],{},"Strip separators, use ",[30,67869,29177],{},", quarantine 
",[30,67872,8884],{},[4279,67874,67875,67878,67881],{},[4297,67876,67877],{},"OOM on large batches",[4297,67879,67880],{},"Whole documents held in memory across the loop",[4297,67882,67883],{},"Process page-by-page, release references, stream output",[4279,67885,67886,67889,67892],{},[4297,67887,67888],{},"Duplicated rows after re-running",[4297,67890,67891],{},"Non-idempotent consolidation, no dedup",[4297,67893,67894,67895,67897],{},"Key on a stable id; ",[30,67896,29100],{}," and overwrite outputs",[18,67899,29184],{"id":29183},[14,67901,67902,67905,67906,3035],{},[1974,67903,67904],{},"Which Python library should I start with for extraction?","\npdfplumber, for any born-digital PDF — it reads text and tables with coordinates intact and has the gentlest learning curve. Reach for camelot only when tables have visible ruling lines, and add Tesseract only when pages are scanned images. The trade-offs are compared head-to-head in ",[940,67907,9606],{"href":9605},[14,67909,67910,67913,67914,67916,67917,67920],{},[1974,67911,67912],{},"How do I tell a scanned PDF from a digital one in code?","\nOpen the first page and call ",[30,67915,43170],{},". If it returns roughly nothing, the page has no text layer and is a scan — route it to OCR. The ",[30,67918,67919],{},"classify_page_has_text"," snippet above does exactly this branch.",[14,67922,67923,67926],{},[1974,67924,67925],{},"Can I run this whole pipeline without a server?","\nYes. cron on any always-on machine, or GitHub Actions for a serverless schedule with built-in logging and artifact storage — both shown in the production hardening section. No message broker or container orchestration is required for batches up to tens of thousands of files.",[14,67928,67929,29204,67932,67934,67935,67937],{},[1974,67930,67931],{},"How do I keep one corrupt file from killing a nightly batch?",[30,67933,29157],{},", log the failure, increment a counter, and continue — then report the failure count at the end. 
The ",[30,67936,29210],{}," helper above implements that pattern.",[14,67939,67940,67943,67944,67946,67947,67949,67950,67952,67953,3035],{},[1974,67941,67942],{},"Where does this connect to my Excel or Word workflows?","\nThe transformation and consolidation stages are pure pandas, so the output drops straight into the spreadsheet workflows in ",[940,67945,26258],{"href":26257},". To template the ",[26245,67948,26290],{}," documents as Word files instead of PDFs, see ",[940,67951,26263],{"href":26262},", and for stitching PDF extraction into a larger ETL flow, ",[940,67954,6951],{"href":6950},[18,67956,6918],{"id":6917},[4211,67958,67959,67964,67969,67974,67979,67984],{},[4214,67960,67961,67963],{},[940,67962,9592],{"href":942}," — coordinate-aware table parsing, header detection, and cell-merging logic.",[4214,67965,67966,67968],{},[940,67967,26191],{"href":19001}," — template-driven generation with ReportLab and variable-length pagination.",[4214,67970,67971,67973],{},[940,67972,52682],{"href":52681}," — memory-efficient page assembly, reordering, and splitting with pypdf.",[4214,67975,67976,67978],{},[940,67977,36756],{"href":26957}," — the Tesseract path for image-only and scanned pages.",[4214,67980,67981,67983],{},[940,67982,65967],{"href":65966}," — encryption, password handling, and batch watermarking.",[4214,67985,67986,67988],{},[940,67987,9606],{"href":9605}," — choosing the right table extractor for a given 
layout.",[14,67990,6947,67991,3035],{},[940,67992,29264],{"href":1351},[6953,67994,6955],{},{"title":28,"searchDepth":43,"depth":43,"links":67996},[67997,67998,67999,68000,68001,68002,68003,68004,68005,68006],{"id":26468,"depth":43,"text":26469},{"id":26618,"depth":43,"text":26619},{"id":26940,"depth":43,"text":26941},{"id":27434,"depth":43,"text":27435},{"id":27834,"depth":43,"text":27835},{"id":28122,"depth":43,"text":28123},{"id":28615,"depth":43,"text":28616},{"id":29070,"depth":43,"text":29071},{"id":29183,"depth":43,"text":29184},{"id":6917,"depth":43,"text":6918},"PDF Automation","End-to-end Python architecture for extracting tables and text from PDFs, transforming the data, consolidating multi-file inputs, and generating reports at scale.",{},"\u002Fautomating-pdf-extraction-generation",{"title":6943,"description":68008},"Python PDF Extraction & Generation Guide","automating-pdf-extraction-generation\u002Findex",[9631,47,68015,68016,943],"extraction","report generation","Zx3koQy-t8pZGpgiw5TcZTywpGCc7G8otaoWhPEEiUo",{"id":68019,"title":68020,"body":68021,"breadcrumbTitle":71101,"canonical":6977,"date":46387,"description":71102,"draft":6980,"extension":6981,"image":6977,"meta":71103,"navigation":91,"path":71104,"robots":6977,"seo":71105,"seoTitle":71106,"stem":71107,"tags":71108,"updatedAt":6978,"__hash__":71110},"content\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Fbatch-merge-pdfs-with-python-script\u002Findex.md","Batch Merge PDFs with a Python Script",{"type":7,"value":68022,"toc":71090},[68023,68026,68057,68066,68068,68071,68113,68117,68120,68384,68399,68403,69206,69209,69261,69265,69271,70016,70020,70027,70593,70595,70598,70862,70865,70873,70875,70981,70983,71000,71021,71037,71055,71057,71083,71087],[10,68024,68020],{"id":68025},"batch-merge-pdfs-with-a-python-script",[14,68027,68028,68029,68032,68033,68036,68037,68040,68041,68044,68045,68048,68049,10065,68053,68056],{},"When a batch merge across a large 
directory halts, the failure is almost always one of three things: ",[30,68030,68031],{},"PdfReadError"," (corrupted or truncated header), ",[30,68034,68035],{},"PermissionError"," (unclosed file handle on Windows), or silent misordering because Python's default ",[30,68038,68039],{},"sorted()"," puts ",[30,68042,68043],{},"Report_10.pdf"," before ",[30,68046,68047],{},"Report_2.pdf",". This guide provides a production-ready merge script using ",[940,68050,65045],{"href":68051,"rel":68052},"https:\u002F\u002Fpypdf.readthedocs.io\u002F",[1367],[30,68054,68055],{},"pathlib"," that handles all three.",[14,68058,68059,68060,68063,68064,3035],{},"For the foundational merge and split primitives — ",[30,68061,68062],{},"PdfWriter.append()",", outline preservation, and page-size normalization — see ",[940,68065,52682],{"href":52681},[18,68067,7021],{"id":7020},[14,68069,68070],{},"Three failure modes cover the vast majority of batch merge errors:",[35387,68072,68073,68087,68100],{},[4214,68074,68075,68078,68079,68082,68083,68086],{},[1974,68076,68077],{},"Malformed PDF headers."," Missing ",[30,68080,68081],{},"%PDF-"," signatures or truncated cross-reference tables raise ",[30,68084,68085],{},"pypdf.errors.PdfReadError",". One corrupt file in 200 kills the whole run if not caught.",[4214,68088,68089,68092,68093,68096,68097,3035],{},[1974,68090,68091],{},"Encrypted files."," Password-protected documents raise ",[30,68094,68095],{},"FileNotDecryptedError"," the moment you access any page attribute without first calling ",[30,68098,68099],{},"reader.decrypt(password)",[4214,68101,68102,68105,68106,68109,68110,3035],{},[1974,68103,68104],{},"Unclosed file handles."," On Windows, ",[30,68107,68108],{},"PdfReader"," objects that remain open after the function exits hold OS locks. 
A second run on the same directory raises ",[30,68111,68112],{},"PermissionError: [Errno 13] Permission denied",[18,68114,68116],{"id":68115},"diagnostic-snippet","Diagnostic Snippet",[14,68118,68119],{},"Before writing a merge script, confirm which files are problematic:",[23,68121,68123],{"className":126,"code":68122,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader\nfrom pypdf.errors import PdfReadError\nfrom pathlib import Path\n\ndef audit_pdfs(input_dir: Path) -> None:\n    \"\"\"Print status of every PDF in a directory: ok \u002F encrypted \u002F corrupt.\"\"\"\n    for pdf in sorted(input_dir.glob(\"*.pdf\")):\n        try:\n            with open(pdf, \"rb\") as fh:\n                reader = PdfReader(fh)\n                status = \"encrypted\" if reader.is_encrypted else f\"ok ({len(reader.pages)} pages)\"\n        except PdfReadError as exc:\n            status = f\"corrupt: {exc}\"\n        except Exception as exc:\n            status = f\"error: {exc}\"\n        print(f\"{pdf.name:40s}  {status}\")\n\nif __name__ == \"__main__\":\n    audit_pdfs(Path(\".\u002Finput_pdfs\"))\n",[30,68124,68125,68129,68139,68151,68161,68165,68179,68184,68201,68207,68226,68236,68267,68278,68298,68308,68327,68358,68362,68374],{"__ignoreMap":28},[33,68126,68127],{"class":35,"line":36},[33,68128,57316],{"class":39},[33,68130,68131,68133,68135,68137],{"class":35,"line":43},[33,68132,190],{"class":163},[33,68134,57333],{"class":167},[33,68136,164],{"class":163},[33,68138,57338],{"class":167},[33,68140,68141,68143,68146,68148],{"class":35,"line":61},[33,68142,190],{"class":163},[33,68144,68145],{"class":167}," pypdf.errors ",[33,68147,164],{"class":163},[33,68149,68150],{"class":167}," 
PdfReadError\n",[33,68152,68153,68155,68157,68159],{"class":35,"line":73},[33,68154,190],{"class":163},[33,68156,193],{"class":167},[33,68158,164],{"class":163},[33,68160,198],{"class":167},[33,68162,68163],{"class":35,"line":88},[33,68164,92],{"emptyLinePlaceholder":91},[33,68166,68167,68169,68172,68175,68177],{"class":35,"line":95},[33,68168,562],{"class":163},[33,68170,68171],{"class":46}," audit_pdfs",[33,68173,68174],{"class":167},"(input_dir: Path) -> ",[33,68176,571],{"class":50},[33,68178,574],{"class":167},[33,68180,68181],{"class":35,"line":101},[33,68182,68183],{"class":54},"    \"\"\"Print status of every PDF in a directory: ok \u002F encrypted \u002F corrupt.\"\"\"\n",[33,68185,68186,68188,68190,68192,68194,68197,68199],{"class":35,"line":171},[33,68187,656],{"class":163},[33,68189,67712],{"class":167},[33,68191,662],{"class":163},[33,68193,28924],{"class":50},[33,68195,68196],{"class":167},"(input_dir.glob(",[33,68198,610],{"class":54},[33,68200,8687],{"class":167},[33,68202,68203,68205],{"class":35,"line":179},[33,68204,670],{"class":163},[33,68206,574],{"class":167},[33,68208,68209,68211,68214,68217,68220,68222,68224],{"class":35,"line":187},[33,68210,678],{"class":163},[33,68212,68213],{"class":50}," open",[33,68215,68216],{"class":167},"(pdf, ",[33,68218,68219],{"class":54},"\"rb\"",[33,68221,1649],{"class":167},[33,68223,495],{"class":163},[33,68225,67176],{"class":167},[33,68227,68228,68231,68233],{"class":35,"line":201},[33,68229,68230],{"class":167},"                reader ",[33,68232,242],{"class":163},[33,68234,68235],{"class":167}," PdfReader(fh)\n",[33,68237,68238,68241,68243,68246,68248,68251,68253,68255,68258,68260,68262,68264],{"class":35,"line":206},[33,68239,68240],{"class":167},"                status ",[33,68242,242],{"class":163},[33,68244,68245],{"class":54}," \"encrypted\"",[33,68247,9994],{"class":163},[33,68249,68250],{"class":167}," reader.is_encrypted 
",[33,68252,7489],{"class":163},[33,68254,1110],{"class":163},[33,68256,68257],{"class":54},"\"ok (",[33,68259,4065],{"class":50},[33,68261,59322],{"class":167},[33,68263,1121],{"class":50},[33,68265,68266],{"class":54}," pages)\"\n",[33,68268,68269,68271,68274,68276],{"class":35,"line":224},[33,68270,780],{"class":163},[33,68272,68273],{"class":167}," PdfReadError ",[33,68275,495],{"class":163},[33,68277,1855],{"class":167},[33,68279,68280,68283,68285,68287,68290,68292,68294,68296],{"class":35,"line":229},[33,68281,68282],{"class":167},"            status ",[33,68284,242],{"class":163},[33,68286,1110],{"class":163},[33,68288,68289],{"class":54},"\"corrupt: ",[33,68291,1115],{"class":50},[33,68293,6565],{"class":167},[33,68295,1121],{"class":50},[33,68297,7504],{"class":54},[33,68299,68300,68302,68304,68306],{"class":35,"line":235},[33,68301,780],{"class":163},[33,68303,783],{"class":50},[33,68305,1852],{"class":163},[33,68307,1855],{"class":167},[33,68309,68310,68312,68314,68316,68319,68321,68323,68325],{"class":35,"line":250},[33,68311,68282],{"class":167},[33,68313,242],{"class":163},[33,68315,1110],{"class":163},[33,68317,68318],{"class":54},"\"error: 
",[33,68320,1115],{"class":50},[33,68322,6565],{"class":167},[33,68324,1121],{"class":50},[33,68326,7504],{"class":54},[33,68328,68329,68331,68333,68335,68337,68339,68342,68345,68347,68349,68352,68354,68356],{"class":35,"line":266},[33,68330,9414],{"class":50},[33,68332,602],{"class":167},[33,68334,4059],{"class":163},[33,68336,274],{"class":54},[33,68338,1115],{"class":50},[33,68340,68341],{"class":167},"pdf.name",[33,68343,68344],{"class":163},":40s",[33,68346,1121],{"class":50},[33,68348,54867],{"class":50},[33,68350,68351],{"class":167},"status",[33,68353,1121],{"class":50},[33,68355,274],{"class":54},[33,68357,221],{"class":167},[33,68359,68360],{"class":35,"line":290},[33,68361,92],{"emptyLinePlaceholder":91},[33,68363,68364,68366,68368,68370,68372],{"class":35,"line":295},[33,68365,2491],{"class":163},[33,68367,2494],{"class":50},[33,68369,2497],{"class":163},[33,68371,2500],{"class":54},[33,68373,574],{"class":167},[33,68375,68376,68379,68382],{"class":35,"line":300},[33,68377,68378],{"class":167},"    audit_pdfs(Path(",[33,68380,68381],{"class":54},"\".\u002Finput_pdfs\"",[33,68383,371],{"class":167},[14,68385,68386,68387,68390,68391,68394,68395,68398],{},"Run this first. Files marked ",[30,68388,68389],{},"corrupt"," need manual repair with ",[30,68392,68393],{},"pikepdf"," or removal. 
Files marked ",[30,68396,68397],{},"encrypted"," require either a password or exclusion from the batch.",[18,68400,68402],{"id":68401},"fix-implementation-robust-merge-script","Fix Implementation: Robust Merge Script",[23,68404,68406],{"className":126,"code":68405,"language":47,"meta":28,"style":28},"# pip install pypdf\nimport re\nfrom pathlib import Path\n\nfrom pypdf import PdfWriter, PdfReader\nfrom pypdf.errors import PdfReadError, FileNotDecryptedError\n\n\ndef natural_sort_key(filepath: Path) -> list:\n    \"\"\"\n    Split filename into text\u002Finteger tokens so that\n    Report_2.pdf sorts before Report_10.pdf.\n    \"\"\"\n    return [\n        int(c) if c.isdigit() else c.lower()\n        for c in re.split(r\"(\\d+)\", filepath.name)\n    ]\n\n\ndef batch_merge_pdfs(\n    input_dir: Path,\n    output_path: Path,\n    password: str = \"\",\n) -> int:\n    \"\"\"\n    Merge all readable PDFs in input_dir (natural sort) into output_path.\n    Returns the number of successfully merged files.\n    Skips corrupt, encrypted-without-password, and locked files with a log line.\n    \"\"\"\n    if not input_dir.is_dir():\n        raise FileNotFoundError(f\"Input directory not found: {input_dir}\")\n\n    pdf_files = sorted(input_dir.glob(\"*.pdf\"), key=natural_sort_key)\n    writer = PdfWriter()\n    merged_count = 0\n\n    for pdf in pdf_files:\n        try:\n            with open(pdf, \"rb\") as fh:           # 'with' guarantees handle closure\n                reader = PdfReader(fh)\n                if reader.is_encrypted:\n                    if not password:\n                        print(f\"[SKIP] Encrypted (no password supplied): {pdf.name}\")\n                        continue\n                    result = reader.decrypt(password)\n                    if result == 0:\n                        print(f\"[SKIP] Wrong password for: {pdf.name}\")\n                        continue\n                writer.append(reader, import_outline=True)  # preserves 
bookmarks\n                merged_count += 1\n        except PdfReadError as exc:\n            print(f\"[SKIP] Corrupt PDF: {pdf.name} — {exc}\")\n        except FileNotDecryptedError as exc:\n            print(f\"[SKIP] Decrypt failed: {pdf.name} — {exc}\")\n        except PermissionError as exc:\n            print(f\"[SKIP] File locked: {pdf.name} — {exc}\")\n        except Exception as exc:\n            print(f\"[SKIP] Unexpected error on {pdf.name}: {exc}\")\n\n    if merged_count == 0:\n        print(\"[WARN] No valid PDFs found; output not written.\")\n        writer.close()\n        return 0\n\n    output_path.parent.mkdir(parents=True, exist_ok=True)\n    with open(output_path, \"wb\") as out:\n        writer.write(out)\n    writer.close()   # release internal buffer references\n    print(f\"[OK] Merged {merged_count}\u002F{len(pdf_files)} files → {output_path}\")\n    return merged_count\n\n\nif __name__ == \"__main__\":\n    batch_merge_pdfs(\n        Path(\".\u002Finput_pdfs\"),\n        Path(\".\u002Foutput\u002Fmerged.pdf\"),\n    
)\n",[30,68407,68408,68412,68418,68428,68432,68443,68454,68458,68462,68476,68480,68485,68490,68494,68500,68518,68545,68549,68553,68557,68566,68571,68576,68589,68597,68601,68606,68611,68616,68620,68629,68652,68656,68677,68686,68695,68699,68709,68715,68735,68743,68750,68759,68781,68785,68795,68808,68829,68833,68850,68859,68869,68898,68909,68938,68948,68977,68987,69016,69020,69033,69044,69049,69055,69059,69080,69098,69103,69111,69152,69159,69163,69167,69179,69184,69193,69202],{"__ignoreMap":28},[33,68409,68410],{"class":35,"line":36},[33,68411,57316],{"class":39},[33,68413,68414,68416],{"class":35,"line":43},[33,68415,164],{"class":163},[33,68417,11917],{"class":167},[33,68419,68420,68422,68424,68426],{"class":35,"line":61},[33,68421,190],{"class":163},[33,68423,193],{"class":167},[33,68425,164],{"class":163},[33,68427,198],{"class":167},[33,68429,68430],{"class":35,"line":73},[33,68431,92],{"emptyLinePlaceholder":91},[33,68433,68434,68436,68438,68440],{"class":35,"line":88},[33,68435,190],{"class":163},[33,68437,57333],{"class":167},[33,68439,164],{"class":163},[33,68441,68442],{"class":167}," PdfWriter, PdfReader\n",[33,68444,68445,68447,68449,68451],{"class":35,"line":95},[33,68446,190],{"class":163},[33,68448,68145],{"class":167},[33,68450,164],{"class":163},[33,68452,68453],{"class":167}," PdfReadError, FileNotDecryptedError\n",[33,68455,68456],{"class":35,"line":101},[33,68457,92],{"emptyLinePlaceholder":91},[33,68459,68460],{"class":35,"line":171},[33,68461,92],{"emptyLinePlaceholder":91},[33,68463,68464,68466,68469,68472,68474],{"class":35,"line":179},[33,68465,562],{"class":163},[33,68467,68468],{"class":46}," natural_sort_key",[33,68470,68471],{"class":167},"(filepath: Path) -> ",[33,68473,25066],{"class":50},[33,68475,574],{"class":167},[33,68477,68478],{"class":35,"line":187},[33,68479,7673],{"class":54},[33,68481,68482],{"class":35,"line":201},[33,68483,68484],{"class":54},"    Split filename into text\u002Finteger tokens so 
that\n",[33,68486,68487],{"class":35,"line":206},[33,68488,68489],{"class":54},"    Report_2.pdf sorts before Report_10.pdf.\n",[33,68491,68492],{"class":35,"line":224},[33,68493,7673],{"class":54},[33,68495,68496,68498],{"class":35,"line":229},[33,68497,1332],{"class":163},[33,68499,7473],{"class":167},[33,68501,68502,68505,68508,68510,68513,68515],{"class":35,"line":235},[33,68503,68504],{"class":50},"        int",[33,68506,68507],{"class":167},"(c) ",[33,68509,2491],{"class":163},[33,68511,68512],{"class":167}," c.isdigit() ",[33,68514,7489],{"class":163},[33,68516,68517],{"class":167}," c.lower()\n",[33,68519,68520,68522,68524,68526,68529,68531,68533,68536,68538,68540,68542],{"class":35,"line":250},[33,68521,5973],{"class":163},[33,68523,7486],{"class":167},[33,68525,662],{"class":163},[33,68527,68528],{"class":167}," re.split(",[33,68530,11977],{"class":163},[33,68532,274],{"class":54},[33,68534,68535],{"class":50},"(\\d",[33,68537,1811],{"class":163},[33,68539,12027],{"class":50},[33,68541,274],{"class":54},[33,68543,68544],{"class":167},", filepath.name)\n",[33,68546,68547],{"class":35,"line":266},[33,68548,19559],{"class":167},[33,68550,68551],{"class":35,"line":290},[33,68552,92],{"emptyLinePlaceholder":91},[33,68554,68555],{"class":35,"line":295},[33,68556,92],{"emptyLinePlaceholder":91},[33,68558,68559,68561,68564],{"class":35,"line":300},[33,68560,562],{"class":163},[33,68562,68563],{"class":46}," batch_merge_pdfs",[33,68565,7637],{"class":167},[33,68567,68568],{"class":35,"line":317},[33,68569,68570],{"class":167},"    input_dir: Path,\n",[33,68572,68573],{"class":35,"line":332},[33,68574,68575],{"class":167},"    output_path: Path,\n",[33,68577,68578,68581,68583,68585,68587],{"class":35,"line":347},[33,68579,68580],{"class":167},"    password: 
",[33,68582,1053],{"class":50},[33,68584,212],{"class":163},[33,68586,9892],{"class":54},[33,68588,247],{"class":167},[33,68590,68591,68593,68595],{"class":35,"line":374},[33,68592,1617],{"class":167},[33,68594,1059],{"class":50},[33,68596,574],{"class":167},[33,68598,68599],{"class":35,"line":397},[33,68600,7673],{"class":54},[33,68602,68603],{"class":35,"line":653},[33,68604,68605],{"class":54},"    Merge all readable PDFs in input_dir (natural sort) into output_path.\n",[33,68607,68608],{"class":35,"line":667},[33,68609,68610],{"class":54},"    Returns the number of successfully merged files.\n",[33,68612,68613],{"class":35,"line":675},[33,68614,68615],{"class":54},"    Skips corrupt, encrypted-without-password, and locked files with a log line.\n",[33,68617,68618],{"class":35,"line":689},[33,68619,7673],{"class":54},[33,68621,68622,68624,68626],{"class":35,"line":703},[33,68623,617],{"class":163},[33,68625,620],{"class":163},[33,68627,68628],{"class":167}," input_dir.is_dir():\n",[33,68630,68631,68633,68635,68637,68639,68642,68644,68646,68648,68650],{"class":35,"line":714},[33,68632,4051],{"class":163},[33,68634,2945],{"class":50},[33,68636,602],{"class":167},[33,68638,4059],{"class":163},[33,68640,68641],{"class":54},"\"Input directory not found: ",[33,68643,1115],{"class":50},[33,68645,6814],{"class":167},[33,68647,1121],{"class":50},[33,68649,274],{"class":54},[33,68651,221],{"class":167},[33,68653,68654],{"class":35,"line":723},[33,68655,92],{"emptyLinePlaceholder":91},[33,68657,68658,68660,68662,68664,68666,68668,68670,68672,68674],{"class":35,"line":754},[33,68659,594],{"class":167},[33,68661,242],{"class":163},[33,68663,28924],{"class":50},[33,68665,68196],{"class":167},[33,68667,610],{"class":54},[33,68669,18525],{"class":167},[33,68671,44114],{"class":238},[33,68673,242],{"class":163},[33,68675,68676],{"class":167},"natural_sort_key)\n",[33,68678,68679,68682,68684],{"class":35,"line":771},[33,68680,68681],{"class":167},"    writer 
",[33,68683,242],{"class":163},[33,68685,67154],{"class":167},[33,68687,68688,68691,68693],{"class":35,"line":777},[33,68689,68690],{"class":167},"    merged_count ",[33,68692,242],{"class":163},[33,68694,28914],{"class":50},[33,68696,68697],{"class":35,"line":788},[33,68698,92],{"emptyLinePlaceholder":91},[33,68700,68701,68703,68705,68707],{"class":35,"line":804},[33,68702,656],{"class":163},[33,68704,67712],{"class":167},[33,68706,662],{"class":163},[33,68708,623],{"class":167},[33,68710,68711,68713],{"class":35,"line":809},[33,68712,670],{"class":163},[33,68714,574],{"class":167},[33,68716,68717,68719,68721,68723,68725,68727,68729,68732],{"class":35,"line":819},[33,68718,678],{"class":163},[33,68720,68213],{"class":50},[33,68722,68216],{"class":167},[33,68724,68219],{"class":54},[33,68726,1649],{"class":167},[33,68728,495],{"class":163},[33,68730,68731],{"class":167}," fh:           ",[33,68733,68734],{"class":39},"# 'with' guarantees handle closure\n",[33,68736,68737,68739,68741],{"class":35,"line":829},[33,68738,68230],{"class":167},[33,68740,242],{"class":163},[33,68742,68235],{"class":167},[33,68744,68745,68747],{"class":35,"line":834},[33,68746,7170],{"class":163},[33,68748,68749],{"class":167}," reader.is_encrypted:\n",[33,68751,68752,68754,68756],{"class":35,"line":839},[33,68753,717],{"class":163},[33,68755,620],{"class":163},[33,68757,68758],{"class":167}," password:\n",[33,68760,68761,68764,68766,68768,68771,68773,68775,68777,68779],{"class":35,"line":860},[33,68762,68763],{"class":50},"                        print",[33,68765,602],{"class":167},[33,68767,4059],{"class":163},[33,68769,68770],{"class":54},"\"[SKIP] Encrypted (no password supplied): ",[33,68772,1115],{"class":50},[33,68774,68341],{"class":167},[33,68776,1121],{"class":50},[33,68778,274],{"class":54},[33,68780,221],{"class":167},[33,68782,68783],{"class":35,"line":887},[33,68784,7458],{"class":163},[33,68786,68787,68790,68792],{"class":35,"line":907},[33,68788,68789],{"class":167},"       
             result ",[33,68791,242],{"class":163},[33,68793,68794],{"class":167}," reader.decrypt(password)\n",[33,68796,68797,68799,68802,68804,68806],{"class":35,"line":1826},[33,68798,717],{"class":163},[33,68800,68801],{"class":167}," result ",[33,68803,1865],{"class":163},[33,68805,10791],{"class":50},[33,68807,574],{"class":167},[33,68809,68810,68812,68814,68816,68819,68821,68823,68825,68827],{"class":35,"line":1844},[33,68811,68763],{"class":50},[33,68813,602],{"class":167},[33,68815,4059],{"class":163},[33,68817,68818],{"class":54},"\"[SKIP] Wrong password for: ",[33,68820,1115],{"class":50},[33,68822,68341],{"class":167},[33,68824,1121],{"class":50},[33,68826,274],{"class":54},[33,68828,221],{"class":167},[33,68830,68831],{"class":35,"line":1858},[33,68832,7458],{"class":163},[33,68834,68835,68838,68841,68843,68845,68847],{"class":35,"line":1871},[33,68836,68837],{"class":167},"                writer.append(reader, ",[33,68839,68840],{"class":238},"import_outline",[33,68842,242],{"class":163},[33,68844,855],{"class":50},[33,68846,10922],{"class":167},[33,68848,68849],{"class":39},"# preserves bookmarks\n",[33,68851,68852,68855,68857],{"class":35,"line":1877},[33,68853,68854],{"class":167},"                merged_count ",[33,68856,28976],{"class":163},[33,68858,17709],{"class":50},[33,68860,68861,68863,68865,68867],{"class":35,"line":1883},[33,68862,780],{"class":163},[33,68864,68273],{"class":167},[33,68866,495],{"class":163},[33,68868,1855],{"class":167},[33,68870,68871,68873,68875,68877,68880,68882,68884,68886,68888,68890,68892,68894,68896],{"class":35,"line":1915},[33,68872,9364],{"class":50},[33,68874,602],{"class":167},[33,68876,4059],{"class":163},[33,68878,68879],{"class":54},"\"[SKIP] Corrupt PDF: 
",[33,68881,1115],{"class":50},[33,68883,68341],{"class":167},[33,68885,1121],{"class":50},[33,68887,6242],{"class":54},[33,68889,1115],{"class":50},[33,68891,6565],{"class":167},[33,68893,1121],{"class":50},[33,68895,274],{"class":54},[33,68897,221],{"class":167},[33,68899,68900,68902,68905,68907],{"class":35,"line":1926},[33,68901,780],{"class":163},[33,68903,68904],{"class":167}," FileNotDecryptedError ",[33,68906,495],{"class":163},[33,68908,1855],{"class":167},[33,68910,68911,68913,68915,68917,68920,68922,68924,68926,68928,68930,68932,68934,68936],{"class":35,"line":1932},[33,68912,9364],{"class":50},[33,68914,602],{"class":167},[33,68916,4059],{"class":163},[33,68918,68919],{"class":54},"\"[SKIP] Decrypt failed: ",[33,68921,1115],{"class":50},[33,68923,68341],{"class":167},[33,68925,1121],{"class":50},[33,68927,6242],{"class":54},[33,68929,1115],{"class":50},[33,68931,6565],{"class":167},[33,68933,1121],{"class":50},[33,68935,274],{"class":54},[33,68937,221],{"class":167},[33,68939,68940,68942,68944,68946],{"class":35,"line":1938},[33,68941,780],{"class":163},[33,68943,17393],{"class":50},[33,68945,1852],{"class":163},[33,68947,1855],{"class":167},[33,68949,68950,68952,68954,68956,68959,68961,68963,68965,68967,68969,68971,68973,68975],{"class":35,"line":1950},[33,68951,9364],{"class":50},[33,68953,602],{"class":167},[33,68955,4059],{"class":163},[33,68957,68958],{"class":54},"\"[SKIP] File locked: 
",[33,68960,1115],{"class":50},[33,68962,68341],{"class":167},[33,68964,1121],{"class":50},[33,68966,6242],{"class":54},[33,68968,1115],{"class":50},[33,68970,6565],{"class":167},[33,68972,1121],{"class":50},[33,68974,274],{"class":54},[33,68976,221],{"class":167},[33,68978,68979,68981,68983,68985],{"class":35,"line":1958},[33,68980,780],{"class":163},[33,68982,783],{"class":50},[33,68984,1852],{"class":163},[33,68986,1855],{"class":167},[33,68988,68989,68991,68993,68995,68998,69000,69002,69004,69006,69008,69010,69012,69014],{"class":35,"line":4904},[33,68990,9364],{"class":50},[33,68992,602],{"class":167},[33,68994,4059],{"class":163},[33,68996,68997],{"class":54},"\"[SKIP] Unexpected error on ",[33,68999,1115],{"class":50},[33,69001,68341],{"class":167},[33,69003,1121],{"class":50},[33,69005,2079],{"class":54},[33,69007,1115],{"class":50},[33,69009,6565],{"class":167},[33,69011,1121],{"class":50},[33,69013,274],{"class":54},[33,69015,221],{"class":167},[33,69017,69018],{"class":35,"line":4909},[33,69019,92],{"emptyLinePlaceholder":91},[33,69021,69022,69024,69027,69029,69031],{"class":35,"line":4915},[33,69023,617],{"class":163},[33,69025,69026],{"class":167}," merged_count ",[33,69028,1865],{"class":163},[33,69030,10791],{"class":50},[33,69032,574],{"class":167},[33,69034,69035,69037,69039,69042],{"class":35,"line":4925},[33,69036,9414],{"class":50},[33,69038,602],{"class":167},[33,69040,69041],{"class":54},"\"[WARN] No valid PDFs found; output not written.\"",[33,69043,221],{"class":167},[33,69045,69046],{"class":35,"line":4935},[33,69047,69048],{"class":167},"        writer.close()\n",[33,69050,69051,69053],{"class":35,"line":4941},[33,69052,1659],{"class":163},[33,69054,28914],{"class":50},[33,69056,69057],{"class":35,"line":4950},[33,69058,92],{"emptyLinePlaceholder":91},[33,69060,69061,69064,69066,69068,69070,69072,69074,69076,69078],{"class":35,"line":4960},[33,69062,69063],{"class":167},"    
output_path.parent.mkdir(",[33,69065,869],{"class":238},[33,69067,242],{"class":163},[33,69069,855],{"class":50},[33,69071,365],{"class":167},[33,69073,878],{"class":238},[33,69075,242],{"class":163},[33,69077,855],{"class":50},[33,69079,221],{"class":167},[33,69081,69082,69084,69086,69089,69091,69093,69095],{"class":35,"line":4965},[33,69083,1635],{"class":163},[33,69085,68213],{"class":50},[33,69087,69088],{"class":167},"(output_path, ",[33,69090,67169],{"class":54},[33,69092,1649],{"class":167},[33,69094,495],{"class":163},[33,69096,69097],{"class":167}," out:\n",[33,69099,69100],{"class":35,"line":4971},[33,69101,69102],{"class":167},"        writer.write(out)\n",[33,69104,69105,69108],{"class":35,"line":4983},[33,69106,69107],{"class":167},"    writer.close()   ",[33,69109,69110],{"class":39},"# release internal buffer references\n",[33,69112,69113,69115,69117,69119,69122,69124,69127,69129,69131,69133,69136,69138,69141,69143,69146,69148,69150],{"class":35,"line":4988},[33,69114,7268],{"class":50},[33,69116,602],{"class":167},[33,69118,4059],{"class":163},[33,69120,69121],{"class":54},"\"[OK] Merged ",[33,69123,1115],{"class":50},[33,69125,69126],{"class":167},"merged_count",[33,69128,1121],{"class":50},[33,69130,1351],{"class":54},[33,69132,4065],{"class":50},[33,69134,69135],{"class":167},"(pdf_files)",[33,69137,1121],{"class":50},[33,69139,69140],{"class":54}," files → ",[33,69142,1115],{"class":50},[33,69144,69145],{"class":167},"output_path",[33,69147,1121],{"class":50},[33,69149,274],{"class":54},[33,69151,221],{"class":167},[33,69153,69154,69156],{"class":35,"line":4993},[33,69155,1332],{"class":163},[33,69157,69158],{"class":167}," 
merged_count\n",[33,69160,69161],{"class":35,"line":5003},[33,69162,92],{"emptyLinePlaceholder":91},[33,69164,69165],{"class":35,"line":5008},[33,69166,92],{"emptyLinePlaceholder":91},[33,69168,69169,69171,69173,69175,69177],{"class":35,"line":5014},[33,69170,2491],{"class":163},[33,69172,2494],{"class":50},[33,69174,2497],{"class":163},[33,69176,2500],{"class":54},[33,69178,574],{"class":167},[33,69180,69181],{"class":35,"line":5019},[33,69182,69183],{"class":167},"    batch_merge_pdfs(\n",[33,69185,69186,69189,69191],{"class":35,"line":5032},[33,69187,69188],{"class":167},"        Path(",[33,69190,68381],{"class":54},[33,69192,1506],{"class":167},[33,69194,69195,69197,69200],{"class":35,"line":5039},[33,69196,69188],{"class":167},[33,69198,69199],{"class":54},"\".\u002Foutput\u002Fmerged.pdf\"",[33,69201,1506],{"class":167},[33,69203,69204],{"class":35,"line":5068},[33,69205,1202],{"class":167},[14,69207,69208],{},"Key decisions in this script:",[4211,69210,69211,69229,69245,69253],{},[4214,69212,69213,46332,69216,69219,69220,69222,69223,2008,69226,3035],{},[1974,69214,69215],{},"Natural sort.",[30,69217,69218],{},"natural_sort_key"," splits filenames on digit boundaries so ",[30,69221,68043],{}," follows ",[30,69224,69225],{},"Report_9.pdf",[30,69227,69228],{},"Report_1.pdf",[4214,69230,69231,69237,69238,69241,69242,69244],{},[1974,69232,69233,69236],{},[30,69234,69235],{},"with open()"," per file."," Each reader is closed after ",[30,69239,69240],{},"append()"," executes. Never accumulate open ",[30,69243,68108],{}," objects across iterations.",[4214,69246,69247,69252],{},[1974,69248,69249,3035],{},[30,69250,69251],{},"import_outline=True"," Passes each source document's bookmarks into the writer. Without this, all navigation structure is lost.",[4214,69254,69255,69260],{},[1974,69256,69257,3035],{},[30,69258,69259],{},"writer.close()"," Flushes pypdf's internal page-reference cache. 
Omitting it leaks memory on long-running processes.",[18,69262,69264],{"id":69263},"variant-merge-with-argparse-cli","Variant: Merge with argparse CLI",[14,69266,69267,69268,69270],{},"The function above is importable. Wrap it with ",[30,69269,40372],{}," for command-line use:",[23,69272,69274],{"className":126,"code":69273,"language":47,"meta":28,"style":28},"# pip install pypdf\n#!\u002Fusr\u002Fbin\u002Fenv python3\n\"\"\"\nmerge_folder.py — merge all PDFs in a directory.\nUsage: python merge_folder.py --input .\u002Fdocs --output merged.pdf [--password SECRET]\n\"\"\"\nimport argparse\nimport re\nfrom pathlib import Path\n\nfrom pypdf import PdfWriter, PdfReader\nfrom pypdf.errors import PdfReadError, FileNotDecryptedError\n\n\ndef natural_sort_key(p: Path) -> list:\n    return [int(c) if c.isdigit() else c.lower() for c in re.split(r\"(\\d+)\", p.name)]\n\n\ndef run(args: argparse.Namespace) -> None:\n    input_dir = Path(args.input)\n    output_path = Path(args.output)\n    password = args.password or \"\"\n\n    if not input_dir.is_dir():\n        raise SystemExit(f\"Not a directory: {input_dir}\")\n\n    pdf_files = sorted(input_dir.glob(\"*.pdf\"), key=natural_sort_key)\n    print(f\"Found {len(pdf_files)} PDFs in {input_dir}\")\n\n    writer = PdfWriter()\n    merged = 0\n    for pdf in pdf_files:\n        try:\n            with open(pdf, \"rb\") as fh:\n                reader = PdfReader(fh)\n                if reader.is_encrypted:\n                    if not password or reader.decrypt(password) == 0:\n                        print(f\"[SKIP] {pdf.name} — encrypted\")\n                        continue\n                writer.append(reader, import_outline=True)\n                merged += 1\n                print(f\"  + {pdf.name}\")\n        except (PdfReadError, FileNotDecryptedError, PermissionError) as exc:\n            print(f\"[SKIP] {pdf.name} — {exc}\")\n\n    if merged == 0:\n        raise SystemExit(\"No files merged; output not written.\")\n\n    
output_path.parent.mkdir(parents=True, exist_ok=True)\n    with open(output_path, \"wb\") as out:\n        writer.write(out)\n    writer.close()\n    print(f\"\\nMerged {merged}\u002F{len(pdf_files)} → {output_path}\")\n\n\ndef main() -> None:\n    ap = argparse.ArgumentParser(description=\"Merge all PDFs in a folder\")\n    ap.add_argument(\"--input\", required=True, help=\"Directory of source PDFs\")\n    ap.add_argument(\"--output\", required=True, help=\"Output PDF path\")\n    ap.add_argument(\"--password\", default=\"\", help=\"Shared password for encrypted files\")\n    run(ap.parse_args())\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,69275,69276,69280,69284,69288,69293,69298,69302,69308,69314,69324,69328,69338,69348,69352,69356,69369,69411,69415,69419,69433,69443,69453,69467,69471,69479,69502,69506,69526,69555,69559,69567,69576,69586,69592,69608,69616,69622,69642,69664,69668,69680,69689,69710,69725,69753,69757,69769,69782,69786,69806,69822,69826,69831,69874,69878,69882,69894,69911,69936,69961,69987,69992,69996,70000,70012],{"__ignoreMap":28},[33,69277,69278],{"class":35,"line":36},[33,69279,57316],{"class":39},[33,69281,69282],{"class":35,"line":43},[33,69283,14447],{"class":39},[33,69285,69286],{"class":35,"line":61},[33,69287,139],{"class":54},[33,69289,69290],{"class":35,"line":73},[33,69291,69292],{"class":54},"merge_folder.py — merge all PDFs in a directory.\n",[33,69294,69295],{"class":35,"line":88},[33,69296,69297],{"class":54},"Usage: python merge_folder.py --input .\u002Fdocs --output merged.pdf [--password 
SECRET]\n",[33,69299,69300],{"class":35,"line":95},[33,69301,139],{"class":54},[33,69303,69304,69306],{"class":35,"line":101},[33,69305,164],{"class":163},[33,69307,4461],{"class":167},[33,69309,69310,69312],{"class":35,"line":171},[33,69311,164],{"class":163},[33,69313,11917],{"class":167},[33,69315,69316,69318,69320,69322],{"class":35,"line":179},[33,69317,190],{"class":163},[33,69319,193],{"class":167},[33,69321,164],{"class":163},[33,69323,198],{"class":167},[33,69325,69326],{"class":35,"line":187},[33,69327,92],{"emptyLinePlaceholder":91},[33,69329,69330,69332,69334,69336],{"class":35,"line":201},[33,69331,190],{"class":163},[33,69333,57333],{"class":167},[33,69335,164],{"class":163},[33,69337,68442],{"class":167},[33,69339,69340,69342,69344,69346],{"class":35,"line":206},[33,69341,190],{"class":163},[33,69343,68145],{"class":167},[33,69345,164],{"class":163},[33,69347,68453],{"class":167},[33,69349,69350],{"class":35,"line":224},[33,69351,92],{"emptyLinePlaceholder":91},[33,69353,69354],{"class":35,"line":229},[33,69355,92],{"emptyLinePlaceholder":91},[33,69357,69358,69360,69362,69365,69367],{"class":35,"line":235},[33,69359,562],{"class":163},[33,69361,68468],{"class":46},[33,69363,69364],{"class":167},"(p: Path) -> ",[33,69366,25066],{"class":50},[33,69368,574],{"class":167},[33,69370,69371,69373,69375,69377,69379,69381,69383,69385,69388,69390,69392,69394,69396,69398,69400,69402,69404,69406,69408],{"class":35,"line":250},[33,69372,1332],{"class":163},[33,69374,9178],{"class":167},[33,69376,1059],{"class":50},[33,69378,68507],{"class":167},[33,69380,2491],{"class":163},[33,69382,68512],{"class":167},[33,69384,7489],{"class":163},[33,69386,69387],{"class":167}," c.lower() 
",[33,69389,6124],{"class":163},[33,69391,7486],{"class":167},[33,69393,662],{"class":163},[33,69395,68528],{"class":167},[33,69397,11977],{"class":163},[33,69399,274],{"class":54},[33,69401,68535],{"class":50},[33,69403,1811],{"class":163},[33,69405,12027],{"class":50},[33,69407,274],{"class":54},[33,69409,69410],{"class":167},", p.name)]\n",[33,69412,69413],{"class":35,"line":266},[33,69414,92],{"emptyLinePlaceholder":91},[33,69416,69417],{"class":35,"line":290},[33,69418,92],{"emptyLinePlaceholder":91},[33,69420,69421,69423,69426,69429,69431],{"class":35,"line":295},[33,69422,562],{"class":163},[33,69424,69425],{"class":46}," run",[33,69427,69428],{"class":167},"(args: argparse.Namespace) -> ",[33,69430,571],{"class":50},[33,69432,574],{"class":167},[33,69434,69435,69438,69440],{"class":35,"line":300},[33,69436,69437],{"class":167},"    input_dir ",[33,69439,242],{"class":163},[33,69441,69442],{"class":167}," Path(args.input)\n",[33,69444,69445,69448,69450],{"class":35,"line":317},[33,69446,69447],{"class":167},"    output_path ",[33,69449,242],{"class":163},[33,69451,69452],{"class":167}," Path(args.output)\n",[33,69454,69455,69458,69460,69463,69465],{"class":35,"line":332},[33,69456,69457],{"class":167},"    password ",[33,69459,242],{"class":163},[33,69461,69462],{"class":167}," args.password ",[33,69464,7162],{"class":163},[33,69466,13126],{"class":54},[33,69468,69469],{"class":35,"line":347},[33,69470,92],{"emptyLinePlaceholder":91},[33,69472,69473,69475,69477],{"class":35,"line":374},[33,69474,617],{"class":163},[33,69476,620],{"class":163},[33,69478,68628],{"class":167},[33,69480,69481,69483,69485,69487,69489,69492,69494,69496,69498,69500],{"class":35,"line":397},[33,69482,4051],{"class":163},[33,69484,16617],{"class":50},[33,69486,602],{"class":167},[33,69488,4059],{"class":163},[33,69490,69491],{"class":54},"\"Not a directory: 
",[33,69493,1115],{"class":50},[33,69495,6814],{"class":167},[33,69497,1121],{"class":50},[33,69499,274],{"class":54},[33,69501,221],{"class":167},[33,69503,69504],{"class":35,"line":653},[33,69505,92],{"emptyLinePlaceholder":91},[33,69507,69508,69510,69512,69514,69516,69518,69520,69522,69524],{"class":35,"line":667},[33,69509,594],{"class":167},[33,69511,242],{"class":163},[33,69513,28924],{"class":50},[33,69515,68196],{"class":167},[33,69517,610],{"class":54},[33,69519,18525],{"class":167},[33,69521,44114],{"class":238},[33,69523,242],{"class":163},[33,69525,68676],{"class":167},[33,69527,69528,69530,69532,69534,69536,69538,69540,69542,69545,69547,69549,69551,69553],{"class":35,"line":675},[33,69529,7268],{"class":50},[33,69531,602],{"class":167},[33,69533,4059],{"class":163},[33,69535,51247],{"class":54},[33,69537,4065],{"class":50},[33,69539,69135],{"class":167},[33,69541,1121],{"class":50},[33,69543,69544],{"class":54}," PDFs in ",[33,69546,1115],{"class":50},[33,69548,6814],{"class":167},[33,69550,1121],{"class":50},[33,69552,274],{"class":54},[33,69554,221],{"class":167},[33,69556,69557],{"class":35,"line":689},[33,69558,92],{"emptyLinePlaceholder":91},[33,69560,69561,69563,69565],{"class":35,"line":703},[33,69562,68681],{"class":167},[33,69564,242],{"class":163},[33,69566,67154],{"class":167},[33,69568,69569,69572,69574],{"class":35,"line":714},[33,69570,69571],{"class":167},"    merged 
",[33,69573,242],{"class":163},[33,69575,28914],{"class":50},[33,69577,69578,69580,69582,69584],{"class":35,"line":723},[33,69579,656],{"class":163},[33,69581,67712],{"class":167},[33,69583,662],{"class":163},[33,69585,623],{"class":167},[33,69587,69588,69590],{"class":35,"line":754},[33,69589,670],{"class":163},[33,69591,574],{"class":167},[33,69593,69594,69596,69598,69600,69602,69604,69606],{"class":35,"line":771},[33,69595,678],{"class":163},[33,69597,68213],{"class":50},[33,69599,68216],{"class":167},[33,69601,68219],{"class":54},[33,69603,1649],{"class":167},[33,69605,495],{"class":163},[33,69607,67176],{"class":167},[33,69609,69610,69612,69614],{"class":35,"line":777},[33,69611,68230],{"class":167},[33,69613,242],{"class":163},[33,69615,68235],{"class":167},[33,69617,69618,69620],{"class":35,"line":788},[33,69619,7170],{"class":163},[33,69621,68749],{"class":167},[33,69623,69624,69626,69628,69631,69633,69636,69638,69640],{"class":35,"line":804},[33,69625,717],{"class":163},[33,69627,620],{"class":163},[33,69629,69630],{"class":167}," password ",[33,69632,7162],{"class":163},[33,69634,69635],{"class":167}," reader.decrypt(password) ",[33,69637,1865],{"class":163},[33,69639,10791],{"class":50},[33,69641,574],{"class":167},[33,69643,69644,69646,69648,69650,69653,69655,69657,69659,69662],{"class":35,"line":809},[33,69645,68763],{"class":50},[33,69647,602],{"class":167},[33,69649,4059],{"class":163},[33,69651,69652],{"class":54},"\"[SKIP] ",[33,69654,1115],{"class":50},[33,69656,68341],{"class":167},[33,69658,1121],{"class":50},[33,69660,69661],{"class":54}," — 
encrypted\"",[33,69663,221],{"class":167},[33,69665,69666],{"class":35,"line":819},[33,69667,7458],{"class":163},[33,69669,69670,69672,69674,69676,69678],{"class":35,"line":829},[33,69671,68837],{"class":167},[33,69673,68840],{"class":238},[33,69675,242],{"class":163},[33,69677,855],{"class":50},[33,69679,221],{"class":167},[33,69681,69682,69685,69687],{"class":35,"line":834},[33,69683,69684],{"class":167},"                merged ",[33,69686,28976],{"class":163},[33,69688,17709],{"class":50},[33,69690,69691,69693,69695,69697,69700,69702,69704,69706,69708],{"class":35,"line":839},[33,69692,8264],{"class":50},[33,69694,602],{"class":167},[33,69696,4059],{"class":163},[33,69698,69699],{"class":54},"\"  + ",[33,69701,1115],{"class":50},[33,69703,68341],{"class":167},[33,69705,1121],{"class":50},[33,69707,274],{"class":54},[33,69709,221],{"class":167},[33,69711,69712,69714,69717,69719,69721,69723],{"class":35,"line":860},[33,69713,780],{"class":163},[33,69715,69716],{"class":167}," (PdfReadError, FileNotDecryptedError, 
",[33,69718,68035],{"class":50},[33,69720,1649],{"class":167},[33,69722,495],{"class":163},[33,69724,1855],{"class":167},[33,69726,69727,69729,69731,69733,69735,69737,69739,69741,69743,69745,69747,69749,69751],{"class":35,"line":887},[33,69728,9364],{"class":50},[33,69730,602],{"class":167},[33,69732,4059],{"class":163},[33,69734,69652],{"class":54},[33,69736,1115],{"class":50},[33,69738,68341],{"class":167},[33,69740,1121],{"class":50},[33,69742,6242],{"class":54},[33,69744,1115],{"class":50},[33,69746,6565],{"class":167},[33,69748,1121],{"class":50},[33,69750,274],{"class":54},[33,69752,221],{"class":167},[33,69754,69755],{"class":35,"line":907},[33,69756,92],{"emptyLinePlaceholder":91},[33,69758,69759,69761,69763,69765,69767],{"class":35,"line":1826},[33,69760,617],{"class":163},[33,69762,8778],{"class":167},[33,69764,1865],{"class":163},[33,69766,10791],{"class":50},[33,69768,574],{"class":167},[33,69770,69771,69773,69775,69777,69780],{"class":35,"line":1844},[33,69772,4051],{"class":163},[33,69774,16617],{"class":50},[33,69776,602],{"class":167},[33,69778,69779],{"class":54},"\"No files merged; output not 
written.\"",[33,69781,221],{"class":167},[33,69783,69784],{"class":35,"line":1858},[33,69785,92],{"emptyLinePlaceholder":91},[33,69787,69788,69790,69792,69794,69796,69798,69800,69802,69804],{"class":35,"line":1871},[33,69789,69063],{"class":167},[33,69791,869],{"class":238},[33,69793,242],{"class":163},[33,69795,855],{"class":50},[33,69797,365],{"class":167},[33,69799,878],{"class":238},[33,69801,242],{"class":163},[33,69803,855],{"class":50},[33,69805,221],{"class":167},[33,69807,69808,69810,69812,69814,69816,69818,69820],{"class":35,"line":1877},[33,69809,1635],{"class":163},[33,69811,68213],{"class":50},[33,69813,69088],{"class":167},[33,69815,67169],{"class":54},[33,69817,1649],{"class":167},[33,69819,495],{"class":163},[33,69821,69097],{"class":167},[33,69823,69824],{"class":35,"line":1883},[33,69825,69102],{"class":167},[33,69827,69828],{"class":35,"line":1915},[33,69829,69830],{"class":167},"    writer.close()\n",[33,69832,69833,69835,69837,69839,69841,69843,69846,69848,69851,69853,69855,69857,69859,69861,69864,69866,69868,69870,69872],{"class":35,"line":1926},[33,69834,7268],{"class":50},[33,69836,602],{"class":167},[33,69838,4059],{"class":163},[33,69840,274],{"class":54},[33,69842,25830],{"class":50},[33,69844,69845],{"class":54},"Merged ",[33,69847,1115],{"class":50},[33,69849,69850],{"class":167},"merged",[33,69852,1121],{"class":50},[33,69854,1351],{"class":54},[33,69856,4065],{"class":50},[33,69858,69135],{"class":167},[33,69860,1121],{"class":50},[33,69862,69863],{"class":54}," → 
",[33,69865,1115],{"class":50},[33,69867,69145],{"class":167},[33,69869,1121],{"class":50},[33,69871,274],{"class":54},[33,69873,221],{"class":167},[33,69875,69876],{"class":35,"line":1932},[33,69877,92],{"emptyLinePlaceholder":91},[33,69879,69880],{"class":35,"line":1938},[33,69881,92],{"emptyLinePlaceholder":91},[33,69883,69884,69886,69888,69890,69892],{"class":35,"line":1950},[33,69885,562],{"class":163},[33,69887,6636],{"class":46},[33,69889,568],{"class":167},[33,69891,571],{"class":50},[33,69893,574],{"class":167},[33,69895,69896,69898,69900,69902,69904,69906,69909],{"class":35,"line":1958},[33,69897,15498],{"class":167},[33,69899,242],{"class":163},[33,69901,6653],{"class":167},[33,69903,6656],{"class":238},[33,69905,242],{"class":163},[33,69907,69908],{"class":54},"\"Merge all PDFs in a folder\"",[33,69910,221],{"class":167},[33,69912,69913,69915,69917,69919,69921,69923,69925,69927,69929,69931,69934],{"class":35,"line":4904},[33,69914,15516],{"class":167},[33,69916,6672],{"class":54},[33,69918,365],{"class":167},[33,69920,25448],{"class":238},[33,69922,242],{"class":163},[33,69924,855],{"class":50},[33,69926,365],{"class":167},[33,69928,25463],{"class":238},[33,69930,242],{"class":163},[33,69932,69933],{"class":54},"\"Directory of source PDFs\"",[33,69935,221],{"class":167},[33,69937,69938,69940,69942,69944,69946,69948,69950,69952,69954,69956,69959],{"class":35,"line":4909},[33,69939,15516],{"class":167},[33,69941,6699],{"class":54},[33,69943,365],{"class":167},[33,69945,25448],{"class":238},[33,69947,242],{"class":163},[33,69949,855],{"class":50},[33,69951,365],{"class":167},[33,69953,25463],{"class":238},[33,69955,242],{"class":163},[33,69957,69958],{"class":54},"\"Output PDF 
path\"",[33,69960,221],{"class":167},[33,69962,69963,69965,69968,69970,69972,69974,69976,69978,69980,69982,69985],{"class":35,"line":4915},[33,69964,15516],{"class":167},[33,69966,69967],{"class":54},"\"--password\"",[33,69969,365],{"class":167},[33,69971,6685],{"class":238},[33,69973,242],{"class":163},[33,69975,3198],{"class":54},[33,69977,365],{"class":167},[33,69979,25463],{"class":238},[33,69981,242],{"class":163},[33,69983,69984],{"class":54},"\"Shared password for encrypted files\"",[33,69986,221],{"class":167},[33,69988,69989],{"class":35,"line":4925},[33,69990,69991],{"class":167},"    run(ap.parse_args())\n",[33,69993,69994],{"class":35,"line":4935},[33,69995,92],{"emptyLinePlaceholder":91},[33,69997,69998],{"class":35,"line":4941},[33,69999,92],{"emptyLinePlaceholder":91},[33,70001,70002,70004,70006,70008,70010],{"class":35,"line":4950},[33,70003,2491],{"class":163},[33,70005,2494],{"class":50},[33,70007,2497],{"class":163},[33,70009,2500],{"class":54},[33,70011,574],{"class":167},[33,70013,70014],{"class":35,"line":4960},[33,70015,6914],{"class":167},[18,70017,70019],{"id":70018},"variant-chunked-merge-for-500-files","Variant: Chunked Merge for 500+ Files",[14,70021,70022,70023,70026],{},"Holding page references for 500 PDFs in a single ",[30,70024,70025],{},"PdfWriter"," can exhaust memory if any source contains large embedded images. 
Merge in chunks of 100, write to temporary files, then merge the temporaries:",[23,70028,70030],{"className":126,"code":70029,"language":47,"meta":28,"style":28},"# pip install pypdf\nimport math\nimport shutil\nimport tempfile\nfrom pathlib import Path\n\nfrom pypdf import PdfWriter, PdfReader\nfrom pypdf.errors import PdfReadError\n\n\ndef chunked_merge(\n    files: list[Path],\n    output: Path,\n    chunk_size: int = 100,\n) -> None:\n    \"\"\"Merge a large file list in bounded-memory chunks.\"\"\"\n    tmp_dir = Path(tempfile.mkdtemp(prefix=\"pdf_merge_\"))\n    try:\n        chunk_paths: list[Path] = []\n        n_chunks = math.ceil(len(files) \u002F chunk_size)\n\n        for chunk_idx in range(n_chunks):\n            chunk = files[chunk_idx * chunk_size : (chunk_idx + 1) * chunk_size]\n            chunk_out = tmp_dir \u002F f\"chunk_{chunk_idx:04d}.pdf\"\n            writer = PdfWriter()\n            for f in chunk:\n                try:\n                    with open(f, \"rb\") as fh:\n                        writer.append(PdfReader(fh), import_outline=True)\n                except PdfReadError as exc:\n                    print(f\"[SKIP] {f.name}: {exc}\")\n            with open(chunk_out, \"wb\") as out:\n                writer.write(out)\n            writer.close()\n            chunk_paths.append(chunk_out)\n            print(f\"Chunk {chunk_idx + 1}\u002F{n_chunks} written ({len(chunk)} files)\")\n\n        # Final pass: merge chunk files\n        final = PdfWriter()\n        for cp in chunk_paths:\n            with open(cp, \"rb\") as fh:\n                final.append(PdfReader(fh), import_outline=True)\n        output.parent.mkdir(parents=True, exist_ok=True)\n        with open(output, \"wb\") as out:\n            final.write(out)\n        final.close()\n        print(f\"Final merge complete → {output}\")\n    finally:\n        shutil.rmtree(tmp_dir, 
ignore_errors=True)\n",[30,70031,70032,70036,70043,70049,70056,70066,70070,70080,70090,70094,70098,70107,70112,70117,70130,70138,70143,70163,70169,70178,70198,70202,70216,70242,70271,70280,70291,70297,70315,70328,70338,70367,70384,70389,70394,70399,70443,70447,70452,70461,70473,70490,70503,70524,70541,70546,70551,70573,70579],{"__ignoreMap":28},[33,70033,70034],{"class":35,"line":36},[33,70035,57316],{"class":39},[33,70037,70038,70040],{"class":35,"line":43},[33,70039,164],{"class":163},[33,70041,70042],{"class":167}," math\n",[33,70044,70045,70047],{"class":35,"line":61},[33,70046,164],{"class":163},[33,70048,41706],{"class":167},[33,70050,70051,70053],{"class":35,"line":73},[33,70052,164],{"class":163},[33,70054,70055],{"class":167}," tempfile\n",[33,70057,70058,70060,70062,70064],{"class":35,"line":88},[33,70059,190],{"class":163},[33,70061,193],{"class":167},[33,70063,164],{"class":163},[33,70065,198],{"class":167},[33,70067,70068],{"class":35,"line":95},[33,70069,92],{"emptyLinePlaceholder":91},[33,70071,70072,70074,70076,70078],{"class":35,"line":101},[33,70073,190],{"class":163},[33,70075,57333],{"class":167},[33,70077,164],{"class":163},[33,70079,68442],{"class":167},[33,70081,70082,70084,70086,70088],{"class":35,"line":171},[33,70083,190],{"class":163},[33,70085,68145],{"class":167},[33,70087,164],{"class":163},[33,70089,68150],{"class":167},[33,70091,70092],{"class":35,"line":179},[33,70093,92],{"emptyLinePlaceholder":91},[33,70095,70096],{"class":35,"line":187},[33,70097,92],{"emptyLinePlaceholder":91},[33,70099,70100,70102,70105],{"class":35,"line":201},[33,70101,562],{"class":163},[33,70103,70104],{"class":46}," chunked_merge",[33,70106,7637],{"class":167},[33,70108,70109],{"class":35,"line":206},[33,70110,70111],{"class":167},"    files: list[Path],\n",[33,70113,70114],{"class":35,"line":224},[33,70115,70116],{"class":167},"    output: Path,\n",[33,70118,70119,70122,70124,70126,70128],{"class":35,"line":229},[33,70120,70121],{"class":167},"    
chunk_size: ",[33,70123,1059],{"class":50},[33,70125,212],{"class":163},[33,70127,18366],{"class":50},[33,70129,247],{"class":167},[33,70131,70132,70134,70136],{"class":35,"line":235},[33,70133,1617],{"class":167},[33,70135,571],{"class":50},[33,70137,574],{"class":167},[33,70139,70140],{"class":35,"line":250},[33,70141,70142],{"class":54},"    \"\"\"Merge a large file list in bounded-memory chunks.\"\"\"\n",[33,70144,70145,70148,70150,70153,70156,70158,70161],{"class":35,"line":266},[33,70146,70147],{"class":167},"    tmp_dir ",[33,70149,242],{"class":163},[33,70151,70152],{"class":167}," Path(tempfile.mkdtemp(",[33,70154,70155],{"class":238},"prefix",[33,70157,242],{"class":163},[33,70159,70160],{"class":54},"\"pdf_merge_\"",[33,70162,371],{"class":167},[33,70164,70165,70167],{"class":35,"line":290},[33,70166,2424],{"class":163},[33,70168,574],{"class":167},[33,70170,70171,70174,70176],{"class":35,"line":295},[33,70172,70173],{"class":167},"        chunk_paths: list[Path] ",[33,70175,242],{"class":163},[33,70177,589],{"class":167},[33,70179,70180,70183,70185,70188,70190,70193,70195],{"class":35,"line":300},[33,70181,70182],{"class":167},"        n_chunks ",[33,70184,242],{"class":163},[33,70186,70187],{"class":167}," math.ceil(",[33,70189,928],{"class":50},[33,70191,70192],{"class":167},"(files) ",[33,70194,1351],{"class":163},[33,70196,70197],{"class":167}," chunk_size)\n",[33,70199,70200],{"class":35,"line":317},[33,70201,92],{"emptyLinePlaceholder":91},[33,70203,70204,70206,70209,70211,70213],{"class":35,"line":332},[33,70205,5973],{"class":163},[33,70207,70208],{"class":167}," chunk_idx ",[33,70210,662],{"class":163},[33,70212,1801],{"class":50},[33,70214,70215],{"class":167},"(n_chunks):\n",[33,70217,70218,70221,70223,70226,70228,70231,70233,70235,70237,70239],{"class":35,"line":347},[33,70219,70220],{"class":167},"            chunk ",[33,70222,242],{"class":163},[33,70224,70225],{"class":167}," files[chunk_idx 
",[33,70227,1769],{"class":163},[33,70229,70230],{"class":167}," chunk_size : (chunk_idx ",[33,70232,1811],{"class":163},[33,70234,1814],{"class":50},[33,70236,1649],{"class":167},[33,70238,1769],{"class":163},[33,70240,70241],{"class":167}," chunk_size]\n",[33,70243,70244,70247,70249,70252,70254,70256,70259,70261,70264,70267,70269],{"class":35,"line":374},[33,70245,70246],{"class":167},"            chunk_out ",[33,70248,242],{"class":163},[33,70250,70251],{"class":167}," tmp_dir ",[33,70253,1351],{"class":163},[33,70255,1110],{"class":163},[33,70257,70258],{"class":54},"\"chunk_",[33,70260,1115],{"class":50},[33,70262,70263],{"class":167},"chunk_idx",[33,70265,70266],{"class":163},":04d",[33,70268,1121],{"class":50},[33,70270,19246],{"class":54},[33,70272,70273,70276,70278],{"class":35,"line":397},[33,70274,70275],{"class":167},"            writer ",[33,70277,242],{"class":163},[33,70279,67154],{"class":167},[33,70281,70282,70284,70286,70288],{"class":35,"line":653},[33,70283,1793],{"class":163},[33,70285,8832],{"class":167},[33,70287,662],{"class":163},[33,70289,70290],{"class":167}," chunk:\n",[33,70292,70293,70295],{"class":35,"line":667},[33,70294,1821],{"class":163},[33,70296,574],{"class":167},[33,70298,70299,70302,70304,70307,70309,70311,70313],{"class":35,"line":675},[33,70300,70301],{"class":163},"                    with",[33,70303,68213],{"class":50},[33,70305,70306],{"class":167},"(f, ",[33,70308,68219],{"class":54},[33,70310,1649],{"class":167},[33,70312,495],{"class":163},[33,70314,67176],{"class":167},[33,70316,70317,70320,70322,70324,70326],{"class":35,"line":689},[33,70318,70319],{"class":167},"                        writer.append(PdfReader(fh), 
",[33,70321,68840],{"class":238},[33,70323,242],{"class":163},[33,70325,855],{"class":50},[33,70327,221],{"class":167},[33,70329,70330,70332,70334,70336],{"class":35,"line":703},[33,70331,1847],{"class":163},[33,70333,68273],{"class":167},[33,70335,495],{"class":163},[33,70337,1855],{"class":167},[33,70339,70340,70342,70344,70346,70348,70350,70353,70355,70357,70359,70361,70363,70365],{"class":35,"line":714},[33,70341,41012],{"class":50},[33,70343,602],{"class":167},[33,70345,4059],{"class":163},[33,70347,69652],{"class":54},[33,70349,1115],{"class":50},[33,70351,70352],{"class":167},"f.name",[33,70354,1121],{"class":50},[33,70356,2079],{"class":54},[33,70358,1115],{"class":50},[33,70360,6565],{"class":167},[33,70362,1121],{"class":50},[33,70364,274],{"class":54},[33,70366,221],{"class":167},[33,70368,70369,70371,70373,70376,70378,70380,70382],{"class":35,"line":723},[33,70370,678],{"class":163},[33,70372,68213],{"class":50},[33,70374,70375],{"class":167},"(chunk_out, ",[33,70377,67169],{"class":54},[33,70379,1649],{"class":167},[33,70381,495],{"class":163},[33,70383,69097],{"class":167},[33,70385,70386],{"class":35,"line":754},[33,70387,70388],{"class":167},"                writer.write(out)\n",[33,70390,70391],{"class":35,"line":771},[33,70392,70393],{"class":167},"            writer.close()\n",[33,70395,70396],{"class":35,"line":777},[33,70397,70398],{"class":167},"            chunk_paths.append(chunk_out)\n",[33,70400,70401,70403,70405,70407,70410,70412,70415,70417,70419,70421,70423,70426,70428,70431,70433,70436,70438,70441],{"class":35,"line":788},[33,70402,9364],{"class":50},[33,70404,602],{"class":167},[33,70406,4059],{"class":163},[33,70408,70409],{"class":54},"\"Chunk ",[33,70411,1115],{"class":50},[33,70413,70414],{"class":167},"chunk_idx 
",[33,70416,1811],{"class":163},[33,70418,11022],{"class":50},[33,70420,1351],{"class":54},[33,70422,1115],{"class":50},[33,70424,70425],{"class":167},"n_chunks",[33,70427,1121],{"class":50},[33,70429,70430],{"class":54}," written (",[33,70432,4065],{"class":50},[33,70434,70435],{"class":167},"(chunk)",[33,70437,1121],{"class":50},[33,70439,70440],{"class":54}," files)\"",[33,70442,221],{"class":167},[33,70444,70445],{"class":35,"line":804},[33,70446,92],{"emptyLinePlaceholder":91},[33,70448,70449],{"class":35,"line":809},[33,70450,70451],{"class":39},"        # Final pass: merge chunk files\n",[33,70453,70454,70457,70459],{"class":35,"line":819},[33,70455,70456],{"class":167},"        final ",[33,70458,242],{"class":163},[33,70460,67154],{"class":167},[33,70462,70463,70465,70468,70470],{"class":35,"line":829},[33,70464,5973],{"class":163},[33,70466,70467],{"class":167}," cp ",[33,70469,662],{"class":163},[33,70471,70472],{"class":167}," chunk_paths:\n",[33,70474,70475,70477,70479,70482,70484,70486,70488],{"class":35,"line":834},[33,70476,678],{"class":163},[33,70478,68213],{"class":50},[33,70480,70481],{"class":167},"(cp, ",[33,70483,68219],{"class":54},[33,70485,1649],{"class":167},[33,70487,495],{"class":163},[33,70489,67176],{"class":167},[33,70491,70492,70495,70497,70499,70501],{"class":35,"line":839},[33,70493,70494],{"class":167},"                final.append(PdfReader(fh), ",[33,70496,68840],{"class":238},[33,70498,242],{"class":163},[33,70500,855],{"class":50},[33,70502,221],{"class":167},[33,70504,70505,70508,70510,70512,70514,70516,70518,70520,70522],{"class":35,"line":860},[33,70506,70507],{"class":167},"        
output.parent.mkdir(",[33,70509,869],{"class":238},[33,70511,242],{"class":163},[33,70513,855],{"class":50},[33,70515,365],{"class":167},[33,70517,878],{"class":238},[33,70519,242],{"class":163},[33,70521,855],{"class":50},[33,70523,221],{"class":167},[33,70525,70526,70528,70530,70533,70535,70537,70539],{"class":35,"line":887},[33,70527,2191],{"class":163},[33,70529,68213],{"class":50},[33,70531,70532],{"class":167},"(output, ",[33,70534,67169],{"class":54},[33,70536,1649],{"class":167},[33,70538,495],{"class":163},[33,70540,69097],{"class":167},[33,70542,70543],{"class":35,"line":907},[33,70544,70545],{"class":167},"            final.write(out)\n",[33,70547,70548],{"class":35,"line":1826},[33,70549,70550],{"class":167},"        final.close()\n",[33,70552,70553,70555,70557,70559,70562,70564,70567,70569,70571],{"class":35,"line":1844},[33,70554,9414],{"class":50},[33,70556,602],{"class":167},[33,70558,4059],{"class":163},[33,70560,70561],{"class":54},"\"Final merge complete → ",[33,70563,1115],{"class":50},[33,70565,70566],{"class":167},"output",[33,70568,1121],{"class":50},[33,70570,274],{"class":54},[33,70572,221],{"class":167},[33,70574,70575,70577],{"class":35,"line":1858},[33,70576,3018],{"class":163},[33,70578,574],{"class":167},[33,70580,70581,70584,70587,70589,70591],{"class":35,"line":1871},[33,70582,70583],{"class":167},"        shutil.rmtree(tmp_dir, ",[33,70585,70586],{"class":238},"ignore_errors",[33,70588,242],{"class":163},[33,70590,855],{"class":50},[33,70592,221],{"class":167},[18,70594,9247],{"id":9246},[14,70596,70597],{},"After every batch run, confirm the merged file opens cleanly and has the expected page count:",[23,70599,70601],{"className":126,"code":70600,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader\nfrom pathlib import Path\n\n\ndef verify_merge(output_path: Path, expected_pages: int | None = None) -> bool:\n    \"\"\"Return True if the PDF opens without errors and matches expected page 
count.\"\"\"\n    try:\n        reader = PdfReader(output_path)\n        actual = len(reader.pages)\n        if expected_pages is not None and actual != expected_pages:\n            print(f\"FAIL: expected {expected_pages} pages, got {actual}\")\n            return False\n        print(f\"OK: {output_path.name}  ({actual} pages, {len(reader.outline)} outline items)\")\n        return True\n    except Exception as exc:\n        print(f\"FAIL: {exc}\")\n        return False\n\n\nif __name__ == \"__main__\":\n    verify_merge(Path(\".\u002Foutput\u002Fmerged.pdf\"))\n",[30,70602,70603,70607,70617,70627,70631,70635,70661,70666,70672,70681,70692,70714,70744,70750,70790,70796,70806,70827,70833,70837,70841,70853],{"__ignoreMap":28},[33,70604,70605],{"class":35,"line":36},[33,70606,57316],{"class":39},[33,70608,70609,70611,70613,70615],{"class":35,"line":43},[33,70610,190],{"class":163},[33,70612,57333],{"class":167},[33,70614,164],{"class":163},[33,70616,57338],{"class":167},[33,70618,70619,70621,70623,70625],{"class":35,"line":61},[33,70620,190],{"class":163},[33,70622,193],{"class":167},[33,70624,164],{"class":163},[33,70626,198],{"class":167},[33,70628,70629],{"class":35,"line":73},[33,70630,92],{"emptyLinePlaceholder":91},[33,70632,70633],{"class":35,"line":88},[33,70634,92],{"emptyLinePlaceholder":91},[33,70636,70637,70639,70642,70645,70647,70649,70651,70653,70655,70657,70659],{"class":35,"line":95},[33,70638,562],{"class":163},[33,70640,70641],{"class":46}," verify_merge",[33,70643,70644],{"class":167},"(output_path: Path, expected_pages: ",[33,70646,1059],{"class":50},[33,70648,2850],{"class":163},[33,70650,7657],{"class":50},[33,70652,212],{"class":163},[33,70654,7657],{"class":50},[33,70656,1617],{"class":167},[33,70658,2821],{"class":50},[33,70660,574],{"class":167},[33,70662,70663],{"class":35,"line":101},[33,70664,70665],{"class":54},"    \"\"\"Return True if the PDF opens without errors and matches expected page 
count.\"\"\"\n",[33,70667,70668,70670],{"class":35,"line":171},[33,70669,2424],{"class":163},[33,70671,574],{"class":167},[33,70673,70674,70676,70678],{"class":35,"line":179},[33,70675,62484],{"class":167},[33,70677,242],{"class":163},[33,70679,70680],{"class":167}," PdfReader(output_path)\n",[33,70682,70683,70685,70687,70689],{"class":35,"line":187},[33,70684,25149],{"class":167},[33,70686,242],{"class":163},[33,70688,4037],{"class":50},[33,70690,70691],{"class":167},"(reader.pages)\n",[33,70693,70694,70696,70699,70701,70703,70705,70707,70709,70711],{"class":35,"line":201},[33,70695,8221],{"class":163},[33,70697,70698],{"class":167}," expected_pages ",[33,70700,3847],{"class":163},[33,70702,620],{"class":163},[33,70704,7657],{"class":50},[33,70706,5615],{"class":163},[33,70708,25170],{"class":167},[33,70710,17877],{"class":163},[33,70712,70713],{"class":167}," expected_pages:\n",[33,70715,70716,70718,70720,70722,70725,70727,70730,70732,70734,70736,70738,70740,70742],{"class":35,"line":206},[33,70717,9364],{"class":50},[33,70719,602],{"class":167},[33,70721,4059],{"class":163},[33,70723,70724],{"class":54},"\"FAIL: expected 
",[33,70726,1115],{"class":50},[33,70728,70729],{"class":167},"expected_pages",[33,70731,1121],{"class":50},[33,70733,62520],{"class":54},[33,70735,1115],{"class":50},[33,70737,25201],{"class":167},[33,70739,1121],{"class":50},[33,70741,274],{"class":54},[33,70743,221],{"class":167},[33,70745,70746,70748],{"class":35,"line":224},[33,70747,28782],{"class":163},[33,70749,2903],{"class":50},[33,70751,70752,70754,70756,70758,70760,70762,70765,70767,70769,70771,70773,70775,70778,70780,70783,70785,70788],{"class":35,"line":229},[33,70753,9414],{"class":50},[33,70755,602],{"class":167},[33,70757,4059],{"class":163},[33,70759,57480],{"class":54},[33,70761,1115],{"class":50},[33,70763,70764],{"class":167},"output_path.name",[33,70766,1121],{"class":50},[33,70768,18019],{"class":54},[33,70770,1115],{"class":50},[33,70772,25201],{"class":167},[33,70774,1121],{"class":50},[33,70776,70777],{"class":54}," pages, ",[33,70779,4065],{"class":50},[33,70781,70782],{"class":167},"(reader.outline)",[33,70784,1121],{"class":50},[33,70786,70787],{"class":54}," outline items)\"",[33,70789,221],{"class":167},[33,70791,70792,70794],{"class":35,"line":235},[33,70793,1659],{"class":163},[33,70795,2887],{"class":50},[33,70797,70798,70800,70802,70804],{"class":35,"line":250},[33,70799,2449],{"class":163},[33,70801,783],{"class":50},[33,70803,1852],{"class":163},[33,70805,1855],{"class":167},[33,70807,70808,70810,70812,70814,70817,70819,70821,70823,70825],{"class":35,"line":266},[33,70809,9414],{"class":50},[33,70811,602],{"class":167},[33,70813,4059],{"class":163},[33,70815,70816],{"class":54},"\"FAIL: 
",[33,70818,1115],{"class":50},[33,70820,6565],{"class":167},[33,70822,1121],{"class":50},[33,70824,274],{"class":54},[33,70826,221],{"class":167},[33,70828,70829,70831],{"class":35,"line":290},[33,70830,1659],{"class":163},[33,70832,2903],{"class":50},[33,70834,70835],{"class":35,"line":295},[33,70836,92],{"emptyLinePlaceholder":91},[33,70838,70839],{"class":35,"line":300},[33,70840,92],{"emptyLinePlaceholder":91},[33,70842,70843,70845,70847,70849,70851],{"class":35,"line":317},[33,70844,2491],{"class":163},[33,70846,2494],{"class":50},[33,70848,2497],{"class":163},[33,70850,2500],{"class":54},[33,70852,574],{"class":167},[33,70854,70855,70858,70860],{"class":35,"line":332},[33,70856,70857],{"class":167},"    verify_merge(Path(",[33,70859,69199],{"class":54},[33,70861,371],{"class":167},[14,70863,70864],{},"Run this check in CI or as the final step of any scheduled merge job. If the expected page count is known (sum of source pages minus skipped files), pass it explicitly to catch silent data loss.",[14,70866,70867,70868,70870,70871,3035],{},"After verifying, the merged PDF can be handed off to the assembly pipeline described in ",[940,70869,26191],{"href":19001},", or secured as shown in ",[940,70872,65967],{"href":65966},[18,70874,48994],{"id":29070},[4273,70876,70877,70887],{},[4276,70878,70879],{},[4279,70880,70881,70883,70885],{},[4282,70882,29080],{},[4282,70884,4287],{},[4282,70886,4290],{},[4292,70888,70889,70908,70930,70947,70963],{},[4279,70890,70891,70898,70903],{},[4297,70892,70893,70895,70896],{},[30,70894,68043],{}," merged before ",[30,70897,68047],{},[4297,70899,70900,70902],{},[30,70901,68039],{}," uses lexicographic string comparison",[4297,70904,17059,70905,70907],{},[30,70906,69218],{}," with regex digit splitting",[4279,70909,70910,70915,70920],{},[4297,70911,70912,70914],{},[30,70913,68035],{}," on second run (Windows)",[4297,70916,70917,70919],{},[30,70918,68108],{}," objects not closed between 
iterations",[4297,70921,70922,70923,70926,70927],{},"Wrap every ",[30,70924,70925],{},"PdfReader(fh)"," call inside ",[30,70928,70929],{},"with open(path, \"rb\") as fh:",[4279,70931,70932,70935,70942],{},[4297,70933,70934],{},"Bookmarks absent in merged output",[4297,70936,70937,70939,70940],{},[30,70938,69240],{}," called without ",[30,70941,69251],{},[4297,70943,4358,70944,70946],{},[30,70945,69251],{},"; confirm pypdf version ≥ 3.0",[4279,70948,70949,70955,70960],{},[4297,70950,70951,70954],{},[30,70952,70953],{},"MemoryError"," on large batches",[4297,70956,70957,70958],{},"All page objects held in one ",[30,70959,70025],{},[4297,70961,70962],{},"Use chunked merge (100 files per chunk)",[4279,70964,70965,70968,70974],{},[4297,70966,70967],{},"Merged file is 0 bytes",[4297,70969,70970,70973],{},[30,70971,70972],{},"writer.write()"," called on empty writer",[4297,70975,70976,70977,70980],{},"Guard with ",[30,70978,70979],{},"if len(writer.pages) > 0"," before writing",[18,70982,36626],{"id":36625},[14,70984,70985,70988,70989,70992,70993,70996,70997,70999],{},[1974,70986,70987],{},"Why does my script fail on the 50th PDF in a batch?","\nLikely an accumulated file handle or a corrupt file at position 50. Add ",[30,70990,70991],{},"try\u002Fexcept PdfReadError"," per iteration, and ensure every ",[30,70994,70995],{},"open()"," is inside a ",[30,70998,22271],{}," block. Run the audit snippet above first to identify the bad file.",[14,71001,71002,71005,71006,68044,71009,71011,71012,71014,71015,71017,71018,71020],{},[1974,71003,71004],{},"Can I merge password-protected PDFs automatically?","\nOnly if you know the password. 
Call ",[30,71007,71008],{},"reader.decrypt(\"password\")",[30,71010,69240],{}," and check the return value: ",[30,71013,748],{}," means wrong password, ",[30,71016,734],{}," means success with the user password, ",[30,71019,1533],{}," means success with the owner password.",[14,71022,71023,71029,71032,71033,71036],{},[1974,71024,71025,71026,71028],{},"Does ",[30,71027,65045],{}," preserve bookmarks and metadata?",[30,71030,71031],{},"PdfWriter.append(reader, import_outline=True)"," retains hierarchical bookmarks. Metadata from the last appended document overwrites earlier values; set explicit metadata with ",[30,71034,71035],{},"writer.add_metadata({\"\u002FTitle\": \"...\", \"\u002FAuthor\": \"...\"})"," after all appends.",[14,71038,71039,71042,71043,71046,71047,71050,71051,71054],{},[1974,71040,71041],{},"How do I merge in a specific custom order rather than sorted filename order?","\nBuild the ",[30,71044,71045],{},"files"," list yourself (e.g., from a manifest CSV) and pass it directly to ",[30,71048,71049],{},"batch_merge_pdfs"," — the natural sort only applies inside the function to the globbed files. 
Replace the ",[30,71052,71053],{},"sorted(..., key=natural_sort_key)"," line with your pre-ordered list.",[18,71056,6918],{"id":6917},[4211,71058,71059,71071,71078],{},[4214,71060,71061,71063,71064,71066,71067,71070],{},[940,71062,52682],{"href":52681}," — full reference: ",[30,71065,69240],{}," vs ",[30,71068,71069],{},"add_page()",", outline inspection, page reordering, and chunked streaming",[4214,71072,71073,71077],{},[940,71074,71076],{"href":71075},"\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Fsplit-pdf-by-page-ranges-with-python\u002F","Split a PDF by Page Ranges with Python"," — inverse operation: parse a ranges string and write one file per range",[4214,71079,71080,71082],{},[940,71081,65967],{"href":65966}," — apply access controls to the merged output",[14,71084,6947,71085,3035],{},[940,71086,52682],{"href":52681},[6953,71088,71089],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sqxcx, html code.shiki 
.sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":71091},[71092,71093,71094,71095,71096,71097,71098,71099,71100],{"id":7020,"depth":43,"text":7021},{"id":68115,"depth":43,"text":68116},{"id":68401,"depth":43,"text":68402},{"id":69263,"depth":43,"text":69264},{"id":70018,"depth":43,"text":70019},{"id":9246,"depth":43,"text":9247},{"id":29070,"depth":43,"text":48994},{"id":36625,"depth":43,"text":36626},{"id":6917,"depth":43,"text":6918},"Batch Merge PDFs","Fix PdfReadError and PermissionError in batch PDF merges using pypdf. Natural sort, encrypted-file handling, chunked streaming for 500+ files, and a full argparse CLI.",{},"\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Fbatch-merge-pdfs-with-python-script",{"title":68020,"description":71102},"Batch Merge PDFs with Python — pypdf Script","automating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Fbatch-merge-pdfs-with-python-script\u002Findex",[9631,47,65045,71109],"batch 
merge","v5fQjzd8L5gPbXWb0WlNocLzvfe37x7feFiMb9lPSgY",{"id":71112,"title":52682,"body":71113,"breadcrumbTitle":75754,"canonical":6977,"date":46387,"description":75755,"draft":6980,"extension":6981,"image":6977,"meta":75756,"navigation":91,"path":75757,"robots":6977,"seo":75758,"seoTitle":75759,"stem":75760,"tags":75761,"updatedAt":6978,"__hash__":75763},"content\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Findex.md",{"type":7,"value":71114,"toc":75730},[71115,71118,71136,71144,71146,71180,71183,71262,71266,71269,71470,71486,71490,71600,71608,71617,71983,71996,72000,72222,72233,72237,72243,72562,72566,72570,72573,73108,73117,73119,73123,73369,73375,73379,73389,73592,73596,73874,73876,73879,74149,74152,74154,74162,74178,74184,74501,74521,74523,74647,74649,75688,75692,75705,75707,75724,75728],[10,71116,52682],{"id":71117},"merging-and-splitting-pdf-documents",[14,71119,71120,71121,71124,71125,6242,71128,365,71130,71132,71133,71135],{},"Manually assembling multi-file reports or slicing a 300-page export into per-client packets does not scale. Scripts that call ",[30,71122,71123],{},"writer.add_page()"," in a loop drop bookmarks, break internal links, and exhaust memory on anything beyond a few dozen pages. 
This guide covers the correct primitives in ",[940,71126,65045],{"href":68051,"rel":71127},[1367],[30,71129,68108],{},[30,71131,70025],{},", and ",[30,71134,69240],{}," — plus page reordering, outline preservation, and streaming patterns for large batches.",[14,71137,71138,71139,71141,71142,3035],{},"The same assembly layer powers ",[940,71140,26191],{"href":19001}," (injecting cover pages and appendices) and feeds the access-control step in ",[940,71143,65967],{"href":65966},[18,71145,21],{"id":20},[23,71147,71149],{"className":25,"code":71148,"language":27,"meta":28,"style":28},"# pip install pypdf pikepdf\npip install pypdf            # pure-Python; covers 95 % of use cases\npip install pikepdf          # C++ wrapper around QPDF; use for repair\u002Flarge-scale work\n",[30,71150,71151,71156,71168],{"__ignoreMap":28},[33,71152,71153],{"class":35,"line":36},[33,71154,71155],{"class":39},"# pip install pypdf pikepdf\n",[33,71157,71158,71160,71162,71165],{"class":35,"line":43},[33,71159,76],{"class":46},[33,71161,79],{"class":54},[33,71163,71164],{"class":54}," pypdf",[33,71166,71167],{"class":39},"            # pure-Python; covers 95 % of use cases\n",[33,71169,71170,71172,71174,71177],{"class":35,"line":61},[33,71171,76],{"class":46},[33,71173,79],{"class":54},[33,71175,71176],{"class":54}," pikepdf",[33,71178,71179],{"class":39},"          # C++ wrapper around QPDF; use for repair\u002Flarge-scale work\n",[14,71181,71182],{},"Create test fixtures quickly:",[23,71184,71186],{"className":25,"code":71185,"language":27,"meta":28,"style":28},"# Generate three small test PDFs with ImageMagick (or use your own files)\nfor i in 1 2 3; do\n  convert -size 595x842 xc:white -pointsize 48 \\\n    -annotate +220+420 \"Page set $i\" \"test_input_$i.pdf\"\ndone\n",[30,71187,71188,71193,71212,71234,71257],{"__ignoreMap":28},[33,71189,71190],{"class":35,"line":36},[33,71191,71192],{"class":39},"# Generate three small test PDFs with ImageMagick (or use your own 
files)\n",[33,71194,71195,71197,71199,71201,71203,71205,71207,71209],{"class":35,"line":43},[33,71196,6124],{"class":163},[33,71198,47269],{"class":167},[33,71200,662],{"class":163},[33,71202,1814],{"class":54},[33,71204,7451],{"class":54},[33,71206,1714],{"class":54},[33,71208,22506],{"class":167},[33,71210,71211],{"class":163},"do\n",[33,71213,71214,71217,71220,71223,71226,71229,71232],{"class":35,"line":61},[33,71215,71216],{"class":46},"  convert",[33,71218,71219],{"class":50}," -size",[33,71221,71222],{"class":54}," 595x842",[33,71224,71225],{"class":54}," xc:white",[33,71227,71228],{"class":50}," -pointsize",[33,71230,71231],{"class":50}," 48",[33,71233,26120],{"class":50},[33,71235,71236,71239,71242,71245,71248,71250,71253,71255],{"class":35,"line":73},[33,71237,71238],{"class":50},"    -annotate",[33,71240,71241],{"class":54}," +220+420",[33,71243,71244],{"class":54}," \"Page set ",[33,71246,71247],{"class":167},"$i",[33,71249,274],{"class":54},[33,71251,71252],{"class":54}," \"test_input_",[33,71254,71247],{"class":167},[33,71256,19246],{"class":54},[33,71258,71259],{"class":35,"line":88},[33,71260,71261],{"class":163},"done\n",[18,71263,71265],{"id":71264},"inspect-before-you-process","Inspect Before You Process",[14,71267,71268],{},"Before merging or splitting, confirm page count, encryption status, and outline depth:",[23,71270,71272],{"className":126,"code":71271,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader\nfrom pathlib import Path\n\ndef inspect_pdf(path: Path) -> dict:\n    \"\"\"Return basic structural metadata for a PDF.\"\"\"\n    try:\n        reader = PdfReader(path)\n        return {\n            \"pages\": len(reader.pages),\n            \"encrypted\": reader.is_encrypted,\n            \"outline_items\": len(reader.outline),\n            \"title\": (reader.metadata or {}).get(\"\u002FTitle\", \"\"),\n        }\n    except Exception as exc:\n        return {\"error\": str(exc)}\n\nif __name__ == 
\"__main__\":\n    for pdf in sorted(Path(\".\u002Finput_docs\").glob(\"*.pdf\")):\n        print(pdf.name, inspect_pdf(pdf))\n",[30,71273,71274,71278,71288,71298,71302,71315,71320,71326,71335,71341,71353,71361,71373,71395,71400,71410,71425,71429,71441,71463],{"__ignoreMap":28},[33,71275,71276],{"class":35,"line":36},[33,71277,57316],{"class":39},[33,71279,71280,71282,71284,71286],{"class":35,"line":43},[33,71281,190],{"class":163},[33,71283,57333],{"class":167},[33,71285,164],{"class":163},[33,71287,57338],{"class":167},[33,71289,71290,71292,71294,71296],{"class":35,"line":61},[33,71291,190],{"class":163},[33,71293,193],{"class":167},[33,71295,164],{"class":163},[33,71297,198],{"class":167},[33,71299,71300],{"class":35,"line":73},[33,71301,92],{"emptyLinePlaceholder":91},[33,71303,71304,71306,71309,71311,71313],{"class":35,"line":88},[33,71305,562],{"class":163},[33,71307,71308],{"class":46}," inspect_pdf",[33,71310,3743],{"class":167},[33,71312,37100],{"class":50},[33,71314,574],{"class":167},[33,71316,71317],{"class":35,"line":95},[33,71318,71319],{"class":54},"    \"\"\"Return basic structural metadata for a PDF.\"\"\"\n",[33,71321,71322,71324],{"class":35,"line":101},[33,71323,2424],{"class":163},[33,71325,574],{"class":167},[33,71327,71328,71330,71332],{"class":35,"line":171},[33,71329,62484],{"class":167},[33,71331,242],{"class":163},[33,71333,71334],{"class":167}," PdfReader(path)\n",[33,71336,71337,71339],{"class":35,"line":179},[33,71338,1659],{"class":163},[33,71340,16265],{"class":167},[33,71342,71343,71346,71348,71350],{"class":35,"line":187},[33,71344,71345],{"class":54},"            \"pages\"",[33,71347,2079],{"class":167},[33,71349,928],{"class":50},[33,71351,71352],{"class":167},"(reader.pages),\n",[33,71354,71355,71358],{"class":35,"line":201},[33,71356,71357],{"class":54},"            \"encrypted\"",[33,71359,71360],{"class":167},": reader.is_encrypted,\n",[33,71362,71363,71366,71368,71370],{"class":35,"line":206},[33,71364,71365],{"class":54},"  
          \"outline_items\"",[33,71367,2079],{"class":167},[33,71369,928],{"class":50},[33,71371,71372],{"class":167},"(reader.outline),\n",[33,71374,71375,71378,71381,71383,71386,71389,71391,71393],{"class":35,"line":224},[33,71376,71377],{"class":54},"            \"title\"",[33,71379,71380],{"class":167},": (reader.metadata ",[33,71382,7162],{"class":163},[33,71384,71385],{"class":167}," {}).get(",[33,71387,71388],{"class":54},"\"\u002FTitle\"",[33,71390,365],{"class":167},[33,71392,3198],{"class":54},[33,71394,1506],{"class":167},[33,71396,71397],{"class":35,"line":229},[33,71398,71399],{"class":167},"        }\n",[33,71401,71402,71404,71406,71408],{"class":35,"line":235},[33,71403,2449],{"class":163},[33,71405,783],{"class":50},[33,71407,1852],{"class":163},[33,71409,1855],{"class":167},[33,71411,71412,71414,71416,71418,71420,71422],{"class":35,"line":250},[33,71413,1659],{"class":163},[33,71415,4098],{"class":167},[33,71417,37333],{"class":54},[33,71419,2079],{"class":167},[33,71421,1053],{"class":50},[33,71423,71424],{"class":167},"(exc)}\n",[33,71426,71427],{"class":35,"line":266},[33,71428,92],{"emptyLinePlaceholder":91},[33,71430,71431,71433,71435,71437,71439],{"class":35,"line":290},[33,71432,2491],{"class":163},[33,71434,2494],{"class":50},[33,71436,2497],{"class":163},[33,71438,2500],{"class":54},[33,71440,574],{"class":167},[33,71442,71443,71445,71447,71449,71451,71453,71456,71459,71461],{"class":35,"line":295},[33,71444,656],{"class":163},[33,71446,67712],{"class":167},[33,71448,662],{"class":163},[33,71450,28924],{"class":50},[33,71452,62344],{"class":167},[33,71454,71455],{"class":54},"\".\u002Finput_docs\"",[33,71457,71458],{"class":167},").glob(",[33,71460,610],{"class":54},[33,71462,8687],{"class":167},[33,71464,71465,71467],{"class":35,"line":300},[33,71466,9414],{"class":50},[33,71468,71469],{"class":167},"(pdf.name, inspect_pdf(pdf))\n",[14,71471,71472,71473,71476,71477,71479,71480,71483,71484,3035],{},"Run this on every input directory before 
the first merge. Files where ",[30,71474,71475],{},"encrypted=True"," need ",[30,71478,68099],{}," before any page access; files with ",[30,71481,71482],{},"error"," keys should be skipped or repaired with ",[30,71485,68393],{},[18,71487,71489],{"id":71488},"core-workflow-merging","Core Workflow: Merging",[2540,71491,2547,71493,2547,71496,2547,71499,2547,2547,71513,2547,71515,2547,71519,2547,71521,2547,71525,2547,71527,2547,2547,71531,2547,71534,2547,71537,2547,2547,71539,2547,71543,2547,71545,2547,2547,71549,2547,2547,71552,2547,71554,2547,71558,2547,2547,71561,2547,2547,71563,2547,71565,2547,71568,2547,2547,71571,2547,71575,2547,71577,2547,71579,2547,71583,2547,71588,2547,71590,2547,71594,2547,71596],{"viewBox":11071,"role":2543,"ariaLabel":71492,"xmlns":2545,"style":2546},"PDF merge and split workflow: multiple source PDFs feed into PdfWriter append, producing a merged PDF; the merged PDF then feeds into PdfWriter per range, producing split output files",[2549,71494,71495],{},"Merge and Split PDF workflow",[2553,71497,71498],{},"Diagram showing multiple source PDFs flowing into PdfWriter.append() to produce a merged PDF, then PdfWriter per page range producing split output 
files.",[2557,71500,2559,71501,2559,71508,2547],{},[2561,71502,2564,71504,2564,71506,2559],{"id":71503,"x1":748,"y1":748,"x2":734,"y2":748},"merge-split-blue-grad",[2566,71505],{"offset":748,"style":2568},[2566,71507],{"offset":734,"style":2571},[2573,71509,2564,71511,2559],{"id":71510,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":2681,"markerHeight":2681,"orient":2578},"merge-split-arrow",[2580,71512],{"d":2582,"fill":2583},[2585,71514],{"x":2587,"y":1543,"width":2589,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,71516,71518],{"x":2597,"y":71517,"fill":2599,"style":2600},"57","doc_a.pdf",[2585,71520],{"x":2587,"y":2630,"width":2589,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,71522,71524],{"x":2597,"y":71523,"fill":2599,"style":2600},"117","doc_b.pdf",[2585,71526],{"x":2587,"y":2635,"width":2589,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,71528,71530],{"x":2597,"y":71529,"fill":2599,"style":2600},"177","doc_c.pdf",[35,71532],{"x1":2609,"y1":49813,"x2":17008,"y2":2589,"stroke":2583,"markerEnd":71533,"style":2594},"url(#merge-split-arrow)",[35,71535],{"x1":2609,"y1":71536,"x2":17008,"y2":11214,"stroke":2583,"markerEnd":71533,"style":2594},"112",[35,71538],{"x1":2609,"y1":2604,"x2":17008,"y2":49842,"stroke":2583,"markerEnd":71533,"style":2594},[2585,71540],{"x":17008,"y":71541,"width":2610,"height":38749,"rx":3545,"fill":71542,"stroke":11166,"style":2594},"88","url(#merge-split-blue-grad)",[2000,71544,70025],{"x":2625,"y":11095,"fill":2599,"style":16983},[2000,71546,71548],{"x":2625,"y":71547,"fill":2583,"style":2685},"137",".append(reader)",[35,71550],{"x1":47140,"y1":71551,"x2":49852,"y2":71551,"stroke":2583,"markerEnd":71533,"style":2594},"122",[2585,71553],{"x":49852,"y":38741,"width":2609,"height":49813,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,71555,71557],{"x":71556,"y":71551,"fill":2599,"style":16983},"550","merged.pdf",[2000,71559,71560],{"x":2626,"y":58401,"f
ill":2583,"style":2685},"\n─── SPLIT ───\n",[35,71562],{"x1":71556,"y1":11194,"x2":71556,"y2":2665,"stroke":2583,"markerEnd":71533,"style":2594},[2585,71564],{"x":64900,"y":2665,"width":58337,"height":38740,"rx":3545,"fill":71542,"stroke":11166,"style":2594},[2000,71566,71567],{"x":71556,"y":49869,"fill":2599,"style":16983},"PdfWriter × range",[2000,71569,71570],{"x":71556,"y":38859,"fill":2583,"style":2685},"one writer per slice",[35,71572],{"x1":71573,"y1":71574,"x2":57785,"y2":26354,"stroke":2583,"markerEnd":71533,"style":2594},"640","253",[35,71576],{"x1":71573,"y1":71574,"x2":57785,"y2":2618,"stroke":2583,"markerEnd":71533,"style":2594},[35,71578],{"x1":71573,"y1":71574,"x2":57785,"y2":11231,"stroke":2583,"markerEnd":71533,"style":2594},[2585,71580],{"x":57785,"y":71581,"width":71582,"height":11164,"rx":1153,"fill":2592,"stroke":2593,"style":2594},"216","46",[2000,71584,71587],{"x":71585,"y":71586,"fill":2599,"style":2605},"723","233","part1",[2585,71589],{"x":57785,"y":38722,"width":71582,"height":11164,"rx":1153,"fill":2592,"stroke":2593,"style":2594},[2000,71591,71593],{"x":71585,"y":71592,"fill":2599,"style":2605},"265","part2",[2585,71595],{"x":57785,"y":49839,"width":71582,"height":11164,"rx":1153,"fill":2592,"stroke":2593,"style":2594},[2000,71597,71599],{"x":71585,"y":71598,"fill":2599,"style":2605},"297","part3",[424,71601,71603,71604,2008,71606],{"id":71602},"step-1-use-append-not-add_page","Step 1 — Use ",[30,71605,69240],{},[30,71607,71069],{},[14,71609,71610,71613,71614,71616],{},[30,71611,71612],{},"PdfWriter.append(reader)"," recursively imports page resources, form fields, annotations, and the document outline. 
",[30,71615,71069],{}," does a shallow copy that silently drops all of those.",[23,71618,71620],{"className":126,"code":71619,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfWriter, PdfReader\nfrom pathlib import Path\nimport logging\n\nlogging.basicConfig(level=logging.INFO, format=\"%(levelname)s: %(message)s\")\nlogger = logging.getLogger(__name__)\n\ndef merge_pdfs(input_dir: Path, output_path: Path) -> None:\n    \"\"\"Merge all PDFs in a directory in sorted order, preserving outlines.\"\"\"\n    writer = PdfWriter()\n    try:\n        pdf_files = sorted(input_dir.glob(\"*.pdf\"))\n        if not pdf_files:\n            logger.warning(\"No PDF files found in %s\", input_dir)\n            return\n\n        for pdf_file in pdf_files:\n            logger.info(\"Appending %s\", pdf_file.name)\n            with open(pdf_file, \"rb\") as fh:\n                reader = PdfReader(fh)\n                writer.append(reader, import_outline=True)   # preserves bookmarks\n\n        output_path.parent.mkdir(parents=True, exist_ok=True)\n        with open(output_path, \"wb\") as out:\n            writer.write(out)\n        logger.info(\"Merged %d files → %s\", len(pdf_files), output_path)\n    except Exception as exc:\n        logger.error(\"Merge failed: %s\", exc)\n        raise\n    finally:\n        writer.close()\n\nif __name__ == \"__main__\":\n    merge_pdfs(Path(\".\u002Finput_docs\"), 
Path(\".\u002Foutput\u002Fmerged.pdf\"))\n",[30,71621,71622,71626,71636,71646,71652,71656,71687,71699,71703,71717,71722,71730,71736,71751,71759,71773,71778,71782,71793,71807,71824,71832,71846,71850,71871,71887,71892,71914,71924,71939,71943,71949,71953,71957,71969],{"__ignoreMap":28},[33,71623,71624],{"class":35,"line":36},[33,71625,57316],{"class":39},[33,71627,71628,71630,71632,71634],{"class":35,"line":43},[33,71629,190],{"class":163},[33,71631,57333],{"class":167},[33,71633,164],{"class":163},[33,71635,68442],{"class":167},[33,71637,71638,71640,71642,71644],{"class":35,"line":61},[33,71639,190],{"class":163},[33,71641,193],{"class":167},[33,71643,164],{"class":163},[33,71645,198],{"class":167},[33,71647,71648,71650],{"class":35,"line":73},[33,71649,164],{"class":163},[33,71651,184],{"class":167},[33,71653,71654],{"class":35,"line":88},[33,71655,92],{"emptyLinePlaceholder":91},[33,71657,71658,71661,71663,71665,71667,71669,71671,71673,71675,71677,71679,71681,71683,71685],{"class":35,"line":95},[33,71659,71660],{"class":167},"logging.basicConfig(",[33,71662,18267],{"class":238},[33,71664,242],{"class":163},[33,71666,258],{"class":167},[33,71668,1067],{"class":50},[33,71670,365],{"class":167},[33,71672,61926],{"class":238},[33,71674,242],{"class":163},[33,71676,274],{"class":54},[33,71678,26817],{"class":50},[33,71680,2079],{"class":54},[33,71682,26827],{"class":50},[33,71684,274],{"class":54},[33,71686,221],{"class":167},[33,71688,71689,71691,71693,71695,71697],{"class":35,"line":101},[33,71690,539],{"class":167},[33,71692,242],{"class":163},[33,71694,544],{"class":167},[33,71696,547],{"class":50},[33,71698,221],{"class":167},[33,71700,71701],{"class":35,"line":171},[33,71702,92],{"emptyLinePlaceholder":91},[33,71704,71705,71707,71710,71713,71715],{"class":35,"line":179},[33,71706,562],{"class":163},[33,71708,71709],{"class":46}," merge_pdfs",[33,71711,71712],{"class":167},"(input_dir: Path, output_path: Path) -> 
",[33,71714,571],{"class":50},[33,71716,574],{"class":167},[33,71718,71719],{"class":35,"line":187},[33,71720,71721],{"class":54},"    \"\"\"Merge all PDFs in a directory in sorted order, preserving outlines.\"\"\"\n",[33,71723,71724,71726,71728],{"class":35,"line":201},[33,71725,68681],{"class":167},[33,71727,242],{"class":163},[33,71729,67154],{"class":167},[33,71731,71732,71734],{"class":35,"line":206},[33,71733,2424],{"class":163},[33,71735,574],{"class":167},[33,71737,71738,71741,71743,71745,71747,71749],{"class":35,"line":224},[33,71739,71740],{"class":167},"        pdf_files ",[33,71742,242],{"class":163},[33,71744,28924],{"class":50},[33,71746,68196],{"class":167},[33,71748,610],{"class":54},[33,71750,371],{"class":167},[33,71752,71753,71755,71757],{"class":35,"line":229},[33,71754,8221],{"class":163},[33,71756,620],{"class":163},[33,71758,623],{"class":167},[33,71760,71761,71764,71767,71769,71771],{"class":35,"line":235},[33,71762,71763],{"class":167},"            logger.warning(",[33,71765,71766],{"class":54},"\"No PDF files found in ",[33,71768,309],{"class":50},[33,71770,274],{"class":54},[33,71772,6111],{"class":167},[33,71774,71775],{"class":35,"line":250},[33,71776,71777],{"class":163},"            return\n",[33,71779,71780],{"class":35,"line":266},[33,71781,92],{"emptyLinePlaceholder":91},[33,71783,71784,71786,71789,71791],{"class":35,"line":290},[33,71785,5973],{"class":163},[33,71787,71788],{"class":167}," pdf_file ",[33,71790,662],{"class":163},[33,71792,623],{"class":167},[33,71794,71795,71797,71800,71802,71804],{"class":35,"line":295},[33,71796,6234],{"class":167},[33,71798,71799],{"class":54},"\"Appending ",[33,71801,309],{"class":50},[33,71803,274],{"class":54},[33,71805,71806],{"class":167},", pdf_file.name)\n",[33,71808,71809,71811,71813,71816,71818,71820,71822],{"class":35,"line":300},[33,71810,678],{"class":163},[33,71812,68213],{"class":50},[33,71814,71815],{"class":167},"(pdf_file, 
",[33,71817,68219],{"class":54},[33,71819,1649],{"class":167},[33,71821,495],{"class":163},[33,71823,67176],{"class":167},[33,71825,71826,71828,71830],{"class":35,"line":317},[33,71827,68230],{"class":167},[33,71829,242],{"class":163},[33,71831,68235],{"class":167},[33,71833,71834,71836,71838,71840,71842,71844],{"class":35,"line":332},[33,71835,68837],{"class":167},[33,71837,68840],{"class":238},[33,71839,242],{"class":163},[33,71841,855],{"class":50},[33,71843,12000],{"class":167},[33,71845,68849],{"class":39},[33,71847,71848],{"class":35,"line":347},[33,71849,92],{"emptyLinePlaceholder":91},[33,71851,71852,71855,71857,71859,71861,71863,71865,71867,71869],{"class":35,"line":374},[33,71853,71854],{"class":167},"        output_path.parent.mkdir(",[33,71856,869],{"class":238},[33,71858,242],{"class":163},[33,71860,855],{"class":50},[33,71862,365],{"class":167},[33,71864,878],{"class":238},[33,71866,242],{"class":163},[33,71868,855],{"class":50},[33,71870,221],{"class":167},[33,71872,71873,71875,71877,71879,71881,71883,71885],{"class":35,"line":397},[33,71874,2191],{"class":163},[33,71876,68213],{"class":50},[33,71878,69088],{"class":167},[33,71880,67169],{"class":54},[33,71882,1649],{"class":167},[33,71884,495],{"class":163},[33,71886,69097],{"class":167},[33,71888,71889],{"class":35,"line":653},[33,71890,71891],{"class":167},"            writer.write(out)\n",[33,71893,71894,71896,71899,71901,71903,71905,71907,71909,71911],{"class":35,"line":667},[33,71895,2439],{"class":167},[33,71897,71898],{"class":54},"\"Merged ",[33,71900,916],{"class":50},[33,71902,69140],{"class":54},[33,71904,309],{"class":50},[33,71906,274],{"class":54},[33,71908,365],{"class":167},[33,71910,928],{"class":50},[33,71912,71913],{"class":167},"(pdf_files), 
output_path)\n",[33,71915,71916,71918,71920,71922],{"class":35,"line":675},[33,71917,2449],{"class":163},[33,71919,783],{"class":50},[33,71921,1852],{"class":163},[33,71923,1855],{"class":167},[33,71925,71926,71929,71932,71934,71936],{"class":35,"line":689},[33,71927,71928],{"class":167},"        logger.error(",[33,71930,71931],{"class":54},"\"Merge failed: ",[33,71933,309],{"class":50},[33,71935,274],{"class":54},[33,71937,71938],{"class":167},", exc)\n",[33,71940,71941],{"class":35,"line":703},[33,71942,65922],{"class":163},[33,71944,71945,71947],{"class":35,"line":714},[33,71946,3018],{"class":163},[33,71948,574],{"class":167},[33,71950,71951],{"class":35,"line":723},[33,71952,69048],{"class":167},[33,71954,71955],{"class":35,"line":754},[33,71956,92],{"emptyLinePlaceholder":91},[33,71958,71959,71961,71963,71965,71967],{"class":35,"line":771},[33,71960,2491],{"class":163},[33,71962,2494],{"class":50},[33,71964,2497],{"class":163},[33,71966,2500],{"class":54},[33,71968,574],{"class":167},[33,71970,71971,71974,71976,71979,71981],{"class":35,"line":777},[33,71972,71973],{"class":167},"    merge_pdfs(Path(",[33,71975,71455],{"class":54},[33,71977,71978],{"class":167},"), Path(",[33,71980,69199],{"class":54},[33,71982,371],{"class":167},[14,71984,71985,71986,71988,71989,71992,71993,71995],{},"Open the file inside ",[30,71987,69235],{}," and pass the ",[1974,71990,71991],{},"file object"," (not just the path) to ",[30,71994,68108],{},". 
This guarantees the OS releases the file descriptor after each iteration — critical on Windows where open handles block subsequent reads.",[424,71997,71999],{"id":71998},"step-2-preserve-and-inspect-the-outline","Step 2 — Preserve and inspect the outline",[23,72001,72003],{"className":126,"code":72002,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader\nfrom pathlib import Path\n\ndef print_outline(reader: PdfReader, items=None, depth: int = 0) -> None:\n    \"\"\"Recursively print the document outline (bookmarks).\"\"\"\n    if items is None:\n        items = reader.outline\n    for item in items:\n        if isinstance(item, list):\n            print_outline(reader, item, depth + 1)\n        else:\n            page_num = reader.get_destination_page_number(item) + 1  # 1-based\n            print(\"  \" * depth + f\"[p{page_num}] {item.title}\")\n\nif __name__ == \"__main__\":\n    reader = PdfReader(Path(\".\u002Foutput\u002Fmerged.pdf\"))\n    print_outline(reader)\n",[30,72004,72005,72009,72019,72029,72033,72062,72067,72080,72090,72101,72114,72125,72131,72148,72188,72192,72204,72217],{"__ignoreMap":28},[33,72006,72007],{"class":35,"line":36},[33,72008,57316],{"class":39},[33,72010,72011,72013,72015,72017],{"class":35,"line":43},[33,72012,190],{"class":163},[33,72014,57333],{"class":167},[33,72016,164],{"class":163},[33,72018,57338],{"class":167},[33,72020,72021,72023,72025,72027],{"class":35,"line":61},[33,72022,190],{"class":163},[33,72024,193],{"class":167},[33,72026,164],{"class":163},[33,72028,198],{"class":167},[33,72030,72031],{"class":35,"line":73},[33,72032,92],{"emptyLinePlaceholder":91},[33,72034,72035,72037,72040,72043,72045,72047,72050,72052,72054,72056,72058,72060],{"class":35,"line":88},[33,72036,562],{"class":163},[33,72038,72039],{"class":46}," print_outline",[33,72041,72042],{"class":167},"(reader: PdfReader, items",[33,72044,242],{"class":163},[33,72046,571],{"class":50},[33,72048,72049],{"class":167},", depth: 
",[33,72051,1059],{"class":50},[33,72053,212],{"class":163},[33,72055,10791],{"class":50},[33,72057,1617],{"class":167},[33,72059,571],{"class":50},[33,72061,574],{"class":167},[33,72063,72064],{"class":35,"line":95},[33,72065,72066],{"class":54},"    \"\"\"Recursively print the document outline (bookmarks).\"\"\"\n",[33,72068,72069,72071,72074,72076,72078],{"class":35,"line":101},[33,72070,617],{"class":163},[33,72072,72073],{"class":167}," items ",[33,72075,3847],{"class":163},[33,72077,7657],{"class":50},[33,72079,574],{"class":167},[33,72081,72082,72085,72087],{"class":35,"line":171},[33,72083,72084],{"class":167},"        items ",[33,72086,242],{"class":163},[33,72088,72089],{"class":167}," reader.outline\n",[33,72091,72092,72094,72096,72098],{"class":35,"line":179},[33,72093,656],{"class":163},[33,72095,54203],{"class":167},[33,72097,662],{"class":163},[33,72099,72100],{"class":167}," items:\n",[33,72102,72103,72105,72107,72110,72112],{"class":35,"line":187},[33,72104,8221],{"class":163},[33,72106,36538],{"class":50},[33,72108,72109],{"class":167},"(item, ",[33,72111,25066],{"class":50},[33,72113,1737],{"class":167},[33,72115,72116,72119,72121,72123],{"class":35,"line":201},[33,72117,72118],{"class":167},"            print_outline(reader, item, depth ",[33,72120,1811],{"class":163},[33,72122,1814],{"class":50},[33,72124,221],{"class":167},[33,72126,72127,72129],{"class":35,"line":206},[33,72128,41290],{"class":163},[33,72130,574],{"class":167},[33,72132,72133,72136,72138,72141,72143,72145],{"class":35,"line":224},[33,72134,72135],{"class":167},"            page_num ",[33,72137,242],{"class":163},[33,72139,72140],{"class":167}," reader.get_destination_page_number(item) ",[33,72142,1811],{"class":163},[33,72144,1814],{"class":50},[33,72146,72147],{"class":39},"  # 
1-based\n",[33,72149,72150,72152,72154,72157,72159,72162,72164,72166,72169,72171,72173,72175,72177,72179,72182,72184,72186],{"class":35,"line":229},[33,72151,9364],{"class":50},[33,72153,602],{"class":167},[33,72155,72156],{"class":54},"\"  \"",[33,72158,1156],{"class":163},[33,72160,72161],{"class":167}," depth ",[33,72163,1811],{"class":163},[33,72165,1110],{"class":163},[33,72167,72168],{"class":54},"\"[p",[33,72170,1115],{"class":50},[33,72172,40156],{"class":167},[33,72174,1121],{"class":50},[33,72176,763],{"class":54},[33,72178,1115],{"class":50},[33,72180,72181],{"class":167},"item.title",[33,72183,1121],{"class":50},[33,72185,274],{"class":54},[33,72187,221],{"class":167},[33,72189,72190],{"class":35,"line":235},[33,72191,92],{"emptyLinePlaceholder":91},[33,72193,72194,72196,72198,72200,72202],{"class":35,"line":250},[33,72195,2491],{"class":163},[33,72197,2494],{"class":50},[33,72199,2497],{"class":163},[33,72201,2500],{"class":54},[33,72203,574],{"class":167},[33,72205,72206,72208,72210,72213,72215],{"class":35,"line":266},[33,72207,57365],{"class":167},[33,72209,242],{"class":163},[33,72211,72212],{"class":167}," PdfReader(Path(",[33,72214,69199],{"class":54},[33,72216,371],{"class":167},[33,72218,72219],{"class":35,"line":290},[33,72220,72221],{"class":167},"    print_outline(reader)\n",[14,72223,72224,72225,72228,72229,72232],{},"After merging, run this to confirm outlines from every source document are present. If a source had no outline, that is expected; if a source had one and it is missing, you passed ",[30,72226,72227],{},"import_outline=False"," (the default before pypdf 3.x — pin ",[30,72230,72231],{},"pypdf>=3.0",").",[424,72234,72236],{"id":72235},"step-3-reorder-pages-before-writing","Step 3 — Reorder pages before writing",[14,72238,72239,72240,72242],{},"Sometimes you need to rearrange pages without re-reading every file. 
",[30,72241,70025],{}," exposes its internal page list; manipulate it directly:",[23,72244,72246],{"className":126,"code":72245,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfWriter, PdfReader\nfrom pathlib import Path\n\ndef reorder_pages(input_path: Path, output_path: Path, new_order: list[int]) -> None:\n    \"\"\"\n    Write pages in new_order (1-based) to output_path.\n    Example: new_order=[3,1,2] puts page 3 first.\n    \"\"\"\n    reader = PdfReader(input_path)\n    writer = PdfWriter()\n    total = len(reader.pages)\n    try:\n        for page_num in new_order:\n            if not 1 \u003C= page_num \u003C= total:\n                raise ValueError(f\"Page {page_num} out of range (1–{total})\")\n            writer.add_page(reader.pages[page_num - 1])   # convert 1-based → 0-based\n        with open(output_path, \"wb\") as out:\n            writer.write(out)\n        print(f\"Reordered {len(new_order)} pages → {output_path}\")\n    except Exception as exc:\n        print(f\"Reorder failed: {exc}\")\n        raise\n    finally:\n        writer.close()\n\nif __name__ == \"__main__\":\n    reorder_pages(Path(\".\u002Fsource.pdf\"), Path(\".\u002Freordered.pdf\"), [3, 1, 
2])\n",[30,72247,72248,72252,72262,72272,72276,72294,72298,72303,72308,72312,72321,72329,72340,72346,72358,72376,72409,72423,72439,72443,72474,72484,72505,72509,72515,72519,72523,72535],{"__ignoreMap":28},[33,72249,72250],{"class":35,"line":36},[33,72251,57316],{"class":39},[33,72253,72254,72256,72258,72260],{"class":35,"line":43},[33,72255,190],{"class":163},[33,72257,57333],{"class":167},[33,72259,164],{"class":163},[33,72261,68442],{"class":167},[33,72263,72264,72266,72268,72270],{"class":35,"line":61},[33,72265,190],{"class":163},[33,72267,193],{"class":167},[33,72269,164],{"class":163},[33,72271,198],{"class":167},[33,72273,72274],{"class":35,"line":73},[33,72275,92],{"emptyLinePlaceholder":91},[33,72277,72278,72280,72283,72286,72288,72290,72292],{"class":35,"line":88},[33,72279,562],{"class":163},[33,72281,72282],{"class":46}," reorder_pages",[33,72284,72285],{"class":167},"(input_path: Path, output_path: Path, new_order: list[",[33,72287,1059],{"class":50},[33,72289,28895],{"class":167},[33,72291,571],{"class":50},[33,72293,574],{"class":167},[33,72295,72296],{"class":35,"line":95},[33,72297,7673],{"class":54},[33,72299,72300],{"class":35,"line":101},[33,72301,72302],{"class":54},"    Write pages in new_order (1-based) to output_path.\n",[33,72304,72305],{"class":35,"line":171},[33,72306,72307],{"class":54},"    Example: new_order=[3,1,2] puts page 3 first.\n",[33,72309,72310],{"class":35,"line":179},[33,72311,7673],{"class":54},[33,72313,72314,72316,72318],{"class":35,"line":187},[33,72315,57365],{"class":167},[33,72317,242],{"class":163},[33,72319,72320],{"class":167}," PdfReader(input_path)\n",[33,72322,72323,72325,72327],{"class":35,"line":201},[33,72324,68681],{"class":167},[33,72326,242],{"class":163},[33,72328,67154],{"class":167},[33,72330,72331,72334,72336,72338],{"class":35,"line":206},[33,72332,72333],{"class":167},"    total 
",[33,72335,242],{"class":163},[33,72337,4037],{"class":50},[33,72339,70691],{"class":167},[33,72341,72342,72344],{"class":35,"line":224},[33,72343,2424],{"class":163},[33,72345,574],{"class":167},[33,72347,72348,72350,72353,72355],{"class":35,"line":229},[33,72349,5973],{"class":163},[33,72351,72352],{"class":167}," page_num ",[33,72354,662],{"class":163},[33,72356,72357],{"class":167}," new_order:\n",[33,72359,72360,72362,72364,72366,72369,72371,72373],{"class":35,"line":235},[33,72361,5995],{"class":163},[33,72363,620],{"class":163},[33,72365,1814],{"class":50},[33,72367,72368],{"class":163}," \u003C=",[33,72370,72352],{"class":167},[33,72372,44223],{"class":163},[33,72374,72375],{"class":167}," total:\n",[33,72377,72378,72380,72382,72384,72386,72388,72390,72392,72394,72397,72399,72402,72404,72407],{"class":35,"line":250},[33,72379,16804],{"class":163},[33,72381,4054],{"class":50},[33,72383,602],{"class":167},[33,72385,4059],{"class":163},[33,72387,55719],{"class":54},[33,72389,1115],{"class":50},[33,72391,40156],{"class":167},[33,72393,1121],{"class":50},[33,72395,72396],{"class":54}," out of range (1–",[33,72398,1115],{"class":50},[33,72400,72401],{"class":167},"total",[33,72403,1121],{"class":50},[33,72405,72406],{"class":54},")\"",[33,72408,221],{"class":167},[33,72410,72411,72414,72416,72418,72420],{"class":35,"line":266},[33,72412,72413],{"class":167},"            writer.add_page(reader.pages[page_num ",[33,72415,4126],{"class":163},[33,72417,1814],{"class":50},[33,72419,7283],{"class":167},[33,72421,72422],{"class":39},"# convert 1-based → 
0-based\n",[33,72424,72425,72427,72429,72431,72433,72435,72437],{"class":35,"line":290},[33,72426,2191],{"class":163},[33,72428,68213],{"class":50},[33,72430,69088],{"class":167},[33,72432,67169],{"class":54},[33,72434,1649],{"class":167},[33,72436,495],{"class":163},[33,72438,69097],{"class":167},[33,72440,72441],{"class":35,"line":295},[33,72442,71891],{"class":167},[33,72444,72445,72447,72449,72451,72454,72456,72459,72461,72464,72466,72468,72470,72472],{"class":35,"line":300},[33,72446,9414],{"class":50},[33,72448,602],{"class":167},[33,72450,4059],{"class":163},[33,72452,72453],{"class":54},"\"Reordered ",[33,72455,4065],{"class":50},[33,72457,72458],{"class":167},"(new_order)",[33,72460,1121],{"class":50},[33,72462,72463],{"class":54}," pages → ",[33,72465,1115],{"class":50},[33,72467,69145],{"class":167},[33,72469,1121],{"class":50},[33,72471,274],{"class":54},[33,72473,221],{"class":167},[33,72475,72476,72478,72480,72482],{"class":35,"line":317},[33,72477,2449],{"class":163},[33,72479,783],{"class":50},[33,72481,1852],{"class":163},[33,72483,1855],{"class":167},[33,72485,72486,72488,72490,72492,72495,72497,72499,72501,72503],{"class":35,"line":332},[33,72487,9414],{"class":50},[33,72489,602],{"class":167},[33,72491,4059],{"class":163},[33,72493,72494],{"class":54},"\"Reorder failed: 
",[33,72496,1115],{"class":50},[33,72498,6565],{"class":167},[33,72500,1121],{"class":50},[33,72502,274],{"class":54},[33,72504,221],{"class":167},[33,72506,72507],{"class":35,"line":347},[33,72508,65922],{"class":163},[33,72510,72511,72513],{"class":35,"line":374},[33,72512,3018],{"class":163},[33,72514,574],{"class":167},[33,72516,72517],{"class":35,"line":397},[33,72518,69048],{"class":167},[33,72520,72521],{"class":35,"line":653},[33,72522,92],{"emptyLinePlaceholder":91},[33,72524,72525,72527,72529,72531,72533],{"class":35,"line":667},[33,72526,2491],{"class":163},[33,72528,2494],{"class":50},[33,72530,2497],{"class":163},[33,72532,2500],{"class":54},[33,72534,574],{"class":167},[33,72536,72537,72540,72543,72545,72548,72550,72552,72554,72556,72558,72560],{"class":35,"line":675},[33,72538,72539],{"class":167},"    reorder_pages(Path(",[33,72541,72542],{"class":54},"\".\u002Fsource.pdf\"",[33,72544,71978],{"class":167},[33,72546,72547],{"class":54},"\".\u002Freordered.pdf\"",[33,72549,59343],{"class":167},[33,72551,10258],{"class":50},[33,72553,365],{"class":167},[33,72555,734],{"class":50},[33,72557,365],{"class":167},[33,72559,1533],{"class":50},[33,72561,751],{"class":167},[18,72563,72565],{"id":72564},"core-workflow-splitting","Core Workflow: Splitting",[424,72567,72569],{"id":72568},"range-based-split-1-based-ui-0-based-pypdf","Range-based split (1-based UI, 0-based pypdf)",[14,72571,72572],{},"The single most common indexing mistake: PDF viewers show page 1, pypdf stores it at index 0. 
Always subtract 1 when converting user-visible page numbers to slice indices.",[23,72574,72576],{"className":126,"code":72575,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader, PdfWriter\nfrom pathlib import Path\n\ndef split_pdf_by_ranges(\n    input_path: Path,\n    output_dir: Path,\n    ranges: list[tuple[int, int]],\n) -> list[Path]:\n    \"\"\"\n    Split a PDF by 1-based page ranges.\n    ranges=[(1,3),(4,8)] → two output files.\n    Returns list of created paths.\n    \"\"\"\n    output_dir.mkdir(parents=True, exist_ok=True)\n    created: list[Path] = []\n    try:\n        with open(input_path, \"rb\") as fh:\n            reader = PdfReader(fh)\n            total = len(reader.pages)\n\n            for idx, (start, end) in enumerate(ranges, start=1):\n                if start \u003C 1 or end > total or start > end:\n                    raise ValueError(\n                        f\"Invalid range ({start}–{end}) for {total}-page document\"\n                    )\n                writer = PdfWriter()\n                for page_num in range(start - 1, end):   # 0-based\n                    writer.add_page(reader.pages[page_num])\n\n                out_path = output_dir \u002F f\"{input_path.stem}_part{idx:02d}.pdf\"\n                with open(out_path, \"wb\") as out:\n                    writer.write(out)\n                writer.close()\n                created.append(out_path)\n                print(f\"Created: {out_path}  ({end - start + 1} pages)\")\n\n    except Exception as exc:\n        print(f\"Split failed: {exc}\")\n        raise\n    return created\n\nif __name__ == \"__main__\":\n    split_pdf_by_ranges(\n        Path(\".\u002Fannual_report.pdf\"),\n        Path(\".\u002Foutput\u002Fsplits\"),\n        [(1, 3), (4, 10), (11, 20)],\n    
)\n",[30,72577,72578,72582,72592,72602,72606,72615,72620,72625,72638,72643,72647,72652,72657,72662,72666,72686,72695,72701,72718,72727,72738,72742,72764,72794,72803,72839,72843,72852,72875,72880,72884,72920,72938,72943,72948,72953,72989,72993,73003,73024,73028,73035,73039,73051,73056,73065,73074,73104],{"__ignoreMap":28},[33,72579,72580],{"class":35,"line":36},[33,72581,57316],{"class":39},[33,72583,72584,72586,72588,72590],{"class":35,"line":43},[33,72585,190],{"class":163},[33,72587,57333],{"class":167},[33,72589,164],{"class":163},[33,72591,66892],{"class":167},[33,72593,72594,72596,72598,72600],{"class":35,"line":61},[33,72595,190],{"class":163},[33,72597,193],{"class":167},[33,72599,164],{"class":163},[33,72601,198],{"class":167},[33,72603,72604],{"class":35,"line":73},[33,72605,92],{"emptyLinePlaceholder":91},[33,72607,72608,72610,72613],{"class":35,"line":88},[33,72609,562],{"class":163},[33,72611,72612],{"class":46}," split_pdf_by_ranges",[33,72614,7637],{"class":167},[33,72616,72617],{"class":35,"line":95},[33,72618,72619],{"class":167},"    input_path: Path,\n",[33,72621,72622],{"class":35,"line":101},[33,72623,72624],{"class":167},"    output_dir: Path,\n",[33,72626,72627,72630,72632,72634,72636],{"class":35,"line":171},[33,72628,72629],{"class":167},"    ranges: list[tuple[",[33,72631,1059],{"class":50},[33,72633,365],{"class":167},[33,72635,1059],{"class":50},[33,72637,47404],{"class":167},[33,72639,72640],{"class":35,"line":179},[33,72641,72642],{"class":167},") -> list[Path]:\n",[33,72644,72645],{"class":35,"line":187},[33,72646,7673],{"class":54},[33,72648,72649],{"class":35,"line":201},[33,72650,72651],{"class":54},"    Split a PDF by 1-based page ranges.\n",[33,72653,72654],{"class":35,"line":206},[33,72655,72656],{"class":54},"    ranges=[(1,3),(4,8)] → two output files.\n",[33,72658,72659],{"class":35,"line":224},[33,72660,72661],{"class":54},"    Returns list of created 
paths.\n",[33,72663,72664],{"class":35,"line":229},[33,72665,7673],{"class":54},[33,72667,72668,72670,72672,72674,72676,72678,72680,72682,72684],{"class":35,"line":235},[33,72669,6346],{"class":167},[33,72671,869],{"class":238},[33,72673,242],{"class":163},[33,72675,855],{"class":50},[33,72677,365],{"class":167},[33,72679,878],{"class":238},[33,72681,242],{"class":163},[33,72683,855],{"class":50},[33,72685,221],{"class":167},[33,72687,72688,72691,72693],{"class":35,"line":250},[33,72689,72690],{"class":167},"    created: list[Path] ",[33,72692,242],{"class":163},[33,72694,589],{"class":167},[33,72696,72697,72699],{"class":35,"line":266},[33,72698,2424],{"class":163},[33,72700,574],{"class":167},[33,72702,72703,72705,72707,72710,72712,72714,72716],{"class":35,"line":290},[33,72704,2191],{"class":163},[33,72706,68213],{"class":50},[33,72708,72709],{"class":167},"(input_path, ",[33,72711,68219],{"class":54},[33,72713,1649],{"class":167},[33,72715,495],{"class":163},[33,72717,67176],{"class":167},[33,72719,72720,72723,72725],{"class":35,"line":295},[33,72721,72722],{"class":167},"            reader ",[33,72724,242],{"class":163},[33,72726,68235],{"class":167},[33,72728,72729,72732,72734,72736],{"class":35,"line":300},[33,72730,72731],{"class":167},"            total ",[33,72733,242],{"class":163},[33,72735,4037],{"class":50},[33,72737,70691],{"class":167},[33,72739,72740],{"class":35,"line":317},[33,72741,92],{"emptyLinePlaceholder":91},[33,72743,72744,72746,72749,72751,72753,72756,72758,72760,72762],{"class":35,"line":332},[33,72745,1793],{"class":163},[33,72747,72748],{"class":167}," idx, (start, end) ",[33,72750,662],{"class":163},[33,72752,7403],{"class":50},[33,72754,72755],{"class":167},"(ranges, 
",[33,72757,7409],{"class":238},[33,72759,242],{"class":163},[33,72761,734],{"class":50},[33,72763,1737],{"class":167},[33,72765,72766,72768,72771,72773,72775,72777,72780,72782,72785,72787,72789,72791],{"class":35,"line":347},[33,72767,7170],{"class":163},[33,72769,72770],{"class":167}," start ",[33,72772,4043],{"class":163},[33,72774,1814],{"class":50},[33,72776,37268],{"class":163},[33,72778,72779],{"class":167}," end ",[33,72781,6009],{"class":163},[33,72783,72784],{"class":167}," total ",[33,72786,7162],{"class":163},[33,72788,72770],{"class":167},[33,72790,6009],{"class":163},[33,72792,72793],{"class":167}," end:\n",[33,72795,72796,72799,72801],{"class":35,"line":374},[33,72797,72798],{"class":163},"                    raise",[33,72800,4054],{"class":50},[33,72802,7637],{"class":167},[33,72804,72805,72808,72811,72813,72815,72817,72820,72822,72825,72827,72830,72832,72834,72836],{"class":35,"line":397},[33,72806,72807],{"class":163},"                        f",[33,72809,72810],{"class":54},"\"Invalid range (",[33,72812,1115],{"class":50},[33,72814,7409],{"class":167},[33,72816,1121],{"class":50},[33,72818,72819],{"class":54},"–",[33,72821,1115],{"class":50},[33,72823,72824],{"class":167},"end",[33,72826,1121],{"class":50},[33,72828,72829],{"class":54},") for ",[33,72831,1115],{"class":50},[33,72833,72401],{"class":167},[33,72835,1121],{"class":50},[33,72837,72838],{"class":54},"-page document\"\n",[33,72840,72841],{"class":35,"line":653},[33,72842,1929],{"class":167},[33,72844,72845,72848,72850],{"class":35,"line":667},[33,72846,72847],{"class":167},"                writer ",[33,72849,242],{"class":163},[33,72851,67154],{"class":167},[33,72853,72854,72856,72858,72860,72862,72865,72867,72869,72872],{"class":35,"line":675},[33,72855,692],{"class":163},[33,72857,72352],{"class":167},[33,72859,662],{"class":163},[33,72861,1801],{"class":50},[33,72863,72864],{"class":167},"(start 
",[33,72866,4126],{"class":163},[33,72868,1814],{"class":50},[33,72870,72871],{"class":167},", end):   ",[33,72873,72874],{"class":39},"# 0-based\n",[33,72876,72877],{"class":35,"line":689},[33,72878,72879],{"class":167},"                    writer.add_page(reader.pages[page_num])\n",[33,72881,72882],{"class":35,"line":703},[33,72883,92],{"emptyLinePlaceholder":91},[33,72885,72886,72888,72890,72892,72894,72896,72898,72900,72903,72905,72908,72910,72913,72916,72918],{"class":35,"line":714},[33,72887,40664],{"class":167},[33,72889,242],{"class":163},[33,72891,6393],{"class":167},[33,72893,1351],{"class":163},[33,72895,1110],{"class":163},[33,72897,274],{"class":54},[33,72899,1115],{"class":50},[33,72901,72902],{"class":167},"input_path.stem",[33,72904,1121],{"class":50},[33,72906,72907],{"class":54},"_part",[33,72909,1115],{"class":50},[33,72911,72912],{"class":167},"idx",[33,72914,72915],{"class":163},":02d",[33,72917,1121],{"class":50},[33,72919,19246],{"class":54},[33,72921,72922,72925,72927,72930,72932,72934,72936],{"class":35,"line":723},[33,72923,72924],{"class":163},"                with",[33,72926,68213],{"class":50},[33,72928,72929],{"class":167},"(out_path, ",[33,72931,67169],{"class":54},[33,72933,1649],{"class":167},[33,72935,495],{"class":163},[33,72937,69097],{"class":167},[33,72939,72940],{"class":35,"line":754},[33,72941,72942],{"class":167},"                    writer.write(out)\n",[33,72944,72945],{"class":35,"line":771},[33,72946,72947],{"class":167},"                writer.close()\n",[33,72949,72950],{"class":35,"line":777},[33,72951,72952],{"class":167},"                created.append(out_path)\n",[33,72954,72955,72957,72959,72961,72964,72966,72968,72970,72972,72974,72977,72979,72981,72983,72985,72987],{"class":35,"line":788},[33,72956,8264],{"class":50},[33,72958,602],{"class":167},[33,72960,4059],{"class":163},[33,72962,72963],{"class":54},"\"Created: 
",[33,72965,1115],{"class":50},[33,72967,40722],{"class":167},[33,72969,1121],{"class":50},[33,72971,18019],{"class":54},[33,72973,1115],{"class":50},[33,72975,72976],{"class":167},"end ",[33,72978,4126],{"class":163},[33,72980,72770],{"class":167},[33,72982,1811],{"class":163},[33,72984,11022],{"class":50},[33,72986,62596],{"class":54},[33,72988,221],{"class":167},[33,72990,72991],{"class":35,"line":804},[33,72992,92],{"emptyLinePlaceholder":91},[33,72994,72995,72997,72999,73001],{"class":35,"line":809},[33,72996,2449],{"class":163},[33,72998,783],{"class":50},[33,73000,1852],{"class":163},[33,73002,1855],{"class":167},[33,73004,73005,73007,73009,73011,73014,73016,73018,73020,73022],{"class":35,"line":819},[33,73006,9414],{"class":50},[33,73008,602],{"class":167},[33,73010,4059],{"class":163},[33,73012,73013],{"class":54},"\"Split failed: ",[33,73015,1115],{"class":50},[33,73017,6565],{"class":167},[33,73019,1121],{"class":50},[33,73021,274],{"class":54},[33,73023,221],{"class":167},[33,73025,73026],{"class":35,"line":829},[33,73027,65922],{"class":163},[33,73029,73030,73032],{"class":35,"line":834},[33,73031,1332],{"class":163},[33,73033,73034],{"class":167}," created\n",[33,73036,73037],{"class":35,"line":839},[33,73038,92],{"emptyLinePlaceholder":91},[33,73040,73041,73043,73045,73047,73049],{"class":35,"line":860},[33,73042,2491],{"class":163},[33,73044,2494],{"class":50},[33,73046,2497],{"class":163},[33,73048,2500],{"class":54},[33,73050,574],{"class":167},[33,73052,73053],{"class":35,"line":887},[33,73054,73055],{"class":167},"    
split_pdf_by_ranges(\n",[33,73057,73058,73060,73063],{"class":35,"line":907},[33,73059,69188],{"class":167},[33,73061,73062],{"class":54},"\".\u002Fannual_report.pdf\"",[33,73064,1506],{"class":167},[33,73066,73067,73069,73072],{"class":35,"line":1826},[33,73068,69188],{"class":167},[33,73070,73071],{"class":54},"\".\u002Foutput\u002Fsplits\"",[33,73073,1506],{"class":167},[33,73075,73076,73079,73081,73083,73085,73087,73089,73091,73093,73095,73097,73099,73101],{"class":35,"line":1844},[33,73077,73078],{"class":167},"        [(",[33,73080,734],{"class":50},[33,73082,365],{"class":167},[33,73084,10258],{"class":50},[33,73086,19834],{"class":167},[33,73088,1503],{"class":50},[33,73090,365],{"class":167},[33,73092,3545],{"class":50},[33,73094,19834],{"class":167},[33,73096,17260],{"class":50},[33,73098,365],{"class":167},[33,73100,2587],{"class":50},[33,73102,73103],{"class":167},")],\n",[33,73105,73106],{"class":35,"line":1858},[33,73107,1202],{"class":167},[14,73109,73110,73111,73114,73115,3035],{},"For a command-line interface with range-string parsing (",[30,73112,73113],{},"\"1-3,4-10\"","), see ",[940,73116,71076],{"href":71075},[18,73118,2709],{"id":2708},[424,73120,73122],{"id":73121},"encrypted-source-files","Encrypted source files",[23,73124,73126],{"className":126,"code":73125,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader, PdfWriter\nfrom pypdf.errors import FileNotDecryptedError\nfrom pathlib import Path\n\ndef merge_with_password(paths: list[Path], password: str, output: Path) -> None:\n    \"\"\"Merge password-protected PDFs after decrypting with a shared password.\"\"\"\n    writer = PdfWriter()\n    try:\n        for p in paths:\n            with open(p, \"rb\") as fh:\n                reader = PdfReader(fh)\n                if reader.is_encrypted:\n                    result = reader.decrypt(password)\n                    if result == 0:\n                        print(f\"[SKIP] Wrong password for {p.name}\")\n    
                    continue\n                writer.append(reader, import_outline=True)\n        with open(output, \"wb\") as out:\n            writer.write(out)\n    except FileNotDecryptedError as exc:\n        print(f\"Decryption failed: {exc}\")\n        raise\n    finally:\n        writer.close()\n",[30,73127,73128,73132,73142,73153,73163,73167,73186,73191,73199,73205,73216,73233,73241,73247,73255,73267,73288,73292,73304,73320,73324,73334,73355,73359,73365],{"__ignoreMap":28},[33,73129,73130],{"class":35,"line":36},[33,73131,57316],{"class":39},[33,73133,73134,73136,73138,73140],{"class":35,"line":43},[33,73135,190],{"class":163},[33,73137,57333],{"class":167},[33,73139,164],{"class":163},[33,73141,66892],{"class":167},[33,73143,73144,73146,73148,73150],{"class":35,"line":61},[33,73145,190],{"class":163},[33,73147,68145],{"class":167},[33,73149,164],{"class":163},[33,73151,73152],{"class":167}," FileNotDecryptedError\n",[33,73154,73155,73157,73159,73161],{"class":35,"line":73},[33,73156,190],{"class":163},[33,73158,193],{"class":167},[33,73160,164],{"class":163},[33,73162,198],{"class":167},[33,73164,73165],{"class":35,"line":88},[33,73166,92],{"emptyLinePlaceholder":91},[33,73168,73169,73171,73174,73177,73179,73182,73184],{"class":35,"line":95},[33,73170,562],{"class":163},[33,73172,73173],{"class":46}," merge_with_password",[33,73175,73176],{"class":167},"(paths: list[Path], password: ",[33,73178,1053],{"class":50},[33,73180,73181],{"class":167},", output: Path) -> ",[33,73183,571],{"class":50},[33,73185,574],{"class":167},[33,73187,73188],{"class":35,"line":101},[33,73189,73190],{"class":54},"    \"\"\"Merge password-protected PDFs after decrypting with a shared 
password.\"\"\"\n",[33,73192,73193,73195,73197],{"class":35,"line":171},[33,73194,68681],{"class":167},[33,73196,242],{"class":163},[33,73198,67154],{"class":167},[33,73200,73201,73203],{"class":35,"line":179},[33,73202,2424],{"class":163},[33,73204,574],{"class":167},[33,73206,73207,73209,73211,73213],{"class":35,"line":187},[33,73208,5973],{"class":163},[33,73210,6127],{"class":167},[33,73212,662],{"class":163},[33,73214,73215],{"class":167}," paths:\n",[33,73217,73218,73220,73222,73225,73227,73229,73231],{"class":35,"line":201},[33,73219,678],{"class":163},[33,73221,68213],{"class":50},[33,73223,73224],{"class":167},"(p, ",[33,73226,68219],{"class":54},[33,73228,1649],{"class":167},[33,73230,495],{"class":163},[33,73232,67176],{"class":167},[33,73234,73235,73237,73239],{"class":35,"line":206},[33,73236,68230],{"class":167},[33,73238,242],{"class":163},[33,73240,68235],{"class":167},[33,73242,73243,73245],{"class":35,"line":224},[33,73244,7170],{"class":163},[33,73246,68749],{"class":167},[33,73248,73249,73251,73253],{"class":35,"line":229},[33,73250,68789],{"class":167},[33,73252,242],{"class":163},[33,73254,68794],{"class":167},[33,73256,73257,73259,73261,73263,73265],{"class":35,"line":235},[33,73258,717],{"class":163},[33,73260,68801],{"class":167},[33,73262,1865],{"class":163},[33,73264,10791],{"class":50},[33,73266,574],{"class":167},[33,73268,73269,73271,73273,73275,73278,73280,73282,73284,73286],{"class":35,"line":250},[33,73270,68763],{"class":50},[33,73272,602],{"class":167},[33,73274,4059],{"class":163},[33,73276,73277],{"class":54},"\"[SKIP] Wrong password for 
",[33,73279,1115],{"class":50},[33,73281,14190],{"class":167},[33,73283,1121],{"class":50},[33,73285,274],{"class":54},[33,73287,221],{"class":167},[33,73289,73290],{"class":35,"line":266},[33,73291,7458],{"class":163},[33,73293,73294,73296,73298,73300,73302],{"class":35,"line":290},[33,73295,68837],{"class":167},[33,73297,68840],{"class":238},[33,73299,242],{"class":163},[33,73301,855],{"class":50},[33,73303,221],{"class":167},[33,73305,73306,73308,73310,73312,73314,73316,73318],{"class":35,"line":295},[33,73307,2191],{"class":163},[33,73309,68213],{"class":50},[33,73311,70532],{"class":167},[33,73313,67169],{"class":54},[33,73315,1649],{"class":167},[33,73317,495],{"class":163},[33,73319,69097],{"class":167},[33,73321,73322],{"class":35,"line":300},[33,73323,71891],{"class":167},[33,73325,73326,73328,73330,73332],{"class":35,"line":317},[33,73327,2449],{"class":163},[33,73329,68904],{"class":167},[33,73331,495],{"class":163},[33,73333,1855],{"class":167},[33,73335,73336,73338,73340,73342,73345,73347,73349,73351,73353],{"class":35,"line":332},[33,73337,9414],{"class":50},[33,73339,602],{"class":167},[33,73341,4059],{"class":163},[33,73343,73344],{"class":54},"\"Decryption failed: ",[33,73346,1115],{"class":50},[33,73348,6565],{"class":167},[33,73350,1121],{"class":50},[33,73352,274],{"class":54},[33,73354,221],{"class":167},[33,73356,73357],{"class":35,"line":347},[33,73358,65922],{"class":163},[33,73360,73361,73363],{"class":35,"line":374},[33,73362,3018],{"class":163},[33,73364,574],{"class":167},[33,73366,73367],{"class":35,"line":397},[33,73368,69048],{"class":167},[14,73370,73371,73372,73374],{},"After merging, the output PDF is unencrypted. 
Re-apply protection as described in ",[940,73373,65967],{"href":65966}," before distributing.",[424,73376,73378],{"id":73377},"mismatched-page-sizes","Mismatched page sizes",[14,73380,73381,73382,73385,73386,73388],{},"When sources have different ",[30,73383,73384],{},"\u002FMediaBox"," dimensions (e.g., mixing A4 and Letter), the merged PDF preserves each page's original size. If uniform sizing is required, copy the target ",[30,73387,73384],{}," onto each page object after appending:",[23,73390,73392],{"className":126,"code":73391,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfWriter, PdfReader\nfrom pypdf.generic import RectangleObject\nfrom pathlib import Path\n\nA4 = RectangleObject((0, 0, 595.28, 841.89))   # points\n\ndef merge_normalize_size(paths: list[Path], output: Path) -> None:\n    writer = PdfWriter()\n    try:\n        for p in paths:\n            with open(p, \"rb\") as fh:\n                reader = PdfReader(fh)\n                writer.append(reader)\n        # Normalize all pages to A4 after appending\n        for page in writer.pages:\n            page.mediabox = A4\n        with open(output, \"wb\") as out:\n            writer.write(out)\n    finally:\n        writer.close()\n",[30,73393,73394,73398,73408,73420,73430,73434,73466,73470,73484,73492,73498,73508,73524,73532,73537,73542,73553,73562,73578,73582,73588],{"__ignoreMap":28},[33,73395,73396],{"class":35,"line":36},[33,73397,57316],{"class":39},[33,73399,73400,73402,73404,73406],{"class":35,"line":43},[33,73401,190],{"class":163},[33,73403,57333],{"class":167},[33,73405,164],{"class":163},[33,73407,68442],{"class":167},[33,73409,73410,73412,73415,73417],{"class":35,"line":61},[33,73411,190],{"class":163},[33,73413,73414],{"class":167}," pypdf.generic ",[33,73416,164],{"class":163},[33,73418,73419],{"class":167}," 
RectangleObject\n",[33,73421,73422,73424,73426,73428],{"class":35,"line":73},[33,73423,190],{"class":163},[33,73425,193],{"class":167},[33,73427,164],{"class":163},[33,73429,198],{"class":167},[33,73431,73432],{"class":35,"line":88},[33,73433,92],{"emptyLinePlaceholder":91},[33,73435,73436,73439,73441,73444,73446,73448,73450,73452,73455,73457,73460,73463],{"class":35,"line":95},[33,73437,73438],{"class":167},"A4 ",[33,73440,242],{"class":163},[33,73442,73443],{"class":167}," RectangleObject((",[33,73445,748],{"class":50},[33,73447,365],{"class":167},[33,73449,748],{"class":50},[33,73451,365],{"class":167},[33,73453,73454],{"class":50},"595.28",[33,73456,365],{"class":167},[33,73458,73459],{"class":50},"841.89",[33,73461,73462],{"class":167},"))   ",[33,73464,73465],{"class":39},"# points\n",[33,73467,73468],{"class":35,"line":101},[33,73469,92],{"emptyLinePlaceholder":91},[33,73471,73472,73474,73477,73480,73482],{"class":35,"line":171},[33,73473,562],{"class":163},[33,73475,73476],{"class":46}," merge_normalize_size",[33,73478,73479],{"class":167},"(paths: list[Path], output: Path) -> 
",[33,73481,571],{"class":50},[33,73483,574],{"class":167},[33,73485,73486,73488,73490],{"class":35,"line":179},[33,73487,68681],{"class":167},[33,73489,242],{"class":163},[33,73491,67154],{"class":167},[33,73493,73494,73496],{"class":35,"line":187},[33,73495,2424],{"class":163},[33,73497,574],{"class":167},[33,73499,73500,73502,73504,73506],{"class":35,"line":201},[33,73501,5973],{"class":163},[33,73503,6127],{"class":167},[33,73505,662],{"class":163},[33,73507,73215],{"class":167},[33,73509,73510,73512,73514,73516,73518,73520,73522],{"class":35,"line":206},[33,73511,678],{"class":163},[33,73513,68213],{"class":50},[33,73515,73224],{"class":167},[33,73517,68219],{"class":54},[33,73519,1649],{"class":167},[33,73521,495],{"class":163},[33,73523,67176],{"class":167},[33,73525,73526,73528,73530],{"class":35,"line":224},[33,73527,68230],{"class":167},[33,73529,242],{"class":163},[33,73531,68235],{"class":167},[33,73533,73534],{"class":35,"line":229},[33,73535,73536],{"class":167},"                writer.append(reader)\n",[33,73538,73539],{"class":35,"line":235},[33,73540,73541],{"class":39},"        # Normalize all pages to A4 after appending\n",[33,73543,73544,73546,73548,73550],{"class":35,"line":250},[33,73545,5973],{"class":163},[33,73547,695],{"class":167},[33,73549,662],{"class":163},[33,73551,73552],{"class":167}," writer.pages:\n",[33,73554,73555,73558,73560],{"class":35,"line":266},[33,73556,73557],{"class":167},"            page.mediabox 
",[33,73559,242],{"class":163},[33,73561,19049],{"class":167},[33,73563,73564,73566,73568,73570,73572,73574,73576],{"class":35,"line":290},[33,73565,2191],{"class":163},[33,73567,68213],{"class":50},[33,73569,70532],{"class":167},[33,73571,67169],{"class":54},[33,73573,1649],{"class":167},[33,73575,495],{"class":163},[33,73577,69097],{"class":167},[33,73579,73580],{"class":35,"line":295},[33,73581,71891],{"class":167},[33,73583,73584,73586],{"class":35,"line":300},[33,73585,3018],{"class":163},[33,73587,574],{"class":167},[33,73589,73590],{"class":35,"line":317},[33,73591,69048],{"class":167},[424,73593,73595],{"id":73594},"split-every-n-pages","Split every N pages",[23,73597,73599],{"className":126,"code":73598,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader, PdfWriter\nfrom pathlib import Path\nimport math\n\ndef split_every_n(input_path: Path, output_dir: Path, n: int) -> list[Path]:\n    \"\"\"Split a PDF into chunks of n pages each.\"\"\"\n    output_dir.mkdir(parents=True, exist_ok=True)\n    created: list[Path] = []\n    with open(input_path, \"rb\") as fh:\n        reader = PdfReader(fh)\n        total = len(reader.pages)\n        chunks = math.ceil(total \u002F n)\n        for chunk_idx in range(chunks):\n            writer = PdfWriter()\n            start = chunk_idx * n\n            end = min(start + n, total)\n            for i in range(start, end):\n                writer.add_page(reader.pages[i])\n            out_path = output_dir \u002F f\"{input_path.stem}_chunk{chunk_idx + 1:02d}.pdf\"\n            with open(out_path, \"wb\") as out:\n                writer.write(out)\n            writer.close()\n            created.append(out_path)\n    return 
created\n",[30,73600,73601,73605,73615,73625,73631,73635,73649,73654,73674,73682,73698,73706,73717,73731,73744,73752,73766,73783,73796,73801,73839,73855,73859,73863,73868],{"__ignoreMap":28},[33,73602,73603],{"class":35,"line":36},[33,73604,57316],{"class":39},[33,73606,73607,73609,73611,73613],{"class":35,"line":43},[33,73608,190],{"class":163},[33,73610,57333],{"class":167},[33,73612,164],{"class":163},[33,73614,66892],{"class":167},[33,73616,73617,73619,73621,73623],{"class":35,"line":61},[33,73618,190],{"class":163},[33,73620,193],{"class":167},[33,73622,164],{"class":163},[33,73624,198],{"class":167},[33,73626,73627,73629],{"class":35,"line":73},[33,73628,164],{"class":163},[33,73630,70042],{"class":167},[33,73632,73633],{"class":35,"line":88},[33,73634,92],{"emptyLinePlaceholder":91},[33,73636,73637,73639,73642,73645,73647],{"class":35,"line":95},[33,73638,562],{"class":163},[33,73640,73641],{"class":46}," split_every_n",[33,73643,73644],{"class":167},"(input_path: Path, output_dir: Path, n: ",[33,73646,1059],{"class":50},[33,73648,72642],{"class":167},[33,73650,73651],{"class":35,"line":101},[33,73652,73653],{"class":54},"    \"\"\"Split a PDF into chunks of n pages 
each.\"\"\"\n",[33,73655,73656,73658,73660,73662,73664,73666,73668,73670,73672],{"class":35,"line":171},[33,73657,6346],{"class":167},[33,73659,869],{"class":238},[33,73661,242],{"class":163},[33,73663,855],{"class":50},[33,73665,365],{"class":167},[33,73667,878],{"class":238},[33,73669,242],{"class":163},[33,73671,855],{"class":50},[33,73673,221],{"class":167},[33,73675,73676,73678,73680],{"class":35,"line":179},[33,73677,72690],{"class":167},[33,73679,242],{"class":163},[33,73681,589],{"class":167},[33,73683,73684,73686,73688,73690,73692,73694,73696],{"class":35,"line":187},[33,73685,1635],{"class":163},[33,73687,68213],{"class":50},[33,73689,72709],{"class":167},[33,73691,68219],{"class":54},[33,73693,1649],{"class":167},[33,73695,495],{"class":163},[33,73697,67176],{"class":167},[33,73699,73700,73702,73704],{"class":35,"line":201},[33,73701,62484],{"class":167},[33,73703,242],{"class":163},[33,73705,68235],{"class":167},[33,73707,73708,73711,73713,73715],{"class":35,"line":206},[33,73709,73710],{"class":167},"        total ",[33,73712,242],{"class":163},[33,73714,4037],{"class":50},[33,73716,70691],{"class":167},[33,73718,73719,73721,73723,73726,73728],{"class":35,"line":224},[33,73720,21926],{"class":167},[33,73722,242],{"class":163},[33,73724,73725],{"class":167}," math.ceil(total ",[33,73727,1351],{"class":163},[33,73729,73730],{"class":167}," n)\n",[33,73732,73733,73735,73737,73739,73741],{"class":35,"line":229},[33,73734,5973],{"class":163},[33,73736,70208],{"class":167},[33,73738,662],{"class":163},[33,73740,1801],{"class":50},[33,73742,73743],{"class":167},"(chunks):\n",[33,73745,73746,73748,73750],{"class":35,"line":235},[33,73747,70275],{"class":167},[33,73749,242],{"class":163},[33,73751,67154],{"class":167},[33,73753,73754,73757,73759,73761,73763],{"class":35,"line":250},[33,73755,73756],{"class":167},"            start ",[33,73758,242],{"class":163},[33,73760,70208],{"class":167},[33,73762,1769],{"class":163},[33,73764,73765],{"class":167}," 
n\n",[33,73767,73768,73771,73773,73776,73778,73780],{"class":35,"line":266},[33,73769,73770],{"class":167},"            end ",[33,73772,242],{"class":163},[33,73774,73775],{"class":50}," min",[33,73777,72864],{"class":167},[33,73779,1811],{"class":163},[33,73781,73782],{"class":167}," n, total)\n",[33,73784,73785,73787,73789,73791,73793],{"class":35,"line":290},[33,73786,1793],{"class":163},[33,73788,47269],{"class":167},[33,73790,662],{"class":163},[33,73792,1801],{"class":50},[33,73794,73795],{"class":167},"(start, end):\n",[33,73797,73798],{"class":35,"line":295},[33,73799,73800],{"class":167},"                writer.add_page(reader.pages[i])\n",[33,73802,73803,73806,73808,73810,73812,73814,73816,73818,73820,73822,73825,73827,73829,73831,73833,73835,73837],{"class":35,"line":300},[33,73804,73805],{"class":167},"            out_path ",[33,73807,242],{"class":163},[33,73809,6393],{"class":167},[33,73811,1351],{"class":163},[33,73813,1110],{"class":163},[33,73815,274],{"class":54},[33,73817,1115],{"class":50},[33,73819,72902],{"class":167},[33,73821,1121],{"class":50},[33,73823,73824],{"class":54},"_chunk",[33,73826,1115],{"class":50},[33,73828,70414],{"class":167},[33,73830,1811],{"class":163},[33,73832,1814],{"class":50},[33,73834,72915],{"class":163},[33,73836,1121],{"class":50},[33,73838,19246],{"class":54},[33,73840,73841,73843,73845,73847,73849,73851,73853],{"class":35,"line":317},[33,73842,678],{"class":163},[33,73844,68213],{"class":50},[33,73846,72929],{"class":167},[33,73848,67169],{"class":54},[33,73850,1649],{"class":167},[33,73852,495],{"class":163},[33,73854,69097],{"class":167},[33,73856,73857],{"class":35,"line":332},[33,73858,70388],{"class":167},[33,73860,73861],{"class":35,"line":347},[33,73862,70393],{"class":167},[33,73864,73865],{"class":35,"line":374},[33,73866,73867],{"class":167},"            
created.append(out_path)\n",[33,73869,73870,73872],{"class":35,"line":397},[33,73871,1332],{"class":163},[33,73873,73034],{"class":167},[18,73875,52030],{"id":52029},[14,73877,73878],{},"After every merge or split, assert structural integrity:",[23,73880,73882],{"className":126,"code":73881,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader\nfrom pathlib import Path\n\ndef validate_pdf_output(\n    output_path: Path,\n    expected_pages: int | None = None,\n) -> bool:\n    \"\"\"Return True if the PDF opens cleanly and page count matches expectation.\"\"\"\n    try:\n        reader = PdfReader(output_path)\n        actual = len(reader.pages)\n        if expected_pages is not None and actual != expected_pages:\n            print(f\"FAIL: expected {expected_pages} pages, got {actual}\")\n            return False\n        print(f\"OK: {output_path.name}  ({actual} pages)\")\n        return True\n    except Exception as exc:\n        print(f\"FAIL: {output_path.name}  ({exc})\")\n        return False\n\nif __name__ == \"__main__\":\n    # After merging 3 files with 5+7+8 pages:\n    validate_pdf_output(Path(\".\u002Foutput\u002Fmerged.pdf\"), 
expected_pages=20)\n",[30,73883,73884,73888,73898,73908,73912,73921,73925,73942,73950,73955,73961,73969,73979,73999,74027,74033,74061,74067,74077,74105,74111,74115,74127,74132],{"__ignoreMap":28},[33,73885,73886],{"class":35,"line":36},[33,73887,57316],{"class":39},[33,73889,73890,73892,73894,73896],{"class":35,"line":43},[33,73891,190],{"class":163},[33,73893,57333],{"class":167},[33,73895,164],{"class":163},[33,73897,57338],{"class":167},[33,73899,73900,73902,73904,73906],{"class":35,"line":61},[33,73901,190],{"class":163},[33,73903,193],{"class":167},[33,73905,164],{"class":163},[33,73907,198],{"class":167},[33,73909,73910],{"class":35,"line":73},[33,73911,92],{"emptyLinePlaceholder":91},[33,73913,73914,73916,73919],{"class":35,"line":88},[33,73915,562],{"class":163},[33,73917,73918],{"class":46}," validate_pdf_output",[33,73920,7637],{"class":167},[33,73922,73923],{"class":35,"line":95},[33,73924,68575],{"class":167},[33,73926,73927,73930,73932,73934,73936,73938,73940],{"class":35,"line":101},[33,73928,73929],{"class":167},"    expected_pages: ",[33,73931,1059],{"class":50},[33,73933,2850],{"class":163},[33,73935,7657],{"class":50},[33,73937,212],{"class":163},[33,73939,7657],{"class":50},[33,73941,247],{"class":167},[33,73943,73944,73946,73948],{"class":35,"line":171},[33,73945,1617],{"class":167},[33,73947,2821],{"class":50},[33,73949,574],{"class":167},[33,73951,73952],{"class":35,"line":179},[33,73953,73954],{"class":54},"    \"\"\"Return True if the PDF opens cleanly and page count matches 
expectation.\"\"\"\n",[33,73956,73957,73959],{"class":35,"line":187},[33,73958,2424],{"class":163},[33,73960,574],{"class":167},[33,73962,73963,73965,73967],{"class":35,"line":201},[33,73964,62484],{"class":167},[33,73966,242],{"class":163},[33,73968,70680],{"class":167},[33,73970,73971,73973,73975,73977],{"class":35,"line":206},[33,73972,25149],{"class":167},[33,73974,242],{"class":163},[33,73976,4037],{"class":50},[33,73978,70691],{"class":167},[33,73980,73981,73983,73985,73987,73989,73991,73993,73995,73997],{"class":35,"line":224},[33,73982,8221],{"class":163},[33,73984,70698],{"class":167},[33,73986,3847],{"class":163},[33,73988,620],{"class":163},[33,73990,7657],{"class":50},[33,73992,5615],{"class":163},[33,73994,25170],{"class":167},[33,73996,17877],{"class":163},[33,73998,70713],{"class":167},[33,74000,74001,74003,74005,74007,74009,74011,74013,74015,74017,74019,74021,74023,74025],{"class":35,"line":229},[33,74002,9364],{"class":50},[33,74004,602],{"class":167},[33,74006,4059],{"class":163},[33,74008,70724],{"class":54},[33,74010,1115],{"class":50},[33,74012,70729],{"class":167},[33,74014,1121],{"class":50},[33,74016,62520],{"class":54},[33,74018,1115],{"class":50},[33,74020,25201],{"class":167},[33,74022,1121],{"class":50},[33,74024,274],{"class":54},[33,74026,221],{"class":167},[33,74028,74029,74031],{"class":35,"line":235},[33,74030,28782],{"class":163},[33,74032,2903],{"class":50},[33,74034,74035,74037,74039,74041,74043,74045,74047,74049,74051,74053,74055,74057,74059],{"class":35,"line":250},[33,74036,9414],{"class":50},[33,74038,602],{"class":167},[33,74040,4059],{"class":163},[33,74042,57480],{"class":54},[33,74044,1115],{"class":50},[33,74046,70764],{"class":167},[33,74048,1121],{"class":50},[33,74050,18019],{"class":54},[33,74052,1115],{"class":50},[33,74054,25201],{"class":167},[33,74056,1121],{"class":50},[33,74058,62596],{"class":54},[33,74060,221],{"class":167},[33,74062,74063,74065],{"class":35,"line":266},[33,74064,1659],{"class":163},[33,74066,
2887],{"class":50},[33,74068,74069,74071,74073,74075],{"class":35,"line":290},[33,74070,2449],{"class":163},[33,74072,783],{"class":50},[33,74074,1852],{"class":163},[33,74076,1855],{"class":167},[33,74078,74079,74081,74083,74085,74087,74089,74091,74093,74095,74097,74099,74101,74103],{"class":35,"line":295},[33,74080,9414],{"class":50},[33,74082,602],{"class":167},[33,74084,4059],{"class":163},[33,74086,70816],{"class":54},[33,74088,1115],{"class":50},[33,74090,70764],{"class":167},[33,74092,1121],{"class":50},[33,74094,18019],{"class":54},[33,74096,1115],{"class":50},[33,74098,6565],{"class":167},[33,74100,1121],{"class":50},[33,74102,72406],{"class":54},[33,74104,221],{"class":167},[33,74106,74107,74109],{"class":35,"line":300},[33,74108,1659],{"class":163},[33,74110,2903],{"class":50},[33,74112,74113],{"class":35,"line":317},[33,74114,92],{"emptyLinePlaceholder":91},[33,74116,74117,74119,74121,74123,74125],{"class":35,"line":332},[33,74118,2491],{"class":163},[33,74120,2494],{"class":50},[33,74122,2497],{"class":163},[33,74124,2500],{"class":54},[33,74126,574],{"class":167},[33,74128,74129],{"class":35,"line":347},[33,74130,74131],{"class":39},"    # After merging 3 files with 5+7+8 pages:\n",[33,74133,74134,74137,74139,74141,74143,74145,74147],{"class":35,"line":374},[33,74135,74136],{"class":167},"    validate_pdf_output(Path(",[33,74138,69199],{"class":54},[33,74140,18525],{"class":167},[33,74142,70729],{"class":238},[33,74144,242],{"class":163},[33,74146,2587],{"class":50},[33,74148,221],{"class":167},[14,74150,74151],{},"Also open a random output file in a PDF viewer after running automated tests — automated checks catch structural errors but not rendering artifacts from corrupt font streams.",[18,74153,21810],{"id":21809},[14,74155,74156,46332,74159,74161],{},[1974,74157,74158],{},"Memory model:",[30,74160,70025],{}," accumulates page references in memory but does not load pixel data. 
Peak memory is proportional to the largest single page's resource dictionary, not the sum of all pages. A merge of 500 one-page documents uses far less memory than a merge of 5 documents with embedded high-resolution images.",[14,74163,74164,74167,74168,74170,74171,74173,74174,74177],{},[1974,74165,74166],{},"Streaming large batches:"," For batches over 200 files, avoid holding all ",[30,74169,68108],{}," objects open simultaneously. The pattern in the merge snippet above — open, append, close inside ",[30,74172,22271],{}," — is correct. Avoid ",[30,74175,74176],{},"readers = [PdfReader(p) for p in files]"," patterns.",[14,74179,74180,74183],{},[1974,74181,74182],{},"Chunked intermediate merge:"," For 1 000+ files, merge in chunks of 100 to temporary files, then do a final merge of the temporaries. This caps peak RAM and isolates corruption to specific chunks.",[23,74185,74187],{"className":126,"code":74186,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfWriter, PdfReader\nfrom pathlib import Path\nimport tempfile, shutil\n\ndef chunked_merge(all_files: list[Path], output: Path, chunk_size: int = 100) -> None:\n    \"\"\"Merge a large list of PDFs in chunks to keep memory bounded.\"\"\"\n    tmp_dir = Path(tempfile.mkdtemp())\n    try:\n        chunk_files: list[Path] = []\n        for i in range(0, len(all_files), chunk_size):\n            chunk = all_files[i:i + chunk_size]\n            chunk_out = tmp_dir \u002F f\"chunk_{i \u002F\u002F chunk_size:04d}.pdf\"\n            writer = PdfWriter()\n            for p in chunk:\n                with open(p, \"rb\") as fh:\n                    writer.append(PdfReader(fh))\n            with open(chunk_out, \"wb\") as out:\n                writer.write(out)\n            writer.close()\n            chunk_files.append(chunk_out)\n\n        # Final merge of chunk files\n        final_writer = PdfWriter()\n        for c in chunk_files:\n            with open(c, \"rb\") as fh:\n                
final_writer.append(PdfReader(fh))\n        with open(output, \"wb\") as out:\n            final_writer.write(out)\n        final_writer.close()\n    finally:\n        shutil.rmtree(tmp_dir, ignore_errors=True)\n",[30,74188,74189,74193,74203,74213,74220,74224,74245,74250,74259,74265,74274,74295,74308,74338,74346,74356,74372,74377,74393,74397,74401,74406,74410,74415,74424,74435,74452,74457,74473,74478,74483,74489],{"__ignoreMap":28},[33,74190,74191],{"class":35,"line":36},[33,74192,57316],{"class":39},[33,74194,74195,74197,74199,74201],{"class":35,"line":43},[33,74196,190],{"class":163},[33,74198,57333],{"class":167},[33,74200,164],{"class":163},[33,74202,68442],{"class":167},[33,74204,74205,74207,74209,74211],{"class":35,"line":61},[33,74206,190],{"class":163},[33,74208,193],{"class":167},[33,74210,164],{"class":163},[33,74212,198],{"class":167},[33,74214,74215,74217],{"class":35,"line":73},[33,74216,164],{"class":163},[33,74218,74219],{"class":167}," tempfile, shutil\n",[33,74221,74222],{"class":35,"line":88},[33,74223,92],{"emptyLinePlaceholder":91},[33,74225,74226,74228,74230,74233,74235,74237,74239,74241,74243],{"class":35,"line":95},[33,74227,562],{"class":163},[33,74229,70104],{"class":46},[33,74231,74232],{"class":167},"(all_files: list[Path], output: Path, chunk_size: ",[33,74234,1059],{"class":50},[33,74236,212],{"class":163},[33,74238,18366],{"class":50},[33,74240,1617],{"class":167},[33,74242,571],{"class":50},[33,74244,574],{"class":167},[33,74246,74247],{"class":35,"line":101},[33,74248,74249],{"class":54},"    \"\"\"Merge a large list of PDFs in chunks to keep memory bounded.\"\"\"\n",[33,74251,74252,74254,74256],{"class":35,"line":171},[33,74253,70147],{"class":167},[33,74255,242],{"class":163},[33,74257,74258],{"class":167}," Path(tempfile.mkdtemp())\n",[33,74260,74261,74263],{"class":35,"line":179},[33,74262,2424],{"class":163},[33,74264,574],{"class":167},[33,74266,74267,74270,74272],{"class":35,"line":187},[33,74268,74269],{"class":167},"        
chunk_files: list[Path] ",[33,74271,242],{"class":163},[33,74273,589],{"class":167},[33,74275,74276,74278,74280,74282,74284,74286,74288,74290,74292],{"class":35,"line":201},[33,74277,5973],{"class":163},[33,74279,47269],{"class":167},[33,74281,662],{"class":163},[33,74283,1801],{"class":50},[33,74285,602],{"class":167},[33,74287,748],{"class":50},[33,74289,365],{"class":167},[33,74291,928],{"class":50},[33,74293,74294],{"class":167},"(all_files), chunk_size):\n",[33,74296,74297,74299,74301,74304,74306],{"class":35,"line":206},[33,74298,70220],{"class":167},[33,74300,242],{"class":163},[33,74302,74303],{"class":167}," all_files[i:i ",[33,74305,1811],{"class":163},[33,74307,70241],{"class":167},[33,74309,74310,74312,74314,74316,74318,74320,74322,74324,74326,74329,74332,74334,74336],{"class":35,"line":224},[33,74311,70246],{"class":167},[33,74313,242],{"class":163},[33,74315,70251],{"class":167},[33,74317,1351],{"class":163},[33,74319,1110],{"class":163},[33,74321,70258],{"class":54},[33,74323,1115],{"class":50},[33,74325,11017],{"class":167},[33,74327,74328],{"class":163},"\u002F\u002F",[33,74330,74331],{"class":167}," chunk_size",[33,74333,70266],{"class":163},[33,74335,1121],{"class":50},[33,74337,19246],{"class":54},[33,74339,74340,74342,74344],{"class":35,"line":229},[33,74341,70275],{"class":167},[33,74343,242],{"class":163},[33,74345,67154],{"class":167},[33,74347,74348,74350,74352,74354],{"class":35,"line":235},[33,74349,1793],{"class":163},[33,74351,6127],{"class":167},[33,74353,662],{"class":163},[33,74355,70290],{"class":167},[33,74357,74358,74360,74362,74364,74366,74368,74370],{"class":35,"line":250},[33,74359,72924],{"class":163},[33,74361,68213],{"class":50},[33,74363,73224],{"class":167},[33,74365,68219],{"class":54},[33,74367,1649],{"class":167},[33,74369,495],{"class":163},[33,74371,67176],{"class":167},[33,74373,74374],{"class":35,"line":266},[33,74375,74376],{"class":167},"                    
writer.append(PdfReader(fh))\n",[33,74378,74379,74381,74383,74385,74387,74389,74391],{"class":35,"line":290},[33,74380,678],{"class":163},[33,74382,68213],{"class":50},[33,74384,70375],{"class":167},[33,74386,67169],{"class":54},[33,74388,1649],{"class":167},[33,74390,495],{"class":163},[33,74392,69097],{"class":167},[33,74394,74395],{"class":35,"line":295},[33,74396,70388],{"class":167},[33,74398,74399],{"class":35,"line":300},[33,74400,70393],{"class":167},[33,74402,74403],{"class":35,"line":317},[33,74404,74405],{"class":167},"            chunk_files.append(chunk_out)\n",[33,74407,74408],{"class":35,"line":332},[33,74409,92],{"emptyLinePlaceholder":91},[33,74411,74412],{"class":35,"line":347},[33,74413,74414],{"class":39},"        # Final merge of chunk files\n",[33,74416,74417,74420,74422],{"class":35,"line":374},[33,74418,74419],{"class":167},"        final_writer ",[33,74421,242],{"class":163},[33,74423,67154],{"class":167},[33,74425,74426,74428,74430,74432],{"class":35,"line":397},[33,74427,5973],{"class":163},[33,74429,7486],{"class":167},[33,74431,662],{"class":163},[33,74433,74434],{"class":167}," chunk_files:\n",[33,74436,74437,74439,74441,74444,74446,74448,74450],{"class":35,"line":653},[33,74438,678],{"class":163},[33,74440,68213],{"class":50},[33,74442,74443],{"class":167},"(c, ",[33,74445,68219],{"class":54},[33,74447,1649],{"class":167},[33,74449,495],{"class":163},[33,74451,67176],{"class":167},[33,74453,74454],{"class":35,"line":667},[33,74455,74456],{"class":167},"                final_writer.append(PdfReader(fh))\n",[33,74458,74459,74461,74463,74465,74467,74469,74471],{"class":35,"line":675},[33,74460,2191],{"class":163},[33,74462,68213],{"class":50},[33,74464,70532],{"class":167},[33,74466,67169],{"class":54},[33,74468,1649],{"class":167},[33,74470,495],{"class":163},[33,74472,69097],{"class":167},[33,74474,74475],{"class":35,"line":689},[33,74476,74477],{"class":167},"            
final_writer.write(out)\n",[33,74479,74480],{"class":35,"line":703},[33,74481,74482],{"class":167},"        final_writer.close()\n",[33,74484,74485,74487],{"class":35,"line":714},[33,74486,3018],{"class":163},[33,74488,574],{"class":167},[33,74490,74491,74493,74495,74497,74499],{"class":35,"line":723},[33,74492,70583],{"class":167},[33,74494,70586],{"class":238},[33,74496,242],{"class":163},[33,74498,855],{"class":50},[33,74500,221],{"class":167},[14,74502,74503,74508,74509,74512,74513,17583,74515,74518,74519,3035],{},[1974,74504,74505,74507],{},[30,74506,68393],{}," for repair:"," If source files have corrupted cross-reference tables (",[30,74510,74511],{},"PdfReadError: EOF marker not found","), open them first with ",[30,74514,68393],{},[30,74516,74517],{},"pikepdf.open(path, allow_overwriting_input=False)","), save a repaired copy, then process with ",[30,74520,65045],{},[18,74522,4271],{"id":4270},[4273,74524,74525,74535],{},[4276,74526,74527],{},[4279,74528,74529,74531,74533],{},[4282,74530,14317],{},[4282,74532,4287],{},[4282,74534,4290],{},[4292,74536,74537,74555,74573,74590,74614,74632],{},[4279,74538,74539,74543,74546],{},[4297,74540,74541],{},[30,74542,74511],{},[4297,74544,74545],{},"Truncated or corrupted file",[4297,74547,74548,74549,74551,74552,74554],{},"Open and re-save with ",[30,74550,68393],{},"; wrap in ",[30,74553,70991],{}," to skip in batch",[4279,74556,74557,74561,74564],{},[4297,74558,74559],{},[30,74560,68095],{},[4297,74562,74563],{},"Accessing pages of encrypted PDF before decrypting",[4297,74565,74566,74567,74569,74570,74572],{},"Call ",[30,74568,68099],{}," and check the return value (",[30,74571,748],{}," = wrong password)",[4279,74574,74575,74580,74583],{},[4297,74576,74577],{},[30,74578,74579],{},"PermissionError: [Errno 13]",[4297,74581,74582],{},"File handle still open (Windows)",[4297,74584,74585,74586,74589],{},"Always use ",[30,74587,74588],{},"with open(path, \"rb\") as fh: reader = PdfReader(fh)"," 
pattern",[4279,74591,74592,74595,74604],{},[4297,74593,74594],{},"Bookmarks missing in merged output",[4297,74596,74597,74599,74600,49047,74602],{},[30,74598,72227],{}," (old default) or using ",[30,74601,71069],{},[30,74603,69240],{},[4297,74605,14408,74606,36661,74608,74610,74611,74613],{},[30,74607,69251],{},[30,74609,69240],{},"; never use ",[30,74612,71069],{}," for full-document merges",[4279,74615,74616,74619,74625],{},[4297,74617,74618],{},"Different page sizes in output",[4297,74620,74621,74622,74624],{},"Source documents have mixed ",[30,74623,73384],{}," values",[4297,74626,74627,74628,74631],{},"Normalize ",[30,74629,74630],{},"page.mediabox"," after appending, or accept per-page sizes",[4279,74633,74634,74637,74642],{},[4297,74635,74636],{},"Output file is 0 bytes",[4297,74638,74639,74641],{},[30,74640,70972],{}," called before any pages appended",[4297,74643,67848,74644,70980],{},[30,74645,74646],{},"len(writer.pages) > 0",[18,74648,14437],{"id":14436},[23,74650,74652],{"className":126,"code":74651,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n# pip install pypdf\n\"\"\"\nmerge_split.py — merge all PDFs in a folder, then split the result by ranges.\nUsage: python merge_split.py --input .\u002Fdocs --output .\u002Fout --split 1-5,6-10\n\"\"\"\nimport argparse\nimport re\nfrom pathlib import Path\n\nfrom pypdf import PdfReader, PdfWriter\nfrom pypdf.errors import PdfReadError\n\n\ndef natural_key(p: Path) -> list:\n    return [int(c) if c.isdigit() else c.lower() for c in re.split(r\"(\\d+)\", p.name)]\n\n\ndef merge(input_dir: Path, output: Path) -> int:\n    \"\"\"Return page count of merged file.\"\"\"\n    writer = PdfWriter()\n    files = sorted(input_dir.glob(\"*.pdf\"), key=natural_key)\n    for f in files:\n        try:\n            with open(f, \"rb\") as fh:\n                writer.append(PdfReader(fh), import_outline=True)\n        except PdfReadError as exc:\n            print(f\"[SKIP] {f.name}: {exc}\")\n    
output.parent.mkdir(parents=True, exist_ok=True)\n    with open(output, \"wb\") as out:\n        writer.write(out)\n    count = len(writer.pages)\n    writer.close()\n    print(f\"Merged {len(files)} files → {output}  ({count} pages)\")\n    return count\n\n\ndef parse_ranges(spec: str) -> list[tuple[int, int]]:\n    \"\"\"Parse '1-5,6-10' → [(1,5),(6,10)].\"\"\"\n    result = []\n    for part in spec.split(\",\"):\n        part = part.strip()\n        if \"-\" in part:\n            a, b = part.split(\"-\", 1)\n            result.append((int(a), int(b)))\n        else:\n            n = int(part)\n            result.append((n, n))\n    return result\n\n\ndef split(input_path: Path, output_dir: Path, ranges: list[tuple[int, int]]) -> None:\n    output_dir.mkdir(parents=True, exist_ok=True)\n    with open(input_path, \"rb\") as fh:\n        reader = PdfReader(fh)\n        total = len(reader.pages)\n        for idx, (start, end) in enumerate(ranges, 1):\n            if not (1 \u003C= start \u003C= end \u003C= total):\n                print(f\"[SKIP] Range ({start}-{end}) invalid for {total}-page doc\")\n                continue\n            writer = PdfWriter()\n            for i in range(start - 1, end):\n                writer.add_page(reader.pages[i])\n            out_path = output_dir \u002F f\"{input_path.stem}_part{idx:02d}.pdf\"\n            with open(out_path, \"wb\") as out:\n                writer.write(out)\n            writer.close()\n            print(f\"  part{idx:02d}: pages {start}–{end} → {out_path.name}\")\n\n\ndef main() -> None:\n    ap = argparse.ArgumentParser(description=\"Merge folder of PDFs, optionally split result\")\n    ap.add_argument(\"--input\", required=True, type=Path, help=\"Directory of source PDFs\")\n    ap.add_argument(\"--output\", required=True, type=Path, help=\"Output directory\")\n    ap.add_argument(\"--split\", default=\"\", help=\"Page ranges to split, e.g. 
'1-5,6-10'\")\n    args = ap.parse_args()\n\n    merged_path = args.output \u002F \"merged.pdf\"\n    merge(args.input, merged_path)\n\n    if args.split:\n        ranges = parse_ranges(args.split)\n        split(merged_path, args.output \u002F \"splits\", ranges)\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,74653,74654,74658,74662,74666,74671,74676,74680,74686,74692,74702,74706,74716,74726,74730,74734,74747,74787,74791,74795,74809,74814,74822,74844,74855,74861,74877,74890,74900,74928,74949,74965,74969,74980,74984,75022,75029,75033,75037,75059,75064,75072,75088,75098,75110,75129,75144,75150,75162,75167,75173,75177,75181,75204,75224,75240,75248,75258,75274,75297,75336,75340,75348,75367,75371,75403,75419,75423,75427,75476,75480,75484,75496,75513,75543,75573,75599,75607,75611,75625,75630,75634,75641,75651,75664,75668,75672,75684],{"__ignoreMap":28},[33,74655,74656],{"class":35,"line":36},[33,74657,14447],{"class":39},[33,74659,74660],{"class":35,"line":43},[33,74661,57316],{"class":39},[33,74663,74664],{"class":35,"line":61},[33,74665,139],{"class":54},[33,74667,74668],{"class":35,"line":73},[33,74669,74670],{"class":54},"merge_split.py — merge all PDFs in a folder, then split the result by ranges.\n",[33,74672,74673],{"class":35,"line":88},[33,74674,74675],{"class":54},"Usage: python merge_split.py --input .\u002Fdocs --output .\u002Fout --split 
1-5,6-10\n",[33,74677,74678],{"class":35,"line":95},[33,74679,139],{"class":54},[33,74681,74682,74684],{"class":35,"line":101},[33,74683,164],{"class":163},[33,74685,4461],{"class":167},[33,74687,74688,74690],{"class":35,"line":171},[33,74689,164],{"class":163},[33,74691,11917],{"class":167},[33,74693,74694,74696,74698,74700],{"class":35,"line":179},[33,74695,190],{"class":163},[33,74697,193],{"class":167},[33,74699,164],{"class":163},[33,74701,198],{"class":167},[33,74703,74704],{"class":35,"line":187},[33,74705,92],{"emptyLinePlaceholder":91},[33,74707,74708,74710,74712,74714],{"class":35,"line":201},[33,74709,190],{"class":163},[33,74711,57333],{"class":167},[33,74713,164],{"class":163},[33,74715,66892],{"class":167},[33,74717,74718,74720,74722,74724],{"class":35,"line":206},[33,74719,190],{"class":163},[33,74721,68145],{"class":167},[33,74723,164],{"class":163},[33,74725,68150],{"class":167},[33,74727,74728],{"class":35,"line":224},[33,74729,92],{"emptyLinePlaceholder":91},[33,74731,74732],{"class":35,"line":229},[33,74733,92],{"emptyLinePlaceholder":91},[33,74735,74736,74738,74741,74743,74745],{"class":35,"line":235},[33,74737,562],{"class":163},[33,74739,74740],{"class":46}," 
natural_key",[33,74742,69364],{"class":167},[33,74744,25066],{"class":50},[33,74746,574],{"class":167},[33,74748,74749,74751,74753,74755,74757,74759,74761,74763,74765,74767,74769,74771,74773,74775,74777,74779,74781,74783,74785],{"class":35,"line":250},[33,74750,1332],{"class":163},[33,74752,9178],{"class":167},[33,74754,1059],{"class":50},[33,74756,68507],{"class":167},[33,74758,2491],{"class":163},[33,74760,68512],{"class":167},[33,74762,7489],{"class":163},[33,74764,69387],{"class":167},[33,74766,6124],{"class":163},[33,74768,7486],{"class":167},[33,74770,662],{"class":163},[33,74772,68528],{"class":167},[33,74774,11977],{"class":163},[33,74776,274],{"class":54},[33,74778,68535],{"class":50},[33,74780,1811],{"class":163},[33,74782,12027],{"class":50},[33,74784,274],{"class":54},[33,74786,69410],{"class":167},[33,74788,74789],{"class":35,"line":266},[33,74790,92],{"emptyLinePlaceholder":91},[33,74792,74793],{"class":35,"line":290},[33,74794,92],{"emptyLinePlaceholder":91},[33,74796,74797,74799,74802,74805,74807],{"class":35,"line":295},[33,74798,562],{"class":163},[33,74800,74801],{"class":46}," merge",[33,74803,74804],{"class":167},"(input_dir: Path, output: Path) -> ",[33,74806,1059],{"class":50},[33,74808,574],{"class":167},[33,74810,74811],{"class":35,"line":300},[33,74812,74813],{"class":54},"    \"\"\"Return page count of merged file.\"\"\"\n",[33,74815,74816,74818,74820],{"class":35,"line":317},[33,74817,68681],{"class":167},[33,74819,242],{"class":163},[33,74821,67154],{"class":167},[33,74823,74824,74827,74829,74831,74833,74835,74837,74839,74841],{"class":35,"line":332},[33,74825,74826],{"class":167},"    files 
",[33,74828,242],{"class":163},[33,74830,28924],{"class":50},[33,74832,68196],{"class":167},[33,74834,610],{"class":54},[33,74836,18525],{"class":167},[33,74838,44114],{"class":238},[33,74840,242],{"class":163},[33,74842,74843],{"class":167},"natural_key)\n",[33,74845,74846,74848,74850,74852],{"class":35,"line":347},[33,74847,656],{"class":163},[33,74849,8832],{"class":167},[33,74851,662],{"class":163},[33,74853,74854],{"class":167}," files:\n",[33,74856,74857,74859],{"class":35,"line":374},[33,74858,670],{"class":163},[33,74860,574],{"class":167},[33,74862,74863,74865,74867,74869,74871,74873,74875],{"class":35,"line":397},[33,74864,678],{"class":163},[33,74866,68213],{"class":50},[33,74868,70306],{"class":167},[33,74870,68219],{"class":54},[33,74872,1649],{"class":167},[33,74874,495],{"class":163},[33,74876,67176],{"class":167},[33,74878,74879,74882,74884,74886,74888],{"class":35,"line":653},[33,74880,74881],{"class":167},"                writer.append(PdfReader(fh), ",[33,74883,68840],{"class":238},[33,74885,242],{"class":163},[33,74887,855],{"class":50},[33,74889,221],{"class":167},[33,74891,74892,74894,74896,74898],{"class":35,"line":667},[33,74893,780],{"class":163},[33,74895,68273],{"class":167},[33,74897,495],{"class":163},[33,74899,1855],{"class":167},[33,74901,74902,74904,74906,74908,74910,74912,74914,74916,74918,74920,74922,74924,74926],{"class":35,"line":675},[33,74903,9364],{"class":50},[33,74905,602],{"class":167},[33,74907,4059],{"class":163},[33,74909,69652],{"class":54},[33,74911,1115],{"class":50},[33,74913,70352],{"class":167},[33,74915,1121],{"class":50},[33,74917,2079],{"class":54},[33,74919,1115],{"class":50},[33,74921,6565],{"class":167},[33,74923,1121],{"class":50},[33,74925,274],{"class":54},[33,74927,221],{"class":167},[33,74929,74930,74933,74935,74937,74939,74941,74943,74945,74947],{"class":35,"line":689},[33,74931,74932],{"class":167},"    
output.parent.mkdir(",[33,74934,869],{"class":238},[33,74936,242],{"class":163},[33,74938,855],{"class":50},[33,74940,365],{"class":167},[33,74942,878],{"class":238},[33,74944,242],{"class":163},[33,74946,855],{"class":50},[33,74948,221],{"class":167},[33,74950,74951,74953,74955,74957,74959,74961,74963],{"class":35,"line":703},[33,74952,1635],{"class":163},[33,74954,68213],{"class":50},[33,74956,70532],{"class":167},[33,74958,67169],{"class":54},[33,74960,1649],{"class":167},[33,74962,495],{"class":163},[33,74964,69097],{"class":167},[33,74966,74967],{"class":35,"line":714},[33,74968,69102],{"class":167},[33,74970,74971,74973,74975,74977],{"class":35,"line":723},[33,74972,40867],{"class":167},[33,74974,242],{"class":163},[33,74976,4037],{"class":50},[33,74978,74979],{"class":167},"(writer.pages)\n",[33,74981,74982],{"class":35,"line":754},[33,74983,69830],{"class":167},[33,74985,74986,74988,74990,74992,74994,74996,74999,75001,75003,75005,75007,75009,75011,75013,75016,75018,75020],{"class":35,"line":771},[33,74987,7268],{"class":50},[33,74989,602],{"class":167},[33,74991,4059],{"class":163},[33,74993,71898],{"class":54},[33,74995,4065],{"class":50},[33,74997,74998],{"class":167},"(files)",[33,75000,1121],{"class":50},[33,75002,69140],{"class":54},[33,75004,1115],{"class":50},[33,75006,70566],{"class":167},[33,75008,1121],{"class":50},[33,75010,18019],{"class":54},[33,75012,1115],{"class":50},[33,75014,75015],{"class":167},"count",[33,75017,1121],{"class":50},[33,75019,62596],{"class":54},[33,75021,221],{"class":167},[33,75023,75024,75026],{"class":35,"line":777},[33,75025,1332],{"class":163},[33,75027,75028],{"class":167}," count\n",[33,75030,75031],{"class":35,"line":788},[33,75032,92],{"emptyLinePlaceholder":91},[33,75034,75035],{"class":35,"line":804},[33,75036,92],{"emptyLinePlaceholder":91},[33,75038,75039,75041,75044,75047,75049,75051,75053,75055,75057],{"class":35,"line":809},[33,75040,562],{"class":163},[33,75042,75043],{"class":46}," 
parse_ranges",[33,75045,75046],{"class":167},"(spec: ",[33,75048,1053],{"class":50},[33,75050,47201],{"class":167},[33,75052,1059],{"class":50},[33,75054,365],{"class":167},[33,75056,1059],{"class":50},[33,75058,43900],{"class":167},[33,75060,75061],{"class":35,"line":819},[33,75062,75063],{"class":54},"    \"\"\"Parse '1-5,6-10' → [(1,5),(6,10)].\"\"\"\n",[33,75065,75066,75068,75070],{"class":35,"line":829},[33,75067,8842],{"class":167},[33,75069,242],{"class":163},[33,75071,589],{"class":167},[33,75073,75074,75076,75079,75081,75084,75086],{"class":35,"line":834},[33,75075,656],{"class":163},[33,75077,75078],{"class":167}," part ",[33,75080,662],{"class":163},[33,75082,75083],{"class":167}," spec.split(",[33,75085,15900],{"class":54},[33,75087,1737],{"class":167},[33,75089,75090,75093,75095],{"class":35,"line":839},[33,75091,75092],{"class":167},"        part ",[33,75094,242],{"class":163},[33,75096,75097],{"class":167}," part.strip()\n",[33,75099,75100,75102,75105,75107],{"class":35,"line":860},[33,75101,8221],{"class":163},[33,75103,75104],{"class":54}," \"-\"",[33,75106,8002],{"class":163},[33,75108,75109],{"class":167}," part:\n",[33,75111,75112,75115,75117,75120,75123,75125,75127],{"class":35,"line":887},[33,75113,75114],{"class":167},"            a, b ",[33,75116,242],{"class":163},[33,75118,75119],{"class":167}," part.split(",[33,75121,75122],{"class":54},"\"-\"",[33,75124,365],{"class":167},[33,75126,734],{"class":50},[33,75128,221],{"class":167},[33,75130,75131,75134,75136,75139,75141],{"class":35,"line":907},[33,75132,75133],{"class":167},"            result.append((",[33,75135,1059],{"class":50},[33,75137,75138],{"class":167},"(a), ",[33,75140,1059],{"class":50},[33,75142,75143],{"class":167},"(b)))\n",[33,75145,75146,75148],{"class":35,"line":1826},[33,75147,41290],{"class":163},[33,75149,574],{"class":167},[33,75151,75152,75155,75157,75159],{"class":35,"line":1844},[33,75153,75154],{"class":167},"            n 
",[33,75156,242],{"class":163},[33,75158,3149],{"class":50},[33,75160,75161],{"class":167},"(part)\n",[33,75163,75164],{"class":35,"line":1858},[33,75165,75166],{"class":167},"            result.append((n, n))\n",[33,75168,75169,75171],{"class":35,"line":1871},[33,75170,1332],{"class":163},[33,75172,49632],{"class":167},[33,75174,75175],{"class":35,"line":1877},[33,75176,92],{"emptyLinePlaceholder":91},[33,75178,75179],{"class":35,"line":1883},[33,75180,92],{"emptyLinePlaceholder":91},[33,75182,75183,75185,75188,75191,75193,75195,75197,75200,75202],{"class":35,"line":1915},[33,75184,562],{"class":163},[33,75186,75187],{"class":46}," split",[33,75189,75190],{"class":167},"(input_path: Path, output_dir: Path, ranges: list[tuple[",[33,75192,1059],{"class":50},[33,75194,365],{"class":167},[33,75196,1059],{"class":50},[33,75198,75199],{"class":167},"]]) -> ",[33,75201,571],{"class":50},[33,75203,574],{"class":167},[33,75205,75206,75208,75210,75212,75214,75216,75218,75220,75222],{"class":35,"line":1926},[33,75207,6346],{"class":167},[33,75209,869],{"class":238},[33,75211,242],{"class":163},[33,75213,855],{"class":50},[33,75215,365],{"class":167},[33,75217,878],{"class":238},[33,75219,242],{"class":163},[33,75221,855],{"class":50},[33,75223,221],{"class":167},[33,75225,75226,75228,75230,75232,75234,75236,75238],{"class":35,"line":1932},[33,75227,1635],{"class":163},[33,75229,68213],{"class":50},[33,75231,72709],{"class":167},[33,75233,68219],{"class":54},[33,75235,1649],{"class":167},[33,75237,495],{"class":163},[33,75239,67176],{"class":167},[33,75241,75242,75244,75246],{"class":35,"line":1938},[33,75243,62484],{"class":167},[33,75245,242],{"class":163},[33,75247,68235],{"class":167},[33,75249,75250,75252,75254,75256],{"class":35,"line":1950},[33,75251,73710],{"class":167},[33,75253,242],{"class":163},[33,75255,4037],{"class":50},[33,75257,70691],{"class":167},[33,75259,75260,75262,75264,75266,75268,75270,75272],{"class":35,"line":1958},[33,75261,5973],{"class":163},[33,7
5263,72748],{"class":167},[33,75265,662],{"class":163},[33,75267,7403],{"class":50},[33,75269,72755],{"class":167},[33,75271,734],{"class":50},[33,75273,1737],{"class":167},[33,75275,75276,75278,75280,75282,75284,75286,75288,75290,75292,75294],{"class":35,"line":4904},[33,75277,5995],{"class":163},[33,75279,620],{"class":163},[33,75281,17583],{"class":167},[33,75283,734],{"class":50},[33,75285,72368],{"class":163},[33,75287,72770],{"class":167},[33,75289,44223],{"class":163},[33,75291,72779],{"class":167},[33,75293,44223],{"class":163},[33,75295,75296],{"class":167}," total):\n",[33,75298,75299,75301,75303,75305,75308,75310,75312,75314,75316,75318,75320,75322,75325,75327,75329,75331,75334],{"class":35,"line":4909},[33,75300,8264],{"class":50},[33,75302,602],{"class":167},[33,75304,4059],{"class":163},[33,75306,75307],{"class":54},"\"[SKIP] Range (",[33,75309,1115],{"class":50},[33,75311,7409],{"class":167},[33,75313,1121],{"class":50},[33,75315,4126],{"class":54},[33,75317,1115],{"class":50},[33,75319,72824],{"class":167},[33,75321,1121],{"class":50},[33,75323,75324],{"class":54},") invalid for ",[33,75326,1115],{"class":50},[33,75328,72401],{"class":167},[33,75330,1121],{"class":50},[33,75332,75333],{"class":54},"-page doc\"",[33,75335,221],{"class":167},[33,75337,75338],{"class":35,"line":4915},[33,75339,12315],{"class":163},[33,75341,75342,75344,75346],{"class":35,"line":4925},[33,75343,70275],{"class":167},[33,75345,242],{"class":163},[33,75347,67154],{"class":167},[33,75349,75350,75352,75354,75356,75358,75360,75362,75364],{"class":35,"line":4935},[33,75351,1793],{"class":163},[33,75353,47269],{"class":167},[33,75355,662],{"class":163},[33,75357,1801],{"class":50},[33,75359,72864],{"class":167},[33,75361,4126],{"class":163},[33,75363,1814],{"class":50},[33,75365,75366],{"class":167},", 
end):\n",[33,75368,75369],{"class":35,"line":4941},[33,75370,73800],{"class":167},[33,75372,75373,75375,75377,75379,75381,75383,75385,75387,75389,75391,75393,75395,75397,75399,75401],{"class":35,"line":4950},[33,75374,73805],{"class":167},[33,75376,242],{"class":163},[33,75378,6393],{"class":167},[33,75380,1351],{"class":163},[33,75382,1110],{"class":163},[33,75384,274],{"class":54},[33,75386,1115],{"class":50},[33,75388,72902],{"class":167},[33,75390,1121],{"class":50},[33,75392,72907],{"class":54},[33,75394,1115],{"class":50},[33,75396,72912],{"class":167},[33,75398,72915],{"class":163},[33,75400,1121],{"class":50},[33,75402,19246],{"class":54},[33,75404,75405,75407,75409,75411,75413,75415,75417],{"class":35,"line":4960},[33,75406,678],{"class":163},[33,75408,68213],{"class":50},[33,75410,72929],{"class":167},[33,75412,67169],{"class":54},[33,75414,1649],{"class":167},[33,75416,495],{"class":163},[33,75418,69097],{"class":167},[33,75420,75421],{"class":35,"line":4965},[33,75422,70388],{"class":167},[33,75424,75425],{"class":35,"line":4971},[33,75426,70393],{"class":167},[33,75428,75429,75431,75433,75435,75438,75440,75442,75444,75446,75449,75451,75453,75455,75457,75459,75461,75463,75465,75467,75470,75472,75474],{"class":35,"line":4983},[33,75430,9364],{"class":50},[33,75432,602],{"class":167},[33,75434,4059],{"class":163},[33,75436,75437],{"class":54},"\"  part",[33,75439,1115],{"class":50},[33,75441,72912],{"class":167},[33,75443,72915],{"class":163},[33,75445,1121],{"class":50},[33,75447,75448],{"class":54},": pages 
",[33,75450,1115],{"class":50},[33,75452,7409],{"class":167},[33,75454,1121],{"class":50},[33,75456,72819],{"class":54},[33,75458,1115],{"class":50},[33,75460,72824],{"class":167},[33,75462,1121],{"class":50},[33,75464,69863],{"class":54},[33,75466,1115],{"class":50},[33,75468,75469],{"class":167},"out_path.name",[33,75471,1121],{"class":50},[33,75473,274],{"class":54},[33,75475,221],{"class":167},[33,75477,75478],{"class":35,"line":4988},[33,75479,92],{"emptyLinePlaceholder":91},[33,75481,75482],{"class":35,"line":4993},[33,75483,92],{"emptyLinePlaceholder":91},[33,75485,75486,75488,75490,75492,75494],{"class":35,"line":5003},[33,75487,562],{"class":163},[33,75489,6636],{"class":46},[33,75491,568],{"class":167},[33,75493,571],{"class":50},[33,75495,574],{"class":167},[33,75497,75498,75500,75502,75504,75506,75508,75511],{"class":35,"line":5008},[33,75499,15498],{"class":167},[33,75501,242],{"class":163},[33,75503,6653],{"class":167},[33,75505,6656],{"class":238},[33,75507,242],{"class":163},[33,75509,75510],{"class":54},"\"Merge folder of PDFs, optionally split 
result\"",[33,75512,221],{"class":167},[33,75514,75515,75517,75519,75521,75523,75525,75527,75529,75531,75533,75535,75537,75539,75541],{"class":35,"line":5014},[33,75516,15516],{"class":167},[33,75518,6672],{"class":54},[33,75520,365],{"class":167},[33,75522,25448],{"class":238},[33,75524,242],{"class":163},[33,75526,855],{"class":50},[33,75528,365],{"class":167},[33,75530,6677],{"class":238},[33,75532,242],{"class":163},[33,75534,6682],{"class":167},[33,75536,25463],{"class":238},[33,75538,242],{"class":163},[33,75540,69933],{"class":54},[33,75542,221],{"class":167},[33,75544,75545,75547,75549,75551,75553,75555,75557,75559,75561,75563,75565,75567,75569,75571],{"class":35,"line":5019},[33,75546,15516],{"class":167},[33,75548,6699],{"class":54},[33,75550,365],{"class":167},[33,75552,25448],{"class":238},[33,75554,242],{"class":163},[33,75556,855],{"class":50},[33,75558,365],{"class":167},[33,75560,6677],{"class":238},[33,75562,242],{"class":163},[33,75564,6682],{"class":167},[33,75566,25463],{"class":238},[33,75568,242],{"class":163},[33,75570,25501],{"class":54},[33,75572,221],{"class":167},[33,75574,75575,75577,75580,75582,75584,75586,75588,75590,75592,75594,75597],{"class":35,"line":5032},[33,75576,15516],{"class":167},[33,75578,75579],{"class":54},"\"--split\"",[33,75581,365],{"class":167},[33,75583,6685],{"class":238},[33,75585,242],{"class":163},[33,75587,3198],{"class":54},[33,75589,365],{"class":167},[33,75591,25463],{"class":238},[33,75593,242],{"class":163},[33,75595,75596],{"class":54},"\"Page ranges to split, e.g. 
'1-5,6-10'\"",[33,75598,221],{"class":167},[33,75600,75601,75603,75605],{"class":35,"line":5039},[33,75602,6766],{"class":167},[33,75604,242],{"class":163},[33,75606,15655],{"class":167},[33,75608,75609],{"class":35,"line":5068},[33,75610,92],{"emptyLinePlaceholder":91},[33,75612,75613,75616,75618,75620,75622],{"class":35,"line":5077},[33,75614,75615],{"class":167},"    merged_path ",[33,75617,242],{"class":163},[33,75619,53645],{"class":167},[33,75621,1351],{"class":163},[33,75623,75624],{"class":54}," \"merged.pdf\"\n",[33,75626,75627],{"class":35,"line":5082},[33,75628,75629],{"class":167},"    merge(args.input, merged_path)\n",[33,75631,75632],{"class":35,"line":5089},[33,75633,92],{"emptyLinePlaceholder":91},[33,75635,75636,75638],{"class":35,"line":5098},[33,75637,617],{"class":163},[33,75639,75640],{"class":167}," args.split:\n",[33,75642,75643,75646,75648],{"class":35,"line":5105},[33,75644,75645],{"class":167},"        ranges ",[33,75647,242],{"class":163},[33,75649,75650],{"class":167}," parse_ranges(args.split)\n",[33,75652,75653,75656,75658,75661],{"class":35,"line":5110},[33,75654,75655],{"class":167},"        split(merged_path, args.output ",[33,75657,1351],{"class":163},[33,75659,75660],{"class":54}," \"splits\"",[33,75662,75663],{"class":167},", ranges)\n",[33,75665,75666],{"class":35,"line":5115},[33,75667,92],{"emptyLinePlaceholder":91},[33,75669,75670],{"class":35,"line":5128},[33,75671,92],{"emptyLinePlaceholder":91},[33,75673,75674,75676,75678,75680,75682],{"class":35,"line":5135},[33,75675,2491],{"class":163},[33,75677,2494],{"class":50},[33,75679,2497],{"class":163},[33,75681,2500],{"class":54},[33,75683,574],{"class":167},[33,75685,75686],{"class":35,"line":5142},[33,75687,6914],{"class":167},[18,75689,75691],{"id":75690},"guides-in-this-section","Guides in This 
Section",[4211,75693,75694,75700],{},[4214,75695,75696,75699],{},[940,75697,68020],{"href":75698},"\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Fbatch-merge-pdfs-with-python-script\u002F"," — argparse CLI for merging all PDFs in a folder with natural sort and error recovery",[4214,75701,75702,75704],{},[940,75703,71076],{"href":71075}," — parse a ranges string, off-by-one pitfalls, split every N pages, split on bookmarks",[18,75706,6918],{"id":6917},[4211,75708,75709,75714,75719],{},[4214,75710,75711,75713],{},[940,75712,26191],{"href":19001}," — use merge operations to assemble cover pages, body sections, and appendices into final reports",[4214,75715,75716,75718],{},[940,75717,65967],{"href":65966}," — apply password protection and watermarks after the merge\u002Fsplit step",[4214,75720,75721,75723],{},[940,75722,9592],{"href":942}," — coordinate-based parsing of the content inside the pages you are assembling",[14,75725,6947,75726,3035],{},[940,75727,6943],{"href":6942},[6953,75729,26204],{},{"title":28,"searchDepth":43,"depth":43,"links":75731},[75732,75733,75734,75740,75743,75748,75749,75750,75751,75752,75753],{"id":20,"depth":43,"text":21},{"id":71264,"depth":43,"text":71265},{"id":71488,"depth":43,"text":71489,"children":75735},[75736,75738,75739],{"id":71602,"depth":61,"text":75737},"Step 1 — Use append(), not add_page()",{"id":71998,"depth":61,"text":71999},{"id":72235,"depth":61,"text":72236},{"id":72564,"depth":43,"text":72565,"children":75741},[75742],{"id":72568,"depth":61,"text":72569},{"id":2708,"depth":43,"text":2709,"children":75744},[75745,75746,75747],{"id":73121,"depth":61,"text":73122},{"id":73377,"depth":61,"text":73378},{"id":73594,"depth":61,"text":73595},{"id":52029,"depth":43,"text":52030},{"id":21809,"depth":43,"text":21810},{"id":4270,"depth":43,"text":4271},{"id":14436,"depth":43,"text":14437},{"id":75690,"depth":43,"text":75691},{"id":6917,"depth":43,"text":6918},"Merge & Split PDFs","Merge 
and split PDF files with pypdf — PdfReader, PdfWriter, page reordering, bookmark preservation, and memory-efficient streaming for large batches.",{},"\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents",{"title":52682,"description":75755},"Merge and Split PDF Documents with Python","automating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Findex",[9631,47,65045,75762],"batch processing","ixfjcaFd7cruapizievhgT7HR19koOQKxp1XcWrqjYc",{"id":75765,"title":71076,"body":75766,"breadcrumbTitle":79632,"canonical":6977,"date":6978,"description":79633,"draft":6980,"extension":6981,"image":6977,"meta":79634,"navigation":91,"path":79635,"robots":6977,"seo":79636,"seoTitle":79637,"stem":79638,"tags":79639,"updatedAt":6978,"__hash__":79641},"content\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Fsplit-pdf-by-page-ranges-with-python\u002Findex.md",{"type":7,"value":75767,"toc":79620},[75768,75771,75778,75791,75795,75806,75818,75831,75837,75841,76075,76078,76082,76731,76737,76741,76747,77138,77141,77887,77891,77894,78344,78351,78355,78358,78975,78981,78983,79421,79424,79432,79434,79540,79542,79561,79570,79579,79595,79597,79614,79618],[10,75769,71076],{"id":75770},"split-a-pdf-by-page-ranges-with-python",[14,75772,75773,75774,75777],{},"You have a 40-page PDF — a monthly report, a contract, a scanned bundle — and you need to produce separate files for pages 1–10, 11–25, and 26–40. A naive slice loop either writes the wrong pages or crashes because pypdf uses 0-based indexing while every PDF viewer shows 1-based page numbers. 
This guide covers the correct split pattern, the range-string parser that powers a ",[30,75775,75776],"--split \"1-10,11-25,26-40\""," CLI argument, and three variants: split every N pages, split on bookmarks, and handle invalid ranges gracefully.",[14,75779,75780,75781,43180,75784,10065,75786,75788,75789,3035],{},"All split operations use ",[940,75782,65045],{"href":68051,"rel":75783},[1367],[30,75785,68108],{},[30,75787,70025],{},"; the same library handles the merging side of the workflow in ",[940,75790,52682],{"href":52681},[18,75792,75794],{"id":75793},"root-cause-off-by-one-between-ui-and-pypdf","Root Cause: Off-by-One Between UI and pypdf",[14,75796,75797,75798,75801,75802,75805],{},"PDF viewers display page 1 as the first page. pypdf stores pages in a zero-indexed list: ",[30,75799,75800],{},"reader.pages[0]"," is page 1, ",[30,75803,75804],{},"reader.pages[1]"," is page 2, and so on.",[14,75807,75808,75809,75812,75813,1351,75815,75817],{},"The error appears silently: a script that writes ",[30,75810,75811],{},"range(start, end)"," with 1-based ",[30,75814,7409],{},[30,75816,72824],{}," values starts one page too late, so the first page of each range is silently dropped and every output file comes out one page short.",[14,75819,75820,75821,75823,75824,75827,75828,20891],{},"The fix is mechanical — always subtract 1 from the user-visible start when entering pypdf's index space, and use ",[30,75822,72824],{}," (not ",[30,75825,75826],{},"end - 1",") as the exclusive stop of ",[30,75829,75830],{},"range()",[23,75832,75835],{"className":75833,"code":75834,"language":2000},[1998],"User says \"pages 1–5\"\npypdf indices: 0, 1, 2, 3, 4\nrange() call:  range(1 - 1, 5)  →  range(0, 5)  ✓\n",[30,75836,75834],{"__ignoreMap":28},[18,75838,75840],{"id":75839},"diagnostic-confirm-page-count-before-splitting","Diagnostic: Confirm Page Count Before Splitting",[23,75842,75844],{"className":126,"code":75843,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader\nfrom pathlib import 
Path\n\ndef inspect(path: Path) -> None:\n    \"\"\"Print page count and first-level outline entries.\"\"\"\n    try:\n        reader = PdfReader(path)\n        print(f\"{path.name}: {len(reader.pages)} pages, encrypted={reader.is_encrypted}\")\n        for item in reader.outline:\n            if hasattr(item, \"title\"):\n                pg = reader.get_destination_page_number(item) + 1\n                print(f\"  [p{pg}] {item.title}\")\n    except Exception as exc:\n        print(f\"Could not inspect {path.name}: {exc}\")\n\nif __name__ == \"__main__\":\n    inspect(Path(\".\u002Fsource_document.pdf\"))\n",[30,75845,75846,75850,75860,75870,75874,75887,75892,75898,75906,75944,75955,75968,75981,76011,76021,76049,76053,76065],{"__ignoreMap":28},[33,75847,75848],{"class":35,"line":36},[33,75849,57316],{"class":39},[33,75851,75852,75854,75856,75858],{"class":35,"line":43},[33,75853,190],{"class":163},[33,75855,57333],{"class":167},[33,75857,164],{"class":163},[33,75859,57338],{"class":167},[33,75861,75862,75864,75866,75868],{"class":35,"line":61},[33,75863,190],{"class":163},[33,75865,193],{"class":167},[33,75867,164],{"class":163},[33,75869,198],{"class":167},[33,75871,75872],{"class":35,"line":73},[33,75873,92],{"emptyLinePlaceholder":91},[33,75875,75876,75878,75881,75883,75885],{"class":35,"line":88},[33,75877,562],{"class":163},[33,75879,75880],{"class":46}," inspect",[33,75882,3743],{"class":167},[33,75884,571],{"class":50},[33,75886,574],{"class":167},[33,75888,75889],{"class":35,"line":95},[33,75890,75891],{"class":54},"    \"\"\"Print page count and first-level outline 
entries.\"\"\"\n",[33,75893,75894,75896],{"class":35,"line":101},[33,75895,2424],{"class":163},[33,75897,574],{"class":167},[33,75899,75900,75902,75904],{"class":35,"line":171},[33,75901,62484],{"class":167},[33,75903,242],{"class":163},[33,75905,71334],{"class":167},[33,75907,75908,75910,75912,75914,75916,75918,75920,75922,75924,75926,75928,75930,75933,75935,75938,75940,75942],{"class":35,"line":179},[33,75909,9414],{"class":50},[33,75911,602],{"class":167},[33,75913,4059],{"class":163},[33,75915,274],{"class":54},[33,75917,1115],{"class":50},[33,75919,57398],{"class":167},[33,75921,1121],{"class":50},[33,75923,2079],{"class":54},[33,75925,4065],{"class":50},[33,75927,59322],{"class":167},[33,75929,1121],{"class":50},[33,75931,75932],{"class":54}," pages, encrypted=",[33,75934,1115],{"class":50},[33,75936,75937],{"class":167},"reader.is_encrypted",[33,75939,1121],{"class":50},[33,75941,274],{"class":54},[33,75943,221],{"class":167},[33,75945,75946,75948,75950,75952],{"class":35,"line":187},[33,75947,5973],{"class":163},[33,75949,54203],{"class":167},[33,75951,662],{"class":163},[33,75953,75954],{"class":167}," reader.outline:\n",[33,75956,75957,75959,75962,75964,75966],{"class":35,"line":201},[33,75958,5995],{"class":163},[33,75960,75961],{"class":50}," hasattr",[33,75963,72109],{"class":167},[33,75965,62210],{"class":54},[33,75967,1737],{"class":167},[33,75969,75970,75973,75975,75977,75979],{"class":35,"line":206},[33,75971,75972],{"class":167},"                pg ",[33,75974,242],{"class":163},[33,75976,72140],{"class":167},[33,75978,1811],{"class":163},[33,75980,17709],{"class":50},[33,75982,75983,75985,75987,75989,75992,75994,75997,75999,76001,76003,76005,76007,76009],{"class":35,"line":224},[33,75984,8264],{"class":50},[33,75986,602],{"class":167},[33,75988,4059],{"class":163},[33,75990,75991],{"class":54},"\"  
[p",[33,75993,1115],{"class":50},[33,75995,75996],{"class":167},"pg",[33,75998,1121],{"class":50},[33,76000,763],{"class":54},[33,76002,1115],{"class":50},[33,76004,72181],{"class":167},[33,76006,1121],{"class":50},[33,76008,274],{"class":54},[33,76010,221],{"class":167},[33,76012,76013,76015,76017,76019],{"class":35,"line":229},[33,76014,2449],{"class":163},[33,76016,783],{"class":50},[33,76018,1852],{"class":163},[33,76020,1855],{"class":167},[33,76022,76023,76025,76027,76029,76031,76033,76035,76037,76039,76041,76043,76045,76047],{"class":35,"line":235},[33,76024,9414],{"class":50},[33,76026,602],{"class":167},[33,76028,4059],{"class":163},[33,76030,46661],{"class":54},[33,76032,1115],{"class":50},[33,76034,57398],{"class":167},[33,76036,1121],{"class":50},[33,76038,2079],{"class":54},[33,76040,1115],{"class":50},[33,76042,6565],{"class":167},[33,76044,1121],{"class":50},[33,76046,274],{"class":54},[33,76048,221],{"class":167},[33,76050,76051],{"class":35,"line":250},[33,76052,92],{"emptyLinePlaceholder":91},[33,76054,76055,76057,76059,76061,76063],{"class":35,"line":266},[33,76056,2491],{"class":163},[33,76058,2494],{"class":50},[33,76060,2497],{"class":163},[33,76062,2500],{"class":54},[33,76064,574],{"class":167},[33,76066,76067,76070,76073],{"class":35,"line":290},[33,76068,76069],{"class":167},"    inspect(Path(",[33,76071,76072],{"class":54},"\".\u002Fsource_document.pdf\"",[33,76074,371],{"class":167},[14,76076,76077],{},"Run this before splitting. Knowing the total page count lets you validate every requested range before writing a single byte. 
The outline listing is useful for the bookmark-split variant below.",[18,76079,76081],{"id":76080},"fix-implementation-range-based-split","Fix Implementation: Range-Based Split",[23,76083,76085],{"className":126,"code":76084,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader, PdfWriter\nfrom pathlib import Path\n\n\ndef split_by_ranges(\n    input_path: Path,\n    output_dir: Path,\n    ranges: list[tuple[int, int]],\n) -> list[Path]:\n    \"\"\"\n    Split input_path into one PDF per range.\n\n    ranges is a list of (start, end) tuples using 1-based page numbers\n    (matching what PDF viewers show).  Both start and end are inclusive.\n\n    Returns the list of output paths created.\n    \"\"\"\n    output_dir.mkdir(parents=True, exist_ok=True)\n    created: list[Path] = []\n\n    try:\n        with open(input_path, \"rb\") as fh:\n            reader = PdfReader(fh)\n            total = len(reader.pages)\n\n            for idx, (start, end) in enumerate(ranges, start=1):\n                # Validate before touching the writer\n                if start \u003C 1:\n                    raise ValueError(f\"Range {idx}: start={start} must be >= 1\")\n                if end > total:\n                    raise ValueError(\n                        f\"Range {idx}: end={end} exceeds document length ({total} pages)\"\n                    )\n                if start > end:\n                    raise ValueError(f\"Range {idx}: start={start} > end={end}\")\n\n                writer = PdfWriter()\n                # KEY: subtract 1 from start to convert 1-based → 0-based;\n                #      end is the exclusive upper bound for range(), so no adjustment.\n                for page_idx in range(start - 1, end):\n                    writer.add_page(reader.pages[page_idx])\n\n                out_path = output_dir \u002F f\"{input_path.stem}_part{idx:02d}.pdf\"\n                with open(out_path, \"wb\") as out:\n                    
writer.write(out)\n                writer.close()\n                created.append(out_path)\n                print(f\"part{idx:02d}: pages {start}–{end} ({end - start + 1} pages) → {out_path.name}\")\n\n    except Exception as exc:\n        print(f\"Split failed: {exc}\")\n        raise\n\n    return created\n\n\nif __name__ == \"__main__\":\n    split_by_ranges(\n        Path(\".\u002Fannual_report.pdf\"),\n        Path(\".\u002Foutput\u002Fsplits\"),\n        [(1, 10), (11, 25), (26, 40)],\n    )\n",[30,76086,76087,76091,76101,76111,76115,76119,76128,76132,76136,76148,76152,76156,76161,76165,76170,76175,76179,76184,76188,76208,76216,76220,76226,76242,76250,76260,76264,76284,76289,76301,76334,76344,76352,76384,76388,76398,76437,76441,76449,76454,76459,76478,76483,76487,76519,76535,76539,76543,76547,76609,76613,76623,76643,76647,76651,76657,76661,76665,76677,76682,76690,76698,76727],{"__ignoreMap":28},[33,76088,76089],{"class":35,"line":36},[33,76090,57316],{"class":39},[33,76092,76093,76095,76097,76099],{"class":35,"line":43},[33,76094,190],{"class":163},[33,76096,57333],{"class":167},[33,76098,164],{"class":163},[33,76100,66892],{"class":167},[33,76102,76103,76105,76107,76109],{"class":35,"line":61},[33,76104,190],{"class":163},[33,76106,193],{"class":167},[33,76108,164],{"class":163},[33,76110,198],{"class":167},[33,76112,76113],{"class":35,"line":73},[33,76114,92],{"emptyLinePlaceholder":91},[33,76116,76117],{"class":35,"line":88},[33,76118,92],{"emptyLinePlaceholder":91},[33,76120,76121,76123,76126],{"class":35,"line":95},[33,76122,562],{"class":163},[33,76124,76125],{"class":46}," 
split_by_ranges",[33,76127,7637],{"class":167},[33,76129,76130],{"class":35,"line":101},[33,76131,72619],{"class":167},[33,76133,76134],{"class":35,"line":171},[33,76135,72624],{"class":167},[33,76137,76138,76140,76142,76144,76146],{"class":35,"line":179},[33,76139,72629],{"class":167},[33,76141,1059],{"class":50},[33,76143,365],{"class":167},[33,76145,1059],{"class":50},[33,76147,47404],{"class":167},[33,76149,76150],{"class":35,"line":187},[33,76151,72642],{"class":167},[33,76153,76154],{"class":35,"line":201},[33,76155,7673],{"class":54},[33,76157,76158],{"class":35,"line":206},[33,76159,76160],{"class":54},"    Split input_path into one PDF per range.\n",[33,76162,76163],{"class":35,"line":224},[33,76164,92],{"emptyLinePlaceholder":91},[33,76166,76167],{"class":35,"line":229},[33,76168,76169],{"class":54},"    ranges is a list of (start, end) tuples using 1-based page numbers\n",[33,76171,76172],{"class":35,"line":235},[33,76173,76174],{"class":54},"    (matching what PDF viewers show).  
Both start and end are inclusive.\n",[33,76176,76177],{"class":35,"line":250},[33,76178,92],{"emptyLinePlaceholder":91},[33,76180,76181],{"class":35,"line":266},[33,76182,76183],{"class":54},"    Returns the list of output paths created.\n",[33,76185,76186],{"class":35,"line":290},[33,76187,7673],{"class":54},[33,76189,76190,76192,76194,76196,76198,76200,76202,76204,76206],{"class":35,"line":295},[33,76191,6346],{"class":167},[33,76193,869],{"class":238},[33,76195,242],{"class":163},[33,76197,855],{"class":50},[33,76199,365],{"class":167},[33,76201,878],{"class":238},[33,76203,242],{"class":163},[33,76205,855],{"class":50},[33,76207,221],{"class":167},[33,76209,76210,76212,76214],{"class":35,"line":300},[33,76211,72690],{"class":167},[33,76213,242],{"class":163},[33,76215,589],{"class":167},[33,76217,76218],{"class":35,"line":317},[33,76219,92],{"emptyLinePlaceholder":91},[33,76221,76222,76224],{"class":35,"line":332},[33,76223,2424],{"class":163},[33,76225,574],{"class":167},[33,76227,76228,76230,76232,76234,76236,76238,76240],{"class":35,"line":347},[33,76229,2191],{"class":163},[33,76231,68213],{"class":50},[33,76233,72709],{"class":167},[33,76235,68219],{"class":54},[33,76237,1649],{"class":167},[33,76239,495],{"class":163},[33,76241,67176],{"class":167},[33,76243,76244,76246,76248],{"class":35,"line":374},[33,76245,72722],{"class":167},[33,76247,242],{"class":163},[33,76249,68235],{"class":167},[33,76251,76252,76254,76256,76258],{"class":35,"line":397},[33,76253,72731],{"class":167},[33,76255,242],{"class":163},[33,76257,4037],{"class":50},[33,76259,70691],{"class":167},[33,76261,76262],{"class":35,"line":653},[33,76263,92],{"emptyLinePlaceholder":91},[33,76265,76266,76268,76270,76272,76274,76276,76278,76280,76282],{"class":35,"line":667},[33,76267,1793],{"class":163},[33,76269,72748],{"class":167},[33,76271,662],{"class":163},[33,76273,7403],{"class":50},[33,76275,72755],{"class":167},[33,76277,7409],{"class":238},[33,76279,242],{"class":163},[33,76281,734],{"
class":50},[33,76283,1737],{"class":167},[33,76285,76286],{"class":35,"line":675},[33,76287,76288],{"class":39},"                # Validate before touching the writer\n",[33,76290,76291,76293,76295,76297,76299],{"class":35,"line":689},[33,76292,7170],{"class":163},[33,76294,72770],{"class":167},[33,76296,4043],{"class":163},[33,76298,1814],{"class":50},[33,76300,574],{"class":167},[33,76302,76303,76305,76307,76309,76311,76314,76316,76318,76320,76323,76325,76327,76329,76332],{"class":35,"line":703},[33,76304,72798],{"class":163},[33,76306,4054],{"class":50},[33,76308,602],{"class":167},[33,76310,4059],{"class":163},[33,76312,76313],{"class":54},"\"Range ",[33,76315,1115],{"class":50},[33,76317,72912],{"class":167},[33,76319,1121],{"class":50},[33,76321,76322],{"class":54},": start=",[33,76324,1115],{"class":50},[33,76326,7409],{"class":167},[33,76328,1121],{"class":50},[33,76330,76331],{"class":54}," must be >= 1\"",[33,76333,221],{"class":167},[33,76335,76336,76338,76340,76342],{"class":35,"line":714},[33,76337,7170],{"class":163},[33,76339,72779],{"class":167},[33,76341,6009],{"class":163},[33,76343,72375],{"class":167},[33,76345,76346,76348,76350],{"class":35,"line":723},[33,76347,72798],{"class":163},[33,76349,4054],{"class":50},[33,76351,7637],{"class":167},[33,76353,76354,76356,76358,76360,76362,76364,76367,76369,76371,76373,76376,76378,76380,76382],{"class":35,"line":754},[33,76355,72807],{"class":163},[33,76357,76313],{"class":54},[33,76359,1115],{"class":50},[33,76361,72912],{"class":167},[33,76363,1121],{"class":50},[33,76365,76366],{"class":54},": end=",[33,76368,1115],{"class":50},[33,76370,72824],{"class":167},[33,76372,1121],{"class":50},[33,76374,76375],{"class":54}," exceeds document length 
(",[33,76377,1115],{"class":50},[33,76379,72401],{"class":167},[33,76381,1121],{"class":50},[33,76383,68266],{"class":54},[33,76385,76386],{"class":35,"line":771},[33,76387,1929],{"class":167},[33,76389,76390,76392,76394,76396],{"class":35,"line":777},[33,76391,7170],{"class":163},[33,76393,72770],{"class":167},[33,76395,6009],{"class":163},[33,76397,72793],{"class":167},[33,76399,76400,76402,76404,76406,76408,76410,76412,76414,76416,76418,76420,76422,76424,76427,76429,76431,76433,76435],{"class":35,"line":788},[33,76401,72798],{"class":163},[33,76403,4054],{"class":50},[33,76405,602],{"class":167},[33,76407,4059],{"class":163},[33,76409,76313],{"class":54},[33,76411,1115],{"class":50},[33,76413,72912],{"class":167},[33,76415,1121],{"class":50},[33,76417,76322],{"class":54},[33,76419,1115],{"class":50},[33,76421,7409],{"class":167},[33,76423,1121],{"class":50},[33,76425,76426],{"class":54}," > end=",[33,76428,1115],{"class":50},[33,76430,72824],{"class":167},[33,76432,1121],{"class":50},[33,76434,274],{"class":54},[33,76436,221],{"class":167},[33,76438,76439],{"class":35,"line":804},[33,76440,92],{"emptyLinePlaceholder":91},[33,76442,76443,76445,76447],{"class":35,"line":809},[33,76444,72847],{"class":167},[33,76446,242],{"class":163},[33,76448,67154],{"class":167},[33,76450,76451],{"class":35,"line":819},[33,76452,76453],{"class":39},"                # KEY: subtract 1 from start to convert 1-based → 0-based;\n",[33,76455,76456],{"class":35,"line":829},[33,76457,76458],{"class":39},"                #      end is the exclusive upper bound for range(), so no adjustment.\n",[33,76460,76461,76463,76466,76468,76470,76472,76474,76476],{"class":35,"line":834},[33,76462,692],{"class":163},[33,76464,76465],{"class":167}," page_idx 
",[33,76467,662],{"class":163},[33,76469,1801],{"class":50},[33,76471,72864],{"class":167},[33,76473,4126],{"class":163},[33,76475,1814],{"class":50},[33,76477,75366],{"class":167},[33,76479,76480],{"class":35,"line":839},[33,76481,76482],{"class":167},"                    writer.add_page(reader.pages[page_idx])\n",[33,76484,76485],{"class":35,"line":860},[33,76486,92],{"emptyLinePlaceholder":91},[33,76488,76489,76491,76493,76495,76497,76499,76501,76503,76505,76507,76509,76511,76513,76515,76517],{"class":35,"line":887},[33,76490,40664],{"class":167},[33,76492,242],{"class":163},[33,76494,6393],{"class":167},[33,76496,1351],{"class":163},[33,76498,1110],{"class":163},[33,76500,274],{"class":54},[33,76502,1115],{"class":50},[33,76504,72902],{"class":167},[33,76506,1121],{"class":50},[33,76508,72907],{"class":54},[33,76510,1115],{"class":50},[33,76512,72912],{"class":167},[33,76514,72915],{"class":163},[33,76516,1121],{"class":50},[33,76518,19246],{"class":54},[33,76520,76521,76523,76525,76527,76529,76531,76533],{"class":35,"line":907},[33,76522,72924],{"class":163},[33,76524,68213],{"class":50},[33,76526,72929],{"class":167},[33,76528,67169],{"class":54},[33,76530,1649],{"class":167},[33,76532,495],{"class":163},[33,76534,69097],{"class":167},[33,76536,76537],{"class":35,"line":1826},[33,76538,72942],{"class":167},[33,76540,76541],{"class":35,"line":1844},[33,76542,72947],{"class":167},[33,76544,76545],{"class":35,"line":1858},[33,76546,72952],{"class":167},[33,76548,76549,76551,76553,76555,76558,76560,76562,76564,76566,76568,76570,76572,76574,76576,76578,76580,76582,76584,76586,76588,76590,76592,76594,76596,76599,76601,76603,76605,76607],{"class":35,"line":1871},[33,76550,8264],{"class":50},[33,76552,602],{"class":167},[33,76554,4059],{"class":163},[33,76556,76557],{"class":54},"\"part",[33,76559,1115],{"class":50},[33,76561,72912],{"class":167},[33,76563,72915],{"class":163},[33,76565,1121],{"class":50},[33,76567,75448],{"class":54},[33,76569,1115],{"class":50},[33,
76571,7409],{"class":167},[33,76573,1121],{"class":50},[33,76575,72819],{"class":54},[33,76577,1115],{"class":50},[33,76579,72824],{"class":167},[33,76581,1121],{"class":50},[33,76583,17583],{"class":54},[33,76585,1115],{"class":50},[33,76587,72976],{"class":167},[33,76589,4126],{"class":163},[33,76591,72770],{"class":167},[33,76593,1811],{"class":163},[33,76595,11022],{"class":50},[33,76597,76598],{"class":54}," pages) → ",[33,76600,1115],{"class":50},[33,76602,75469],{"class":167},[33,76604,1121],{"class":50},[33,76606,274],{"class":54},[33,76608,221],{"class":167},[33,76610,76611],{"class":35,"line":1877},[33,76612,92],{"emptyLinePlaceholder":91},[33,76614,76615,76617,76619,76621],{"class":35,"line":1883},[33,76616,2449],{"class":163},[33,76618,783],{"class":50},[33,76620,1852],{"class":163},[33,76622,1855],{"class":167},[33,76624,76625,76627,76629,76631,76633,76635,76637,76639,76641],{"class":35,"line":1915},[33,76626,9414],{"class":50},[33,76628,602],{"class":167},[33,76630,4059],{"class":163},[33,76632,73013],{"class":54},[33,76634,1115],{"class":50},[33,76636,6565],{"class":167},[33,76638,1121],{"class":50},[33,76640,274],{"class":54},[33,76642,221],{"class":167},[33,76644,76645],{"class":35,"line":1926},[33,76646,65922],{"class":163},[33,76648,76649],{"class":35,"line":1932},[33,76650,92],{"emptyLinePlaceholder":91},[33,76652,76653,76655],{"class":35,"line":1938},[33,76654,1332],{"class":163},[33,76656,73034],{"class":167},[33,76658,76659],{"class":35,"line":1950},[33,76660,92],{"emptyLinePlaceholder":91},[33,76662,76663],{"class":35,"line":1958},[33,76664,92],{"emptyLinePlaceholder":91},[33,76666,76667,76669,76671,76673,76675],{"class":35,"line":4904},[33,76668,2491],{"class":163},[33,76670,2494],{"class":50},[33,76672,2497],{"class":163},[33,76674,2500],{"class":54},[33,76676,574],{"class":167},[33,76678,76679],{"class":35,"line":4909},[33,76680,76681],{"class":167},"    
split_by_ranges(\n",[33,76683,76684,76686,76688],{"class":35,"line":4915},[33,76685,69188],{"class":167},[33,76687,73062],{"class":54},[33,76689,1506],{"class":167},[33,76691,76692,76694,76696],{"class":35,"line":4925},[33,76693,69188],{"class":167},[33,76695,73071],{"class":54},[33,76697,1506],{"class":167},[33,76699,76700,76702,76704,76706,76708,76710,76712,76714,76717,76719,76721,76723,76725],{"class":35,"line":4935},[33,76701,73078],{"class":167},[33,76703,734],{"class":50},[33,76705,365],{"class":167},[33,76707,3545],{"class":50},[33,76709,19834],{"class":167},[33,76711,17260],{"class":50},[33,76713,365],{"class":167},[33,76715,76716],{"class":50},"25",[33,76718,19834],{"class":167},[33,76720,11164],{"class":50},[33,76722,365],{"class":167},[33,76724,26323],{"class":50},[33,76726,73103],{"class":167},[33,76728,76729],{"class":35,"line":4941},[33,76730,1202],{"class":167},[14,76732,76733,76734,76736],{},"Each range gets its own fresh ",[30,76735,70025],{},". Reusing a single writer across ranges would concatenate all ranges into one file instead of producing separate outputs.",[18,76738,76740],{"id":76739},"parse-a-ranges-string-from-the-command-line","Parse a Ranges String from the Command Line",[14,76742,76743,76744,20891],{},"Rather than hard-coding tuples, accept a string like ",[30,76745,76746],{},"\"1-10,11-25,26-40\"",[23,76748,76750],{"className":126,"code":76749,"language":47,"meta":28,"style":28},"# pip install pypdf\nimport re\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\n\n\ndef parse_ranges(spec: str) -> list[tuple[int, int]]:\n    \"\"\"\n    Parse '1-10,11-25,26-40' → [(1,10),(11,25),(26,40)].\n    Single-page entries like '5' become (5,5).\n    Raises ValueError for malformed tokens.\n    \"\"\"\n    result: list[tuple[int, int]] = []\n    for token in spec.split(\",\"):\n        token = token.strip()\n        if not token:\n            continue\n        match = re.fullmatch(r\"(\\d+)(?:-(\\d+))?\", token)\n        if not 
match:\n            raise ValueError(f\"Invalid range token: {token!r}\")\n        a = int(match.group(1))\n        b = int(match.group(2)) if match.group(2) else a\n        if a > b:\n            raise ValueError(f\"Start {a} > end {b} in token {token!r}\")\n        result.append((a, b))\n    return result\n\n\nif __name__ == \"__main__\":\n    print(parse_ranges(\"1-10,11-25,26-40\"))   # [(1, 10), (11, 25), (26, 40)]\n    print(parse_ranges(\"5\"))                   # [(5, 5)]\n    print(parse_ranges(\"1-5, 7-10\"))           # [(1, 5), (7, 10)]\n",[30,76751,76752,76756,76762,76772,76782,76786,76790,76810,76814,76819,76824,76829,76833,76850,76865,76875,76884,76888,76925,76934,76961,76977,77007,77019,77063,77068,77074,77078,77082,77094,77108,77123],{"__ignoreMap":28},[33,76753,76754],{"class":35,"line":36},[33,76755,57316],{"class":39},[33,76757,76758,76760],{"class":35,"line":43},[33,76759,164],{"class":163},[33,76761,11917],{"class":167},[33,76763,76764,76766,76768,76770],{"class":35,"line":61},[33,76765,190],{"class":163},[33,76767,193],{"class":167},[33,76769,164],{"class":163},[33,76771,198],{"class":167},[33,76773,76774,76776,76778,76780],{"class":35,"line":73},[33,76775,190],{"class":163},[33,76777,57333],{"class":167},[33,76779,164],{"class":163},[33,76781,66892],{"class":167},[33,76783,76784],{"class":35,"line":88},[33,76785,92],{"emptyLinePlaceholder":91},[33,76787,76788],{"class":35,"line":95},[33,76789,92],{"emptyLinePlaceholder":91},[33,76791,76792,76794,76796,76798,76800,76802,76804,76806,76808],{"class":35,"line":101},[33,76793,562],{"class":163},[33,76795,75043],{"class":46},[33,76797,75046],{"class":167},[33,76799,1053],{"class":50},[33,76801,47201],{"class":167},[33,76803,1059],{"class":50},[33,76805,365],{"class":167},[33,76807,1059],{"class":50},[33,76809,43900],{"class":167},[33,76811,76812],{"class":35,"line":171},[33,76813,7673],{"class":54},[33,76815,76816],{"class":35,"line":179},[33,76817,76818],{"class":54},"    Parse '1-10,11-25,26-40' 
→ [(1,10),(11,25),(26,40)].\n",[33,76820,76821],{"class":35,"line":187},[33,76822,76823],{"class":54},"    Single-page entries like '5' become (5,5).\n",[33,76825,76826],{"class":35,"line":201},[33,76827,76828],{"class":54},"    Raises ValueError for malformed tokens.\n",[33,76830,76831],{"class":35,"line":206},[33,76832,7673],{"class":54},[33,76834,76835,76838,76840,76842,76844,76846,76848],{"class":35,"line":224},[33,76836,76837],{"class":167},"    result: list[tuple[",[33,76839,1059],{"class":50},[33,76841,365],{"class":167},[33,76843,1059],{"class":50},[33,76845,13081],{"class":167},[33,76847,242],{"class":163},[33,76849,589],{"class":167},[33,76851,76852,76854,76857,76859,76861,76863],{"class":35,"line":229},[33,76853,656],{"class":163},[33,76855,76856],{"class":167}," token ",[33,76858,662],{"class":163},[33,76860,75083],{"class":167},[33,76862,15900],{"class":54},[33,76864,1737],{"class":167},[33,76866,76867,76870,76872],{"class":35,"line":235},[33,76868,76869],{"class":167},"        token ",[33,76871,242],{"class":163},[33,76873,76874],{"class":167}," token.strip()\n",[33,76876,76877,76879,76881],{"class":35,"line":250},[33,76878,8221],{"class":163},[33,76880,620],{"class":163},[33,76882,76883],{"class":167}," token:\n",[33,76885,76886],{"class":35,"line":266},[33,76887,9330],{"class":163},[33,76889,76890,76893,76895,76898,76900,76902,76904,76906,76909,76911,76913,76915,76918,76920,76922],{"class":35,"line":290},[33,76891,76892],{"class":167},"        match ",[33,76894,242],{"class":163},[33,76896,76897],{"class":167}," re.fullmatch(",[33,76899,11977],{"class":163},[33,76901,274],{"class":54},[33,76903,68535],{"class":50},[33,76905,1811],{"class":163},[33,76907,76908],{"class":50},")(?:",[33,76910,4126],{"class":54},[33,76912,68535],{"class":50},[33,76914,1811],{"class":163},[33,76916,76917],{"class":50},"))",[33,76919,36637],{"class":163},[33,76921,274],{"class":54},[33,76923,76924],{"class":167},", 
token)\n",[33,76926,76927,76929,76931],{"class":35,"line":295},[33,76928,8221],{"class":163},[33,76930,620],{"class":163},[33,76932,76933],{"class":167}," match:\n",[33,76935,76936,76938,76940,76942,76944,76947,76949,76952,76955,76957,76959],{"class":35,"line":300},[33,76937,59715],{"class":163},[33,76939,4054],{"class":50},[33,76941,602],{"class":167},[33,76943,4059],{"class":163},[33,76945,76946],{"class":54},"\"Invalid range token: ",[33,76948,1115],{"class":50},[33,76950,76951],{"class":167},"token",[33,76953,76954],{"class":163},"!r",[33,76956,1121],{"class":50},[33,76958,274],{"class":54},[33,76960,221],{"class":167},[33,76962,76963,76966,76968,76970,76973,76975],{"class":35,"line":317},[33,76964,76965],{"class":167},"        a ",[33,76967,242],{"class":163},[33,76969,3149],{"class":50},[33,76971,76972],{"class":167},"(match.group(",[33,76974,734],{"class":50},[33,76976,371],{"class":167},[33,76978,76979,76982,76984,76986,76988,76990,76993,76995,76998,77000,77002,77004],{"class":35,"line":332},[33,76980,76981],{"class":167},"        b ",[33,76983,242],{"class":163},[33,76985,3149],{"class":50},[33,76987,76972],{"class":167},[33,76989,1533],{"class":50},[33,76991,76992],{"class":167},")) ",[33,76994,2491],{"class":163},[33,76996,76997],{"class":167}," match.group(",[33,76999,1533],{"class":50},[33,77001,1649],{"class":167},[33,77003,7489],{"class":163},[33,77005,77006],{"class":167}," a\n",[33,77008,77009,77011,77014,77016],{"class":35,"line":347},[33,77010,8221],{"class":163},[33,77012,77013],{"class":167}," a ",[33,77015,6009],{"class":163},[33,77017,77018],{"class":167}," b:\n",[33,77020,77021,77023,77025,77027,77029,77032,77034,77036,77038,77041,77043,77046,77048,77051,77053,77055,77057,77059,77061],{"class":35,"line":374},[33,77022,59715],{"class":163},[33,77024,4054],{"class":50},[33,77026,602],{"class":167},[33,77028,4059],{"class":163},[33,77030,77031],{"class":54},"\"Start 
",[33,77033,1115],{"class":50},[33,77035,940],{"class":167},[33,77037,1121],{"class":50},[33,77039,77040],{"class":54}," > end ",[33,77042,1115],{"class":50},[33,77044,77045],{"class":167},"b",[33,77047,1121],{"class":50},[33,77049,77050],{"class":54}," in token ",[33,77052,1115],{"class":50},[33,77054,76951],{"class":167},[33,77056,76954],{"class":163},[33,77058,1121],{"class":50},[33,77060,274],{"class":54},[33,77062,221],{"class":167},[33,77064,77065],{"class":35,"line":397},[33,77066,77067],{"class":167},"        result.append((a, b))\n",[33,77069,77070,77072],{"class":35,"line":653},[33,77071,1332],{"class":163},[33,77073,49632],{"class":167},[33,77075,77076],{"class":35,"line":667},[33,77077,92],{"emptyLinePlaceholder":91},[33,77079,77080],{"class":35,"line":675},[33,77081,92],{"emptyLinePlaceholder":91},[33,77083,77084,77086,77088,77090,77092],{"class":35,"line":689},[33,77085,2491],{"class":163},[33,77087,2494],{"class":50},[33,77089,2497],{"class":163},[33,77091,2500],{"class":54},[33,77093,574],{"class":167},[33,77095,77096,77098,77101,77103,77105],{"class":35,"line":703},[33,77097,7268],{"class":50},[33,77099,77100],{"class":167},"(parse_ranges(",[33,77102,76746],{"class":54},[33,77104,73462],{"class":167},[33,77106,77107],{"class":39},"# [(1, 10), (11, 25), (26, 40)]\n",[33,77109,77110,77112,77114,77117,77120],{"class":35,"line":714},[33,77111,7268],{"class":50},[33,77113,77100],{"class":167},[33,77115,77116],{"class":54},"\"5\"",[33,77118,77119],{"class":167},"))                   ",[33,77121,77122],{"class":39},"# [(5, 5)]\n",[33,77124,77125,77127,77129,77132,77135],{"class":35,"line":723},[33,77126,7268],{"class":50},[33,77128,77100],{"class":167},[33,77130,77131],{"class":54},"\"1-5, 7-10\"",[33,77133,77134],{"class":167},"))           ",[33,77136,77137],{"class":39},"# [(1, 5), (7, 10)]\n",[14,77139,77140],{},"Connect this to the CLI:",[23,77142,77144],{"className":126,"code":77143,"language":47,"meta":28,"style":28},"# pip install 
pypdf\n#!\u002Fusr\u002Fbin\u002Fenv python3\n\"\"\"\nsplit_pdf.py — split a PDF by page ranges.\nUsage: python split_pdf.py --input report.pdf --output .\u002Fsplits --ranges \"1-10,11-25\"\n\"\"\"\nimport argparse\nimport re\nfrom pathlib import Path\n\nfrom pypdf import PdfReader, PdfWriter\n\n\ndef parse_ranges(spec: str) -> list[tuple[int, int]]:\n    result = []\n    for token in spec.split(\",\"):\n        token = token.strip()\n        m = re.fullmatch(r\"(\\d+)(?:-(\\d+))?\", token)\n        if not m:\n            raise argparse.ArgumentTypeError(f\"Bad range: {token!r}\")\n        a, b = int(m.group(1)), int(m.group(2) or m.group(1))\n        if a > b:\n            raise argparse.ArgumentTypeError(f\"start > end in {token!r}\")\n        result.append((a, b))\n    return result\n\n\ndef split(input_path: Path, output_dir: Path, ranges: list[tuple[int, int]]) -> None:\n    output_dir.mkdir(parents=True, exist_ok=True)\n    with open(input_path, \"rb\") as fh:\n        reader = PdfReader(fh)\n        total = len(reader.pages)\n        print(f\"{input_path.name}: {total} pages\")\n        for idx, (start, end) in enumerate(ranges, 1):\n            if not (1 \u003C= start \u003C= end \u003C= total):\n                print(f\"[SKIP] Range ({start}-{end}) invalid for {total}-page document\")\n                continue\n            writer = PdfWriter()\n            for i in range(start - 1, end):       # 0-based index\n                writer.add_page(reader.pages[i])\n            out = output_dir \u002F f\"{input_path.stem}_part{idx:02d}.pdf\"\n            with open(out, \"wb\") as f:\n                writer.write(f)\n            writer.close()\n            print(f\"  part{idx:02d}: p{start}–p{end} → {out.name}\")\n\n\ndef main() -> None:\n    ap = argparse.ArgumentParser(description=\"Split a PDF by page ranges\")\n    ap.add_argument(\"--input\", required=True, type=Path)\n    ap.add_argument(\"--output\", required=True, type=Path)\n    
ap.add_argument(\"--ranges\", required=True, help='e.g. \"1-10,11-25,26-40\"')\n    args = ap.parse_args()\n    split(args.input, args.output, parse_ranges(args.ranges))\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,77145,77146,77150,77154,77158,77163,77168,77172,77178,77184,77194,77198,77208,77212,77216,77236,77244,77258,77266,77299,77308,77332,77366,77376,77399,77403,77409,77413,77417,77437,77457,77473,77481,77491,77521,77537,77559,77596,77600,77608,77630,77634,77667,77685,77690,77694,77743,77747,77751,77763,77780,77802,77824,77850,77858,77863,77867,77871,77883],{"__ignoreMap":28},[33,77147,77148],{"class":35,"line":36},[33,77149,57316],{"class":39},[33,77151,77152],{"class":35,"line":43},[33,77153,14447],{"class":39},[33,77155,77156],{"class":35,"line":61},[33,77157,139],{"class":54},[33,77159,77160],{"class":35,"line":73},[33,77161,77162],{"class":54},"split_pdf.py — split a PDF by page ranges.\n",[33,77164,77165],{"class":35,"line":88},[33,77166,77167],{"class":54},"Usage: python split_pdf.py --input report.pdf --output .\u002Fsplits --ranges 
\"1-10,11-25\"\n",[33,77169,77170],{"class":35,"line":95},[33,77171,139],{"class":54},[33,77173,77174,77176],{"class":35,"line":101},[33,77175,164],{"class":163},[33,77177,4461],{"class":167},[33,77179,77180,77182],{"class":35,"line":171},[33,77181,164],{"class":163},[33,77183,11917],{"class":167},[33,77185,77186,77188,77190,77192],{"class":35,"line":179},[33,77187,190],{"class":163},[33,77189,193],{"class":167},[33,77191,164],{"class":163},[33,77193,198],{"class":167},[33,77195,77196],{"class":35,"line":187},[33,77197,92],{"emptyLinePlaceholder":91},[33,77199,77200,77202,77204,77206],{"class":35,"line":201},[33,77201,190],{"class":163},[33,77203,57333],{"class":167},[33,77205,164],{"class":163},[33,77207,66892],{"class":167},[33,77209,77210],{"class":35,"line":206},[33,77211,92],{"emptyLinePlaceholder":91},[33,77213,77214],{"class":35,"line":224},[33,77215,92],{"emptyLinePlaceholder":91},[33,77217,77218,77220,77222,77224,77226,77228,77230,77232,77234],{"class":35,"line":229},[33,77219,562],{"class":163},[33,77221,75043],{"class":46},[33,77223,75046],{"class":167},[33,77225,1053],{"class":50},[33,77227,47201],{"class":167},[33,77229,1059],{"class":50},[33,77231,365],{"class":167},[33,77233,1059],{"class":50},[33,77235,43900],{"class":167},[33,77237,77238,77240,77242],{"class":35,"line":235},[33,77239,8842],{"class":167},[33,77241,242],{"class":163},[33,77243,589],{"class":167},[33,77245,77246,77248,77250,77252,77254,77256],{"class":35,"line":250},[33,77247,656],{"class":163},[33,77249,76856],{"class":167},[33,77251,662],{"class":163},[33,77253,75083],{"class":167},[33,77255,15900],{"class":54},[33,77257,1737],{"class":167},[33,77259,77260,77262,77264],{"class":35,"line":266},[33,77261,76869],{"class":167},[33,77263,242],{"class":163},[33,77265,76874],{"class":167},[33,77267,77268,77271,77273,77275,77277,77279,77281,77283,77285,77287,77289,77291,77293,77295,77297],{"class":35,"line":290},[33,77269,77270],{"class":167},"        m 
",[33,77272,242],{"class":163},[33,77274,76897],{"class":167},[33,77276,11977],{"class":163},[33,77278,274],{"class":54},[33,77280,68535],{"class":50},[33,77282,1811],{"class":163},[33,77284,76908],{"class":50},[33,77286,4126],{"class":54},[33,77288,68535],{"class":50},[33,77290,1811],{"class":163},[33,77292,76917],{"class":50},[33,77294,36637],{"class":163},[33,77296,274],{"class":54},[33,77298,76924],{"class":167},[33,77300,77301,77303,77305],{"class":35,"line":295},[33,77302,8221],{"class":163},[33,77304,620],{"class":163},[33,77306,77307],{"class":167}," m:\n",[33,77309,77310,77312,77315,77317,77320,77322,77324,77326,77328,77330],{"class":35,"line":300},[33,77311,59715],{"class":163},[33,77313,77314],{"class":167}," argparse.ArgumentTypeError(",[33,77316,4059],{"class":163},[33,77318,77319],{"class":54},"\"Bad range: ",[33,77321,1115],{"class":50},[33,77323,76951],{"class":167},[33,77325,76954],{"class":163},[33,77327,1121],{"class":50},[33,77329,274],{"class":54},[33,77331,221],{"class":167},[33,77333,77334,77337,77339,77341,77344,77346,77349,77351,77353,77355,77357,77359,77362,77364],{"class":35,"line":317},[33,77335,77336],{"class":167},"        a, b ",[33,77338,242],{"class":163},[33,77340,3149],{"class":50},[33,77342,77343],{"class":167},"(m.group(",[33,77345,734],{"class":50},[33,77347,77348],{"class":167},")), ",[33,77350,1059],{"class":50},[33,77352,77343],{"class":167},[33,77354,1533],{"class":50},[33,77356,1649],{"class":167},[33,77358,7162],{"class":163},[33,77360,77361],{"class":167}," 
m.group(",[33,77363,734],{"class":50},[33,77365,371],{"class":167},[33,77367,77368,77370,77372,77374],{"class":35,"line":332},[33,77369,8221],{"class":163},[33,77371,77013],{"class":167},[33,77373,6009],{"class":163},[33,77375,77018],{"class":167},[33,77377,77378,77380,77382,77384,77387,77389,77391,77393,77395,77397],{"class":35,"line":347},[33,77379,59715],{"class":163},[33,77381,77314],{"class":167},[33,77383,4059],{"class":163},[33,77385,77386],{"class":54},"\"start > end in ",[33,77388,1115],{"class":50},[33,77390,76951],{"class":167},[33,77392,76954],{"class":163},[33,77394,1121],{"class":50},[33,77396,274],{"class":54},[33,77398,221],{"class":167},[33,77400,77401],{"class":35,"line":374},[33,77402,77067],{"class":167},[33,77404,77405,77407],{"class":35,"line":397},[33,77406,1332],{"class":163},[33,77408,49632],{"class":167},[33,77410,77411],{"class":35,"line":653},[33,77412,92],{"emptyLinePlaceholder":91},[33,77414,77415],{"class":35,"line":667},[33,77416,92],{"emptyLinePlaceholder":91},[33,77418,77419,77421,77423,77425,77427,77429,77431,77433,77435],{"class":35,"line":675},[33,77420,562],{"class":163},[33,77422,75187],{"class":46},[33,77424,75190],{"class":167},[33,77426,1059],{"class":50},[33,77428,365],{"class":167},[33,77430,1059],{"class":50},[33,77432,75199],{"class":167},[33,77434,571],{"class":50},[33,77436,574],{"class":167},[33,77438,77439,77441,77443,77445,77447,77449,77451,77453,77455],{"class":35,"line":689},[33,77440,6346],{"class":167},[33,77442,869],{"class":238},[33,77444,242],{"class":163},[33,77446,855],{"class":50},[33,77448,365],{"class":167},[33,77450,878],{"class":238},[33,77452,242],{"class":163},[33,77454,855],{"class":50},[33,77456,221],{"class":167},[33,77458,77459,77461,77463,77465,77467,77469,77471],{"class":35,"line":703},[33,77460,1635],{"class":163},[33,77462,68213],{"class":50},[33,77464,72709],{"class":167},[33,77466,68219],{"class":54},[33,77468,1649],{"class":167},[33,77470,495],{"class":163},[33,77472,67176],{"class":167},[
33,77474,77475,77477,77479],{"class":35,"line":714},[33,77476,62484],{"class":167},[33,77478,242],{"class":163},[33,77480,68235],{"class":167},[33,77482,77483,77485,77487,77489],{"class":35,"line":723},[33,77484,73710],{"class":167},[33,77486,242],{"class":163},[33,77488,4037],{"class":50},[33,77490,70691],{"class":167},[33,77492,77493,77495,77497,77499,77501,77503,77506,77508,77510,77512,77514,77516,77519],{"class":35,"line":754},[33,77494,9414],{"class":50},[33,77496,602],{"class":167},[33,77498,4059],{"class":163},[33,77500,274],{"class":54},[33,77502,1115],{"class":50},[33,77504,77505],{"class":167},"input_path.name",[33,77507,1121],{"class":50},[33,77509,2079],{"class":54},[33,77511,1115],{"class":50},[33,77513,72401],{"class":167},[33,77515,1121],{"class":50},[33,77517,77518],{"class":54}," pages\"",[33,77520,221],{"class":167},[33,77522,77523,77525,77527,77529,77531,77533,77535],{"class":35,"line":771},[33,77524,5973],{"class":163},[33,77526,72748],{"class":167},[33,77528,662],{"class":163},[33,77530,7403],{"class":50},[33,77532,72755],{"class":167},[33,77534,734],{"class":50},[33,77536,1737],{"class":167},[33,77538,77539,77541,77543,77545,77547,77549,77551,77553,77555,77557],{"class":35,"line":777},[33,77540,5995],{"class":163},[33,77542,620],{"class":163},[33,77544,17583],{"class":167},[33,77546,734],{"class":50},[33,77548,72368],{"class":163},[33,77550,72770],{"class":167},[33,77552,44223],{"class":163},[33,77554,72779],{"class":167},[33,77556,44223],{"class":163},[33,77558,75296],{"class":167},[33,77560,77561,77563,77565,77567,77569,77571,77573,77575,77577,77579,77581,77583,77585,77587,77589,77591,77594],{"class":35,"line":788},[33,77562,8264],{"class":50},[33,77564,602],{"class":167},[33,77566,4059],{"class":163},[33,77568,75307],{"class":54},[33,77570,1115],{"class":50},[33,77572,7409],{"class":167},[33,77574,1121],{"class":50},[33,77576,4126],{"class":54},[33,77578,1115],{"class":50},[33,77580,72824],{"class":167},[33,77582,1121],{"class":50},[33,77584
,75324],{"class":54},[33,77586,1115],{"class":50},[33,77588,72401],{"class":167},[33,77590,1121],{"class":50},[33,77592,77593],{"class":54},"-page document\"",[33,77595,221],{"class":167},[33,77597,77598],{"class":35,"line":804},[33,77599,12315],{"class":163},[33,77601,77602,77604,77606],{"class":35,"line":809},[33,77603,70275],{"class":167},[33,77605,242],{"class":163},[33,77607,67154],{"class":167},[33,77609,77610,77612,77614,77616,77618,77620,77622,77624,77627],{"class":35,"line":819},[33,77611,1793],{"class":163},[33,77613,47269],{"class":167},[33,77615,662],{"class":163},[33,77617,1801],{"class":50},[33,77619,72864],{"class":167},[33,77621,4126],{"class":163},[33,77623,1814],{"class":50},[33,77625,77626],{"class":167},", end):       ",[33,77628,77629],{"class":39},"# 0-based index\n",[33,77631,77632],{"class":35,"line":829},[33,77633,73800],{"class":167},[33,77635,77636,77639,77641,77643,77645,77647,77649,77651,77653,77655,77657,77659,77661,77663,77665],{"class":35,"line":834},[33,77637,77638],{"class":167},"            out ",[33,77640,242],{"class":163},[33,77642,6393],{"class":167},[33,77644,1351],{"class":163},[33,77646,1110],{"class":163},[33,77648,274],{"class":54},[33,77650,1115],{"class":50},[33,77652,72902],{"class":167},[33,77654,1121],{"class":50},[33,77656,72907],{"class":54},[33,77658,1115],{"class":50},[33,77660,72912],{"class":167},[33,77662,72915],{"class":163},[33,77664,1121],{"class":50},[33,77666,19246],{"class":54},[33,77668,77669,77671,77673,77676,77678,77680,77682],{"class":35,"line":839},[33,77670,678],{"class":163},[33,77672,68213],{"class":50},[33,77674,77675],{"class":167},"(out, ",[33,77677,67169],{"class":54},[33,77679,1649],{"class":167},[33,77681,495],{"class":163},[33,77683,77684],{"class":167}," f:\n",[33,77686,77687],{"class":35,"line":860},[33,77688,77689],{"class":167},"                
writer.write(f)\n",[33,77691,77692],{"class":35,"line":887},[33,77693,70393],{"class":167},[33,77695,77696,77698,77700,77702,77704,77706,77708,77710,77712,77715,77717,77719,77721,77724,77726,77728,77730,77732,77734,77737,77739,77741],{"class":35,"line":907},[33,77697,9364],{"class":50},[33,77699,602],{"class":167},[33,77701,4059],{"class":163},[33,77703,75437],{"class":54},[33,77705,1115],{"class":50},[33,77707,72912],{"class":167},[33,77709,72915],{"class":163},[33,77711,1121],{"class":50},[33,77713,77714],{"class":54},": p",[33,77716,1115],{"class":50},[33,77718,7409],{"class":167},[33,77720,1121],{"class":50},[33,77722,77723],{"class":54},"–p",[33,77725,1115],{"class":50},[33,77727,72824],{"class":167},[33,77729,1121],{"class":50},[33,77731,69863],{"class":54},[33,77733,1115],{"class":50},[33,77735,77736],{"class":167},"out.name",[33,77738,1121],{"class":50},[33,77740,274],{"class":54},[33,77742,221],{"class":167},[33,77744,77745],{"class":35,"line":1826},[33,77746,92],{"emptyLinePlaceholder":91},[33,77748,77749],{"class":35,"line":1844},[33,77750,92],{"emptyLinePlaceholder":91},[33,77752,77753,77755,77757,77759,77761],{"class":35,"line":1858},[33,77754,562],{"class":163},[33,77756,6636],{"class":46},[33,77758,568],{"class":167},[33,77760,571],{"class":50},[33,77762,574],{"class":167},[33,77764,77765,77767,77769,77771,77773,77775,77778],{"class":35,"line":1871},[33,77766,15498],{"class":167},[33,77768,242],{"class":163},[33,77770,6653],{"class":167},[33,77772,6656],{"class":238},[33,77774,242],{"class":163},[33,77776,77777],{"class":54},"\"Split a PDF by page 
ranges\"",[33,77779,221],{"class":167},[33,77781,77782,77784,77786,77788,77790,77792,77794,77796,77798,77800],{"class":35,"line":1877},[33,77783,15516],{"class":167},[33,77785,6672],{"class":54},[33,77787,365],{"class":167},[33,77789,25448],{"class":238},[33,77791,242],{"class":163},[33,77793,855],{"class":50},[33,77795,365],{"class":167},[33,77797,6677],{"class":238},[33,77799,242],{"class":163},[33,77801,15528],{"class":167},[33,77803,77804,77806,77808,77810,77812,77814,77816,77818,77820,77822],{"class":35,"line":1883},[33,77805,15516],{"class":167},[33,77807,6699],{"class":54},[33,77809,365],{"class":167},[33,77811,25448],{"class":238},[33,77813,242],{"class":163},[33,77815,855],{"class":50},[33,77817,365],{"class":167},[33,77819,6677],{"class":238},[33,77821,242],{"class":163},[33,77823,15528],{"class":167},[33,77825,77826,77828,77831,77833,77835,77837,77839,77841,77843,77845,77848],{"class":35,"line":1915},[33,77827,15516],{"class":167},[33,77829,77830],{"class":54},"\"--ranges\"",[33,77832,365],{"class":167},[33,77834,25448],{"class":238},[33,77836,242],{"class":163},[33,77838,855],{"class":50},[33,77840,365],{"class":167},[33,77842,25463],{"class":238},[33,77844,242],{"class":163},[33,77846,77847],{"class":54},"'e.g. 
\"1-10,11-25,26-40\"'",[33,77849,221],{"class":167},[33,77851,77852,77854,77856],{"class":35,"line":1926},[33,77853,6766],{"class":167},[33,77855,242],{"class":163},[33,77857,15655],{"class":167},[33,77859,77860],{"class":35,"line":1932},[33,77861,77862],{"class":167},"    split(args.input, args.output, parse_ranges(args.ranges))\n",[33,77864,77865],{"class":35,"line":1938},[33,77866,92],{"emptyLinePlaceholder":91},[33,77868,77869],{"class":35,"line":1950},[33,77870,92],{"emptyLinePlaceholder":91},[33,77872,77873,77875,77877,77879,77881],{"class":35,"line":1958},[33,77874,2491],{"class":163},[33,77876,2494],{"class":50},[33,77878,2497],{"class":163},[33,77880,2500],{"class":54},[33,77882,574],{"class":167},[33,77884,77885],{"class":35,"line":4904},[33,77886,6914],{"class":167},[18,77888,77890],{"id":77889},"variant-split-every-n-pages","Variant: Split Every N Pages",[14,77892,77893],{},"When you do not know the ranges in advance — just \"give me 10-page chunks\":",[23,77895,77897],{"className":126,"code":77896,"language":47,"meta":28,"style":28},"# pip install pypdf\nimport math\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\n\n\ndef split_every_n(input_path: Path, output_dir: Path, n: int) -> list[Path]:\n    \"\"\"\n    Split input_path into files of n pages each.\n    The last file may have fewer than n pages.\n    Returns list of created file paths.\n    \"\"\"\n    if n \u003C 1:\n        raise ValueError(f\"n must be >= 1, got {n}\")\n\n    output_dir.mkdir(parents=True, exist_ok=True)\n    created: list[Path] = []\n\n    with open(input_path, \"rb\") as fh:\n        reader = PdfReader(fh)\n        total = len(reader.pages)\n        n_chunks = math.ceil(total \u002F n)\n\n        for chunk in range(n_chunks):\n            start_idx = chunk * n              # 0-based\n            end_idx = min(start_idx + n, total)  # exclusive\n            writer = PdfWriter()\n            for i in range(start_idx, end_idx):\n                
writer.add_page(reader.pages[i])\n            out_path = output_dir \u002F f\"{input_path.stem}_chunk{chunk + 1:03d}.pdf\"\n            with open(out_path, \"wb\") as out:\n                writer.write(out)\n            writer.close()\n            created.append(out_path)\n            # Report in 1-based page numbers for readability\n            print(f\"chunk{chunk + 1:03d}: pages {start_idx + 1}–{end_idx} → {out_path.name}\")\n\n    return created\n\n\nif __name__ == \"__main__\":\n    split_every_n(Path(\".\u002Flarge_export.pdf\"), Path(\".\u002Fchunks\"), n=10)\n",[30,77898,77899,77903,77909,77919,77929,77933,77937,77949,77953,77958,77963,77968,77972,77984,78007,78011,78031,78039,78043,78059,78067,78077,78089,78093,78105,78121,78141,78149,78162,78166,78203,78219,78223,78227,78231,78236,78291,78295,78301,78305,78309,78321],{"__ignoreMap":28},[33,77900,77901],{"class":35,"line":36},[33,77902,57316],{"class":39},[33,77904,77905,77907],{"class":35,"line":43},[33,77906,164],{"class":163},[33,77908,70042],{"class":167},[33,77910,77911,77913,77915,77917],{"class":35,"line":61},[33,77912,190],{"class":163},[33,77914,193],{"class":167},[33,77916,164],{"class":163},[33,77918,198],{"class":167},[33,77920,77921,77923,77925,77927],{"class":35,"line":73},[33,77922,190],{"class":163},[33,77924,57333],{"class":167},[33,77926,164],{"class":163},[33,77928,66892],{"class":167},[33,77930,77931],{"class":35,"line":88},[33,77932,92],{"emptyLinePlaceholder":91},[33,77934,77935],{"class":35,"line":95},[33,77936,92],{"emptyLinePlaceholder":91},[33,77938,77939,77941,77943,77945,77947],{"class":35,"line":101},[33,77940,562],{"class":163},[33,77942,73641],{"class":46},[33,77944,73644],{"class":167},[33,77946,1059],{"class":50},[33,77948,72642],{"class":167},[33,77950,77951],{"class":35,"line":171},[33,77952,7673],{"class":54},[33,77954,77955],{"class":35,"line":179},[33,77956,77957],{"class":54},"    Split input_path into files of n pages 
each.\n",[33,77959,77960],{"class":35,"line":187},[33,77961,77962],{"class":54},"    The last file may have fewer than n pages.\n",[33,77964,77965],{"class":35,"line":201},[33,77966,77967],{"class":54},"    Returns list of created file paths.\n",[33,77969,77970],{"class":35,"line":206},[33,77971,7673],{"class":54},[33,77973,77974,77976,77978,77980,77982],{"class":35,"line":224},[33,77975,617],{"class":163},[33,77977,48941],{"class":167},[33,77979,4043],{"class":163},[33,77981,1814],{"class":50},[33,77983,574],{"class":167},[33,77985,77986,77988,77990,77992,77994,77997,77999,78001,78003,78005],{"class":35,"line":229},[33,77987,4051],{"class":163},[33,77989,4054],{"class":50},[33,77991,602],{"class":167},[33,77993,4059],{"class":163},[33,77995,77996],{"class":54},"\"n must be >= 1, got ",[33,77998,1115],{"class":50},[33,78000,22354],{"class":167},[33,78002,1121],{"class":50},[33,78004,274],{"class":54},[33,78006,221],{"class":167},[33,78008,78009],{"class":35,"line":235},[33,78010,92],{"emptyLinePlaceholder":91},[33,78012,78013,78015,78017,78019,78021,78023,78025,78027,78029],{"class":35,"line":250},[33,78014,6346],{"class":167},[33,78016,869],{"class":238},[33,78018,242],{"class":163},[33,78020,855],{"class":50},[33,78022,365],{"class":167},[33,78024,878],{"class":238},[33,78026,242],{"class":163},[33,78028,855],{"class":50},[33,78030,221],{"class":167},[33,78032,78033,78035,78037],{"class":35,"line":266},[33,78034,72690],{"class":167},[33,78036,242],{"class":163},[33,78038,589],{"class":167},[33,78040,78041],{"class":35,"line":290},[33,78042,92],{"emptyLinePlaceholder":91},[33,78044,78045,78047,78049,78051,78053,78055,78057],{"class":35,"line":295},[33,78046,1635],{"class":163},[33,78048,68213],{"class":50},[33,78050,72709],{"class":167},[33,78052,68219],{"class":54},[33,78054,1649],{"class":167},[33,78056,495],{"class":163},[33,78058,67176],{"class":167},[33,78060,78061,78063,78065],{"class":35,"line":300},[33,78062,62484],{"class":167},[33,78064,242],{"class":163}
,[33,78066,68235],{"class":167},[33,78068,78069,78071,78073,78075],{"class":35,"line":317},[33,78070,73710],{"class":167},[33,78072,242],{"class":163},[33,78074,4037],{"class":50},[33,78076,70691],{"class":167},[33,78078,78079,78081,78083,78085,78087],{"class":35,"line":332},[33,78080,70182],{"class":167},[33,78082,242],{"class":163},[33,78084,73725],{"class":167},[33,78086,1351],{"class":163},[33,78088,73730],{"class":167},[33,78090,78091],{"class":35,"line":347},[33,78092,92],{"emptyLinePlaceholder":91},[33,78094,78095,78097,78099,78101,78103],{"class":35,"line":374},[33,78096,5973],{"class":163},[33,78098,21937],{"class":167},[33,78100,662],{"class":163},[33,78102,1801],{"class":50},[33,78104,70215],{"class":167},[33,78106,78107,78110,78112,78114,78116,78119],{"class":35,"line":397},[33,78108,78109],{"class":167},"            start_idx ",[33,78111,242],{"class":163},[33,78113,21937],{"class":167},[33,78115,1769],{"class":163},[33,78117,78118],{"class":167}," n              ",[33,78120,72874],{"class":39},[33,78122,78123,78126,78128,78130,78133,78135,78138],{"class":35,"line":653},[33,78124,78125],{"class":167},"            end_idx ",[33,78127,242],{"class":163},[33,78129,73775],{"class":50},[33,78131,78132],{"class":167},"(start_idx ",[33,78134,1811],{"class":163},[33,78136,78137],{"class":167}," n, total)  ",[33,78139,78140],{"class":39},"# exclusive\n",[33,78142,78143,78145,78147],{"class":35,"line":667},[33,78144,70275],{"class":167},[33,78146,242],{"class":163},[33,78148,67154],{"class":167},[33,78150,78151,78153,78155,78157,78159],{"class":35,"line":675},[33,78152,1793],{"class":163},[33,78154,47269],{"class":167},[33,78156,662],{"class":163},[33,78158,1801],{"class":50},[33,78160,78161],{"class":167},"(start_idx, 
end_idx):\n",[33,78163,78164],{"class":35,"line":689},[33,78165,73800],{"class":167},[33,78167,78168,78170,78172,78174,78176,78178,78180,78182,78184,78186,78188,78190,78193,78195,78197,78199,78201],{"class":35,"line":703},[33,78169,73805],{"class":167},[33,78171,242],{"class":163},[33,78173,6393],{"class":167},[33,78175,1351],{"class":163},[33,78177,1110],{"class":163},[33,78179,274],{"class":54},[33,78181,1115],{"class":50},[33,78183,72902],{"class":167},[33,78185,1121],{"class":50},[33,78187,73824],{"class":54},[33,78189,1115],{"class":50},[33,78191,78192],{"class":167},"chunk ",[33,78194,1811],{"class":163},[33,78196,1814],{"class":50},[33,78198,53658],{"class":163},[33,78200,1121],{"class":50},[33,78202,19246],{"class":54},[33,78204,78205,78207,78209,78211,78213,78215,78217],{"class":35,"line":714},[33,78206,678],{"class":163},[33,78208,68213],{"class":50},[33,78210,72929],{"class":167},[33,78212,67169],{"class":54},[33,78214,1649],{"class":167},[33,78216,495],{"class":163},[33,78218,69097],{"class":167},[33,78220,78221],{"class":35,"line":723},[33,78222,70388],{"class":167},[33,78224,78225],{"class":35,"line":754},[33,78226,70393],{"class":167},[33,78228,78229],{"class":35,"line":771},[33,78230,73867],{"class":167},[33,78232,78233],{"class":35,"line":777},[33,78234,78235],{"class":39},"            # Report in 1-based page numbers for readability\n",[33,78237,78238,78240,78242,78244,78247,78249,78251,78253,78255,78257,78259,78261,78263,78266,78268,78270,78272,78274,78277,78279,78281,78283,78285,78287,78289],{"class":35,"line":788},[33,78239,9364],{"class":50},[33,78241,602],{"class":167},[33,78243,4059],{"class":163},[33,78245,78246],{"class":54},"\"chunk",[33,78248,1115],{"class":50},[33,78250,78192],{"class":167},[33,78252,1811],{"class":163},[33,78254,1814],{"class":50},[33,78256,53658],{"class":163},[33,78258,1121],{"class":50},[33,78260,75448],{"class":54},[33,78262,1115],{"class":50},[33,78264,78265],{"class":167},"start_idx 
",[33,78267,1811],{"class":163},[33,78269,11022],{"class":50},[33,78271,72819],{"class":54},[33,78273,1115],{"class":50},[33,78275,78276],{"class":167},"end_idx",[33,78278,1121],{"class":50},[33,78280,69863],{"class":54},[33,78282,1115],{"class":50},[33,78284,75469],{"class":167},[33,78286,1121],{"class":50},[33,78288,274],{"class":54},[33,78290,221],{"class":167},[33,78292,78293],{"class":35,"line":804},[33,78294,92],{"emptyLinePlaceholder":91},[33,78296,78297,78299],{"class":35,"line":809},[33,78298,1332],{"class":163},[33,78300,73034],{"class":167},[33,78302,78303],{"class":35,"line":819},[33,78304,92],{"emptyLinePlaceholder":91},[33,78306,78307],{"class":35,"line":829},[33,78308,92],{"emptyLinePlaceholder":91},[33,78310,78311,78313,78315,78317,78319],{"class":35,"line":834},[33,78312,2491],{"class":163},[33,78314,2494],{"class":50},[33,78316,2497],{"class":163},[33,78318,2500],{"class":54},[33,78320,574],{"class":167},[33,78322,78323,78326,78329,78331,78334,78336,78338,78340,78342],{"class":35,"line":839},[33,78324,78325],{"class":167},"    split_every_n(Path(",[33,78327,78328],{"class":54},"\".\u002Flarge_export.pdf\"",[33,78330,71978],{"class":167},[33,78332,78333],{"class":54},"\".\u002Fchunks\"",[33,78335,18525],{"class":167},[33,78337,22354],{"class":238},[33,78339,242],{"class":163},[33,78341,3545],{"class":50},[33,78343,221],{"class":167},[14,78345,78346,78347,78350],{},"Note: ",[30,78348,78349],{},"start_idx"," is already 0-based here because we compute it directly from the chunk index. 
No subtraction needed — the subtraction is only required when converting a 1-based user input.",[18,78352,78354],{"id":78353},"variant-split-on-bookmarks","Variant: Split on Bookmarks",[14,78356,78357],{},"When the PDF has a well-structured outline, split at each top-level bookmark:",[23,78359,78361],{"className":126,"code":78360,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\n\n\ndef split_on_bookmarks(input_path: Path, output_dir: Path) -> list[Path]:\n    \"\"\"\n    Split a PDF at each top-level bookmark.\n    Each section runs from its bookmark's page to the page before the next bookmark.\n    Returns list of created file paths.\n    \"\"\"\n    output_dir.mkdir(parents=True, exist_ok=True)\n    created: list[Path] = []\n\n    with open(input_path, \"rb\") as fh:\n        reader = PdfReader(fh)\n        total = len(reader.pages)\n\n        # Collect top-level bookmarks (skip nested lists)\n        top_level = [\n            item for item in reader.outline\n            if hasattr(item, \"title\")\n        ]\n        if not top_level:\n            print(\"No top-level bookmarks found; nothing to split on.\")\n            return created\n\n        # Build (start_page_0based, title) pairs\n        sections: list[tuple[int, str]] = []\n        for item in top_level:\n            pg_0based = reader.get_destination_page_number(item)\n            sections.append((pg_0based, item.title))\n\n        # Each section ends one page before the next section starts\n        for i, (start_idx, title) in enumerate(sections):\n            end_idx = sections[i + 1][0] if i + 1 \u003C len(sections) else total\n            if start_idx >= end_idx:\n                print(f\"[SKIP] Empty section: {title!r}\")\n                continue\n\n            writer = PdfWriter()\n            for page_idx in range(start_idx, end_idx):\n                writer.add_page(reader.pages[page_idx])\n\n            # Sanitize 
title for use as filename\n            safe_name = \"\".join(c if c.isalnum() or c in \" _-\" else \"_\" for c in title)\n            out_path = output_dir \u002F f\"{i + 1:02d}_{safe_name[:40]}.pdf\"\n            with open(out_path, \"wb\") as out:\n                writer.write(out)\n            writer.close()\n            created.append(out_path)\n            # Display in 1-based page numbers\n            print(f\"  [{i+1}] '{title}' (p{start_idx+1}–p{end_idx}) → {out_path.name}\")\n\n    return created\n\n\nif __name__ == \"__main__\":\n    split_on_bookmarks(Path(\".\u002Freport_with_bookmarks.pdf\"), Path(\".\u002Fsections\"))\n",[30,78362,78363,78367,78377,78387,78391,78395,78405,78409,78414,78419,78423,78427,78447,78455,78459,78475,78483,78493,78497,78502,78511,78524,78536,78540,78549,78560,78566,78570,78575,78592,78602,78612,78617,78621,78626,78639,78679,78691,78714,78718,78722,78730,78742,78747,78751,78756,78796,78838,78854,78858,78862,78866,78871,78930,78934,78940,78944,78948,78960],{"__ignoreMap":28},[33,78364,78365],{"class":35,"line":36},[33,78366,57316],{"class":39},[33,78368,78369,78371,78373,78375],{"class":35,"line":43},[33,78370,190],{"class":163},[33,78372,193],{"class":167},[33,78374,164],{"class":163},[33,78376,198],{"class":167},[33,78378,78379,78381,78383,78385],{"class":35,"line":61},[33,78380,190],{"class":163},[33,78382,57333],{"class":167},[33,78384,164],{"class":163},[33,78386,66892],{"class":167},[33,78388,78389],{"class":35,"line":73},[33,78390,92],{"emptyLinePlaceholder":91},[33,78392,78393],{"class":35,"line":88},[33,78394,92],{"emptyLinePlaceholder":91},[33,78396,78397,78399,78402],{"class":35,"line":95},[33,78398,562],{"class":163},[33,78400,78401],{"class":46}," split_on_bookmarks",[33,78403,78404],{"class":167},"(input_path: Path, output_dir: Path) -> list[Path]:\n",[33,78406,78407],{"class":35,"line":101},[33,78408,7673],{"class":54},[33,78410,78411],{"class":35,"line":171},[33,78412,78413],{"class":54},"    Split a PDF at each 
top-level bookmark.\n",[33,78415,78416],{"class":35,"line":179},[33,78417,78418],{"class":54},"    Each section runs from its bookmark's page to the page before the next bookmark.\n",[33,78420,78421],{"class":35,"line":187},[33,78422,77967],{"class":54},[33,78424,78425],{"class":35,"line":201},[33,78426,7673],{"class":54},[33,78428,78429,78431,78433,78435,78437,78439,78441,78443,78445],{"class":35,"line":206},[33,78430,6346],{"class":167},[33,78432,869],{"class":238},[33,78434,242],{"class":163},[33,78436,855],{"class":50},[33,78438,365],{"class":167},[33,78440,878],{"class":238},[33,78442,242],{"class":163},[33,78444,855],{"class":50},[33,78446,221],{"class":167},[33,78448,78449,78451,78453],{"class":35,"line":224},[33,78450,72690],{"class":167},[33,78452,242],{"class":163},[33,78454,589],{"class":167},[33,78456,78457],{"class":35,"line":229},[33,78458,92],{"emptyLinePlaceholder":91},[33,78460,78461,78463,78465,78467,78469,78471,78473],{"class":35,"line":235},[33,78462,1635],{"class":163},[33,78464,68213],{"class":50},[33,78466,72709],{"class":167},[33,78468,68219],{"class":54},[33,78470,1649],{"class":167},[33,78472,495],{"class":163},[33,78474,67176],{"class":167},[33,78476,78477,78479,78481],{"class":35,"line":250},[33,78478,62484],{"class":167},[33,78480,242],{"class":163},[33,78482,68235],{"class":167},[33,78484,78485,78487,78489,78491],{"class":35,"line":266},[33,78486,73710],{"class":167},[33,78488,242],{"class":163},[33,78490,4037],{"class":50},[33,78492,70691],{"class":167},[33,78494,78495],{"class":35,"line":290},[33,78496,92],{"emptyLinePlaceholder":91},[33,78498,78499],{"class":35,"line":295},[33,78500,78501],{"class":39},"        # Collect top-level bookmarks (skip nested lists)\n",[33,78503,78504,78507,78509],{"class":35,"line":300},[33,78505,78506],{"class":167},"        top_level ",[33,78508,242],{"class":163},[33,78510,7473],{"class":167},[33,78512,78513,78516,78518,78520,78522],{"class":35,"line":317},[33,78514,78515],{"class":167},"            
item ",[33,78517,6124],{"class":163},[33,78519,54203],{"class":167},[33,78521,662],{"class":163},[33,78523,72089],{"class":167},[33,78525,78526,78528,78530,78532,78534],{"class":35,"line":332},[33,78527,5995],{"class":163},[33,78529,75961],{"class":50},[33,78531,72109],{"class":167},[33,78533,62210],{"class":54},[33,78535,221],{"class":167},[33,78537,78538],{"class":35,"line":347},[33,78539,19722],{"class":167},[33,78541,78542,78544,78546],{"class":35,"line":374},[33,78543,8221],{"class":163},[33,78545,620],{"class":163},[33,78547,78548],{"class":167}," top_level:\n",[33,78550,78551,78553,78555,78558],{"class":35,"line":397},[33,78552,9364],{"class":50},[33,78554,602],{"class":167},[33,78556,78557],{"class":54},"\"No top-level bookmarks found; nothing to split on.\"",[33,78559,221],{"class":167},[33,78561,78562,78564],{"class":35,"line":653},[33,78563,28782],{"class":163},[33,78565,73034],{"class":167},[33,78567,78568],{"class":35,"line":667},[33,78569,92],{"emptyLinePlaceholder":91},[33,78571,78572],{"class":35,"line":675},[33,78573,78574],{"class":39},"        # Build (start_page_0based, title) pairs\n",[33,78576,78577,78580,78582,78584,78586,78588,78590],{"class":35,"line":689},[33,78578,78579],{"class":167},"        sections: list[tuple[",[33,78581,1059],{"class":50},[33,78583,365],{"class":167},[33,78585,1053],{"class":50},[33,78587,13081],{"class":167},[33,78589,242],{"class":163},[33,78591,589],{"class":167},[33,78593,78594,78596,78598,78600],{"class":35,"line":703},[33,78595,5973],{"class":163},[33,78597,54203],{"class":167},[33,78599,662],{"class":163},[33,78601,78548],{"class":167},[33,78603,78604,78607,78609],{"class":35,"line":714},[33,78605,78606],{"class":167},"            pg_0based ",[33,78608,242],{"class":163},[33,78610,78611],{"class":167}," reader.get_destination_page_number(item)\n",[33,78613,78614],{"class":35,"line":723},[33,78615,78616],{"class":167},"            sections.append((pg_0based, 
item.title))\n",[33,78618,78619],{"class":35,"line":754},[33,78620,92],{"emptyLinePlaceholder":91},[33,78622,78623],{"class":35,"line":771},[33,78624,78625],{"class":39},"        # Each section ends one page before the next section starts\n",[33,78627,78628,78630,78633,78635,78637],{"class":35,"line":777},[33,78629,5973],{"class":163},[33,78631,78632],{"class":167}," i, (start_idx, title) ",[33,78634,662],{"class":163},[33,78636,7403],{"class":50},[33,78638,62185],{"class":167},[33,78640,78641,78643,78645,78648,78650,78652,78654,78656,78658,78660,78662,78664,78666,78669,78671,78674,78676],{"class":35,"line":788},[33,78642,78125],{"class":167},[33,78644,242],{"class":163},[33,78646,78647],{"class":167}," sections[i ",[33,78649,1811],{"class":163},[33,78651,1814],{"class":50},[33,78653,44179],{"class":167},[33,78655,748],{"class":50},[33,78657,763],{"class":167},[33,78659,2491],{"class":163},[33,78661,47269],{"class":167},[33,78663,1811],{"class":163},[33,78665,1814],{"class":50},[33,78667,78668],{"class":163}," \u003C",[33,78670,4037],{"class":50},[33,78672,78673],{"class":167},"(sections) ",[33,78675,7489],{"class":163},[33,78677,78678],{"class":167}," total\n",[33,78680,78681,78683,78686,78688],{"class":35,"line":804},[33,78682,5995],{"class":163},[33,78684,78685],{"class":167}," start_idx ",[33,78687,43000],{"class":163},[33,78689,78690],{"class":167}," end_idx:\n",[33,78692,78693,78695,78697,78699,78702,78704,78706,78708,78710,78712],{"class":35,"line":809},[33,78694,8264],{"class":50},[33,78696,602],{"class":167},[33,78698,4059],{"class":163},[33,78700,78701],{"class":54},"\"[SKIP] Empty section: 
",[33,78703,1115],{"class":50},[33,78705,2549],{"class":167},[33,78707,76954],{"class":163},[33,78709,1121],{"class":50},[33,78711,274],{"class":54},[33,78713,221],{"class":167},[33,78715,78716],{"class":35,"line":819},[33,78717,12315],{"class":163},[33,78719,78720],{"class":35,"line":829},[33,78721,92],{"emptyLinePlaceholder":91},[33,78723,78724,78726,78728],{"class":35,"line":834},[33,78725,70275],{"class":167},[33,78727,242],{"class":163},[33,78729,67154],{"class":167},[33,78731,78732,78734,78736,78738,78740],{"class":35,"line":839},[33,78733,1793],{"class":163},[33,78735,76465],{"class":167},[33,78737,662],{"class":163},[33,78739,1801],{"class":50},[33,78741,78161],{"class":167},[33,78743,78744],{"class":35,"line":860},[33,78745,78746],{"class":167},"                writer.add_page(reader.pages[page_idx])\n",[33,78748,78749],{"class":35,"line":887},[33,78750,92],{"emptyLinePlaceholder":91},[33,78752,78753],{"class":35,"line":907},[33,78754,78755],{"class":39},"            # Sanitize title for use as filename\n",[33,78757,78758,78761,78763,78765,78768,78770,78773,78775,78777,78779,78782,78784,78787,78789,78791,78793],{"class":35,"line":1826},[33,78759,78760],{"class":167},"            safe_name ",[33,78762,242],{"class":163},[33,78764,9892],{"class":54},[33,78766,78767],{"class":167},".join(c ",[33,78769,2491],{"class":163},[33,78771,78772],{"class":167}," c.isalnum() ",[33,78774,7162],{"class":163},[33,78776,7486],{"class":167},[33,78778,662],{"class":163},[33,78780,78781],{"class":54}," \" _-\"",[33,78783,15715],{"class":163},[33,78785,78786],{"class":54}," \"_\"",[33,78788,14766],{"class":163},[33,78790,7486],{"class":167},[33,78792,662],{"class":163},[33,78794,78795],{"class":167}," 
title)\n",[33,78797,78798,78800,78802,78804,78806,78808,78810,78812,78814,78816,78818,78820,78822,78825,78827,78830,78832,78834,78836],{"class":35,"line":1844},[33,78799,73805],{"class":167},[33,78801,242],{"class":163},[33,78803,6393],{"class":167},[33,78805,1351],{"class":163},[33,78807,1110],{"class":163},[33,78809,274],{"class":54},[33,78811,1115],{"class":50},[33,78813,11017],{"class":167},[33,78815,1811],{"class":163},[33,78817,1814],{"class":50},[33,78819,72915],{"class":163},[33,78821,1121],{"class":50},[33,78823,78824],{"class":54},"_",[33,78826,1115],{"class":50},[33,78828,78829],{"class":167},"safe_name[:",[33,78831,26323],{"class":50},[33,78833,9546],{"class":167},[33,78835,1121],{"class":50},[33,78837,19246],{"class":54},[33,78839,78840,78842,78844,78846,78848,78850,78852],{"class":35,"line":1858},[33,78841,678],{"class":163},[33,78843,68213],{"class":50},[33,78845,72929],{"class":167},[33,78847,67169],{"class":54},[33,78849,1649],{"class":167},[33,78851,495],{"class":163},[33,78853,69097],{"class":167},[33,78855,78856],{"class":35,"line":1871},[33,78857,70388],{"class":167},[33,78859,78860],{"class":35,"line":1877},[33,78861,70393],{"class":167},[33,78863,78864],{"class":35,"line":1883},[33,78865,73867],{"class":167},[33,78867,78868],{"class":35,"line":1915},[33,78869,78870],{"class":39},"            # Display in 1-based page numbers\n",[33,78872,78873,78875,78877,78879,78881,78883,78885,78887,78889,78892,78894,78896,78898,78901,78903,78905,78907,78909,78911,78913,78915,78917,78920,78922,78924,78926,78928],{"class":35,"line":1926},[33,78874,9364],{"class":50},[33,78876,602],{"class":167},[33,78878,4059],{"class":163},[33,78880,53685],{"class":54},[33,78882,1115],{"class":50},[33,78884,7499],{"class":167},[33,78886,1811],{"class":163},[33,78888,40161],{"class":50},[33,78890,78891],{"class":54},"] '",[33,78893,1115],{"class":50},[33,78895,2549],{"class":167},[33,78897,1121],{"class":50},[33,78899,78900],{"class":54},"' 
(p",[33,78902,1115],{"class":50},[33,78904,78349],{"class":167},[33,78906,1811],{"class":163},[33,78908,40161],{"class":50},[33,78910,77723],{"class":54},[33,78912,1115],{"class":50},[33,78914,78276],{"class":167},[33,78916,1121],{"class":50},[33,78918,78919],{"class":54},") → ",[33,78921,1115],{"class":50},[33,78923,75469],{"class":167},[33,78925,1121],{"class":50},[33,78927,274],{"class":54},[33,78929,221],{"class":167},[33,78931,78932],{"class":35,"line":1932},[33,78933,92],{"emptyLinePlaceholder":91},[33,78935,78936,78938],{"class":35,"line":1938},[33,78937,1332],{"class":163},[33,78939,73034],{"class":167},[33,78941,78942],{"class":35,"line":1950},[33,78943,92],{"emptyLinePlaceholder":91},[33,78945,78946],{"class":35,"line":1958},[33,78947,92],{"emptyLinePlaceholder":91},[33,78949,78950,78952,78954,78956,78958],{"class":35,"line":4904},[33,78951,2491],{"class":163},[33,78953,2494],{"class":50},[33,78955,2497],{"class":163},[33,78957,2500],{"class":54},[33,78959,574],{"class":167},[33,78961,78962,78965,78968,78970,78973],{"class":35,"line":4909},[33,78963,78964],{"class":167},"    split_on_bookmarks(Path(",[33,78966,78967],{"class":54},"\".\u002Freport_with_bookmarks.pdf\"",[33,78969,71978],{"class":167},[33,78971,78972],{"class":54},"\".\u002Fsections\"",[33,78974,371],{"class":167},[14,78976,78977,78980],{},[30,78978,78979],{},"get_destination_page_number()"," returns a 0-based index — no subtraction needed since we use it directly as a slice start, not from user input.",[18,78982,9247],{"id":9246},[23,78984,78986],{"className":126,"code":78985,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pypdf import PdfReader\nfrom pathlib import Path\n\n\ndef verify_splits(\n    source: Path,\n    output_dir: Path,\n    ranges: list[tuple[int, int]],\n) -> bool:\n    \"\"\"\n    Confirm each split file has the expected page count.\n    Assumes files are named *_part01.pdf, *_part02.pdf, ...\n    \"\"\"\n    all_ok = True\n    for idx, (start, end) in 
enumerate(ranges, 1):\n        expected = end - start + 1\n        out_path = output_dir \u002F f\"{source.stem}_part{idx:02d}.pdf\"\n        try:\n            actual = len(PdfReader(out_path).pages)\n            status = \"OK\" if actual == expected else f\"FAIL (expected {expected}, got {actual})\"\n        except Exception as exc:\n            status = f\"FAIL ({exc})\"\n            actual = -1\n        print(f\"  part{idx:02d}: {status}\")\n        if \"FAIL\" in status:\n            all_ok = False\n    return all_ok\n\n\nif __name__ == \"__main__\":\n    source = Path(\".\u002Fannual_report.pdf\")\n    ranges = [(1, 10), (11, 25), (26, 40)]\n    split_by_ranges(source, Path(\".\u002Foutput\u002Fsplits\"), ranges)\n    ok = verify_splits(source, Path(\".\u002Foutput\u002Fsplits\"), ranges)\n    print(\"All splits verified.\" if ok else \"Some splits failed verification.\")\n",[30,78987,78988,78992,79002,79012,79016,79020,79029,79034,79038,79050,79058,79062,79067,79072,79076,79085,79101,79118,79152,79158,79170,79212,79222,79241,79251,79281,79293,79302,79309,79313,79317,79329,79342,79376,79386,79400],{"__ignoreMap":28},[33,78989,78990],{"class":35,"line":36},[33,78991,57316],{"class":39},[33,78993,78994,78996,78998,79000],{"class":35,"line":43},[33,78995,190],{"class":163},[33,78997,57333],{"class":167},[33,78999,164],{"class":163},[33,79001,57338],{"class":167},[33,79003,79004,79006,79008,79010],{"class":35,"line":61},[33,79005,190],{"class":163},[33,79007,193],{"class":167},[33,79009,164],{"class":163},[33,79011,198],{"class":167},[33,79013,79014],{"class":35,"line":73},[33,79015,92],{"emptyLinePlaceholder":91},[33,79017,79018],{"class":35,"line":88},[33,79019,92],{"emptyLinePlaceholder":91},[33,79021,79022,79024,79027],{"class":35,"line":95},[33,79023,562],{"class":163},[33,79025,79026],{"class":46}," verify_splits",[33,79028,7637],{"class":167},[33,79030,79031],{"class":35,"line":101},[33,79032,79033],{"class":167},"    source: 
Path,\n",[33,79035,79036],{"class":35,"line":171},[33,79037,72624],{"class":167},[33,79039,79040,79042,79044,79046,79048],{"class":35,"line":179},[33,79041,72629],{"class":167},[33,79043,1059],{"class":50},[33,79045,365],{"class":167},[33,79047,1059],{"class":50},[33,79049,47404],{"class":167},[33,79051,79052,79054,79056],{"class":35,"line":187},[33,79053,1617],{"class":167},[33,79055,2821],{"class":50},[33,79057,574],{"class":167},[33,79059,79060],{"class":35,"line":201},[33,79061,7673],{"class":54},[33,79063,79064],{"class":35,"line":206},[33,79065,79066],{"class":54},"    Confirm each split file has the expected page count.\n",[33,79068,79069],{"class":35,"line":224},[33,79070,79071],{"class":54},"    Assumes files are named *_part01.pdf, *_part02.pdf, ...\n",[33,79073,79074],{"class":35,"line":229},[33,79075,7673],{"class":54},[33,79077,79078,79081,79083],{"class":35,"line":235},[33,79079,79080],{"class":167},"    all_ok ",[33,79082,242],{"class":163},[33,79084,2887],{"class":50},[33,79086,79087,79089,79091,79093,79095,79097,79099],{"class":35,"line":250},[33,79088,656],{"class":163},[33,79090,72748],{"class":167},[33,79092,662],{"class":163},[33,79094,7403],{"class":50},[33,79096,72755],{"class":167},[33,79098,734],{"class":50},[33,79100,1737],{"class":167},[33,79102,79103,79106,79108,79110,79112,79114,79116],{"class":35,"line":266},[33,79104,79105],{"class":167},"        expected ",[33,79107,242],{"class":163},[33,79109,72779],{"class":167},[33,79111,4126],{"class":163},[33,79113,72770],{"class":167},[33,79115,1811],{"class":163},[33,79117,17709],{"class":50},[33,79119,79120,79123,79125,79127,79129,79131,79133,79135,79138,79140,79142,79144,79146,79148,79150],{"class":35,"line":290},[33,79121,79122],{"class":167},"        out_path 
",[33,79124,242],{"class":163},[33,79126,6393],{"class":167},[33,79128,1351],{"class":163},[33,79130,1110],{"class":163},[33,79132,274],{"class":54},[33,79134,1115],{"class":50},[33,79136,79137],{"class":167},"source.stem",[33,79139,1121],{"class":50},[33,79141,72907],{"class":54},[33,79143,1115],{"class":50},[33,79145,72912],{"class":167},[33,79147,72915],{"class":163},[33,79149,1121],{"class":50},[33,79151,19246],{"class":54},[33,79153,79154,79156],{"class":35,"line":295},[33,79155,670],{"class":163},[33,79157,574],{"class":167},[33,79159,79160,79163,79165,79167],{"class":35,"line":300},[33,79161,79162],{"class":167},"            actual ",[33,79164,242],{"class":163},[33,79166,4037],{"class":50},[33,79168,79169],{"class":167},"(PdfReader(out_path).pages)\n",[33,79171,79172,79174,79176,79179,79181,79183,79185,79188,79190,79192,79195,79197,79200,79202,79204,79206,79208,79210],{"class":35,"line":317},[33,79173,68282],{"class":167},[33,79175,242],{"class":163},[33,79177,79178],{"class":54}," \"OK\"",[33,79180,9994],{"class":163},[33,79182,25170],{"class":167},[33,79184,1865],{"class":163},[33,79186,79187],{"class":167}," expected ",[33,79189,7489],{"class":163},[33,79191,1110],{"class":163},[33,79193,79194],{"class":54},"\"FAIL (expected ",[33,79196,1115],{"class":50},[33,79198,79199],{"class":167},"expected",[33,79201,1121],{"class":50},[33,79203,21519],{"class":54},[33,79205,1115],{"class":50},[33,79207,25201],{"class":167},[33,79209,1121],{"class":50},[33,79211,17841],{"class":54},[33,79213,79214,79216,79218,79220],{"class":35,"line":332},[33,79215,780],{"class":163},[33,79217,783],{"class":50},[33,79219,1852],{"class":163},[33,79221,1855],{"class":167},[33,79223,79224,79226,79228,79230,79233,79235,79237,79239],{"class":35,"line":347},[33,79225,68282],{"class":167},[33,79227,242],{"class":163},[33,79229,1110],{"class":163},[33,79231,79232],{"class":54},"\"FAIL 
(",[33,79234,1115],{"class":50},[33,79236,6565],{"class":167},[33,79238,1121],{"class":50},[33,79240,17841],{"class":54},[33,79242,79243,79245,79247,79249],{"class":35,"line":374},[33,79244,79162],{"class":167},[33,79246,242],{"class":163},[33,79248,39025],{"class":163},[33,79250,15116],{"class":50},[33,79252,79253,79255,79257,79259,79261,79263,79265,79267,79269,79271,79273,79275,79277,79279],{"class":35,"line":397},[33,79254,9414],{"class":50},[33,79256,602],{"class":167},[33,79258,4059],{"class":163},[33,79260,75437],{"class":54},[33,79262,1115],{"class":50},[33,79264,72912],{"class":167},[33,79266,72915],{"class":163},[33,79268,1121],{"class":50},[33,79270,2079],{"class":54},[33,79272,1115],{"class":50},[33,79274,68351],{"class":167},[33,79276,1121],{"class":50},[33,79278,274],{"class":54},[33,79280,221],{"class":167},[33,79282,79283,79285,79288,79290],{"class":35,"line":653},[33,79284,8221],{"class":163},[33,79286,79287],{"class":54}," \"FAIL\"",[33,79289,8002],{"class":163},[33,79291,79292],{"class":167}," status:\n",[33,79294,79295,79298,79300],{"class":35,"line":667},[33,79296,79297],{"class":167},"            all_ok ",[33,79299,242],{"class":163},[33,79301,2903],{"class":50},[33,79303,79304,79306],{"class":35,"line":675},[33,79305,1332],{"class":163},[33,79307,79308],{"class":167}," all_ok\n",[33,79310,79311],{"class":35,"line":689},[33,79312,92],{"emptyLinePlaceholder":91},[33,79314,79315],{"class":35,"line":703},[33,79316,92],{"emptyLinePlaceholder":91},[33,79318,79319,79321,79323,79325,79327],{"class":35,"line":714},[33,79320,2491],{"class":163},[33,79322,2494],{"class":50},[33,79324,2497],{"class":163},[33,79326,2500],{"class":54},[33,79328,574],{"class":167},[33,79330,79331,79334,79336,79338,79340],{"class":35,"line":723},[33,79332,79333],{"class":167},"    source 
",[33,79335,242],{"class":163},[33,79337,215],{"class":167},[33,79339,73062],{"class":54},[33,79341,221],{"class":167},[33,79343,79344,79347,79349,79352,79354,79356,79358,79360,79362,79364,79366,79368,79370,79372,79374],{"class":35,"line":754},[33,79345,79346],{"class":167},"    ranges ",[33,79348,242],{"class":163},[33,79350,79351],{"class":167}," [(",[33,79353,734],{"class":50},[33,79355,365],{"class":167},[33,79357,3545],{"class":50},[33,79359,19834],{"class":167},[33,79361,17260],{"class":50},[33,79363,365],{"class":167},[33,79365,76716],{"class":50},[33,79367,19834],{"class":167},[33,79369,11164],{"class":50},[33,79371,365],{"class":167},[33,79373,26323],{"class":50},[33,79375,7767],{"class":167},[33,79377,79378,79381,79383],{"class":35,"line":771},[33,79379,79380],{"class":167},"    split_by_ranges(source, Path(",[33,79382,73071],{"class":54},[33,79384,79385],{"class":167},"), ranges)\n",[33,79387,79388,79391,79393,79396,79398],{"class":35,"line":777},[33,79389,79390],{"class":167},"    ok ",[33,79392,242],{"class":163},[33,79394,79395],{"class":167}," verify_splits(source, Path(",[33,79397,73071],{"class":54},[33,79399,79385],{"class":167},[33,79401,79402,79404,79406,79409,79411,79414,79416,79419],{"class":35,"line":788},[33,79403,7268],{"class":50},[33,79405,602],{"class":167},[33,79407,79408],{"class":54},"\"All splits verified.\"",[33,79410,9994],{"class":163},[33,79412,79413],{"class":167}," ok ",[33,79415,7489],{"class":163},[33,79417,79418],{"class":54}," \"Some splits failed verification.\"",[33,79420,221],{"class":167},[14,79422,79423],{},"The total pages across all split files should equal the number of pages covered by all ranges. 
If ranges overlap, a page may appear in multiple outputs — that is valid but worth logging explicitly.",[14,79425,79426,79427,79429,79430,3035],{},"After splitting, the individual files can be watermarked or password-protected per recipient using the patterns in ",[940,79428,65967],{"href":65966},", or re-assembled with cover pages via ",[940,79431,26191],{"href":19001},[18,79433,48994],{"id":29070},[4273,79435,79436,79447],{},[4276,79437,79438],{},[4279,79439,79440,79443,79445],{},[4282,79441,79442],{},"Mistake",[4282,79444,4284],{},[4282,79446,4290],{},[4292,79448,79449,79466,79484,79504,79523],{},[4279,79450,79451,79457,79460],{},[4297,79452,79453,75812,79455],{},[30,79454,75811],{},[30,79456,7409],{},[4297,79458,79459],{},"First page of each range is page 2 in the output; last real page is skipped",[4297,79461,17059,79462,79465],{},[30,79463,79464],{},"range(start - 1, end)"," — subtract 1 from start only",[4279,79467,79468,79474,79477],{},[4297,79469,79470,79471,79473],{},"Reusing one ",[30,79472,70025],{}," across ranges",[4297,79475,79476],{},"All ranges concatenated into a single output file",[4297,79478,79479,79480,79483],{},"Create a new ",[30,79481,79482],{},"PdfWriter()"," inside the loop, before each range",[4279,79485,79486,79492,79498],{},[4297,79487,79488,79489],{},"Not checking ",[30,79490,79491],{},"end \u003C= total",[4297,79493,79494,79495],{},"Silent extra blank pages or ",[30,79496,79497],{},"IndexError",[4297,79499,79500,79501,70980],{},"Validate each range against ",[30,79502,79503],{},"len(reader.pages)",[4279,79505,79506,79512,79517],{},[4297,79507,79508,79509,79511],{},"Open ",[30,79510,68108],{}," across all chunks",[4297,79513,79514,79516],{},[30,79515,68035],{}," on Windows; memory growth",[4297,79518,79519,79520,79522],{},"Open with ",[30,79521,74588],{}," and close after each range group",[4279,79524,79525,79531,79534],{},[4297,79526,79527,79528,79530],{},"Using ",[30,79529,78979],{}," output as 
1-based",[4297,79532,79533],{},"Off-by-one in bookmark splits",[4297,79535,79536,79537,79539],{},"That method already returns a 0-based index; use it directly in ",[30,79538,75830],{}," without subtracting",[18,79541,36626],{"id":36625},[14,79543,79544,79547,79548,75812,79550,79552,79553,6242,79555,79557,79558,79560],{},[1974,79545,79546],{},"Why does my split file have one fewer page than expected?","\nYou used ",[30,79549,75811],{},[30,79551,72824],{}," instead of 0-based. The fix is ",[30,79554,79464],{},[30,79556,72824],{}," stays unchanged because ",[30,79559,75830],{}," excludes its upper bound, which exactly cancels out the needed off-by-one.",[14,79562,79563,79566,79567,79569],{},[1974,79564,79565],{},"Can I split overlapping ranges (e.g., pages 1–5 and 3–8)?","\nYes. Each range gets its own ",[30,79568,70025],{}," and output file. Pages 3–5 will appear in both outputs. This is intentional for use cases like generating an \"executive summary\" (pages 1–5) that overlaps a \"full section\" (pages 3–8).",[14,79571,79572,79575,79576,79578],{},[1974,79573,79574],{},"How do I split a scanned PDF where there are no bookmarks?","\nUse the range-based approach with manually specified page boundaries, or run OCR first with the pattern in ",[940,79577,36756],{"href":26957}," to detect section boundaries from text content.",[14,79580,79581,79584,79586,79587,79590,79591,79594],{},[1974,79582,79583],{},"Does splitting strip PDF forms or annotations?",[30,79585,71123],{}," does a shallow copy that preserves page-level annotations (highlights, comments, form fields on that page). 
Interactive form state (",[30,79588,79589],{},"\u002FAcroForm",") at the document level is not copied — re-attach it with ",[30,79592,79593],{},"writer.clone_reader_document_root(reader)"," if needed.",[18,79596,6918],{"id":6917},[4211,79598,79599,79604,79609],{},[4214,79600,79601,79603],{},[940,79602,52682],{"href":52681}," — full reference including bookmark preservation, page reordering, and large-batch streaming",[4214,79605,79606,79608],{},[940,79607,68020],{"href":75698}," — the inverse operation: merge a folder of PDFs with natural sort and error recovery",[4214,79610,79611,79613],{},[940,79612,65967],{"href":65966}," — apply per-recipient watermarks or password protection to the split output files",[14,79615,6947,79616,3035],{},[940,79617,52682],{"href":52681},[6953,79619,71089],{},{"title":28,"searchDepth":43,"depth":43,"links":79621},[79622,79623,79624,79625,79626,79627,79628,79629,79630,79631],{"id":75793,"depth":43,"text":75794},{"id":75839,"depth":43,"text":75840},{"id":76080,"depth":43,"text":76081},{"id":76739,"depth":43,"text":76740},{"id":77889,"depth":43,"text":77890},{"id":78353,"depth":43,"text":78354},{"id":9246,"depth":43,"text":9247},{"id":29070,"depth":43,"text":48994},{"id":36625,"depth":43,"text":36626},{"id":6917,"depth":43,"text":6918},"Split by Page Ranges","Split a PDF into multiple files by page ranges with pypdf. 
Parse a ranges string, fix 1-based vs 0-based off-by-one errors, split every N pages, and split on bookmarks.",{},"\u002Fautomating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Fsplit-pdf-by-page-ranges-with-python",{"title":71076,"description":79633},"Split PDF by Page Ranges with Python — pypdf","automating-pdf-extraction-generation\u002Fmerging-and-splitting-pdf-documents\u002Fsplit-pdf-by-page-ranges-with-python\u002Findex",[9631,47,65045,79640],"split pdf","B16LTSnMuKrebas4ynm_3ok9o-lQ6un9Vx_dMQatG1A",{"id":79643,"title":65223,"body":79644,"breadcrumbTitle":81731,"canonical":6977,"date":6978,"description":81732,"draft":6980,"extension":6981,"image":6977,"meta":81733,"navigation":91,"path":81734,"robots":6977,"seo":81735,"seoTitle":81736,"stem":81737,"tags":81738,"updatedAt":6978,"__hash__":81740},"content\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Ffix-tesseract-not-found-error\u002Findex.md",{"type":7,"value":79645,"toc":81705},[79646,79649,79655,79661,79671,79673,79698,79716,79718,79721,79840,79851,79855,79859,79900,79903,79927,79941,79943,79995,80009,80011,80059,80068,80074,80078,80088,80091,80106,80109,80119,80126,80132,80332,80338,80433,80440,80443,80713,80721,80725,80738,80808,80811,80833,80838,80863,80867,80874,80989,80996,80999,81005,81012,81072,81075,81245,81253,81256,81262,81268,81302,81305,81347,81353,81361,81364,81370,81373,81386,81392,81417,81419,81422,81673,81679,81681,81698,81702],[10,79647,65223],{"id":79648},"fix-tesseractnotfounderror-in-python",[14,79650,43155,79651,79654],{},[30,79652,79653],{},"pytesseract.image_to_string()"," raises the following error the first time it is run in a fresh environment:",[23,79656,79659],{"className":79657,"code":79658,"language":2000},[1998],"pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your PATH\n",[30,79660,79658],{"__ignoreMap":28},[14,79662,79663,79664,79667,79668,79670],{},"The error 
means ",[30,79665,79666],{},"pytesseract"," — the Python wrapper — is installed, but the Tesseract binary it wraps is not. The two are separate: ",[30,79669,65090],{}," installs only the Python adapter; the actual OCR engine must be installed at the OS level independently.",[18,79672,7021],{"id":7020},[14,79674,79675,79677,79678,79680,79681,79684,79685,79688,79689,53908,79691,79694,79695,79697],{},[30,79676,79666],{}," works by shelling out to the ",[30,79679,46820],{}," executable. When you call ",[30,79682,79683],{},"image_to_string",", it runs ",[30,79686,79687],{},"subprocess.check_output([\"tesseract\", ...])"," under the hood. If the binary is absent from ",[30,79690,122],{},[30,79692,79693],{},"TesseractNotFoundError"," before any image processing happens. Installing or updating ",[30,79696,79666],{}," alone cannot fix this — only installing the binary (or telling pytesseract where to find an existing one) resolves it.",[14,79699,79700,79701,79703,79704,79706,79707,79709,79710,79712,79713,79715],{},"The architecture looks like this: your Python script → ",[30,79702,79666],{}," (PyPI package) → ",[30,79705,46820],{}," OS binary → tessdata language files. All three layers beneath your script must be present. 
A fresh virtual environment does not inherit system packages, but it does inherit the system ",[30,79708,122],{},", so if ",[30,79711,46820],{}," is installed at the system level, ",[30,79714,79666],{}," inside a venv will find it without any extra configuration.",[18,79717,35017],{"id":35016},[14,79719,79720],{},"Confirm the binary is missing before reaching for any fix:",[23,79722,79724],{"className":126,"code":79723,"language":47,"meta":28,"style":28},"# pip install pytesseract\nimport subprocess\nimport pytesseract\n\n# Check 1: is the binary on PATH at all?\ntry:\n    out = subprocess.check_output([\"tesseract\", \"--version\"], text=True)\n    print(\"Binary found:\", out.splitlines()[0])\nexcept FileNotFoundError:\n    print(\"Binary NOT found on PATH\")\n\n# Check 2: what path is pytesseract looking for?\nprint(\"pytesseract is looking for:\", pytesseract.pytesseract.tesseract_cmd)\n",[30,79725,79726,79730,79736,79742,79746,79751,79757,79784,79800,79808,79819,79823,79828],{"__ignoreMap":28},[33,79727,79728],{"class":35,"line":36},[33,79729,47160],{"class":39},[33,79731,79732,79734],{"class":35,"line":43},[33,79733,164],{"class":163},[33,79735,35040],{"class":167},[33,79737,79738,79740],{"class":35,"line":61},[33,79739,164],{"class":163},[33,79741,47178],{"class":167},[33,79743,79744],{"class":35,"line":73},[33,79745,92],{"emptyLinePlaceholder":91},[33,79747,79748],{"class":35,"line":88},[33,79749,79750],{"class":39},"# Check 1: is the binary on PATH at all?\n",[33,79752,79753,79755],{"class":35,"line":95},[33,79754,35574],{"class":163},[33,79756,574],{"class":167},[33,79758,79759,79761,79763,79766,79769,79771,79774,79776,79778,79780,79782],{"class":35,"line":101},[33,79760,17989],{"class":167},[33,79762,242],{"class":163},[33,79764,79765],{"class":167}," 
subprocess.check_output([",[33,79767,79768],{"class":54},"\"tesseract\"",[33,79770,365],{"class":167},[33,79772,79773],{"class":54},"\"--version\"",[33,79775,8314],{"class":167},[33,79777,2000],{"class":238},[33,79779,242],{"class":163},[33,79781,855],{"class":50},[33,79783,221],{"class":167},[33,79785,79786,79788,79790,79793,79796,79798],{"class":35,"line":171},[33,79787,7268],{"class":50},[33,79789,602],{"class":167},[33,79791,79792],{"class":54},"\"Binary found:\"",[33,79794,79795],{"class":167},", out.splitlines()[",[33,79797,748],{"class":50},[33,79799,751],{"class":167},[33,79801,79802,79804,79806],{"class":35,"line":179},[33,79803,35726],{"class":163},[33,79805,2945],{"class":50},[33,79807,574],{"class":167},[33,79809,79810,79812,79814,79817],{"class":35,"line":187},[33,79811,7268],{"class":50},[33,79813,602],{"class":167},[33,79815,79816],{"class":54},"\"Binary NOT found on PATH\"",[33,79818,221],{"class":167},[33,79820,79821],{"class":35,"line":201},[33,79822,92],{"emptyLinePlaceholder":91},[33,79824,79825],{"class":35,"line":206},[33,79826,79827],{"class":39},"# Check 2: what path is pytesseract looking for?\n",[33,79829,79830,79832,79834,79837],{"class":35,"line":224},[33,79831,13474],{"class":50},[33,79833,602],{"class":167},[33,79835,79836],{"class":54},"\"pytesseract is looking for:\"",[33,79838,79839],{"class":167},", pytesseract.pytesseract.tesseract_cmd)\n",[14,79841,79842,79843,79846,79847,79850],{},"If Check 1 prints ",[30,79844,79845],{},"Binary NOT found on PATH",", install the binary (see below). 
If the binary exists but Check 2 shows a wrong path, set ",[30,79848,79849],{},"tesseract_cmd"," directly.",[18,79852,79854],{"id":79853},"fix-1-install-the-tesseract-binary","Fix 1 — Install the Tesseract Binary",[424,79856,79858],{"id":79857},"linux-ubuntu-debian","Linux (Ubuntu \u002F Debian)",[23,79860,79862],{"className":25,"code":79861,"language":27,"meta":28,"style":28},"sudo apt-get update\nsudo apt-get install -y tesseract-ocr\n\n# Verify\ntesseract --version\n",[30,79863,79864,79873,79886,79890,79894],{"__ignoreMap":28},[33,79865,79866,79868,79870],{"class":35,"line":36},[33,79867,9669],{"class":46},[33,79869,9672],{"class":54},[33,79871,79872],{"class":54}," update\n",[33,79874,79875,79877,79879,79881,79883],{"class":35,"line":43},[33,79876,9669],{"class":46},[33,79878,9672],{"class":54},[33,79880,79],{"class":54},[33,79882,20912],{"class":50},[33,79884,79885],{"class":54}," tesseract-ocr\n",[33,79887,79888],{"class":35,"line":61},[33,79889,92],{"emptyLinePlaceholder":91},[33,79891,79892],{"class":35,"line":73},[33,79893,98],{"class":39},[33,79895,79896,79898],{"class":35,"line":88},[33,79897,46820],{"class":46},[33,79899,41864],{"class":50},[14,79901,79902],{},"For additional language packs, install them in the same command:",[23,79904,79906],{"className":25,"code":79905,"language":27,"meta":28,"style":28},"sudo apt-get install -y tesseract-ocr-deu tesseract-ocr-fra tesseract-ocr-spa\n",[30,79907,79908],{"__ignoreMap":28},[33,79909,79910,79912,79914,79916,79918,79921,79924],{"class":35,"line":36},[33,79911,9669],{"class":46},[33,79913,9672],{"class":54},[33,79915,79],{"class":54},[33,79917,20912],{"class":50},[33,79919,79920],{"class":54}," tesseract-ocr-deu",[33,79922,79923],{"class":54}," tesseract-ocr-fra",[33,79925,79926],{"class":54}," tesseract-ocr-spa\n",[14,79928,79929,79930,79933,79934,2012,79937,79940],{},"After installation ",[30,79931,79932],{},"tesseract --version"," should print ",[30,79935,79936],{},"tesseract 
4.x.x",[30,79938,79939],{},"5.x.x",". Restart your Python process — no reboot needed.",[424,79942,35293],{"id":35292},[23,79944,79946],{"className":25,"code":79945,"language":27,"meta":28,"style":28},"brew install tesseract\n\n# With extra language packs\nbrew install tesseract-lang\n\n# Verify\ntesseract --version\nwhich tesseract\n",[30,79947,79948,79957,79961,79966,79975,79979,79983,79989],{"__ignoreMap":28},[33,79949,79950,79952,79954],{"class":35,"line":36},[33,79951,35308],{"class":46},[33,79953,79],{"class":54},[33,79955,79956],{"class":54}," tesseract\n",[33,79958,79959],{"class":35,"line":43},[33,79960,92],{"emptyLinePlaceholder":91},[33,79962,79963],{"class":35,"line":61},[33,79964,79965],{"class":39},"# With extra language packs\n",[33,79967,79968,79970,79972],{"class":35,"line":73},[33,79969,35308],{"class":46},[33,79971,79],{"class":54},[33,79973,79974],{"class":54}," tesseract-lang\n",[33,79976,79977],{"class":35,"line":88},[33,79978,92],{"emptyLinePlaceholder":91},[33,79980,79981],{"class":35,"line":95},[33,79982,98],{"class":39},[33,79984,79985,79987],{"class":35,"line":101},[33,79986,46820],{"class":46},[33,79988,41864],{"class":50},[33,79990,79991,79993],{"class":35,"line":171},[33,79992,35269],{"class":50},[33,79994,79956],{"class":54},[14,79996,79997,79998,80001,80002,80005,80006,80008],{},"Homebrew installs to ",[30,79999,80000],{},"\u002Fusr\u002Flocal\u002Fbin\u002Ftesseract"," (Intel) or ",[30,80003,80004],{},"\u002Fopt\u002Fhomebrew\u002Fbin\u002Ftesseract"," (Apple Silicon). Both are on ",[30,80007,122],{}," automatically in a standard Homebrew setup.",[424,80010,35385],{"id":35384},[35387,80012,80013,80020,80026,80056],{},[4214,80014,80015,80016],{},"Download the installer from the UB Mannheim builds: ",[940,80017,80018],{"href":80018,"rel":80019},"https:\u002F\u002Fgithub.com\u002FUB-Mannheim\u002Ftesseract\u002Fwiki",[1367],[4214,80021,80022,80023,3035],{},"Run the installer. 
Take note of the install path, typically ",[30,80024,80025],{},"C:\\Program Files\\Tesseract-OCR\\",[4214,80027,80028,80029,574,80031],{},"Add that folder to your system ",[30,80030,122],{},[4211,80032,80033,80038,80050],{},[4214,80034,79508,80035,3035],{},[1974,80036,80037],{},"System Properties → Environment Variables",[4214,80039,80040,80041,80043,80044,69863,80047,3035],{},"Select ",[1974,80042,35779],{}," under System variables → ",[1974,80045,80046],{},"Edit",[1974,80048,80049],{},"New",[4214,80051,80052,80053,80055],{},"Paste ",[30,80054,80025],{}," → OK all dialogs.",[4214,80057,80058],{},"Open a new terminal (the current one will not pick up the change) and verify:",[23,80060,80062],{"className":35424,"code":80061,"language":35426,"meta":28,"style":28},"tesseract --version\n",[30,80063,80064],{"__ignoreMap":28},[33,80065,80066],{"class":35,"line":36},[33,80067,80061],{},[14,80069,80070,80071,80073],{},"If you cannot modify the system PATH (corporate machine, CI environment), set ",[30,80072,79849],{}," in code instead — see Fix 2 below.",[424,80075,80077],{"id":80076},"windows-path-troubleshooting","Windows PATH Troubleshooting",[14,80079,80080,80081,80084,80085,80087],{},"The most common Windows failure mode: the installer ran, ",[30,80082,80083],{},"C:\\Program Files\\Tesseract-OCR\\tesseract.exe"," exists, but ",[30,80086,79932],{}," still fails in a new PowerShell window. The cause is almost always a session that predates the PATH change. 
Open a brand-new terminal after editing PATH — Windows does not propagate environment variable changes to already-open sessions.",[14,80089,80090],{},"To check whether the PATH entry was written correctly without closing your current session:",[23,80092,80094],{"className":35424,"code":80093,"language":35426,"meta":28,"style":28},"# List PATH entries that contain \"tesseract\" (case-insensitive)\n$env:PATH -split \";\" | Where-Object { $_ -match \"tesseract\" }\n",[30,80095,80096,80101],{"__ignoreMap":28},[33,80097,80098],{"class":35,"line":36},[33,80099,80100],{},"# List PATH entries that contain \"tesseract\" (case-insensitive)\n",[33,80102,80103],{"class":35,"line":43},[33,80104,80105],{},"$env:PATH -split \";\" | Where-Object { $_ -match \"tesseract\" }\n",[14,80107,80108],{},"If that returns nothing, the PATH entry was either not saved or was added to the User variables of a different account. Repeat the PATH edit, log out, log back in, and try again.",[14,80110,80111,80112,80115,80116,80118],{},"A second Windows-specific gotcha: spaces in the path. If your install is under ",[30,80113,80114],{},"C:\\Program Files\\",", the space is handled correctly by Windows PATH, but older tools that use raw string concatenation may fail. 
Setting ",[30,80117,79849],{}," to the full path (Fix 2) sidesteps this entirely.",[18,80120,80122,80123,80125],{"id":80121},"fix-2-set-tesseract_cmd-in-code","Fix 2 — Set ",[30,80124,79849],{}," in Code",[14,80127,80128,80129,80131],{},"If the binary is installed but not on ",[30,80130,122],{},", or if you need to point to a non-default install location, set the path explicitly before calling any pytesseract function:",[23,80133,80135],{"className":126,"code":80134,"language":47,"meta":28,"style":28},"# pip install pytesseract Pillow\nfrom pathlib import Path\nimport pytesseract\nfrom PIL import Image\n\n# Adjust this path to match your actual install location\n# Windows example:\npytesseract.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n\n# macOS Homebrew (Apple Silicon):\n# pytesseract.pytesseract.tesseract_cmd = \"\u002Fopt\u002Fhomebrew\u002Fbin\u002Ftesseract\"\n\n# Linux custom install:\n# pytesseract.pytesseract.tesseract_cmd = \"\u002Fusr\u002Flocal\u002Fbin\u002Ftesseract\"\n\n# Now calls will succeed\ntry:\n    img = Image.open(Path(\"scan.png\"))\n    text = pytesseract.image_to_string(img)\n    print(text[:200])\nexcept pytesseract.pytesseract.TesseractNotFoundError as exc:\n    print(f\"Still not found at: {pytesseract.pytesseract.tesseract_cmd}\")\n    raise\n",[30,80136,80137,80142,80152,80158,80168,80172,80177,80182,80217,80221,80226,80231,80235,80240,80245,80249,80254,80260,80275,80284,80294,80305,80327],{"__ignoreMap":28},[33,80138,80139],{"class":35,"line":36},[33,80140,80141],{"class":39},"# pip install pytesseract 
Pillow\n",[33,80143,80144,80146,80148,80150],{"class":35,"line":43},[33,80145,190],{"class":163},[33,80147,193],{"class":167},[33,80149,164],{"class":163},[33,80151,198],{"class":167},[33,80153,80154,80156],{"class":35,"line":61},[33,80155,164],{"class":163},[33,80157,47178],{"class":167},[33,80159,80160,80162,80164,80166],{"class":35,"line":73},[33,80161,190],{"class":163},[33,80163,46889],{"class":50},[33,80165,46892],{"class":163},[33,80167,47171],{"class":167},[33,80169,80170],{"class":35,"line":88},[33,80171,92],{"emptyLinePlaceholder":91},[33,80173,80174],{"class":35,"line":95},[33,80175,80176],{"class":39},"# Adjust this path to match your actual install location\n",[33,80178,80179],{"class":35,"line":101},[33,80180,80181],{"class":39},"# Windows example:\n",[33,80183,80184,80187,80189,80191,80194,80197,80200,80203,80206,80209,80212,80214],{"class":35,"line":171},[33,80185,80186],{"class":167},"pytesseract.pytesseract.tesseract_cmd ",[33,80188,242],{"class":163},[33,80190,13035],{"class":163},[33,80192,80193],{"class":54},"\"C:",[33,80195,80196],{"class":12018},"\\P",[33,80198,80199],{"class":54},"rogram Files",[33,80201,80202],{"class":12018},"\\T",[33,80204,80205],{"class":54},"esseract-OCR",[33,80207,80208],{"class":12018},"\\t",[33,80210,80211],{"class":54},"esseract",[33,80213,3035],{"class":50},[33,80215,80216],{"class":54},"exe\"\n",[33,80218,80219],{"class":35,"line":179},[33,80220,92],{"emptyLinePlaceholder":91},[33,80222,80223],{"class":35,"line":187},[33,80224,80225],{"class":39},"# macOS Homebrew (Apple Silicon):\n",[33,80227,80228],{"class":35,"line":201},[33,80229,80230],{"class":39},"# pytesseract.pytesseract.tesseract_cmd = \"\u002Fopt\u002Fhomebrew\u002Fbin\u002Ftesseract\"\n",[33,80232,80233],{"class":35,"line":206},[33,80234,92],{"emptyLinePlaceholder":91},[33,80236,80237],{"class":35,"line":224},[33,80238,80239],{"class":39},"# Linux custom install:\n",[33,80241,80242],{"class":35,"line":229},[33,80243,80244],{"class":39},"# 
pytesseract.pytesseract.tesseract_cmd = \"\u002Fusr\u002Flocal\u002Fbin\u002Ftesseract\"\n",[33,80246,80247],{"class":35,"line":235},[33,80248,92],{"emptyLinePlaceholder":91},[33,80250,80251],{"class":35,"line":250},[33,80252,80253],{"class":39},"# Now calls will succeed\n",[33,80255,80256,80258],{"class":35,"line":266},[33,80257,35574],{"class":163},[33,80259,574],{"class":167},[33,80261,80262,80265,80267,80270,80273],{"class":35,"line":290},[33,80263,80264],{"class":167},"    img ",[33,80266,242],{"class":163},[33,80268,80269],{"class":167}," Image.open(Path(",[33,80271,80272],{"class":54},"\"scan.png\"",[33,80274,371],{"class":167},[33,80276,80277,80279,80281],{"class":35,"line":295},[33,80278,44654],{"class":167},[33,80280,242],{"class":163},[33,80282,80283],{"class":167}," pytesseract.image_to_string(img)\n",[33,80285,80286,80288,80290,80292],{"class":35,"line":300},[33,80287,7268],{"class":50},[33,80289,44670],{"class":167},[33,80291,2611],{"class":50},[33,80293,751],{"class":167},[33,80295,80296,80298,80301,80303],{"class":35,"line":317},[33,80297,35726],{"class":163},[33,80299,80300],{"class":167}," pytesseract.pytesseract.TesseractNotFoundError ",[33,80302,495],{"class":163},[33,80304,1855],{"class":167},[33,80306,80307,80309,80311,80313,80316,80318,80321,80323,80325],{"class":35,"line":332},[33,80308,7268],{"class":50},[33,80310,602],{"class":167},[33,80312,4059],{"class":163},[33,80314,80315],{"class":54},"\"Still not found at: ",[33,80317,1115],{"class":50},[33,80319,80320],{"class":167},"pytesseract.pytesseract.tesseract_cmd",[33,80322,1121],{"class":50},[33,80324,274],{"class":54},[33,80326,221],{"class":167},[33,80328,80329],{"class":35,"line":347},[33,80330,80331],{"class":163},"    raise\n",[14,80333,17059,80334,80337],{},[30,80335,80336],{},"pathlib.Path"," to resolve the actual location at runtime so the script is portable:",[23,80339,80341],{"className":126,"code":80340,"language":47,"meta":28,"style":28},"# pip install pytesseract\nfrom pathlib 
import Path\nimport pytesseract\n\n# Auto-detect on Windows where the installer places the binary\n_WIN_DEFAULT = Path(r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\")\nif _WIN_DEFAULT.exists():\n    pytesseract.pytesseract.tesseract_cmd = str(_WIN_DEFAULT)\n# On Linux\u002FmacOS the binary should already be on PATH after apt\u002Fbrew install\n",[30,80342,80343,80347,80357,80363,80367,80372,80404,80413,80428],{"__ignoreMap":28},[33,80344,80345],{"class":35,"line":36},[33,80346,47160],{"class":39},[33,80348,80349,80351,80353,80355],{"class":35,"line":43},[33,80350,190],{"class":163},[33,80352,193],{"class":167},[33,80354,164],{"class":163},[33,80356,198],{"class":167},[33,80358,80359,80361],{"class":35,"line":61},[33,80360,164],{"class":163},[33,80362,47178],{"class":167},[33,80364,80365],{"class":35,"line":73},[33,80366,92],{"emptyLinePlaceholder":91},[33,80368,80369],{"class":35,"line":88},[33,80370,80371],{"class":39},"# Auto-detect on Windows where the installer places the binary\n",[33,80373,80374,80377,80379,80381,80383,80385,80387,80389,80391,80393,80395,80397,80399,80402],{"class":35,"line":95},[33,80375,80376],{"class":50},"_WIN_DEFAULT",[33,80378,212],{"class":163},[33,80380,215],{"class":167},[33,80382,11977],{"class":163},[33,80384,80193],{"class":54},[33,80386,80196],{"class":12018},[33,80388,80199],{"class":54},[33,80390,80202],{"class":12018},[33,80392,80205],{"class":54},[33,80394,80208],{"class":12018},[33,80396,80211],{"class":54},[33,80398,3035],{"class":50},[33,80400,80401],{"class":54},"exe\"",[33,80403,221],{"class":167},[33,80405,80406,80408,80411],{"class":35,"line":101},[33,80407,2491],{"class":163},[33,80409,80410],{"class":50}," _WIN_DEFAULT",[33,80412,58027],{"class":167},[33,80414,80415,80418,80420,80422,80424,80426],{"class":35,"line":171},[33,80416,80417],{"class":167},"    pytesseract.pytesseract.tesseract_cmd 
",[33,80419,242],{"class":163},[33,80421,7887],{"class":50},[33,80423,602],{"class":167},[33,80425,80376],{"class":50},[33,80427,221],{"class":167},[33,80429,80430],{"class":35,"line":179},[33,80431,80432],{"class":39},"# On Linux\u002FmacOS the binary should already be on PATH after apt\u002Fbrew install\n",[424,80434,80436,80437,80439],{"id":80435},"making-tesseract_cmd-environment-aware","Making ",[30,80438,79849],{}," Environment-Aware",[14,80441,80442],{},"Hard-coding a path makes scripts non-portable. A better pattern reads the path from an environment variable with a sensible fallback:",[23,80444,80446],{"className":126,"code":80445,"language":47,"meta":28,"style":28},"# pip install pytesseract Pillow\nimport os\nfrom pathlib import Path\nimport pytesseract\nfrom PIL import Image\n\ndef configure_tesseract() -> None:\n    \"\"\"\n    Set tesseract_cmd from TESSERACT_CMD env var, or auto-detect\n    the default Windows install location as a fallback.\n    Does nothing on Linux\u002FmacOS where the binary is typically on PATH.\n    \"\"\"\n    env_cmd = os.environ.get(\"TESSERACT_CMD\")\n    if env_cmd:\n        # Explicitly configured — trust it\n        pytesseract.pytesseract.tesseract_cmd = env_cmd\n        return\n\n    # Windows auto-detect\n    win_default = Path(r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\")\n    if win_default.exists():\n        pytesseract.pytesseract.tesseract_cmd = str(win_default)\n\n# Call once at module load time\nconfigure_tesseract()\n\n# Then use pytesseract normally\nimg = Image.open(Path(\"scan.png\"))\ntry:\n    text = pytesseract.image_to_string(img)\n    print(text[:200])\nexcept pytesseract.pytesseract.TesseractNotFoundError as exc:\n    raise RuntimeError(\n        \"Set TESSERACT_CMD=\u002Fpath\u002Fto\u002Ftesseract or install via apt\u002Fbrew\"\n    ) from 
exc\n",[30,80447,80448,80452,80458,80468,80474,80484,80488,80501,80505,80510,80515,80520,80524,80538,80545,80550,80560,80564,80568,80573,80604,80611,80622,80626,80631,80636,80640,80645,80658,80664,80672,80682,80692,80700,80705],{"__ignoreMap":28},[33,80449,80450],{"class":35,"line":36},[33,80451,80141],{"class":39},[33,80453,80454,80456],{"class":35,"line":43},[33,80455,164],{"class":163},[33,80457,176],{"class":167},[33,80459,80460,80462,80464,80466],{"class":35,"line":61},[33,80461,190],{"class":163},[33,80463,193],{"class":167},[33,80465,164],{"class":163},[33,80467,198],{"class":167},[33,80469,80470,80472],{"class":35,"line":73},[33,80471,164],{"class":163},[33,80473,47178],{"class":167},[33,80475,80476,80478,80480,80482],{"class":35,"line":88},[33,80477,190],{"class":163},[33,80479,46889],{"class":50},[33,80481,46892],{"class":163},[33,80483,47171],{"class":167},[33,80485,80486],{"class":35,"line":95},[33,80487,92],{"emptyLinePlaceholder":91},[33,80489,80490,80492,80495,80497,80499],{"class":35,"line":101},[33,80491,562],{"class":163},[33,80493,80494],{"class":46}," configure_tesseract",[33,80496,568],{"class":167},[33,80498,571],{"class":50},[33,80500,574],{"class":167},[33,80502,80503],{"class":35,"line":171},[33,80504,7673],{"class":54},[33,80506,80507],{"class":35,"line":179},[33,80508,80509],{"class":54},"    Set tesseract_cmd from TESSERACT_CMD env var, or auto-detect\n",[33,80511,80512],{"class":35,"line":187},[33,80513,80514],{"class":54},"    the default Windows install location as a fallback.\n",[33,80516,80517],{"class":35,"line":201},[33,80518,80519],{"class":54},"    Does nothing on Linux\u002FmacOS where the binary is typically on PATH.\n",[33,80521,80522],{"class":35,"line":206},[33,80523,7673],{"class":54},[33,80525,80526,80529,80531,80533,80536],{"class":35,"line":224},[33,80527,80528],{"class":167},"    env_cmd 
",[33,80530,242],{"class":163},[33,80532,3129],{"class":167},[33,80534,80535],{"class":54},"\"TESSERACT_CMD\"",[33,80537,221],{"class":167},[33,80539,80540,80542],{"class":35,"line":229},[33,80541,617],{"class":163},[33,80543,80544],{"class":167}," env_cmd:\n",[33,80546,80547],{"class":35,"line":235},[33,80548,80549],{"class":39},"        # Explicitly configured — trust it\n",[33,80551,80552,80555,80557],{"class":35,"line":250},[33,80553,80554],{"class":167},"        pytesseract.pytesseract.tesseract_cmd ",[33,80556,242],{"class":163},[33,80558,80559],{"class":167}," env_cmd\n",[33,80561,80562],{"class":35,"line":266},[33,80563,646],{"class":163},[33,80565,80566],{"class":35,"line":290},[33,80567,92],{"emptyLinePlaceholder":91},[33,80569,80570],{"class":35,"line":295},[33,80571,80572],{"class":39},"    # Windows auto-detect\n",[33,80574,80575,80578,80580,80582,80584,80586,80588,80590,80592,80594,80596,80598,80600,80602],{"class":35,"line":300},[33,80576,80577],{"class":167},"    win_default ",[33,80579,242],{"class":163},[33,80581,215],{"class":167},[33,80583,11977],{"class":163},[33,80585,80193],{"class":54},[33,80587,80196],{"class":12018},[33,80589,80199],{"class":54},[33,80591,80202],{"class":12018},[33,80593,80205],{"class":54},[33,80595,80208],{"class":12018},[33,80597,80211],{"class":54},[33,80599,3035],{"class":50},[33,80601,80401],{"class":54},[33,80603,221],{"class":167},[33,80605,80606,80608],{"class":35,"line":317},[33,80607,617],{"class":163},[33,80609,80610],{"class":167}," win_default.exists():\n",[33,80612,80613,80615,80617,80619],{"class":35,"line":332},[33,80614,80554],{"class":167},[33,80616,242],{"class":163},[33,80618,7887],{"class":50},[33,80620,80621],{"class":167},"(win_default)\n",[33,80623,80624],{"class":35,"line":347},[33,80625,92],{"emptyLinePlaceholder":91},[33,80627,80628],{"class":35,"line":374},[33,80629,80630],{"class":39},"# Call once at module load 
time\n",[33,80632,80633],{"class":35,"line":397},[33,80634,80635],{"class":167},"configure_tesseract()\n",[33,80637,80638],{"class":35,"line":653},[33,80639,92],{"emptyLinePlaceholder":91},[33,80641,80642],{"class":35,"line":667},[33,80643,80644],{"class":39},"# Then use pytesseract normally\n",[33,80646,80647,80650,80652,80654,80656],{"class":35,"line":675},[33,80648,80649],{"class":167},"img ",[33,80651,242],{"class":163},[33,80653,80269],{"class":167},[33,80655,80272],{"class":54},[33,80657,371],{"class":167},[33,80659,80660,80662],{"class":35,"line":689},[33,80661,35574],{"class":163},[33,80663,574],{"class":167},[33,80665,80666,80668,80670],{"class":35,"line":703},[33,80667,44654],{"class":167},[33,80669,242],{"class":163},[33,80671,80283],{"class":167},[33,80673,80674,80676,80678,80680],{"class":35,"line":714},[33,80675,7268],{"class":50},[33,80677,44670],{"class":167},[33,80679,2611],{"class":50},[33,80681,751],{"class":167},[33,80683,80684,80686,80688,80690],{"class":35,"line":723},[33,80685,35726],{"class":163},[33,80687,80300],{"class":167},[33,80689,495],{"class":163},[33,80691,1855],{"class":167},[33,80693,80694,80696,80698],{"class":35,"line":754},[33,80695,35742],{"class":163},[33,80697,7590],{"class":50},[33,80699,7637],{"class":167},[33,80701,80702],{"class":35,"line":771},[33,80703,80704],{"class":54},"        \"Set TESSERACT_CMD=\u002Fpath\u002Fto\u002Ftesseract or install via apt\u002Fbrew\"\n",[33,80706,80707,80709,80711],{"class":35,"line":777},[33,80708,35761],{"class":167},[33,80710,190],{"class":163},[33,80712,20843],{"class":167},[14,80714,35815,80715,36700,80718,80720],{},[30,80716,80717],{},"TESSERACT_CMD",[30,80719,3585],{}," file or your CI secrets, and the same code works across developer machines and deployment targets without any per-machine edits.",[18,80722,80724],{"id":80723},"fix-3-docker-ci-environments","Fix 3 — Docker \u002F CI Environments",[14,80726,80727,80728,80730,80731,36700,80733,80735,80736,3035],{},"In a Docker-based 
build or CI pipeline, the Python dependencies and the system binary are installed in separate layers. A common mistake is installing ",[30,80729,79666],{}," via ",[30,80732,76],{},[30,80734,26625],{}," without also installing the binary in the ",[30,80737,36045],{},[23,80739,80741],{"className":36048,"code":80740,"language":36050,"meta":28,"style":28},"FROM python:3.12-slim\n\n# Install Tesseract binary first\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n    tesseract-ocr \\\n    tesseract-ocr-eng \\\n    && rm -rf \u002Fvar\u002Flib\u002Fapt\u002Flists\u002F*\n\n# Then install Python deps\nCOPY requirements.txt .\nRUN pip install --no-cache-dir -r requirements.txt\n\nCOPY . \u002Fapp\nWORKDIR \u002Fapp\nCMD [\"python\", \"ocr_pipeline.py\"]\n",[30,80742,80743,80747,80751,80756,80760,80765,80770,80774,80778,80783,80787,80791,80795,80799,80803],{"__ignoreMap":28},[33,80744,80745],{"class":35,"line":36},[33,80746,36057],{},[33,80748,80749],{"class":35,"line":43},[33,80750,92],{"emptyLinePlaceholder":91},[33,80752,80753],{"class":35,"line":61},[33,80754,80755],{},"# Install Tesseract binary first\n",[33,80757,80758],{"class":35,"line":73},[33,80759,42440],{},[33,80761,80762],{"class":35,"line":88},[33,80763,80764],{},"    tesseract-ocr \\\n",[33,80766,80767],{"class":35,"line":95},[33,80768,80769],{},"    tesseract-ocr-eng \\\n",[33,80771,80772],{"class":35,"line":101},[33,80773,42460],{},[33,80775,80776],{"class":35,"line":171},[33,80777,92],{"emptyLinePlaceholder":91},[33,80779,80780],{"class":35,"line":179},[33,80781,80782],{},"# Then install Python 
deps\n",[33,80784,80785],{"class":35,"line":187},[33,80786,36109],{},[33,80788,80789],{"class":35,"line":201},[33,80790,36114],{},[33,80792,80793],{"class":35,"line":206},[33,80794,92],{"emptyLinePlaceholder":91},[33,80796,80797],{"class":35,"line":224},[33,80798,36123],{},[33,80800,80801],{"class":35,"line":229},[33,80802,36128],{},[33,80804,80805],{"class":35,"line":235},[33,80806,80807],{},"CMD [\"python\", \"ocr_pipeline.py\"]\n",[14,80809,80810],{},"Verify inside the container:",[23,80812,80814],{"className":25,"code":80813,"language":27,"meta":28,"style":28},"docker run --rm your-image tesseract --version\n",[30,80815,80816],{"__ignoreMap":28},[33,80817,80818,80820,80822,80825,80828,80831],{"class":35,"line":36},[33,80819,42412],{"class":46},[33,80821,69425],{"class":54},[33,80823,80824],{"class":50}," --rm",[33,80826,80827],{"class":54}," your-image",[33,80829,80830],{"class":54}," tesseract",[33,80832,41864],{"class":50},[14,80834,80835,80836,20891],{},"For GitHub Actions, add a step before ",[30,80837,36846],{},[23,80839,80841],{"className":2062,"code":80840,"language":2064,"meta":28,"style":28},"- name: Install Tesseract\n  run: sudo apt-get install -y tesseract-ocr\n",[30,80842,80843,80854],{"__ignoreMap":28},[33,80844,80845,80847,80849,80851],{"class":35,"line":36},[33,80846,42495],{"class":167},[33,80848,1118],{"class":2076},[33,80850,2079],{"class":167},[33,80852,80853],{"class":54},"Install Tesseract\n",[33,80855,80856,80858,80860],{"class":35,"line":43},[33,80857,42507],{"class":2076},[33,80859,2079],{"class":167},[33,80861,80862],{"class":54},"sudo apt-get install -y tesseract-ocr\n",[424,80864,80866],{"id":80865},"github-actions-caching-the-tesseract-install","GitHub Actions: Caching the Tesseract Install",[14,80868,80869,80870,80873],{},"On hosted GitHub Actions runners ",[30,80871,80872],{},"apt-get install tesseract-ocr"," takes 15–30 seconds. 
Cache the install to speed up repeated runs:",[23,80875,80877],{"className":2062,"code":80876,"language":2064,"meta":28,"style":28},"- name: Cache Tesseract\n  uses: actions\u002Fcache@v4\n  with:\n    path: \u002Fusr\u002Fshare\u002Ftesseract-ocr\n    key: tesseract-${{ runner.os }}-v5\n\n- name: Install Tesseract (if cache miss)\n  run: |\n    if ! command -v tesseract &> \u002Fdev\u002Fnull; then\n      sudo apt-get install -y tesseract-ocr\n    fi\n\n- name: Install Python deps\n  run: pip install -r requirements.txt\n",[30,80878,80879,80890,80900,80907,80917,80927,80931,80942,80951,80956,80961,80966,80970,80981],{"__ignoreMap":28},[33,80880,80881,80883,80885,80887],{"class":35,"line":36},[33,80882,42495],{"class":167},[33,80884,1118],{"class":2076},[33,80886,2079],{"class":167},[33,80888,80889],{"class":54},"Cache Tesseract\n",[33,80891,80892,80895,80897],{"class":35,"line":43},[33,80893,80894],{"class":2076},"  uses",[33,80896,2079],{"class":167},[33,80898,80899],{"class":54},"actions\u002Fcache@v4\n",[33,80901,80902,80905],{"class":35,"line":61},[33,80903,80904],{"class":2076},"  with",[33,80906,574],{"class":167},[33,80908,80909,80912,80914],{"class":35,"line":73},[33,80910,80911],{"class":2076},"    path",[33,80913,2079],{"class":167},[33,80915,80916],{"class":54},"\u002Fusr\u002Fshare\u002Ftesseract-ocr\n",[33,80918,80919,80922,80924],{"class":35,"line":88},[33,80920,80921],{"class":2076},"    key",[33,80923,2079],{"class":167},[33,80925,80926],{"class":54},"tesseract-${{ runner.os }}-v5\n",[33,80928,80929],{"class":35,"line":95},[33,80930,92],{"emptyLinePlaceholder":91},[33,80932,80933,80935,80937,80939],{"class":35,"line":101},[33,80934,42495],{"class":167},[33,80936,1118],{"class":2076},[33,80938,2079],{"class":167},[33,80940,80941],{"class":54},"Install Tesseract (if cache 
miss)\n",[33,80943,80944,80946,80948],{"class":35,"line":171},[33,80945,42507],{"class":2076},[33,80947,2079],{"class":167},[33,80949,80950],{"class":163},"|\n",[33,80952,80953],{"class":35,"line":179},[33,80954,80955],{"class":54},"    if ! command -v tesseract &> \u002Fdev\u002Fnull; then\n",[33,80957,80958],{"class":35,"line":187},[33,80959,80960],{"class":54},"      sudo apt-get install -y tesseract-ocr\n",[33,80962,80963],{"class":35,"line":201},[33,80964,80965],{"class":54},"    fi\n",[33,80967,80968],{"class":35,"line":206},[33,80969,92],{"emptyLinePlaceholder":91},[33,80971,80972,80974,80976,80978],{"class":35,"line":224},[33,80973,42495],{"class":167},[33,80975,1118],{"class":2076},[33,80977,2079],{"class":167},[33,80979,80980],{"class":54},"Install Python deps\n",[33,80982,80983,80985,80987],{"class":35,"line":229},[33,80984,42507],{"class":2076},[33,80986,2079],{"class":167},[33,80988,2224],{"class":54},[18,80990,80992,80993,12027],{"id":80991},"fix-4-missing-language-data-tessdata_prefix","Fix 4 — Missing Language Data (",[30,80994,80995],{},"TESSDATA_PREFIX",[14,80997,80998],{},"A related but distinct error appears when the binary is found but a specific language pack is missing:",[23,81000,81003],{"className":81001,"code":81002,"language":2000},[1998],"Error, could not initialize tesseract API with language \"deu\".\n",[30,81004,81002],{"__ignoreMap":28},[14,81006,81007,81008,81011],{},"This means the language ",[30,81009,81010],{},".traineddata"," file is absent. 
Fix:",[23,81013,81015],{"className":25,"code":81014,"language":27,"meta":28,"style":28},"# Install the pack\nsudo apt-get install -y tesseract-ocr-deu\n\n# Or set TESSDATA_PREFIX to a custom directory containing .traineddata files\nexport TESSDATA_PREFIX=\u002Fopt\u002Ftessdata\u002F\n\n# Verify available languages\ntesseract --list-langs\n",[30,81016,81017,81022,81035,81039,81044,81056,81060,81065],{"__ignoreMap":28},[33,81018,81019],{"class":35,"line":36},[33,81020,81021],{"class":39},"# Install the pack\n",[33,81023,81024,81026,81028,81030,81032],{"class":35,"line":43},[33,81025,9669],{"class":46},[33,81027,9672],{"class":54},[33,81029,79],{"class":54},[33,81031,20912],{"class":50},[33,81033,81034],{"class":54}," tesseract-ocr-deu\n",[33,81036,81037],{"class":35,"line":61},[33,81038,92],{"emptyLinePlaceholder":91},[33,81040,81041],{"class":35,"line":73},[33,81042,81043],{"class":39},"# Or set TESSDATA_PREFIX to a custom directory containing .traineddata files\n",[33,81045,81046,81048,81051,81053],{"class":35,"line":88},[33,81047,35332],{"class":163},[33,81049,81050],{"class":167}," TESSDATA_PREFIX",[33,81052,242],{"class":163},[33,81054,81055],{"class":167},"\u002Fopt\u002Ftessdata\u002F\n",[33,81057,81058],{"class":35,"line":95},[33,81059,92],{"emptyLinePlaceholder":91},[33,81061,81062],{"class":35,"line":101},[33,81063,81064],{"class":39},"# Verify available languages\n",[33,81066,81067,81069],{"class":35,"line":171},[33,81068,46820],{"class":46},[33,81070,81071],{"class":50}," --list-langs\n",[14,81073,81074],{},"In Python:",[23,81076,81078],{"className":126,"code":81077,"language":47,"meta":28,"style":28},"# pip install pytesseract Pillow\nimport os\nfrom pathlib import Path\nimport pytesseract\nfrom PIL import Image\n\n# Point to a custom tessdata directory if needed\nos.environ[\"TESSDATA_PREFIX\"] = \"\u002Fopt\u002Ftessdata\u002F\"\n\nimg = Image.open(Path(\"german_invoice.png\"))\ntry:\n    text = pytesseract.image_to_string(img, lang=\"deu\")\n    
print(text[:200])\nexcept pytesseract.pytesseract.TesseractNotFoundError:\n    print(\"Binary missing — install tesseract-ocr\")\nexcept Exception as exc:\n    # Catches language pack errors\n    print(f\"OCR error: {exc}\")\n",[30,81079,81080,81084,81090,81100,81106,81116,81120,81125,81139,81143,81156,81162,81181,81191,81198,81209,81219,81224],{"__ignoreMap":28},[33,81081,81082],{"class":35,"line":36},[33,81083,80141],{"class":39},[33,81085,81086,81088],{"class":35,"line":43},[33,81087,164],{"class":163},[33,81089,176],{"class":167},[33,81091,81092,81094,81096,81098],{"class":35,"line":61},[33,81093,190],{"class":163},[33,81095,193],{"class":167},[33,81097,164],{"class":163},[33,81099,198],{"class":167},[33,81101,81102,81104],{"class":35,"line":73},[33,81103,164],{"class":163},[33,81105,47178],{"class":167},[33,81107,81108,81110,81112,81114],{"class":35,"line":88},[33,81109,190],{"class":163},[33,81111,46889],{"class":50},[33,81113,46892],{"class":163},[33,81115,47171],{"class":167},[33,81117,81118],{"class":35,"line":95},[33,81119,92],{"emptyLinePlaceholder":91},[33,81121,81122],{"class":35,"line":101},[33,81123,81124],{"class":39},"# Point to a custom tessdata directory if needed\n",[33,81126,81127,81129,81132,81134,81136],{"class":35,"line":171},[33,81128,35884],{"class":167},[33,81130,81131],{"class":54},"\"TESSDATA_PREFIX\"",[33,81133,763],{"class":167},[33,81135,242],{"class":163},[33,81137,81138],{"class":54}," 
\"\u002Fopt\u002Ftessdata\u002F\"\n",[33,81140,81141],{"class":35,"line":179},[33,81142,92],{"emptyLinePlaceholder":91},[33,81144,81145,81147,81149,81151,81154],{"class":35,"line":187},[33,81146,80649],{"class":167},[33,81148,242],{"class":163},[33,81150,80269],{"class":167},[33,81152,81153],{"class":54},"\"german_invoice.png\"",[33,81155,371],{"class":167},[33,81157,81158,81160],{"class":35,"line":201},[33,81159,35574],{"class":163},[33,81161,574],{"class":167},[33,81163,81164,81166,81168,81171,81174,81176,81179],{"class":35,"line":206},[33,81165,44654],{"class":167},[33,81167,242],{"class":163},[33,81169,81170],{"class":167}," pytesseract.image_to_string(img, ",[33,81172,81173],{"class":238},"lang",[33,81175,242],{"class":163},[33,81177,81178],{"class":54},"\"deu\"",[33,81180,221],{"class":167},[33,81182,81183,81185,81187,81189],{"class":35,"line":224},[33,81184,7268],{"class":50},[33,81186,44670],{"class":167},[33,81188,2611],{"class":50},[33,81190,751],{"class":167},[33,81192,81193,81195],{"class":35,"line":229},[33,81194,35726],{"class":163},[33,81196,81197],{"class":167}," pytesseract.pytesseract.TesseractNotFoundError:\n",[33,81199,81200,81202,81204,81207],{"class":35,"line":235},[33,81201,7268],{"class":50},[33,81203,602],{"class":167},[33,81205,81206],{"class":54},"\"Binary missing — install tesseract-ocr\"",[33,81208,221],{"class":167},[33,81210,81211,81213,81215,81217],{"class":35,"line":250},[33,81212,35726],{"class":163},[33,81214,783],{"class":50},[33,81216,1852],{"class":163},[33,81218,1855],{"class":167},[33,81220,81221],{"class":35,"line":266},[33,81222,81223],{"class":39},"    # Catches language pack errors\n",[33,81225,81226,81228,81230,81232,81235,81237,81239,81241,81243],{"class":35,"line":290},[33,81227,7268],{"class":50},[33,81229,602],{"class":167},[33,81231,4059],{"class":163},[33,81233,81234],{"class":54},"\"OCR error: 
",[33,81236,1115],{"class":50},[33,81238,6565],{"class":167},[33,81240,1121],{"class":50},[33,81242,274],{"class":54},[33,81244,221],{"class":167},[18,81246,81248,81249,81252],{"id":81247},"variant-tesseracterror-after-the-binary-is-found","Variant: ",[30,81250,81251],{},"TesseractError"," After the Binary Is Found",[14,81254,81255],{},"Once the binary is installed and on PATH, a second class of error can appear:",[23,81257,81260],{"className":81258,"code":81259,"language":2000},[1998],"pytesseract.pytesseract.TesseractError: (1, 'Error, could not initialize tesseract API')\n",[30,81261,81259],{"__ignoreMap":28},[14,81263,81264,81265,81267],{},"This is not the same as ",[30,81266,79693],{},". The binary was found but crashed during initialisation. Common causes:",[4211,81269,81270,81279,81293],{},[4214,81271,81272,81275,81276,3035],{},[1974,81273,81274],{},"Corrupt tessdata directory:"," reinstall with ",[30,81277,81278],{},"sudo apt-get install --reinstall tesseract-ocr",[4214,81280,81281,81286,81287,81289,81290,81292],{},[1974,81282,81283,81284,20891],{},"Wrong ",[30,81285,80995],{}," the environment variable points to a directory that does not contain the expected ",[30,81288,81010],{}," files. Unset ",[30,81291,80995],{}," and let Tesseract use its compiled-in default path.",[4214,81294,81295,81298,81299,81301],{},[1974,81296,81297],{},"Version mismatch:"," a Tesseract 5 binary with Tesseract 4 language files (or vice versa). 
Check ",[30,81300,79932],{}," and ensure your language pack packages match.",[14,81303,81304],{},"Quick diagnostic:",[23,81306,81308],{"className":25,"code":81307,"language":27,"meta":28,"style":28},"# Print the tessdata directory Tesseract is actually using\ntesseract --print-parameters 2>&1 | grep tessdata_dir\n\n# List detected languages\ntesseract --list-langs\n",[30,81309,81310,81315,81332,81336,81341],{"__ignoreMap":28},[33,81311,81312],{"class":35,"line":36},[33,81313,81314],{"class":39},"# Print the tessdata directory Tesseract is actually using\n",[33,81316,81317,81319,81322,81325,81327,81329],{"class":35,"line":43},[33,81318,46820],{"class":46},[33,81320,81321],{"class":50}," --print-parameters",[33,81323,81324],{"class":163}," 2>&1",[33,81326,2850],{"class":163},[33,81328,41954],{"class":46},[33,81330,81331],{"class":54}," tessdata_dir\n",[33,81333,81334],{"class":35,"line":61},[33,81335,92],{"emptyLinePlaceholder":91},[33,81337,81338],{"class":35,"line":73},[33,81339,81340],{"class":39},"# List detected languages\n",[33,81342,81343,81345],{"class":35,"line":88},[33,81344,46820],{"class":46},[33,81346,81071],{"class":50},[14,81348,41963,81349,81352],{},[30,81350,81351],{},"--list-langs"," returns an empty list or crashes, the tessdata directory is either missing or misconfigured.",[18,81354,81248,81356,42238,81358,81360],{"id":81355},"variant-importerror-for-pytesseract-itself",[30,81357,42237],{},[30,81359,79666],{}," Itself",[14,81362,81363],{},"If your error is:",[23,81365,81368],{"className":81366,"code":81367,"language":2000},[1998],"ModuleNotFoundError: No module named 'pytesseract'\n",[30,81369,81367],{"__ignoreMap":28},[14,81371,81372],{},"the Python wrapper is missing from the current environment. 
Install it:",[23,81374,81376],{"className":25,"code":81375,"language":27,"meta":28,"style":28},"pip install pytesseract\n",[30,81377,81378],{"__ignoreMap":28},[33,81379,81380,81382,81384],{"class":35,"line":36},[33,81381,76],{"class":46},[33,81383,79],{"class":54},[33,81385,47178],{"class":54},[14,81387,81388,81389,81391],{},"This is distinct from ",[30,81390,79693],{},". In a virtual environment, always confirm you are installing into the correct env:",[23,81393,81395],{"className":25,"code":81394,"language":27,"meta":28,"style":28},"which python          # should point inside your venv\npip show pytesseract  # should show a Version line\n",[30,81396,81397,81406],{"__ignoreMap":28},[33,81398,81399,81401,81403],{"class":35,"line":36},[33,81400,35269],{"class":50},[33,81402,42312],{"class":54},[33,81404,81405],{"class":39},"          # should point inside your venv\n",[33,81407,81408,81410,81412,81414],{"class":35,"line":43},[33,81409,76],{"class":46},[33,81411,41946],{"class":54},[33,81413,46802],{"class":54},[33,81415,81416],{"class":39},"  # should show a Version line\n",[18,81418,9247],{"id":9246},[14,81420,81421],{},"After applying any fix, run this end-to-end smoke test to confirm the full stack is working:",[23,81423,81425],{"className":126,"code":81424,"language":47,"meta":28,"style":28},"# pip install pytesseract Pillow\nfrom PIL import Image, ImageDraw, ImageFont\nimport pytesseract\n\ndef smoke_test_ocr() -> None:\n    \"\"\"\n    Create a minimal in-memory image with known text, run OCR,\n    and assert the result matches.\n    \"\"\"\n    # Draw \"Hello OCR\" on a white image\n    img = Image.new(\"RGB\", (200, 60), color=(255, 255, 255))\n    draw = ImageDraw.Draw(img)\n    draw.text((10, 15), \"Hello OCR\", fill=(0, 0, 0))\n\n    result = pytesseract.image_to_string(img).strip()\n    assert \"Hello\" in result, f\"OCR smoke test failed — got: {result!r}\"\n    version = pytesseract.get_tesseract_version()\n    print(f\"OK — Tesseract {version}, 
recognised: {result!r}\")\n\nsmoke_test_ocr()\n# → OK — Tesseract 5.3.x, recognised: 'Hello OCR'\n",[30,81426,81427,81431,81442,81448,81452,81465,81469,81474,81479,81483,81488,81528,81538,81575,81579,81588,81616,81626,81659,81663,81668],{"__ignoreMap":28},[33,81428,81429],{"class":35,"line":36},[33,81430,80141],{"class":39},[33,81432,81433,81435,81437,81439],{"class":35,"line":43},[33,81434,190],{"class":163},[33,81436,46889],{"class":50},[33,81438,46892],{"class":163},[33,81440,81441],{"class":167}," Image, ImageDraw, ImageFont\n",[33,81443,81444,81446],{"class":35,"line":61},[33,81445,164],{"class":163},[33,81447,47178],{"class":167},[33,81449,81450],{"class":35,"line":73},[33,81451,92],{"emptyLinePlaceholder":91},[33,81453,81454,81456,81459,81461,81463],{"class":35,"line":88},[33,81455,562],{"class":163},[33,81457,81458],{"class":46}," smoke_test_ocr",[33,81460,568],{"class":167},[33,81462,571],{"class":50},[33,81464,574],{"class":167},[33,81466,81467],{"class":35,"line":95},[33,81468,7673],{"class":54},[33,81470,81471],{"class":35,"line":101},[33,81472,81473],{"class":54},"    Create a minimal in-memory image with known text, run OCR,\n",[33,81475,81476],{"class":35,"line":171},[33,81477,81478],{"class":54},"    and assert the result matches.\n",[33,81480,81481],{"class":35,"line":179},[33,81482,7673],{"class":54},[33,81484,81485],{"class":35,"line":187},[33,81486,81487],{"class":39},"    # Draw \"Hello OCR\" on a white image\n",[33,81489,81490,81492,81494,81497,81500,81502,81504,81506,81508,81510,81512,81514,81516,81518,81520,81522,81524,81526],{"class":35,"line":201},[33,81491,80264],{"class":167},[33,81493,242],{"class":163},[33,81495,81496],{"class":167}," 
Image.new(",[33,81498,81499],{"class":54},"\"RGB\"",[33,81501,19953],{"class":167},[33,81503,2611],{"class":50},[33,81505,365],{"class":167},[33,81507,2590],{"class":50},[33,81509,18525],{"class":167},[33,81511,17245],{"class":238},[33,81513,242],{"class":163},[33,81515,602],{"class":167},[33,81517,2678],{"class":50},[33,81519,365],{"class":167},[33,81521,2678],{"class":50},[33,81523,365],{"class":167},[33,81525,2678],{"class":50},[33,81527,371],{"class":167},[33,81529,81530,81533,81535],{"class":35,"line":206},[33,81531,81532],{"class":167},"    draw ",[33,81534,242],{"class":163},[33,81536,81537],{"class":167}," ImageDraw.Draw(img)\n",[33,81539,81540,81543,81545,81547,81549,81551,81554,81556,81559,81561,81563,81565,81567,81569,81571,81573],{"class":35,"line":224},[33,81541,81542],{"class":167},"    draw.text((",[33,81544,3545],{"class":50},[33,81546,365],{"class":167},[33,81548,1646],{"class":50},[33,81550,18525],{"class":167},[33,81552,81553],{"class":54},"\"Hello OCR\"",[33,81555,365],{"class":167},[33,81557,81558],{"class":238},"fill",[33,81560,242],{"class":163},[33,81562,602],{"class":167},[33,81564,748],{"class":50},[33,81566,365],{"class":167},[33,81568,748],{"class":50},[33,81570,365],{"class":167},[33,81572,748],{"class":50},[33,81574,371],{"class":167},[33,81576,81577],{"class":35,"line":229},[33,81578,92],{"emptyLinePlaceholder":91},[33,81580,81581,81583,81585],{"class":35,"line":235},[33,81582,8842],{"class":167},[33,81584,242],{"class":163},[33,81586,81587],{"class":167}," pytesseract.image_to_string(img).strip()\n",[33,81589,81590,81592,81595,81597,81600,81602,81605,81607,81610,81612,81614],{"class":35,"line":250},[33,81591,9228],{"class":163},[33,81593,81594],{"class":54}," \"Hello\"",[33,81596,8002],{"class":163},[33,81598,81599],{"class":167}," result, ",[33,81601,4059],{"class":163},[33,81603,81604],{"class":54},"\"OCR smoke test failed — got: 
",[33,81606,1115],{"class":50},[33,81608,81609],{"class":167},"result",[33,81611,76954],{"class":163},[33,81613,1121],{"class":50},[33,81615,7504],{"class":54},[33,81617,81618,81621,81623],{"class":35,"line":266},[33,81619,81620],{"class":167},"    version ",[33,81622,242],{"class":163},[33,81624,81625],{"class":167}," pytesseract.get_tesseract_version()\n",[33,81627,81628,81630,81632,81634,81637,81639,81642,81644,81647,81649,81651,81653,81655,81657],{"class":35,"line":290},[33,81629,7268],{"class":50},[33,81631,602],{"class":167},[33,81633,4059],{"class":163},[33,81635,81636],{"class":54},"\"OK — Tesseract ",[33,81638,1115],{"class":50},[33,81640,81641],{"class":167},"version",[33,81643,1121],{"class":50},[33,81645,81646],{"class":54},", recognised: ",[33,81648,1115],{"class":50},[33,81650,81609],{"class":167},[33,81652,76954],{"class":163},[33,81654,1121],{"class":50},[33,81656,274],{"class":54},[33,81658,221],{"class":167},[33,81660,81661],{"class":35,"line":295},[33,81662,92],{"emptyLinePlaceholder":91},[33,81664,81665],{"class":35,"line":300},[33,81666,81667],{"class":167},"smoke_test_ocr()\n",[33,81669,81670],{"class":35,"line":317},[33,81671,81672],{"class":39},"# → OK — Tesseract 5.3.x, recognised: 'Hello OCR'\n",[14,81674,81675,81676,81678],{},"If the assertion passes, the binary, the Python wrapper, and the default language pack are all correctly configured. 
The full ",[940,81677,36756],{"href":26957}," pipeline will run without further changes.",[18,81680,6918],{"id":6917},[4211,81682,81683,81688,81693],{},[4214,81684,81685,81687],{},[940,81686,36756],{"href":26957}," — full pipeline: rasterize, preprocess, OCR, searchable PDF",[4214,81689,81690,81692],{},[940,81691,10077],{"href":10076}," — coordinate-clustering to extract tabular data once OCR is working",[4214,81694,81695,81697],{},[940,81696,9592],{"href":942}," — standard (non-OCR) table extraction for vector PDFs",[14,81699,6947,81700,3035],{},[940,81701,36756],{"href":26957},[6953,81703,81704],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .s691h, html code.shiki .s691h{--shiki-default:#22863A;--shiki-default-font-weight:bold}html pre.shiki code .shJU0, html code.shiki 
.shJU0{--shiki-default:#22863A}",{"title":28,"searchDepth":43,"depth":43,"links":81706},[81707,81708,81709,81715,81720,81723,81725,81727,81729,81730],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":79853,"depth":43,"text":79854,"children":81710},[81711,81712,81713,81714],{"id":79857,"depth":61,"text":79858},{"id":35292,"depth":61,"text":35293},{"id":35384,"depth":61,"text":35385},{"id":80076,"depth":61,"text":80077},{"id":80121,"depth":43,"text":81716,"children":81717},"Fix 2 — Set tesseract_cmd in Code",[81718],{"id":80435,"depth":61,"text":81719},"Making tesseract_cmd Environment-Aware",{"id":80723,"depth":43,"text":80724,"children":81721},[81722],{"id":80865,"depth":61,"text":80866},{"id":80991,"depth":43,"text":81724},"Fix 4 — Missing Language Data (TESSDATA_PREFIX)",{"id":81247,"depth":43,"text":81726},"Variant: TesseractError After the Binary Is Found",{"id":81355,"depth":43,"text":81728},"Variant: ImportError for pytesseract Itself",{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fix TesseractNotFoundError","Fix pytesseract.pytesseract.TesseractNotFoundError by installing the Tesseract binary, adding it to PATH, or setting tesseract_cmd — covers Linux, macOS, Windows, and Docker.",{},"\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Ffix-tesseract-not-found-error",{"title":65223,"description":81732},"Fix pytesseract TesseractNotFoundError — All 
Platforms","automating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Ffix-tesseract-not-found-error\u002Findex",[47,49143,46820,81739],"debugging","nmVTQCOsPx2OYWbQ3oZR66Fp0nzb3bfpWK0iowzqGns",{"id":81742,"title":36756,"body":81743,"breadcrumbTitle":86096,"canonical":6977,"date":46387,"description":86097,"draft":6980,"extension":6981,"image":6977,"meta":86098,"navigation":91,"path":86099,"robots":6977,"seo":86100,"seoTitle":86101,"stem":86102,"tags":86103,"updatedAt":6978,"__hash__":86104},"content\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Findex.md",{"type":7,"value":81744,"toc":86073},[81745,81748,81751,81754,81756,81765,81860,81863,81883,81887,81895,82129,82143,82147,82150,82156,82365,82381,82385,82503,82506,82912,82916,82940,82943,83033,83037,83045,83474,83478,83536,83547,83551,83560,83830,83836,83840,83843,84186,84192,84194,84198,84204,84270,84276,84280,84287,84316,84320,84323,84444,84446,84449,84731,84738,84740,84774,85019,85021,85129,85133,86043,86045,86067,86071],[10,81746,36756],{"id":81747},"scanning-and-ocr-processing-with-python",[14,81749,81750],{},"Scanned PDFs are image containers — no embedded text, no selectable characters. Every standard parser returns empty results because there is nothing to parse. The fix is a three-stage pipeline: rasterize each page to a high-DPI image, preprocess that image to maximise contrast and alignment, then feed it to an OCR engine that produces a text layer you can search, index, or pipe into downstream extraction.",[14,81752,81753],{},"Generic \"just call pytesseract\" tutorials skip the preprocessing stage. That works on clean scans at ideal DPI; it fails on faded invoices, skewed photographs, and low-contrast forms. This guide covers the full pipeline end-to-end.",[18,81755,21],{"id":20},[14,81757,81758,81759,81761,81762,3035],{},"System binary and Python packages must both be present. 
pytesseract is only a wrapper — without the Tesseract binary the wrapper raises ",[30,81760,79693],{}," immediately. If you hit that error, see ",[940,81763,81764],{"href":65222},"Fix TesseractNotFoundError in Python",[23,81766,81768],{"className":25,"code":81767,"language":27,"meta":28,"style":28},"# Ubuntu\u002FDebian\nsudo apt-get install tesseract-ocr\n\n# macOS\nbrew install tesseract\n\n# Windows — download from https:\u002F\u002Fgithub.com\u002FUB-Mannheim\u002Ftesseract\u002Fwiki and add to PATH\n\n# Python packages\npip install pytesseract pymupdf pdf2image Pillow opencv-python numpy\n\n# For extra language packs (e.g. German + French):\nsudo apt-get install tesseract-ocr-deu tesseract-ocr-fra\n",[30,81769,81770,81774,81784,81788,81792,81800,81804,81809,81813,81817,81838,81842,81847],{"__ignoreMap":28},[33,81771,81772],{"class":35,"line":36},[33,81773,20901],{"class":39},[33,81775,81776,81778,81780,81782],{"class":35,"line":43},[33,81777,9669],{"class":46},[33,81779,9672],{"class":54},[33,81781,79],{"class":54},[33,81783,79885],{"class":54},[33,81785,81786],{"class":35,"line":61},[33,81787,92],{"emptyLinePlaceholder":91},[33,81789,81790],{"class":35,"line":73},[33,81791,57888],{"class":39},[33,81793,81794,81796,81798],{"class":35,"line":88},[33,81795,35308],{"class":46},[33,81797,79],{"class":54},[33,81799,79956],{"class":54},[33,81801,81802],{"class":35,"line":95},[33,81803,92],{"emptyLinePlaceholder":91},[33,81805,81806],{"class":35,"line":101},[33,81807,81808],{"class":39},"# Windows — download from https:\u002F\u002Fgithub.com\u002FUB-Mannheim\u002Ftesseract\u002Fwiki and add to PATH\n",[33,81810,81811],{"class":35,"line":171},[33,81812,92],{"emptyLinePlaceholder":91},[33,81814,81815],{"class":35,"line":179},[33,81816,9692],{"class":39},[33,81818,81819,81821,81823,81825,81828,81830,81833,81835],{"class":35,"line":187},[33,81820,76],{"class":46},[33,81822,79],{"class":54},[33,81824,46802],{"class":54},[33,81826,81827],{"class":54}," 
pymupdf",[33,81829,46799],{"class":54},[33,81831,81832],{"class":54}," Pillow",[33,81834,42024],{"class":54},[33,81836,81837],{"class":54}," numpy\n",[33,81839,81840],{"class":35,"line":201},[33,81841,92],{"emptyLinePlaceholder":91},[33,81843,81844],{"class":35,"line":206},[33,81845,81846],{"class":39},"# For extra language packs (e.g. German + French):\n",[33,81848,81849,81851,81853,81855,81857],{"class":35,"line":224},[33,81850,9669],{"class":46},[33,81852,9672],{"class":54},[33,81854,79],{"class":54},[33,81856,79920],{"class":54},[33,81858,81859],{"class":54}," tesseract-ocr-fra\n",[14,81861,81862],{},"Verify the installation before writing any pipeline code:",[23,81864,81866],{"className":25,"code":81865,"language":27,"meta":28,"style":28},"tesseract --version\npython -c \"import pytesseract; print(pytesseract.get_tesseract_version())\"\n",[30,81867,81868,81874],{"__ignoreMap":28},[33,81869,81870,81872],{"class":35,"line":36},[33,81871,46820],{"class":46},[33,81873,41864],{"class":50},[33,81875,81876,81878,81880],{"class":35,"line":43},[33,81877,47],{"class":46},[33,81879,106],{"class":50},[33,81881,81882],{"class":54}," \"import pytesseract; print(pytesseract.get_tesseract_version())\"\n",[18,81884,81886],{"id":81885},"step-1-diagnose-the-pdf-before-choosing-a-path","Step 1 — Diagnose the PDF Before Choosing a Path",[14,81888,81889,81890,81894],{},"Not every \"scanned\" PDF is purely raster. Some contain a hidden text layer added by a previous OCR pass. Running the full preprocessing+OCR pipeline on those wastes time and can degrade quality. 
Use ",[940,81891,52011],{"href":81892,"rel":81893},"https:\u002F\u002Fpymupdf.readthedocs.io\u002F",[1367]," to check first.",[23,81896,81898],{"className":126,"code":81897,"language":47,"meta":28,"style":28},"# pip install pymupdf\nfrom pathlib import Path\nimport fitz  # PyMuPDF\n\ndef classify_pdf(pdf_path: Path) -> str:\n    \"\"\"\n    Returns 'text' if the PDF has an embedded text layer,\n    'raster' if it is a pure scan that needs OCR.\n    \"\"\"\n    try:\n        doc = fitz.open(pdf_path)\n        char_count = sum(len(page.get_text(\"text\").strip()) for page in doc)\n        doc.close()\n        return \"text\" if char_count > 20 else \"raster\"\n    except Exception as exc:\n        raise RuntimeError(f\"Could not open {pdf_path}: {exc}\") from exc\n\npdf = Path(\"contract_scan.pdf\")\nkind = classify_pdf(pdf)\nprint(f\"{pdf.name}: {kind}\")\n# → contract_scan.pdf: raster\n",[30,81899,81900,81904,81914,81922,81926,81938,81942,81947,81952,81956,81962,81971,81998,82002,82023,82033,82067,82071,82085,82095,82124],{"__ignoreMap":28},[33,81901,81902],{"class":35,"line":36},[33,81903,46530],{"class":39},[33,81905,81906,81908,81910,81912],{"class":35,"line":43},[33,81907,190],{"class":163},[33,81909,193],{"class":167},[33,81911,164],{"class":163},[33,81913,198],{"class":167},[33,81915,81916,81918,81920],{"class":35,"line":61},[33,81917,164],{"class":163},[33,81919,46547],{"class":167},[33,81921,46550],{"class":39},[33,81923,81924],{"class":35,"line":73},[33,81925,92],{"emptyLinePlaceholder":91},[33,81927,81928,81930,81932,81934,81936],{"class":35,"line":88},[33,81929,562],{"class":163},[33,81931,9810],{"class":46},[33,81933,37097],{"class":167},[33,81935,1053],{"class":50},[33,81937,574],{"class":167},[33,81939,81940],{"class":35,"line":95},[33,81941,7673],{"class":54},[33,81943,81944],{"class":35,"line":101},[33,81945,81946],{"class":54},"    Returns 'text' if the PDF has an embedded text 
layer,\n",[33,81948,81949],{"class":35,"line":171},[33,81950,81951],{"class":54},"    'raster' if it is a pure scan that needs OCR.\n",[33,81953,81954],{"class":35,"line":179},[33,81955,7673],{"class":54},[33,81957,81958,81960],{"class":35,"line":187},[33,81959,2424],{"class":163},[33,81961,574],{"class":167},[33,81963,81964,81966,81968],{"class":35,"line":201},[33,81965,20077],{"class":167},[33,81967,242],{"class":163},[33,81969,81970],{"class":167}," fitz.open(pdf_path)\n",[33,81972,81973,81976,81978,81980,81982,81984,81986,81988,81990,81992,81994,81996],{"class":35,"line":206},[33,81974,81975],{"class":167},"        char_count ",[33,81977,242],{"class":163},[33,81979,46601],{"class":50},[33,81981,602],{"class":167},[33,81983,928],{"class":50},[33,81985,46608],{"class":167},[33,81987,3459],{"class":54},[33,81989,46613],{"class":167},[33,81991,6124],{"class":163},[33,81993,695],{"class":167},[33,81995,662],{"class":163},[33,81997,46622],{"class":167},[33,81999,82000],{"class":35,"line":224},[33,82001,46627],{"class":167},[33,82003,82004,82006,82009,82011,82014,82016,82018,82020],{"class":35,"line":229},[33,82005,1659],{"class":163},[33,82007,82008],{"class":54}," \"text\"",[33,82010,9994],{"class":163},[33,82012,82013],{"class":167}," char_count ",[33,82015,6009],{"class":163},[33,82017,43599],{"class":50},[33,82019,15715],{"class":163},[33,82021,82022],{"class":54}," 
\"raster\"\n",[33,82024,82025,82027,82029,82031],{"class":35,"line":235},[33,82026,2449],{"class":163},[33,82028,783],{"class":50},[33,82030,1852],{"class":163},[33,82032,1855],{"class":167},[33,82034,82035,82037,82039,82041,82043,82045,82047,82049,82051,82053,82055,82057,82059,82061,82063,82065],{"class":35,"line":250},[33,82036,4051],{"class":163},[33,82038,7590],{"class":50},[33,82040,602],{"class":167},[33,82042,4059],{"class":163},[33,82044,43335],{"class":54},[33,82046,1115],{"class":50},[33,82048,27069],{"class":167},[33,82050,1121],{"class":50},[33,82052,2079],{"class":54},[33,82054,1115],{"class":50},[33,82056,6565],{"class":167},[33,82058,1121],{"class":50},[33,82060,274],{"class":54},[33,82062,1649],{"class":167},[33,82064,190],{"class":163},[33,82066,20843],{"class":167},[33,82068,82069],{"class":35,"line":266},[33,82070,92],{"emptyLinePlaceholder":91},[33,82072,82073,82076,82078,82080,82083],{"class":35,"line":290},[33,82074,82075],{"class":167},"pdf ",[33,82077,242],{"class":163},[33,82079,215],{"class":167},[33,82081,82082],{"class":54},"\"contract_scan.pdf\"",[33,82084,221],{"class":167},[33,82086,82087,82090,82092],{"class":35,"line":295},[33,82088,82089],{"class":167},"kind ",[33,82091,242],{"class":163},[33,82093,82094],{"class":167}," classify_pdf(pdf)\n",[33,82096,82097,82099,82101,82103,82105,82107,82109,82111,82113,82115,82118,82120,82122],{"class":35,"line":300},[33,82098,13474],{"class":50},[33,82100,602],{"class":167},[33,82102,4059],{"class":163},[33,82104,274],{"class":54},[33,82106,1115],{"class":50},[33,82108,68341],{"class":167},[33,82110,1121],{"class":50},[33,82112,2079],{"class":54},[33,82114,1115],{"class":50},[33,82116,82117],{"class":167},"kind",[33,82119,1121],{"class":50},[33,82121,274],{"class":54},[33,82123,221],{"class":167},[33,82125,82126],{"class":35,"line":317},[33,82127,82128],{"class":39},"# → contract_scan.pdf: raster\n",[14,82130,82131,82132,82134,82135,82138,82139,82142],{},"If the result is ",[30,82133,2000],{},", 
use ",[940,82136,82137],{"href":942},"pdfplumber or camelot"," directly — no OCR needed. If it is ",[30,82140,82141],{},"raster",", continue with the pipeline below.",[18,82144,82146],{"id":82145},"step-2-rasterize-pdf-pages","Step 2 — Rasterize PDF Pages",[14,82148,82149],{},"Convert each page to a PIL Image at 300 DPI minimum. Below 300 DPI, character strokes blur and Tesseract misreads adjacent characters.",[14,82151,82152,82153,82155],{},"Two options: ",[30,82154,49144],{}," (wraps Poppler, simple API) or PyMuPDF (no Poppler dependency, faster on large files).",[23,82157,82159],{"className":126,"code":82158,"language":47,"meta":28,"style":28},"# pip install pymupdf Pillow\nfrom pathlib import Path\nfrom PIL import Image\nimport fitz\n\ndef rasterize_pdf_pymupdf(pdf_path: Path, dpi: int = 300) -> list[Image.Image]:\n    \"\"\"Render each page of a PDF to a PIL Image at the specified DPI.\"\"\"\n    pages: list[Image.Image] = []\n    try:\n        doc = fitz.open(pdf_path)\n        for page in doc:\n            pix = page.get_pixmap(dpi=dpi)\n            img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n            pages.append(img)\n        doc.close()\n    except Exception as exc:\n        raise RuntimeError(f\"Rasterize failed for {pdf_path}: {exc}\") from exc\n    return pages\n\n# Alternative: pdf2image (requires poppler installed separately)\n# from pdf2image import convert_from_path\n# pages = convert_from_path(\"contract_scan.pdf\", dpi=300)\n",[30,82160,82161,82166,82176,82186,82193,82197,82215,82220,82229,82235,82243,82254,82270,82285,82290,82294,82304,82339,82346,82350,82355,82360],{"__ignoreMap":28},[33,82162,82163],{"class":35,"line":36},[33,82164,82165],{"class":39},"# pip install pymupdf 
Pillow\n",[33,82167,82168,82170,82172,82174],{"class":35,"line":43},[33,82169,190],{"class":163},[33,82171,193],{"class":167},[33,82173,164],{"class":163},[33,82175,198],{"class":167},[33,82177,82178,82180,82182,82184],{"class":35,"line":61},[33,82179,190],{"class":163},[33,82181,46889],{"class":50},[33,82183,46892],{"class":163},[33,82185,47171],{"class":167},[33,82187,82188,82190],{"class":35,"line":73},[33,82189,164],{"class":163},[33,82191,82192],{"class":167}," fitz\n",[33,82194,82195],{"class":35,"line":88},[33,82196,92],{"emptyLinePlaceholder":91},[33,82198,82199,82201,82204,82207,82209,82211,82213],{"class":35,"line":95},[33,82200,562],{"class":163},[33,82202,82203],{"class":46}," rasterize_pdf_pymupdf",[33,82205,82206],{"class":167},"(pdf_path: Path, dpi: ",[33,82208,1059],{"class":50},[33,82210,212],{"class":163},[33,82212,46935],{"class":50},[33,82214,46938],{"class":167},[33,82216,82217],{"class":35,"line":101},[33,82218,82219],{"class":54},"    \"\"\"Render each page of a PDF to a PIL Image at the specified DPI.\"\"\"\n",[33,82221,82222,82225,82227],{"class":35,"line":171},[33,82223,82224],{"class":167},"    pages: list[Image.Image] ",[33,82226,242],{"class":163},[33,82228,589],{"class":167},[33,82230,82231,82233],{"class":35,"line":179},[33,82232,2424],{"class":163},[33,82234,574],{"class":167},[33,82236,82237,82239,82241],{"class":35,"line":187},[33,82238,20077],{"class":167},[33,82240,242],{"class":163},[33,82242,81970],{"class":167},[33,82244,82245,82247,82249,82251],{"class":35,"line":201},[33,82246,5973],{"class":163},[33,82248,695],{"class":167},[33,82250,662],{"class":163},[33,82252,82253],{"class":167}," doc:\n",[33,82255,82256,82259,82261,82264,82266,82268],{"class":35,"line":206},[33,82257,82258],{"class":167},"            pix ",[33,82260,242],{"class":163},[33,82262,82263],{"class":167}," 
page.get_pixmap(",[33,82265,46966],{"class":238},[33,82267,242],{"class":163},[33,82269,46971],{"class":167},[33,82271,82272,82275,82277,82280,82282],{"class":35,"line":224},[33,82273,82274],{"class":167},"            img ",[33,82276,242],{"class":163},[33,82278,82279],{"class":167}," Image.frombytes(",[33,82281,81499],{"class":54},[33,82283,82284],{"class":167},", [pix.width, pix.height], pix.samples)\n",[33,82286,82287],{"class":35,"line":229},[33,82288,82289],{"class":167},"            pages.append(img)\n",[33,82291,82292],{"class":35,"line":235},[33,82293,46627],{"class":167},[33,82295,82296,82298,82300,82302],{"class":35,"line":250},[33,82297,2449],{"class":163},[33,82299,783],{"class":50},[33,82301,1852],{"class":163},[33,82303,1855],{"class":167},[33,82305,82306,82308,82310,82312,82314,82317,82319,82321,82323,82325,82327,82329,82331,82333,82335,82337],{"class":35,"line":266},[33,82307,4051],{"class":163},[33,82309,7590],{"class":50},[33,82311,602],{"class":167},[33,82313,4059],{"class":163},[33,82315,82316],{"class":54},"\"Rasterize failed for ",[33,82318,1115],{"class":50},[33,82320,27069],{"class":167},[33,82322,1121],{"class":50},[33,82324,2079],{"class":54},[33,82326,1115],{"class":50},[33,82328,6565],{"class":167},[33,82330,1121],{"class":50},[33,82332,274],{"class":54},[33,82334,1649],{"class":167},[33,82336,190],{"class":163},[33,82338,20843],{"class":167},[33,82340,82341,82343],{"class":35,"line":290},[33,82342,1332],{"class":163},[33,82344,82345],{"class":167}," pages\n",[33,82347,82348],{"class":35,"line":295},[33,82349,92],{"emptyLinePlaceholder":91},[33,82351,82352],{"class":35,"line":300},[33,82353,82354],{"class":39},"# Alternative: pdf2image (requires poppler installed separately)\n",[33,82356,82357],{"class":35,"line":317},[33,82358,82359],{"class":39},"# from pdf2image import convert_from_path\n",[33,82361,82362],{"class":35,"line":332},[33,82363,82364],{"class":39},"# pages = convert_from_path(\"contract_scan.pdf\", 
dpi=300)\n",[14,82366,82367,82368,82370,82371,17583,82374,36608,82377,82380],{},"PyMuPDF does not require Poppler. ",[30,82369,49144],{}," requires ",[30,82372,82373],{},"poppler-utils",[30,82375,82376],{},"apt-get install poppler-utils",[30,82378,82379],{},"brew install poppler",") but can be more convenient for batch workflows.",[18,82382,82384],{"id":82383},"step-3-preprocess-images-for-ocr","Step 3 — Preprocess Images for OCR",[2540,82386,2547,82388,2547,82391,2547,82394,2547,2547,82408,2547,82410,2547,82414,2547,82417,2547,82420,2547,82423,2547,82426,2547,82428,2547,82431,2547,82434,2547,82437,2547,82440,2547,82443,2547,82447,2547,82450,2547,82454,2547,82456,2547,82458,2547,82461,2547,82464,2547,82466,2547,82469,2547,82473,2547,82476,2547,82479,2547,82482,2547,82485,2547,2547,82487,2547,82490,2547,82494,2547,82497,82500],{"viewBox":11071,"role":2543,"ariaLabel":82387,"xmlns":2545,"style":2546},"OCR pipeline: rasterize, preprocess (grayscale, threshold, deskew, denoise), OCR, postprocess",[2549,82389,82390],{},"Preprocess → OCR → Postprocess pipeline",[2553,82392,82393],{},"Flow diagram showing the four stages of the scanning OCR pipeline: rasterize PDF pages, preprocess images (grayscale, Otsu threshold, deskew, denoise), run Tesseract OCR with PSM config, and postprocess (confidence filter, text normalise, embed text 
layer).",[2557,82395,2559,82396,2559,82403,2547],{},[2561,82397,2564,82399,2564,82401,2559],{"id":82398,"x1":748,"y1":748,"x2":734,"y2":748},"ocr-scan-grad",[2566,82400],{"offset":748,"style":2568},[2566,82402],{"offset":734,"style":2571},[2573,82404,2564,82406,2559],{"id":82405,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"ocr-scan-arrow",[2580,82407],{"d":2582,"fill":2583},[2585,82409],{"x":55650,"y":2680,"width":11194,"height":2701,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,82411,82413],{"x":82412,"y":26350,"fill":2599,"style":38718},"86","Rasterize",[2000,82415,49144],{"x":82412,"y":82416,"fill":2583,"style":2685},"98",[2000,82418,82419],{"x":82412,"y":26322,"fill":2583,"style":2685},"or PyMuPDF",[2000,82421,82422],{"x":82412,"y":2609,"fill":11166,"style":2685},"300 DPI",[2000,82424,82425],{"x":82412,"y":11132,"fill":2583,"style":2685},"PIL Image",[2585,82427],{"x":38843,"y":2680,"width":11194,"height":2701,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,82429,82430],{"x":26446,"y":26350,"fill":2599,"style":38718},"Preprocess",[2000,82432,82433],{"x":26446,"y":82416,"fill":2583,"style":2685},"Grayscale",[2000,82435,82436],{"x":26446,"y":26322,"fill":2583,"style":2685},"Otsu threshold",[2000,82438,82439],{"x":26446,"y":26332,"fill":2583,"style":2685},"Deskew",[2000,82441,82442],{"x":26446,"y":11112,"fill":2583,"style":2685},"Denoise",[2000,82444,82446],{"x":26446,"y":82445,"fill":11166,"style":2605},"174","OpenCV \u002F Pillow",[2585,82448],{"x":2626,"y":2680,"width":11194,"height":2701,"rx":2591,"fill":82449,"stroke":2593,"style":2594},"url(#ocr-scan-grad)",[2000,82451,82453],{"x":82452,"y":26350,"fill":2599,"style":38718},"454","Tesseract OCR",[2000,82455,79683],{"x":82452,"y":82416,"fill":2599,"style":2685},[2000,82457,48364],{"x":82452,"y":26322,"fill":2599,"style":2685},[2000,82459,82460],{"x":82452,"y":26332,"fill":2599,"style":2685},"PSM 
flags",[2000,82462,82463],{"x":82452,"y":11112,"fill":2599,"style":2685},"lang packs",[2000,82465,79666],{"x":82452,"y":82445,"fill":2599,"style":2605},[2585,82467],{"x":82468,"y":2680,"width":11194,"height":2701,"rx":2591,"fill":2615,"stroke":2593,"style":2594},"564",[2000,82470,82472],{"x":82471,"y":26350,"fill":2599,"style":38718},"638","Postprocess",[2000,82474,82475],{"x":82471,"y":82416,"fill":2583,"style":2685},"Confidence filter",[2000,82477,82478],{"x":82471,"y":26322,"fill":2583,"style":2685},"Text normalise",[2000,82480,82481],{"x":82471,"y":26332,"fill":2583,"style":2685},"Embed text layer",[2000,82483,82484],{"x":82471,"y":11112,"fill":2583,"style":2685},"→ searchable PDF",[2000,82486,52011],{"x":82471,"y":82445,"fill":11166,"style":2605},[35,82488],{"x1":2610,"y1":2610,"x2":64936,"y2":2610,"stroke":2583,"markerEnd":82489,"style":2594},"url(#ocr-scan-arrow)",[35,82491],{"x1":82492,"y1":2610,"x2":82493,"y2":2610,"stroke":2583,"markerEnd":82489,"style":2594},"344","376",[35,82495],{"x1":82496,"y1":2610,"x2":49853,"y2":2610,"stroke":2583,"markerEnd":82489,"style":2594},"528",[2000,82498,82499],{"x":82412,"y":11231,"fill":2583,"style":2605},"\nScanned PDF\n",[2000,82501,82502],{"x":82471,"y":11231,"fill":2583,"style":2605},"\nSearchable PDF\n",[14,82504,82505],{},"Raw scans have low contrast, background grain, and slight rotation. 
Preprocessing before OCR is not optional for production accuracy — it moves character recognition confidence from 40–60 % up to 85–95 % on typical office scans.",[23,82507,82509],{"className":126,"code":82508,"language":47,"meta":28,"style":28},"# pip install opencv-python numpy Pillow\nfrom pathlib import Path\nimport cv2\nimport numpy as np\nfrom PIL import Image\n\ndef preprocess_for_ocr(pil_img: Image.Image) -> np.ndarray:\n    \"\"\"\n    Convert a PIL Image to a denoised, deskewed, binarised numpy array\n    suitable for Tesseract input.\n    \"\"\"\n    # Convert PIL → OpenCV grayscale\n    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)\n\n    # Otsu's binarisation — optimal threshold calculated automatically\n    _, thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n\n    # Fast non-local means denoise (h=10 is good for mild scanner grain)\n    denoised = cv2.fastNlMeansDenoising(thresh, h=10)\n\n    # Deskew via minAreaRect on foreground pixels\n    coords = np.column_stack(np.where(denoised > 0))\n    if len(coords) >= 4:\n        rect = cv2.minAreaRect(coords)\n        angle = rect[-1]\n        angle = -(90 + angle) if angle \u003C -45 else -angle\n        h, w = denoised.shape[:2]\n        M = cv2.getRotationMatrix2D((w \u002F\u002F 2, h \u002F\u002F 2), angle, 1.0)\n        denoised = cv2.warpAffine(\n            denoised, M, (w, h),\n            flags=cv2.INTER_CUBIC,\n            borderMode=cv2.BORDER_REPLICATE,\n        )\n\n    return denoised\n\n\ndef save_debug_image(arr: np.ndarray, output_path: Path) -> None:\n    \"\"\"Optionally save the preprocessed image for visual inspection.\"\"\"\n    cv2.imwrite(str(output_path), 
arr)\n",[30,82510,82511,82516,82526,82532,82542,82552,82556,82565,82569,82574,82579,82583,82588,82602,82606,82611,82642,82646,82651,82670,82674,82679,82695,82711,82721,82737,82773,82787,82816,82826,82831,82846,82860,82864,82868,82875,82879,82883,82897,82902],{"__ignoreMap":28},[33,82512,82513],{"class":35,"line":36},[33,82514,82515],{"class":39},"# pip install opencv-python numpy Pillow\n",[33,82517,82518,82520,82522,82524],{"class":35,"line":43},[33,82519,190],{"class":163},[33,82521,193],{"class":167},[33,82523,164],{"class":163},[33,82525,198],{"class":167},[33,82527,82528,82530],{"class":35,"line":61},[33,82529,164],{"class":163},[33,82531,41647],{"class":167},[33,82533,82534,82536,82538,82540],{"class":35,"line":73},[33,82535,164],{"class":163},[33,82537,48399],{"class":167},[33,82539,495],{"class":163},[33,82541,48404],{"class":167},[33,82543,82544,82546,82548,82550],{"class":35,"line":88},[33,82545,190],{"class":163},[33,82547,46889],{"class":50},[33,82549,46892],{"class":163},[33,82551,47171],{"class":167},[33,82553,82554],{"class":35,"line":95},[33,82555,92],{"emptyLinePlaceholder":91},[33,82557,82558,82560,82562],{"class":35,"line":101},[33,82559,562],{"class":163},[33,82561,48421],{"class":46},[33,82563,82564],{"class":167},"(pil_img: Image.Image) -> np.ndarray:\n",[33,82566,82567],{"class":35,"line":171},[33,82568,7673],{"class":54},[33,82570,82571],{"class":35,"line":179},[33,82572,82573],{"class":54},"    Convert a PIL Image to a denoised, deskewed, binarised numpy array\n",[33,82575,82576],{"class":35,"line":187},[33,82577,82578],{"class":54},"    suitable for Tesseract input.\n",[33,82580,82581],{"class":35,"line":201},[33,82582,7673],{"class":54},[33,82584,82585],{"class":35,"line":206},[33,82586,82587],{"class":39},"    # Convert PIL → OpenCV grayscale\n",[33,82589,82590,82592,82594,82597,82600],{"class":35,"line":224},[33,82591,80264],{"class":167},[33,82593,242],{"class":163},[33,82595,82596],{"class":167}," cv2.cvtColor(np.array(pil_img), 
cv2.",[33,82598,82599],{"class":50},"COLOR_RGB2GRAY",[33,82601,221],{"class":167},[33,82603,82604],{"class":35,"line":229},[33,82605,92],{"emptyLinePlaceholder":91},[33,82607,82608],{"class":35,"line":235},[33,82609,82610],{"class":39},"    # Otsu's binarisation — optimal threshold calculated automatically\n",[33,82612,82613,82616,82618,82621,82623,82625,82627,82630,82632,82635,82637,82640],{"class":35,"line":250},[33,82614,82615],{"class":167},"    _, thresh ",[33,82617,242],{"class":163},[33,82619,82620],{"class":167}," cv2.threshold(img, ",[33,82622,748],{"class":50},[33,82624,365],{"class":167},[33,82626,2678],{"class":50},[33,82628,82629],{"class":167},", cv2.",[33,82631,48521],{"class":50},[33,82633,82634],{"class":163}," +",[33,82636,41661],{"class":167},[33,82638,82639],{"class":50},"THRESH_OTSU",[33,82641,221],{"class":167},[33,82643,82644],{"class":35,"line":266},[33,82645,92],{"emptyLinePlaceholder":91},[33,82647,82648],{"class":35,"line":290},[33,82649,82650],{"class":39},"    # Fast non-local means denoise (h=10 is good for mild scanner grain)\n",[33,82652,82653,82656,82658,82661,82664,82666,82668],{"class":35,"line":295},[33,82654,82655],{"class":167},"    denoised ",[33,82657,242],{"class":163},[33,82659,82660],{"class":167}," cv2.fastNlMeansDenoising(thresh, ",[33,82662,82663],{"class":238},"h",[33,82665,242],{"class":163},[33,82667,3545],{"class":50},[33,82669,221],{"class":167},[33,82671,82672],{"class":35,"line":300},[33,82673,92],{"emptyLinePlaceholder":91},[33,82675,82676],{"class":35,"line":317},[33,82677,82678],{"class":39},"    # Deskew via minAreaRect on foreground pixels\n",[33,82680,82681,82684,82686,82689,82691,82693],{"class":35,"line":332},[33,82682,82683],{"class":167},"    coords ",[33,82685,242],{"class":163},[33,82687,82688],{"class":167}," np.column_stack(np.where(denoised 
",[33,82690,6009],{"class":163},[33,82692,10791],{"class":50},[33,82694,371],{"class":167},[33,82696,82697,82699,82701,82704,82706,82709],{"class":35,"line":347},[33,82698,617],{"class":163},[33,82700,4037],{"class":50},[33,82702,82703],{"class":167},"(coords) ",[33,82705,43000],{"class":163},[33,82707,82708],{"class":50}," 4",[33,82710,574],{"class":167},[33,82712,82713,82716,82718],{"class":35,"line":374},[33,82714,82715],{"class":167},"        rect ",[33,82717,242],{"class":163},[33,82719,82720],{"class":167}," cv2.minAreaRect(coords)\n",[33,82722,82723,82726,82728,82731,82733,82735],{"class":35,"line":397},[33,82724,82725],{"class":167},"        angle ",[33,82727,242],{"class":163},[33,82729,82730],{"class":167}," rect[",[33,82732,4126],{"class":163},[33,82734,734],{"class":50},[33,82736,9202],{"class":167},[33,82738,82739,82741,82743,82745,82747,82749,82751,82754,82756,82759,82761,82763,82766,82768,82770],{"class":35,"line":653},[33,82740,82725],{"class":167},[33,82742,242],{"class":163},[33,82744,39025],{"class":163},[33,82746,602],{"class":167},[33,82748,2630],{"class":50},[33,82750,82634],{"class":163},[33,82752,82753],{"class":167}," angle) ",[33,82755,2491],{"class":163},[33,82757,82758],{"class":167}," angle ",[33,82760,4043],{"class":163},[33,82762,39025],{"class":163},[33,82764,82765],{"class":50},"45",[33,82767,15715],{"class":163},[33,82769,39025],{"class":163},[33,82771,82772],{"class":167},"angle\n",[33,82774,82775,82778,82780,82783,82785],{"class":35,"line":667},[33,82776,82777],{"class":167},"        h, w ",[33,82779,242],{"class":163},[33,82781,82782],{"class":167}," denoised.shape[:",[33,82784,1533],{"class":50},[33,82786,9202],{"class":167},[33,82788,82789,82792,82794,82797,82799,82801,82804,82806,82808,82811,82814],{"class":35,"line":675},[33,82790,82791],{"class":167},"        M ",[33,82793,242],{"class":163},[33,82795,82796],{"class":167}," cv2.getRotationMatrix2D((w 
",[33,82798,74328],{"class":163},[33,82800,7451],{"class":50},[33,82802,82803],{"class":167},", h ",[33,82805,74328],{"class":163},[33,82807,7451],{"class":50},[33,82809,82810],{"class":167},"), angle, ",[33,82812,82813],{"class":50},"1.0",[33,82815,221],{"class":167},[33,82817,82818,82821,82823],{"class":35,"line":689},[33,82819,82820],{"class":167},"        denoised ",[33,82822,242],{"class":163},[33,82824,82825],{"class":167}," cv2.warpAffine(\n",[33,82827,82828],{"class":35,"line":703},[33,82829,82830],{"class":167},"            denoised, M, (w, h),\n",[33,82832,82833,82836,82838,82841,82844],{"class":35,"line":714},[33,82834,82835],{"class":238},"            flags",[33,82837,242],{"class":163},[33,82839,82840],{"class":167},"cv2.",[33,82842,82843],{"class":50},"INTER_CUBIC",[33,82845,247],{"class":167},[33,82847,82848,82851,82853,82855,82858],{"class":35,"line":723},[33,82849,82850],{"class":238},"            borderMode",[33,82852,242],{"class":163},[33,82854,82840],{"class":167},[33,82856,82857],{"class":50},"BORDER_REPLICATE",[33,82859,247],{"class":167},[33,82861,82862],{"class":35,"line":754},[33,82863,5867],{"class":167},[33,82865,82866],{"class":35,"line":771},[33,82867,92],{"emptyLinePlaceholder":91},[33,82869,82870,82872],{"class":35,"line":777},[33,82871,1332],{"class":163},[33,82873,82874],{"class":167}," denoised\n",[33,82876,82877],{"class":35,"line":788},[33,82878,92],{"emptyLinePlaceholder":91},[33,82880,82881],{"class":35,"line":804},[33,82882,92],{"emptyLinePlaceholder":91},[33,82884,82885,82887,82890,82893,82895],{"class":35,"line":809},[33,82886,562],{"class":163},[33,82888,82889],{"class":46}," save_debug_image",[33,82891,82892],{"class":167},"(arr: np.ndarray, output_path: Path) -> ",[33,82894,571],{"class":50},[33,82896,574],{"class":167},[33,82898,82899],{"class":35,"line":819},[33,82900,82901],{"class":54},"    \"\"\"Optionally save the preprocessed image for visual 
inspection.\"\"\"\n",[33,82903,82904,82907,82909],{"class":35,"line":829},[33,82905,82906],{"class":167},"    cv2.imwrite(",[33,82908,1053],{"class":50},[33,82910,82911],{"class":167},"(output_path), arr)\n",[424,82913,82915],{"id":82914},"when-to-skip-which-steps","When to Skip Which Steps",[4211,82917,82918,82924,82930],{},[4214,82919,82920,82923],{},[1974,82921,82922],{},"Already-binarised images"," (black-and-white TIFF scans): skip the threshold step; applying Otsu again may invert regions.",[4214,82925,82926,82929],{},[1974,82927,82928],{},"Digital-born PDFs saved as images"," (e.g. screenshots): skip deskew; they are already axis-aligned.",[4214,82931,82932,82935,82936,82939],{},[1974,82933,82934],{},"High-resolution colour scans (600+ DPI)",": apply ",[30,82937,82938],{},"cv2.resize"," to scale down to 300 DPI first — larger images slow Tesseract without accuracy benefit.",[14,82941,82942],{},"For adaptive thresholding on low-contrast or shadow-heavy scans, replace the Otsu step with:",[23,82944,82946],{"className":126,"code":82945,"language":47,"meta":28,"style":28},"# pip install opencv-python\nimport cv2\n\n# Adaptive Gaussian threshold — better for uneven illumination\nthresh = cv2.adaptiveThreshold(\n    img, 255,\n    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\n    cv2.THRESH_BINARY,\n    blockSize=31,   # must be odd; increase for larger text\n    C=10,\n)\n",[30,82947,82948,82953,82959,82963,82968,82977,82986,82995,83003,83018,83029],{"__ignoreMap":28},[33,82949,82950],{"class":35,"line":36},[33,82951,82952],{"class":39},"# pip install opencv-python\n",[33,82954,82955,82957],{"class":35,"line":43},[33,82956,164],{"class":163},[33,82958,41647],{"class":167},[33,82960,82961],{"class":35,"line":61},[33,82962,92],{"emptyLinePlaceholder":91},[33,82964,82965],{"class":35,"line":73},[33,82966,82967],{"class":39},"# Adaptive Gaussian threshold — better for uneven illumination\n",[33,82969,82970,82973,82975],{"class":35,"line":88},[33,82971,82972],{"class":167},"thresh 
",[33,82974,242],{"class":163},[33,82976,48495],{"class":167},[33,82978,82979,82982,82984],{"class":35,"line":95},[33,82980,82981],{"class":167},"    img, ",[33,82983,2678],{"class":50},[33,82985,247],{"class":167},[33,82987,82988,82991,82993],{"class":35,"line":101},[33,82989,82990],{"class":167},"    cv2.",[33,82992,48512],{"class":50},[33,82994,247],{"class":167},[33,82996,82997,82999,83001],{"class":35,"line":171},[33,82998,82990],{"class":167},[33,83000,48521],{"class":50},[33,83002,247],{"class":167},[33,83004,83005,83008,83010,83013,83015],{"class":35,"line":179},[33,83006,83007],{"class":238},"    blockSize",[33,83009,242],{"class":163},[33,83011,83012],{"class":50},"31",[33,83014,1166],{"class":167},[33,83016,83017],{"class":39},"# must be odd; increase for larger text\n",[33,83019,83020,83023,83025,83027],{"class":35,"line":187},[33,83021,83022],{"class":238},"    C",[33,83024,242],{"class":163},[33,83026,3545],{"class":50},[33,83028,247],{"class":167},[33,83030,83031],{"class":35,"line":201},[33,83032,221],{"class":167},[18,83034,83036],{"id":83035},"step-4-run-tesseract-ocr","Step 4 — Run Tesseract OCR",[14,83038,83039,83041,83042,83044],{},[30,83040,79683],{}," returns plain text. 
",[30,83043,48364],{}," returns per-word bounding boxes and confidence scores — use it whenever you need spatial layout or confidence filtering.",[23,83046,83048],{"className":126,"code":83047,"language":47,"meta":28,"style":28},"# pip install pytesseract Pillow\nfrom pathlib import Path\nimport pytesseract\nfrom PIL import Image\nimport numpy as np\n\ndef ocr_page(arr: np.ndarray, lang: str = \"eng\", psm: int = 3) -> str:\n    \"\"\"\n    Run Tesseract on a preprocessed numpy array and return the full text.\n    psm=3: fully automatic page segmentation (default).\n    psm=6: assume a single uniform block of text.\n    psm=11: sparse text — find as much text as possible in no particular order.\n    \"\"\"\n    pil_img = Image.fromarray(arr)\n    config = f\"--psm {psm}\"\n    try:\n        return pytesseract.image_to_string(pil_img, lang=lang, config=config)\n    except pytesseract.pytesseract.TesseractNotFoundError:\n        raise RuntimeError(\n            \"Tesseract binary not found — see \"\n            \"\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Ffix-tesseract-not-found-error\u002F\"\n        )\n\n\ndef ocr_page_with_confidence(\n    arr: np.ndarray,\n    lang: str = \"eng\",\n    psm: int = 3,\n    min_conf: int = 60,\n) -> str:\n    \"\"\"\n    Run OCR and discard tokens below min_conf. 
Reduces noise in output\n    at the cost of occasionally dropping low-quality but correct characters.\n    \"\"\"\n    pil_img = Image.fromarray(arr)\n    config = f\"--psm {psm}\"\n    data = pytesseract.image_to_data(\n        pil_img, lang=lang, config=config,\n        output_type=pytesseract.Output.DICT,\n    )\n    tokens = [\n        data[\"text\"][i]\n        for i in range(len(data[\"text\"]))\n        if int(data[\"conf\"][i]) >= min_conf and data[\"text\"][i].strip()\n    ]\n    return \" \".join(tokens)\n",[30,83049,83050,83054,83064,83070,83080,83090,83094,83125,83129,83134,83139,83144,83149,83153,83163,83184,83190,83212,83218,83226,83231,83236,83240,83244,83248,83257,83262,83275,83288,83301,83309,83313,83318,83323,83327,83335,83353,83362,83380,83393,83397,83405,83415,83435,83461,83465],{"__ignoreMap":28},[33,83051,83052],{"class":35,"line":36},[33,83053,80141],{"class":39},[33,83055,83056,83058,83060,83062],{"class":35,"line":43},[33,83057,190],{"class":163},[33,83059,193],{"class":167},[33,83061,164],{"class":163},[33,83063,198],{"class":167},[33,83065,83066,83068],{"class":35,"line":61},[33,83067,164],{"class":163},[33,83069,47178],{"class":167},[33,83071,83072,83074,83076,83078],{"class":35,"line":73},[33,83073,190],{"class":163},[33,83075,46889],{"class":50},[33,83077,46892],{"class":163},[33,83079,47171],{"class":167},[33,83081,83082,83084,83086,83088],{"class":35,"line":88},[33,83083,164],{"class":163},[33,83085,48399],{"class":167},[33,83087,495],{"class":163},[33,83089,48404],{"class":167},[33,83091,83092],{"class":35,"line":95},[33,83093,92],{"emptyLinePlaceholder":91},[33,83095,83096,83098,83100,83103,83105,83107,83110,83113,83115,83117,83119,83121,83123],{"class":35,"line":101},[33,83097,562],{"class":163},[33,83099,47189],{"class":46},[33,83101,83102],{"class":167},"(arr: np.ndarray, lang: ",[33,83104,1053],{"class":50},[33,83106,212],{"class":163},[33,83108,83109],{"class":54}," \"eng\"",[33,83111,83112],{"class":167},", psm: 
",[33,83114,1059],{"class":50},[33,83116,212],{"class":163},[33,83118,1714],{"class":50},[33,83120,1617],{"class":167},[33,83122,1053],{"class":50},[33,83124,574],{"class":167},[33,83126,83127],{"class":35,"line":171},[33,83128,7673],{"class":54},[33,83130,83131],{"class":35,"line":179},[33,83132,83133],{"class":54},"    Run Tesseract on a preprocessed numpy array and return the full text.\n",[33,83135,83136],{"class":35,"line":187},[33,83137,83138],{"class":54},"    psm=3: fully automatic page segmentation (default).\n",[33,83140,83141],{"class":35,"line":201},[33,83142,83143],{"class":54},"    psm=6: assume a single uniform block of text.\n",[33,83145,83146],{"class":35,"line":206},[33,83147,83148],{"class":54},"    psm=11: sparse text — find as much text as possible in no particular order.\n",[33,83150,83151],{"class":35,"line":224},[33,83152,7673],{"class":54},[33,83154,83155,83158,83160],{"class":35,"line":229},[33,83156,83157],{"class":167},"    pil_img ",[33,83159,242],{"class":163},[33,83161,83162],{"class":167}," Image.fromarray(arr)\n",[33,83164,83165,83168,83170,83172,83175,83177,83180,83182],{"class":35,"line":235},[33,83166,83167],{"class":167},"    config ",[33,83169,242],{"class":163},[33,83171,1110],{"class":163},[33,83173,83174],{"class":54},"\"--psm ",[33,83176,1115],{"class":50},[33,83178,83179],{"class":167},"psm",[33,83181,1121],{"class":50},[33,83183,7504],{"class":54},[33,83185,83186,83188],{"class":35,"line":250},[33,83187,2424],{"class":163},[33,83189,574],{"class":167},[33,83191,83192,83194,83197,83199,83201,83204,83207,83209],{"class":35,"line":266},[33,83193,1659],{"class":163},[33,83195,83196],{"class":167}," pytesseract.image_to_string(pil_img, ",[33,83198,81173],{"class":238},[33,83200,242],{"class":163},[33,83202,83203],{"class":167},"lang, 
",[33,83205,83206],{"class":238},"config",[33,83208,242],{"class":163},[33,83210,83211],{"class":167},"config)\n",[33,83213,83214,83216],{"class":35,"line":290},[33,83215,2449],{"class":163},[33,83217,81197],{"class":167},[33,83219,83220,83222,83224],{"class":35,"line":295},[33,83221,4051],{"class":163},[33,83223,7590],{"class":50},[33,83225,7637],{"class":167},[33,83227,83228],{"class":35,"line":300},[33,83229,83230],{"class":54},"            \"Tesseract binary not found — see \"\n",[33,83232,83233],{"class":35,"line":317},[33,83234,83235],{"class":54},"            \"\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Ffix-tesseract-not-found-error\u002F\"\n",[33,83237,83238],{"class":35,"line":332},[33,83239,5867],{"class":167},[33,83241,83242],{"class":35,"line":347},[33,83243,92],{"emptyLinePlaceholder":91},[33,83245,83246],{"class":35,"line":374},[33,83247,92],{"emptyLinePlaceholder":91},[33,83249,83250,83252,83255],{"class":35,"line":397},[33,83251,562],{"class":163},[33,83253,83254],{"class":46}," ocr_page_with_confidence",[33,83256,7637],{"class":167},[33,83258,83259],{"class":35,"line":653},[33,83260,83261],{"class":167},"    arr: np.ndarray,\n",[33,83263,83264,83267,83269,83271,83273],{"class":35,"line":667},[33,83265,83266],{"class":167},"    lang: ",[33,83268,1053],{"class":50},[33,83270,212],{"class":163},[33,83272,83109],{"class":54},[33,83274,247],{"class":167},[33,83276,83277,83280,83282,83284,83286],{"class":35,"line":675},[33,83278,83279],{"class":167},"    psm: ",[33,83281,1059],{"class":50},[33,83283,212],{"class":163},[33,83285,1714],{"class":50},[33,83287,247],{"class":167},[33,83289,83290,83293,83295,83297,83299],{"class":35,"line":689},[33,83291,83292],{"class":167},"    min_conf: 
",[33,83294,1059],{"class":50},[33,83296,212],{"class":163},[33,83298,28533],{"class":50},[33,83300,247],{"class":167},[33,83302,83303,83305,83307],{"class":35,"line":703},[33,83304,1617],{"class":167},[33,83306,1053],{"class":50},[33,83308,574],{"class":167},[33,83310,83311],{"class":35,"line":714},[33,83312,7673],{"class":54},[33,83314,83315],{"class":35,"line":723},[33,83316,83317],{"class":54},"    Run OCR and discard tokens below min_conf. Reduces noise in output\n",[33,83319,83320],{"class":35,"line":754},[33,83321,83322],{"class":54},"    at the cost of occasionally dropping low-quality but correct characters.\n",[33,83324,83325],{"class":35,"line":771},[33,83326,7673],{"class":54},[33,83328,83329,83331,83333],{"class":35,"line":777},[33,83330,83157],{"class":167},[33,83332,242],{"class":163},[33,83334,83162],{"class":167},[33,83336,83337,83339,83341,83343,83345,83347,83349,83351],{"class":35,"line":788},[33,83338,83167],{"class":167},[33,83340,242],{"class":163},[33,83342,1110],{"class":163},[33,83344,83174],{"class":54},[33,83346,1115],{"class":50},[33,83348,83179],{"class":167},[33,83350,1121],{"class":50},[33,83352,7504],{"class":54},[33,83354,83355,83357,83359],{"class":35,"line":804},[33,83356,24507],{"class":167},[33,83358,242],{"class":163},[33,83360,83361],{"class":167}," pytesseract.image_to_data(\n",[33,83363,83364,83367,83369,83371,83373,83375,83377],{"class":35,"line":809},[33,83365,83366],{"class":167},"        pil_img, ",[33,83368,81173],{"class":238},[33,83370,242],{"class":163},[33,83372,83203],{"class":167},[33,83374,83206],{"class":238},[33,83376,242],{"class":163},[33,83378,83379],{"class":167},"config,\n",[33,83381,83382,83385,83387,83389,83391],{"class":35,"line":819},[33,83383,83384],{"class":238},"        
output_type",[33,83386,242],{"class":163},[33,83388,47248],{"class":167},[33,83390,47251],{"class":50},[33,83392,247],{"class":167},[33,83394,83395],{"class":35,"line":829},[33,83396,1202],{"class":167},[33,83398,83399,83401,83403],{"class":35,"line":834},[33,83400,47258],{"class":167},[33,83402,242],{"class":163},[33,83404,7473],{"class":167},[33,83406,83407,83410,83412],{"class":35,"line":839},[33,83408,83409],{"class":167},"        data[",[33,83411,3459],{"class":54},[33,83413,83414],{"class":167},"][i]\n",[33,83416,83417,83419,83421,83423,83425,83427,83429,83431,83433],{"class":35,"line":860},[33,83418,5973],{"class":163},[33,83420,47269],{"class":167},[33,83422,662],{"class":163},[33,83424,1801],{"class":50},[33,83426,602],{"class":167},[33,83428,928],{"class":50},[33,83430,20361],{"class":167},[33,83432,3459],{"class":54},[33,83434,7211],{"class":167},[33,83436,83437,83439,83441,83443,83445,83448,83450,83453,83455,83457,83459],{"class":35,"line":887},[33,83438,8221],{"class":163},[33,83440,3149],{"class":50},[33,83442,20361],{"class":167},[33,83444,47313],{"class":54},[33,83446,83447],{"class":167},"][i]) ",[33,83449,43000],{"class":163},[33,83451,83452],{"class":167}," min_conf ",[33,83454,6001],{"class":163},[33,83456,47294],{"class":167},[33,83458,3459],{"class":54},[33,83460,47299],{"class":167},[33,83462,83463],{"class":35,"line":907},[33,83464,19559],{"class":167},[33,83466,83467,83469,83471],{"class":35,"line":1826},[33,83468,1332],{"class":163},[33,83470,57412],{"class":54},[33,83472,83473],{"class":167},".join(tokens)\n",[424,83475,83477],{"id":83476},"page-segmentation-mode-reference","Page Segmentation Mode Reference",[4273,83479,83480,83490],{},[4276,83481,83482],{},[4279,83483,83484,83487],{},[4282,83485,83486],{},"PSM",[4282,83488,83489],{},"Use when",[4292,83491,83492,83500,83507,83514,83521,83528],{},[4279,83493,83494,83497],{},[4297,83495,83496],{},"3 (default)",[4297,83498,83499],{},"Multi-column page with mixed 
content",[4279,83501,83502,83504],{},[4297,83503,1503],{},[4297,83505,83506],{},"Single column, variable text sizes",[4279,83508,83509,83511],{},[4297,83510,2681],{},[4297,83512,83513],{},"Single uniform block of text",[4279,83515,83516,83518],{},[4297,83517,1179],{},[4297,83519,83520],{},"Single text line",[4279,83522,83523,83525],{},[4297,83524,17260],{},[4297,83526,83527],{},"Sparse text — forms with scattered labels",[4279,83529,83530,83533],{},[4297,83531,83532],{},"13",[4297,83534,83535],{},"Raw line — no layout analysis",[14,83537,83538,83539,83542,83543,83546],{},"Wrong PSM on multi-column pages causes Tesseract to concatenate columns horizontally, producing garbled output. When in doubt, try ",[30,83540,83541],{},"--psm 3"," first, then ",[30,83544,83545],{},"--psm 4"," if columns merge incorrectly.",[18,83548,83550],{"id":83549},"step-5-confidence-filtering","Step 5 — Confidence Filtering",[14,83552,83553,83555,83556,83559],{},[30,83554,48364],{}," returns a ",[30,83557,83558],{},"-1"," confidence for non-word tokens (spaces, line separators). 
Filter those out before joining:",[23,83561,83563],{"className":126,"code":83562,"language":47,"meta":28,"style":28},"# pip install pytesseract Pillow\nimport pytesseract\nfrom PIL import Image\nimport numpy as np\n\ndef extract_high_confidence_words(\n    arr: np.ndarray,\n    min_conf: int = 65,\n) -> list[dict]:\n    \"\"\"\n    Return a list of dicts with text, bounding box, and confidence\n    for tokens that pass the confidence threshold.\n    \"\"\"\n    pil_img = Image.fromarray(arr)\n    data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)\n    results = []\n    for i, conf in enumerate(data[\"conf\"]):\n        if int(conf) \u003C min_conf:\n            continue\n        text = data[\"text\"][i].strip()\n        if not text:\n            continue\n        results.append({\n            \"text\": text,\n            \"conf\": int(conf),\n            \"x\": data[\"left\"][i],\n            \"y\": data[\"top\"][i],\n            \"w\": data[\"width\"][i],\n            \"h\": data[\"height\"][i],\n        })\n    return 
results\n",[30,83564,83565,83569,83575,83585,83595,83599,83608,83612,83625,83633,83637,83642,83647,83651,83659,83678,83686,83704,83718,83722,83734,83742,83746,83751,83759,83771,83784,83795,83807,83819,83824],{"__ignoreMap":28},[33,83566,83567],{"class":35,"line":36},[33,83568,80141],{"class":39},[33,83570,83571,83573],{"class":35,"line":43},[33,83572,164],{"class":163},[33,83574,47178],{"class":167},[33,83576,83577,83579,83581,83583],{"class":35,"line":61},[33,83578,190],{"class":163},[33,83580,46889],{"class":50},[33,83582,46892],{"class":163},[33,83584,47171],{"class":167},[33,83586,83587,83589,83591,83593],{"class":35,"line":73},[33,83588,164],{"class":163},[33,83590,48399],{"class":167},[33,83592,495],{"class":163},[33,83594,48404],{"class":167},[33,83596,83597],{"class":35,"line":88},[33,83598,92],{"emptyLinePlaceholder":91},[33,83600,83601,83603,83606],{"class":35,"line":95},[33,83602,562],{"class":163},[33,83604,83605],{"class":46}," extract_high_confidence_words",[33,83607,7637],{"class":167},[33,83609,83610],{"class":35,"line":101},[33,83611,83261],{"class":167},[33,83613,83614,83616,83618,83620,83623],{"class":35,"line":171},[33,83615,83292],{"class":167},[33,83617,1059],{"class":50},[33,83619,212],{"class":163},[33,83621,83622],{"class":50}," 65",[33,83624,247],{"class":167},[33,83626,83627,83629,83631],{"class":35,"line":179},[33,83628,44775],{"class":167},[33,83630,37100],{"class":50},[33,83632,17477],{"class":167},[33,83634,83635],{"class":35,"line":187},[33,83636,7673],{"class":54},[33,83638,83639],{"class":35,"line":201},[33,83640,83641],{"class":54},"    Return a list of dicts with text, bounding box, and confidence\n",[33,83643,83644],{"class":35,"line":206},[33,83645,83646],{"class":54},"    for tokens that pass the confidence 
threshold.\n",[33,83648,83649],{"class":35,"line":224},[33,83650,7673],{"class":54},[33,83652,83653,83655,83657],{"class":35,"line":229},[33,83654,83157],{"class":167},[33,83656,242],{"class":163},[33,83658,83162],{"class":167},[33,83660,83661,83663,83665,83668,83670,83672,83674,83676],{"class":35,"line":235},[33,83662,24507],{"class":167},[33,83664,242],{"class":163},[33,83666,83667],{"class":167}," pytesseract.image_to_data(pil_img, ",[33,83669,47243],{"class":238},[33,83671,242],{"class":163},[33,83673,47248],{"class":167},[33,83675,47251],{"class":50},[33,83677,221],{"class":167},[33,83679,83680,83682,83684],{"class":35,"line":250},[33,83681,37112],{"class":167},[33,83683,242],{"class":163},[33,83685,589],{"class":167},[33,83687,83688,83690,83693,83695,83697,83699,83701],{"class":35,"line":266},[33,83689,656],{"class":163},[33,83691,83692],{"class":167}," i, conf ",[33,83694,662],{"class":163},[33,83696,7403],{"class":50},[33,83698,20361],{"class":167},[33,83700,47313],{"class":54},[33,83702,83703],{"class":167},"]):\n",[33,83705,83706,83708,83710,83713,83715],{"class":35,"line":290},[33,83707,8221],{"class":163},[33,83709,3149],{"class":50},[33,83711,83712],{"class":167},"(conf) ",[33,83714,4043],{"class":163},[33,83716,83717],{"class":167}," min_conf:\n",[33,83719,83720],{"class":35,"line":295},[33,83721,9330],{"class":163},[33,83723,83724,83726,83728,83730,83732],{"class":35,"line":300},[33,83725,47289],{"class":167},[33,83727,242],{"class":163},[33,83729,47294],{"class":167},[33,83731,3459],{"class":54},[33,83733,47299],{"class":167},[33,83735,83736,83738,83740],{"class":35,"line":317},[33,83737,8221],{"class":163},[33,83739,620],{"class":163},[33,83741,44613],{"class":167},[33,83743,83744],{"class":35,"line":332},[33,83745,9330],{"class":163},[33,83747,83748],{"class":35,"line":347},[33,83749,83750],{"class":167},"        results.append({\n",[33,83752,83753,83756],{"class":35,"line":374},[33,83754,83755],{"class":54},"            
\"text\"",[33,83757,83758],{"class":167},": text,\n",[33,83760,83761,83764,83766,83768],{"class":35,"line":397},[33,83762,83763],{"class":54},"            \"conf\"",[33,83765,2079],{"class":167},[33,83767,1059],{"class":50},[33,83769,83770],{"class":167},"(conf),\n",[33,83772,83773,83776,83779,83781],{"class":35,"line":653},[33,83774,83775],{"class":54},"            \"x\"",[33,83777,83778],{"class":167},": data[",[33,83780,28050],{"class":54},[33,83782,83783],{"class":167},"][i],\n",[33,83785,83786,83789,83791,83793],{"class":35,"line":667},[33,83787,83788],{"class":54},"            \"y\"",[33,83790,83778],{"class":167},[33,83792,43412],{"class":54},[33,83794,83783],{"class":167},[33,83796,83797,83800,83802,83805],{"class":35,"line":675},[33,83798,83799],{"class":54},"            \"w\"",[33,83801,83778],{"class":167},[33,83803,83804],{"class":54},"\"width\"",[33,83806,83783],{"class":167},[33,83808,83809,83812,83814,83817],{"class":35,"line":689},[33,83810,83811],{"class":54},"            \"h\"",[33,83813,83778],{"class":167},[33,83815,83816],{"class":54},"\"height\"",[33,83818,83783],{"class":167},[33,83820,83821],{"class":35,"line":703},[33,83822,83823],{"class":167},"        })\n",[33,83825,83826,83828],{"class":35,"line":714},[33,83827,1332],{"class":163},[33,83829,14211],{"class":167},[14,83831,83832,83833,83835],{},"The spatial data (x, y, w, h) is useful downstream — the coordinate-mapping approach in ",[940,83834,10077],{"href":10076}," uses these bounding boxes to reconstruct row and column structure without needing vector lines.",[18,83837,83839],{"id":83838},"step-6-build-a-searchable-pdf","Step 6 — Build a Searchable PDF",[14,83841,83842],{},"Embed an invisible OCR text layer over the original scan so the file is searchable without changing how it looks.",[23,83844,83846],{"className":126,"code":83845,"language":47,"meta":28,"style":28},"# pip install pymupdf pytesseract Pillow\nfrom pathlib import Path\nimport fitz  # PyMuPDF\nimport pytesseract\nfrom 
PIL import Image\n\ndef make_searchable_pdf(input_pdf: Path, output_pdf: Path, lang: str = \"eng\") -> None:\n    \"\"\"\n    For each page, run Tesseract to generate a hidden-text PDF overlay,\n    then merge that overlay onto the original scan page.\n    \"\"\"\n    try:\n        doc = fitz.open(input_pdf)\n        for page_num, page in enumerate(doc):\n            pix = page.get_pixmap(dpi=300)\n            img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n\n            # Tesseract produces a PDF with transparent text at the correct positions\n            ocr_pdf_bytes = pytesseract.image_to_pdf_or_hocr(img, extension=\"pdf\", lang=lang)\n\n            overlay = fitz.open(\"pdf\", ocr_pdf_bytes)\n            # show_pdf_page copies the overlay text layer onto the original page\n            page.show_pdf_page(page.rect, overlay, 0)\n            overlay.close()\n\n        doc.save(str(output_pdf), garbage=4, deflate=True)\n        print(f\"Searchable PDF written → {output_pdf}\")\n    except Exception as exc:\n        raise RuntimeError(f\"Failed to create searchable PDF: {exc}\") from exc\n\n\nmake_searchable_pdf(\n    Path(\"scans\u002Finvoice_001.pdf\"),\n    Path(\"output\u002Finvoice_001_searchable.pdf\"),\n)\n",[30,83847,83848,83853,83863,83871,83877,83887,83891,83913,83917,83922,83927,83931,83937,83946,83959,83975,83987,83991,83996,84022,84026,84040,84045,84054,84059,84063,84091,84113,84123,84150,84154,84158,84163,84173,84182],{"__ignoreMap":28},[33,83849,83850],{"class":35,"line":36},[33,83851,83852],{"class":39},"# pip install pymupdf pytesseract 
Pillow\n",[33,83854,83855,83857,83859,83861],{"class":35,"line":43},[33,83856,190],{"class":163},[33,83858,193],{"class":167},[33,83860,164],{"class":163},[33,83862,198],{"class":167},[33,83864,83865,83867,83869],{"class":35,"line":61},[33,83866,164],{"class":163},[33,83868,46547],{"class":167},[33,83870,46550],{"class":39},[33,83872,83873,83875],{"class":35,"line":73},[33,83874,164],{"class":163},[33,83876,47178],{"class":167},[33,83878,83879,83881,83883,83885],{"class":35,"line":88},[33,83880,190],{"class":163},[33,83882,46889],{"class":50},[33,83884,46892],{"class":163},[33,83886,47171],{"class":167},[33,83888,83889],{"class":35,"line":95},[33,83890,92],{"emptyLinePlaceholder":91},[33,83892,83893,83895,83898,83901,83903,83905,83907,83909,83911],{"class":35,"line":101},[33,83894,562],{"class":163},[33,83896,83897],{"class":46}," make_searchable_pdf",[33,83899,83900],{"class":167},"(input_pdf: Path, output_pdf: Path, lang: ",[33,83902,1053],{"class":50},[33,83904,212],{"class":163},[33,83906,83109],{"class":54},[33,83908,1617],{"class":167},[33,83910,571],{"class":50},[33,83912,574],{"class":167},[33,83914,83915],{"class":35,"line":171},[33,83916,7673],{"class":54},[33,83918,83919],{"class":35,"line":179},[33,83920,83921],{"class":54},"    For each page, run Tesseract to generate a hidden-text PDF overlay,\n",[33,83923,83924],{"class":35,"line":187},[33,83925,83926],{"class":54},"    then merge that overlay onto the original scan page.\n",[33,83928,83929],{"class":35,"line":201},[33,83930,7673],{"class":54},[33,83932,83933,83935],{"class":35,"line":206},[33,83934,2424],{"class":163},[33,83936,574],{"class":167},[33,83938,83939,83941,83943],{"class":35,"line":224},[33,83940,20077],{"class":167},[33,83942,242],{"class":163},[33,83944,83945],{"class":167}," 
fitz.open(input_pdf)\n",[33,83947,83948,83950,83952,83954,83956],{"class":35,"line":229},[33,83949,5973],{"class":163},[33,83951,7398],{"class":167},[33,83953,662],{"class":163},[33,83955,7403],{"class":50},[33,83957,83958],{"class":167},"(doc):\n",[33,83960,83961,83963,83965,83967,83969,83971,83973],{"class":35,"line":235},[33,83962,82258],{"class":167},[33,83964,242],{"class":163},[33,83966,82263],{"class":167},[33,83968,46966],{"class":238},[33,83970,242],{"class":163},[33,83972,26433],{"class":50},[33,83974,221],{"class":167},[33,83976,83977,83979,83981,83983,83985],{"class":35,"line":250},[33,83978,82274],{"class":167},[33,83980,242],{"class":163},[33,83982,82279],{"class":167},[33,83984,81499],{"class":54},[33,83986,82284],{"class":167},[33,83988,83989],{"class":35,"line":266},[33,83990,92],{"emptyLinePlaceholder":91},[33,83992,83993],{"class":35,"line":290},[33,83994,83995],{"class":39},"            # Tesseract produces a PDF with transparent text at the correct positions\n",[33,83997,83998,84001,84003,84006,84009,84011,84013,84015,84017,84019],{"class":35,"line":295},[33,83999,84000],{"class":167},"            ocr_pdf_bytes ",[33,84002,242],{"class":163},[33,84004,84005],{"class":167}," pytesseract.image_to_pdf_or_hocr(img, ",[33,84007,84008],{"class":238},"extension",[33,84010,242],{"class":163},[33,84012,15519],{"class":54},[33,84014,365],{"class":167},[33,84016,81173],{"class":238},[33,84018,242],{"class":163},[33,84020,84021],{"class":167},"lang)\n",[33,84023,84024],{"class":35,"line":300},[33,84025,92],{"emptyLinePlaceholder":91},[33,84027,84028,84031,84033,84035,84037],{"class":35,"line":317},[33,84029,84030],{"class":167},"            overlay ",[33,84032,242],{"class":163},[33,84034,46587],{"class":167},[33,84036,15519],{"class":54},[33,84038,84039],{"class":167},", ocr_pdf_bytes)\n",[33,84041,84042],{"class":35,"line":332},[33,84043,84044],{"class":39},"            # show_pdf_page copies the overlay text layer onto the original 
page\n",[33,84046,84047,84050,84052],{"class":35,"line":347},[33,84048,84049],{"class":167},"            page.show_pdf_page(page.rect, overlay, ",[33,84051,748],{"class":50},[33,84053,221],{"class":167},[33,84055,84056],{"class":35,"line":374},[33,84057,84058],{"class":167},"            overlay.close()\n",[33,84060,84061],{"class":35,"line":397},[33,84062,92],{"emptyLinePlaceholder":91},[33,84064,84065,84068,84070,84073,84076,84078,84080,84082,84085,84087,84089],{"class":35,"line":653},[33,84066,84067],{"class":167},"        doc.save(",[33,84069,1053],{"class":50},[33,84071,84072],{"class":167},"(output_pdf), ",[33,84074,84075],{"class":238},"garbage",[33,84077,242],{"class":163},[33,84079,1503],{"class":50},[33,84081,365],{"class":167},[33,84083,84084],{"class":238},"deflate",[33,84086,242],{"class":163},[33,84088,855],{"class":50},[33,84090,221],{"class":167},[33,84092,84093,84095,84097,84099,84102,84104,84107,84109,84111],{"class":35,"line":667},[33,84094,9414],{"class":50},[33,84096,602],{"class":167},[33,84098,4059],{"class":163},[33,84100,84101],{"class":54},"\"Searchable PDF written → ",[33,84103,1115],{"class":50},[33,84105,84106],{"class":167},"output_pdf",[33,84108,1121],{"class":50},[33,84110,274],{"class":54},[33,84112,221],{"class":167},[33,84114,84115,84117,84119,84121],{"class":35,"line":675},[33,84116,2449],{"class":163},[33,84118,783],{"class":50},[33,84120,1852],{"class":163},[33,84122,1855],{"class":167},[33,84124,84125,84127,84129,84131,84133,84136,84138,84140,84142,84144,84146,84148],{"class":35,"line":689},[33,84126,4051],{"class":163},[33,84128,7590],{"class":50},[33,84130,602],{"class":167},[33,84132,4059],{"class":163},[33,84134,84135],{"class":54},"\"Failed to create searchable PDF: 
",[33,84137,1115],{"class":50},[33,84139,6565],{"class":167},[33,84141,1121],{"class":50},[33,84143,274],{"class":54},[33,84145,1649],{"class":167},[33,84147,190],{"class":163},[33,84149,20843],{"class":167},[33,84151,84152],{"class":35,"line":703},[33,84153,92],{"emptyLinePlaceholder":91},[33,84155,84156],{"class":35,"line":714},[33,84157,92],{"emptyLinePlaceholder":91},[33,84159,84160],{"class":35,"line":723},[33,84161,84162],{"class":167},"make_searchable_pdf(\n",[33,84164,84165,84168,84171],{"class":35,"line":754},[33,84166,84167],{"class":167},"    Path(",[33,84169,84170],{"class":54},"\"scans\u002Finvoice_001.pdf\"",[33,84172,1506],{"class":167},[33,84174,84175,84177,84180],{"class":35,"line":771},[33,84176,84167],{"class":167},[33,84178,84179],{"class":54},"\"output\u002Finvoice_001_searchable.pdf\"",[33,84181,1506],{"class":167},[33,84183,84184],{"class":35,"line":777},[33,84185,221],{"class":167},[14,84187,84188,84189,84191],{},"The resulting file is visually identical to the original. PDF viewers, grep tools, and indexers can now find text in it. You can batch these files with the pattern in ",[940,84190,68020],{"href":75698}," to produce a single searchable archive.",[18,84193,2709],{"id":2708},[424,84195,84197],{"id":84196},"multi-language-documents","Multi-Language Documents",[14,84199,84200,84201,84203],{},"Pass a ",[30,84202,1811],{},"-delimited language string. 
Language packs must be installed separately:",[23,84205,84207],{"className":126,"code":84206,"language":47,"meta":28,"style":28},"# pip install pytesseract Pillow\nimport pytesseract\nfrom PIL import Image\n\nimg = Image.open(\"multilingual_form.png\")\n# Requires tesseract-ocr-deu and tesseract-ocr-fra installed\ntext = pytesseract.image_to_string(img, lang=\"eng+deu+fra\")\n",[30,84208,84209,84213,84219,84229,84233,84247,84252],{"__ignoreMap":28},[33,84210,84211],{"class":35,"line":36},[33,84212,80141],{"class":39},[33,84214,84215,84217],{"class":35,"line":43},[33,84216,164],{"class":163},[33,84218,47178],{"class":167},[33,84220,84221,84223,84225,84227],{"class":35,"line":61},[33,84222,190],{"class":163},[33,84224,46889],{"class":50},[33,84226,46892],{"class":163},[33,84228,47171],{"class":167},[33,84230,84231],{"class":35,"line":73},[33,84232,92],{"emptyLinePlaceholder":91},[33,84234,84235,84237,84239,84242,84245],{"class":35,"line":88},[33,84236,80649],{"class":167},[33,84238,242],{"class":163},[33,84240,84241],{"class":167}," Image.open(",[33,84243,84244],{"class":54},"\"multilingual_form.png\"",[33,84246,221],{"class":167},[33,84248,84249],{"class":35,"line":95},[33,84250,84251],{"class":39},"# Requires tesseract-ocr-deu and tesseract-ocr-fra installed\n",[33,84253,84254,84257,84259,84261,84263,84265,84268],{"class":35,"line":101},[33,84255,84256],{"class":167},"text ",[33,84258,242],{"class":163},[33,84260,81170],{"class":167},[33,84262,81173],{"class":238},[33,84264,242],{"class":163},[33,84266,84267],{"class":54},"\"eng+deu+fra\"",[33,84269,221],{"class":167},[14,84271,84272,84273,3035],{},"Check which packs are installed: ",[30,84274,84275],{},"tesseract --list-langs",[424,84277,84279],{"id":84278},"handwriting","Handwriting",[14,84281,84282,84283,84286],{},"Standard Tesseract (",[30,84284,84285],{},"eng",") performs poorly on cursive or non-standard print. 
Alternatives:",[4211,84288,84289,84298,84304],{},[4214,84290,84291,17583,84294,84297],{},[1974,84292,84293],{},"EasyOCR",[30,84295,84296],{},"pip install easyocr",") — handles handwriting better, runs on CPU but is slower.",[4214,84299,84300,84303],{},[1974,84301,84302],{},"Cloud APIs"," (AWS Textract, Google Vision) — higher accuracy, cost per page, network dependency.",[4214,84305,84306,84312,84313,3035],{},[1974,84307,84308,84309],{},"Tesseract ",[30,84310,84311],{},"script\u002FHanS"," for CJK scripts — install via ",[30,84314,84315],{},"apt-get install tesseract-ocr-script-hans",[424,84317,84319],{"id":84318},"colour-tinted-or-watermarked-scans","Colour-Tinted or Watermarked Scans",[14,84321,84322],{},"Binarisation with Otsu can obliterate faint watermarks alongside background colour. To preserve more signal, use HSV channel separation before thresholding:",[23,84324,84326],{"className":126,"code":84325,"language":47,"meta":28,"style":28},"# pip install opencv-python numpy\nimport cv2\nimport numpy as np\n\ndef threshold_on_value_channel(bgr_img: np.ndarray) -> np.ndarray:\n    \"\"\"\n    Extract the V channel from HSV, then apply Otsu.\n    Reduces colour interference from stamps, highlights, or watermarks.\n    \"\"\"\n    hsv = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2HSV)\n    v_channel = hsv[:, :, 2]\n    _, thresh = cv2.threshold(v_channel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n    return thresh\n",[30,84327,84328,84333,84339,84349,84353,84363,84367,84372,84377,84381,84396,84410,84437],{"__ignoreMap":28},[33,84329,84330],{"class":35,"line":36},[33,84331,84332],{"class":39},"# pip install opencv-python 
numpy\n",[33,84334,84335,84337],{"class":35,"line":43},[33,84336,164],{"class":163},[33,84338,41647],{"class":167},[33,84340,84341,84343,84345,84347],{"class":35,"line":61},[33,84342,164],{"class":163},[33,84344,48399],{"class":167},[33,84346,495],{"class":163},[33,84348,48404],{"class":167},[33,84350,84351],{"class":35,"line":73},[33,84352,92],{"emptyLinePlaceholder":91},[33,84354,84355,84357,84360],{"class":35,"line":88},[33,84356,562],{"class":163},[33,84358,84359],{"class":46}," threshold_on_value_channel",[33,84361,84362],{"class":167},"(bgr_img: np.ndarray) -> np.ndarray:\n",[33,84364,84365],{"class":35,"line":95},[33,84366,7673],{"class":54},[33,84368,84369],{"class":35,"line":101},[33,84370,84371],{"class":54},"    Extract the V channel from HSV, then apply Otsu.\n",[33,84373,84374],{"class":35,"line":171},[33,84375,84376],{"class":54},"    Reduces colour interference from stamps, highlights, or watermarks.\n",[33,84378,84379],{"class":35,"line":179},[33,84380,7673],{"class":54},[33,84382,84383,84386,84388,84391,84394],{"class":35,"line":187},[33,84384,84385],{"class":167},"    hsv ",[33,84387,242],{"class":163},[33,84389,84390],{"class":167}," cv2.cvtColor(bgr_img, cv2.",[33,84392,84393],{"class":50},"COLOR_BGR2HSV",[33,84395,221],{"class":167},[33,84397,84398,84401,84403,84406,84408],{"class":35,"line":201},[33,84399,84400],{"class":167},"    v_channel ",[33,84402,242],{"class":163},[33,84404,84405],{"class":167}," hsv[:, :, ",[33,84407,1533],{"class":50},[33,84409,9202],{"class":167},[33,84411,84412,84414,84416,84419,84421,84423,84425,84427,84429,84431,84433,84435],{"class":35,"line":206},[33,84413,82615],{"class":167},[33,84415,242],{"class":163},[33,84417,84418],{"class":167}," cv2.threshold(v_channel, 
",[33,84420,748],{"class":50},[33,84422,365],{"class":167},[33,84424,2678],{"class":50},[33,84426,82629],{"class":167},[33,84428,48521],{"class":50},[33,84430,82634],{"class":163},[33,84432,41661],{"class":167},[33,84434,82639],{"class":50},[33,84436,221],{"class":167},[33,84438,84439,84441],{"class":35,"line":224},[33,84440,1332],{"class":163},[33,84442,84443],{"class":167}," thresh\n",[18,84445,52030],{"id":52029},[14,84447,84448],{},"After running the pipeline, verify output quality before committing to a batch run:",[23,84450,84452],{"className":126,"code":84451,"language":47,"meta":28,"style":28},"# pip install pytesseract Pillow\nfrom pathlib import Path\nimport pytesseract\nfrom PIL import Image\n\ndef check_ocr_quality(arr, min_mean_conf: float = 70.0) -> dict:\n    \"\"\"\n    Returns mean confidence and a pass\u002Ffail flag.\n    Fail means the scan quality or preprocessing needs adjustment.\n    \"\"\"\n    import numpy as np\n    pil_img = Image.fromarray(arr)\n    data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)\n    confs = [int(c) for c in data[\"conf\"] if int(c) != -1]\n    if not confs:\n        return {\"mean_conf\": 0.0, \"ok\": False, \"word_count\": 0}\n    mean_conf = sum(confs) \u002F len(confs)\n    return {\n        \"mean_conf\": round(mean_conf, 1),\n        \"ok\": mean_conf >= min_mean_conf,\n        \"word_count\": len([t for t in data[\"text\"] if t.strip()]),\n    
}\n",[30,84453,84454,84458,84468,84474,84484,84488,84511,84515,84520,84525,84529,84539,84547,84565,84604,84613,84645,84664,84670,84686,84699,84727],{"__ignoreMap":28},[33,84455,84456],{"class":35,"line":36},[33,84457,80141],{"class":39},[33,84459,84460,84462,84464,84466],{"class":35,"line":43},[33,84461,190],{"class":163},[33,84463,193],{"class":167},[33,84465,164],{"class":163},[33,84467,198],{"class":167},[33,84469,84470,84472],{"class":35,"line":61},[33,84471,164],{"class":163},[33,84473,47178],{"class":167},[33,84475,84476,84478,84480,84482],{"class":35,"line":73},[33,84477,190],{"class":163},[33,84479,46889],{"class":50},[33,84481,46892],{"class":163},[33,84483,47171],{"class":167},[33,84485,84486],{"class":35,"line":88},[33,84487,92],{"emptyLinePlaceholder":91},[33,84489,84490,84492,84495,84498,84500,84502,84505,84507,84509],{"class":35,"line":95},[33,84491,562],{"class":163},[33,84493,84494],{"class":46}," check_ocr_quality",[33,84496,84497],{"class":167},"(arr, min_mean_conf: ",[33,84499,1720],{"class":50},[33,84501,212],{"class":163},[33,84503,84504],{"class":50}," 70.0",[33,84506,1617],{"class":167},[33,84508,37100],{"class":50},[33,84510,574],{"class":167},[33,84512,84513],{"class":35,"line":101},[33,84514,7673],{"class":54},[33,84516,84517],{"class":35,"line":171},[33,84518,84519],{"class":54},"    Returns mean confidence and a pass\u002Ffail flag.\n",[33,84521,84522],{"class":35,"line":179},[33,84523,84524],{"class":54},"    Fail means the scan quality or preprocessing needs 
adjustment.\n",[33,84526,84527],{"class":35,"line":187},[33,84528,7673],{"class":54},[33,84530,84531,84533,84535,84537],{"class":35,"line":201},[33,84532,1627],{"class":163},[33,84534,48399],{"class":167},[33,84536,495],{"class":163},[33,84538,48404],{"class":167},[33,84540,84541,84543,84545],{"class":35,"line":206},[33,84542,83157],{"class":167},[33,84544,242],{"class":163},[33,84546,83162],{"class":167},[33,84548,84549,84551,84553,84555,84557,84559,84561,84563],{"class":35,"line":224},[33,84550,24507],{"class":167},[33,84552,242],{"class":163},[33,84554,83667],{"class":167},[33,84556,47243],{"class":238},[33,84558,242],{"class":163},[33,84560,47248],{"class":167},[33,84562,47251],{"class":50},[33,84564,221],{"class":167},[33,84566,84567,84570,84572,84574,84576,84578,84580,84582,84584,84586,84588,84590,84592,84594,84596,84598,84600,84602],{"class":35,"line":229},[33,84568,84569],{"class":167},"    confs ",[33,84571,242],{"class":163},[33,84573,9178],{"class":167},[33,84575,1059],{"class":50},[33,84577,68507],{"class":167},[33,84579,6124],{"class":163},[33,84581,7486],{"class":167},[33,84583,662],{"class":163},[33,84585,47294],{"class":167},[33,84587,47313],{"class":54},[33,84589,763],{"class":167},[33,84591,2491],{"class":163},[33,84593,3149],{"class":50},[33,84595,68507],{"class":167},[33,84597,17877],{"class":163},[33,84599,39025],{"class":163},[33,84601,734],{"class":50},[33,84603,9202],{"class":167},[33,84605,84606,84608,84610],{"class":35,"line":235},[33,84607,617],{"class":163},[33,84609,620],{"class":163},[33,84611,84612],{"class":167}," 
confs:\n",[33,84614,84615,84617,84619,84622,84624,84627,84629,84631,84633,84635,84637,84639,84641,84643],{"class":35,"line":250},[33,84616,1659],{"class":163},[33,84618,4098],{"class":167},[33,84620,84621],{"class":54},"\"mean_conf\"",[33,84623,2079],{"class":167},[33,84625,84626],{"class":50},"0.0",[33,84628,365],{"class":167},[33,84630,57024],{"class":54},[33,84632,2079],{"class":167},[33,84634,902],{"class":50},[33,84636,365],{"class":167},[33,84638,43389],{"class":54},[33,84640,2079],{"class":167},[33,84642,748],{"class":50},[33,84644,4113],{"class":167},[33,84646,84647,84650,84652,84654,84657,84659,84661],{"class":35,"line":266},[33,84648,84649],{"class":167},"    mean_conf ",[33,84651,242],{"class":163},[33,84653,46601],{"class":50},[33,84655,84656],{"class":167},"(confs) ",[33,84658,1351],{"class":163},[33,84660,4037],{"class":50},[33,84662,84663],{"class":167},"(confs)\n",[33,84665,84666,84668],{"class":35,"line":290},[33,84667,1332],{"class":163},[33,84669,16265],{"class":167},[33,84671,84672,84675,84677,84679,84682,84684],{"class":35,"line":295},[33,84673,84674],{"class":54},"        \"mean_conf\"",[33,84676,2079],{"class":167},[33,84678,43654],{"class":50},[33,84680,84681],{"class":167},"(mean_conf, ",[33,84683,734],{"class":50},[33,84685,1506],{"class":167},[33,84687,84688,84691,84694,84696],{"class":35,"line":300},[33,84689,84690],{"class":54},"        \"ok\"",[33,84692,84693],{"class":167},": mean_conf ",[33,84695,43000],{"class":163},[33,84697,84698],{"class":167}," min_mean_conf,\n",[33,84700,84701,84703,84705,84707,84710,84712,84714,84716,84718,84720,84722,84724],{"class":35,"line":317},[33,84702,43638],{"class":54},[33,84704,2079],{"class":167},[33,84706,928],{"class":50},[33,84708,84709],{"class":167},"([t ",[33,84711,6124],{"class":163},[33,84713,10818],{"class":167},[33,84715,662],{"class":163},[33,84717,47294],{"class":167},[33,84719,3459],{"class":54},[33,84721,763],{"class":167},[33,84723,2491],{"class":163},[33,84725,84726],{"class":167}," 
t.strip()]),\n",[33,84728,84729],{"class":35,"line":332},[33,84730,20781],{"class":167},[14,84732,84733,84734,84737],{},"A mean confidence below 60 % usually means the DPI is too low, the binarisation is misconfigured, or the scan has severe physical damage. Inspect the preprocessed image with ",[30,84735,84736],{},"save_debug_image()"," before investigating further.",[18,84739,21810],{"id":21809},[4211,84741,84742,84747,84760],{},[4214,84743,84744,84746],{},[1974,84745,13850],{}," Each 300-DPI A4 page rasterises to roughly 25–35 MB in RAM as a PIL Image. For batch jobs over 100+ pages, process one page at a time and del\u002Fgc between pages.",[4214,84748,84749,84752,84753,84755,84756,84759],{},[1974,84750,84751],{},"Speed:"," Tesseract is single-threaded per call. Use ",[30,84754,4240],{}," to parallelise across pages or files — avoid ",[30,84757,84758],{},"ThreadPoolExecutor"," because Tesseract releases the GIL inconsistently.",[4214,84761,84762,84765,84766,84769,84770,84773],{},[1974,84763,84764],{},"Chunking large PDFs:"," Open with ",[30,84767,84768],{},"fitz.open()",", iterate page by page, and call ",[30,84771,84772],{},"pix = None"," after rasterising each page to release the pixmap.",[23,84775,84777],{"className":126,"code":84776,"language":47,"meta":28,"style":28},"# pip install pymupdf pytesseract Pillow\nfrom pathlib import Path\nfrom concurrent.futures import ProcessPoolExecutor\nimport fitz\nimport pytesseract\nfrom PIL import Image\n\ndef _ocr_page_index(args):\n    pdf_path, page_num = args\n    doc = fitz.open(pdf_path)\n    page = doc[page_num]\n    pix = page.get_pixmap(dpi=300)\n    img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n    doc.close()\n    return page_num, pytesseract.image_to_string(img)\n\ndef batch_ocr(pdf_path: Path, max_workers: int = 4) -> dict[int, str]:\n    \"\"\"Returns {page_num: text} for all pages, parallelised.\"\"\"\n    doc = fitz.open(pdf_path)\n    page_count = len(doc)\n    doc.close()\n    
with ProcessPoolExecutor(max_workers=max_workers) as pool:\n        results = pool.map(_ocr_page_index, [(str(pdf_path), i) for i in range(page_count)])\n    return dict(results)\n",[30,84778,84779,84783,84793,84804,84810,84816,84826,84830,84839,84848,84856,84865,84882,84894,84899,84906,84910,84937,84942,84950,84962,84966,84983,85009],{"__ignoreMap":28},[33,84780,84781],{"class":35,"line":36},[33,84782,83852],{"class":39},[33,84784,84785,84787,84789,84791],{"class":35,"line":43},[33,84786,190],{"class":163},[33,84788,193],{"class":167},[33,84790,164],{"class":163},[33,84792,198],{"class":167},[33,84794,84795,84797,84799,84801],{"class":35,"line":61},[33,84796,190],{"class":163},[33,84798,13880],{"class":167},[33,84800,164],{"class":163},[33,84802,84803],{"class":167}," ProcessPoolExecutor\n",[33,84805,84806,84808],{"class":35,"line":73},[33,84807,164],{"class":163},[33,84809,82192],{"class":167},[33,84811,84812,84814],{"class":35,"line":88},[33,84813,164],{"class":163},[33,84815,47178],{"class":167},[33,84817,84818,84820,84822,84824],{"class":35,"line":95},[33,84819,190],{"class":163},[33,84821,46889],{"class":50},[33,84823,46892],{"class":163},[33,84825,47171],{"class":167},[33,84827,84828],{"class":35,"line":101},[33,84829,92],{"emptyLinePlaceholder":91},[33,84831,84832,84834,84837],{"class":35,"line":171},[33,84833,562],{"class":163},[33,84835,84836],{"class":46}," _ocr_page_index",[33,84838,22117],{"class":167},[33,84840,84841,84844,84846],{"class":35,"line":179},[33,84842,84843],{"class":167},"    pdf_path, page_num ",[33,84845,242],{"class":163},[33,84847,22127],{"class":167},[33,84849,84850,84852,84854],{"class":35,"line":187},[33,84851,18224],{"class":167},[33,84853,242],{"class":163},[33,84855,81970],{"class":167},[33,84857,84858,84860,84862],{"class":35,"line":201},[33,84859,39662],{"class":167},[33,84861,242],{"class":163},[33,84863,84864],{"class":167}," 
doc[page_num]\n",[33,84866,84867,84870,84872,84874,84876,84878,84880],{"class":35,"line":206},[33,84868,84869],{"class":167},"    pix ",[33,84871,242],{"class":163},[33,84873,82263],{"class":167},[33,84875,46966],{"class":238},[33,84877,242],{"class":163},[33,84879,26433],{"class":50},[33,84881,221],{"class":167},[33,84883,84884,84886,84888,84890,84892],{"class":35,"line":224},[33,84885,80264],{"class":167},[33,84887,242],{"class":163},[33,84889,82279],{"class":167},[33,84891,81499],{"class":54},[33,84893,82284],{"class":167},[33,84895,84896],{"class":35,"line":229},[33,84897,84898],{"class":167},"    doc.close()\n",[33,84900,84901,84903],{"class":35,"line":235},[33,84902,1332],{"class":163},[33,84904,84905],{"class":167}," page_num, pytesseract.image_to_string(img)\n",[33,84907,84908],{"class":35,"line":250},[33,84909,92],{"emptyLinePlaceholder":91},[33,84911,84912,84914,84917,84920,84922,84924,84926,84929,84931,84933,84935],{"class":35,"line":266},[33,84913,562],{"class":163},[33,84915,84916],{"class":46}," batch_ocr",[33,84918,84919],{"class":167},"(pdf_path: Path, max_workers: ",[33,84921,1059],{"class":50},[33,84923,212],{"class":163},[33,84925,82708],{"class":50},[33,84927,84928],{"class":167},") -> dict[",[33,84930,1059],{"class":50},[33,84932,365],{"class":167},[33,84934,1053],{"class":50},[33,84936,17477],{"class":167},[33,84938,84939],{"class":35,"line":290},[33,84940,84941],{"class":54},"    \"\"\"Returns {page_num: text} for all pages, parallelised.\"\"\"\n",[33,84943,84944,84946,84948],{"class":35,"line":295},[33,84945,18224],{"class":167},[33,84947,242],{"class":163},[33,84949,81970],{"class":167},[33,84951,84952,84955,84957,84959],{"class":35,"line":300},[33,84953,84954],{"class":167},"    page_count 
",[33,84956,242],{"class":163},[33,84958,4037],{"class":50},[33,84960,84961],{"class":167},"(doc)\n",[33,84963,84964],{"class":35,"line":317},[33,84965,84898],{"class":167},[33,84967,84968,84970,84972,84974,84976,84979,84981],{"class":35,"line":332},[33,84969,1635],{"class":163},[33,84971,22274],{"class":167},[33,84973,22277],{"class":238},[33,84975,242],{"class":163},[33,84977,84978],{"class":167},"max_workers) ",[33,84980,495],{"class":163},[33,84982,14105],{"class":167},[33,84984,84985,84988,84990,84993,84995,84998,85000,85002,85004,85006],{"class":35,"line":347},[33,84986,84987],{"class":167},"        results ",[33,84989,242],{"class":163},[33,84991,84992],{"class":167}," pool.map(_ocr_page_index, [(",[33,84994,1053],{"class":50},[33,84996,84997],{"class":167},"(pdf_path), i) ",[33,84999,6124],{"class":163},[33,85001,47269],{"class":167},[33,85003,662],{"class":163},[33,85005,1801],{"class":50},[33,85007,85008],{"class":167},"(page_count)])\n",[33,85010,85011,85013,85016],{"class":35,"line":374},[33,85012,1332],{"class":163},[33,85014,85015],{"class":50}," dict",[33,85017,85018],{"class":167},"(results)\n",[18,85020,4271],{"id":4270},[4273,85022,85023,85034],{},[4276,85024,85025],{},[4279,85026,85027,85030,85032],{},[4282,85028,85029],{},"Error \u002F symptom",[4282,85031,4287],{},[4282,85033,4290],{},[4292,85035,85036,85050,85068,85082,85100,85111],{},[4279,85037,85038,85042,85045],{},[4297,85039,85040],{},[30,85041,79693],{},[4297,85043,85044],{},"Tesseract binary not installed or not on PATH",[4297,85046,85047,85048],{},"Install the binary; see ",[940,85049,81731],{"href":65222},[4279,85051,85052,85057,85060],{},[4297,85053,85054,85055],{},"Empty string from ",[30,85056,79683],{},[4297,85058,85059],{},"PDF has a text layer (not a scan), or image too small \u002F too low DPI",[4297,85061,9574,85062,85065,85066],{},[30,85063,85064],{},"classify_pdf()"," first; enforce ",[30,85067,49057],{},[4279,85069,85070,85073,85076],{},[4297,85071,85072],{},"Garbled 
multi-column text",[4297,85074,85075],{},"Wrong PSM — Tesseract reads columns left-to-right as a single line",[4297,85077,35815,85078,2012,85080],{},[30,85079,83541],{},[30,85081,83545],{},[4279,85083,85084,85089,85092],{},[4297,85085,85086],{},[30,85087,85088],{},"Error, could not initialize tesseract API with language \"xxx\"",[4297,85090,85091],{},"Language pack not installed",[4297,85093,85094,85097,85098],{},[30,85095,85096],{},"apt-get install tesseract-ocr-xxx"," or set ",[30,85099,80995],{},[4279,85101,85102,85105,85108],{},[4297,85103,85104],{},"Output truncated mid-page",[4297,85106,85107],{},"Tesseract timeout on very large images",[4297,85109,85110],{},"Resize to 300 DPI before passing; split oversized scans into quadrants",[4279,85112,85113,85116,85122],{},[4297,85114,85115],{},"Deskew rotates page 90°",[4297,85117,85118,85121],{},[30,85119,85120],{},"minAreaRect"," angle ambiguity on near-vertical text blocks",[4297,85123,85124,85125,85128],{},"Clamp angle: ",[30,85126,85127],{},"if abs(angle) > 45: angle = 0"," and skip rotation",[18,85130,85132],{"id":85131},"complete-pipeline-script","Complete Pipeline Script",[23,85134,85136],{"className":126,"code":85135,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n# pip install pymupdf pytesseract opencv-python Pillow numpy\n\"\"\"\nocr_pipeline.py — Rasterize a scanned PDF, preprocess pages, run OCR,\nand write a searchable PDF to the output path.\n\nUsage:\n    python ocr_pipeline.py input.pdf output_searchable.pdf --lang eng --psm 3\n\"\"\"\nimport argparse\nimport gc\nfrom pathlib import Path\n\nimport cv2\nimport fitz\nimport numpy as np\nimport pytesseract\nfrom PIL import Image\n\n\ndef preprocess(pil_img: Image.Image) -> np.ndarray:\n    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)\n    _, thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n    denoised = cv2.fastNlMeansDenoising(thresh, h=10)\n    coords = 
np.column_stack(np.where(denoised > 0))\n    if len(coords) >= 4:\n        rect = cv2.minAreaRect(coords)\n        angle = rect[-1]\n        angle = -(90 + angle) if angle \u003C -45 else -angle\n        h, w = denoised.shape[:2]\n        M = cv2.getRotationMatrix2D((w \u002F\u002F 2, h \u002F\u002F 2), angle, 1.0)\n        denoised = cv2.warpAffine(denoised, M, (w, h),\n                                   flags=cv2.INTER_CUBIC,\n                                   borderMode=cv2.BORDER_REPLICATE)\n    return denoised\n\n\ndef run(input_pdf: Path, output_pdf: Path, lang: str, psm: int) -> None:\n    doc = fitz.open(input_pdf)\n    for i, page in enumerate(doc):\n        pix = page.get_pixmap(dpi=300)\n        raw_img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n        pix = None  # release pixmap memory\n\n        preprocessed = preprocess(raw_img)\n        pil_preprocessed = Image.fromarray(preprocessed)\n\n        ocr_pdf_bytes = pytesseract.image_to_pdf_or_hocr(\n            pil_preprocessed, extension=\"pdf\",\n            lang=lang, config=f\"--psm {psm}\",\n        )\n        overlay = fitz.open(\"pdf\", ocr_pdf_bytes)\n        page.show_pdf_page(page.rect, overlay, 0)\n        overlay.close()\n        gc.collect()\n        print(f\"  processed page {i + 1}\u002F{len(doc)}\")\n\n    doc.save(str(output_pdf), garbage=4, deflate=True)\n    doc.close()\n    print(f\"Done → {output_pdf}\")\n\n\ndef main() -> None:\n    ap = argparse.ArgumentParser(description=\"OCR pipeline for scanned PDFs\")\n    ap.add_argument(\"input\", type=Path, help=\"Input scanned PDF\")\n    ap.add_argument(\"output\", type=Path, help=\"Output searchable PDF\")\n    ap.add_argument(\"--lang\", default=\"eng\", help=\"Tesseract language (default: eng)\")\n    ap.add_argument(\"--psm\", type=int, default=3, help=\"Page segmentation mode (default: 3)\")\n    args = ap.parse_args()\n\n    if not args.input.exists():\n        raise FileNotFoundError(f\"Input not found: 
{args.input}\")\n    args.output.parent.mkdir(parents=True, exist_ok=True)\n\n    try:\n        run(args.input, args.output, args.lang, args.psm)\n    except pytesseract.pytesseract.TesseractNotFoundError as exc:\n        raise SystemExit(\n            \"Tesseract not found. See: \"\n            \"\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Ffix-tesseract-not-found-error\u002F\"\n        ) from exc\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,85137,85138,85142,85147,85151,85156,85161,85165,85169,85174,85178,85184,85191,85201,85205,85211,85217,85227,85233,85243,85247,85251,85260,85272,85298,85314,85328,85342,85350,85364,85396,85408,85432,85441,85454,85467,85473,85477,85481,85501,85509,85521,85538,85551,85562,85566,85576,85586,85590,85600,85613,85640,85644,85657,85666,85671,85676,85708,85712,85737,85741,85762,85766,85770,85782,85799,85823,85846,85873,85907,85915,85919,85927,85949,85969,85973,85979,85984,85994,86002,86007,86011,86019,86023,86027,86039],{"__ignoreMap":28},[33,85139,85140],{"class":35,"line":36},[33,85141,14447],{"class":39},[33,85143,85144],{"class":35,"line":43},[33,85145,85146],{"class":39},"# pip install pymupdf pytesseract opencv-python Pillow numpy\n",[33,85148,85149],{"class":35,"line":61},[33,85150,139],{"class":54},[33,85152,85153],{"class":35,"line":73},[33,85154,85155],{"class":54},"ocr_pipeline.py — Rasterize a scanned PDF, preprocess pages, run OCR,\n",[33,85157,85158],{"class":35,"line":88},[33,85159,85160],{"class":54},"and write a searchable PDF to the output path.\n",[33,85162,85163],{"class":35,"line":95},[33,85164,92],{"emptyLinePlaceholder":91},[33,85166,85167],{"class":35,"line":101},[33,85168,4435],{"class":54},[33,85170,85171],{"class":35,"line":171},[33,85172,85173],{"class":54},"    python ocr_pipeline.py input.pdf output_searchable.pdf --lang eng --psm 
3\n",[33,85175,85176],{"class":35,"line":179},[33,85177,139],{"class":54},[33,85179,85180,85182],{"class":35,"line":187},[33,85181,164],{"class":163},[33,85183,4461],{"class":167},[33,85185,85186,85188],{"class":35,"line":201},[33,85187,164],{"class":163},[33,85189,85190],{"class":167}," gc\n",[33,85192,85193,85195,85197,85199],{"class":35,"line":206},[33,85194,190],{"class":163},[33,85196,193],{"class":167},[33,85198,164],{"class":163},[33,85200,198],{"class":167},[33,85202,85203],{"class":35,"line":224},[33,85204,92],{"emptyLinePlaceholder":91},[33,85206,85207,85209],{"class":35,"line":229},[33,85208,164],{"class":163},[33,85210,41647],{"class":167},[33,85212,85213,85215],{"class":35,"line":235},[33,85214,164],{"class":163},[33,85216,82192],{"class":167},[33,85218,85219,85221,85223,85225],{"class":35,"line":250},[33,85220,164],{"class":163},[33,85222,48399],{"class":167},[33,85224,495],{"class":163},[33,85226,48404],{"class":167},[33,85228,85229,85231],{"class":35,"line":266},[33,85230,164],{"class":163},[33,85232,47178],{"class":167},[33,85234,85235,85237,85239,85241],{"class":35,"line":290},[33,85236,190],{"class":163},[33,85238,46889],{"class":50},[33,85240,46892],{"class":163},[33,85242,47171],{"class":167},[33,85244,85245],{"class":35,"line":295},[33,85246,92],{"emptyLinePlaceholder":91},[33,85248,85249],{"class":35,"line":300},[33,85250,92],{"emptyLinePlaceholder":91},[33,85252,85253,85255,85258],{"class":35,"line":317},[33,85254,562],{"class":163},[33,85256,85257],{"class":46}," 
preprocess",[33,85259,82564],{"class":167},[33,85261,85262,85264,85266,85268,85270],{"class":35,"line":332},[33,85263,80264],{"class":167},[33,85265,242],{"class":163},[33,85267,82596],{"class":167},[33,85269,82599],{"class":50},[33,85271,221],{"class":167},[33,85273,85274,85276,85278,85280,85282,85284,85286,85288,85290,85292,85294,85296],{"class":35,"line":347},[33,85275,82615],{"class":167},[33,85277,242],{"class":163},[33,85279,82620],{"class":167},[33,85281,748],{"class":50},[33,85283,365],{"class":167},[33,85285,2678],{"class":50},[33,85287,82629],{"class":167},[33,85289,48521],{"class":50},[33,85291,82634],{"class":163},[33,85293,41661],{"class":167},[33,85295,82639],{"class":50},[33,85297,221],{"class":167},[33,85299,85300,85302,85304,85306,85308,85310,85312],{"class":35,"line":374},[33,85301,82655],{"class":167},[33,85303,242],{"class":163},[33,85305,82660],{"class":167},[33,85307,82663],{"class":238},[33,85309,242],{"class":163},[33,85311,3545],{"class":50},[33,85313,221],{"class":167},[33,85315,85316,85318,85320,85322,85324,85326],{"class":35,"line":397},[33,85317,82683],{"class":167},[33,85319,242],{"class":163},[33,85321,82688],{"class":167},[33,85323,6009],{"class":163},[33,85325,10791],{"class":50},[33,85327,371],{"class":167},[33,85329,85330,85332,85334,85336,85338,85340],{"class":35,"line":653},[33,85331,617],{"class":163},[33,85333,4037],{"class":50},[33,85335,82703],{"class":167},[33,85337,43000],{"class":163},[33,85339,82708],{"class":50},[33,85341,574],{"class":167},[33,85343,85344,85346,85348],{"class":35,"line":667},[33,85345,82715],{"class":167},[33,85347,242],{"class":163},[33,85349,82720],{"class":167},[33,85351,85352,85354,85356,85358,85360,85362],{"class":35,"line":675},[33,85353,82725],{"class":167},[33,85355,242],{"class":163},[33,85357,82730],{"class":167},[33,85359,4126],{"class":163},[33,85361,734],{"class":50},[33,85363,9202],{"class":167},[33,85365,85366,85368,85370,85372,85374,85376,85378,85380,85382,85384,85386,85388,85390,85392,8
5394],{"class":35,"line":689},[33,85367,82725],{"class":167},[33,85369,242],{"class":163},[33,85371,39025],{"class":163},[33,85373,602],{"class":167},[33,85375,2630],{"class":50},[33,85377,82634],{"class":163},[33,85379,82753],{"class":167},[33,85381,2491],{"class":163},[33,85383,82758],{"class":167},[33,85385,4043],{"class":163},[33,85387,39025],{"class":163},[33,85389,82765],{"class":50},[33,85391,15715],{"class":163},[33,85393,39025],{"class":163},[33,85395,82772],{"class":167},[33,85397,85398,85400,85402,85404,85406],{"class":35,"line":703},[33,85399,82777],{"class":167},[33,85401,242],{"class":163},[33,85403,82782],{"class":167},[33,85405,1533],{"class":50},[33,85407,9202],{"class":167},[33,85409,85410,85412,85414,85416,85418,85420,85422,85424,85426,85428,85430],{"class":35,"line":714},[33,85411,82791],{"class":167},[33,85413,242],{"class":163},[33,85415,82796],{"class":167},[33,85417,74328],{"class":163},[33,85419,7451],{"class":50},[33,85421,82803],{"class":167},[33,85423,74328],{"class":163},[33,85425,7451],{"class":50},[33,85427,82810],{"class":167},[33,85429,82813],{"class":50},[33,85431,221],{"class":167},[33,85433,85434,85436,85438],{"class":35,"line":723},[33,85435,82820],{"class":167},[33,85437,242],{"class":163},[33,85439,85440],{"class":167}," cv2.warpAffine(denoised, M, (w, h),\n",[33,85442,85443,85446,85448,85450,85452],{"class":35,"line":754},[33,85444,85445],{"class":238},"                                   flags",[33,85447,242],{"class":163},[33,85449,82840],{"class":167},[33,85451,82843],{"class":50},[33,85453,247],{"class":167},[33,85455,85456,85459,85461,85463,85465],{"class":35,"line":771},[33,85457,85458],{"class":238},"                                   
borderMode",[33,85460,242],{"class":163},[33,85462,82840],{"class":167},[33,85464,82857],{"class":50},[33,85466,221],{"class":167},[33,85468,85469,85471],{"class":35,"line":777},[33,85470,1332],{"class":163},[33,85472,82874],{"class":167},[33,85474,85475],{"class":35,"line":788},[33,85476,92],{"emptyLinePlaceholder":91},[33,85478,85479],{"class":35,"line":804},[33,85480,92],{"emptyLinePlaceholder":91},[33,85482,85483,85485,85487,85489,85491,85493,85495,85497,85499],{"class":35,"line":809},[33,85484,562],{"class":163},[33,85486,69425],{"class":46},[33,85488,83900],{"class":167},[33,85490,1053],{"class":50},[33,85492,83112],{"class":167},[33,85494,1059],{"class":50},[33,85496,1617],{"class":167},[33,85498,571],{"class":50},[33,85500,574],{"class":167},[33,85502,85503,85505,85507],{"class":35,"line":819},[33,85504,18224],{"class":167},[33,85506,242],{"class":163},[33,85508,83945],{"class":167},[33,85510,85511,85513,85515,85517,85519],{"class":35,"line":829},[33,85512,656],{"class":163},[33,85514,37139],{"class":167},[33,85516,662],{"class":163},[33,85518,7403],{"class":50},[33,85520,83958],{"class":167},[33,85522,85523,85526,85528,85530,85532,85534,85536],{"class":35,"line":834},[33,85524,85525],{"class":167},"        pix ",[33,85527,242],{"class":163},[33,85529,82263],{"class":167},[33,85531,46966],{"class":238},[33,85533,242],{"class":163},[33,85535,26433],{"class":50},[33,85537,221],{"class":167},[33,85539,85540,85543,85545,85547,85549],{"class":35,"line":839},[33,85541,85542],{"class":167},"        raw_img ",[33,85544,242],{"class":163},[33,85546,82279],{"class":167},[33,85548,81499],{"class":54},[33,85550,82284],{"class":167},[33,85552,85553,85555,85557,85559],{"class":35,"line":860},[33,85554,85525],{"class":167},[33,85556,242],{"class":163},[33,85558,7657],{"class":50},[33,85560,85561],{"class":39},"  # release pixmap 
memory\n",[33,85563,85564],{"class":35,"line":887},[33,85565,92],{"emptyLinePlaceholder":91},[33,85567,85568,85571,85573],{"class":35,"line":907},[33,85569,85570],{"class":167},"        preprocessed ",[33,85572,242],{"class":163},[33,85574,85575],{"class":167}," preprocess(raw_img)\n",[33,85577,85578,85581,85583],{"class":35,"line":1826},[33,85579,85580],{"class":167},"        pil_preprocessed ",[33,85582,242],{"class":163},[33,85584,85585],{"class":167}," Image.fromarray(preprocessed)\n",[33,85587,85588],{"class":35,"line":1844},[33,85589,92],{"emptyLinePlaceholder":91},[33,85591,85592,85595,85597],{"class":35,"line":1858},[33,85593,85594],{"class":167},"        ocr_pdf_bytes ",[33,85596,242],{"class":163},[33,85598,85599],{"class":167}," pytesseract.image_to_pdf_or_hocr(\n",[33,85601,85602,85605,85607,85609,85611],{"class":35,"line":1871},[33,85603,85604],{"class":167},"            pil_preprocessed, ",[33,85606,84008],{"class":238},[33,85608,242],{"class":163},[33,85610,15519],{"class":54},[33,85612,247],{"class":167},[33,85614,85615,85618,85620,85622,85624,85626,85628,85630,85632,85634,85636,85638],{"class":35,"line":1877},[33,85616,85617],{"class":238},"            lang",[33,85619,242],{"class":163},[33,85621,83203],{"class":167},[33,85623,83206],{"class":238},[33,85625,242],{"class":163},[33,85627,4059],{"class":163},[33,85629,83174],{"class":54},[33,85631,1115],{"class":50},[33,85633,83179],{"class":167},[33,85635,1121],{"class":50},[33,85637,274],{"class":54},[33,85639,247],{"class":167},[33,85641,85642],{"class":35,"line":1883},[33,85643,5867],{"class":167},[33,85645,85646,85649,85651,85653,85655],{"class":35,"line":1915},[33,85647,85648],{"class":167},"        overlay ",[33,85650,242],{"class":163},[33,85652,46587],{"class":167},[33,85654,15519],{"class":54},[33,85656,84039],{"class":167},[33,85658,85659,85662,85664],{"class":35,"line":1926},[33,85660,85661],{"class":167},"        page.show_pdf_page(page.rect, overlay, 
",[33,85663,748],{"class":50},[33,85665,221],{"class":167},[33,85667,85668],{"class":35,"line":1932},[33,85669,85670],{"class":167},"        overlay.close()\n",[33,85672,85673],{"class":35,"line":1938},[33,85674,85675],{"class":167},"        gc.collect()\n",[33,85677,85678,85680,85682,85684,85687,85689,85691,85693,85695,85697,85699,85702,85704,85706],{"class":35,"line":1950},[33,85679,9414],{"class":50},[33,85681,602],{"class":167},[33,85683,4059],{"class":163},[33,85685,85686],{"class":54},"\"  processed page ",[33,85688,1115],{"class":50},[33,85690,11017],{"class":167},[33,85692,1811],{"class":163},[33,85694,11022],{"class":50},[33,85696,1351],{"class":54},[33,85698,4065],{"class":50},[33,85700,85701],{"class":167},"(doc)",[33,85703,1121],{"class":50},[33,85705,274],{"class":54},[33,85707,221],{"class":167},[33,85709,85710],{"class":35,"line":1958},[33,85711,92],{"emptyLinePlaceholder":91},[33,85713,85714,85717,85719,85721,85723,85725,85727,85729,85731,85733,85735],{"class":35,"line":4904},[33,85715,85716],{"class":167},"    doc.save(",[33,85718,1053],{"class":50},[33,85720,84072],{"class":167},[33,85722,84075],{"class":238},[33,85724,242],{"class":163},[33,85726,1503],{"class":50},[33,85728,365],{"class":167},[33,85730,84084],{"class":238},[33,85732,242],{"class":163},[33,85734,855],{"class":50},[33,85736,221],{"class":167},[33,85738,85739],{"class":35,"line":4909},[33,85740,84898],{"class":167},[33,85742,85743,85745,85747,85749,85752,85754,85756,85758,85760],{"class":35,"line":4915},[33,85744,7268],{"class":50},[33,85746,602],{"class":167},[33,85748,4059],{"class":163},[33,85750,85751],{"class":54},"\"Done → 
",[33,85753,1115],{"class":50},[33,85755,84106],{"class":167},[33,85757,1121],{"class":50},[33,85759,274],{"class":54},[33,85761,221],{"class":167},[33,85763,85764],{"class":35,"line":4925},[33,85765,92],{"emptyLinePlaceholder":91},[33,85767,85768],{"class":35,"line":4935},[33,85769,92],{"emptyLinePlaceholder":91},[33,85771,85772,85774,85776,85778,85780],{"class":35,"line":4941},[33,85773,562],{"class":163},[33,85775,6636],{"class":46},[33,85777,568],{"class":167},[33,85779,571],{"class":50},[33,85781,574],{"class":167},[33,85783,85784,85786,85788,85790,85792,85794,85797],{"class":35,"line":4950},[33,85785,15498],{"class":167},[33,85787,242],{"class":163},[33,85789,6653],{"class":167},[33,85791,6656],{"class":238},[33,85793,242],{"class":163},[33,85795,85796],{"class":54},"\"OCR pipeline for scanned PDFs\"",[33,85798,221],{"class":167},[33,85800,85801,85803,85806,85808,85810,85812,85814,85816,85818,85821],{"class":35,"line":4960},[33,85802,15516],{"class":167},[33,85804,85805],{"class":54},"\"input\"",[33,85807,365],{"class":167},[33,85809,6677],{"class":238},[33,85811,242],{"class":163},[33,85813,6682],{"class":167},[33,85815,25463],{"class":238},[33,85817,242],{"class":163},[33,85819,85820],{"class":54},"\"Input scanned PDF\"",[33,85822,221],{"class":167},[33,85824,85825,85827,85829,85831,85833,85835,85837,85839,85841,85844],{"class":35,"line":4965},[33,85826,15516],{"class":167},[33,85828,41169],{"class":54},[33,85830,365],{"class":167},[33,85832,6677],{"class":238},[33,85834,242],{"class":163},[33,85836,6682],{"class":167},[33,85838,25463],{"class":238},[33,85840,242],{"class":163},[33,85842,85843],{"class":54},"\"Output searchable 
PDF\"",[33,85845,221],{"class":167},[33,85847,85848,85850,85853,85855,85857,85859,85862,85864,85866,85868,85871],{"class":35,"line":4971},[33,85849,15516],{"class":167},[33,85851,85852],{"class":54},"\"--lang\"",[33,85854,365],{"class":167},[33,85856,6685],{"class":238},[33,85858,242],{"class":163},[33,85860,85861],{"class":54},"\"eng\"",[33,85863,365],{"class":167},[33,85865,25463],{"class":238},[33,85867,242],{"class":163},[33,85869,85870],{"class":54},"\"Tesseract language (default: eng)\"",[33,85872,221],{"class":167},[33,85874,85875,85877,85880,85882,85884,85886,85888,85890,85892,85894,85896,85898,85900,85902,85905],{"class":35,"line":4983},[33,85876,15516],{"class":167},[33,85878,85879],{"class":54},"\"--psm\"",[33,85881,365],{"class":167},[33,85883,6677],{"class":238},[33,85885,242],{"class":163},[33,85887,1059],{"class":50},[33,85889,365],{"class":167},[33,85891,6685],{"class":238},[33,85893,242],{"class":163},[33,85895,10258],{"class":50},[33,85897,365],{"class":167},[33,85899,25463],{"class":238},[33,85901,242],{"class":163},[33,85903,85904],{"class":54},"\"Page segmentation mode (default: 
3)\"",[33,85906,221],{"class":167},[33,85908,85909,85911,85913],{"class":35,"line":4988},[33,85910,6766],{"class":167},[33,85912,242],{"class":163},[33,85914,15655],{"class":167},[33,85916,85917],{"class":35,"line":4993},[33,85918,92],{"emptyLinePlaceholder":91},[33,85920,85921,85923,85925],{"class":35,"line":5003},[33,85922,617],{"class":163},[33,85924,620],{"class":163},[33,85926,25620],{"class":167},[33,85928,85929,85931,85933,85935,85937,85939,85941,85943,85945,85947],{"class":35,"line":5008},[33,85930,4051],{"class":163},[33,85932,2945],{"class":50},[33,85934,602],{"class":167},[33,85936,4059],{"class":163},[33,85938,16624],{"class":54},[33,85940,1115],{"class":50},[33,85942,25634],{"class":167},[33,85944,1121],{"class":50},[33,85946,274],{"class":54},[33,85948,221],{"class":167},[33,85950,85951,85953,85955,85957,85959,85961,85963,85965,85967],{"class":35,"line":5014},[33,85952,15968],{"class":167},[33,85954,869],{"class":238},[33,85956,242],{"class":163},[33,85958,855],{"class":50},[33,85960,365],{"class":167},[33,85962,878],{"class":238},[33,85964,242],{"class":163},[33,85966,855],{"class":50},[33,85968,221],{"class":167},[33,85970,85971],{"class":35,"line":5019},[33,85972,92],{"emptyLinePlaceholder":91},[33,85974,85975,85977],{"class":35,"line":5032},[33,85976,2424],{"class":163},[33,85978,574],{"class":167},[33,85980,85981],{"class":35,"line":5039},[33,85982,85983],{"class":167},"        run(args.input, args.output, args.lang, args.psm)\n",[33,85985,85986,85988,85990,85992],{"class":35,"line":5068},[33,85987,2449],{"class":163},[33,85989,80300],{"class":167},[33,85991,495],{"class":163},[33,85993,1855],{"class":167},[33,85995,85996,85998,86000],{"class":35,"line":5077},[33,85997,4051],{"class":163},[33,85999,16617],{"class":50},[33,86001,7637],{"class":167},[33,86003,86004],{"class":35,"line":5082},[33,86005,86006],{"class":54},"            \"Tesseract not found. 
See: \"\n",[33,86008,86009],{"class":35,"line":5089},[33,86010,83235],{"class":54},[33,86012,86013,86015,86017],{"class":35,"line":5098},[33,86014,47018],{"class":167},[33,86016,190],{"class":163},[33,86018,20843],{"class":167},[33,86020,86021],{"class":35,"line":5105},[33,86022,92],{"emptyLinePlaceholder":91},[33,86024,86025],{"class":35,"line":5110},[33,86026,92],{"emptyLinePlaceholder":91},[33,86028,86029,86031,86033,86035,86037],{"class":35,"line":5115},[33,86030,2491],{"class":163},[33,86032,2494],{"class":50},[33,86034,2497],{"class":163},[33,86036,2500],{"class":54},[33,86038,574],{"class":167},[33,86040,86041],{"class":35,"line":5128},[33,86042,6914],{"class":167},[18,86044,6918],{"id":6917},[4211,86046,86047,86052,86057,86062],{},[4214,86048,86049,86051],{},[940,86050,81764],{"href":65222}," — binary install and PATH config for all platforms",[4214,86053,86054,86056],{},[940,86055,10077],{"href":10076}," — coordinate-clustering to reconstruct tabular structure from OCR bounding boxes",[4214,86058,86059,86061],{},[940,86060,9592],{"href":942}," — pdfplumber, camelot, and tabula for vector-text PDFs",[4214,86063,86064,86066],{},[940,86065,52682],{"href":52681}," — batch-process and archive the searchable PDFs this pipeline 
produces",[14,86068,6947,86069,3035],{},[940,86070,6943],{"href":6942},[6953,86072,64775],{},{"title":28,"searchDepth":43,"depth":43,"links":86074},[86075,86076,86077,86078,86081,86084,86085,86086,86091,86092,86093,86094,86095],{"id":20,"depth":43,"text":21},{"id":81885,"depth":43,"text":81886},{"id":82145,"depth":43,"text":82146},{"id":82383,"depth":43,"text":82384,"children":86079},[86080],{"id":82914,"depth":61,"text":82915},{"id":83035,"depth":43,"text":83036,"children":86082},[86083],{"id":83476,"depth":61,"text":83477},{"id":83549,"depth":43,"text":83550},{"id":83838,"depth":43,"text":83839},{"id":2708,"depth":43,"text":2709,"children":86087},[86088,86089,86090],{"id":84196,"depth":61,"text":84197},{"id":84278,"depth":61,"text":84279},{"id":84318,"depth":61,"text":84319},{"id":52029,"depth":43,"text":52030},{"id":21809,"depth":43,"text":21810},{"id":4270,"depth":43,"text":4271},{"id":85131,"depth":43,"text":85132},{"id":6917,"depth":43,"text":6918},"Scanning & OCR","Rasterize PDFs with pdf2image or PyMuPDF, preprocess with Pillow and OpenCV, run pytesseract OCR with confidence filtering, and embed a searchable text layer.",{},"\u002Fautomating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python",{"title":36756,"description":86097},"Scanning & OCR Processing with Python — Tesseract","automating-pdf-extraction-generation\u002Fscanning-and-ocr-processing-with-python\u002Findex",[47,49143,46820,9631,42584],"oR3dGznhymYc0ctDa1fZwL4y3VsPi7Bu-wfF5ooI6ww",{"id":86106,"title":86107,"body":86108,"breadcrumbTitle":88637,"canonical":6977,"date":46387,"description":88638,"draft":6980,"extension":6981,"image":6977,"meta":88639,"navigation":91,"path":88640,"robots":6977,"seo":88641,"seoTitle":88642,"stem":88643,"tags":88644,"updatedAt":6978,"__hash__":88646},"content\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Fadd-password-protection-to-pdf-files\u002Findex.md","Add Password Protection to PDF 
Files",{"type":7,"value":86109,"toc":88624},[86110,86113,86132,86134,86148,86154,86157,86163,86166,86198,86201,86230,86232,86235,86393,86410,86414,86423,86982,86985,87012,87016,87022,87298,87312,87316,87823,87825,87828,88131,88138,88142,88152,88421,88431,88435,88438,88460,88468,88470,88564,88567,88576,88582,88598,88600,88617,88621],[10,86111,86107],{"id":86112},"add-password-protection-to-pdf-files",[14,86114,86115,86116,2012,86118,86121,86122,86124,86125,36661,86128,86131],{},"When adding password protection to PDF files with legacy Python libraries, you hit ",[30,86117,68031],{},[30,86119,86120],{},"NotImplementedError"," because the old RC4 encryption path is deprecated or absent. The fix is migrating to ",[940,86123,65045],{"href":65966}," 3.x and passing ",[30,86126,86127],{},"algorithm=\"AES-256\"",[30,86129,86130],{},"writer.encrypt()",". This page covers the exact error signature, root cause, corrected implementation, permission flags, validation, and batch patterns.",[18,86133,7021],{"id":7020},[14,86135,86136,86137,86140,86141,86143,86144,86147],{},"Legacy ",[30,86138,86139],{},"PyPDF2"," and early ",[30,86142,65045],{}," releases default to RC4-40 or RC4-128. Modern PDF readers either reject these ciphers outright or flag the document as insecure. 
When you call ",[30,86145,86146],{},".encrypt()"," on an outdated version, the interpreter raises:",[23,86149,86152],{"className":86150,"code":86151,"language":2000,"meta":28},[1998],"NotImplementedError: Encryption algorithm not supported\n",[30,86153,86151],{"__ignoreMap":28},[14,86155,86156],{},"or, when you try to read pages from an already-encrypted file without decrypting first:",[23,86158,86161],{"className":86159,"code":86160,"language":2000,"meta":28},[1998],"pypdf.errors.PdfReadError: Stream has not been decrypted\n",[30,86162,86160],{"__ignoreMap":28},[14,86164,86165],{},"Three specific triggers:",[35387,86167,86168,86177,86189],{},[4214,86169,86170,86173,86174,86176],{},[30,86171,86172],{},"PyPDF2 \u003C 3.0.0"," — the ",[30,86175,86146],{}," method silently falls back to RC4-40, which current Adobe and Chrome PDF engines reject.",[4214,86178,86179,86173,86182,86185,86186,3035],{},[30,86180,86181],{},"pypdf \u003C 3.0.0",[30,86183,86184],{},"algorithm"," parameter did not exist; passing it raises ",[30,86187,86188],{},"TypeError",[4214,86190,86191,86192,86195,86196,3035],{},"Re-encrypting an already-encrypted file without calling ",[30,86193,86194],{},"reader.decrypt()"," first — pypdf cannot parse the cross-reference table of a locked stream, so any write attempt raises ",[30,86197,68031],{},[14,86199,86200],{},"Verify your installation before continuing:",[23,86202,86204],{"className":25,"code":86203,"language":27,"meta":28,"style":28},"pip show pypdf | grep Version\n# Must be 3.0.0 or higher\n# If you see PyPDF2 installed: pip uninstall PyPDF2\n",[30,86205,86206,86220,86225],{"__ignoreMap":28},[33,86207,86208,86210,86212,86214,86216,86218],{"class":35,"line":36},[33,86209,76],{"class":46},[33,86211,41946],{"class":54},[33,86213,71164],{"class":54},[33,86215,2850],{"class":163},[33,86217,41954],{"class":46},[33,86219,42357],{"class":54},[33,86221,86222],{"class":35,"line":43},[33,86223,86224],{"class":39},"# Must be 3.0.0 or 
higher\n",[33,86226,86227],{"class":35,"line":61},[33,86228,86229],{"class":39},"# If you see PyPDF2 installed: pip uninstall PyPDF2\n",[18,86231,35017],{"id":35016},[14,86233,86234],{},"Confirm the failure mode against your exact file before touching production code:",[23,86236,86238],{"className":126,"code":86237,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\nSOURCE = Path(\"report.pdf\")\n\ntry:\n    reader = PdfReader(SOURCE)\n    print(f\"Encrypted: {reader.is_encrypted}\")\n    print(f\"Pages    : {len(reader.pages)}\")\nexcept Exception as exc:\n    # If PdfReadError fires here the file is encrypted and needs decrypt() first\n    print(f\"Open error: {type(exc).__name__}: {exc}\")\n",[30,86239,86240,86244,86254,86264,86268,86282,86286,86292,86304,86325,86346,86356,86361],{"__ignoreMap":28},[33,86241,86242],{"class":35,"line":36},[33,86243,57316],{"class":39},[33,86245,86246,86248,86250,86252],{"class":35,"line":43},[33,86247,190],{"class":163},[33,86249,193],{"class":167},[33,86251,164],{"class":163},[33,86253,198],{"class":167},[33,86255,86256,86258,86260,86262],{"class":35,"line":61},[33,86257,190],{"class":163},[33,86259,57333],{"class":167},[33,86261,164],{"class":163},[33,86263,57338],{"class":167},[33,86265,86266],{"class":35,"line":73},[33,86267,92],{"emptyLinePlaceholder":91},[33,86269,86270,86273,86275,86277,86280],{"class":35,"line":88},[33,86271,86272],{"class":50},"SOURCE",[33,86274,212],{"class":163},[33,86276,215],{"class":167},[33,86278,86279],{"class":54},"\"report.pdf\"",[33,86281,221],{"class":167},[33,86283,86284],{"class":35,"line":95},[33,86285,92],{"emptyLinePlaceholder":91},[33,86287,86288,86290],{"class":35,"line":101},[33,86289,35574],{"class":163},[33,86291,574],{"class":167},[33,86293,86294,86296,86298,86300,86302],{"class":35,"line":171},[33,86295,57365],{"class":167},[33,86297,242],{"class":163},[33,86299,57370],{"class":167},[33,86301,86272],{"class":50},[33,8630
3,221],{"class":167},[33,86305,86306,86308,86310,86312,86315,86317,86319,86321,86323],{"class":35,"line":179},[33,86307,7268],{"class":50},[33,86309,602],{"class":167},[33,86311,4059],{"class":163},[33,86313,86314],{"class":54},"\"Encrypted: ",[33,86316,1115],{"class":50},[33,86318,75937],{"class":167},[33,86320,1121],{"class":50},[33,86322,274],{"class":54},[33,86324,221],{"class":167},[33,86326,86327,86329,86331,86333,86336,86338,86340,86342,86344],{"class":35,"line":187},[33,86328,7268],{"class":50},[33,86330,602],{"class":167},[33,86332,4059],{"class":163},[33,86334,86335],{"class":54},"\"Pages    : ",[33,86337,4065],{"class":50},[33,86339,59322],{"class":167},[33,86341,1121],{"class":50},[33,86343,274],{"class":54},[33,86345,221],{"class":167},[33,86347,86348,86350,86352,86354],{"class":35,"line":201},[33,86349,35726],{"class":163},[33,86351,783],{"class":50},[33,86353,1852],{"class":163},[33,86355,1855],{"class":167},[33,86357,86358],{"class":35,"line":206},[33,86359,86360],{"class":39},"    # If PdfReadError fires here the file is encrypted and needs decrypt() first\n",[33,86362,86363,86365,86367,86369,86372,86375,86378,86381,86383,86385,86387,86389,86391],{"class":35,"line":224},[33,86364,7268],{"class":50},[33,86366,602],{"class":167},[33,86368,4059],{"class":163},[33,86370,86371],{"class":54},"\"Open error: ",[33,86373,86374],{"class":50},"{type",[33,86376,86377],{"class":167},"(exc).",[33,86379,86380],{"class":50},"__name__}",[33,86382,2079],{"class":54},[33,86384,1115],{"class":50},[33,86386,6565],{"class":167},[33,86388,1121],{"class":50},[33,86390,274],{"class":54},[33,86392,221],{"class":167},[14,86394,41963,86395,4348,86398,86400,86401,86404,86405,4348,86407,86409],{},[30,86396,86397],{},"is_encrypted",[30,86399,855],{}," and you want to re-encrypt, call ",[30,86402,86403],{},"reader.decrypt(existing_password)"," before copying pages to the writer. 
If ",[30,86406,86397],{},[30,86408,902],{},", proceed directly to the encryption step.",[18,86411,86413],{"id":86412},"fix-aes-256-encryption-with-pypdf","Fix: AES-256 Encryption with pypdf",[14,86415,86416,86417,86419,86420,86422],{},"Replace any ",[30,86418,86139],{}," or legacy ",[30,86421,65045],{}," writer logic with the following:",[23,86424,86426],{"className":126,"code":86425,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nimport os\n\nINPUT_PDF   = Path(\"report.pdf\")\nSECURED_PDF = Path(\"report_secured.pdf\")\n\n\ndef encrypt_pdf(\n    source: Path,\n    output: Path,\n    user_password: str,\n    owner_password: str,\n    algorithm: str = \"AES-256\",\n) -> None:\n    \"\"\"\n    Encrypt source PDF with AES-256 and write to output.\n\n    user_password  — required to open\u002Fview the document\n    owner_password — grants full editing rights; overrides permission flags\n    algorithm      — \"AES-256\" for PDF 2.0 compliance; \"AES-128\" for older\n                     reader compatibility; never use \"RC4-*\" for new work\n    \"\"\"\n    if not source.exists():\n        raise FileNotFoundError(f\"Source PDF not found: {source}\")\n\n    try:\n        reader = PdfReader(source)\n\n        # Decrypt first if the source file is already protected\n        if reader.is_encrypted:\n            result = reader.decrypt(os.environ.get(\"PDF_EXISTING_PW\", \"\"))\n            if result == 0:\n                raise ValueError(\"Wrong existing password — cannot re-encrypt\")\n\n        writer = PdfWriter()\n        for page in reader.pages:\n            writer.add_page(page)\n\n        # algorithm=\"AES-256\" selects PDF 2.0-compliant AES-256 (pypdf 3.x+)\n        writer.encrypt(\n            user_password=user_password,\n            owner_password=owner_password,\n            algorithm=algorithm,\n        )\n\n        output.parent.mkdir(parents=True, exist_ok=True)\n        
with open(output, \"wb\") as fh:\n            writer.write(fh)\n        print(f\"Encrypted ({algorithm}): {output}\")\n\n    except Exception as exc:\n        # Re-raise so the caller decides whether to halt or continue a batch\n        raise RuntimeError(f\"Encryption failed for {source.name}: {exc}\") from exc\n\n\nif __name__ == \"__main__\":\n    encrypt_pdf(\n        INPUT_PDF,\n        SECURED_PDF,\n        user_password=os.environ[\"PDF_USER_PW\"],    # never hardcode credentials\n        owner_password=os.environ[\"PDF_OWNER_PW\"],\n    )\n",[30,86427,86428,86433,86443,86453,86459,86463,86476,86490,86494,86498,86507,86511,86515,86524,86533,86547,86555,86559,86564,86568,86573,86578,86583,86588,86592,86601,86624,86628,86634,86643,86647,86652,86658,86677,86689,86702,86706,86714,86725,86730,86734,86739,86744,86754,86764,86774,86778,86782,86802,86818,86822,86852,86856,86866,86871,86907,86911,86915,86927,86932,86939,86946,86964,86978],{"__ignoreMap":28},[33,86429,86430],{"class":35,"line":36},[33,86431,86432],{"class":39},"# pip install 
\"pypdf>=3.17\"\n",[33,86434,86435,86437,86439,86441],{"class":35,"line":43},[33,86436,190],{"class":163},[33,86438,193],{"class":167},[33,86440,164],{"class":163},[33,86442,198],{"class":167},[33,86444,86445,86447,86449,86451],{"class":35,"line":61},[33,86446,190],{"class":163},[33,86448,57333],{"class":167},[33,86450,164],{"class":163},[33,86452,66892],{"class":167},[33,86454,86455,86457],{"class":35,"line":73},[33,86456,164],{"class":163},[33,86458,176],{"class":167},[33,86460,86461],{"class":35,"line":88},[33,86462,92],{"emptyLinePlaceholder":91},[33,86464,86465,86468,86470,86472,86474],{"class":35,"line":95},[33,86466,86467],{"class":50},"INPUT_PDF",[33,86469,21012],{"class":163},[33,86471,215],{"class":167},[33,86473,86279],{"class":54},[33,86475,221],{"class":167},[33,86477,86478,86481,86483,86485,86488],{"class":35,"line":101},[33,86479,86480],{"class":50},"SECURED_PDF",[33,86482,212],{"class":163},[33,86484,215],{"class":167},[33,86486,86487],{"class":54},"\"report_secured.pdf\"",[33,86489,221],{"class":167},[33,86491,86492],{"class":35,"line":171},[33,86493,92],{"emptyLinePlaceholder":91},[33,86495,86496],{"class":35,"line":179},[33,86497,92],{"emptyLinePlaceholder":91},[33,86499,86500,86502,86505],{"class":35,"line":187},[33,86501,562],{"class":163},[33,86503,86504],{"class":46}," encrypt_pdf",[33,86506,7637],{"class":167},[33,86508,86509],{"class":35,"line":201},[33,86510,79033],{"class":167},[33,86512,86513],{"class":35,"line":206},[33,86514,70116],{"class":167},[33,86516,86517,86520,86522],{"class":35,"line":224},[33,86518,86519],{"class":167},"    user_password: ",[33,86521,1053],{"class":50},[33,86523,247],{"class":167},[33,86525,86526,86529,86531],{"class":35,"line":229},[33,86527,86528],{"class":167},"    owner_password: ",[33,86530,1053],{"class":50},[33,86532,247],{"class":167},[33,86534,86535,86538,86540,86542,86545],{"class":35,"line":235},[33,86536,86537],{"class":167},"    algorithm: 
",[33,86539,1053],{"class":50},[33,86541,212],{"class":163},[33,86543,86544],{"class":54}," \"AES-256\"",[33,86546,247],{"class":167},[33,86548,86549,86551,86553],{"class":35,"line":250},[33,86550,1617],{"class":167},[33,86552,571],{"class":50},[33,86554,574],{"class":167},[33,86556,86557],{"class":35,"line":266},[33,86558,7673],{"class":54},[33,86560,86561],{"class":35,"line":290},[33,86562,86563],{"class":54},"    Encrypt source PDF with AES-256 and write to output.\n",[33,86565,86566],{"class":35,"line":295},[33,86567,92],{"emptyLinePlaceholder":91},[33,86569,86570],{"class":35,"line":300},[33,86571,86572],{"class":54},"    user_password  — required to open\u002Fview the document\n",[33,86574,86575],{"class":35,"line":317},[33,86576,86577],{"class":54},"    owner_password — grants full editing rights; overrides permission flags\n",[33,86579,86580],{"class":35,"line":332},[33,86581,86582],{"class":54},"    algorithm      — \"AES-256\" for PDF 2.0 compliance; \"AES-128\" for older\n",[33,86584,86585],{"class":35,"line":347},[33,86586,86587],{"class":54},"                     reader compatibility; never use \"RC4-*\" for new work\n",[33,86589,86590],{"class":35,"line":374},[33,86591,7673],{"class":54},[33,86593,86594,86596,86598],{"class":35,"line":397},[33,86595,617],{"class":163},[33,86597,620],{"class":163},[33,86599,86600],{"class":167}," source.exists():\n",[33,86602,86603,86605,86607,86609,86611,86614,86616,86618,86620,86622],{"class":35,"line":653},[33,86604,4051],{"class":163},[33,86606,2945],{"class":50},[33,86608,602],{"class":167},[33,86610,4059],{"class":163},[33,86612,86613],{"class":54},"\"Source PDF not found: 
",[33,86615,1115],{"class":50},[33,86617,64],{"class":167},[33,86619,1121],{"class":50},[33,86621,274],{"class":54},[33,86623,221],{"class":167},[33,86625,86626],{"class":35,"line":667},[33,86627,92],{"emptyLinePlaceholder":91},[33,86629,86630,86632],{"class":35,"line":675},[33,86631,2424],{"class":163},[33,86633,574],{"class":167},[33,86635,86636,86638,86640],{"class":35,"line":689},[33,86637,62484],{"class":167},[33,86639,242],{"class":163},[33,86641,86642],{"class":167}," PdfReader(source)\n",[33,86644,86645],{"class":35,"line":703},[33,86646,92],{"emptyLinePlaceholder":91},[33,86648,86649],{"class":35,"line":714},[33,86650,86651],{"class":39},"        # Decrypt first if the source file is already protected\n",[33,86653,86654,86656],{"class":35,"line":723},[33,86655,8221],{"class":163},[33,86657,68749],{"class":167},[33,86659,86660,86663,86665,86668,86671,86673,86675],{"class":35,"line":754},[33,86661,86662],{"class":167},"            result ",[33,86664,242],{"class":163},[33,86666,86667],{"class":167}," reader.decrypt(os.environ.get(",[33,86669,86670],{"class":54},"\"PDF_EXISTING_PW\"",[33,86672,365],{"class":167},[33,86674,3198],{"class":54},[33,86676,371],{"class":167},[33,86678,86679,86681,86683,86685,86687],{"class":35,"line":771},[33,86680,5995],{"class":163},[33,86682,68801],{"class":167},[33,86684,1865],{"class":163},[33,86686,10791],{"class":50},[33,86688,574],{"class":167},[33,86690,86691,86693,86695,86697,86700],{"class":35,"line":777},[33,86692,16804],{"class":163},[33,86694,4054],{"class":50},[33,86696,602],{"class":167},[33,86698,86699],{"class":54},"\"Wrong existing password — cannot 
re-encrypt\"",[33,86701,221],{"class":167},[33,86703,86704],{"class":35,"line":788},[33,86705,92],{"emptyLinePlaceholder":91},[33,86707,86708,86710,86712],{"class":35,"line":804},[33,86709,67149],{"class":167},[33,86711,242],{"class":163},[33,86713,67154],{"class":167},[33,86715,86716,86718,86720,86722],{"class":35,"line":809},[33,86717,5973],{"class":163},[33,86719,695],{"class":167},[33,86721,662],{"class":163},[33,86723,86724],{"class":167}," reader.pages:\n",[33,86726,86727],{"class":35,"line":819},[33,86728,86729],{"class":167},"            writer.add_page(page)\n",[33,86731,86732],{"class":35,"line":829},[33,86733,92],{"emptyLinePlaceholder":91},[33,86735,86736],{"class":35,"line":834},[33,86737,86738],{"class":39},"        # algorithm=\"AES-256\" selects PDF 2.0-compliant AES-256 (pypdf 3.x+)\n",[33,86740,86741],{"class":35,"line":839},[33,86742,86743],{"class":167},"        writer.encrypt(\n",[33,86745,86746,86749,86751],{"class":35,"line":860},[33,86747,86748],{"class":238},"            user_password",[33,86750,242],{"class":163},[33,86752,86753],{"class":167},"user_password,\n",[33,86755,86756,86759,86761],{"class":35,"line":887},[33,86757,86758],{"class":238},"            owner_password",[33,86760,242],{"class":163},[33,86762,86763],{"class":167},"owner_password,\n",[33,86765,86766,86769,86771],{"class":35,"line":907},[33,86767,86768],{"class":238},"            
algorithm",[33,86770,242],{"class":163},[33,86772,86773],{"class":167},"algorithm,\n",[33,86775,86776],{"class":35,"line":1826},[33,86777,5867],{"class":167},[33,86779,86780],{"class":35,"line":1844},[33,86781,92],{"emptyLinePlaceholder":91},[33,86783,86784,86786,86788,86790,86792,86794,86796,86798,86800],{"class":35,"line":1858},[33,86785,70507],{"class":167},[33,86787,869],{"class":238},[33,86789,242],{"class":163},[33,86791,855],{"class":50},[33,86793,365],{"class":167},[33,86795,878],{"class":238},[33,86797,242],{"class":163},[33,86799,855],{"class":50},[33,86801,221],{"class":167},[33,86803,86804,86806,86808,86810,86812,86814,86816],{"class":35,"line":1871},[33,86805,2191],{"class":163},[33,86807,68213],{"class":50},[33,86809,70532],{"class":167},[33,86811,67169],{"class":54},[33,86813,1649],{"class":167},[33,86815,495],{"class":163},[33,86817,67176],{"class":167},[33,86819,86820],{"class":35,"line":1877},[33,86821,67181],{"class":167},[33,86823,86824,86826,86828,86830,86833,86835,86837,86839,86842,86844,86846,86848,86850],{"class":35,"line":1883},[33,86825,9414],{"class":50},[33,86827,602],{"class":167},[33,86829,4059],{"class":163},[33,86831,86832],{"class":54},"\"Encrypted (",[33,86834,1115],{"class":50},[33,86836,86184],{"class":167},[33,86838,1121],{"class":50},[33,86840,86841],{"class":54},"): ",[33,86843,1115],{"class":50},[33,86845,70566],{"class":167},[33,86847,1121],{"class":50},[33,86849,274],{"class":54},[33,86851,221],{"class":167},[33,86853,86854],{"class":35,"line":1915},[33,86855,92],{"emptyLinePlaceholder":91},[33,86857,86858,86860,86862,86864],{"class":35,"line":1926},[33,86859,2449],{"class":163},[33,86861,783],{"class":50},[33,86863,1852],{"class":163},[33,86865,1855],{"class":167},[33,86867,86868],{"class":35,"line":1932},[33,86869,86870],{"class":39},"        # Re-raise so the caller decides whether to halt or continue a 
batch\n",[33,86872,86873,86875,86877,86879,86881,86884,86886,86889,86891,86893,86895,86897,86899,86901,86903,86905],{"class":35,"line":1938},[33,86874,4051],{"class":163},[33,86876,7590],{"class":50},[33,86878,602],{"class":167},[33,86880,4059],{"class":163},[33,86882,86883],{"class":54},"\"Encryption failed for ",[33,86885,1115],{"class":50},[33,86887,86888],{"class":167},"source.name",[33,86890,1121],{"class":50},[33,86892,2079],{"class":54},[33,86894,1115],{"class":50},[33,86896,6565],{"class":167},[33,86898,1121],{"class":50},[33,86900,274],{"class":54},[33,86902,1649],{"class":167},[33,86904,190],{"class":163},[33,86906,20843],{"class":167},[33,86908,86909],{"class":35,"line":1950},[33,86910,92],{"emptyLinePlaceholder":91},[33,86912,86913],{"class":35,"line":1958},[33,86914,92],{"emptyLinePlaceholder":91},[33,86916,86917,86919,86921,86923,86925],{"class":35,"line":4904},[33,86918,2491],{"class":163},[33,86920,2494],{"class":50},[33,86922,2497],{"class":163},[33,86924,2500],{"class":54},[33,86926,574],{"class":167},[33,86928,86929],{"class":35,"line":4909},[33,86930,86931],{"class":167},"    encrypt_pdf(\n",[33,86933,86934,86937],{"class":35,"line":4915},[33,86935,86936],{"class":50},"        INPUT_PDF",[33,86938,247],{"class":167},[33,86940,86941,86944],{"class":35,"line":4925},[33,86942,86943],{"class":50},"        SECURED_PDF",[33,86945,247],{"class":167},[33,86947,86948,86951,86953,86955,86958,86961],{"class":35,"line":4935},[33,86949,86950],{"class":238},"        user_password",[33,86952,242],{"class":163},[33,86954,35884],{"class":167},[33,86956,86957],{"class":54},"\"PDF_USER_PW\"",[33,86959,86960],{"class":167},"],    ",[33,86962,86963],{"class":39},"# never hardcode credentials\n",[33,86965,86966,86969,86971,86973,86976],{"class":35,"line":4941},[33,86967,86968],{"class":238},"        
owner_password",[33,86970,242],{"class":163},[33,86972,35884],{"class":167},[33,86974,86975],{"class":54},"\"PDF_OWNER_PW\"",[33,86977,8935],{"class":167},[33,86979,86980],{"class":35,"line":4950},[33,86981,1202],{"class":167},[14,86983,86984],{},"Key points on the changed lines:",[4211,86986,86987,86992,87000,87005],{},[4214,86988,86989,86991],{},[30,86990,86127],{}," — explicit algorithm selection; without it pypdf defaults to AES-128.",[4214,86993,86994,86996,86997,86999],{},[30,86995,75937],{}," check — prevents ",[30,86998,68031],{}," when the source is already locked.",[4214,87001,87002,87004],{},[30,87003,86194],{}," return-value check — 0 means wrong password, 1 means user-password success, 2 means owner-password success.",[4214,87006,87007,87008,87011],{},"Environment variables — never embed passwords in ",[30,87009,87010],{},".py"," files; they end up in version control.",[18,87013,87015],{"id":87014},"variant-adding-permission-flags","Variant: Adding Permission Flags",[14,87017,87018,87019,20891],{},"Encryption without permission flags leaves all operations open to anyone with the user password. 
Restrict printing, copying, and editing with ",[30,87020,87021],{},"PermissionFlags",[23,87023,87025],{"className":126,"code":87024,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nfrom pypdf.generic import PermissionFlags\nimport os\n\n# Allow viewing and printing; deny content copy and modification\nREAD_AND_PRINT = (\n    PermissionFlags.PRINT_PRINTING\n    | PermissionFlags.PRINT_IN_HIGH_QUALITY\n)\n\n\ndef encrypt_with_permissions(\n    source: Path,\n    output: Path,\n    user_pw: str,\n    owner_pw: str,\n    permissions: int = READ_AND_PRINT,\n) -> None:\n    reader = PdfReader(source)\n    writer = PdfWriter()\n    for page in reader.pages:\n        writer.add_page(page)\n    writer.encrypt(\n        user_password=user_pw,\n        owner_password=owner_pw,\n        permissions_flag=permissions,   # bitmask controls what user-pw holders can do\n        algorithm=\"AES-256\",\n    )\n    output.parent.mkdir(parents=True, exist_ok=True)\n    with open(output, \"wb\") as fh:\n        writer.write(fh)\n",[30,87026,87027,87031,87041,87051,87062,87068,87072,87077,87086,87094,87105,87109,87113,87117,87126,87130,87134,87143,87152,87166,87174,87182,87190,87200,87205,87210,87219,87228,87241,87253,87257,87277,87293],{"__ignoreMap":28},[33,87028,87029],{"class":35,"line":36},[33,87030,86432],{"class":39},[33,87032,87033,87035,87037,87039],{"class":35,"line":43},[33,87034,190],{"class":163},[33,87036,193],{"class":167},[33,87038,164],{"class":163},[33,87040,198],{"class":167},[33,87042,87043,87045,87047,87049],{"class":35,"line":61},[33,87044,190],{"class":163},[33,87046,57333],{"class":167},[33,87048,164],{"class":163},[33,87050,66892],{"class":167},[33,87052,87053,87055,87057,87059],{"class":35,"line":73},[33,87054,190],{"class":163},[33,87056,73414],{"class":167},[33,87058,164],{"class":163},[33,87060,87061],{"class":167}," 
PermissionFlags\n",[33,87063,87064,87066],{"class":35,"line":88},[33,87065,164],{"class":163},[33,87067,176],{"class":167},[33,87069,87070],{"class":35,"line":95},[33,87071,92],{"emptyLinePlaceholder":91},[33,87073,87074],{"class":35,"line":101},[33,87075,87076],{"class":39},"# Allow viewing and printing; deny content copy and modification\n",[33,87078,87079,87082,87084],{"class":35,"line":171},[33,87080,87081],{"class":50},"READ_AND_PRINT",[33,87083,212],{"class":163},[33,87085,1415],{"class":167},[33,87087,87088,87091],{"class":35,"line":179},[33,87089,87090],{"class":167},"    PermissionFlags.",[33,87092,87093],{"class":50},"PRINT_PRINTING\n",[33,87095,87096,87099,87102],{"class":35,"line":187},[33,87097,87098],{"class":163},"    |",[33,87100,87101],{"class":167}," PermissionFlags.",[33,87103,87104],{"class":50},"PRINT_IN_HIGH_QUALITY\n",[33,87106,87107],{"class":35,"line":201},[33,87108,221],{"class":167},[33,87110,87111],{"class":35,"line":206},[33,87112,92],{"emptyLinePlaceholder":91},[33,87114,87115],{"class":35,"line":224},[33,87116,92],{"emptyLinePlaceholder":91},[33,87118,87119,87121,87124],{"class":35,"line":229},[33,87120,562],{"class":163},[33,87122,87123],{"class":46}," encrypt_with_permissions",[33,87125,7637],{"class":167},[33,87127,87128],{"class":35,"line":235},[33,87129,79033],{"class":167},[33,87131,87132],{"class":35,"line":250},[33,87133,70116],{"class":167},[33,87135,87136,87139,87141],{"class":35,"line":266},[33,87137,87138],{"class":167},"    user_pw: ",[33,87140,1053],{"class":50},[33,87142,247],{"class":167},[33,87144,87145,87148,87150],{"class":35,"line":290},[33,87146,87147],{"class":167},"    owner_pw: ",[33,87149,1053],{"class":50},[33,87151,247],{"class":167},[33,87153,87154,87157,87159,87161,87164],{"class":35,"line":295},[33,87155,87156],{"class":167},"    permissions: ",[33,87158,1059],{"class":50},[33,87160,212],{"class":163},[33,87162,87163],{"class":50}," 
READ_AND_PRINT",[33,87165,247],{"class":167},[33,87167,87168,87170,87172],{"class":35,"line":300},[33,87169,1617],{"class":167},[33,87171,571],{"class":50},[33,87173,574],{"class":167},[33,87175,87176,87178,87180],{"class":35,"line":317},[33,87177,57365],{"class":167},[33,87179,242],{"class":163},[33,87181,86642],{"class":167},[33,87183,87184,87186,87188],{"class":35,"line":332},[33,87185,68681],{"class":167},[33,87187,242],{"class":163},[33,87189,67154],{"class":167},[33,87191,87192,87194,87196,87198],{"class":35,"line":347},[33,87193,656],{"class":163},[33,87195,695],{"class":167},[33,87197,662],{"class":163},[33,87199,86724],{"class":167},[33,87201,87202],{"class":35,"line":374},[33,87203,87204],{"class":167},"        writer.add_page(page)\n",[33,87206,87207],{"class":35,"line":397},[33,87208,87209],{"class":167},"    writer.encrypt(\n",[33,87211,87212,87214,87216],{"class":35,"line":653},[33,87213,86950],{"class":238},[33,87215,242],{"class":163},[33,87217,87218],{"class":167},"user_pw,\n",[33,87220,87221,87223,87225],{"class":35,"line":667},[33,87222,86968],{"class":238},[33,87224,242],{"class":163},[33,87226,87227],{"class":167},"owner_pw,\n",[33,87229,87230,87233,87235,87238],{"class":35,"line":675},[33,87231,87232],{"class":238},"        permissions_flag",[33,87234,242],{"class":163},[33,87236,87237],{"class":167},"permissions,   ",[33,87239,87240],{"class":39},"# bitmask controls what user-pw holders can do\n",[33,87242,87243,87246,87248,87251],{"class":35,"line":689},[33,87244,87245],{"class":238},"        
algorithm",[33,87247,242],{"class":163},[33,87249,87250],{"class":54},"\"AES-256\"",[33,87252,247],{"class":167},[33,87254,87255],{"class":35,"line":703},[33,87256,1202],{"class":167},[33,87258,87259,87261,87263,87265,87267,87269,87271,87273,87275],{"class":35,"line":714},[33,87260,74932],{"class":167},[33,87262,869],{"class":238},[33,87264,242],{"class":163},[33,87266,855],{"class":50},[33,87268,365],{"class":167},[33,87270,878],{"class":238},[33,87272,242],{"class":163},[33,87274,855],{"class":50},[33,87276,221],{"class":167},[33,87278,87279,87281,87283,87285,87287,87289,87291],{"class":35,"line":723},[33,87280,1635],{"class":163},[33,87282,68213],{"class":50},[33,87284,70532],{"class":167},[33,87286,67169],{"class":54},[33,87288,1649],{"class":167},[33,87290,495],{"class":163},[33,87292,67176],{"class":167},[33,87294,87295],{"class":35,"line":754},[33,87296,87297],{"class":167},"        writer.write(fh)\n",[14,87299,39550,87300,87303,87304,87307,87308,87311],{},[30,87301,87302],{},"owner_password"," bypasses all ",[30,87305,87306],{},"permissions_flag"," restrictions regardless. 
Set it to a different, stronger value than ",[30,87309,87310],{},"user_password"," — some PDF readers silently disable flag enforcement when both passwords are identical.",[18,87313,87315],{"id":87314},"variant-batch-encryption","Variant: Batch Encryption",[23,87317,87319],{"className":126,"code":87318,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nimport os\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\n\nINPUT_DIR  = Path(\".\u002Fraw_pdfs\")\nOUTPUT_DIR = Path(\".\u002Fsecured_pdfs\")\n\n\ndef batch_encrypt(\n    source_dir: Path,\n    output_dir: Path,\n    user_pw: str,\n    owner_pw: str,\n) -> None:\n    output_dir.mkdir(parents=True, exist_ok=True)\n    pdfs = sorted(source_dir.glob(\"*.pdf\"))\n    if not pdfs:\n        print(f\"No PDFs found in {source_dir}\")\n        return\n\n    ok, failed = 0, 0\n    for pdf in pdfs:\n        out = output_dir \u002F f\"secure_{pdf.name}\"\n        try:\n            reader = PdfReader(pdf)\n            writer = PdfWriter()\n            for page in reader.pages:\n                writer.add_page(page)\n            writer.encrypt(user_password=user_pw, owner_password=owner_pw, algorithm=\"AES-256\")\n            with open(out, \"wb\") as fh:\n                writer.write(fh)\n            ok += 1\n            print(f\"  OK  {pdf.name}\")\n        except Exception as exc:\n            failed += 1\n            print(f\"  ERR {pdf.name}: {exc}\")\n\n    print(f\"\\nDone: {ok} encrypted, {failed} failed\")\n\n\nif __name__ == \"__main__\":\n    batch_encrypt(\n        INPUT_DIR,\n        OUTPUT_DIR,\n        user_pw=os.environ[\"PDF_USER_PW\"],\n        owner_pw=os.environ[\"PDF_OWNER_PW\"],\n    
)\n",[30,87320,87321,87325,87331,87341,87351,87355,87368,87381,87385,87389,87398,87403,87407,87415,87423,87431,87451,87466,87474,87495,87499,87503,87517,87527,87550,87556,87565,87573,87583,87588,87615,87631,87636,87645,87666,87676,87685,87714,87718,87754,87758,87762,87774,87779,87786,87793,87806,87819],{"__ignoreMap":28},[33,87322,87323],{"class":35,"line":36},[33,87324,86432],{"class":39},[33,87326,87327,87329],{"class":35,"line":43},[33,87328,164],{"class":163},[33,87330,176],{"class":167},[33,87332,87333,87335,87337,87339],{"class":35,"line":61},[33,87334,190],{"class":163},[33,87336,193],{"class":167},[33,87338,164],{"class":163},[33,87340,198],{"class":167},[33,87342,87343,87345,87347,87349],{"class":35,"line":73},[33,87344,190],{"class":163},[33,87346,57333],{"class":167},[33,87348,164],{"class":163},[33,87350,66892],{"class":167},[33,87352,87353],{"class":35,"line":88},[33,87354,92],{"emptyLinePlaceholder":91},[33,87356,87357,87359,87361,87363,87366],{"class":35,"line":95},[33,87358,507],{"class":50},[33,87360,17208],{"class":163},[33,87362,215],{"class":167},[33,87364,87365],{"class":54},"\".\u002Fraw_pdfs\"",[33,87367,221],{"class":167},[33,87369,87370,87372,87374,87376,87379],{"class":35,"line":101},[33,87371,4615],{"class":50},[33,87373,212],{"class":163},[33,87375,215],{"class":167},[33,87377,87378],{"class":54},"\".\u002Fsecured_pdfs\"",[33,87380,221],{"class":167},[33,87382,87383],{"class":35,"line":171},[33,87384,92],{"emptyLinePlaceholder":91},[33,87386,87387],{"class":35,"line":179},[33,87388,92],{"emptyLinePlaceholder":91},[33,87390,87391,87393,87396],{"class":35,"line":187},[33,87392,562],{"class":163},[33,87394,87395],{"class":46}," batch_encrypt",[33,87397,7637],{"class":167},[33,87399,87400],{"class":35,"line":201},[33,87401,87402],{"class":167},"    source_dir: 
Path,\n",[33,87404,87405],{"class":35,"line":206},[33,87406,72624],{"class":167},[33,87408,87409,87411,87413],{"class":35,"line":224},[33,87410,87138],{"class":167},[33,87412,1053],{"class":50},[33,87414,247],{"class":167},[33,87416,87417,87419,87421],{"class":35,"line":229},[33,87418,87147],{"class":167},[33,87420,1053],{"class":50},[33,87422,247],{"class":167},[33,87424,87425,87427,87429],{"class":35,"line":235},[33,87426,1617],{"class":167},[33,87428,571],{"class":50},[33,87430,574],{"class":167},[33,87432,87433,87435,87437,87439,87441,87443,87445,87447,87449],{"class":35,"line":250},[33,87434,6346],{"class":167},[33,87436,869],{"class":238},[33,87438,242],{"class":163},[33,87440,855],{"class":50},[33,87442,365],{"class":167},[33,87444,878],{"class":238},[33,87446,242],{"class":163},[33,87448,855],{"class":50},[33,87450,221],{"class":167},[33,87452,87453,87455,87457,87459,87462,87464],{"class":35,"line":266},[33,87454,67695],{"class":167},[33,87456,242],{"class":163},[33,87458,28924],{"class":50},[33,87460,87461],{"class":167},"(source_dir.glob(",[33,87463,610],{"class":54},[33,87465,371],{"class":167},[33,87467,87468,87470,87472],{"class":35,"line":290},[33,87469,617],{"class":163},[33,87471,620],{"class":163},[33,87473,67717],{"class":167},[33,87475,87476,87478,87480,87482,87484,87486,87489,87491,87493],{"class":35,"line":295},[33,87477,9414],{"class":50},[33,87479,602],{"class":167},[33,87481,4059],{"class":163},[33,87483,631],{"class":54},[33,87485,1115],{"class":50},[33,87487,87488],{"class":167},"source_dir",[33,87490,1121],{"class":50},[33,87492,274],{"class":54},[33,87494,221],{"class":167},[33,87496,87497],{"class":35,"line":300},[33,87498,646],{"class":163},[33,87500,87501],{"class":35,"line":317},[33,87502,92],{"emptyLinePlaceholder":91},[33,87504,87505,87508,87510,87512,87514],{"class":35,"line":332},[33,87506,87507],{"class":167},"    ok, failed 
",[33,87509,242],{"class":163},[33,87511,10791],{"class":50},[33,87513,365],{"class":167},[33,87515,87516],{"class":50},"0\n",[33,87518,87519,87521,87523,87525],{"class":35,"line":347},[33,87520,656],{"class":163},[33,87522,67712],{"class":167},[33,87524,662],{"class":163},[33,87526,67717],{"class":167},[33,87528,87529,87531,87533,87535,87537,87539,87542,87544,87546,87548],{"class":35,"line":374},[33,87530,50344],{"class":167},[33,87532,242],{"class":163},[33,87534,6393],{"class":167},[33,87536,1351],{"class":163},[33,87538,1110],{"class":163},[33,87540,87541],{"class":54},"\"secure_",[33,87543,1115],{"class":50},[33,87545,68341],{"class":167},[33,87547,1121],{"class":50},[33,87549,7504],{"class":54},[33,87551,87552,87554],{"class":35,"line":397},[33,87553,670],{"class":163},[33,87555,574],{"class":167},[33,87557,87558,87560,87562],{"class":35,"line":653},[33,87559,72722],{"class":167},[33,87561,242],{"class":163},[33,87563,87564],{"class":167}," PdfReader(pdf)\n",[33,87566,87567,87569,87571],{"class":35,"line":667},[33,87568,70275],{"class":167},[33,87570,242],{"class":163},[33,87572,67154],{"class":167},[33,87574,87575,87577,87579,87581],{"class":35,"line":675},[33,87576,1793],{"class":163},[33,87578,695],{"class":167},[33,87580,662],{"class":163},[33,87582,86724],{"class":167},[33,87584,87585],{"class":35,"line":689},[33,87586,87587],{"class":167},"                writer.add_page(page)\n",[33,87589,87590,87593,87595,87597,87600,87602,87604,87607,87609,87611,87613],{"class":35,"line":703},[33,87591,87592],{"class":167},"            writer.encrypt(",[33,87594,87310],{"class":238},[33,87596,242],{"class":163},[33,87598,87599],{"class":167},"user_pw, ",[33,87601,87302],{"class":238},[33,87603,242],{"class":163},[33,87605,87606],{"class":167},"owner_pw, 
",[33,87608,86184],{"class":238},[33,87610,242],{"class":163},[33,87612,87250],{"class":54},[33,87614,221],{"class":167},[33,87616,87617,87619,87621,87623,87625,87627,87629],{"class":35,"line":714},[33,87618,678],{"class":163},[33,87620,68213],{"class":50},[33,87622,77675],{"class":167},[33,87624,67169],{"class":54},[33,87626,1649],{"class":167},[33,87628,495],{"class":163},[33,87630,67176],{"class":167},[33,87632,87633],{"class":35,"line":723},[33,87634,87635],{"class":167},"                writer.write(fh)\n",[33,87637,87638,87641,87643],{"class":35,"line":754},[33,87639,87640],{"class":167},"            ok ",[33,87642,28976],{"class":163},[33,87644,17709],{"class":50},[33,87646,87647,87649,87651,87653,87656,87658,87660,87662,87664],{"class":35,"line":771},[33,87648,9364],{"class":50},[33,87650,602],{"class":167},[33,87652,4059],{"class":163},[33,87654,87655],{"class":54},"\"  OK  ",[33,87657,1115],{"class":50},[33,87659,68341],{"class":167},[33,87661,1121],{"class":50},[33,87663,274],{"class":54},[33,87665,221],{"class":167},[33,87667,87668,87670,87672,87674],{"class":35,"line":777},[33,87669,780],{"class":163},[33,87671,783],{"class":50},[33,87673,1852],{"class":163},[33,87675,1855],{"class":167},[33,87677,87678,87681,87683],{"class":35,"line":788},[33,87679,87680],{"class":167},"            failed ",[33,87682,28976],{"class":163},[33,87684,17709],{"class":50},[33,87686,87687,87689,87691,87693,87696,87698,87700,87702,87704,87706,87708,87710,87712],{"class":35,"line":804},[33,87688,9364],{"class":50},[33,87690,602],{"class":167},[33,87692,4059],{"class":163},[33,87694,87695],{"class":54},"\"  ERR 
",[33,87697,1115],{"class":50},[33,87699,68341],{"class":167},[33,87701,1121],{"class":50},[33,87703,2079],{"class":54},[33,87705,1115],{"class":50},[33,87707,6565],{"class":167},[33,87709,1121],{"class":50},[33,87711,274],{"class":54},[33,87713,221],{"class":167},[33,87715,87716],{"class":35,"line":809},[33,87717,92],{"emptyLinePlaceholder":91},[33,87719,87720,87722,87724,87726,87728,87730,87733,87735,87738,87740,87743,87745,87748,87750,87752],{"class":35,"line":819},[33,87721,7268],{"class":50},[33,87723,602],{"class":167},[33,87725,4059],{"class":163},[33,87727,274],{"class":54},[33,87729,25830],{"class":50},[33,87731,87732],{"class":54},"Done: ",[33,87734,1115],{"class":50},[33,87736,87737],{"class":167},"ok",[33,87739,1121],{"class":50},[33,87741,87742],{"class":54}," encrypted, ",[33,87744,1115],{"class":50},[33,87746,87747],{"class":167},"failed",[33,87749,1121],{"class":50},[33,87751,29015],{"class":54},[33,87753,221],{"class":167},[33,87755,87756],{"class":35,"line":829},[33,87757,92],{"emptyLinePlaceholder":91},[33,87759,87760],{"class":35,"line":834},[33,87761,92],{"emptyLinePlaceholder":91},[33,87763,87764,87766,87768,87770,87772],{"class":35,"line":839},[33,87765,2491],{"class":163},[33,87767,2494],{"class":50},[33,87769,2497],{"class":163},[33,87771,2500],{"class":54},[33,87773,574],{"class":167},[33,87775,87776],{"class":35,"line":860},[33,87777,87778],{"class":167},"    batch_encrypt(\n",[33,87780,87781,87784],{"class":35,"line":887},[33,87782,87783],{"class":50},"        INPUT_DIR",[33,87785,247],{"class":167},[33,87787,87788,87791],{"class":35,"line":907},[33,87789,87790],{"class":50},"        OUTPUT_DIR",[33,87792,247],{"class":167},[33,87794,87795,87798,87800,87802,87804],{"class":35,"line":1826},[33,87796,87797],{"class":238},"        
user_pw",[33,87799,242],{"class":163},[33,87801,35884],{"class":167},[33,87803,86957],{"class":54},[33,87805,8935],{"class":167},[33,87807,87808,87811,87813,87815,87817],{"class":35,"line":1844},[33,87809,87810],{"class":238},"        owner_pw",[33,87812,242],{"class":163},[33,87814,35884],{"class":167},[33,87816,86975],{"class":54},[33,87818,8935],{"class":167},[33,87820,87821],{"class":35,"line":1858},[33,87822,1202],{"class":167},[18,87824,9247],{"id":9246},[14,87826,87827],{},"Confirm encryption succeeded and the password is correct before routing to downstream systems:",[23,87829,87831],{"className":126,"code":87830,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nfrom pathlib import Path\nfrom pypdf import PdfReader\nfrom pypdf.errors import FileNotDecryptedError\n\n\ndef verify_encryption(file_path: Path, user_password: str) -> bool:\n    \"\"\"Return True if file is encrypted and decrypts cleanly with user_password.\"\"\"\n    try:\n        reader = PdfReader(file_path)\n\n        if not reader.is_encrypted:\n            print(f\"FAIL: {file_path.name} is not encrypted\")\n            return False\n\n        result = reader.decrypt(user_password)\n        if result == 0:\n            print(f\"FAIL: wrong user password for {file_path.name}\")\n            return False\n\n        page_count = len(reader.pages)\n        print(f\"PASS: {file_path.name} — AES encrypted, {page_count} pages accessible\")\n        return True\n\n    except FileNotDecryptedError:\n        # pypdf raises this if you access .pages before calling decrypt()\n        print(f\"FAIL: FileNotDecryptedError — call reader.decrypt() before reading pages\")\n        return False\n    except Exception as exc:\n        print(f\"ERROR: {exc}\")\n        return 
False\n",[30,87832,87833,87837,87847,87857,87867,87871,87875,87893,87898,87904,87913,87917,87925,87947,87953,87957,87967,87979,88000,88006,88010,88021,88053,88059,88063,88070,88075,88088,88094,88104,88125],{"__ignoreMap":28},[33,87834,87835],{"class":35,"line":36},[33,87836,86432],{"class":39},[33,87838,87839,87841,87843,87845],{"class":35,"line":43},[33,87840,190],{"class":163},[33,87842,193],{"class":167},[33,87844,164],{"class":163},[33,87846,198],{"class":167},[33,87848,87849,87851,87853,87855],{"class":35,"line":61},[33,87850,190],{"class":163},[33,87852,57333],{"class":167},[33,87854,164],{"class":163},[33,87856,57338],{"class":167},[33,87858,87859,87861,87863,87865],{"class":35,"line":73},[33,87860,190],{"class":163},[33,87862,68145],{"class":167},[33,87864,164],{"class":163},[33,87866,73152],{"class":167},[33,87868,87869],{"class":35,"line":88},[33,87870,92],{"emptyLinePlaceholder":91},[33,87872,87873],{"class":35,"line":95},[33,87874,92],{"emptyLinePlaceholder":91},[33,87876,87877,87879,87882,87885,87887,87889,87891],{"class":35,"line":101},[33,87878,562],{"class":163},[33,87880,87881],{"class":46}," verify_encryption",[33,87883,87884],{"class":167},"(file_path: Path, user_password: ",[33,87886,1053],{"class":50},[33,87888,1617],{"class":167},[33,87890,2821],{"class":50},[33,87892,574],{"class":167},[33,87894,87895],{"class":35,"line":171},[33,87896,87897],{"class":54},"    \"\"\"Return True if file is encrypted and decrypts cleanly with user_password.\"\"\"\n",[33,87899,87900,87902],{"class":35,"line":179},[33,87901,2424],{"class":163},[33,87903,574],{"class":167},[33,87905,87906,87908,87910],{"class":35,"line":187},[33,87907,62484],{"class":167},[33,87909,242],{"class":163},[33,87911,87912],{"class":167}," 
PdfReader(file_path)\n",[33,87914,87915],{"class":35,"line":201},[33,87916,92],{"emptyLinePlaceholder":91},[33,87918,87919,87921,87923],{"class":35,"line":206},[33,87920,8221],{"class":163},[33,87922,620],{"class":163},[33,87924,68749],{"class":167},[33,87926,87927,87929,87931,87933,87935,87937,87940,87942,87945],{"class":35,"line":224},[33,87928,9364],{"class":50},[33,87930,602],{"class":167},[33,87932,4059],{"class":163},[33,87934,70816],{"class":54},[33,87936,1115],{"class":50},[33,87938,87939],{"class":167},"file_path.name",[33,87941,1121],{"class":50},[33,87943,87944],{"class":54}," is not encrypted\"",[33,87946,221],{"class":167},[33,87948,87949,87951],{"class":35,"line":229},[33,87950,28782],{"class":163},[33,87952,2903],{"class":50},[33,87954,87955],{"class":35,"line":235},[33,87956,92],{"emptyLinePlaceholder":91},[33,87958,87959,87962,87964],{"class":35,"line":250},[33,87960,87961],{"class":167},"        result ",[33,87963,242],{"class":163},[33,87965,87966],{"class":167}," reader.decrypt(user_password)\n",[33,87968,87969,87971,87973,87975,87977],{"class":35,"line":266},[33,87970,8221],{"class":163},[33,87972,68801],{"class":167},[33,87974,1865],{"class":163},[33,87976,10791],{"class":50},[33,87978,574],{"class":167},[33,87980,87981,87983,87985,87987,87990,87992,87994,87996,87998],{"class":35,"line":290},[33,87982,9364],{"class":50},[33,87984,602],{"class":167},[33,87986,4059],{"class":163},[33,87988,87989],{"class":54},"\"FAIL: wrong user password for ",[33,87991,1115],{"class":50},[33,87993,87939],{"class":167},[33,87995,1121],{"class":50},[33,87997,274],{"class":54},[33,87999,221],{"class":167},[33,88001,88002,88004],{"class":35,"line":295},[33,88003,28782],{"class":163},[33,88005,2903],{"class":50},[33,88007,88008],{"class":35,"line":300},[33,88009,92],{"emptyLinePlaceholder":91},[33,88011,88012,88015,88017,88019],{"class":35,"line":317},[33,88013,88014],{"class":167},"        page_count 
",[33,88016,242],{"class":163},[33,88018,4037],{"class":50},[33,88020,70691],{"class":167},[33,88022,88023,88025,88027,88029,88032,88034,88036,88038,88041,88043,88046,88048,88051],{"class":35,"line":332},[33,88024,9414],{"class":50},[33,88026,602],{"class":167},[33,88028,4059],{"class":163},[33,88030,88031],{"class":54},"\"PASS: ",[33,88033,1115],{"class":50},[33,88035,87939],{"class":167},[33,88037,1121],{"class":50},[33,88039,88040],{"class":54}," — AES encrypted, ",[33,88042,1115],{"class":50},[33,88044,88045],{"class":167},"page_count",[33,88047,1121],{"class":50},[33,88049,88050],{"class":54}," pages accessible\"",[33,88052,221],{"class":167},[33,88054,88055,88057],{"class":35,"line":347},[33,88056,1659],{"class":163},[33,88058,2887],{"class":50},[33,88060,88061],{"class":35,"line":374},[33,88062,92],{"emptyLinePlaceholder":91},[33,88064,88065,88067],{"class":35,"line":397},[33,88066,2449],{"class":163},[33,88068,88069],{"class":167}," FileNotDecryptedError:\n",[33,88071,88072],{"class":35,"line":653},[33,88073,88074],{"class":39},"        # pypdf raises this if you access .pages before calling decrypt()\n",[33,88076,88077,88079,88081,88083,88086],{"class":35,"line":667},[33,88078,9414],{"class":50},[33,88080,602],{"class":167},[33,88082,4059],{"class":163},[33,88084,88085],{"class":54},"\"FAIL: FileNotDecryptedError — call reader.decrypt() before reading pages\"",[33,88087,221],{"class":167},[33,88089,88090,88092],{"class":35,"line":675},[33,88091,1659],{"class":163},[33,88093,2903],{"class":50},[33,88095,88096,88098,88100,88102],{"class":35,"line":689},[33,88097,2449],{"class":163},[33,88099,783],{"class":50},[33,88101,1852],{"class":163},[33,88103,1855],{"class":167},[33,88105,88106,88108,88110,88112,88115,88117,88119,88121,88123],{"class":35,"line":703},[33,88107,9414],{"class":50},[33,88109,602],{"class":167},[33,88111,4059],{"class":163},[33,88113,88114],{"class":54},"\"ERROR: 
",[33,88116,1115],{"class":50},[33,88118,6565],{"class":167},[33,88120,1121],{"class":50},[33,88122,274],{"class":54},[33,88124,221],{"class":167},[33,88126,88127,88129],{"class":35,"line":714},[33,88128,1659],{"class":163},[33,88130,2903],{"class":50},[14,88132,88133,88134,88137],{},"An encrypted file that also has ",[940,88135,88136],{"href":65966},"watermarks applied"," should pass this check after the security layer is added as the final pipeline step — never before.",[18,88139,88141],{"id":88140},"metadata-and-bookmark-preservation","Metadata and Bookmark Preservation",[14,88143,88144,88145,88147,88148,88151],{},"By default, copying pages with ",[30,88146,71069],{}," does not transfer the source document's ",[30,88149,88150],{},"\u002FInfo"," metadata dictionary (author, title, subject, creation date) or the outline tree (bookmarks). If your compliance workflow requires preserving these, copy them explicitly before writing:",[23,88153,88155],{"className":126,"code":88154,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nimport os\n\n\ndef encrypt_preserve_metadata(\n    source: Path,\n    output: Path,\n    user_pw: str,\n    owner_pw: str,\n) -> None:\n    reader = PdfReader(source)\n    if reader.is_encrypted:\n        if reader.decrypt(os.environ.get(\"PDF_EXISTING_PW\", \"\")) == 0:\n            raise ValueError(\"Wrong existing password\")\n\n    writer = PdfWriter()\n    for page in reader.pages:\n        writer.add_page(page)\n\n    # Preserve \u002FInfo metadata if present\n    if reader.metadata:\n        writer.add_metadata(dict(reader.metadata))\n\n    # Clone the outline (bookmark) tree\n    writer.clone_document_from_reader(reader)   # preserves outline + metadata\n\n    writer.encrypt(user_password=user_pw, owner_password=owner_pw, algorithm=\"AES-256\")\n    output.parent.mkdir(parents=True, exist_ok=True)\n    with open(output, \"wb\") as fh:\n        
writer.write(fh)\n",[30,88156,88157,88161,88171,88181,88187,88191,88195,88204,88208,88212,88220,88228,88236,88244,88250,88270,88283,88287,88295,88305,88309,88313,88318,88325,88335,88339,88344,88352,88356,88381,88401,88417],{"__ignoreMap":28},[33,88158,88159],{"class":35,"line":36},[33,88160,86432],{"class":39},[33,88162,88163,88165,88167,88169],{"class":35,"line":43},[33,88164,190],{"class":163},[33,88166,193],{"class":167},[33,88168,164],{"class":163},[33,88170,198],{"class":167},[33,88172,88173,88175,88177,88179],{"class":35,"line":61},[33,88174,190],{"class":163},[33,88176,57333],{"class":167},[33,88178,164],{"class":163},[33,88180,66892],{"class":167},[33,88182,88183,88185],{"class":35,"line":73},[33,88184,164],{"class":163},[33,88186,176],{"class":167},[33,88188,88189],{"class":35,"line":88},[33,88190,92],{"emptyLinePlaceholder":91},[33,88192,88193],{"class":35,"line":95},[33,88194,92],{"emptyLinePlaceholder":91},[33,88196,88197,88199,88202],{"class":35,"line":101},[33,88198,562],{"class":163},[33,88200,88201],{"class":46}," 
encrypt_preserve_metadata",[33,88203,7637],{"class":167},[33,88205,88206],{"class":35,"line":171},[33,88207,79033],{"class":167},[33,88209,88210],{"class":35,"line":179},[33,88211,70116],{"class":167},[33,88213,88214,88216,88218],{"class":35,"line":187},[33,88215,87138],{"class":167},[33,88217,1053],{"class":50},[33,88219,247],{"class":167},[33,88221,88222,88224,88226],{"class":35,"line":201},[33,88223,87147],{"class":167},[33,88225,1053],{"class":50},[33,88227,247],{"class":167},[33,88229,88230,88232,88234],{"class":35,"line":206},[33,88231,1617],{"class":167},[33,88233,571],{"class":50},[33,88235,574],{"class":167},[33,88237,88238,88240,88242],{"class":35,"line":224},[33,88239,57365],{"class":167},[33,88241,242],{"class":163},[33,88243,86642],{"class":167},[33,88245,88246,88248],{"class":35,"line":229},[33,88247,617],{"class":163},[33,88249,68749],{"class":167},[33,88251,88252,88254,88256,88258,88260,88262,88264,88266,88268],{"class":35,"line":235},[33,88253,8221],{"class":163},[33,88255,86667],{"class":167},[33,88257,86670],{"class":54},[33,88259,365],{"class":167},[33,88261,3198],{"class":54},[33,88263,76992],{"class":167},[33,88265,1865],{"class":163},[33,88267,10791],{"class":50},[33,88269,574],{"class":167},[33,88271,88272,88274,88276,88278,88281],{"class":35,"line":250},[33,88273,59715],{"class":163},[33,88275,4054],{"class":50},[33,88277,602],{"class":167},[33,88279,88280],{"class":54},"\"Wrong existing 
password\"",[33,88282,221],{"class":167},[33,88284,88285],{"class":35,"line":266},[33,88286,92],{"emptyLinePlaceholder":91},[33,88288,88289,88291,88293],{"class":35,"line":290},[33,88290,68681],{"class":167},[33,88292,242],{"class":163},[33,88294,67154],{"class":167},[33,88296,88297,88299,88301,88303],{"class":35,"line":295},[33,88298,656],{"class":163},[33,88300,695],{"class":167},[33,88302,662],{"class":163},[33,88304,86724],{"class":167},[33,88306,88307],{"class":35,"line":300},[33,88308,87204],{"class":167},[33,88310,88311],{"class":35,"line":317},[33,88312,92],{"emptyLinePlaceholder":91},[33,88314,88315],{"class":35,"line":332},[33,88316,88317],{"class":39},"    # Preserve \u002FInfo metadata if present\n",[33,88319,88320,88322],{"class":35,"line":347},[33,88321,617],{"class":163},[33,88323,88324],{"class":167}," reader.metadata:\n",[33,88326,88327,88330,88332],{"class":35,"line":374},[33,88328,88329],{"class":167},"        writer.add_metadata(",[33,88331,37100],{"class":50},[33,88333,88334],{"class":167},"(reader.metadata))\n",[33,88336,88337],{"class":35,"line":397},[33,88338,92],{"emptyLinePlaceholder":91},[33,88340,88341],{"class":35,"line":653},[33,88342,88343],{"class":39},"    # Clone the outline (bookmark) tree\n",[33,88345,88346,88349],{"class":35,"line":667},[33,88347,88348],{"class":167},"    writer.clone_document_from_reader(reader)   ",[33,88350,88351],{"class":39},"# preserves outline + metadata\n",[33,88353,88354],{"class":35,"line":675},[33,88355,92],{"emptyLinePlaceholder":91},[33,88357,88358,88361,88363,88365,88367,88369,88371,88373,88375,88377,88379],{"class":35,"line":689},[33,88359,88360],{"class":167},"    
writer.encrypt(",[33,88362,87310],{"class":238},[33,88364,242],{"class":163},[33,88366,87599],{"class":167},[33,88368,87302],{"class":238},[33,88370,242],{"class":163},[33,88372,87606],{"class":167},[33,88374,86184],{"class":238},[33,88376,242],{"class":163},[33,88378,87250],{"class":54},[33,88380,221],{"class":167},[33,88382,88383,88385,88387,88389,88391,88393,88395,88397,88399],{"class":35,"line":703},[33,88384,74932],{"class":167},[33,88386,869],{"class":238},[33,88388,242],{"class":163},[33,88390,855],{"class":50},[33,88392,365],{"class":167},[33,88394,878],{"class":238},[33,88396,242],{"class":163},[33,88398,855],{"class":50},[33,88400,221],{"class":167},[33,88402,88403,88405,88407,88409,88411,88413,88415],{"class":35,"line":714},[33,88404,1635],{"class":163},[33,88406,68213],{"class":50},[33,88408,70532],{"class":167},[33,88410,67169],{"class":54},[33,88412,1649],{"class":167},[33,88414,495],{"class":163},[33,88416,67176],{"class":167},[33,88418,88419],{"class":35,"line":723},[33,88420,87297],{"class":167},[14,88422,88423,88426,88427,88430],{},[30,88424,88425],{},"clone_document_from_reader()"," copies the full document structure including named destinations and embedded files. If you only need metadata and not the page tree, use ",[30,88428,88429],{},"add_metadata()"," alone — it is faster and avoids duplicating pages.",[18,88432,88434],{"id":88433},"integrating-with-the-broader-pdf-pipeline","Integrating with the Broader PDF Pipeline",[14,88436,88437],{},"Encryption is always the terminal step. 
The order matters:",[35387,88439,88440,88446,88451,88454],{},[4214,88441,88442,88443,88445],{},"Generate or assemble content — see ",[940,88444,26191],{"href":19001}," for ReportLab-based report creation.",[4214,88447,88448,88449,3035],{},"Apply structural edits — merge, split, reorder; see ",[940,88450,52682],{"href":52681},[4214,88452,88453],{},"Stamp watermarks (optional) — overlay a ReportLab transparency layer.",[4214,88455,88456,88457,88459],{},"Encrypt — call ",[30,88458,86130],{}," on the final composed writer.",[14,88461,88462,88463,88467],{},"Reversing steps 3 and 4 means the watermark step must decrypt, modify, and re-encrypt, which doubles the I\u002FO and risks losing the encryption settings. Reversing steps 2 and 4 means every individual source file must be decrypted before merging — see ",[940,88464,88466],{"href":88465},"\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Fremove-password-from-pdf-with-python\u002F","Remove a Password from a PDF with Python"," for that pattern.",[18,88469,48994],{"id":29070},[4273,88471,88472,88483],{},[4276,88473,88474],{},[4279,88475,88476,88478,88481],{},[4282,88477,29080],{},[4282,88479,88480],{},"Explanation",[4282,88482,4290],{},[4292,88484,88485,88506,88522,88533,88550],{},[4279,88486,88487,88493,88498],{},[4297,88488,88489,42706,88491],{},[30,88490,86120],{},[30,88492,86146],{},[4297,88494,79527,88495,88497],{},[30,88496,86139],{}," (unmaintained) or pypdf \u003C 3.0",[4297,88499,88500,88503,88504],{},[30,88501,88502],{},"pip install \"pypdf>=3.17\""," and remove ",[30,88505,86139],{},[4279,88507,88508,88514,88517],{},[4297,88509,88510,88513],{},[30,88511,88512],{},"use_128bit=False"," parameter",[4297,88515,88516],{},"Legacy keyword removed in pypdf 3.x",[4297,88518,88519,88520],{},"Replace with ",[30,88521,86127],{},[4279,88523,88524,88527,88530],{},[4297,88525,88526],{},"User and owner passwords identical",[4297,88528,88529],{},"Some readers ignore permission flags when 
passwords match",[4297,88531,88532],{},"Always use distinct, different-strength passwords",[4279,88534,88535,88538,88544],{},[4297,88536,88537],{},"Overwriting the source file",[4297,88539,88540,88541,88543],{},"Writing encrypted output to ",[30,88542,86467],{}," corrupts the stream mid-write",[4297,88545,88546,88547,88549],{},"Always define a separate ",[30,88548,70566],{}," path",[4279,88551,88552,88555,88558],{},[4297,88553,88554],{},"Encrypting before merging",[4297,88556,88557],{},"Merge operations require unencrypted pages",[4297,88559,88560,88561],{},"Apply encryption as the final step after ",[940,88562,88563],{"href":52681},"merging",[18,88565,88566],{"id":29183},"Frequently Asked Questions",[14,88568,88569,88572,88573,88575],{},[1974,88570,88571],{},"Why does pypdf throw PdfReadError when adding a password?","\nThe source file is already encrypted. Call ",[30,88574,86403],{}," before copying pages to the writer. Check the return value — 0 means the password is wrong.",[14,88577,88578,88581],{},[1974,88579,88580],{},"Can I add password protection without changing file size significantly?","\nYes. AES-256 encryption adds under 1 KB of overhead (a modified trailer and cross-reference table). Size bloat usually indicates uncompressed streams or embedded font duplication unrelated to encryption.",[14,88583,88584,88587,88588,88590,88591,88594,88595,3035],{},[1974,88585,88586],{},"Does encryption preserve bookmarks and metadata?","\npypdf preserves the document outline (bookmarks) and ",[30,88589,88150],{}," metadata by default. 
If your compliance workflow requires stripping metadata, call ",[30,88592,88593],{},"writer.add_metadata({})"," to clear the info dictionary before calling ",[30,88596,88597],{},"encrypt()",[18,88599,6918],{"id":6917},[4211,88601,88602,88607,88612],{},[4214,88603,88604,88606],{},[940,88605,65967],{"href":65966}," — full guide covering visual overlays, permission flags, and batch pipelines",[4214,88608,88609,88611],{},[940,88610,88466],{"href":88465}," — decrypt an authorized file before re-encrypting",[4214,88613,88614,88616],{},[940,88615,52682],{"href":52681}," — complete structural edits before applying encryption",[14,88618,6947,88619,3035],{},[940,88620,65967],{"href":65966},[6953,88622,88623],{},"html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sqxcx, html code.shiki 
.sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":88625},[88626,88627,88628,88629,88630,88631,88632,88633,88634,88635,88636],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":86412,"depth":43,"text":86413},{"id":87014,"depth":43,"text":87015},{"id":87314,"depth":43,"text":87315},{"id":9246,"depth":43,"text":9247},{"id":88140,"depth":43,"text":88141},{"id":88433,"depth":43,"text":88434},{"id":29070,"depth":43,"text":48994},{"id":29183,"depth":43,"text":88566},{"id":6917,"depth":43,"text":6918},"Add Password Protection","Fix PdfReadError and RC4 deprecation errors when adding PDF passwords in Python. Migrate to pypdf AES-256 encryption with owner and user password controls.",{},"\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Fadd-password-protection-to-pdf-files",{"title":86107,"description":88638},"Add Password Protection to PDF Files with Python","automating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Fadd-password-protection-to-pdf-files\u002Findex",[9631,47,88645,65045],"encryption","9MuIElFLYcxiVRBa3QbFwUrKklt2gAa6gOJ-Gg_O_LQ",{"id":88648,"title":65967,"body":88649,"breadcrumbTitle":92939,"canonical":6977,"date":46387,"description":92940,"draft":6980,"extension":6981,"image":6977,"meta":92941,"navigation":91,"path":92942,"robots":6977,"seo":92943,"seoTitle":92944,"stem":92945,"tags":92946,"updatedAt":6978,"__hash__":92948},"content\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Findex.md",{"type":7,"value":88650,"toc":92918},[88651,88654,88663,88673,88675,88697,88704,88708,88711,88985,88998,89002,89138,89142,89147,89447,89454,89458,89468,89884,89888,89901,90394,90400,90404,90454,90460,90464,90618,90620,90624,90635,91056,91060,91063,91204,91208,91211,91489,91491,91494,91792,91797,91799,91847,91849,91960,91962,92887,92889,92912,92916],[10,88652,65967],{"id":88653},"watermarking-and-securing-pdfs",[14,88655,88656,886
57,88659,88660,88662],{},"Visual watermarks handle branding and confidentiality labelling. Cryptographic controls handle access enforcement. Both belong at the end of any ",[940,88658,6943],{"href":6942}," pipeline — after structural work such as ",[940,88661,52682],{"href":52681}," is complete, and before the output reaches a recipient. Applying encryption mid-pipeline breaks merge operations and parsing steps; applying watermarks after encryption requires decryption first. Get the order right and both techniques compose cleanly.",[14,88664,88665,88666,88669,88670,88672],{},"This guide covers: generating ReportLab overlay templates, stamping pages with ",[30,88667,88668],{},"merge_page()",", AES-256 encryption with ",[30,88671,86130],{},", owner vs user passwords, permission flag bitmasks, and batch processing patterns.",[18,88674,21],{"id":20},[23,88676,88678],{"className":25,"code":88677,"language":27,"meta":28,"style":28},"# pip install pypdf reportlab\npip install \"pypdf>=3.17\" \"reportlab>=4.2\"\n",[30,88679,88680,88685],{"__ignoreMap":28},[33,88681,88682],{"class":35,"line":36},[33,88683,88684],{"class":39},"# pip install pypdf reportlab\n",[33,88686,88687,88689,88691,88694],{"class":35,"line":43},[33,88688,76],{"class":46},[33,88690,79],{"class":54},[33,88692,88693],{"class":54}," \"pypdf>=3.17\"",[33,88695,88696],{"class":54}," \"reportlab>=4.2\"\n",[14,88698,88699,88700,88703],{},"You need at least one source PDF for testing. A minimal one-page file is sufficient for all examples here. 
Store it at ",[30,88701,88702],{},".\u002Finput\u002Fsource.pdf"," or adjust the path constants in the snippets.",[18,88705,88707],{"id":88706},"diagnostic-step-inspect-the-pdf-before-applying-security","Diagnostic Step: Inspect the PDF Before Applying Security",[14,88709,88710],{},"Before applying watermarks or encryption, verify the file's current state: is it already encrypted, what page size does it use, and does it contain form fields that watermarking might break?",[23,88712,88714],{"className":126,"code":88713,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\nSOURCE = Path(\".\u002Finput\u002Fsource.pdf\")\n\ntry:\n    reader = PdfReader(SOURCE)\n    page = reader.pages[0]\n    # MediaBox gives the page dimensions in points (1 pt = 1\u002F72 inch)\n    media_box = page.mediabox\n    print(f\"Pages       : {len(reader.pages)}\")\n    print(f\"Encrypted   : {reader.is_encrypted}\")\n    print(f\"Page width  : {float(media_box.width):.1f} pt  ({float(media_box.width)\u002F72:.2f} in)\")\n    print(f\"Page height : {float(media_box.height):.1f} pt  ({float(media_box.height)\u002F72:.2f} in)\")\n    print(f\"Has AcroForm: {'\u002FAcroForm' in reader.trailer.get('\u002FRoot', {})}\")\nexcept FileNotFoundError:\n    print(f\"File not found: 
{SOURCE}\")\n",[30,88715,88716,88720,88730,88740,88744,88757,88761,88767,88779,88791,88796,88806,88827,88848,88889,88927,88960,88968],{"__ignoreMap":28},[33,88717,88718],{"class":35,"line":36},[33,88719,57316],{"class":39},[33,88721,88722,88724,88726,88728],{"class":35,"line":43},[33,88723,190],{"class":163},[33,88725,193],{"class":167},[33,88727,164],{"class":163},[33,88729,198],{"class":167},[33,88731,88732,88734,88736,88738],{"class":35,"line":61},[33,88733,190],{"class":163},[33,88735,57333],{"class":167},[33,88737,164],{"class":163},[33,88739,57338],{"class":167},[33,88741,88742],{"class":35,"line":73},[33,88743,92],{"emptyLinePlaceholder":91},[33,88745,88746,88748,88750,88752,88755],{"class":35,"line":88},[33,88747,86272],{"class":50},[33,88749,212],{"class":163},[33,88751,215],{"class":167},[33,88753,88754],{"class":54},"\".\u002Finput\u002Fsource.pdf\"",[33,88756,221],{"class":167},[33,88758,88759],{"class":35,"line":95},[33,88760,92],{"emptyLinePlaceholder":91},[33,88762,88763,88765],{"class":35,"line":101},[33,88764,35574],{"class":163},[33,88766,574],{"class":167},[33,88768,88769,88771,88773,88775,88777],{"class":35,"line":171},[33,88770,57365],{"class":167},[33,88772,242],{"class":163},[33,88774,57370],{"class":167},[33,88776,86272],{"class":50},[33,88778,221],{"class":167},[33,88780,88781,88783,88785,88787,88789],{"class":35,"line":179},[33,88782,39662],{"class":167},[33,88784,242],{"class":163},[33,88786,62542],{"class":167},[33,88788,748],{"class":50},[33,88790,9202],{"class":167},[33,88792,88793],{"class":35,"line":187},[33,88794,88795],{"class":39},"    # MediaBox gives the page dimensions in points (1 pt = 1\u002F72 inch)\n",[33,88797,88798,88801,88803],{"class":35,"line":201},[33,88799,88800],{"class":167},"    media_box ",[33,88802,242],{"class":163},[33,88804,88805],{"class":167}," 
page.mediabox\n",[33,88807,88808,88810,88812,88814,88817,88819,88821,88823,88825],{"class":35,"line":206},[33,88809,7268],{"class":50},[33,88811,602],{"class":167},[33,88813,4059],{"class":163},[33,88815,88816],{"class":54},"\"Pages       : ",[33,88818,4065],{"class":50},[33,88820,59322],{"class":167},[33,88822,1121],{"class":50},[33,88824,274],{"class":54},[33,88826,221],{"class":167},[33,88828,88829,88831,88833,88835,88838,88840,88842,88844,88846],{"class":35,"line":224},[33,88830,7268],{"class":50},[33,88832,602],{"class":167},[33,88834,4059],{"class":163},[33,88836,88837],{"class":54},"\"Encrypted   : ",[33,88839,1115],{"class":50},[33,88841,75937],{"class":167},[33,88843,1121],{"class":50},[33,88845,274],{"class":54},[33,88847,221],{"class":167},[33,88849,88850,88852,88854,88856,88859,88862,88865,88867,88869,88872,88874,88876,88878,88880,88882,88884,88887],{"class":35,"line":229},[33,88851,7268],{"class":50},[33,88853,602],{"class":167},[33,88855,4059],{"class":163},[33,88857,88858],{"class":54},"\"Page width  : ",[33,88860,88861],{"class":50},"{float",[33,88863,88864],{"class":167},"(media_box.width)",[33,88866,18438],{"class":163},[33,88868,1121],{"class":50},[33,88870,88871],{"class":54}," pt  (",[33,88873,88861],{"class":50},[33,88875,88864],{"class":167},[33,88877,1351],{"class":163},[33,88879,49823],{"class":50},[33,88881,55819],{"class":163},[33,88883,1121],{"class":50},[33,88885,88886],{"class":54}," in)\"",[33,88888,221],{"class":167},[33,88890,88891,88893,88895,88897,88900,88902,88905,88907,88909,88911,88913,88915,88917,88919,88921,88923,88925],{"class":35,"line":235},[33,88892,7268],{"class":50},[33,88894,602],{"class":167},[33,88896,4059],{"class":163},[33,88898,88899],{"class":54},"\"Page height : 
",[33,88901,88861],{"class":50},[33,88903,88904],{"class":167},"(media_box.height)",[33,88906,18438],{"class":163},[33,88908,1121],{"class":50},[33,88910,88871],{"class":54},[33,88912,88861],{"class":50},[33,88914,88904],{"class":167},[33,88916,1351],{"class":163},[33,88918,49823],{"class":50},[33,88920,55819],{"class":163},[33,88922,1121],{"class":50},[33,88924,88886],{"class":54},[33,88926,221],{"class":167},[33,88928,88929,88931,88933,88935,88938,88940,88943,88945,88948,88951,88954,88956,88958],{"class":35,"line":250},[33,88930,7268],{"class":50},[33,88932,602],{"class":167},[33,88934,4059],{"class":163},[33,88936,88937],{"class":54},"\"Has AcroForm: ",[33,88939,1115],{"class":50},[33,88941,88942],{"class":54},"'\u002FAcroForm'",[33,88944,8002],{"class":163},[33,88946,88947],{"class":167}," reader.trailer.get(",[33,88949,88950],{"class":54},"'\u002FRoot'",[33,88952,88953],{"class":167},", {})",[33,88955,1121],{"class":50},[33,88957,274],{"class":54},[33,88959,221],{"class":167},[33,88961,88962,88964,88966],{"class":35,"line":266},[33,88963,35726],{"class":163},[33,88965,2945],{"class":50},[33,88967,574],{"class":167},[33,88969,88970,88972,88974,88976,88978,88981,88983],{"class":35,"line":290},[33,88971,7268],{"class":50},[33,88973,602],{"class":167},[33,88975,4059],{"class":163},[33,88977,15677],{"class":54},[33,88979,88980],{"class":50},"{SOURCE}",[33,88982,274],{"class":54},[33,88984,221],{"class":167},[14,88986,41963,88987,4348,88989,88991,88992,88994,88995,88997],{},[30,88988,86397],{},[30,88990,855],{},", decrypt before watermarking — see ",[940,88993,88466],{"href":88465},". If the page is letter-sized (612 × 792 pt), the snippets below work without modification. 
For A4 (595 × 842 pt), swap the ",[30,88996,20091],{}," constant in the ReportLab call.",[18,88999,89001],{"id":89000},"security-layers-how-watermarking-and-encryption-compose","Security Layers: How Watermarking and Encryption Compose",[2540,89003,2547,89006,2547,89009,2547,89012,2547,89033,89037,89040,2547,2547,89043,2547,89045,2547,89049,2547,2547,89053,2547,89055,2547,89057,2547,2547,89060,2547,89064,2547,2547,89066,2547,89069,2547,89072,2547,2547,89074,2547,2547,89077,2547,89080,2547,89083,2547,2547,89086,2547,2547,89089,2547,89092,2547,89094,2547,2547,89097,2547,2547,89099,2547,89103,2547,89106,2547,89109,2547,89113,2547,2547,89117,2547,89119,2547,89122,2547,89126,2547,89129,2547,89132,2547,2547,89135],{"viewBox":89004,"role":2543,"ariaLabel":89005,"xmlns":2545,"style":2546},"0 0 760 370","Diagram showing how source PDF, watermark overlay, and encryption compose into a secured output",[2549,89007,89008],{},"PDF watermark and encryption pipeline",[2553,89010,89011],{},"Shows the three-stage pipeline: source PDF and ReportLab watermark template merge into a watermarked PDF, which then passes through pypdf encrypt() to produce the final secured PDF with user password, owner password, and permission flags.",[2557,89013,2559,89014,2559,89021,2559,89028,2547],{},[2561,89015,2564,89017,2564,89019,2559],{"id":89016,"x1":748,"y1":748,"x2":734,"y2":748},"secure-grad-blue",[2566,89018],{"offset":748,"style":2568},[2566,89020],{"offset":734,"style":2571},[2561,89022,2564,89024,2564,89026,2559],{"id":89023,"x1":748,"y1":748,"x2":748,"y2":734},"secure-grad-green",[2566,89025],{"offset":748,"style":58312},[2566,89027],{"offset":734,"style":58315},[2573,89029,2564,89031,2559],{"id":89030,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"secure-arrow",[2580,89032],{"d":2582,"fill":2583},[2000,89034,89036],{"x":2679,"y":89035,"fill":2583,"style":2685},"24","\nStage 1: 
Overlay\n",[2000,89038,89039],{"x":2677,"y":89035,"fill":2583,"style":2685},"\nStage 2: Encrypt\n",[2000,89041,89042],{"x":49899,"y":89035,"fill":2583,"style":2685},"\nOutput\n",[2585,89044],{"x":2587,"y":26411,"width":2609,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,89046,89048],{"x":2630,"y":89047,"fill":2599,"style":2600},"62","Source PDF",[2000,89050,89052],{"x":2630,"y":89051,"fill":2583,"style":2605},"82","unprotected",[2585,89054],{"x":2587,"y":2589,"width":2609,"height":2590,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,89056,19002],{"x":2630,"y":26406,"fill":2599,"style":2600},[2000,89058,89059],{"x":2630,"y":11173,"fill":2583,"style":2605},"watermark.pdf",[35,89061],{"x1":2610,"y1":38749,"x2":89062,"y2":59956,"stroke":2583,"markerEnd":89063,"style":2594},"245","url(#secure-arrow)",[35,89065],{"x1":2610,"y1":2635,"x2":89062,"y2":59956,"stroke":2583,"markerEnd":89063,"style":2594},[2585,89067],{"x":38722,"y":49842,"width":2609,"height":2590,"rx":2591,"fill":89068,"stroke":2593,"style":2594},"url(#secure-grad-blue)",[2000,89070,88668],{"x":89071,"y":11173,"fill":2599,"style":2600},"318",[2000,89073,70025],{"x":89071,"y":58337,"fill":2599,"style":2605},[35,89075],{"x1":26369,"y1":2639,"x2":89076,"y2":2639,"stroke":2583,"markerEnd":89063,"style":2594},"455",[2585,89078],{"x":89079,"y":49842,"width":2609,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},"458",[2000,89081,89082],{"x":82496,"y":11173,"fill":2599,"style":2600},"Watermarked",[2000,89084,89085],{"x":82496,"y":58337,"fill":2583,"style":2605},"PDF (no 
lock)",[35,89087],{"x1":89088,"y1":2639,"x2":71573,"y2":2639,"stroke":2583,"markerEnd":89063,"style":2594},"598",[2585,89090],{"x":89091,"y":49842,"width":2650,"height":2590,"rx":2591,"fill":89068,"stroke":11166,"style":2594},"643",[2000,89093,88597],{"x":38791,"y":11173,"fill":2599,"style":2600},[2000,89095,89096],{"x":38791,"y":58337,"fill":2599,"style":2605},"AES-256",[35,89098],{"x1":38791,"y1":16986,"x2":38791,"y2":38722,"stroke":2583,"markerEnd":89063,"style":2594},[2585,89100],{"x":89101,"y":49869,"width":2609,"height":2630,"rx":2591,"fill":89102,"stroke":17010,"style":2594},"613","url(#secure-grad-green)",[2000,89104,89105],{"x":11213,"y":17048,"fill":2599,"style":2600},"Secured PDF",[2000,89107,89108],{"x":11213,"y":2689,"fill":2583,"style":2605},"user password",[2000,89110,89112],{"x":11213,"y":89111,"fill":2583,"style":2605},"312","owner password",[2000,89114,89116],{"x":11213,"y":89115,"fill":2583,"style":2605},"330","permission flags",[2585,89118],{"x":2587,"y":49869,"width":2611,"height":2650,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,89120,89121],{"x":2589,"y":38850,"fill":2599,"style":2600},"Permission flags",[2000,89123,89125],{"x":2589,"y":89124,"fill":2583,"style":2605},"292","print_printing",[2000,89127,89128],{"x":2589,"y":2698,"fill":2583,"style":2605},"modify_content",[2000,89130,89131],{"x":2589,"y":26364,"fill":2583,"style":2605},"copy_content",[2000,89133,89134],{"x":2589,"y":38768,"fill":2583,"style":2605},"add_or_modify_annotations",[35,89136],{"x1":2701,"y1":26433,"x2":17013,"y2":26433,"stroke":11166,"style":89137},"stroke-width:1;stroke-dasharray:5,4",[18,89139,89141],{"id":89140},"step-1-generate-a-watermark-template-with-reportlab","Step 1: Generate a Watermark Template with ReportLab",[14,89143,89144,89146],{},[940,89145,19002],{"href":19001}," produces a single-page PDF with transparent text or graphics. 
This file is reused across every page in a batch — generate it once, open it once.",[23,89148,89150],{"className":126,"code":89149,"language":47,"meta":28,"style":28},"# pip install reportlab\nfrom pathlib import Path\nfrom reportlab.pdfgen import canvas\nfrom reportlab.lib.pagesizes import letter, A4\n\nWATERMARK_PATH = Path(\".\u002Ftmp\u002Fwatermark_template.pdf\")\n\ndef create_text_watermark(\n    text: str = \"CONFIDENTIAL\",\n    pagesize: tuple = letter,   # swap to A4 for European documents\n    alpha: float = 0.25,        # 0.1 = very faint, 0.5 = noticeable\n    font_size: int = 52,\n) -> Path:\n    \"\"\"Generate a diagonal text watermark and save to WATERMARK_PATH.\"\"\"\n    WATERMARK_PATH.parent.mkdir(parents=True, exist_ok=True)\n    width, height = pagesize\n    c = canvas.Canvas(str(WATERMARK_PATH), pagesize=pagesize)\n    c.saveState()\n    # Move origin to page centre, rotate 45°, draw centred string\n    c.translate(width \u002F 2, height \u002F 2)\n    c.rotate(45)\n    c.setFillAlpha(alpha)\n    c.setFont(\"Helvetica-Bold\", font_size)\n    c.setFillColorRGB(0.3, 0.3, 0.3)\n    # drawCentredString centres on x=0 (the translated origin)\n    c.drawCentredString(0, 0, text)\n    c.restoreState()\n    c.save()\n    return 
WATERMARK_PATH\n",[30,89151,89152,89156,89166,89176,89187,89191,89205,89209,89218,89232,89248,89266,89280,89284,89289,89312,89322,89345,89350,89355,89372,89381,89386,89395,89412,89417,89431,89436,89440],{"__ignoreMap":28},[33,89153,89154],{"class":35,"line":36},[33,89155,20289],{"class":39},[33,89157,89158,89160,89162,89164],{"class":35,"line":43},[33,89159,190],{"class":163},[33,89161,193],{"class":167},[33,89163,164],{"class":163},[33,89165,198],{"class":167},[33,89167,89168,89170,89172,89174],{"class":35,"line":61},[33,89169,190],{"class":163},[33,89171,28221],{"class":167},[33,89173,164],{"class":163},[33,89175,28226],{"class":167},[33,89177,89178,89180,89182,89184],{"class":35,"line":73},[33,89179,190],{"class":163},[33,89181,19044],{"class":167},[33,89183,164],{"class":163},[33,89185,89186],{"class":167}," letter, A4\n",[33,89188,89189],{"class":35,"line":88},[33,89190,92],{"emptyLinePlaceholder":91},[33,89192,89193,89196,89198,89200,89203],{"class":35,"line":95},[33,89194,89195],{"class":50},"WATERMARK_PATH",[33,89197,212],{"class":163},[33,89199,215],{"class":167},[33,89201,89202],{"class":54},"\".\u002Ftmp\u002Fwatermark_template.pdf\"",[33,89204,221],{"class":167},[33,89206,89207],{"class":35,"line":101},[33,89208,92],{"emptyLinePlaceholder":91},[33,89210,89211,89213,89216],{"class":35,"line":171},[33,89212,562],{"class":163},[33,89214,89215],{"class":46}," create_text_watermark",[33,89217,7637],{"class":167},[33,89219,89220,89223,89225,89227,89230],{"class":35,"line":179},[33,89221,89222],{"class":167},"    text: ",[33,89224,1053],{"class":50},[33,89226,212],{"class":163},[33,89228,89229],{"class":54}," \"CONFIDENTIAL\"",[33,89231,247],{"class":167},[33,89233,89234,89237,89240,89242,89245],{"class":35,"line":187},[33,89235,89236],{"class":167},"    pagesize: ",[33,89238,89239],{"class":50},"tuple",[33,89241,212],{"class":163},[33,89243,89244],{"class":167}," letter,   ",[33,89246,89247],{"class":39},"# swap to A4 for European 
documents\n",[33,89249,89250,89253,89255,89257,89260,89263],{"class":35,"line":201},[33,89251,89252],{"class":167},"    alpha: ",[33,89254,1720],{"class":50},[33,89256,212],{"class":163},[33,89258,89259],{"class":50}," 0.25",[33,89261,89262],{"class":167},",        ",[33,89264,89265],{"class":39},"# 0.1 = very faint, 0.5 = noticeable\n",[33,89267,89268,89271,89273,89275,89278],{"class":35,"line":206},[33,89269,89270],{"class":167},"    font_size: ",[33,89272,1059],{"class":50},[33,89274,212],{"class":163},[33,89276,89277],{"class":50}," 52",[33,89279,247],{"class":167},[33,89281,89282],{"class":35,"line":224},[33,89283,65406],{"class":167},[33,89285,89286],{"class":35,"line":229},[33,89287,89288],{"class":54},"    \"\"\"Generate a diagonal text watermark and save to WATERMARK_PATH.\"\"\"\n",[33,89290,89291,89294,89296,89298,89300,89302,89304,89306,89308,89310],{"class":35,"line":235},[33,89292,89293],{"class":50},"    WATERMARK_PATH",[33,89295,866],{"class":167},[33,89297,869],{"class":238},[33,89299,242],{"class":163},[33,89301,855],{"class":50},[33,89303,365],{"class":167},[33,89305,878],{"class":238},[33,89307,242],{"class":163},[33,89309,855],{"class":50},[33,89311,221],{"class":167},[33,89313,89314,89317,89319],{"class":35,"line":250},[33,89315,89316],{"class":167},"    width, height ",[33,89318,242],{"class":163},[33,89320,89321],{"class":167}," pagesize\n",[33,89323,89324,89326,89328,89330,89332,89334,89336,89338,89340,89342],{"class":35,"line":266},[33,89325,28472],{"class":167},[33,89327,242],{"class":163},[33,89329,28477],{"class":167},[33,89331,1053],{"class":50},[33,89333,602],{"class":167},[33,89335,89195],{"class":50},[33,89337,18525],{"class":167},[33,89339,20091],{"class":238},[33,89341,242],{"class":163},[33,89343,89344],{"class":167},"pagesize)\n",[33,89346,89347],{"class":35,"line":290},[33,89348,89349],{"class":167},"    c.saveState()\n",[33,89351,89352],{"class":35,"line":295},[33,89353,89354],{"class":39},"    # Move origin to page centre, 
rotate 45°, draw centred string\n",[33,89356,89357,89360,89362,89364,89366,89368,89370],{"class":35,"line":300},[33,89358,89359],{"class":167},"    c.translate(width ",[33,89361,1351],{"class":163},[33,89363,7451],{"class":50},[33,89365,28528],{"class":167},[33,89367,1351],{"class":163},[33,89369,7451],{"class":50},[33,89371,221],{"class":167},[33,89373,89374,89377,89379],{"class":35,"line":317},[33,89375,89376],{"class":167},"    c.rotate(",[33,89378,82765],{"class":50},[33,89380,221],{"class":167},[33,89382,89383],{"class":35,"line":332},[33,89384,89385],{"class":167},"    c.setFillAlpha(alpha)\n",[33,89387,89388,89390,89392],{"class":35,"line":347},[33,89389,28510],{"class":167},[33,89391,19908],{"class":54},[33,89393,89394],{"class":167},", font_size)\n",[33,89396,89397,89400,89402,89404,89406,89408,89410],{"class":35,"line":374},[33,89398,89399],{"class":167},"    c.setFillColorRGB(",[33,89401,24461],{"class":50},[33,89403,365],{"class":167},[33,89405,24461],{"class":50},[33,89407,365],{"class":167},[33,89409,24461],{"class":50},[33,89411,221],{"class":167},[33,89413,89414],{"class":35,"line":397},[33,89415,89416],{"class":39},"    # drawCentredString centres on x=0 (the translated origin)\n",[33,89418,89419,89422,89424,89426,89428],{"class":35,"line":653},[33,89420,89421],{"class":167},"    c.drawCentredString(",[33,89423,748],{"class":50},[33,89425,365],{"class":167},[33,89427,748],{"class":50},[33,89429,89430],{"class":167},", text)\n",[33,89432,89433],{"class":35,"line":667},[33,89434,89435],{"class":167},"    c.restoreState()\n",[33,89437,89438],{"class":35,"line":675},[33,89439,28601],{"class":167},[33,89441,89442,89444],{"class":35,"line":689},[33,89443,1332],{"class":163},[33,89445,89446],{"class":50}," WATERMARK_PATH\n",[14,89448,89449,89450,89453],{},"Keep ",[30,89451,89452],{},"alpha"," between 0.1 and 0.4. 
Above 0.4, the overlay obscures body text on documents with light backgrounds.",[18,89455,89457],{"id":89456},"step-2-stamp-pages-with-merge_page","Step 2: Stamp Pages with merge_page()",[14,89459,89460,89463,89464,89467],{},[30,89461,89462],{},"PdfWriter.merge_page()"," composites the watermark PDF page on top of each content page using PDF transparency semantics. The watermark layer is stamped over the content; use ",[30,89465,89466],{},"merge_page(watermark, over=False)"," to push it under the content instead (useful for background logos).",[23,89469,89471],{"className":126,"code":89470,"language":47,"meta":28,"style":28},"# pip install pypdf reportlab\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\n\nINPUT_PDF  = Path(\".\u002Finput\u002Fsource.pdf\")\nOUTPUT_PDF = Path(\".\u002Foutput\u002Fwatermarked.pdf\")\nWATERMARK  = Path(\".\u002Ftmp\u002Fwatermark_template.pdf\")\n\n\ndef stamp_watermark(\n    source: Path,\n    watermark: Path,\n    output: Path,\n    under: bool = False,     # True = watermark behind content (background mode)\n) -> None:\n    \"\"\"Merge a watermark template onto every page of source PDF.\"\"\"\n    output.parent.mkdir(parents=True, exist_ok=True)\n    try:\n        wm_reader = PdfReader(watermark)\n        wm_page   = wm_reader.pages[0]\n\n        reader = PdfReader(source)\n        writer = PdfWriter()\n\n        for page in reader.pages:\n            if under:\n                # Clone watermark page, stamp content on top\n                wm_copy = PdfReader(watermark).pages[0]\n                wm_copy.merge_page(page)\n                writer.add_page(wm_copy)\n            else:\n                # Stamp watermark on top of content\n                page.merge_page(wm_page)\n                writer.add_page(page)\n\n        with open(output, \"wb\") as fh:\n            writer.write(fh)\n        print(f\"Watermarked: {output}\")\n    except FileNotFoundError as exc:\n        print(f\"Missing file: {exc}\")\n    except 
Exception as exc:\n        print(f\"Watermark failed: {exc}\")\n\n\nif __name__ == \"__main__\":\n    stamp_watermark(INPUT_PDF, WATERMARK, OUTPUT_PDF)\n",[30,89472,89473,89477,89487,89497,89501,89513,89527,89540,89544,89548,89557,89561,89566,89570,89587,89595,89600,89620,89626,89636,89650,89654,89662,89670,89674,89684,89691,89696,89710,89715,89720,89726,89731,89736,89740,89744,89760,89764,89785,89795,89816,89826,89847,89851,89855,89867],{"__ignoreMap":28},[33,89474,89475],{"class":35,"line":36},[33,89476,88684],{"class":39},[33,89478,89479,89481,89483,89485],{"class":35,"line":43},[33,89480,190],{"class":163},[33,89482,193],{"class":167},[33,89484,164],{"class":163},[33,89486,198],{"class":167},[33,89488,89489,89491,89493,89495],{"class":35,"line":61},[33,89490,190],{"class":163},[33,89492,57333],{"class":167},[33,89494,164],{"class":163},[33,89496,66892],{"class":167},[33,89498,89499],{"class":35,"line":73},[33,89500,92],{"emptyLinePlaceholder":91},[33,89502,89503,89505,89507,89509,89511],{"class":35,"line":88},[33,89504,86467],{"class":50},[33,89506,17208],{"class":163},[33,89508,215],{"class":167},[33,89510,88754],{"class":54},[33,89512,221],{"class":167},[33,89514,89515,89518,89520,89522,89525],{"class":35,"line":95},[33,89516,89517],{"class":50},"OUTPUT_PDF",[33,89519,212],{"class":163},[33,89521,215],{"class":167},[33,89523,89524],{"class":54},"\".\u002Foutput\u002Fwatermarked.pdf\"",[33,89526,221],{"class":167},[33,89528,89529,89532,89534,89536,89538],{"class":35,"line":101},[33,89530,89531],{"class":50},"WATERMARK",[33,89533,17208],{"class":163},[33,89535,215],{"class":167},[33,89537,89202],{"class":54},[33,89539,221],{"class":167},[33,89541,89542],{"class":35,"line":171},[33,89543,92],{"emptyLinePlaceholder":91},[33,89545,89546],{"class":35,"line":179},[33,89547,92],{"emptyLinePlaceholder":91},[33,89549,89550,89552,89555],{"class":35,"line":187},[33,89551,562],{"class":163},[33,89553,89554],{"class":46}," 
stamp_watermark",[33,89556,7637],{"class":167},[33,89558,89559],{"class":35,"line":201},[33,89560,79033],{"class":167},[33,89562,89563],{"class":35,"line":206},[33,89564,89565],{"class":167},"    watermark: Path,\n",[33,89567,89568],{"class":35,"line":224},[33,89569,70116],{"class":167},[33,89571,89572,89575,89577,89579,89582,89584],{"class":35,"line":229},[33,89573,89574],{"class":167},"    under: ",[33,89576,2821],{"class":50},[33,89578,212],{"class":163},[33,89580,89581],{"class":50}," False",[33,89583,25539],{"class":167},[33,89585,89586],{"class":39},"# True = watermark behind content (background mode)\n",[33,89588,89589,89591,89593],{"class":35,"line":235},[33,89590,1617],{"class":167},[33,89592,571],{"class":50},[33,89594,574],{"class":167},[33,89596,89597],{"class":35,"line":250},[33,89598,89599],{"class":54},"    \"\"\"Merge a watermark template onto every page of source PDF.\"\"\"\n",[33,89601,89602,89604,89606,89608,89610,89612,89614,89616,89618],{"class":35,"line":266},[33,89603,74932],{"class":167},[33,89605,869],{"class":238},[33,89607,242],{"class":163},[33,89609,855],{"class":50},[33,89611,365],{"class":167},[33,89613,878],{"class":238},[33,89615,242],{"class":163},[33,89617,855],{"class":50},[33,89619,221],{"class":167},[33,89621,89622,89624],{"class":35,"line":290},[33,89623,2424],{"class":163},[33,89625,574],{"class":167},[33,89627,89628,89631,89633],{"class":35,"line":295},[33,89629,89630],{"class":167},"        wm_reader ",[33,89632,242],{"class":163},[33,89634,89635],{"class":167}," PdfReader(watermark)\n",[33,89637,89638,89641,89643,89646,89648],{"class":35,"line":300},[33,89639,89640],{"class":167},"        wm_page   ",[33,89642,242],{"class":163},[33,89644,89645],{"class":167}," 
wm_reader.pages[",[33,89647,748],{"class":50},[33,89649,9202],{"class":167},[33,89651,89652],{"class":35,"line":317},[33,89653,92],{"emptyLinePlaceholder":91},[33,89655,89656,89658,89660],{"class":35,"line":332},[33,89657,62484],{"class":167},[33,89659,242],{"class":163},[33,89661,86642],{"class":167},[33,89663,89664,89666,89668],{"class":35,"line":347},[33,89665,67149],{"class":167},[33,89667,242],{"class":163},[33,89669,67154],{"class":167},[33,89671,89672],{"class":35,"line":374},[33,89673,92],{"emptyLinePlaceholder":91},[33,89675,89676,89678,89680,89682],{"class":35,"line":397},[33,89677,5973],{"class":163},[33,89679,695],{"class":167},[33,89681,662],{"class":163},[33,89683,86724],{"class":167},[33,89685,89686,89688],{"class":35,"line":653},[33,89687,5995],{"class":163},[33,89689,89690],{"class":167}," under:\n",[33,89692,89693],{"class":35,"line":667},[33,89694,89695],{"class":39},"                # Clone watermark page, stamp content on top\n",[33,89697,89698,89701,89703,89706,89708],{"class":35,"line":675},[33,89699,89700],{"class":167},"                wm_copy ",[33,89702,242],{"class":163},[33,89704,89705],{"class":167}," PdfReader(watermark).pages[",[33,89707,748],{"class":50},[33,89709,9202],{"class":167},[33,89711,89712],{"class":35,"line":689},[33,89713,89714],{"class":167},"                wm_copy.merge_page(page)\n",[33,89716,89717],{"class":35,"line":703},[33,89718,89719],{"class":167},"                writer.add_page(wm_copy)\n",[33,89721,89722,89724],{"class":35,"line":714},[33,89723,8705],{"class":163},[33,89725,574],{"class":167},[33,89727,89728],{"class":35,"line":723},[33,89729,89730],{"class":39},"                # Stamp watermark on top of content\n",[33,89732,89733],{"class":35,"line":754},[33,89734,89735],{"class":167},"                
page.merge_page(wm_page)\n",[33,89737,89738],{"class":35,"line":771},[33,89739,87587],{"class":167},[33,89741,89742],{"class":35,"line":777},[33,89743,92],{"emptyLinePlaceholder":91},[33,89745,89746,89748,89750,89752,89754,89756,89758],{"class":35,"line":788},[33,89747,2191],{"class":163},[33,89749,68213],{"class":50},[33,89751,70532],{"class":167},[33,89753,67169],{"class":54},[33,89755,1649],{"class":167},[33,89757,495],{"class":163},[33,89759,67176],{"class":167},[33,89761,89762],{"class":35,"line":804},[33,89763,67181],{"class":167},[33,89765,89766,89768,89770,89772,89775,89777,89779,89781,89783],{"class":35,"line":809},[33,89767,9414],{"class":50},[33,89769,602],{"class":167},[33,89771,4059],{"class":163},[33,89773,89774],{"class":54},"\"Watermarked: ",[33,89776,1115],{"class":50},[33,89778,70566],{"class":167},[33,89780,1121],{"class":50},[33,89782,274],{"class":54},[33,89784,221],{"class":167},[33,89786,89787,89789,89791,89793],{"class":35,"line":819},[33,89788,2449],{"class":163},[33,89790,2945],{"class":50},[33,89792,1852],{"class":163},[33,89794,1855],{"class":167},[33,89796,89797,89799,89801,89803,89806,89808,89810,89812,89814],{"class":35,"line":829},[33,89798,9414],{"class":50},[33,89800,602],{"class":167},[33,89802,4059],{"class":163},[33,89804,89805],{"class":54},"\"Missing file: ",[33,89807,1115],{"class":50},[33,89809,6565],{"class":167},[33,89811,1121],{"class":50},[33,89813,274],{"class":54},[33,89815,221],{"class":167},[33,89817,89818,89820,89822,89824],{"class":35,"line":834},[33,89819,2449],{"class":163},[33,89821,783],{"class":50},[33,89823,1852],{"class":163},[33,89825,1855],{"class":167},[33,89827,89828,89830,89832,89834,89837,89839,89841,89843,89845],{"class":35,"line":839},[33,89829,9414],{"class":50},[33,89831,602],{"class":167},[33,89833,4059],{"class":163},[33,89835,89836],{"class":54},"\"Watermark failed: 
",[33,89838,1115],{"class":50},[33,89840,6565],{"class":167},[33,89842,1121],{"class":50},[33,89844,274],{"class":54},[33,89846,221],{"class":167},[33,89848,89849],{"class":35,"line":860},[33,89850,92],{"emptyLinePlaceholder":91},[33,89852,89853],{"class":35,"line":887},[33,89854,92],{"emptyLinePlaceholder":91},[33,89856,89857,89859,89861,89863,89865],{"class":35,"line":907},[33,89858,2491],{"class":163},[33,89860,2494],{"class":50},[33,89862,2497],{"class":163},[33,89864,2500],{"class":54},[33,89866,574],{"class":167},[33,89868,89869,89872,89874,89876,89878,89880,89882],{"class":35,"line":1826},[33,89870,89871],{"class":167},"    stamp_watermark(",[33,89873,86467],{"class":50},[33,89875,365],{"class":167},[33,89877,89531],{"class":50},[33,89879,365],{"class":167},[33,89881,89517],{"class":50},[33,89883,221],{"class":167},[18,89885,89887],{"id":89886},"step-3-encrypt-with-aes-256-and-set-permission-flags","Step 3: Encrypt with AES-256 and Set Permission Flags",[14,89889,89890,89893,89894,89896,89897,89900],{},[30,89891,89892],{},"PdfWriter.encrypt()"," accepts a ",[30,89895,87306],{}," integer built from the ",[30,89898,89899],{},"pypdf.generic.PermissionFlags"," constants (or a raw bitmask). 
The owner password bypasses all restrictions; the user password enforces them.",[23,89902,89904],{"className":126,"code":89903,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nfrom pypdf.generic import PermissionFlags\n\nINPUT_PDF   = Path(\".\u002Foutput\u002Fwatermarked.pdf\")\nSECURED_PDF = Path(\".\u002Foutput\u002Fsecured.pdf\")\n\n# Bitmask: allow printing and annotations, deny content copy and modification\nPERMISSIONS = (\n    PermissionFlags.PRINT_PRINTING\n    | PermissionFlags.ADD_OR_MODIFY_ANNOTATIONS\n)\n\n\ndef encrypt_pdf(\n    source: Path,\n    output: Path,\n    user_password: str,\n    owner_password: str,\n    permissions: int = PERMISSIONS,\n    algorithm: str = \"AES-256\",\n) -> None:\n    \"\"\"\n    Encrypt source PDF and write to output.\n    user_password  — required to open\u002Fview the document\n    owner_password — grants full rights, overrides permission flags\n    \"\"\"\n    output.parent.mkdir(parents=True, exist_ok=True)\n    try:\n        reader = PdfReader(source)\n        writer = PdfWriter()\n        for page in reader.pages:\n            writer.add_page(page)\n\n        writer.encrypt(\n            user_password=user_password,\n            owner_password=owner_password,\n            permissions_flag=permissions,\n            algorithm=algorithm,      # \"AES-256\" → PDF 2.0 compliant\n        )\n\n        with open(output, \"wb\") as fh:\n            writer.write(fh)\n        print(f\"Encrypted ({algorithm}): {output}\")\n    except FileNotFoundError as exc:\n        print(f\"Source not found: {exc}\")\n    except Exception as exc:\n        print(f\"Encryption failed: {exc}\")\n\n\nif __name__ == \"__main__\":\n    import os\n    encrypt_pdf(\n        INPUT_PDF,\n        SECURED_PDF,\n        user_password=os.environ[\"PDF_USER_PW\"],    # never hardcode\n        owner_password=os.environ[\"PDF_OWNER_PW\"],\n    
)\n",[30,89905,89906,89910,89920,89930,89940,89944,89956,89969,89973,89978,89987,89993,90002,90006,90010,90014,90022,90026,90030,90038,90046,90059,90071,90079,90083,90088,90092,90097,90101,90121,90127,90135,90143,90153,90157,90161,90165,90173,90181,90191,90203,90207,90211,90227,90231,90259,90269,90290,90300,90321,90325,90329,90341,90347,90351,90357,90363,90378,90390],{"__ignoreMap":28},[33,89907,89908],{"class":35,"line":36},[33,89909,57316],{"class":39},[33,89911,89912,89914,89916,89918],{"class":35,"line":43},[33,89913,190],{"class":163},[33,89915,193],{"class":167},[33,89917,164],{"class":163},[33,89919,198],{"class":167},[33,89921,89922,89924,89926,89928],{"class":35,"line":61},[33,89923,190],{"class":163},[33,89925,57333],{"class":167},[33,89927,164],{"class":163},[33,89929,66892],{"class":167},[33,89931,89932,89934,89936,89938],{"class":35,"line":73},[33,89933,190],{"class":163},[33,89935,73414],{"class":167},[33,89937,164],{"class":163},[33,89939,87061],{"class":167},[33,89941,89942],{"class":35,"line":88},[33,89943,92],{"emptyLinePlaceholder":91},[33,89945,89946,89948,89950,89952,89954],{"class":35,"line":95},[33,89947,86467],{"class":50},[33,89949,21012],{"class":163},[33,89951,215],{"class":167},[33,89953,89524],{"class":54},[33,89955,221],{"class":167},[33,89957,89958,89960,89962,89964,89967],{"class":35,"line":101},[33,89959,86480],{"class":50},[33,89961,212],{"class":163},[33,89963,215],{"class":167},[33,89965,89966],{"class":54},"\".\u002Foutput\u002Fsecured.pdf\"",[33,89968,221],{"class":167},[33,89970,89971],{"class":35,"line":171},[33,89972,92],{"emptyLinePlaceholder":91},[33,89974,89975],{"class":35,"line":179},[33,89976,89977],{"class":39},"# Bitmask: allow printing and annotations, deny content copy and 
modification\n",[33,89979,89980,89983,89985],{"class":35,"line":187},[33,89981,89982],{"class":50},"PERMISSIONS",[33,89984,212],{"class":163},[33,89986,1415],{"class":167},[33,89988,89989,89991],{"class":35,"line":201},[33,89990,87090],{"class":167},[33,89992,87093],{"class":50},[33,89994,89995,89997,89999],{"class":35,"line":206},[33,89996,87098],{"class":163},[33,89998,87101],{"class":167},[33,90000,90001],{"class":50},"ADD_OR_MODIFY_ANNOTATIONS\n",[33,90003,90004],{"class":35,"line":224},[33,90005,221],{"class":167},[33,90007,90008],{"class":35,"line":229},[33,90009,92],{"emptyLinePlaceholder":91},[33,90011,90012],{"class":35,"line":235},[33,90013,92],{"emptyLinePlaceholder":91},[33,90015,90016,90018,90020],{"class":35,"line":250},[33,90017,562],{"class":163},[33,90019,86504],{"class":46},[33,90021,7637],{"class":167},[33,90023,90024],{"class":35,"line":266},[33,90025,79033],{"class":167},[33,90027,90028],{"class":35,"line":290},[33,90029,70116],{"class":167},[33,90031,90032,90034,90036],{"class":35,"line":295},[33,90033,86519],{"class":167},[33,90035,1053],{"class":50},[33,90037,247],{"class":167},[33,90039,90040,90042,90044],{"class":35,"line":300},[33,90041,86528],{"class":167},[33,90043,1053],{"class":50},[33,90045,247],{"class":167},[33,90047,90048,90050,90052,90054,90057],{"class":35,"line":317},[33,90049,87156],{"class":167},[33,90051,1059],{"class":50},[33,90053,212],{"class":163},[33,90055,90056],{"class":50}," PERMISSIONS",[33,90058,247],{"class":167},[33,90060,90061,90063,90065,90067,90069],{"class":35,"line":332},[33,90062,86537],{"class":167},[33,90064,1053],{"class":50},[33,90066,212],{"class":163},[33,90068,86544],{"class":54},[33,90070,247],{"class":167},[33,90072,90073,90075,90077],{"class":35,"line":347},[33,90074,1617],{"class":167},[33,90076,571],{"class":50},[33,90078,574],{"class":167},[33,90080,90081],{"class":35,"line":374},[33,90082,7673],{"class":54},[33,90084,90085],{"class":35,"line":397},[33,90086,90087],{"class":54},"    Encrypt 
source PDF and write to output.\n",[33,90089,90090],{"class":35,"line":653},[33,90091,86572],{"class":54},[33,90093,90094],{"class":35,"line":667},[33,90095,90096],{"class":54},"    owner_password — grants full rights, overrides permission flags\n",[33,90098,90099],{"class":35,"line":675},[33,90100,7673],{"class":54},[33,90102,90103,90105,90107,90109,90111,90113,90115,90117,90119],{"class":35,"line":689},[33,90104,74932],{"class":167},[33,90106,869],{"class":238},[33,90108,242],{"class":163},[33,90110,855],{"class":50},[33,90112,365],{"class":167},[33,90114,878],{"class":238},[33,90116,242],{"class":163},[33,90118,855],{"class":50},[33,90120,221],{"class":167},[33,90122,90123,90125],{"class":35,"line":703},[33,90124,2424],{"class":163},[33,90126,574],{"class":167},[33,90128,90129,90131,90133],{"class":35,"line":714},[33,90130,62484],{"class":167},[33,90132,242],{"class":163},[33,90134,86642],{"class":167},[33,90136,90137,90139,90141],{"class":35,"line":723},[33,90138,67149],{"class":167},[33,90140,242],{"class":163},[33,90142,67154],{"class":167},[33,90144,90145,90147,90149,90151],{"class":35,"line":754},[33,90146,5973],{"class":163},[33,90148,695],{"class":167},[33,90150,662],{"class":163},[33,90152,86724],{"class":167},[33,90154,90155],{"class":35,"line":771},[33,90156,86729],{"class":167},[33,90158,90159],{"class":35,"line":777},[33,90160,92],{"emptyLinePlaceholder":91},[33,90162,90163],{"class":35,"line":788},[33,90164,86743],{"class":167},[33,90166,90167,90169,90171],{"class":35,"line":804},[33,90168,86748],{"class":238},[33,90170,242],{"class":163},[33,90172,86753],{"class":167},[33,90174,90175,90177,90179],{"class":35,"line":809},[33,90176,86758],{"class":238},[33,90178,242],{"class":163},[33,90180,86763],{"class":167},[33,90182,90183,90186,90188],{"class":35,"line":819},[33,90184,90185],{"class":238},"            
permissions_flag",[33,90187,242],{"class":163},[33,90189,90190],{"class":167},"permissions,\n",[33,90192,90193,90195,90197,90200],{"class":35,"line":829},[33,90194,86768],{"class":238},[33,90196,242],{"class":163},[33,90198,90199],{"class":167},"algorithm,      ",[33,90201,90202],{"class":39},"# \"AES-256\" → PDF 2.0 compliant\n",[33,90204,90205],{"class":35,"line":834},[33,90206,5867],{"class":167},[33,90208,90209],{"class":35,"line":839},[33,90210,92],{"emptyLinePlaceholder":91},[33,90212,90213,90215,90217,90219,90221,90223,90225],{"class":35,"line":860},[33,90214,2191],{"class":163},[33,90216,68213],{"class":50},[33,90218,70532],{"class":167},[33,90220,67169],{"class":54},[33,90222,1649],{"class":167},[33,90224,495],{"class":163},[33,90226,67176],{"class":167},[33,90228,90229],{"class":35,"line":887},[33,90230,67181],{"class":167},[33,90232,90233,90235,90237,90239,90241,90243,90245,90247,90249,90251,90253,90255,90257],{"class":35,"line":907},[33,90234,9414],{"class":50},[33,90236,602],{"class":167},[33,90238,4059],{"class":163},[33,90240,86832],{"class":54},[33,90242,1115],{"class":50},[33,90244,86184],{"class":167},[33,90246,1121],{"class":50},[33,90248,86841],{"class":54},[33,90250,1115],{"class":50},[33,90252,70566],{"class":167},[33,90254,1121],{"class":50},[33,90256,274],{"class":54},[33,90258,221],{"class":167},[33,90260,90261,90263,90265,90267],{"class":35,"line":1826},[33,90262,2449],{"class":163},[33,90264,2945],{"class":50},[33,90266,1852],{"class":163},[33,90268,1855],{"class":167},[33,90270,90271,90273,90275,90277,90280,90282,90284,90286,90288],{"class":35,"line":1844},[33,90272,9414],{"class":50},[33,90274,602],{"class":167},[33,90276,4059],{"class":163},[33,90278,90279],{"class":54},"\"Source not found: 
",[33,90281,1115],{"class":50},[33,90283,6565],{"class":167},[33,90285,1121],{"class":50},[33,90287,274],{"class":54},[33,90289,221],{"class":167},[33,90291,90292,90294,90296,90298],{"class":35,"line":1858},[33,90293,2449],{"class":163},[33,90295,783],{"class":50},[33,90297,1852],{"class":163},[33,90299,1855],{"class":167},[33,90301,90302,90304,90306,90308,90311,90313,90315,90317,90319],{"class":35,"line":1871},[33,90303,9414],{"class":50},[33,90305,602],{"class":167},[33,90307,4059],{"class":163},[33,90309,90310],{"class":54},"\"Encryption failed: ",[33,90312,1115],{"class":50},[33,90314,6565],{"class":167},[33,90316,1121],{"class":50},[33,90318,274],{"class":54},[33,90320,221],{"class":167},[33,90322,90323],{"class":35,"line":1877},[33,90324,92],{"emptyLinePlaceholder":91},[33,90326,90327],{"class":35,"line":1883},[33,90328,92],{"emptyLinePlaceholder":91},[33,90330,90331,90333,90335,90337,90339],{"class":35,"line":1915},[33,90332,2491],{"class":163},[33,90334,2494],{"class":50},[33,90336,2497],{"class":163},[33,90338,2500],{"class":54},[33,90340,574],{"class":167},[33,90342,90343,90345],{"class":35,"line":1926},[33,90344,1627],{"class":163},[33,90346,176],{"class":167},[33,90348,90349],{"class":35,"line":1932},[33,90350,86931],{"class":167},[33,90352,90353,90355],{"class":35,"line":1938},[33,90354,86936],{"class":50},[33,90356,247],{"class":167},[33,90358,90359,90361],{"class":35,"line":1950},[33,90360,86943],{"class":50},[33,90362,247],{"class":167},[33,90364,90365,90367,90369,90371,90373,90375],{"class":35,"line":1958},[33,90366,86950],{"class":238},[33,90368,242],{"class":163},[33,90370,35884],{"class":167},[33,90372,86957],{"class":54},[33,90374,86960],{"class":167},[33,90376,90377],{"class":39},"# never 
hardcode\n",[33,90379,90380,90382,90384,90386,90388],{"class":35,"line":4904},[33,90381,86968],{"class":238},[33,90383,242],{"class":163},[33,90385,35884],{"class":167},[33,90387,86975],{"class":54},[33,90389,8935],{"class":167},[33,90391,90392],{"class":35,"line":4909},[33,90393,1202],{"class":167},[14,90395,90396,90397,90399],{},"Never hardcode passwords in source files. Pull them from environment variables or a secrets manager (AWS Secrets Manager, HashiCorp Vault, or even a local ",[30,90398,3585],{}," excluded from version control).",[424,90401,90403],{"id":90402},"owner-vs-user-password","Owner vs User Password",[4273,90405,90406,90419],{},[4276,90407,90408],{},[4279,90409,90410,90413,90416],{},[4282,90411,90412],{},"Password type",[4282,90414,90415],{},"Who holds it",[4282,90417,90418],{},"What it unlocks",[4292,90420,90421,90432,90443],{},[4279,90422,90423,90426,90429],{},[4297,90424,90425],{},"User password",[4297,90427,90428],{},"End recipient",[4297,90430,90431],{},"Open and view (within permission flags)",[4279,90433,90434,90437,90440],{},[4297,90435,90436],{},"Owner password",[4297,90438,90439],{},"Document author \u002F admin",[4297,90441,90442],{},"All operations; overrides every permission flag",[4279,90444,90445,90448,90451],{},[4297,90446,90447],{},"No user password",[4297,90449,90450],{},"—",[4297,90452,90453],{},"File opens without password; flags still apply to non-owner opens",[14,90455,36018,90456,90459],{},[30,90457,90458],{},"user_password=\"\""," (empty string) leaves the document openable by anyone while still enforcing permission flags and requiring the owner password for editing. 
This is a common pattern for read-only distribution.",[424,90461,90463],{"id":90462},"permission-flags-reference","Permission Flags Reference",[23,90465,90467],{"className":126,"code":90466,"language":47,"meta":28,"style":28},"from pypdf.generic import PermissionFlags\n\n# Common flag combinations\nREAD_ONLY  = 0                           # deny everything\nPRINT_ONLY = PermissionFlags.PRINT_PRINTING\nANNOTATE   = (\n    PermissionFlags.PRINT_PRINTING\n    | PermissionFlags.ADD_OR_MODIFY_ANNOTATIONS\n)\nFULL_EDIT  = (\n    PermissionFlags.PRINT_PRINTING\n    | PermissionFlags.MODIFY_CONTENTS\n    | PermissionFlags.COPY_CONTENT\n    | PermissionFlags.ADD_OR_MODIFY_ANNOTATIONS\n    | PermissionFlags.FILL_IN_EXISTING_FORM_FIELDS\n    | PermissionFlags.EXTRACT_TEXT_AND_GRAPHICS\n    | PermissionFlags.ASSEMBLE_DOCUMENT\n    | PermissionFlags.PRINT_IN_HIGH_QUALITY\n)\n",[30,90468,90469,90479,90483,90488,90500,90511,90520,90526,90534,90538,90547,90553,90562,90571,90579,90588,90597,90606,90614],{"__ignoreMap":28},[33,90470,90471,90473,90475,90477],{"class":35,"line":36},[33,90472,190],{"class":163},[33,90474,73414],{"class":167},[33,90476,164],{"class":163},[33,90478,87061],{"class":167},[33,90480,90481],{"class":35,"line":43},[33,90482,92],{"emptyLinePlaceholder":91},[33,90484,90485],{"class":35,"line":61},[33,90486,90487],{"class":39},"# Common flag combinations\n",[33,90489,90490,90493,90495,90497],{"class":35,"line":73},[33,90491,90492],{"class":50},"READ_ONLY",[33,90494,17208],{"class":163},[33,90496,10791],{"class":50},[33,90498,90499],{"class":39},"                           # deny 
everything\n",[33,90501,90502,90505,90507,90509],{"class":35,"line":88},[33,90503,90504],{"class":50},"PRINT_ONLY",[33,90506,212],{"class":163},[33,90508,87101],{"class":167},[33,90510,87093],{"class":50},[33,90512,90513,90516,90518],{"class":35,"line":95},[33,90514,90515],{"class":50},"ANNOTATE",[33,90517,21012],{"class":163},[33,90519,1415],{"class":167},[33,90521,90522,90524],{"class":35,"line":101},[33,90523,87090],{"class":167},[33,90525,87093],{"class":50},[33,90527,90528,90530,90532],{"class":35,"line":171},[33,90529,87098],{"class":163},[33,90531,87101],{"class":167},[33,90533,90001],{"class":50},[33,90535,90536],{"class":35,"line":179},[33,90537,221],{"class":167},[33,90539,90540,90543,90545],{"class":35,"line":187},[33,90541,90542],{"class":50},"FULL_EDIT",[33,90544,17208],{"class":163},[33,90546,1415],{"class":167},[33,90548,90549,90551],{"class":35,"line":201},[33,90550,87090],{"class":167},[33,90552,87093],{"class":50},[33,90554,90555,90557,90559],{"class":35,"line":206},[33,90556,87098],{"class":163},[33,90558,87101],{"class":167},[33,90560,90561],{"class":50},"MODIFY_CONTENTS\n",[33,90563,90564,90566,90568],{"class":35,"line":224},[33,90565,87098],{"class":163},[33,90567,87101],{"class":167},[33,90569,90570],{"class":50},"COPY_CONTENT\n",[33,90572,90573,90575,90577],{"class":35,"line":229},[33,90574,87098],{"class":163},[33,90576,87101],{"class":167},[33,90578,90001],{"class":50},[33,90580,90581,90583,90585],{"class":35,"line":235},[33,90582,87098],{"class":163},[33,90584,87101],{"class":167},[33,90586,90587],{"class":50},"FILL_IN_EXISTING_FORM_FIELDS\n",[33,90589,90590,90592,90594],{"class":35,"line":250},[33,90591,87098],{"class":163},[33,90593,87101],{"class":167},[33,90595,90596],{"class":50},"EXTRACT_TEXT_AND_GRAPHICS\n",[33,90598,90599,90601,90603],{"class":35,"line":266},[33,90600,87098],{"class":163},[33,90602,87101],{"class":167},[33,90604,90605],{"class":50},"ASSEMBLE_DOCUMENT\n",[33,90607,90608,90610,90612],{"class":35,"line":290},[33,90609
,87098],{"class":163},[33,90611,87101],{"class":167},[33,90613,87104],{"class":50},[33,90615,90616],{"class":35,"line":295},[33,90617,221],{"class":167},[18,90619,2709],{"id":2708},[424,90621,90623],{"id":90622},"variant-1-watermark-encrypt-in-one-pass-in-memory","Variant 1: Watermark + Encrypt in One Pass (in-memory)",[14,90625,90626,90627,90630,90631,90634],{},"Avoid writing an intermediate file by streaming through ",[30,90628,90629],{},"io.BytesIO",". This matters when ",[940,90632,90633],{"href":19001},"generating PDF reports dynamically"," and piping output directly to a secured response:",[23,90636,90638],{"className":126,"code":90637,"language":47,"meta":28,"style":28},"# pip install pypdf reportlab\nimport io, os\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nfrom reportlab.pdfgen import canvas\nfrom reportlab.lib.pagesizes import letter\n\n\ndef watermark_then_encrypt(\n    source_path: Path,\n    output_path: Path,\n    wm_text: str,\n    user_pw: str,\n    owner_pw: str,\n) -> None:\n    \"\"\"Generate watermark in-memory and encrypt in a single pipeline.\"\"\"\n    # 1. Build watermark in a BytesIO buffer\n    buf = io.BytesIO()\n    width, height = letter\n    c = canvas.Canvas(buf, pagesize=letter)\n    c.saveState()\n    c.translate(width \u002F 2, height \u002F 2)\n    c.rotate(45)\n    c.setFillAlpha(0.25)\n    c.setFont(\"Helvetica-Bold\", 52)\n    c.setFillColorRGB(0.3, 0.3, 0.3)\n    c.drawCentredString(0, 0, wm_text)\n    c.restoreState()\n    c.save()\n    buf.seek(0)\n\n    # 2. Merge watermark onto each page\n    wm_page = PdfReader(buf).pages[0]\n    reader  = PdfReader(source_path)\n    writer  = PdfWriter()\n    for page in reader.pages:\n        page.merge_page(wm_page)\n        writer.add_page(page)\n\n    # 3. 
Encrypt without touching the merged intermediate\n    writer.encrypt(\n        user_password=user_pw,\n        owner_password=owner_pw,\n        algorithm=\"AES-256\",\n    )\n\n    output_path.parent.mkdir(parents=True, exist_ok=True)\n    with open(output_path, \"wb\") as fh:\n        writer.write(fh)\n    print(f\"Done: {output_path}\")\n",[30,90639,90640,90644,90651,90661,90671,90681,90692,90696,90700,90709,90714,90718,90727,90735,90743,90751,90756,90761,90769,90777,90793,90797,90813,90821,90831,90843,90859,90872,90876,90880,90888,90892,90897,90911,90921,90930,90940,90945,90949,90953,90958,90962,90970,90978,90988,90992,90996,91016,91032,91036],{"__ignoreMap":28},[33,90641,90642],{"class":35,"line":36},[33,90643,88684],{"class":39},[33,90645,90646,90648],{"class":35,"line":43},[33,90647,164],{"class":163},[33,90649,90650],{"class":167}," io, os\n",[33,90652,90653,90655,90657,90659],{"class":35,"line":61},[33,90654,190],{"class":163},[33,90656,193],{"class":167},[33,90658,164],{"class":163},[33,90660,198],{"class":167},[33,90662,90663,90665,90667,90669],{"class":35,"line":73},[33,90664,190],{"class":163},[33,90666,57333],{"class":167},[33,90668,164],{"class":163},[33,90670,66892],{"class":167},[33,90672,90673,90675,90677,90679],{"class":35,"line":88},[33,90674,190],{"class":163},[33,90676,28221],{"class":167},[33,90678,164],{"class":163},[33,90680,28226],{"class":167},[33,90682,90683,90685,90687,90689],{"class":35,"line":95},[33,90684,190],{"class":163},[33,90686,19044],{"class":167},[33,90688,164],{"class":163},[33,90690,90691],{"class":167}," letter\n",[33,90693,90694],{"class":35,"line":101},[33,90695,92],{"emptyLinePlaceholder":91},[33,90697,90698],{"class":35,"line":171},[33,90699,92],{"emptyLinePlaceholder":91},[33,90701,90702,90704,90707],{"class":35,"line":179},[33,90703,562],{"class":163},[33,90705,90706],{"class":46}," watermark_then_encrypt",[33,90708,7637],{"class":167},[33,90710,90711],{"class":35,"line":187},[33,90712,90713],{"class":167},"    
source_path: Path,\n",[33,90715,90716],{"class":35,"line":201},[33,90717,68575],{"class":167},[33,90719,90720,90723,90725],{"class":35,"line":206},[33,90721,90722],{"class":167},"    wm_text: ",[33,90724,1053],{"class":50},[33,90726,247],{"class":167},[33,90728,90729,90731,90733],{"class":35,"line":224},[33,90730,87138],{"class":167},[33,90732,1053],{"class":50},[33,90734,247],{"class":167},[33,90736,90737,90739,90741],{"class":35,"line":229},[33,90738,87147],{"class":167},[33,90740,1053],{"class":50},[33,90742,247],{"class":167},[33,90744,90745,90747,90749],{"class":35,"line":235},[33,90746,1617],{"class":167},[33,90748,571],{"class":50},[33,90750,574],{"class":167},[33,90752,90753],{"class":35,"line":250},[33,90754,90755],{"class":54},"    \"\"\"Generate watermark in-memory and encrypt in a single pipeline.\"\"\"\n",[33,90757,90758],{"class":35,"line":266},[33,90759,90760],{"class":39},"    # 1. Build watermark in a BytesIO buffer\n",[33,90762,90763,90765,90767],{"class":35,"line":290},[33,90764,61913],{"class":167},[33,90766,242],{"class":163},[33,90768,61918],{"class":167},[33,90770,90771,90773,90775],{"class":35,"line":295},[33,90772,89316],{"class":167},[33,90774,242],{"class":163},[33,90776,90691],{"class":167},[33,90778,90779,90781,90783,90786,90788,90790],{"class":35,"line":300},[33,90780,28472],{"class":167},[33,90782,242],{"class":163},[33,90784,90785],{"class":167}," canvas.Canvas(buf, 
",[33,90787,20091],{"class":238},[33,90789,242],{"class":163},[33,90791,90792],{"class":167},"letter)\n",[33,90794,90795],{"class":35,"line":317},[33,90796,89349],{"class":167},[33,90798,90799,90801,90803,90805,90807,90809,90811],{"class":35,"line":332},[33,90800,89359],{"class":167},[33,90802,1351],{"class":163},[33,90804,7451],{"class":50},[33,90806,28528],{"class":167},[33,90808,1351],{"class":163},[33,90810,7451],{"class":50},[33,90812,221],{"class":167},[33,90814,90815,90817,90819],{"class":35,"line":347},[33,90816,89376],{"class":167},[33,90818,82765],{"class":50},[33,90820,221],{"class":167},[33,90822,90823,90826,90829],{"class":35,"line":374},[33,90824,90825],{"class":167},"    c.setFillAlpha(",[33,90827,90828],{"class":50},"0.25",[33,90830,221],{"class":167},[33,90832,90833,90835,90837,90839,90841],{"class":35,"line":397},[33,90834,28510],{"class":167},[33,90836,19908],{"class":54},[33,90838,365],{"class":167},[33,90840,49813],{"class":50},[33,90842,221],{"class":167},[33,90844,90845,90847,90849,90851,90853,90855,90857],{"class":35,"line":653},[33,90846,89399],{"class":167},[33,90848,24461],{"class":50},[33,90850,365],{"class":167},[33,90852,24461],{"class":50},[33,90854,365],{"class":167},[33,90856,24461],{"class":50},[33,90858,221],{"class":167},[33,90860,90861,90863,90865,90867,90869],{"class":35,"line":667},[33,90862,89421],{"class":167},[33,90864,748],{"class":50},[33,90866,365],{"class":167},[33,90868,748],{"class":50},[33,90870,90871],{"class":167},", wm_text)\n",[33,90873,90874],{"class":35,"line":675},[33,90875,89435],{"class":167},[33,90877,90878],{"class":35,"line":689},[33,90879,28601],{"class":167},[33,90881,90882,90884,90886],{"class":35,"line":703},[33,90883,61951],{"class":167},[33,90885,748],{"class":50},[33,90887,221],{"class":167},[33,90889,90890],{"class":35,"line":714},[33,90891,92],{"emptyLinePlaceholder":91},[33,90893,90894],{"class":35,"line":723},[33,90895,90896],{"class":39},"    # 2. 
Merge watermark onto each page\n",[33,90898,90899,90902,90904,90907,90909],{"class":35,"line":754},[33,90900,90901],{"class":167},"    wm_page ",[33,90903,242],{"class":163},[33,90905,90906],{"class":167}," PdfReader(buf).pages[",[33,90908,748],{"class":50},[33,90910,9202],{"class":167},[33,90912,90913,90916,90918],{"class":35,"line":771},[33,90914,90915],{"class":167},"    reader  ",[33,90917,242],{"class":163},[33,90919,90920],{"class":167}," PdfReader(source_path)\n",[33,90922,90923,90926,90928],{"class":35,"line":777},[33,90924,90925],{"class":167},"    writer  ",[33,90927,242],{"class":163},[33,90929,67154],{"class":167},[33,90931,90932,90934,90936,90938],{"class":35,"line":788},[33,90933,656],{"class":163},[33,90935,695],{"class":167},[33,90937,662],{"class":163},[33,90939,86724],{"class":167},[33,90941,90942],{"class":35,"line":804},[33,90943,90944],{"class":167},"        page.merge_page(wm_page)\n",[33,90946,90947],{"class":35,"line":809},[33,90948,87204],{"class":167},[33,90950,90951],{"class":35,"line":819},[33,90952,92],{"emptyLinePlaceholder":91},[33,90954,90955],{"class":35,"line":829},[33,90956,90957],{"class":39},"    # 3. 
Encrypt without touching the merged intermediate\n",[33,90959,90960],{"class":35,"line":834},[33,90961,87209],{"class":167},[33,90963,90964,90966,90968],{"class":35,"line":839},[33,90965,86950],{"class":238},[33,90967,242],{"class":163},[33,90969,87218],{"class":167},[33,90971,90972,90974,90976],{"class":35,"line":860},[33,90973,86968],{"class":238},[33,90975,242],{"class":163},[33,90977,87227],{"class":167},[33,90979,90980,90982,90984,90986],{"class":35,"line":887},[33,90981,87245],{"class":238},[33,90983,242],{"class":163},[33,90985,87250],{"class":54},[33,90987,247],{"class":167},[33,90989,90990],{"class":35,"line":907},[33,90991,1202],{"class":167},[33,90993,90994],{"class":35,"line":1826},[33,90995,92],{"emptyLinePlaceholder":91},[33,90997,90998,91000,91002,91004,91006,91008,91010,91012,91014],{"class":35,"line":1844},[33,90999,69063],{"class":167},[33,91001,869],{"class":238},[33,91003,242],{"class":163},[33,91005,855],{"class":50},[33,91007,365],{"class":167},[33,91009,878],{"class":238},[33,91011,242],{"class":163},[33,91013,855],{"class":50},[33,91015,221],{"class":167},[33,91017,91018,91020,91022,91024,91026,91028,91030],{"class":35,"line":1858},[33,91019,1635],{"class":163},[33,91021,68213],{"class":50},[33,91023,69088],{"class":167},[33,91025,67169],{"class":54},[33,91027,1649],{"class":167},[33,91029,495],{"class":163},[33,91031,67176],{"class":167},[33,91033,91034],{"class":35,"line":1871},[33,91035,87297],{"class":167},[33,91037,91038,91040,91042,91044,91046,91048,91050,91052,91054],{"class":35,"line":1877},[33,91039,7268],{"class":50},[33,91041,602],{"class":167},[33,91043,4059],{"class":163},[33,91045,22340],{"class":54},[33,91047,1115],{"class":50},[33,91049,69145],{"class":167},[33,91051,1121],{"class":50},[33,91053,274],{"class":54},[33,91055,221],{"class":167},[424,91057,91059],{"id":91058},"variant-2-rc4-legacy-mode-for-older-readers","Variant 2: RC4 Legacy Mode for Older Readers",[14,91061,91062],{},"Some embedded systems (older MFP scanners, 
kiosk PDF viewers) reject AES-256. Fall back to RC4-128 only when interoperability requires it — RC4 is cryptographically weak and should not be used for compliance:",[23,91064,91066],{"className":126,"code":91065,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\n\ndef encrypt_rc4_legacy(source: Path, output: Path, user_pw: str, owner_pw: str) -> None:\n    \"\"\"RC4-128 for legacy reader compatibility only — not for compliance use.\"\"\"\n    reader = PdfReader(source)\n    writer = PdfWriter()\n    for page in reader.pages:\n        writer.add_page(page)\n    # \"RC4-128\" is accepted by pypdf but generates a PDF 1.4-compatible dict\n    writer.encrypt(user_password=user_pw, owner_password=owner_pw, algorithm=\"RC4-128\")\n    with open(output, \"wb\") as fh:\n        writer.write(fh)\n",[30,91067,91068,91072,91082,91092,91096,91119,91124,91132,91140,91150,91154,91159,91184,91200],{"__ignoreMap":28},[33,91069,91070],{"class":35,"line":36},[33,91071,57316],{"class":39},[33,91073,91074,91076,91078,91080],{"class":35,"line":43},[33,91075,190],{"class":163},[33,91077,193],{"class":167},[33,91079,164],{"class":163},[33,91081,198],{"class":167},[33,91083,91084,91086,91088,91090],{"class":35,"line":61},[33,91085,190],{"class":163},[33,91087,57333],{"class":167},[33,91089,164],{"class":163},[33,91091,66892],{"class":167},[33,91093,91094],{"class":35,"line":73},[33,91095,92],{"emptyLinePlaceholder":91},[33,91097,91098,91100,91103,91106,91108,91111,91113,91115,91117],{"class":35,"line":88},[33,91099,562],{"class":163},[33,91101,91102],{"class":46}," encrypt_rc4_legacy",[33,91104,91105],{"class":167},"(source: Path, output: Path, user_pw: ",[33,91107,1053],{"class":50},[33,91109,91110],{"class":167},", owner_pw: ",[33,91112,1053],{"class":50},[33,91114,1617],{"class":167},[33,91116,571],{"class":50},[33,91118,574],{"class":167},[33,91120,91121],{"class":35,"line":95},[33,91122,91123],{"class":54},"    
\"\"\"RC4-128 for legacy reader compatibility only — not for compliance use.\"\"\"\n",[33,91125,91126,91128,91130],{"class":35,"line":101},[33,91127,57365],{"class":167},[33,91129,242],{"class":163},[33,91131,86642],{"class":167},[33,91133,91134,91136,91138],{"class":35,"line":171},[33,91135,68681],{"class":167},[33,91137,242],{"class":163},[33,91139,67154],{"class":167},[33,91141,91142,91144,91146,91148],{"class":35,"line":179},[33,91143,656],{"class":163},[33,91145,695],{"class":167},[33,91147,662],{"class":163},[33,91149,86724],{"class":167},[33,91151,91152],{"class":35,"line":187},[33,91153,87204],{"class":167},[33,91155,91156],{"class":35,"line":201},[33,91157,91158],{"class":39},"    # \"RC4-128\" is accepted by pypdf but generates a PDF 1.4-compatible dict\n",[33,91160,91161,91163,91165,91167,91169,91171,91173,91175,91177,91179,91182],{"class":35,"line":206},[33,91162,88360],{"class":167},[33,91164,87310],{"class":238},[33,91166,242],{"class":163},[33,91168,87599],{"class":167},[33,91170,87302],{"class":238},[33,91172,242],{"class":163},[33,91174,87606],{"class":167},[33,91176,86184],{"class":238},[33,91178,242],{"class":163},[33,91180,91181],{"class":54},"\"RC4-128\"",[33,91183,221],{"class":167},[33,91185,91186,91188,91190,91192,91194,91196,91198],{"class":35,"line":224},[33,91187,1635],{"class":163},[33,91189,68213],{"class":50},[33,91191,70532],{"class":167},[33,91193,67169],{"class":54},[33,91195,1649],{"class":167},[33,91197,495],{"class":163},[33,91199,67176],{"class":167},[33,91201,91202],{"class":35,"line":229},[33,91203,87297],{"class":167},[424,91205,91207],{"id":91206},"variant-3-image-watermark-logo-stamp","Variant 3: Image Watermark (logo stamp)",[14,91209,91210],{},"For brand logos, draw a scaled image instead of text in the ReportLab canvas:",[23,91212,91214],{"className":126,"code":91213,"language":47,"meta":28,"style":28},"# pip install reportlab Pillow\nfrom pathlib import Path\nfrom reportlab.pdfgen import canvas\nfrom 
reportlab.lib.pagesizes import letter\nfrom reportlab.lib.utils import ImageReader\n\nLOGO = Path(\".\u002Fassets\u002Flogo.png\")\nWM   = Path(\".\u002Ftmp\u002Flogo_watermark.pdf\")\n\n\ndef create_image_watermark(logo: Path = LOGO, output: Path = WM, alpha: float = 0.2) -> None:\n    width, height = letter\n    c = canvas.Canvas(str(output), pagesize=letter)\n    c.saveState()\n    c.setFillAlpha(alpha)\n    # Centre the logo; adjust width\u002Fheight as needed\n    logo_w, logo_h = 200, 80\n    c.drawImage(\n        ImageReader(str(logo)),\n        (width - logo_w) \u002F 2,\n        (height - logo_h) \u002F 2,\n        width=logo_w,\n        height=logo_h,\n        mask=\"auto\",        # honour PNG transparency\n    )\n    c.restoreState()\n    c.save()\n",[30,91215,91216,91221,91231,91241,91251,91263,91267,91281,91295,91299,91303,91341,91349,91368,91372,91376,91381,91396,91401,91411,91427,91443,91453,91463,91477,91481,91485],{"__ignoreMap":28},[33,91217,91218],{"class":35,"line":36},[33,91219,91220],{"class":39},"# pip install reportlab Pillow\n",[33,91222,91223,91225,91227,91229],{"class":35,"line":43},[33,91224,190],{"class":163},[33,91226,193],{"class":167},[33,91228,164],{"class":163},[33,91230,198],{"class":167},[33,91232,91233,91235,91237,91239],{"class":35,"line":61},[33,91234,190],{"class":163},[33,91236,28221],{"class":167},[33,91238,164],{"class":163},[33,91240,28226],{"class":167},[33,91242,91243,91245,91247,91249],{"class":35,"line":73},[33,91244,190],{"class":163},[33,91246,19044],{"class":167},[33,91248,164],{"class":163},[33,91250,90691],{"class":167},[33,91252,91253,91255,91258,91260],{"class":35,"line":88},[33,91254,190],{"class":163},[33,91256,91257],{"class":167}," reportlab.lib.utils ",[33,91259,164],{"class":163},[33,91261,91262],{"class":167}," 
ImageReader\n",[33,91264,91265],{"class":35,"line":95},[33,91266,92],{"emptyLinePlaceholder":91},[33,91268,91269,91272,91274,91276,91279],{"class":35,"line":101},[33,91270,91271],{"class":50},"LOGO",[33,91273,212],{"class":163},[33,91275,215],{"class":167},[33,91277,91278],{"class":54},"\".\u002Fassets\u002Flogo.png\"",[33,91280,221],{"class":167},[33,91282,91283,91286,91288,91290,91293],{"class":35,"line":171},[33,91284,91285],{"class":50},"WM",[33,91287,21012],{"class":163},[33,91289,215],{"class":167},[33,91291,91292],{"class":54},"\".\u002Ftmp\u002Flogo_watermark.pdf\"",[33,91294,221],{"class":167},[33,91296,91297],{"class":35,"line":179},[33,91298,92],{"emptyLinePlaceholder":91},[33,91300,91301],{"class":35,"line":187},[33,91302,92],{"emptyLinePlaceholder":91},[33,91304,91305,91307,91310,91313,91315,91318,91321,91323,91326,91329,91331,91333,91335,91337,91339],{"class":35,"line":201},[33,91306,562],{"class":163},[33,91308,91309],{"class":46}," create_image_watermark",[33,91311,91312],{"class":167},"(logo: Path ",[33,91314,242],{"class":163},[33,91316,91317],{"class":50}," LOGO",[33,91319,91320],{"class":167},", output: Path ",[33,91322,242],{"class":163},[33,91324,91325],{"class":50}," WM",[33,91327,91328],{"class":167},", alpha: ",[33,91330,1720],{"class":50},[33,91332,212],{"class":163},[33,91334,46243],{"class":50},[33,91336,1617],{"class":167},[33,91338,571],{"class":50},[33,91340,574],{"class":167},[33,91342,91343,91345,91347],{"class":35,"line":206},[33,91344,89316],{"class":167},[33,91346,242],{"class":163},[33,91348,90691],{"class":167},[33,91350,91351,91353,91355,91357,91359,91362,91364,91366],{"class":35,"line":224},[33,91352,28472],{"class":167},[33,91354,242],{"class":163},[33,91356,28477],{"class":167},[33,91358,1053],{"class":50},[33,91360,91361],{"class":167},"(output), 
",[33,91363,20091],{"class":238},[33,91365,242],{"class":163},[33,91367,90792],{"class":167},[33,91369,91370],{"class":35,"line":229},[33,91371,89349],{"class":167},[33,91373,91374],{"class":35,"line":235},[33,91375,89385],{"class":167},[33,91377,91378],{"class":35,"line":250},[33,91379,91380],{"class":39},"    # Centre the logo; adjust width\u002Fheight as needed\n",[33,91382,91383,91386,91388,91391,91393],{"class":35,"line":266},[33,91384,91385],{"class":167},"    logo_w, logo_h ",[33,91387,242],{"class":163},[33,91389,91390],{"class":50}," 200",[33,91392,365],{"class":167},[33,91394,91395],{"class":50},"80\n",[33,91397,91398],{"class":35,"line":290},[33,91399,91400],{"class":167},"    c.drawImage(\n",[33,91402,91403,91406,91408],{"class":35,"line":295},[33,91404,91405],{"class":167},"        ImageReader(",[33,91407,1053],{"class":50},[33,91409,91410],{"class":167},"(logo)),\n",[33,91412,91413,91416,91418,91421,91423,91425],{"class":35,"line":300},[33,91414,91415],{"class":167},"        (width ",[33,91417,4126],{"class":163},[33,91419,91420],{"class":167}," logo_w) ",[33,91422,1351],{"class":163},[33,91424,7451],{"class":50},[33,91426,247],{"class":167},[33,91428,91429,91432,91434,91437,91439,91441],{"class":35,"line":317},[33,91430,91431],{"class":167},"        (height ",[33,91433,4126],{"class":163},[33,91435,91436],{"class":167}," logo_h) ",[33,91438,1351],{"class":163},[33,91440,7451],{"class":50},[33,91442,247],{"class":167},[33,91444,91445,91448,91450],{"class":35,"line":332},[33,91446,91447],{"class":238},"        width",[33,91449,242],{"class":163},[33,91451,91452],{"class":167},"logo_w,\n",[33,91454,91455,91458,91460],{"class":35,"line":347},[33,91456,91457],{"class":238},"        height",[33,91459,242],{"class":163},[33,91461,91462],{"class":167},"logo_h,\n",[33,91464,91465,91468,91470,91472,91474],{"class":35,"line":374},[33,91466,91467],{"class":238},"        
mask",[33,91469,242],{"class":163},[33,91471,15565],{"class":54},[33,91473,89262],{"class":167},[33,91475,91476],{"class":39},"# honour PNG transparency\n",[33,91478,91479],{"class":35,"line":397},[33,91480,1202],{"class":167},[33,91482,91483],{"class":35,"line":653},[33,91484,89435],{"class":167},[33,91486,91487],{"class":35,"line":667},[33,91488,28601],{"class":167},[18,91490,52030],{"id":52029},[14,91492,91493],{},"After applying watermark and encryption, verify both programmatically before delivery:",[23,91495,91497],{"className":126,"code":91496,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\ndef validate_secured_pdf(path: Path, user_pw: str, expected_pages: int) -> bool:\n    \"\"\"Assert the file is encrypted, decrypts cleanly, and has correct page count.\"\"\"\n    try:\n        reader = PdfReader(path)\n\n        if not reader.is_encrypted:\n            print(f\"FAIL: {path.name} is not encrypted\")\n            return False\n\n        result = reader.decrypt(user_pw)\n        if result == 0:\n            print(f\"FAIL: wrong password for {path.name}\")\n            return False\n\n        actual = len(reader.pages)\n        if actual != expected_pages:\n            print(f\"FAIL: expected {expected_pages} pages, got {actual}\")\n            return False\n\n        print(f\"PASS: {path.name} — encrypted, {actual} pages\")\n        return True\n    except Exception as exc:\n        print(f\"ERROR: {exc}\")\n        return 
False\n",[30,91498,91499,91503,91513,91523,91527,91550,91555,91561,91569,91573,91581,91601,91607,91611,91620,91632,91653,91659,91663,91673,91683,91711,91717,91721,91750,91756,91766,91786],{"__ignoreMap":28},[33,91500,91501],{"class":35,"line":36},[33,91502,57316],{"class":39},[33,91504,91505,91507,91509,91511],{"class":35,"line":43},[33,91506,190],{"class":163},[33,91508,193],{"class":167},[33,91510,164],{"class":163},[33,91512,198],{"class":167},[33,91514,91515,91517,91519,91521],{"class":35,"line":61},[33,91516,190],{"class":163},[33,91518,57333],{"class":167},[33,91520,164],{"class":163},[33,91522,57338],{"class":167},[33,91524,91525],{"class":35,"line":73},[33,91526,92],{"emptyLinePlaceholder":91},[33,91528,91529,91531,91534,91537,91539,91542,91544,91546,91548],{"class":35,"line":88},[33,91530,562],{"class":163},[33,91532,91533],{"class":46}," validate_secured_pdf",[33,91535,91536],{"class":167},"(path: Path, user_pw: ",[33,91538,1053],{"class":50},[33,91540,91541],{"class":167},", expected_pages: ",[33,91543,1059],{"class":50},[33,91545,1617],{"class":167},[33,91547,2821],{"class":50},[33,91549,574],{"class":167},[33,91551,91552],{"class":35,"line":95},[33,91553,91554],{"class":54},"    \"\"\"Assert the file is encrypted, decrypts cleanly, and has correct page 
count.\"\"\"\n",[33,91556,91557,91559],{"class":35,"line":101},[33,91558,2424],{"class":163},[33,91560,574],{"class":167},[33,91562,91563,91565,91567],{"class":35,"line":171},[33,91564,62484],{"class":167},[33,91566,242],{"class":163},[33,91568,71334],{"class":167},[33,91570,91571],{"class":35,"line":179},[33,91572,92],{"emptyLinePlaceholder":91},[33,91574,91575,91577,91579],{"class":35,"line":187},[33,91576,8221],{"class":163},[33,91578,620],{"class":163},[33,91580,68749],{"class":167},[33,91582,91583,91585,91587,91589,91591,91593,91595,91597,91599],{"class":35,"line":201},[33,91584,9364],{"class":50},[33,91586,602],{"class":167},[33,91588,4059],{"class":163},[33,91590,70816],{"class":54},[33,91592,1115],{"class":50},[33,91594,57398],{"class":167},[33,91596,1121],{"class":50},[33,91598,87944],{"class":54},[33,91600,221],{"class":167},[33,91602,91603,91605],{"class":35,"line":206},[33,91604,28782],{"class":163},[33,91606,2903],{"class":50},[33,91608,91609],{"class":35,"line":224},[33,91610,92],{"emptyLinePlaceholder":91},[33,91612,91613,91615,91617],{"class":35,"line":229},[33,91614,87961],{"class":167},[33,91616,242],{"class":163},[33,91618,91619],{"class":167}," reader.decrypt(user_pw)\n",[33,91621,91622,91624,91626,91628,91630],{"class":35,"line":235},[33,91623,8221],{"class":163},[33,91625,68801],{"class":167},[33,91627,1865],{"class":163},[33,91629,10791],{"class":50},[33,91631,574],{"class":167},[33,91633,91634,91636,91638,91640,91643,91645,91647,91649,91651],{"class":35,"line":250},[33,91635,9364],{"class":50},[33,91637,602],{"class":167},[33,91639,4059],{"class":163},[33,91641,91642],{"class":54},"\"FAIL: wrong password for 
",[33,91644,1115],{"class":50},[33,91646,57398],{"class":167},[33,91648,1121],{"class":50},[33,91650,274],{"class":54},[33,91652,221],{"class":167},[33,91654,91655,91657],{"class":35,"line":266},[33,91656,28782],{"class":163},[33,91658,2903],{"class":50},[33,91660,91661],{"class":35,"line":290},[33,91662,92],{"emptyLinePlaceholder":91},[33,91664,91665,91667,91669,91671],{"class":35,"line":295},[33,91666,25149],{"class":167},[33,91668,242],{"class":163},[33,91670,4037],{"class":50},[33,91672,70691],{"class":167},[33,91674,91675,91677,91679,91681],{"class":35,"line":300},[33,91676,8221],{"class":163},[33,91678,25170],{"class":167},[33,91680,17877],{"class":163},[33,91682,70713],{"class":167},[33,91684,91685,91687,91689,91691,91693,91695,91697,91699,91701,91703,91705,91707,91709],{"class":35,"line":317},[33,91686,9364],{"class":50},[33,91688,602],{"class":167},[33,91690,4059],{"class":163},[33,91692,70724],{"class":54},[33,91694,1115],{"class":50},[33,91696,70729],{"class":167},[33,91698,1121],{"class":50},[33,91700,62520],{"class":54},[33,91702,1115],{"class":50},[33,91704,25201],{"class":167},[33,91706,1121],{"class":50},[33,91708,274],{"class":54},[33,91710,221],{"class":167},[33,91712,91713,91715],{"class":35,"line":332},[33,91714,28782],{"class":163},[33,91716,2903],{"class":50},[33,91718,91719],{"class":35,"line":347},[33,91720,92],{"emptyLinePlaceholder":91},[33,91722,91723,91725,91727,91729,91731,91733,91735,91737,91740,91742,91744,91746,91748],{"class":35,"line":374},[33,91724,9414],{"class":50},[33,91726,602],{"class":167},[33,91728,4059],{"class":163},[33,91730,88031],{"class":54},[33,91732,1115],{"class":50},[33,91734,57398],{"class":167},[33,91736,1121],{"class":50},[33,91738,91739],{"class":54}," — encrypted, 
",[33,91741,1115],{"class":50},[33,91743,25201],{"class":167},[33,91745,1121],{"class":50},[33,91747,77518],{"class":54},[33,91749,221],{"class":167},[33,91751,91752,91754],{"class":35,"line":397},[33,91753,1659],{"class":163},[33,91755,2887],{"class":50},[33,91757,91758,91760,91762,91764],{"class":35,"line":653},[33,91759,2449],{"class":163},[33,91761,783],{"class":50},[33,91763,1852],{"class":163},[33,91765,1855],{"class":167},[33,91767,91768,91770,91772,91774,91776,91778,91780,91782,91784],{"class":35,"line":667},[33,91769,9414],{"class":50},[33,91771,602],{"class":167},[33,91773,4059],{"class":163},[33,91775,88114],{"class":54},[33,91777,1115],{"class":50},[33,91779,6565],{"class":167},[33,91781,1121],{"class":50},[33,91783,274],{"class":54},[33,91785,221],{"class":167},[33,91787,91788,91790],{"class":35,"line":675},[33,91789,1659],{"class":163},[33,91791,2903],{"class":50},[14,91793,91794,91796],{},[30,91795,86194],{}," returns 0 on failure, 1 for user-password success, and 2 for owner-password success.",[18,91798,4209],{"id":4208},[4211,91800,91801,91807,91816,91830],{},[4214,91802,91803,91806],{},[1974,91804,91805],{},"Generate the watermark template once"," per batch, not once per file. 
A ReportLab canvas render takes ~5–10 ms; multiplied across thousands of documents it adds up.",[4214,91808,91809,91812,91813,91815],{},[1974,91810,91811],{},"Open the watermark reader once"," per batch and reuse the page object — ",[30,91814,68108],{}," is not expensive to keep open.",[4214,91817,91818,46332,91821,91823,91824,8877,91826,91829],{},[1974,91819,91820],{},"Use multiprocessing for large batches.",[30,91822,65045],{}," operations are CPU-bound pure Python and hold the GIL, so ",[30,91825,4240],{},[30,91827,91828],{},"max_workers=os.cpu_count()"," gives near-linear throughput gains.",[4214,91831,91832,91835,91836,1351,91838,91840,91841,43180,91843,91846],{},[1974,91833,91834],{},"Memory ceiling."," Each ",[30,91837,68108],{},[30,91839,70025],{}," pair holds the full page tree in memory. For files over ~200 MB, stream with ",[30,91842,65045],{},[30,91844,91845],{},"clone_reader_document_root"," or process in chunks.",[18,91848,4271],{"id":4270},[4273,91850,91851,91861],{},[4276,91852,91853],{},[4279,91854,91855,91857,91859],{},[4282,91856,14317],{},[4282,91858,4287],{},[4282,91860,4290],{},[4292,91862,91863,91881,91902,91930,91941],{},[4279,91864,91865,91870,91876],{},[4297,91866,91867],{},[30,91868,91869],{},"PdfReadError: Stream has not been decrypted",[4297,91871,91872,91873],{},"Trying to read pages from an encrypted file without calling ",[30,91874,91875],{},"decrypt()",[4297,91877,74566,91878,91880],{},[30,91879,68099],{}," immediately after opening; check the return value",[4279,91882,91883,91888,91893],{},[4297,91884,91885],{},[30,91886,91887],{},"NotImplementedError: Encryption algorithm not supported",[4297,91889,79527,91890,91892],{},[30,91891,86139],{}," (unmaintained) or a version of pypdf older than 3.0",[4297,91894,91895,88503,91897,91899,91900],{},[30,91896,88502],{},[30,91898,86139],{}," from ",[30,91901,26625],{},[4279,91903,91904,91907,91916],{},[4297,91905,91906],{},"Watermark text missing from 
output",[4297,91908,91909,91912,91913],{},[30,91910,91911],{},"merge_page"," called but transparency not set; canvas state not restored before ",[30,91914,91915],{},".save()",[4297,91917,91918,91919,36608,91922,91925,91926,91929],{},"Verify ",[30,91920,91921],{},"c.saveState()",[30,91923,91924],{},"c.restoreState()"," wrap the drawing calls; check ",[30,91927,91928],{},"setFillAlpha"," value",[4279,91931,91932,91935,91938],{},[4297,91933,91934],{},"Permission flags ignored by Adobe Acrobat",[4297,91936,91937],{},"File encrypted with identical user and owner passwords",[4297,91939,91940],{},"Always set owner and user passwords to different values; identical passwords disable flag enforcement in some readers",[4279,91942,91943,91948,91953],{},[4297,91944,91945,91947],{},[30,91946,68095],{}," on decrypt",[4297,91949,91950,91951],{},"Wrong password passed to ",[30,91952,86194],{},[4297,91954,91955,91956,91959],{},"Catch ",[30,91957,91958],{},"pypdf.errors.FileNotDecryptedError","; surface a clear message rather than swallowing the exception",[18,91961,4402],{"id":4401},[23,91963,91965],{"className":126,"code":91964,"language":47,"meta":28,"style":28},"# pip install pypdf reportlab\n\"\"\"\nsecure_pdfs.py — watermark and encrypt all PDFs in a directory.\n\nUsage:\n    PDF_USER_PW=viewer123 PDF_OWNER_PW=admin456 python secure_pdfs.py \\\n        --input .\u002Fraw --output .\u002Fsecured --text \"CONFIDENTIAL\"\n\"\"\"\nimport argparse\nimport io\nimport os\nfrom pathlib import Path\n\nfrom pypdf import PdfReader, PdfWriter\nfrom pypdf.generic import PermissionFlags\nfrom reportlab.lib.pagesizes import letter\nfrom reportlab.pdfgen import canvas\n\nPERMISSIONS = PermissionFlags.PRINT_PRINTING | PermissionFlags.ADD_OR_MODIFY_ANNOTATIONS\n\n\ndef build_watermark_buffer(text: str, alpha: float = 0.25) -> io.BytesIO:\n    buf = io.BytesIO()\n    width, height = letter\n    c = canvas.Canvas(buf, pagesize=letter)\n    c.saveState()\n    c.translate(width \u002F 2, height \u002F 
2)\n    c.rotate(45)\n    c.setFillAlpha(alpha)\n    c.setFont(\"Helvetica-Bold\", 52)\n    c.setFillColorRGB(0.3, 0.3, 0.3)\n    c.drawCentredString(0, 0, text)\n    c.restoreState()\n    c.save()\n    buf.seek(0)\n    return buf\n\n\ndef process_file(\n    source: Path,\n    output: Path,\n    wm_page,          # pre-loaded watermark page object\n    user_pw: str,\n    owner_pw: str,\n) -> bool:\n    try:\n        reader = PdfReader(source)\n        writer = PdfWriter()\n        for page in reader.pages:\n            page.merge_page(wm_page)\n            writer.add_page(page)\n        writer.encrypt(\n            user_password=user_pw,\n            owner_password=owner_pw,\n            permissions_flag=PERMISSIONS,\n            algorithm=\"AES-256\",\n        )\n        with open(output, \"wb\") as fh:\n            writer.write(fh)\n        print(f\"  secured: {output.name}\")\n        return True\n    except Exception as exc:\n        print(f\"  SKIP {source.name}: {exc}\")\n        return False\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Watermark and encrypt PDFs\")\n    parser.add_argument(\"--input\",  type=Path, default=Path(\".\u002Finput\"),  help=\"Source directory\")\n    parser.add_argument(\"--output\", type=Path, default=Path(\".\u002Foutput\"), help=\"Output directory\")\n    parser.add_argument(\"--text\",   default=\"CONFIDENTIAL\",               help=\"Watermark text\")\n    args = parser.parse_args()\n\n    user_pw  = os.environ.get(\"PDF_USER_PW\", \"\")\n    owner_pw = os.environ.get(\"PDF_OWNER_PW\")\n    if not owner_pw:\n        raise SystemExit(\"Set PDF_OWNER_PW environment variable\")\n\n    args.output.mkdir(parents=True, exist_ok=True)\n\n    # Build watermark once for the entire batch\n    wm_buf  = build_watermark_buffer(args.text)\n    wm_page = PdfReader(wm_buf).pages[0]\n\n    pdfs = sorted(args.input.glob(\"*.pdf\"))\n    print(f\"Processing {len(pdfs)} file(s) from {args.input}\")\n    ok = 
sum(\n        process_file(p, args.output \u002F f\"secure_{p.name}\", wm_page, user_pw, owner_pw)\n        for p in pdfs\n    )\n    print(f\"Done: {ok}\u002F{len(pdfs)} succeeded\")\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,91966,91967,91971,91975,91980,91984,91988,91995,92000,92004,92010,92016,92022,92032,92036,92046,92056,92066,92076,92080,92097,92101,92105,92127,92135,92143,92157,92161,92177,92185,92189,92201,92217,92229,92233,92237,92245,92251,92255,92259,92268,92272,92276,92284,92292,92300,92308,92314,92322,92330,92340,92345,92349,92353,92361,92369,92379,92389,92393,92409,92413,92435,92441,92451,92480,92486,92490,92494,92506,92523,92557,92590,92617,92625,92629,92646,92659,92668,92681,92685,92705,92709,92714,92724,92737,92741,92756,92787,92797,92819,92830,92834,92863,92867,92871,92883],{"__ignoreMap":28},[33,91968,91969],{"class":35,"line":36},[33,91970,88684],{"class":39},[33,91972,91973],{"class":35,"line":43},[33,91974,139],{"class":54},[33,91976,91977],{"class":35,"line":61},[33,91978,91979],{"class":54},"secure_pdfs.py — watermark and encrypt all PDFs in a directory.\n",[33,91981,91982],{"class":35,"line":73},[33,91983,92],{"emptyLinePlaceholder":91},[33,91985,91986],{"class":35,"line":88},[33,91987,4435],{"class":54},[33,91989,91990,91993],{"class":35,"line":95},[33,91991,91992],{"class":54},"    PDF_USER_PW=viewer123 PDF_OWNER_PW=admin456 python secure_pdfs.py ",[33,91994,22544],{"class":50},[33,91996,91997],{"class":35,"line":101},[33,91998,91999],{"class":54},"        --input .\u002Fraw --output .\u002Fsecured --text 
\"CONFIDENTIAL\"\n",[33,92001,92002],{"class":35,"line":171},[33,92003,139],{"class":54},[33,92005,92006,92008],{"class":35,"line":179},[33,92007,164],{"class":163},[33,92009,4461],{"class":167},[33,92011,92012,92014],{"class":35,"line":187},[33,92013,164],{"class":163},[33,92015,60058],{"class":167},[33,92017,92018,92020],{"class":35,"line":201},[33,92019,164],{"class":163},[33,92021,176],{"class":167},[33,92023,92024,92026,92028,92030],{"class":35,"line":206},[33,92025,190],{"class":163},[33,92027,193],{"class":167},[33,92029,164],{"class":163},[33,92031,198],{"class":167},[33,92033,92034],{"class":35,"line":224},[33,92035,92],{"emptyLinePlaceholder":91},[33,92037,92038,92040,92042,92044],{"class":35,"line":229},[33,92039,190],{"class":163},[33,92041,57333],{"class":167},[33,92043,164],{"class":163},[33,92045,66892],{"class":167},[33,92047,92048,92050,92052,92054],{"class":35,"line":235},[33,92049,190],{"class":163},[33,92051,73414],{"class":167},[33,92053,164],{"class":163},[33,92055,87061],{"class":167},[33,92057,92058,92060,92062,92064],{"class":35,"line":250},[33,92059,190],{"class":163},[33,92061,19044],{"class":167},[33,92063,164],{"class":163},[33,92065,90691],{"class":167},[33,92067,92068,92070,92072,92074],{"class":35,"line":266},[33,92069,190],{"class":163},[33,92071,28221],{"class":167},[33,92073,164],{"class":163},[33,92075,28226],{"class":167},[33,92077,92078],{"class":35,"line":290},[33,92079,92],{"emptyLinePlaceholder":91},[33,92081,92082,92084,92086,92088,92091,92093,92095],{"class":35,"line":295},[33,92083,89982],{"class":50},[33,92085,212],{"class":163},[33,92087,87101],{"class":167},[33,92089,92090],{"class":50},"PRINT_PRINTING",[33,92092,2850],{"class":163},[33,92094,87101],{"class":167},[33,92096,90001],{"class":50},[33,92098,92099],{"class":35,"line":300},[33,92100,92],{"emptyLinePlaceholder":91},[33,92102,92103],{"class":35,"line":317},[33,92104,92],{"emptyLinePlaceholder":91},[33,92106,92107,92109,92112,92114,92116,92118,92120,92122,92124],
{"class":35,"line":332},[33,92108,562],{"class":163},[33,92110,92111],{"class":46}," build_watermark_buffer",[33,92113,3423],{"class":167},[33,92115,1053],{"class":50},[33,92117,91328],{"class":167},[33,92119,1720],{"class":50},[33,92121,212],{"class":163},[33,92123,89259],{"class":50},[33,92125,92126],{"class":167},") -> io.BytesIO:\n",[33,92128,92129,92131,92133],{"class":35,"line":347},[33,92130,61913],{"class":167},[33,92132,242],{"class":163},[33,92134,61918],{"class":167},[33,92136,92137,92139,92141],{"class":35,"line":374},[33,92138,89316],{"class":167},[33,92140,242],{"class":163},[33,92142,90691],{"class":167},[33,92144,92145,92147,92149,92151,92153,92155],{"class":35,"line":397},[33,92146,28472],{"class":167},[33,92148,242],{"class":163},[33,92150,90785],{"class":167},[33,92152,20091],{"class":238},[33,92154,242],{"class":163},[33,92156,90792],{"class":167},[33,92158,92159],{"class":35,"line":653},[33,92160,89349],{"class":167},[33,92162,92163,92165,92167,92169,92171,92173,92175],{"class":35,"line":667},[33,92164,89359],{"class":167},[33,92166,1351],{"class":163},[33,92168,7451],{"class":50},[33,92170,28528],{"class":167},[33,92172,1351],{"class":163},[33,92174,7451],{"class":50},[33,92176,221],{"class":167},[33,92178,92179,92181,92183],{"class":35,"line":675},[33,92180,89376],{"class":167},[33,92182,82765],{"class":50},[33,92184,221],{"class":167},[33,92186,92187],{"class":35,"line":689},[33,92188,89385],{"class":167},[33,92190,92191,92193,92195,92197,92199],{"class":35,"line":703},[33,92192,28510],{"class":167},[33,92194,19908],{"class":54},[33,92196,365],{"class":167},[33,92198,49813],{"class":50},[33,92200,221],{"class":167},[33,92202,92203,92205,92207,92209,92211,92213,92215],{"class":35,"line":714},[33,92204,89399],{"class":167},[33,92206,24461],{"class":50},[33,92208,365],{"class":167},[33,92210,24461],{"class":50},[33,92212,365],{"class":167},[33,92214,24461],{"class":50},[33,92216,221],{"class":167},[33,92218,92219,92221,92223,92225,92227],{"class
":35,"line":723},[33,92220,89421],{"class":167},[33,92222,748],{"class":50},[33,92224,365],{"class":167},[33,92226,748],{"class":50},[33,92228,89430],{"class":167},[33,92230,92231],{"class":35,"line":754},[33,92232,89435],{"class":167},[33,92234,92235],{"class":35,"line":771},[33,92236,28601],{"class":167},[33,92238,92239,92241,92243],{"class":35,"line":777},[33,92240,61951],{"class":167},[33,92242,748],{"class":50},[33,92244,221],{"class":167},[33,92246,92247,92249],{"class":35,"line":788},[33,92248,1332],{"class":163},[33,92250,63556],{"class":167},[33,92252,92253],{"class":35,"line":804},[33,92254,92],{"emptyLinePlaceholder":91},[33,92256,92257],{"class":35,"line":809},[33,92258,92],{"emptyLinePlaceholder":91},[33,92260,92261,92263,92266],{"class":35,"line":819},[33,92262,562],{"class":163},[33,92264,92265],{"class":46}," process_file",[33,92267,7637],{"class":167},[33,92269,92270],{"class":35,"line":829},[33,92271,79033],{"class":167},[33,92273,92274],{"class":35,"line":834},[33,92275,70116],{"class":167},[33,92277,92278,92281],{"class":35,"line":839},[33,92279,92280],{"class":167},"    wm_page,          ",[33,92282,92283],{"class":39},"# pre-loaded watermark page 
object\n",[33,92285,92286,92288,92290],{"class":35,"line":860},[33,92287,87138],{"class":167},[33,92289,1053],{"class":50},[33,92291,247],{"class":167},[33,92293,92294,92296,92298],{"class":35,"line":887},[33,92295,87147],{"class":167},[33,92297,1053],{"class":50},[33,92299,247],{"class":167},[33,92301,92302,92304,92306],{"class":35,"line":907},[33,92303,1617],{"class":167},[33,92305,2821],{"class":50},[33,92307,574],{"class":167},[33,92309,92310,92312],{"class":35,"line":1826},[33,92311,2424],{"class":163},[33,92313,574],{"class":167},[33,92315,92316,92318,92320],{"class":35,"line":1844},[33,92317,62484],{"class":167},[33,92319,242],{"class":163},[33,92321,86642],{"class":167},[33,92323,92324,92326,92328],{"class":35,"line":1858},[33,92325,67149],{"class":167},[33,92327,242],{"class":163},[33,92329,67154],{"class":167},[33,92331,92332,92334,92336,92338],{"class":35,"line":1871},[33,92333,5973],{"class":163},[33,92335,695],{"class":167},[33,92337,662],{"class":163},[33,92339,86724],{"class":167},[33,92341,92342],{"class":35,"line":1877},[33,92343,92344],{"class":167},"            
page.merge_page(wm_page)\n",[33,92346,92347],{"class":35,"line":1883},[33,92348,86729],{"class":167},[33,92350,92351],{"class":35,"line":1915},[33,92352,86743],{"class":167},[33,92354,92355,92357,92359],{"class":35,"line":1926},[33,92356,86748],{"class":238},[33,92358,242],{"class":163},[33,92360,87218],{"class":167},[33,92362,92363,92365,92367],{"class":35,"line":1932},[33,92364,86758],{"class":238},[33,92366,242],{"class":163},[33,92368,87227],{"class":167},[33,92370,92371,92373,92375,92377],{"class":35,"line":1938},[33,92372,90185],{"class":238},[33,92374,242],{"class":163},[33,92376,89982],{"class":50},[33,92378,247],{"class":167},[33,92380,92381,92383,92385,92387],{"class":35,"line":1950},[33,92382,86768],{"class":238},[33,92384,242],{"class":163},[33,92386,87250],{"class":54},[33,92388,247],{"class":167},[33,92390,92391],{"class":35,"line":1958},[33,92392,5867],{"class":167},[33,92394,92395,92397,92399,92401,92403,92405,92407],{"class":35,"line":4904},[33,92396,2191],{"class":163},[33,92398,68213],{"class":50},[33,92400,70532],{"class":167},[33,92402,67169],{"class":54},[33,92404,1649],{"class":167},[33,92406,495],{"class":163},[33,92408,67176],{"class":167},[33,92410,92411],{"class":35,"line":4909},[33,92412,67181],{"class":167},[33,92414,92415,92417,92419,92421,92424,92426,92429,92431,92433],{"class":35,"line":4915},[33,92416,9414],{"class":50},[33,92418,602],{"class":167},[33,92420,4059],{"class":163},[33,92422,92423],{"class":54},"\"  secured: 
",[33,92425,1115],{"class":50},[33,92427,92428],{"class":167},"output.name",[33,92430,1121],{"class":50},[33,92432,274],{"class":54},[33,92434,221],{"class":167},[33,92436,92437,92439],{"class":35,"line":4925},[33,92438,1659],{"class":163},[33,92440,2887],{"class":50},[33,92442,92443,92445,92447,92449],{"class":35,"line":4935},[33,92444,2449],{"class":163},[33,92446,783],{"class":50},[33,92448,1852],{"class":163},[33,92450,1855],{"class":167},[33,92452,92453,92455,92457,92459,92462,92464,92466,92468,92470,92472,92474,92476,92478],{"class":35,"line":4941},[33,92454,9414],{"class":50},[33,92456,602],{"class":167},[33,92458,4059],{"class":163},[33,92460,92461],{"class":54},"\"  SKIP ",[33,92463,1115],{"class":50},[33,92465,86888],{"class":167},[33,92467,1121],{"class":50},[33,92469,2079],{"class":54},[33,92471,1115],{"class":50},[33,92473,6565],{"class":167},[33,92475,1121],{"class":50},[33,92477,274],{"class":54},[33,92479,221],{"class":167},[33,92481,92482,92484],{"class":35,"line":4950},[33,92483,1659],{"class":163},[33,92485,2903],{"class":50},[33,92487,92488],{"class":35,"line":4960},[33,92489,92],{"emptyLinePlaceholder":91},[33,92491,92492],{"class":35,"line":4965},[33,92493,92],{"emptyLinePlaceholder":91},[33,92495,92496,92498,92500,92502,92504],{"class":35,"line":4971},[33,92497,562],{"class":163},[33,92499,6636],{"class":46},[33,92501,568],{"class":167},[33,92503,571],{"class":50},[33,92505,574],{"class":167},[33,92507,92508,92510,92512,92514,92516,92518,92521],{"class":35,"line":4983},[33,92509,6648],{"class":167},[33,92511,242],{"class":163},[33,92513,6653],{"class":167},[33,92515,6656],{"class":238},[33,92517,242],{"class":163},[33,92519,92520],{"class":54},"\"Watermark and encrypt 
PDFs\"",[33,92522,221],{"class":167},[33,92524,92525,92527,92529,92531,92533,92535,92537,92539,92541,92543,92546,92548,92550,92552,92555],{"class":35,"line":4988},[33,92526,6669],{"class":167},[33,92528,6672],{"class":54},[33,92530,25480],{"class":167},[33,92532,6677],{"class":238},[33,92534,242],{"class":163},[33,92536,6682],{"class":167},[33,92538,6685],{"class":238},[33,92540,242],{"class":163},[33,92542,15641],{"class":167},[33,92544,92545],{"class":54},"\".\u002Finput\"",[33,92547,10713],{"class":167},[33,92549,25463],{"class":238},[33,92551,242],{"class":163},[33,92553,92554],{"class":54},"\"Source directory\"",[33,92556,221],{"class":167},[33,92558,92559,92561,92563,92565,92567,92569,92571,92573,92575,92577,92580,92582,92584,92586,92588],{"class":35,"line":4993},[33,92560,6669],{"class":167},[33,92562,6699],{"class":54},[33,92564,365],{"class":167},[33,92566,6677],{"class":238},[33,92568,242],{"class":163},[33,92570,6682],{"class":167},[33,92572,6685],{"class":238},[33,92574,242],{"class":163},[33,92576,15641],{"class":167},[33,92578,92579],{"class":54},"\".\u002Foutput\"",[33,92581,18525],{"class":167},[33,92583,25463],{"class":238},[33,92585,242],{"class":163},[33,92587,25501],{"class":54},[33,92589,221],{"class":167},[33,92591,92592,92594,92597,92599,92601,92603,92606,92608,92610,92612,92615],{"class":35,"line":5003},[33,92593,6669],{"class":167},[33,92595,92596],{"class":54},"\"--text\"",[33,92598,1166],{"class":167},[33,92600,6685],{"class":238},[33,92602,242],{"class":163},[33,92604,92605],{"class":54},"\"CONFIDENTIAL\"",[33,92607,1182],{"class":167},[33,92609,25463],{"class":238},[33,92611,242],{"class":163},[33,92613,92614],{"class":54},"\"Watermark 
text\"",[33,92616,221],{"class":167},[33,92618,92619,92621,92623],{"class":35,"line":5008},[33,92620,6766],{"class":167},[33,92622,242],{"class":163},[33,92624,6771],{"class":167},[33,92626,92627],{"class":35,"line":5014},[33,92628,92],{"emptyLinePlaceholder":91},[33,92630,92631,92634,92636,92638,92640,92642,92644],{"class":35,"line":5019},[33,92632,92633],{"class":167},"    user_pw  ",[33,92635,242],{"class":163},[33,92637,3129],{"class":167},[33,92639,86957],{"class":54},[33,92641,365],{"class":167},[33,92643,3198],{"class":54},[33,92645,221],{"class":167},[33,92647,92648,92651,92653,92655,92657],{"class":35,"line":5032},[33,92649,92650],{"class":167},"    owner_pw ",[33,92652,242],{"class":163},[33,92654,3129],{"class":167},[33,92656,86975],{"class":54},[33,92658,221],{"class":167},[33,92660,92661,92663,92665],{"class":35,"line":5039},[33,92662,617],{"class":163},[33,92664,620],{"class":163},[33,92666,92667],{"class":167}," owner_pw:\n",[33,92669,92670,92672,92674,92676,92679],{"class":35,"line":5068},[33,92671,4051],{"class":163},[33,92673,16617],{"class":50},[33,92675,602],{"class":167},[33,92677,92678],{"class":54},"\"Set PDF_OWNER_PW environment variable\"",[33,92680,221],{"class":167},[33,92682,92683],{"class":35,"line":5077},[33,92684,92],{"emptyLinePlaceholder":91},[33,92686,92687,92689,92691,92693,92695,92697,92699,92701,92703],{"class":35,"line":5082},[33,92688,53442],{"class":167},[33,92690,869],{"class":238},[33,92692,242],{"class":163},[33,92694,855],{"class":50},[33,92696,365],{"class":167},[33,92698,878],{"class":238},[33,92700,242],{"class":163},[33,92702,855],{"class":50},[33,92704,221],{"class":167},[33,92706,92707],{"class":35,"line":5089},[33,92708,92],{"emptyLinePlaceholder":91},[33,92710,92711],{"class":35,"line":5098},[33,92712,92713],{"class":39},"    # Build watermark once for the entire batch\n",[33,92715,92716,92719,92721],{"class":35,"line":5105},[33,92717,92718],{"class":167},"    wm_buf  
",[33,92720,242],{"class":163},[33,92722,92723],{"class":167}," build_watermark_buffer(args.text)\n",[33,92725,92726,92728,92730,92733,92735],{"class":35,"line":5110},[33,92727,90901],{"class":167},[33,92729,242],{"class":163},[33,92731,92732],{"class":167}," PdfReader(wm_buf).pages[",[33,92734,748],{"class":50},[33,92736,9202],{"class":167},[33,92738,92739],{"class":35,"line":5115},[33,92740,92],{"emptyLinePlaceholder":91},[33,92742,92743,92745,92747,92749,92752,92754],{"class":35,"line":5128},[33,92744,67695],{"class":167},[33,92746,242],{"class":163},[33,92748,28924],{"class":50},[33,92750,92751],{"class":167},"(args.input.glob(",[33,92753,610],{"class":54},[33,92755,371],{"class":167},[33,92757,92758,92760,92762,92764,92767,92769,92772,92774,92777,92779,92781,92783,92785],{"class":35,"line":5135},[33,92759,7268],{"class":50},[33,92761,602],{"class":167},[33,92763,4059],{"class":163},[33,92765,92766],{"class":54},"\"Processing ",[33,92768,4065],{"class":50},[33,92770,92771],{"class":167},"(pdfs)",[33,92773,1121],{"class":50},[33,92775,92776],{"class":54}," file(s) from ",[33,92778,1115],{"class":50},[33,92780,25634],{"class":167},[33,92782,1121],{"class":50},[33,92784,274],{"class":54},[33,92786,221],{"class":167},[33,92788,92789,92791,92793,92795],{"class":35,"line":5142},[33,92790,79390],{"class":167},[33,92792,242],{"class":163},[33,92794,46601],{"class":50},[33,92796,7637],{"class":167},[33,92798,92799,92802,92804,92806,92808,92810,92812,92814,92816],{"class":35,"line":5151},[33,92800,92801],{"class":167},"        process_file(p, args.output ",[33,92803,1351],{"class":163},[33,92805,1110],{"class":163},[33,92807,87541],{"class":54},[33,92809,1115],{"class":50},[33,92811,14190],{"class":167},[33,92813,1121],{"class":50},[33,92815,274],{"class":54},[33,92817,92818],{"class":167},", wm_page, user_pw, 
owner_pw)\n",[33,92820,92821,92823,92825,92827],{"class":35,"line":5156},[33,92822,5973],{"class":163},[33,92824,6127],{"class":167},[33,92826,662],{"class":163},[33,92828,92829],{"class":167}," pdfs\n",[33,92831,92832],{"class":35,"line":5161},[33,92833,1202],{"class":167},[33,92835,92836,92838,92840,92842,92844,92846,92848,92850,92852,92854,92856,92858,92861],{"class":35,"line":5167},[33,92837,7268],{"class":50},[33,92839,602],{"class":167},[33,92841,4059],{"class":163},[33,92843,22340],{"class":54},[33,92845,1115],{"class":50},[33,92847,87737],{"class":167},[33,92849,1121],{"class":50},[33,92851,1351],{"class":54},[33,92853,4065],{"class":50},[33,92855,92771],{"class":167},[33,92857,1121],{"class":50},[33,92859,92860],{"class":54}," succeeded\"",[33,92862,221],{"class":167},[33,92864,92865],{"class":35,"line":5172},[33,92866,92],{"emptyLinePlaceholder":91},[33,92868,92869],{"class":35,"line":5182},[33,92870,92],{"emptyLinePlaceholder":91},[33,92872,92873,92875,92877,92879,92881],{"class":35,"line":5195},[33,92874,2491],{"class":163},[33,92876,2494],{"class":50},[33,92878,2497],{"class":163},[33,92880,2500],{"class":54},[33,92882,574],{"class":167},[33,92884,92885],{"class":35,"line":5200},[33,92886,6914],{"class":167},[18,92888,6918],{"id":6917},[4211,92890,92891,92897,92902,92907],{},[4214,92892,92893,92896],{},[940,92894,86107],{"href":92895},"\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Fadd-password-protection-to-pdf-files\u002F"," — detailed AES-256 encryption workflow with validation",[4214,92898,92899,92901],{},[940,92900,88466],{"href":88465}," — decrypt an authorized PDF and save an unencrypted copy",[4214,92903,92904,92906],{},[940,92905,52682],{"href":52681}," — complete structural edits before applying security",[4214,92908,92909,92911],{},[940,92910,26191],{"href":19001}," — pipe generated output directly into the watermark+encrypt 
step",[14,92913,6947,92914,3035],{},[940,92915,6943],{"href":6942},[6953,92917,26204],{},{"title":28,"searchDepth":43,"depth":43,"links":92919},[92920,92921,92922,92923,92924,92925,92929,92934,92935,92936,92937,92938],{"id":20,"depth":43,"text":21},{"id":88706,"depth":43,"text":88707},{"id":89000,"depth":43,"text":89001},{"id":89140,"depth":43,"text":89141},{"id":89456,"depth":43,"text":89457},{"id":89886,"depth":43,"text":89887,"children":92926},[92927,92928],{"id":90402,"depth":61,"text":90403},{"id":90462,"depth":61,"text":90463},{"id":2708,"depth":43,"text":2709,"children":92930},[92931,92932,92933],{"id":90622,"depth":61,"text":90623},{"id":91058,"depth":61,"text":91059},{"id":91206,"depth":61,"text":91207},{"id":52029,"depth":43,"text":52030},{"id":4208,"depth":43,"text":4209},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":6917,"depth":43,"text":6918},"Watermarking & Security","Apply visual watermarks and AES-256 encryption to PDFs with pypdf and ReportLab. 
Covers overlay stamps, owner vs user passwords, permission flags, and batch pipelines.",{},"\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs",{"title":65967,"description":92940},"Watermarking and Securing PDFs with Python","automating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Findex",[9631,47,92947,65045,26232],"security","_aSUzSzn5mtC45rc-ClbP_gSvLg7VkuaLkSfkmEzLi0",{"id":92950,"title":88466,"body":92951,"breadcrumbTitle":95749,"canonical":6977,"date":6978,"description":95750,"draft":6980,"extension":6981,"image":6977,"meta":95751,"navigation":91,"path":95752,"robots":6977,"seo":95753,"seoTitle":95754,"stem":95755,"tags":95756,"updatedAt":6978,"__hash__":95757},"content\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Fremove-password-from-pdf-with-python\u002Findex.md",{"type":7,"value":92952,"toc":95736},[92953,92956,92965,92971,92973,92987,92993,93008,93010,93134,93147,93151,93714,93726,93730,93733,94047,94053,94057,94633,94636,94638,94641,94928,94934,94938,94947,95269,95279,95283,95292,95548,95553,95555,95666,95668,95674,95686,95701,95710,95712,95729,95733],[10,92954,88466],{"id":92955},"remove-a-password-from-a-pdf-with-python",[14,92957,92958,92959,92961,92962,92964],{},"When you open a password-protected PDF programmatically without calling ",[30,92960,86194],{},", pypdf raises ",[30,92963,91958],{}," — or silently returns an empty page tree — before you can read, merge, or process any content. The fix is to decrypt the file in-place and write a clean copy.",[14,92966,92967,92970],{},[1974,92968,92969],{},"Authorized use only."," This technique is for removing passwords from documents you own or have explicit permission to modify — files you encrypted yourself, documents issued to you with a known password, or organizational PDFs where your role grants access. 
Bypassing password protection on documents you do not own is a legal and ethical violation.",[18,92972,7021],{"id":7020},[14,92974,92975,92976,92978,92979,92982,92983,92986],{},"pypdf does not automatically prompt for a password. Opening an encrypted file succeeds (the ",[30,92977,68108],{}," object is created), but accessing ",[30,92980,92981],{},".pages"," before calling ",[30,92984,92985],{},".decrypt()"," raises:",[23,92988,92991],{"className":92989,"code":92990,"language":2000,"meta":28},[1998],"pypdf.errors.FileNotDecryptedError: File has not been decrypted\n",[30,92992,92990],{"__ignoreMap":28},[14,92994,92995,92996,92998,92999,93001,93002,93004,93005,93007],{},"If you call ",[30,92997,92985],{}," with the wrong password it returns ",[30,93000,748],{}," without raising an exception — subsequent ",[30,93003,92981],{}," access then raises ",[30,93006,68095],{}," anyway. The silent return value is the common trap.",[18,93009,35017],{"id":35016},[23,93011,93013],{"className":126,"code":93012,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\nENCRYPTED = Path(\"protected.pdf\")\n\ntry:\n    reader = PdfReader(ENCRYPTED)\n    print(f\"Encrypted: {reader.is_encrypted}\")\n    # Accessing .pages before decrypt() raises FileNotDecryptedError\n    # print(len(reader.pages))  # would raise if encrypted\nexcept FileNotFoundError:\n    print(f\"File not found: 
{ENCRYPTED}\")\n",[30,93014,93015,93019,93029,93039,93043,93057,93061,93067,93079,93099,93104,93109,93117],{"__ignoreMap":28},[33,93016,93017],{"class":35,"line":36},[33,93018,57316],{"class":39},[33,93020,93021,93023,93025,93027],{"class":35,"line":43},[33,93022,190],{"class":163},[33,93024,193],{"class":167},[33,93026,164],{"class":163},[33,93028,198],{"class":167},[33,93030,93031,93033,93035,93037],{"class":35,"line":61},[33,93032,190],{"class":163},[33,93034,57333],{"class":167},[33,93036,164],{"class":163},[33,93038,57338],{"class":167},[33,93040,93041],{"class":35,"line":73},[33,93042,92],{"emptyLinePlaceholder":91},[33,93044,93045,93048,93050,93052,93055],{"class":35,"line":88},[33,93046,93047],{"class":50},"ENCRYPTED",[33,93049,212],{"class":163},[33,93051,215],{"class":167},[33,93053,93054],{"class":54},"\"protected.pdf\"",[33,93056,221],{"class":167},[33,93058,93059],{"class":35,"line":95},[33,93060,92],{"emptyLinePlaceholder":91},[33,93062,93063,93065],{"class":35,"line":101},[33,93064,35574],{"class":163},[33,93066,574],{"class":167},[33,93068,93069,93071,93073,93075,93077],{"class":35,"line":171},[33,93070,57365],{"class":167},[33,93072,242],{"class":163},[33,93074,57370],{"class":167},[33,93076,93047],{"class":50},[33,93078,221],{"class":167},[33,93080,93081,93083,93085,93087,93089,93091,93093,93095,93097],{"class":35,"line":179},[33,93082,7268],{"class":50},[33,93084,602],{"class":167},[33,93086,4059],{"class":163},[33,93088,86314],{"class":54},[33,93090,1115],{"class":50},[33,93092,75937],{"class":167},[33,93094,1121],{"class":50},[33,93096,274],{"class":54},[33,93098,221],{"class":167},[33,93100,93101],{"class":35,"line":187},[33,93102,93103],{"class":39},"    # Accessing .pages before decrypt() raises FileNotDecryptedError\n",[33,93105,93106],{"class":35,"line":201},[33,93107,93108],{"class":39},"    # print(len(reader.pages))  # would raise if 
encrypted\n",[33,93110,93111,93113,93115],{"class":35,"line":206},[33,93112,35726],{"class":163},[33,93114,2945],{"class":50},[33,93116,574],{"class":167},[33,93118,93119,93121,93123,93125,93127,93130,93132],{"class":35,"line":224},[33,93120,7268],{"class":50},[33,93122,602],{"class":167},[33,93124,4059],{"class":163},[33,93126,15677],{"class":54},[33,93128,93129],{"class":50},"{ENCRYPTED}",[33,93131,274],{"class":54},[33,93133,221],{"class":167},[14,93135,41963,93136,4348,93138,93140,93141,93143,93144,93146],{},[30,93137,86397],{},[30,93139,855],{}," the file needs ",[30,93142,91875],{}," before any page-level operation. If it is ",[30,93145,902],{},", no password removal is required — the file is already open.",[18,93148,93150],{"id":93149},"fix-decrypt-and-save-a-clean-copy","Fix: Decrypt and Save a Clean Copy",[23,93152,93154],{"className":126,"code":93153,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nfrom pypdf.errors import FileNotDecryptedError\n\nENCRYPTED   = Path(\"protected.pdf\")\nDECRYPTED   = Path(\"decrypted.pdf\")\nPASSWORD    = \"your_password_here\"   # replace with the actual password\n\n\ndef remove_pdf_password(\n    source: Path,\n    output: Path,\n    password: str,\n) -> None:\n    \"\"\"\n    Open an encrypted PDF, decrypt it, and write an unencrypted copy.\n    Only use this on documents you own or are authorized to access.\n    \"\"\"\n    if not source.exists():\n        raise FileNotFoundError(f\"Source not found: {source}\")\n\n    try:\n        reader = PdfReader(source)\n\n        if not reader.is_encrypted:\n            # Already unencrypted — copy as-is\n            print(f\"{source.name} is not encrypted; copying without changes\")\n            writer = PdfWriter()\n            for page in reader.pages:\n                writer.add_page(page)\n        else:\n            # Decrypt returns: 0 = wrong password, 1 = user pw, 2 = owner pw\n          
  result = reader.decrypt(password)   # call decrypt() before .pages\n            if result == 0:\n                raise ValueError(f\"Incorrect password for {source.name}\")\n\n            # Copy pages into a fresh writer — new writer has no encryption dict\n            writer = PdfWriter()\n            for page in reader.pages:\n                writer.add_page(page)           # pages are now accessible\n\n        output.parent.mkdir(parents=True, exist_ok=True)\n        with open(output, \"wb\") as fh:\n            writer.write(fh)                    # output file has no encryption\n        print(f\"Decrypted: {output}\")\n\n    except FileNotDecryptedError:\n        # Raised if .pages is accessed before a successful decrypt()\n        print(\"FileNotDecryptedError: call reader.decrypt(password) before reading pages\")\n        raise\n    except ValueError:\n        raise\n    except Exception as exc:\n        raise RuntimeError(f\"Failed to remove password from {source.name}: {exc}\") from exc\n\n\nif __name__ == \"__main__\":\n    import os\n    remove_pdf_password(\n        ENCRYPTED,\n        DECRYPTED,\n        password=os.environ[\"PDF_USER_PW\"],   # pull from environment, never hardcode\n    
)\n",[30,93155,93156,93160,93170,93180,93190,93194,93206,93220,93233,93237,93241,93250,93254,93258,93266,93274,93278,93283,93288,93292,93300,93322,93326,93332,93340,93344,93352,93357,93378,93386,93396,93400,93406,93411,93423,93435,93458,93462,93467,93475,93485,93493,93497,93517,93533,93541,93562,93566,93572,93577,93588,93592,93600,93604,93614,93649,93653,93657,93669,93675,93680,93687,93694,93710],{"__ignoreMap":28},[33,93157,93158],{"class":35,"line":36},[33,93159,86432],{"class":39},[33,93161,93162,93164,93166,93168],{"class":35,"line":43},[33,93163,190],{"class":163},[33,93165,193],{"class":167},[33,93167,164],{"class":163},[33,93169,198],{"class":167},[33,93171,93172,93174,93176,93178],{"class":35,"line":61},[33,93173,190],{"class":163},[33,93175,57333],{"class":167},[33,93177,164],{"class":163},[33,93179,66892],{"class":167},[33,93181,93182,93184,93186,93188],{"class":35,"line":73},[33,93183,190],{"class":163},[33,93185,68145],{"class":167},[33,93187,164],{"class":163},[33,93189,73152],{"class":167},[33,93191,93192],{"class":35,"line":88},[33,93193,92],{"emptyLinePlaceholder":91},[33,93195,93196,93198,93200,93202,93204],{"class":35,"line":95},[33,93197,93047],{"class":50},[33,93199,21012],{"class":163},[33,93201,215],{"class":167},[33,93203,93054],{"class":54},[33,93205,221],{"class":167},[33,93207,93208,93211,93213,93215,93218],{"class":35,"line":101},[33,93209,93210],{"class":50},"DECRYPTED",[33,93212,21012],{"class":163},[33,93214,215],{"class":167},[33,93216,93217],{"class":54},"\"decrypted.pdf\"",[33,93219,221],{"class":167},[33,93221,93222,93225,93227,93230],{"class":35,"line":171},[33,93223,93224],{"class":50},"PASSWORD",[33,93226,20470],{"class":163},[33,93228,93229],{"class":54}," \"your_password_here\"",[33,93231,93232],{"class":39},"   # replace with the actual 
password\n",[33,93234,93235],{"class":35,"line":179},[33,93236,92],{"emptyLinePlaceholder":91},[33,93238,93239],{"class":35,"line":187},[33,93240,92],{"emptyLinePlaceholder":91},[33,93242,93243,93245,93248],{"class":35,"line":201},[33,93244,562],{"class":163},[33,93246,93247],{"class":46}," remove_pdf_password",[33,93249,7637],{"class":167},[33,93251,93252],{"class":35,"line":206},[33,93253,79033],{"class":167},[33,93255,93256],{"class":35,"line":224},[33,93257,70116],{"class":167},[33,93259,93260,93262,93264],{"class":35,"line":229},[33,93261,68580],{"class":167},[33,93263,1053],{"class":50},[33,93265,247],{"class":167},[33,93267,93268,93270,93272],{"class":35,"line":235},[33,93269,1617],{"class":167},[33,93271,571],{"class":50},[33,93273,574],{"class":167},[33,93275,93276],{"class":35,"line":250},[33,93277,7673],{"class":54},[33,93279,93280],{"class":35,"line":266},[33,93281,93282],{"class":54},"    Open an encrypted PDF, decrypt it, and write an unencrypted copy.\n",[33,93284,93285],{"class":35,"line":290},[33,93286,93287],{"class":54},"    Only use this on documents you own or are authorized to 
access.\n",[33,93289,93290],{"class":35,"line":295},[33,93291,7673],{"class":54},[33,93293,93294,93296,93298],{"class":35,"line":300},[33,93295,617],{"class":163},[33,93297,620],{"class":163},[33,93299,86600],{"class":167},[33,93301,93302,93304,93306,93308,93310,93312,93314,93316,93318,93320],{"class":35,"line":317},[33,93303,4051],{"class":163},[33,93305,2945],{"class":50},[33,93307,602],{"class":167},[33,93309,4059],{"class":163},[33,93311,90279],{"class":54},[33,93313,1115],{"class":50},[33,93315,64],{"class":167},[33,93317,1121],{"class":50},[33,93319,274],{"class":54},[33,93321,221],{"class":167},[33,93323,93324],{"class":35,"line":332},[33,93325,92],{"emptyLinePlaceholder":91},[33,93327,93328,93330],{"class":35,"line":347},[33,93329,2424],{"class":163},[33,93331,574],{"class":167},[33,93333,93334,93336,93338],{"class":35,"line":374},[33,93335,62484],{"class":167},[33,93337,242],{"class":163},[33,93339,86642],{"class":167},[33,93341,93342],{"class":35,"line":397},[33,93343,92],{"emptyLinePlaceholder":91},[33,93345,93346,93348,93350],{"class":35,"line":653},[33,93347,8221],{"class":163},[33,93349,620],{"class":163},[33,93351,68749],{"class":167},[33,93353,93354],{"class":35,"line":667},[33,93355,93356],{"class":39},"            # Already unencrypted — copy as-is\n",[33,93358,93359,93361,93363,93365,93367,93369,93371,93373,93376],{"class":35,"line":675},[33,93360,9364],{"class":50},[33,93362,602],{"class":167},[33,93364,4059],{"class":163},[33,93366,274],{"class":54},[33,93368,1115],{"class":50},[33,93370,86888],{"class":167},[33,93372,1121],{"class":50},[33,93374,93375],{"class":54}," is not encrypted; copying without 
changes\"",[33,93377,221],{"class":167},[33,93379,93380,93382,93384],{"class":35,"line":689},[33,93381,70275],{"class":167},[33,93383,242],{"class":163},[33,93385,67154],{"class":167},[33,93387,93388,93390,93392,93394],{"class":35,"line":703},[33,93389,1793],{"class":163},[33,93391,695],{"class":167},[33,93393,662],{"class":163},[33,93395,86724],{"class":167},[33,93397,93398],{"class":35,"line":714},[33,93399,87587],{"class":167},[33,93401,93402,93404],{"class":35,"line":723},[33,93403,41290],{"class":163},[33,93405,574],{"class":167},[33,93407,93408],{"class":35,"line":754},[33,93409,93410],{"class":39},"            # Decrypt returns: 0 = wrong password, 1 = user pw, 2 = owner pw\n",[33,93412,93413,93415,93417,93420],{"class":35,"line":771},[33,93414,86662],{"class":167},[33,93416,242],{"class":163},[33,93418,93419],{"class":167}," reader.decrypt(password)   ",[33,93421,93422],{"class":39},"# call decrypt() before .pages\n",[33,93424,93425,93427,93429,93431,93433],{"class":35,"line":777},[33,93426,5995],{"class":163},[33,93428,68801],{"class":167},[33,93430,1865],{"class":163},[33,93432,10791],{"class":50},[33,93434,574],{"class":167},[33,93436,93437,93439,93441,93443,93445,93448,93450,93452,93454,93456],{"class":35,"line":788},[33,93438,16804],{"class":163},[33,93440,4054],{"class":50},[33,93442,602],{"class":167},[33,93444,4059],{"class":163},[33,93446,93447],{"class":54},"\"Incorrect password for ",[33,93449,1115],{"class":50},[33,93451,86888],{"class":167},[33,93453,1121],{"class":50},[33,93455,274],{"class":54},[33,93457,221],{"class":167},[33,93459,93460],{"class":35,"line":804},[33,93461,92],{"emptyLinePlaceholder":91},[33,93463,93464],{"class":35,"line":809},[33,93465,93466],{"class":39},"            # Copy pages into a fresh writer — new writer has no encryption 
dict\n",[33,93468,93469,93471,93473],{"class":35,"line":819},[33,93470,70275],{"class":167},[33,93472,242],{"class":163},[33,93474,67154],{"class":167},[33,93476,93477,93479,93481,93483],{"class":35,"line":829},[33,93478,1793],{"class":163},[33,93480,695],{"class":167},[33,93482,662],{"class":163},[33,93484,86724],{"class":167},[33,93486,93487,93490],{"class":35,"line":834},[33,93488,93489],{"class":167},"                writer.add_page(page)           ",[33,93491,93492],{"class":39},"# pages are now accessible\n",[33,93494,93495],{"class":35,"line":839},[33,93496,92],{"emptyLinePlaceholder":91},[33,93498,93499,93501,93503,93505,93507,93509,93511,93513,93515],{"class":35,"line":860},[33,93500,70507],{"class":167},[33,93502,869],{"class":238},[33,93504,242],{"class":163},[33,93506,855],{"class":50},[33,93508,365],{"class":167},[33,93510,878],{"class":238},[33,93512,242],{"class":163},[33,93514,855],{"class":50},[33,93516,221],{"class":167},[33,93518,93519,93521,93523,93525,93527,93529,93531],{"class":35,"line":887},[33,93520,2191],{"class":163},[33,93522,68213],{"class":50},[33,93524,70532],{"class":167},[33,93526,67169],{"class":54},[33,93528,1649],{"class":167},[33,93530,495],{"class":163},[33,93532,67176],{"class":167},[33,93534,93535,93538],{"class":35,"line":907},[33,93536,93537],{"class":167},"            writer.write(fh)                    ",[33,93539,93540],{"class":39},"# output file has no encryption\n",[33,93542,93543,93545,93547,93549,93552,93554,93556,93558,93560],{"class":35,"line":1826},[33,93544,9414],{"class":50},[33,93546,602],{"class":167},[33,93548,4059],{"class":163},[33,93550,93551],{"class":54},"\"Decrypted: 
",[33,93553,1115],{"class":50},[33,93555,70566],{"class":167},[33,93557,1121],{"class":50},[33,93559,274],{"class":54},[33,93561,221],{"class":167},[33,93563,93564],{"class":35,"line":1844},[33,93565,92],{"emptyLinePlaceholder":91},[33,93567,93568,93570],{"class":35,"line":1858},[33,93569,2449],{"class":163},[33,93571,88069],{"class":167},[33,93573,93574],{"class":35,"line":1871},[33,93575,93576],{"class":39},"        # Raised if .pages is accessed before a successful decrypt()\n",[33,93578,93579,93581,93583,93586],{"class":35,"line":1877},[33,93580,9414],{"class":50},[33,93582,602],{"class":167},[33,93584,93585],{"class":54},"\"FileNotDecryptedError: call reader.decrypt(password) before reading pages\"",[33,93587,221],{"class":167},[33,93589,93590],{"class":35,"line":1883},[33,93591,65922],{"class":163},[33,93593,93594,93596,93598],{"class":35,"line":1915},[33,93595,2449],{"class":163},[33,93597,4054],{"class":50},[33,93599,574],{"class":167},[33,93601,93602],{"class":35,"line":1926},[33,93603,65922],{"class":163},[33,93605,93606,93608,93610,93612],{"class":35,"line":1932},[33,93607,2449],{"class":163},[33,93609,783],{"class":50},[33,93611,1852],{"class":163},[33,93613,1855],{"class":167},[33,93615,93616,93618,93620,93622,93624,93627,93629,93631,93633,93635,93637,93639,93641,93643,93645,93647],{"class":35,"line":1938},[33,93617,4051],{"class":163},[33,93619,7590],{"class":50},[33,93621,602],{"class":167},[33,93623,4059],{"class":163},[33,93625,93626],{"class":54},"\"Failed to remove password from 
",[33,93628,1115],{"class":50},[33,93630,86888],{"class":167},[33,93632,1121],{"class":50},[33,93634,2079],{"class":54},[33,93636,1115],{"class":50},[33,93638,6565],{"class":167},[33,93640,1121],{"class":50},[33,93642,274],{"class":54},[33,93644,1649],{"class":167},[33,93646,190],{"class":163},[33,93648,20843],{"class":167},[33,93650,93651],{"class":35,"line":1950},[33,93652,92],{"emptyLinePlaceholder":91},[33,93654,93655],{"class":35,"line":1958},[33,93656,92],{"emptyLinePlaceholder":91},[33,93658,93659,93661,93663,93665,93667],{"class":35,"line":4904},[33,93660,2491],{"class":163},[33,93662,2494],{"class":50},[33,93664,2497],{"class":163},[33,93666,2500],{"class":54},[33,93668,574],{"class":167},[33,93670,93671,93673],{"class":35,"line":4909},[33,93672,1627],{"class":163},[33,93674,176],{"class":167},[33,93676,93677],{"class":35,"line":4915},[33,93678,93679],{"class":167},"    remove_pdf_password(\n",[33,93681,93682,93685],{"class":35,"line":4925},[33,93683,93684],{"class":50},"        ENCRYPTED",[33,93686,247],{"class":167},[33,93688,93689,93692],{"class":35,"line":4935},[33,93690,93691],{"class":50},"        DECRYPTED",[33,93693,247],{"class":167},[33,93695,93696,93699,93701,93703,93705,93707],{"class":35,"line":4941},[33,93697,93698],{"class":238},"        password",[33,93700,242],{"class":163},[33,93702,35884],{"class":167},[33,93704,86957],{"class":54},[33,93706,13424],{"class":167},[33,93708,93709],{"class":39},"# pull from environment, never hardcode\n",[33,93711,93712],{"class":35,"line":4950},[33,93713,1202],{"class":167},[14,93715,93716,93717,93719,93720,93722,93723,93725],{},"The key line is ",[30,93718,68099],{}," called immediately after opening. The returned integer tells you which type of password matched — always check it before proceeding. 
Copying pages into a fresh ",[30,93721,70025],{}," (one that has never had ",[30,93724,86146],{}," called on it) produces a file with no encryption dictionary.",[18,93727,93729],{"id":93728},"variant-check-encryption-algorithm-before-decrypting","Variant: Check Encryption Algorithm Before Decrypting",[14,93731,93732],{},"pypdf exposes the encryption metadata before decryption, which lets you log the cipher in use or branch on AES vs RC4:",[23,93734,93736],{"className":126,"code":93735,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\nENCRYPTED = Path(\"protected.pdf\")\n\n\ndef inspect_encryption(source: Path) -> None:\n    reader = PdfReader(source)\n    if not reader.is_encrypted:\n        print(\"Not encrypted\")\n        return\n\n    # ._encryption is an internal attribute; read-only inspection is fine\n    enc = reader._encryption\n    if enc:\n        print(f\"Filter     : {enc.entry.get('\u002FFilter', 'unknown')}\")\n        print(f\"V (version): {enc.entry.get('\u002FV', '?')}\")\n        print(f\"Length     : {enc.entry.get('\u002FLength', '?')} bits\")\n        # V=4 or V=5 → AES; V=1 or V=2 → RC4\n        v = enc.entry.get(\"\u002FV\", 0)\n        cipher = \"AES\" if v >= 4 else \"RC4\"\n        print(f\"Cipher     : {cipher} (V={v})\")\n    else:\n        print(\"Encryption metadata unavailable without 
decryption\")\n",[30,93737,93738,93742,93752,93762,93766,93778,93782,93786,93800,93808,93816,93827,93831,93835,93840,93850,93857,93889,93920,93951,93956,93975,93999,94030,94036],{"__ignoreMap":28},[33,93739,93740],{"class":35,"line":36},[33,93741,86432],{"class":39},[33,93743,93744,93746,93748,93750],{"class":35,"line":43},[33,93745,190],{"class":163},[33,93747,193],{"class":167},[33,93749,164],{"class":163},[33,93751,198],{"class":167},[33,93753,93754,93756,93758,93760],{"class":35,"line":61},[33,93755,190],{"class":163},[33,93757,57333],{"class":167},[33,93759,164],{"class":163},[33,93761,57338],{"class":167},[33,93763,93764],{"class":35,"line":73},[33,93765,92],{"emptyLinePlaceholder":91},[33,93767,93768,93770,93772,93774,93776],{"class":35,"line":88},[33,93769,93047],{"class":50},[33,93771,212],{"class":163},[33,93773,215],{"class":167},[33,93775,93054],{"class":54},[33,93777,221],{"class":167},[33,93779,93780],{"class":35,"line":95},[33,93781,92],{"emptyLinePlaceholder":91},[33,93783,93784],{"class":35,"line":101},[33,93785,92],{"emptyLinePlaceholder":91},[33,93787,93788,93790,93793,93796,93798],{"class":35,"line":171},[33,93789,562],{"class":163},[33,93791,93792],{"class":46}," inspect_encryption",[33,93794,93795],{"class":167},"(source: Path) -> ",[33,93797,571],{"class":50},[33,93799,574],{"class":167},[33,93801,93802,93804,93806],{"class":35,"line":179},[33,93803,57365],{"class":167},[33,93805,242],{"class":163},[33,93807,86642],{"class":167},[33,93809,93810,93812,93814],{"class":35,"line":187},[33,93811,617],{"class":163},[33,93813,620],{"class":163},[33,93815,68749],{"class":167},[33,93817,93818,93820,93822,93825],{"class":35,"line":201},[33,93819,9414],{"class":50},[33,93821,602],{"class":167},[33,93823,93824],{"class":54},"\"Not 
encrypted\"",[33,93826,221],{"class":167},[33,93828,93829],{"class":35,"line":206},[33,93830,646],{"class":163},[33,93832,93833],{"class":35,"line":224},[33,93834,92],{"emptyLinePlaceholder":91},[33,93836,93837],{"class":35,"line":229},[33,93838,93839],{"class":39},"    # ._encryption is an internal attribute; read-only inspection is fine\n",[33,93841,93842,93845,93847],{"class":35,"line":235},[33,93843,93844],{"class":167},"    enc ",[33,93846,242],{"class":163},[33,93848,93849],{"class":167}," reader._encryption\n",[33,93851,93852,93854],{"class":35,"line":250},[33,93853,617],{"class":163},[33,93855,93856],{"class":167}," enc:\n",[33,93858,93859,93861,93863,93865,93868,93870,93873,93876,93878,93881,93883,93885,93887],{"class":35,"line":266},[33,93860,9414],{"class":50},[33,93862,602],{"class":167},[33,93864,4059],{"class":163},[33,93866,93867],{"class":54},"\"Filter     : ",[33,93869,1115],{"class":50},[33,93871,93872],{"class":167},"enc.entry.get(",[33,93874,93875],{"class":54},"'\u002FFilter'",[33,93877,365],{"class":167},[33,93879,93880],{"class":54},"'unknown'",[33,93882,12027],{"class":167},[33,93884,1121],{"class":50},[33,93886,274],{"class":54},[33,93888,221],{"class":167},[33,93890,93891,93893,93895,93897,93900,93902,93904,93907,93909,93912,93914,93916,93918],{"class":35,"line":290},[33,93892,9414],{"class":50},[33,93894,602],{"class":167},[33,93896,4059],{"class":163},[33,93898,93899],{"class":54},"\"V (version): ",[33,93901,1115],{"class":50},[33,93903,93872],{"class":167},[33,93905,93906],{"class":54},"'\u002FV'",[33,93908,365],{"class":167},[33,93910,93911],{"class":54},"'?'",[33,93913,12027],{"class":167},[33,93915,1121],{"class":50},[33,93917,274],{"class":54},[33,93919,221],{"class":167},[33,93921,93922,93924,93926,93928,93931,93933,93935,93938,93940,93942,93944,93946,93949],{"class":35,"line":295},[33,93923,9414],{"class":50},[33,93925,602],{"class":167},[33,93927,4059],{"class":163},[33,93929,93930],{"class":54},"\"Length     : 
",[33,93932,1115],{"class":50},[33,93934,93872],{"class":167},[33,93936,93937],{"class":54},"'\u002FLength'",[33,93939,365],{"class":167},[33,93941,93911],{"class":54},[33,93943,12027],{"class":167},[33,93945,1121],{"class":50},[33,93947,93948],{"class":54}," bits\"",[33,93950,221],{"class":167},[33,93952,93953],{"class":35,"line":300},[33,93954,93955],{"class":39},"        # V=4 or V=5 → AES; V=1 or V=2 → RC4\n",[33,93957,93958,93961,93963,93966,93969,93971,93973],{"class":35,"line":317},[33,93959,93960],{"class":167},"        v ",[33,93962,242],{"class":163},[33,93964,93965],{"class":167}," enc.entry.get(",[33,93967,93968],{"class":54},"\"\u002FV\"",[33,93970,365],{"class":167},[33,93972,748],{"class":50},[33,93974,221],{"class":167},[33,93976,93977,93980,93982,93985,93987,93990,93992,93994,93996],{"class":35,"line":332},[33,93978,93979],{"class":167},"        cipher ",[33,93981,242],{"class":163},[33,93983,93984],{"class":54}," \"AES\"",[33,93986,9994],{"class":163},[33,93988,93989],{"class":167}," v ",[33,93991,43000],{"class":163},[33,93993,82708],{"class":50},[33,93995,15715],{"class":163},[33,93997,93998],{"class":54}," \"RC4\"\n",[33,94000,94001,94003,94005,94007,94010,94012,94015,94017,94020,94022,94024,94026,94028],{"class":35,"line":347},[33,94002,9414],{"class":50},[33,94004,602],{"class":167},[33,94006,4059],{"class":163},[33,94008,94009],{"class":54},"\"Cipher     : ",[33,94011,1115],{"class":50},[33,94013,94014],{"class":167},"cipher",[33,94016,1121],{"class":50},[33,94018,94019],{"class":54}," (V=",[33,94021,1115],{"class":50},[33,94023,61854],{"class":167},[33,94025,1121],{"class":50},[33,94027,72406],{"class":54},[33,94029,221],{"class":167},[33,94031,94032,94034],{"class":35,"line":374},[33,94033,6864],{"class":163},[33,94035,574],{"class":167},[33,94037,94038,94040,94042,94045],{"class":35,"line":397},[33,94039,9414],{"class":50},[33,94041,602],{"class":167},[33,94043,94044],{"class":54},"\"Encryption metadata unavailable without 
decryption\"",[33,94046,221],{"class":167},[14,94048,94049,94050,94052],{},"RC4-encrypted files (V=1 or V=2) decrypt with the same ",[30,94051,86194],{}," call — pypdf handles both cipher types transparently. The distinction matters if you are auditing legacy documents for compliance.",[18,94054,94056],{"id":94055},"variant-batch-decrypt-a-directory","Variant: Batch Decrypt a Directory",[23,94058,94060],{"className":126,"code":94059,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nimport os\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nfrom pypdf.errors import FileNotDecryptedError\n\nINPUT_DIR  = Path(\".\u002Flocked\")\nOUTPUT_DIR = Path(\".\u002Funlocked\")\n\n\ndef batch_decrypt(source_dir: Path, output_dir: Path, password: str) -> None:\n    output_dir.mkdir(parents=True, exist_ok=True)\n    pdfs = sorted(source_dir.glob(\"*.pdf\"))\n    if not pdfs:\n        print(f\"No PDFs in {source_dir}\")\n        return\n\n    ok, skipped, failed = 0, 0, 0\n    for pdf in pdfs:\n        out = output_dir \u002F pdf.name\n        try:\n            reader = PdfReader(pdf)\n            if not reader.is_encrypted:\n                skipped += 1\n                print(f\"  SKIP (not encrypted): {pdf.name}\")\n                continue\n\n            result = reader.decrypt(password)\n            if result == 0:\n                failed += 1\n                print(f\"  FAIL (wrong password): {pdf.name}\")\n                continue\n\n            writer = PdfWriter()\n            for page in reader.pages:\n                writer.add_page(page)\n            with open(out, \"wb\") as fh:\n                writer.write(fh)\n            ok += 1\n            print(f\"  OK: {pdf.name}\")\n\n        except FileNotDecryptedError:\n            failed += 1\n            print(f\"  FAIL (FileNotDecryptedError): {pdf.name}\")\n        except Exception as exc:\n            failed += 1\n            print(f\"  ERR {pdf.name}: {exc}\")\n\n    
print(f\"\\nDone: {ok} decrypted, {skipped} skipped, {failed} failed\")\n\n\nif __name__ == \"__main__\":\n    batch_decrypt(INPUT_DIR, OUTPUT_DIR, password=os.environ[\"PDF_USER_PW\"])\n",[30,94061,94062,94066,94072,94082,94092,94102,94106,94119,94132,94136,94140,94158,94178,94192,94200,94221,94225,94229,94246,94256,94269,94275,94283,94291,94300,94321,94325,94329,94337,94349,94358,94379,94383,94387,94395,94405,94409,94425,94429,94437,94458,94462,94468,94476,94497,94507,94515,94543,94547,94590,94594,94598,94610],{"__ignoreMap":28},[33,94063,94064],{"class":35,"line":36},[33,94065,86432],{"class":39},[33,94067,94068,94070],{"class":35,"line":43},[33,94069,164],{"class":163},[33,94071,176],{"class":167},[33,94073,94074,94076,94078,94080],{"class":35,"line":61},[33,94075,190],{"class":163},[33,94077,193],{"class":167},[33,94079,164],{"class":163},[33,94081,198],{"class":167},[33,94083,94084,94086,94088,94090],{"class":35,"line":73},[33,94085,190],{"class":163},[33,94087,57333],{"class":167},[33,94089,164],{"class":163},[33,94091,66892],{"class":167},[33,94093,94094,94096,94098,94100],{"class":35,"line":88},[33,94095,190],{"class":163},[33,94097,68145],{"class":167},[33,94099,164],{"class":163},[33,94101,73152],{"class":167},[33,94103,94104],{"class":35,"line":95},[33,94105,92],{"emptyLinePlaceholder":91},[33,94107,94108,94110,94112,94114,94117],{"class":35,"line":101},[33,94109,507],{"class":50},[33,94111,17208],{"class":163},[33,94113,215],{"class":167},[33,94115,94116],{"class":54},"\".\u002Flocked\"",[33,94118,221],{"class":167},[33,94120,94121,94123,94125,94127,94130],{"class":35,"line":171},[33,94122,4615],{"class":50},[33,94124,212],{"class":163},[33,94126,215],{"class":167},[33,94128,94129],{"class":54},"\".\u002Funlocked\"",[33,94131,221],{"class":167},[33,94133,94134],{"class":35,"line":179},[33,94135,92],{"emptyLinePlaceholder":91},[33,94137,94138],{"class":35,"line":187},[33,94139,92],{"emptyLinePlaceholder":91},[33,94141,94142,94144,94147,94150,94152,94154,
94156],{"class":35,"line":201},[33,94143,562],{"class":163},[33,94145,94146],{"class":46}," batch_decrypt",[33,94148,94149],{"class":167},"(source_dir: Path, output_dir: Path, password: ",[33,94151,1053],{"class":50},[33,94153,1617],{"class":167},[33,94155,571],{"class":50},[33,94157,574],{"class":167},[33,94159,94160,94162,94164,94166,94168,94170,94172,94174,94176],{"class":35,"line":206},[33,94161,6346],{"class":167},[33,94163,869],{"class":238},[33,94165,242],{"class":163},[33,94167,855],{"class":50},[33,94169,365],{"class":167},[33,94171,878],{"class":238},[33,94173,242],{"class":163},[33,94175,855],{"class":50},[33,94177,221],{"class":167},[33,94179,94180,94182,94184,94186,94188,94190],{"class":35,"line":224},[33,94181,67695],{"class":167},[33,94183,242],{"class":163},[33,94185,28924],{"class":50},[33,94187,87461],{"class":167},[33,94189,610],{"class":54},[33,94191,371],{"class":167},[33,94193,94194,94196,94198],{"class":35,"line":229},[33,94195,617],{"class":163},[33,94197,620],{"class":163},[33,94199,67717],{"class":167},[33,94201,94202,94204,94206,94208,94211,94213,94215,94217,94219],{"class":35,"line":235},[33,94203,9414],{"class":50},[33,94205,602],{"class":167},[33,94207,4059],{"class":163},[33,94209,94210],{"class":54},"\"No PDFs in ",[33,94212,1115],{"class":50},[33,94214,87488],{"class":167},[33,94216,1121],{"class":50},[33,94218,274],{"class":54},[33,94220,221],{"class":167},[33,94222,94223],{"class":35,"line":250},[33,94224,646],{"class":163},[33,94226,94227],{"class":35,"line":266},[33,94228,92],{"emptyLinePlaceholder":91},[33,94230,94231,94234,94236,94238,94240,94242,94244],{"class":35,"line":290},[33,94232,94233],{"class":167},"    ok, skipped, failed 
",[33,94235,242],{"class":163},[33,94237,10791],{"class":50},[33,94239,365],{"class":167},[33,94241,748],{"class":50},[33,94243,365],{"class":167},[33,94245,87516],{"class":50},[33,94247,94248,94250,94252,94254],{"class":35,"line":295},[33,94249,656],{"class":163},[33,94251,67712],{"class":167},[33,94253,662],{"class":163},[33,94255,67717],{"class":167},[33,94257,94258,94260,94262,94264,94266],{"class":35,"line":300},[33,94259,50344],{"class":167},[33,94261,242],{"class":163},[33,94263,6393],{"class":167},[33,94265,1351],{"class":163},[33,94267,94268],{"class":167}," pdf.name\n",[33,94270,94271,94273],{"class":35,"line":317},[33,94272,670],{"class":163},[33,94274,574],{"class":167},[33,94276,94277,94279,94281],{"class":35,"line":332},[33,94278,72722],{"class":167},[33,94280,242],{"class":163},[33,94282,87564],{"class":167},[33,94284,94285,94287,94289],{"class":35,"line":347},[33,94286,5995],{"class":163},[33,94288,620],{"class":163},[33,94290,68749],{"class":167},[33,94292,94293,94296,94298],{"class":35,"line":374},[33,94294,94295],{"class":167},"                skipped ",[33,94297,28976],{"class":163},[33,94299,17709],{"class":50},[33,94301,94302,94304,94306,94308,94311,94313,94315,94317,94319],{"class":35,"line":397},[33,94303,8264],{"class":50},[33,94305,602],{"class":167},[33,94307,4059],{"class":163},[33,94309,94310],{"class":54},"\"  SKIP (not encrypted): 
",[33,94312,1115],{"class":50},[33,94314,68341],{"class":167},[33,94316,1121],{"class":50},[33,94318,274],{"class":54},[33,94320,221],{"class":167},[33,94322,94323],{"class":35,"line":653},[33,94324,12315],{"class":163},[33,94326,94327],{"class":35,"line":667},[33,94328,92],{"emptyLinePlaceholder":91},[33,94330,94331,94333,94335],{"class":35,"line":675},[33,94332,86662],{"class":167},[33,94334,242],{"class":163},[33,94336,68794],{"class":167},[33,94338,94339,94341,94343,94345,94347],{"class":35,"line":689},[33,94340,5995],{"class":163},[33,94342,68801],{"class":167},[33,94344,1865],{"class":163},[33,94346,10791],{"class":50},[33,94348,574],{"class":167},[33,94350,94351,94354,94356],{"class":35,"line":703},[33,94352,94353],{"class":167},"                failed ",[33,94355,28976],{"class":163},[33,94357,17709],{"class":50},[33,94359,94360,94362,94364,94366,94369,94371,94373,94375,94377],{"class":35,"line":714},[33,94361,8264],{"class":50},[33,94363,602],{"class":167},[33,94365,4059],{"class":163},[33,94367,94368],{"class":54},"\"  FAIL (wrong password): 
",[33,94370,1115],{"class":50},[33,94372,68341],{"class":167},[33,94374,1121],{"class":50},[33,94376,274],{"class":54},[33,94378,221],{"class":167},[33,94380,94381],{"class":35,"line":723},[33,94382,12315],{"class":163},[33,94384,94385],{"class":35,"line":754},[33,94386,92],{"emptyLinePlaceholder":91},[33,94388,94389,94391,94393],{"class":35,"line":771},[33,94390,70275],{"class":167},[33,94392,242],{"class":163},[33,94394,67154],{"class":167},[33,94396,94397,94399,94401,94403],{"class":35,"line":777},[33,94398,1793],{"class":163},[33,94400,695],{"class":167},[33,94402,662],{"class":163},[33,94404,86724],{"class":167},[33,94406,94407],{"class":35,"line":788},[33,94408,87587],{"class":167},[33,94410,94411,94413,94415,94417,94419,94421,94423],{"class":35,"line":804},[33,94412,678],{"class":163},[33,94414,68213],{"class":50},[33,94416,77675],{"class":167},[33,94418,67169],{"class":54},[33,94420,1649],{"class":167},[33,94422,495],{"class":163},[33,94424,67176],{"class":167},[33,94426,94427],{"class":35,"line":809},[33,94428,87635],{"class":167},[33,94430,94431,94433,94435],{"class":35,"line":819},[33,94432,87640],{"class":167},[33,94434,28976],{"class":163},[33,94436,17709],{"class":50},[33,94438,94439,94441,94443,94445,94448,94450,94452,94454,94456],{"class":35,"line":829},[33,94440,9364],{"class":50},[33,94442,602],{"class":167},[33,94444,4059],{"class":163},[33,94446,94447],{"class":54},"\"  OK: 
",[33,94449,1115],{"class":50},[33,94451,68341],{"class":167},[33,94453,1121],{"class":50},[33,94455,274],{"class":54},[33,94457,221],{"class":167},[33,94459,94460],{"class":35,"line":834},[33,94461,92],{"emptyLinePlaceholder":91},[33,94463,94464,94466],{"class":35,"line":839},[33,94465,780],{"class":163},[33,94467,88069],{"class":167},[33,94469,94470,94472,94474],{"class":35,"line":860},[33,94471,87680],{"class":167},[33,94473,28976],{"class":163},[33,94475,17709],{"class":50},[33,94477,94478,94480,94482,94484,94487,94489,94491,94493,94495],{"class":35,"line":887},[33,94479,9364],{"class":50},[33,94481,602],{"class":167},[33,94483,4059],{"class":163},[33,94485,94486],{"class":54},"\"  FAIL (FileNotDecryptedError): ",[33,94488,1115],{"class":50},[33,94490,68341],{"class":167},[33,94492,1121],{"class":50},[33,94494,274],{"class":54},[33,94496,221],{"class":167},[33,94498,94499,94501,94503,94505],{"class":35,"line":907},[33,94500,780],{"class":163},[33,94502,783],{"class":50},[33,94504,1852],{"class":163},[33,94506,1855],{"class":167},[33,94508,94509,94511,94513],{"class":35,"line":1826},[33,94510,87680],{"class":167},[33,94512,28976],{"class":163},[33,94514,17709],{"class":50},[33,94516,94517,94519,94521,94523,94525,94527,94529,94531,94533,94535,94537,94539,94541],{"class":35,"line":1844},[33,94518,9364],{"class":50},[33,94520,602],{"class":167},[33,94522,4059],{"class":163},[33,94524,87695],{"class":54},[33,94526,1115],{"class":50},[33,94528,68341],{"class":167},[33,94530,1121],{"class":50},[33,94532,2079],{"class":54},[33,94534,1115],{"class":50},[33,94536,6565],{"class":167},[33,94538,1121],{"class":50},[33,94540,274],{"class":54},[33,94542,221],{"class":167},[33,94544,94545],{"class":35,"line":1858},[33,94546,92],{"emptyLinePlaceholder":91},[33,94548,94549,94551,94553,94555,94557,94559,94561,94563,94565,94567,94570,94572,94575,94577,94580,94582,94584,94586,94588],{"class":35,"line":1871},[33,94550,7268],{"class":50},[33,94552,602],{"class":167},[33,94554,4059],{"
class":163},[33,94556,274],{"class":54},[33,94558,25830],{"class":50},[33,94560,87732],{"class":54},[33,94562,1115],{"class":50},[33,94564,87737],{"class":167},[33,94566,1121],{"class":50},[33,94568,94569],{"class":54}," decrypted, ",[33,94571,1115],{"class":50},[33,94573,94574],{"class":167},"skipped",[33,94576,1121],{"class":50},[33,94578,94579],{"class":54}," skipped, ",[33,94581,1115],{"class":50},[33,94583,87747],{"class":167},[33,94585,1121],{"class":50},[33,94587,29015],{"class":54},[33,94589,221],{"class":167},[33,94591,94592],{"class":35,"line":1877},[33,94593,92],{"emptyLinePlaceholder":91},[33,94595,94596],{"class":35,"line":1883},[33,94597,92],{"emptyLinePlaceholder":91},[33,94599,94600,94602,94604,94606,94608],{"class":35,"line":1915},[33,94601,2491],{"class":163},[33,94603,2494],{"class":50},[33,94605,2497],{"class":163},[33,94607,2500],{"class":54},[33,94609,574],{"class":167},[33,94611,94612,94615,94617,94619,94621,94623,94625,94627,94629,94631],{"class":35,"line":1926},[33,94613,94614],{"class":167},"    batch_decrypt(",[33,94616,507],{"class":50},[33,94618,365],{"class":167},[33,94620,4615],{"class":50},[33,94622,365],{"class":167},[33,94624,39563],{"class":238},[33,94626,242],{"class":163},[33,94628,35884],{"class":167},[33,94630,86957],{"class":54},[33,94632,751],{"class":167},[14,94634,94635],{},"Log failures to a CSV for manual review rather than halting mid-batch — one corrupt or wrongly-passworded file should not block the rest.",[18,94637,9247],{"id":9246},[14,94639,94640],{},"After writing the decrypted file, assert it is no longer encrypted and page count matches:",[23,94642,94644],{"className":126,"code":94643,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\n\ndef verify_decrypted(original: Path, decrypted: Path) -> bool:\n    \"\"\"Confirm the output is not encrypted and has the same page count.\"\"\"\n    try:\n        orig_reader = PdfReader(original)\n        
# Provide the password only to count pages from the encrypted source\n        import os\n        orig_reader.decrypt(os.environ.get(\"PDF_USER_PW\", \"\"))\n        original_pages = len(orig_reader.pages)\n\n        dec_reader = PdfReader(decrypted)\n        if dec_reader.is_encrypted:\n            print(f\"FAIL: {decrypted.name} is still encrypted\")\n            return False\n\n        if len(dec_reader.pages) != original_pages:\n            print(f\"FAIL: page count mismatch ({len(dec_reader.pages)} vs {original_pages})\")\n            return False\n\n        print(f\"PASS: {decrypted.name} — not encrypted, {len(dec_reader.pages)} pages\")\n        return True\n    except Exception as exc:\n        print(f\"ERROR: {exc}\")\n        return False\n",[30,94645,94646,94650,94660,94670,94674,94678,94692,94697,94703,94713,94718,94724,94737,94749,94753,94763,94770,94792,94798,94802,94816,94847,94853,94857,94886,94892,94902,94922],{"__ignoreMap":28},[33,94647,94648],{"class":35,"line":36},[33,94649,86432],{"class":39},[33,94651,94652,94654,94656,94658],{"class":35,"line":43},[33,94653,190],{"class":163},[33,94655,193],{"class":167},[33,94657,164],{"class":163},[33,94659,198],{"class":167},[33,94661,94662,94664,94666,94668],{"class":35,"line":61},[33,94663,190],{"class":163},[33,94665,57333],{"class":167},[33,94667,164],{"class":163},[33,94669,57338],{"class":167},[33,94671,94672],{"class":35,"line":73},[33,94673,92],{"emptyLinePlaceholder":91},[33,94675,94676],{"class":35,"line":88},[33,94677,92],{"emptyLinePlaceholder":91},[33,94679,94680,94682,94685,94688,94690],{"class":35,"line":95},[33,94681,562],{"class":163},[33,94683,94684],{"class":46}," verify_decrypted",[33,94686,94687],{"class":167},"(original: Path, decrypted: Path) -> ",[33,94689,2821],{"class":50},[33,94691,574],{"class":167},[33,94693,94694],{"class":35,"line":101},[33,94695,94696],{"class":54},"    \"\"\"Confirm the output is not encrypted and has the same page 
count.\"\"\"\n",[33,94698,94699,94701],{"class":35,"line":171},[33,94700,2424],{"class":163},[33,94702,574],{"class":167},[33,94704,94705,94708,94710],{"class":35,"line":179},[33,94706,94707],{"class":167},"        orig_reader ",[33,94709,242],{"class":163},[33,94711,94712],{"class":167}," PdfReader(original)\n",[33,94714,94715],{"class":35,"line":187},[33,94716,94717],{"class":39},"        # Provide the password only to count pages from the encrypted source\n",[33,94719,94720,94722],{"class":35,"line":201},[33,94721,3388],{"class":163},[33,94723,176],{"class":167},[33,94725,94726,94729,94731,94733,94735],{"class":35,"line":206},[33,94727,94728],{"class":167},"        orig_reader.decrypt(os.environ.get(",[33,94730,86957],{"class":54},[33,94732,365],{"class":167},[33,94734,3198],{"class":54},[33,94736,371],{"class":167},[33,94738,94739,94742,94744,94746],{"class":35,"line":224},[33,94740,94741],{"class":167},"        original_pages ",[33,94743,242],{"class":163},[33,94745,4037],{"class":50},[33,94747,94748],{"class":167},"(orig_reader.pages)\n",[33,94750,94751],{"class":35,"line":229},[33,94752,92],{"emptyLinePlaceholder":91},[33,94754,94755,94758,94760],{"class":35,"line":235},[33,94756,94757],{"class":167},"        dec_reader ",[33,94759,242],{"class":163},[33,94761,94762],{"class":167}," PdfReader(decrypted)\n",[33,94764,94765,94767],{"class":35,"line":250},[33,94766,8221],{"class":163},[33,94768,94769],{"class":167}," dec_reader.is_encrypted:\n",[33,94771,94772,94774,94776,94778,94780,94782,94785,94787,94790],{"class":35,"line":266},[33,94773,9364],{"class":50},[33,94775,602],{"class":167},[33,94777,4059],{"class":163},[33,94779,70816],{"class":54},[33,94781,1115],{"class":50},[33,94783,94784],{"class":167},"decrypted.name",[33,94786,1121],{"class":50},[33,94788,94789],{"class":54}," is still 
encrypted\"",[33,94791,221],{"class":167},[33,94793,94794,94796],{"class":35,"line":290},[33,94795,28782],{"class":163},[33,94797,2903],{"class":50},[33,94799,94800],{"class":35,"line":295},[33,94801,92],{"emptyLinePlaceholder":91},[33,94803,94804,94806,94808,94811,94813],{"class":35,"line":300},[33,94805,8221],{"class":163},[33,94807,4037],{"class":50},[33,94809,94810],{"class":167},"(dec_reader.pages) ",[33,94812,17877],{"class":163},[33,94814,94815],{"class":167}," original_pages:\n",[33,94817,94818,94820,94822,94824,94827,94829,94832,94834,94836,94838,94841,94843,94845],{"class":35,"line":317},[33,94819,9364],{"class":50},[33,94821,602],{"class":167},[33,94823,4059],{"class":163},[33,94825,94826],{"class":54},"\"FAIL: page count mismatch (",[33,94828,4065],{"class":50},[33,94830,94831],{"class":167},"(dec_reader.pages)",[33,94833,1121],{"class":50},[33,94835,71066],{"class":54},[33,94837,1115],{"class":50},[33,94839,94840],{"class":167},"original_pages",[33,94842,1121],{"class":50},[33,94844,72406],{"class":54},[33,94846,221],{"class":167},[33,94848,94849,94851],{"class":35,"line":332},[33,94850,28782],{"class":163},[33,94852,2903],{"class":50},[33,94854,94855],{"class":35,"line":347},[33,94856,92],{"emptyLinePlaceholder":91},[33,94858,94859,94861,94863,94865,94867,94869,94871,94873,94876,94878,94880,94882,94884],{"class":35,"line":374},[33,94860,9414],{"class":50},[33,94862,602],{"class":167},[33,94864,4059],{"class":163},[33,94866,88031],{"class":54},[33,94868,1115],{"class":50},[33,94870,94784],{"class":167},[33,94872,1121],{"class":50},[33,94874,94875],{"class":54}," — not encrypted, 
",[33,94877,4065],{"class":50},[33,94879,94831],{"class":167},[33,94881,1121],{"class":50},[33,94883,77518],{"class":54},[33,94885,221],{"class":167},[33,94887,94888,94890],{"class":35,"line":397},[33,94889,1659],{"class":163},[33,94891,2887],{"class":50},[33,94893,94894,94896,94898,94900],{"class":35,"line":653},[33,94895,2449],{"class":163},[33,94897,783],{"class":50},[33,94899,1852],{"class":163},[33,94901,1855],{"class":167},[33,94903,94904,94906,94908,94910,94912,94914,94916,94918,94920],{"class":35,"line":667},[33,94905,9414],{"class":50},[33,94907,602],{"class":167},[33,94909,4059],{"class":163},[33,94911,88114],{"class":54},[33,94913,1115],{"class":50},[33,94915,6565],{"class":167},[33,94917,1121],{"class":50},[33,94919,274],{"class":54},[33,94921,221],{"class":167},[33,94923,94924,94926],{"class":35,"line":675},[33,94925,1659],{"class":163},[33,94927,2903],{"class":50},[14,94929,94930,94931,94933],{},"For pipelines that feed decrypted output into ",[940,94932,9592],{"href":942}," or text parsers, run this check before handing off — it catches the silent-wrong-password case that can produce an apparently valid but empty file.",[18,94935,94937],{"id":94936},"preserving-metadata-when-decrypting","Preserving Metadata When Decrypting",[14,94939,94940,94941,94943,94944,94946],{},"Copying pages with ",[30,94942,71069],{}," transfers visual content but not the ",[30,94945,88150],{}," metadata dictionary (author, title, subject, keywords) or the document outline (bookmarks). 
Preserve them explicitly when they matter:",[23,94948,94950],{"className":126,"code":94949,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\"\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nimport os\n\n\ndef decrypt_preserve_metadata(source: Path, output: Path, password: str) -> None:\n    \"\"\"Decrypt and write a clean copy, preserving metadata and bookmarks.\"\"\"\n    reader = PdfReader(source)\n    if not reader.is_encrypted:\n        raise ValueError(f\"{source.name} is not encrypted\")\n\n    result = reader.decrypt(password)\n    if result == 0:\n        raise ValueError(f\"Wrong password for {source.name}\")\n\n    writer = PdfWriter()\n    for page in reader.pages:\n        writer.add_page(page)\n\n    # Copy \u002FInfo metadata if present\n    if reader.metadata:\n        writer.add_metadata(dict(reader.metadata))\n\n    output.parent.mkdir(parents=True, exist_ok=True)\n    with open(output, \"wb\") as fh:\n        writer.write(fh)\n    print(f\"Decrypted (metadata preserved): {output}\")\n\n\nif __name__ == \"__main__\":\n    decrypt_preserve_metadata(\n        Path(\"protected.pdf\"),\n        Path(\"decrypted.pdf\"),\n        password=os.environ[\"PDF_USER_PW\"],\n    
)\n",[30,94951,94952,94956,94966,94976,94982,94986,94990,95008,95013,95021,95029,95051,95055,95063,95075,95098,95102,95110,95120,95124,95128,95133,95139,95147,95151,95171,95187,95191,95212,95216,95220,95232,95237,95245,95253,95265],{"__ignoreMap":28},[33,94953,94954],{"class":35,"line":36},[33,94955,86432],{"class":39},[33,94957,94958,94960,94962,94964],{"class":35,"line":43},[33,94959,190],{"class":163},[33,94961,193],{"class":167},[33,94963,164],{"class":163},[33,94965,198],{"class":167},[33,94967,94968,94970,94972,94974],{"class":35,"line":61},[33,94969,190],{"class":163},[33,94971,57333],{"class":167},[33,94973,164],{"class":163},[33,94975,66892],{"class":167},[33,94977,94978,94980],{"class":35,"line":73},[33,94979,164],{"class":163},[33,94981,176],{"class":167},[33,94983,94984],{"class":35,"line":88},[33,94985,92],{"emptyLinePlaceholder":91},[33,94987,94988],{"class":35,"line":95},[33,94989,92],{"emptyLinePlaceholder":91},[33,94991,94992,94994,94997,95000,95002,95004,95006],{"class":35,"line":101},[33,94993,562],{"class":163},[33,94995,94996],{"class":46}," decrypt_preserve_metadata",[33,94998,94999],{"class":167},"(source: Path, output: Path, password: ",[33,95001,1053],{"class":50},[33,95003,1617],{"class":167},[33,95005,571],{"class":50},[33,95007,574],{"class":167},[33,95009,95010],{"class":35,"line":171},[33,95011,95012],{"class":54},"    \"\"\"Decrypt and write a clean copy, preserving metadata and 
bookmarks.\"\"\"\n",[33,95014,95015,95017,95019],{"class":35,"line":179},[33,95016,57365],{"class":167},[33,95018,242],{"class":163},[33,95020,86642],{"class":167},[33,95022,95023,95025,95027],{"class":35,"line":187},[33,95024,617],{"class":163},[33,95026,620],{"class":163},[33,95028,68749],{"class":167},[33,95030,95031,95033,95035,95037,95039,95041,95043,95045,95047,95049],{"class":35,"line":201},[33,95032,4051],{"class":163},[33,95034,4054],{"class":50},[33,95036,602],{"class":167},[33,95038,4059],{"class":163},[33,95040,274],{"class":54},[33,95042,1115],{"class":50},[33,95044,86888],{"class":167},[33,95046,1121],{"class":50},[33,95048,87944],{"class":54},[33,95050,221],{"class":167},[33,95052,95053],{"class":35,"line":206},[33,95054,92],{"emptyLinePlaceholder":91},[33,95056,95057,95059,95061],{"class":35,"line":224},[33,95058,8842],{"class":167},[33,95060,242],{"class":163},[33,95062,68794],{"class":167},[33,95064,95065,95067,95069,95071,95073],{"class":35,"line":229},[33,95066,617],{"class":163},[33,95068,68801],{"class":167},[33,95070,1865],{"class":163},[33,95072,10791],{"class":50},[33,95074,574],{"class":167},[33,95076,95077,95079,95081,95083,95085,95088,95090,95092,95094,95096],{"class":35,"line":235},[33,95078,4051],{"class":163},[33,95080,4054],{"class":50},[33,95082,602],{"class":167},[33,95084,4059],{"class":163},[33,95086,95087],{"class":54},"\"Wrong password for 
",[33,95089,1115],{"class":50},[33,95091,86888],{"class":167},[33,95093,1121],{"class":50},[33,95095,274],{"class":54},[33,95097,221],{"class":167},[33,95099,95100],{"class":35,"line":250},[33,95101,92],{"emptyLinePlaceholder":91},[33,95103,95104,95106,95108],{"class":35,"line":266},[33,95105,68681],{"class":167},[33,95107,242],{"class":163},[33,95109,67154],{"class":167},[33,95111,95112,95114,95116,95118],{"class":35,"line":290},[33,95113,656],{"class":163},[33,95115,695],{"class":167},[33,95117,662],{"class":163},[33,95119,86724],{"class":167},[33,95121,95122],{"class":35,"line":295},[33,95123,87204],{"class":167},[33,95125,95126],{"class":35,"line":300},[33,95127,92],{"emptyLinePlaceholder":91},[33,95129,95130],{"class":35,"line":317},[33,95131,95132],{"class":39},"    # Copy \u002FInfo metadata if present\n",[33,95134,95135,95137],{"class":35,"line":332},[33,95136,617],{"class":163},[33,95138,88324],{"class":167},[33,95140,95141,95143,95145],{"class":35,"line":347},[33,95142,88329],{"class":167},[33,95144,37100],{"class":50},[33,95146,88334],{"class":167},[33,95148,95149],{"class":35,"line":374},[33,95150,92],{"emptyLinePlaceholder":91},[33,95152,95153,95155,95157,95159,95161,95163,95165,95167,95169],{"class":35,"line":397},[33,95154,74932],{"class":167},[33,95156,869],{"class":238},[33,95158,242],{"class":163},[33,95160,855],{"class":50},[33,95162,365],{"class":167},[33,95164,878],{"class":238},[33,95166,242],{"class":163},[33,95168,855],{"class":50},[33,95170,221],{"class":167},[33,95172,95173,95175,95177,95179,95181,95183,95185],{"class":35,"line":653},[33,95174,1635],{"class":163},[33,95176,68213],{"class":50},[33,95178,70532],{"class":167},[33,95180,67169],{"class":54},[33,95182,1649],{"class":167},[33,95184,495],{"class":163},[33,95186,67176],{"class":167},[33,95188,95189],{"class":35,"line":667},[33,95190,87297],{"class":167},[33,95192,95193,95195,95197,95199,95202,95204,95206,95208,95210],{"class":35,"line":675},[33,95194,7268],{"class":50},[33,95196,602
],{"class":167},[33,95198,4059],{"class":163},[33,95200,95201],{"class":54},"\"Decrypted (metadata preserved): ",[33,95203,1115],{"class":50},[33,95205,70566],{"class":167},[33,95207,1121],{"class":50},[33,95209,274],{"class":54},[33,95211,221],{"class":167},[33,95213,95214],{"class":35,"line":689},[33,95215,92],{"emptyLinePlaceholder":91},[33,95217,95218],{"class":35,"line":703},[33,95219,92],{"emptyLinePlaceholder":91},[33,95221,95222,95224,95226,95228,95230],{"class":35,"line":714},[33,95223,2491],{"class":163},[33,95225,2494],{"class":50},[33,95227,2497],{"class":163},[33,95229,2500],{"class":54},[33,95231,574],{"class":167},[33,95233,95234],{"class":35,"line":723},[33,95235,95236],{"class":167},"    decrypt_preserve_metadata(\n",[33,95238,95239,95241,95243],{"class":35,"line":754},[33,95240,69188],{"class":167},[33,95242,93054],{"class":54},[33,95244,1506],{"class":167},[33,95246,95247,95249,95251],{"class":35,"line":771},[33,95248,69188],{"class":167},[33,95250,93217],{"class":54},[33,95252,1506],{"class":167},[33,95254,95255,95257,95259,95261,95263],{"class":35,"line":777},[33,95256,93698],{"class":238},[33,95258,242],{"class":163},[33,95260,35884],{"class":167},[33,95262,86957],{"class":54},[33,95264,8935],{"class":167},[33,95266,95267],{"class":35,"line":788},[33,95268,1202],{"class":167},[14,95270,95271,95272,95275,95276,95278],{},"If the encrypted source has a complex outline (nested bookmarks, named destinations), use ",[30,95273,95274],{},"writer.clone_document_from_reader(reader)"," instead of iterating ",[30,95277,71069],{}," — it copies the full document tree including the outline.",[18,95280,95282],{"id":95281},"downstream-use-feeding-decrypted-pdfs-into-parsers","Downstream Use: Feeding Decrypted PDFs into Parsers",[14,95284,95285,95286,95288,95289,95291],{},"The most common reason to remove a password is to make the file readable by extraction tools. 
",[940,95287,9592],{"href":942}," libraries such as pdfplumber and camelot require an unencrypted byte stream — they do not accept password arguments. After decrypting to a clean file (or ",[30,95290,90629],{}," buffer), pass the output path normally:",[23,95293,95295],{"className":126,"code":95294,"language":47,"meta":28,"style":28},"# pip install \"pypdf>=3.17\" pdfplumber\nimport io, os\nfrom pathlib import Path\nfrom pypdf import PdfReader, PdfWriter\nimport pdfplumber\n\n\ndef decrypt_to_buffer(source: Path, password: str) -> io.BytesIO:\n    \"\"\"Decrypt a PDF into a BytesIO buffer — no intermediate file on disk.\"\"\"\n    reader = PdfReader(source)\n    if reader.is_encrypted:\n        if reader.decrypt(password) == 0:\n            raise ValueError(\"Wrong password\")\n    writer = PdfWriter()\n    for page in reader.pages:\n        writer.add_page(page)\n    buf = io.BytesIO()\n    writer.write(buf)\n    buf.seek(0)\n    return buf\n\n\nif __name__ == \"__main__\":\n    buf = decrypt_to_buffer(Path(\"protected.pdf\"), os.environ[\"PDF_USER_PW\"])\n    with pdfplumber.open(buf) as pdf:\n        for i, page in enumerate(pdf.pages):\n            tables = page.extract_tables()\n            print(f\"Page {i+1}: {len(tables)} table(s)\")\n",[30,95296,95297,95302,95308,95318,95328,95334,95338,95342,95356,95361,95369,95375,95387,95400,95408,95418,95422,95430,95435,95443,95449,95453,95457,95469,95487,95498,95510,95518],{"__ignoreMap":28},[33,95298,95299],{"class":35,"line":36},[33,95300,95301],{"class":39},"# pip install \"pypdf>=3.17\" 
pdfplumber\n",[33,95303,95304,95306],{"class":35,"line":43},[33,95305,164],{"class":163},[33,95307,90650],{"class":167},[33,95309,95310,95312,95314,95316],{"class":35,"line":61},[33,95311,190],{"class":163},[33,95313,193],{"class":167},[33,95315,164],{"class":163},[33,95317,198],{"class":167},[33,95319,95320,95322,95324,95326],{"class":35,"line":73},[33,95321,190],{"class":163},[33,95323,57333],{"class":167},[33,95325,164],{"class":163},[33,95327,66892],{"class":167},[33,95329,95330,95332],{"class":35,"line":88},[33,95331,164],{"class":163},[33,95333,485],{"class":167},[33,95335,95336],{"class":35,"line":95},[33,95337,92],{"emptyLinePlaceholder":91},[33,95339,95340],{"class":35,"line":101},[33,95341,92],{"emptyLinePlaceholder":91},[33,95343,95344,95346,95349,95352,95354],{"class":35,"line":171},[33,95345,562],{"class":163},[33,95347,95348],{"class":46}," decrypt_to_buffer",[33,95350,95351],{"class":167},"(source: Path, password: ",[33,95353,1053],{"class":50},[33,95355,92126],{"class":167},[33,95357,95358],{"class":35,"line":179},[33,95359,95360],{"class":54},"    \"\"\"Decrypt a PDF into a BytesIO buffer — no intermediate file on disk.\"\"\"\n",[33,95362,95363,95365,95367],{"class":35,"line":187},[33,95364,57365],{"class":167},[33,95366,242],{"class":163},[33,95368,86642],{"class":167},[33,95370,95371,95373],{"class":35,"line":201},[33,95372,617],{"class":163},[33,95374,68749],{"class":167},[33,95376,95377,95379,95381,95383,95385],{"class":35,"line":206},[33,95378,8221],{"class":163},[33,95380,69635],{"class":167},[33,95382,1865],{"class":163},[33,95384,10791],{"class":50},[33,95386,574],{"class":167},[33,95388,95389,95391,95393,95395,95398],{"class":35,"line":224},[33,95390,59715],{"class":163},[33,95392,4054],{"class":50},[33,95394,602],{"class":167},[33,95396,95397],{"class":54},"\"Wrong 
password\"",[33,95399,221],{"class":167},[33,95401,95402,95404,95406],{"class":35,"line":229},[33,95403,68681],{"class":167},[33,95405,242],{"class":163},[33,95407,67154],{"class":167},[33,95409,95410,95412,95414,95416],{"class":35,"line":235},[33,95411,656],{"class":163},[33,95413,695],{"class":167},[33,95415,662],{"class":163},[33,95417,86724],{"class":167},[33,95419,95420],{"class":35,"line":250},[33,95421,87204],{"class":167},[33,95423,95424,95426,95428],{"class":35,"line":266},[33,95425,61913],{"class":167},[33,95427,242],{"class":163},[33,95429,61918],{"class":167},[33,95431,95432],{"class":35,"line":290},[33,95433,95434],{"class":167},"    writer.write(buf)\n",[33,95436,95437,95439,95441],{"class":35,"line":295},[33,95438,61951],{"class":167},[33,95440,748],{"class":50},[33,95442,221],{"class":167},[33,95444,95445,95447],{"class":35,"line":300},[33,95446,1332],{"class":163},[33,95448,63556],{"class":167},[33,95450,95451],{"class":35,"line":317},[33,95452,92],{"emptyLinePlaceholder":91},[33,95454,95455],{"class":35,"line":332},[33,95456,92],{"emptyLinePlaceholder":91},[33,95458,95459,95461,95463,95465,95467],{"class":35,"line":347},[33,95460,2491],{"class":163},[33,95462,2494],{"class":50},[33,95464,2497],{"class":163},[33,95466,2500],{"class":54},[33,95468,574],{"class":167},[33,95470,95471,95473,95475,95478,95480,95483,95485],{"class":35,"line":374},[33,95472,61913],{"class":167},[33,95474,242],{"class":163},[33,95476,95477],{"class":167}," decrypt_to_buffer(Path(",[33,95479,93054],{"class":54},[33,95481,95482],{"class":167},"), os.environ[",[33,95484,86957],{"class":54},[33,95486,751],{"class":167},[33,95488,95489,95491,95494,95496],{"class":35,"line":397},[33,95490,1635],{"class":163},[33,95492,95493],{"class":167}," pdfplumber.open(buf) 
",[33,95495,495],{"class":163},[33,95497,686],{"class":167},[33,95499,95500,95502,95504,95506,95508],{"class":35,"line":653},[33,95501,5973],{"class":163},[33,95503,37139],{"class":167},[33,95505,662],{"class":163},[33,95507,7403],{"class":50},[33,95509,40080],{"class":167},[33,95511,95512,95514,95516],{"class":35,"line":667},[33,95513,40540],{"class":167},[33,95515,242],{"class":163},[33,95517,7147],{"class":167},[33,95519,95520,95522,95524,95526,95528,95530,95532,95534,95536,95538,95540,95542,95544,95546],{"class":35,"line":675},[33,95521,9364],{"class":50},[33,95523,602],{"class":167},[33,95525,4059],{"class":163},[33,95527,55719],{"class":54},[33,95529,1115],{"class":50},[33,95531,7499],{"class":167},[33,95533,1811],{"class":163},[33,95535,40161],{"class":50},[33,95537,2079],{"class":54},[33,95539,4065],{"class":50},[33,95541,39168],{"class":167},[33,95543,1121],{"class":50},[33,95545,6247],{"class":54},[33,95547,221],{"class":167},[14,95549,79527,95550,95552],{},[30,95551,90629],{}," avoids writing a temporary unencrypted file to disk, which matters in environments where the working directory is logged or audited.",[18,95554,48994],{"id":29070},[4273,95556,95557,95567],{},[4276,95558,95559],{},[4279,95560,95561,95563,95565],{},[4282,95562,4284],{},[4282,95564,4287],{},[4282,95566,4290],{},[4292,95568,95569,95594,95616,95634,95652],{},[4279,95570,95571,95578,95585],{},[4297,95572,95573,95575,95576],{},[30,95574,68095],{}," when accessing ",[30,95577,92981],{},[4297,95579,95580,95582,95583],{},[30,95581,92985],{}," not called, or called and returned ",[30,95584,748],{},[4297,95586,67848,95587,95589,95590,95593],{},[30,95588,75937],{},"; call ",[30,95591,95592],{},"reader.decrypt(pw)"," and verify return value ≠ 0",[4279,95595,95596,95604,95607],{},[4297,95597,95598,95600,95601,95603],{},[30,95599,91875],{}," returns ",[30,95602,748],{}," silently",[4297,95605,95606],{},"Wrong password supplied",[4297,95608,95609,95610,95612,95613],{},"Double-check password; 
catch the ",[30,95611,748],{}," return explicitly and raise ",[30,95614,95615],{},"ValueError",[4279,95617,95618,95621,95629],{},[4297,95619,95620],{},"Output file is still encrypted",[4297,95622,95623,95625,95626,95628],{},[30,95624,70025],{}," was created but ",[30,95627,86146],{}," was called on it inadvertently, or you re-opened the input by mistake",[4297,95630,95631,95632,42709],{},"Ensure the writer is freshly instantiated with no ",[30,95633,86146],{},[4279,95635,95636,95639,95646],{},[4297,95637,95638],{},"Metadata stripped from output",[4297,95640,94940,95641,95643,95644,85015],{},[30,95642,71069],{}," does not carry ",[30,95645,88150],{},[4297,95647,17059,95648,95651],{},[30,95649,95650],{},"writer.add_metadata(reader.metadata)"," before writing if you need to preserve author, title, etc.",[4279,95653,95654,95657,95660],{},[4297,95655,95656],{},"Page count changes after decrypt",[4297,95658,95659],{},"Some encrypted PDFs embed additional pages as annotations",[4297,95661,95662,95663,95665],{},"Compare using ",[30,95664,79503],{}," on original (after decrypt) vs output",[18,95667,88566],{"id":29183},[14,95669,95670,95673],{},[1974,95671,95672],{},"Does removing a password change the visual content of the PDF?","\nNo. Decryption is a pure cryptographic operation on the stream encoding. Text, images, fonts, and layout are unchanged.",[14,95675,95676,95679,95680,95682,95683,95685],{},[1974,95677,95678],{},"What if I only have the owner password, not the user password?","\nPass the owner password to ",[30,95681,86194],{},". It succeeds with return value ",[30,95684,1533],{}," (owner match), which grants full access. The same function accepts either password type.",[14,95687,95688,95691,95692,95694,95695,95697,95698,95700],{},[1974,95689,95690],{},"Can I re-encrypt with a new password immediately after decrypting?","\nYes — after copying pages to a fresh ",[30,95693,70025],{},", call ",[30,95696,86146],{}," on that writer before saving. 
See ",[940,95699,86107],{"href":92895}," for the full re-encryption pattern.",[14,95702,95703,95706,95707,95709],{},[1974,95704,95705],{},"What about PDF files encrypted with certificate-based (public-key) security rather than a password?","\npypdf does not support certificate-based decryption. Use ",[30,95708,68393],{}," with the appropriate private key PEM for those cases.",[18,95711,6918],{"id":6917},[4211,95713,95714,95719,95724],{},[4214,95715,95716,95718],{},[940,95717,65967],{"href":65966}," — full overview of overlays, AES-256 encryption, and permission flags",[4214,95720,95721,95723],{},[940,95722,86107],{"href":92895}," — encrypt a PDF with user and owner passwords",[4214,95725,95726,95728],{},[940,95727,52682],{"href":52681}," — decrypt before merging encrypted source files",[14,95730,6947,95731,3035],{},[940,95732,65967],{"href":65966},[6953,95734,95735],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sqxcx, html code.shiki 
.sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":95737},[95738,95739,95740,95741,95742,95743,95744,95745,95746,95747,95748],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":93149,"depth":43,"text":93150},{"id":93728,"depth":43,"text":93729},{"id":94055,"depth":43,"text":94056},{"id":9246,"depth":43,"text":9247},{"id":94936,"depth":43,"text":94937},{"id":95281,"depth":43,"text":95282},{"id":29070,"depth":43,"text":48994},{"id":29183,"depth":43,"text":88566},{"id":6917,"depth":43,"text":6918},"Remove PDF Password","Open an encrypted PDF you are authorized to access and save a decrypted copy using pypdf. Covers reader.decrypt(), FileNotDecryptedError, AES vs RC4, and batch decryption.",{},"\u002Fautomating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Fremove-password-from-pdf-with-python",{"title":88466,"description":95750},"Remove Password from PDF with Python (pypdf)","automating-pdf-extraction-generation\u002Fwatermarking-and-securing-pdfs\u002Fremove-password-from-pdf-with-python\u002Findex",[9631,47,88645,65045],"j34vIokfauEBDYmxSRGqaY9I0kRMYaY53q21kuQcLK8",{"id":95759,"title":29264,"body":95760,"breadcrumbTitle":6977,"canonical":6977,"date":46387,"description":95770,"draft":6980,"extension":6981,"image":6977,"meta":95771,"navigation":91,"path":1351,"robots":6977,"seo":95772,"seoTitle":95773,"stem":897,"tags":95774,"updatedAt":6978,"__hash__":95777},"content\u002Findex.md",{"type":7,"value":95761,"toc":95768},[95762,95765],[10,95763,29264],{"id":95764},"python-doc-data-automation",[14,95766,95767],{},"Practical, script-first guides for automating document and data work with Python — extracting and generating PDFs, processing Excel and CSV data with pandas, templating Word documents at scale, and wiring those steps into reliable end-to-end pipelines.",{"title":28,"searchDepth":43,"depth":43,"links":95769},[],"Hands-on Python guides for automating PDFs, Excel and CSV data, Word 
documents, and end-to-end pipelines that wire them together.",{},{"title":29264,"description":95770},"Python Document & Data Automation Guides",[6989,47,95775,95776],"data workflows","pipelines","jp9ZCFdnSZNnf6avKbLM9rE78y0i4N6YWrCIfWJjGEE",{"id":95779,"title":95780,"body":95781,"breadcrumbTitle":99606,"canonical":6977,"date":46387,"description":99607,"draft":6980,"extension":6981,"image":6977,"meta":99608,"navigation":91,"path":99609,"robots":6977,"seo":99610,"seoTitle":99611,"stem":99612,"tags":99613,"updatedAt":6978,"__hash__":99616},"content\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Fautomating-monthly-sales-reports-in-excel\u002Findex.md","Automating Monthly Sales Reports in Excel",{"type":7,"value":95782,"toc":99588},[95783,95785,95803,95808,95812,95815,95844,95847,95850,95853,95888,95912,95916,95919,96218,96222,96225,96490,96494,96497,96804,96818,96822,96835,97744,97746,97750,97767,97842,97846,97865,98195,98197,98200,98465,98469,98478,98869,98882,98886,98889,98895,98901,98906,99064,99070,99395,99398,99400,99503,99505,99518,99535,99553,99555,99581,99585],[10,95784,95780],{"id":21805},[14,95786,95787,95788,95791,95792,46332,95794,95796,95797,95799,95800,95802],{},"Manual compilation of monthly sales data introduces ",[30,95789,95790],{},"VLOOKUP"," failures, inconsistent date parsing, and formatting drift between contributors. This guide replaces that workflow with a deterministic Python pipeline: ingest multiple CSV exports, normalise the schema, aggregate with ",[30,95793,9630],{},[30,95795,21820],{},", and write a styled multi-sheet ",[30,95798,26542],{}," file with ",[30,95801,22009],{},". 
The result is a reproducible script you schedule once and never touch again.",[14,95804,95805,95806,3035],{},"For the broader library-selection and ExcelWriter context, see ",[940,95807,6936],{"href":6935},[18,95809,95811],{"id":95810},"root-cause-of-manual-report-failures","Root Cause of Manual-Report Failures",[14,95813,95814],{},"Monthly reports break because the process is ad hoc:",[4211,95816,95817,95826,95838,95841],{},[4214,95818,95819,95820,71066,95823,72232],{},"Different analysts export CSVs with different date formats (",[30,95821,95822],{},"2026-01-05",[30,95824,95825],{},"01\u002F05\u002F2026",[4214,95827,95828,95829,69863,95832,69863,95835,72232],{},"Column names shift month to month (",[30,95830,95831],{},"Amount",[30,95833,95834],{},"Revenue",[30,95836,95837],{},"Total_Amount",[4214,95839,95840],{},"Someone forgets to run the VLOOKUP, or runs it on stale data.",[4214,95842,95843],{},"Formatting is re-applied by hand, introducing colour inconsistencies.",[14,95845,95846],{},"A Python script enforces a fixed schema, fails loudly on unexpected input, and applies formatting programmatically — the same way, every time.",[18,95848,95849],{"id":26618},"Environment Setup",[14,95851,95852],{},"Python 3.9+ required.",[23,95854,95856],{"className":25,"code":95855,"language":27,"meta":28,"style":28},"python -m venv .venv\nsource .venv\u002Fbin\u002Factivate      # Windows: .venv\\Scripts\\activate\npip install pandas openpyxl\n",[30,95857,95858,95868,95877],{"__ignoreMap":28},[33,95859,95860,95862,95864,95866],{"class":35,"line":36},[33,95861,47],{"class":46},[33,95863,51],{"class":50},[33,95865,55],{"class":54},[33,95867,58],{"class":54},[33,95869,95870,95872,95874],{"class":35,"line":43},[33,95871,64],{"class":50},[33,95873,67],{"class":54},[33,95875,95876],{"class":39},"      # Windows: 
.venv\\Scripts\\activate\n",[33,95878,95879,95881,95883,95885],{"class":35,"line":61},[33,95880,76],{"class":46},[33,95882,79],{"class":54},[33,95884,16183],{"class":54},[33,95886,95887],{"class":54}," openpyxl\n",[14,95889,95890,95891,95894,95895,95898,95899,365,95902,365,95905,365,95908,365,95910,3035],{},"Place raw monthly CSVs in ",[30,95892,95893],{},"data\u002F"," with names matching ",[30,95896,95897],{},"monthly_sales_*.csv",". Each file must contain at minimum: ",[30,95900,95901],{},"date",[30,95903,95904],{},"region",[30,95906,95907],{},"product",[30,95909,18528],{},[30,95911,18511],{},[18,95913,95915],{"id":95914},"step-1-confirm-the-input-schema","Step 1 — Confirm the Input Schema",[14,95917,95918],{},"Before any aggregation, assert that the files contain the expected columns.",[23,95920,95922],{"className":126,"code":95921,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport glob\nimport pandas as pd\n\nDATA_DIR = Path(\"data\")\nREQUIRED = {\"date\", \"region\", \"product\", \"revenue\", \"units\"}\n\nfiles = sorted(DATA_DIR.glob(\"monthly_sales_*.csv\"))\nif not files:\n    raise SystemExit(f\"No CSVs found in {DATA_DIR}\")\n\nframes = []\nfor f in files:\n    df = pd.read_csv(f)\n    missing = REQUIRED - set(df.columns.str.lower())\n    if missing:\n        raise ValueError(f\"{f.name} is missing columns: {missing}\")\n    df.columns = df.columns.str.lower()   # normalise to lowercase\n    frames.append(df)\n\nraw = pd.concat(frames, ignore_index=True)\nprint(f\"Loaded {len(raw):,} rows from {len(files)} 
files\")\nprint(raw.dtypes)\n",[30,95923,95924,95928,95938,95945,95955,95959,95973,96002,96006,96026,96034,96054,96058,96067,96077,96086,96102,96108,96139,96151,96156,96160,96177,96211],{"__ignoreMap":28},[33,95925,95926],{"class":35,"line":36},[33,95927,8895],{"class":39},[33,95929,95930,95932,95934,95936],{"class":35,"line":43},[33,95931,190],{"class":163},[33,95933,193],{"class":167},[33,95935,164],{"class":163},[33,95937,198],{"class":167},[33,95939,95940,95942],{"class":35,"line":61},[33,95941,164],{"class":163},[33,95943,95944],{"class":167}," glob\n",[33,95946,95947,95949,95951,95953],{"class":35,"line":73},[33,95948,164],{"class":163},[33,95950,492],{"class":167},[33,95952,495],{"class":163},[33,95954,498],{"class":167},[33,95956,95957],{"class":35,"line":88},[33,95958,92],{"emptyLinePlaceholder":91},[33,95960,95961,95964,95966,95968,95971],{"class":35,"line":95},[33,95962,95963],{"class":50},"DATA_DIR",[33,95965,212],{"class":163},[33,95967,215],{"class":167},[33,95969,95970],{"class":54},"\"data\"",[33,95972,221],{"class":167},[33,95974,95975,95978,95980,95982,95984,95986,95988,95990,95992,95994,95996,95998,96000],{"class":35,"line":101},[33,95976,95977],{"class":50},"REQUIRED",[33,95979,212],{"class":163},[33,95981,4098],{"class":167},[33,95983,4101],{"class":54},[33,95985,365],{"class":167},[33,95987,16649],{"class":54},[33,95989,365],{"class":167},[33,95991,16654],{"class":54},[33,95993,365],{"class":167},[33,95995,16465],{"class":54},[33,95997,365],{"class":167},[33,95999,16659],{"class":54},[33,96001,4113],{"class":167},[33,96003,96004],{"class":35,"line":171},[33,96005,92],{"emptyLinePlaceholder":91},[33,96007,96008,96011,96013,96015,96017,96019,96021,96024],{"class":35,"line":179},[33,96009,96010],{"class":167},"files 
",[33,96012,242],{"class":163},[33,96014,28924],{"class":50},[33,96016,602],{"class":167},[33,96018,95963],{"class":50},[33,96020,607],{"class":167},[33,96022,96023],{"class":54},"\"monthly_sales_*.csv\"",[33,96025,371],{"class":167},[33,96027,96028,96030,96032],{"class":35,"line":187},[33,96029,2491],{"class":163},[33,96031,620],{"class":163},[33,96033,74854],{"class":167},[33,96035,96036,96038,96040,96042,96044,96047,96050,96052],{"class":35,"line":201},[33,96037,35742],{"class":163},[33,96039,16617],{"class":50},[33,96041,602],{"class":167},[33,96043,4059],{"class":163},[33,96045,96046],{"class":54},"\"No CSVs found in ",[33,96048,96049],{"class":50},"{DATA_DIR}",[33,96051,274],{"class":54},[33,96053,221],{"class":167},[33,96055,96056],{"class":35,"line":206},[33,96057,92],{"emptyLinePlaceholder":91},[33,96059,96060,96063,96065],{"class":35,"line":224},[33,96061,96062],{"class":167},"frames ",[33,96064,242],{"class":163},[33,96066,589],{"class":167},[33,96068,96069,96071,96073,96075],{"class":35,"line":229},[33,96070,6124],{"class":163},[33,96072,8832],{"class":167},[33,96074,662],{"class":163},[33,96076,74854],{"class":167},[33,96078,96079,96081,96083],{"class":35,"line":235},[33,96080,4025],{"class":167},[33,96082,242],{"class":163},[33,96084,96085],{"class":167}," pd.read_csv(f)\n",[33,96087,96088,96090,96092,96095,96097,96099],{"class":35,"line":250},[33,96089,4118],{"class":167},[33,96091,242],{"class":163},[33,96093,96094],{"class":50}," 
REQUIRED",[33,96096,39025],{"class":163},[33,96098,4129],{"class":50},[33,96100,96101],{"class":167},"(df.columns.str.lower())\n",[33,96103,96104,96106],{"class":35,"line":266},[33,96105,617],{"class":163},[33,96107,4139],{"class":167},[33,96109,96110,96112,96114,96116,96118,96120,96122,96124,96126,96129,96131,96133,96135,96137],{"class":35,"line":290},[33,96111,4051],{"class":163},[33,96113,4054],{"class":50},[33,96115,602],{"class":167},[33,96117,4059],{"class":163},[33,96119,274],{"class":54},[33,96121,1115],{"class":50},[33,96123,70352],{"class":167},[33,96125,1121],{"class":50},[33,96127,96128],{"class":54}," is missing columns: ",[33,96130,1115],{"class":50},[33,96132,4157],{"class":167},[33,96134,1121],{"class":50},[33,96136,274],{"class":54},[33,96138,221],{"class":167},[33,96140,96141,96143,96145,96148],{"class":35,"line":295},[33,96142,27546],{"class":167},[33,96144,242],{"class":163},[33,96146,96147],{"class":167}," df.columns.str.lower()   ",[33,96149,96150],{"class":39},"# normalise to lowercase\n",[33,96152,96153],{"class":35,"line":300},[33,96154,96155],{"class":167},"    frames.append(df)\n",[33,96157,96158],{"class":35,"line":317},[33,96159,92],{"emptyLinePlaceholder":91},[33,96161,96162,96165,96167,96169,96171,96173,96175],{"class":35,"line":332},[33,96163,96164],{"class":167},"raw ",[33,96166,242],{"class":163},[33,96168,847],{"class":167},[33,96170,850],{"class":238},[33,96172,242],{"class":163},[33,96174,855],{"class":50},[33,96176,221],{"class":167},[33,96178,96179,96181,96183,96185,96188,96190,96193,96195,96197,96200,96202,96204,96206,96209],{"class":35,"line":347},[33,96180,13474],{"class":50},[33,96182,602],{"class":167},[33,96184,4059],{"class":163},[33,96186,96187],{"class":54},"\"Loaded ",[33,96189,4065],{"class":50},[33,96191,96192],{"class":167},"(raw)",[33,96194,18801],{"class":163},[33,96196,1121],{"class":50},[33,96198,96199],{"class":54}," rows from 
",[33,96201,4065],{"class":50},[33,96203,74998],{"class":167},[33,96205,1121],{"class":50},[33,96207,96208],{"class":54}," files\"",[33,96210,221],{"class":167},[33,96212,96213,96215],{"class":35,"line":374},[33,96214,13474],{"class":50},[33,96216,96217],{"class":167},"(raw.dtypes)\n",[18,96219,96221],{"id":96220},"step-2-normalise-types","Step 2 — Normalise Types",[14,96223,96224],{},"Date and currency columns arriving as strings will silently produce wrong aggregations. Coerce them explicitly and drop rows that cannot be parsed.",[23,96226,96228],{"className":126,"code":96227,"language":47,"meta":28,"style":28},"# pip install pandas\n# Continues from Step 1 — raw DataFrame already in scope\n\nraw[\"date\"]    = pd.to_datetime(raw[\"date\"], format=\"mixed\", errors=\"coerce\")\nraw[\"revenue\"] = pd.to_numeric(raw[\"revenue\"].astype(str)\n                                .str.replace(r\"[$,]\", \"\", regex=True),\n                                errors=\"coerce\")\nraw[\"units\"]   = pd.to_numeric(raw[\"units\"], errors=\"coerce\")\n\nn_before = len(raw)\nraw.dropna(subset=[\"date\", \"revenue\", \"units\"], inplace=True)\nn_dropped = n_before - len(raw)\nif n_dropped:\n    print(f\"Warning: dropped {n_dropped} rows with unparseable values\")\n\nraw[\"month\"] = raw[\"date\"].dt.to_period(\"M\").astype(str)  # \"2026-01\" etc.\n",[30,96229,96230,96234,96239,96243,96278,96299,96327,96338,96362,96366,96378,96409,96425,96432,96455,96459],{"__ignoreMap":28},[33,96231,96232],{"class":35,"line":36},[33,96233,8895],{"class":39},[33,96235,96236],{"class":35,"line":43},[33,96237,96238],{"class":39},"# Continues from Step 1 — raw DataFrame already in scope\n",[33,96240,96241],{"class":35,"line":61},[33,96242,92],{"emptyLinePlaceholder":91},[33,96244,96245,96247,96249,96252,96254,96257,96259,96261,96263,96265,96268,96270,96272,96274,96276],{"class":35,"line":73},[33,96246,13789],{"class":167},[33,96248,4101],{"class":54},[33,96250,96251],{"class":167},"]    
",[33,96253,242],{"class":163},[33,96255,96256],{"class":167}," pd.to_datetime(raw[",[33,96258,4101],{"class":54},[33,96260,8314],{"class":167},[33,96262,61926],{"class":238},[33,96264,242],{"class":163},[33,96266,96267],{"class":54},"\"mixed\"",[33,96269,365],{"class":167},[33,96271,8317],{"class":238},[33,96273,242],{"class":163},[33,96275,12107],{"class":54},[33,96277,221],{"class":167},[33,96279,96280,96282,96284,96286,96288,96291,96293,96295,96297],{"class":35,"line":88},[33,96281,13789],{"class":167},[33,96283,16465],{"class":54},[33,96285,763],{"class":167},[33,96287,242],{"class":163},[33,96289,96290],{"class":167}," pd.to_numeric(raw[",[33,96292,16465],{"class":54},[33,96294,27598],{"class":167},[33,96296,1053],{"class":50},[33,96298,221],{"class":167},[33,96300,96301,96304,96306,96308,96311,96313,96315,96317,96319,96321,96323,96325],{"class":35,"line":95},[33,96302,96303],{"class":167},"                                .str.replace(",[33,96305,11977],{"class":163},[33,96307,274],{"class":54},[33,96309,96310],{"class":50},"[$,]",[33,96312,274],{"class":54},[33,96314,365],{"class":167},[33,96316,3198],{"class":54},[33,96318,365],{"class":167},[33,96320,11993],{"class":238},[33,96322,242],{"class":163},[33,96324,855],{"class":50},[33,96326,1506],{"class":167},[33,96328,96329,96332,96334,96336],{"class":35,"line":101},[33,96330,96331],{"class":238},"                                
errors",[33,96333,242],{"class":163},[33,96335,12107],{"class":54},[33,96337,221],{"class":167},[33,96339,96340,96342,96344,96346,96348,96350,96352,96354,96356,96358,96360],{"class":35,"line":171},[33,96341,13789],{"class":167},[33,96343,16659],{"class":54},[33,96345,48135],{"class":167},[33,96347,242],{"class":163},[33,96349,96290],{"class":167},[33,96351,16659],{"class":54},[33,96353,8314],{"class":167},[33,96355,8317],{"class":238},[33,96357,242],{"class":163},[33,96359,12107],{"class":54},[33,96361,221],{"class":167},[33,96363,96364],{"class":35,"line":179},[33,96365,92],{"emptyLinePlaceholder":91},[33,96367,96368,96371,96373,96375],{"class":35,"line":187},[33,96369,96370],{"class":167},"n_before ",[33,96372,242],{"class":163},[33,96374,4037],{"class":50},[33,96376,96377],{"class":167},"(raw)\n",[33,96379,96380,96383,96385,96387,96389,96391,96393,96395,96397,96399,96401,96403,96405,96407],{"class":35,"line":201},[33,96381,96382],{"class":167},"raw.dropna(",[33,96384,28066],{"class":238},[33,96386,242],{"class":163},[33,96388,8309],{"class":167},[33,96390,4101],{"class":54},[33,96392,365],{"class":167},[33,96394,16465],{"class":54},[33,96396,365],{"class":167},[33,96398,16659],{"class":54},[33,96400,8314],{"class":167},[33,96402,10891],{"class":238},[33,96404,242],{"class":163},[33,96406,855],{"class":50},[33,96408,221],{"class":167},[33,96410,96411,96414,96416,96419,96421,96423],{"class":35,"line":206},[33,96412,96413],{"class":167},"n_dropped ",[33,96415,242],{"class":163},[33,96417,96418],{"class":167}," n_before ",[33,96420,4126],{"class":163},[33,96422,4037],{"class":50},[33,96424,96377],{"class":167},[33,96426,96427,96429],{"class":35,"line":224},[33,96428,2491],{"class":163},[33,96430,96431],{"class":167}," n_dropped:\n",[33,96433,96434,96436,96438,96440,96443,96445,96448,96450,96453],{"class":35,"line":229},[33,96435,7268],{"class":50},[33,96437,602],{"class":167},[33,96439,4059],{"class":163},[33,96441,96442],{"class":54},"\"Warning: dropped 
",[33,96444,1115],{"class":50},[33,96446,96447],{"class":167},"n_dropped",[33,96449,1121],{"class":50},[33,96451,96452],{"class":54}," rows with unparseable values\"",[33,96454,221],{"class":167},[33,96456,96457],{"class":35,"line":235},[33,96458,92],{"emptyLinePlaceholder":91},[33,96460,96461,96463,96466,96468,96470,96472,96474,96477,96480,96483,96485,96487],{"class":35,"line":250},[33,96462,13789],{"class":167},[33,96464,96465],{"class":54},"\"month\"",[33,96467,763],{"class":167},[33,96469,242],{"class":163},[33,96471,51120],{"class":167},[33,96473,4101],{"class":54},[33,96475,96476],{"class":167},"].dt.to_period(",[33,96478,96479],{"class":54},"\"M\"",[33,96481,96482],{"class":167},").astype(",[33,96484,1053],{"class":50},[33,96486,10922],{"class":167},[33,96488,96489],{"class":39},"# \"2026-01\" etc.\n",[18,96491,96493],{"id":96492},"step-3-aggregate-with-groupby","Step 3 — Aggregate with groupby",[14,96495,96496],{},"Two aggregations feed two separate sheets: a regional summary and a month-over-month trend.",[23,96498,96500],{"className":126,"code":96499,"language":47,"meta":28,"style":28},"# pip install pandas\n# Continues from Step 2\n\n# Regional summary\nby_region = (\n    raw.groupby(\"region\", as_index=False)\n    .agg(\n        total_revenue  =(\"revenue\", \"sum\"),\n        total_units    =(\"units\",   \"sum\"),\n        transactions   =(\"revenue\", \"count\"),\n    )\n    .sort_values(\"total_revenue\", ascending=False)\n)\nby_region[\"avg_order\"] = (\n    by_region[\"total_revenue\"] \u002F by_region[\"transactions\"]\n).round(2)\n\n# Monthly trend\nby_month = (\n    raw.groupby(\"month\", as_index=False)\n    .agg(total_revenue=(\"revenue\", \"sum\"),\n         total_units  =(\"units\",   \"sum\"))\n    .sort_values(\"month\")\n)\n\n# Month-on-month revenue change\nby_month[\"mom_change\"] = 
by_month[\"total_revenue\"].pct_change().round(4)\n\nprint(by_region)\nprint(by_month)\n",[30,96501,96502,96506,96511,96515,96520,96529,96547,96552,96569,96586,96604,96608,96626,96630,96644,96663,96671,96675,96680,96689,96705,96724,96741,96749,96753,96757,96762,96786,96790,96797],{"__ignoreMap":28},[33,96503,96504],{"class":35,"line":36},[33,96505,8895],{"class":39},[33,96507,96508],{"class":35,"line":43},[33,96509,96510],{"class":39},"# Continues from Step 2\n",[33,96512,96513],{"class":35,"line":61},[33,96514,92],{"emptyLinePlaceholder":91},[33,96516,96517],{"class":35,"line":73},[33,96518,96519],{"class":39},"# Regional summary\n",[33,96521,96522,96525,96527],{"class":35,"line":88},[33,96523,96524],{"class":167},"by_region ",[33,96526,242],{"class":163},[33,96528,1415],{"class":167},[33,96530,96531,96534,96536,96538,96541,96543,96545],{"class":35,"line":95},[33,96532,96533],{"class":167},"    raw.groupby(",[33,96535,16649],{"class":54},[33,96537,365],{"class":167},[33,96539,96540],{"class":238},"as_index",[33,96542,242],{"class":163},[33,96544,902],{"class":50},[33,96546,221],{"class":167},[33,96548,96549],{"class":35,"line":101},[33,96550,96551],{"class":167},"    .agg(\n",[33,96553,96554,96557,96559,96561,96563,96565,96567],{"class":35,"line":171},[33,96555,96556],{"class":238},"        total_revenue",[33,96558,17208],{"class":163},[33,96560,602],{"class":167},[33,96562,16465],{"class":54},[33,96564,365],{"class":167},[33,96566,18522],{"class":54},[33,96568,1506],{"class":167},[33,96570,96571,96574,96576,96578,96580,96582,96584],{"class":35,"line":179},[33,96572,96573],{"class":238},"        total_units",[33,96575,20470],{"class":163},[33,96577,602],{"class":167},[33,96579,16659],{"class":54},[33,96581,1166],{"class":167},[33,96583,18522],{"class":54},[33,96585,1506],{"class":167},[33,96587,96588,96591,96593,96595,96597,96599,96602],{"class":35,"line":187},[33,96589,96590],{"class":238},"        
transactions",[33,96592,21012],{"class":163},[33,96594,602],{"class":167},[33,96596,16465],{"class":54},[33,96598,365],{"class":167},[33,96600,96601],{"class":54},"\"count\"",[33,96603,1506],{"class":167},[33,96605,96606],{"class":35,"line":201},[33,96607,1202],{"class":167},[33,96609,96610,96613,96616,96618,96620,96622,96624],{"class":35,"line":206},[33,96611,96612],{"class":167},"    .sort_values(",[33,96614,96615],{"class":54},"\"total_revenue\"",[33,96617,365],{"class":167},[33,96619,18572],{"class":238},[33,96621,242],{"class":163},[33,96623,902],{"class":50},[33,96625,221],{"class":167},[33,96627,96628],{"class":35,"line":224},[33,96629,221],{"class":167},[33,96631,96632,96635,96638,96640,96642],{"class":35,"line":229},[33,96633,96634],{"class":167},"by_region[",[33,96636,96637],{"class":54},"\"avg_order\"",[33,96639,763],{"class":167},[33,96641,242],{"class":163},[33,96643,1415],{"class":167},[33,96645,96646,96649,96651,96653,96655,96658,96661],{"class":35,"line":235},[33,96647,96648],{"class":167},"    by_region[",[33,96650,96615],{"class":54},[33,96652,763],{"class":167},[33,96654,1351],{"class":163},[33,96656,96657],{"class":167}," by_region[",[33,96659,96660],{"class":54},"\"transactions\"",[33,96662,9202],{"class":167},[33,96664,96665,96667,96669],{"class":35,"line":250},[33,96666,59790],{"class":167},[33,96668,1533],{"class":50},[33,96670,221],{"class":167},[33,96672,96673],{"class":35,"line":266},[33,96674,92],{"emptyLinePlaceholder":91},[33,96676,96677],{"class":35,"line":290},[33,96678,96679],{"class":39},"# Monthly trend\n",[33,96681,96682,96685,96687],{"class":35,"line":295},[33,96683,96684],{"class":167},"by_month 
",[33,96686,242],{"class":163},[33,96688,1415],{"class":167},[33,96690,96691,96693,96695,96697,96699,96701,96703],{"class":35,"line":300},[33,96692,96533],{"class":167},[33,96694,96465],{"class":54},[33,96696,365],{"class":167},[33,96698,96540],{"class":238},[33,96700,242],{"class":163},[33,96702,902],{"class":50},[33,96704,221],{"class":167},[33,96706,96707,96710,96712,96714,96716,96718,96720,96722],{"class":35,"line":317},[33,96708,96709],{"class":167},"    .agg(",[33,96711,18407],{"class":238},[33,96713,242],{"class":163},[33,96715,602],{"class":167},[33,96717,16465],{"class":54},[33,96719,365],{"class":167},[33,96721,18522],{"class":54},[33,96723,1506],{"class":167},[33,96725,96726,96729,96731,96733,96735,96737,96739],{"class":35,"line":332},[33,96727,96728],{"class":238},"         total_units",[33,96730,17208],{"class":163},[33,96732,602],{"class":167},[33,96734,16659],{"class":54},[33,96736,1166],{"class":167},[33,96738,18522],{"class":54},[33,96740,371],{"class":167},[33,96742,96743,96745,96747],{"class":35,"line":347},[33,96744,96612],{"class":167},[33,96746,96465],{"class":54},[33,96748,221],{"class":167},[33,96750,96751],{"class":35,"line":374},[33,96752,221],{"class":167},[33,96754,96755],{"class":35,"line":397},[33,96756,92],{"emptyLinePlaceholder":91},[33,96758,96759],{"class":35,"line":653},[33,96760,96761],{"class":39},"# Month-on-month revenue change\n",[33,96763,96764,96767,96770,96772,96774,96777,96779,96782,96784],{"class":35,"line":667},[33,96765,96766],{"class":167},"by_month[",[33,96768,96769],{"class":54},"\"mom_change\"",[33,96771,763],{"class":167},[33,96773,242],{"class":163},[33,96775,96776],{"class":167}," 
by_month[",[33,96778,96615],{"class":54},[33,96780,96781],{"class":167},"].pct_change().round(",[33,96783,1503],{"class":50},[33,96785,221],{"class":167},[33,96787,96788],{"class":35,"line":675},[33,96789,92],{"emptyLinePlaceholder":91},[33,96791,96792,96794],{"class":35,"line":689},[33,96793,13474],{"class":50},[33,96795,96796],{"class":167},"(by_region)\n",[33,96798,96799,96801],{"class":35,"line":703},[33,96800,13474],{"class":50},[33,96802,96803],{"class":167},"(by_month)\n",[14,96805,96806,96807,96810,96811,96814,96815,96817],{},"Always call ",[30,96808,96809],{},".reset_index()"," — or pass ",[30,96812,96813],{},"as_index=False"," — after ",[30,96816,21820],{},". Failing to do so produces a MultiIndex that serialises to Excel with a blank header row.",[18,96819,96821],{"id":96820},"step-4-write-the-multi-sheet-workbook","Step 4 — Write the Multi-Sheet Workbook",[14,96823,96824,96827,96828,96830,96831,96834],{},[30,96825,96826],{},"pandas.ExcelWriter"," with the ",[30,96829,22009],{}," engine gives direct access to the workbook object for styling after ",[30,96832,96833],{},"to_excel"," writes the data.",[23,96836,96838],{"className":126,"code":96837,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nfrom datetime import datetime\nimport pandas as pd\nfrom openpyxl.styles import Font, PatternFill, Alignment, numbers\n\nOUTPUT_DIR = Path(\"reports\")\nOUTPUT_DIR.mkdir(exist_ok=True)\ntimestamp  = datetime.now().strftime(\"%Y%m\")\nOUTPUT     = OUTPUT_DIR \u002F f\"monthly_sales_{timestamp}.xlsx\"\n\nHEADER_COLOR = \"2563eb\"\nHEADER_FONT  = \"ffffff\"\n\ntry:\n    with pd.ExcelWriter(OUTPUT, engine=\"openpyxl\") as writer:\n        by_region.to_excel(writer, sheet_name=\"Regional Summary\", index=False)\n        by_month.to_excel( writer, sheet_name=\"Monthly Trend\",    index=False)\n\n        wb = writer.book\n\n        for sheet_name, frame in [(\"Regional Summary\", by_region),\n                                   
(\"Monthly Trend\",    by_month)]:\n            ws = wb[sheet_name]\n\n            # Style header row\n            hdr_fill = PatternFill(\"solid\", fgColor=HEADER_COLOR)\n            hdr_font = Font(bold=True, color=HEADER_FONT, size=12)\n            for cell in ws[1]:\n                cell.fill      = hdr_fill\n                cell.font      = hdr_font\n                cell.alignment = Alignment(horizontal=\"center\")\n\n            # Freeze header\n            ws.freeze_panes = \"A2\"\n\n            # Auto-width columns (approximate)\n            for col in ws.columns:\n                max_len = max(\n                    len(str(cell.value)) if cell.value is not None else 0\n                    for cell in col\n                )\n                ws.column_dimensions[col[0].column_letter].width = min(max_len + 4, 30)\n\n        # Currency format for revenue columns\n        ws_r = wb[\"Regional Summary\"]\n        n    = len(by_region)\n        for row in ws_r.iter_rows(min_row=2, max_row=n + 1, min_col=2, max_col=2):\n            for cell in row:\n                cell.number_format = \"#,##0.00\"\n        for row in ws_r.iter_rows(min_row=2, max_row=n + 1, min_col=5, max_col=5):\n            for cell in row:\n                cell.number_format = \"#,##0.00\"\n\n        # Totals row\n        total_row = n + 2\n        ws_r.cell(total_row, 1).value = \"TOTAL\"\n        ws_r.cell(total_row, 1).font  = Font(bold=True)\n        ws_r.cell(total_row, 2).value = by_region[\"total_revenue\"].sum()\n        ws_r.cell(total_row, 2).font  = Font(bold=True)\n        ws_r.cell(total_row, 2).number_format = \"#,##0.00\"\n        ws_r.cell(total_row, 3).value = int(by_region[\"total_units\"].sum())\n        ws_r.cell(total_row, 3).font  = Font(bold=True)\n\nexcept PermissionError:\n    raise SystemExit(f\"Close {OUTPUT} in Excel and retry\")\nexcept Exception as exc:\n    raise SystemExit(f\"Report failed: {exc}\")\n\nprint(f\"Saved: 
{OUTPUT}\")\n",[30,96839,96840,96844,96854,96865,96875,96885,96889,96902,96916,96931,96957,96961,96971,96980,96984,96990,97013,97035,97057,97061,97071,97075,97091,97101,97111,97115,97120,97141,97174,97188,97198,97208,97225,97229,97234,97244,97248,97253,97264,97275,97302,97312,97317,97344,97348,97353,97366,97377,97426,97436,97445,97491,97501,97509,97513,97518,97532,97546,97567,97583,97603,97616,97637,97657,97661,97669,97690,97700,97723,97727],{"__ignoreMap":28},[33,96841,96842],{"class":35,"line":36},[33,96843,3952],{"class":39},[33,96845,96846,96848,96850,96852],{"class":35,"line":43},[33,96847,190],{"class":163},[33,96849,193],{"class":167},[33,96851,164],{"class":163},[33,96853,198],{"class":167},[33,96855,96856,96858,96860,96862],{"class":35,"line":61},[33,96857,190],{"class":163},[33,96859,3881],{"class":167},[33,96861,164],{"class":163},[33,96863,96864],{"class":167}," datetime\n",[33,96866,96867,96869,96871,96873],{"class":35,"line":73},[33,96868,164],{"class":163},[33,96870,492],{"class":167},[33,96872,495],{"class":163},[33,96874,498],{"class":167},[33,96876,96877,96879,96881,96883],{"class":35,"line":88},[33,96878,190],{"class":163},[33,96880,17115],{"class":167},[33,96882,164],{"class":163},[33,96884,17120],{"class":167},[33,96886,96887],{"class":35,"line":95},[33,96888,92],{"emptyLinePlaceholder":91},[33,96890,96891,96893,96895,96897,96900],{"class":35,"line":101},[33,96892,4615],{"class":50},[33,96894,212],{"class":163},[33,96896,215],{"class":167},[33,96898,96899],{"class":54},"\"reports\"",[33,96901,221],{"class":167},[33,96903,96904,96906,96908,96910,96912,96914],{"class":35,"line":171},[33,96905,4615],{"class":50},[33,96907,1078],{"class":167},[33,96909,878],{"class":238},[33,96911,242],{"class":163},[33,96913,855],{"class":50},[33,96915,221],{"class":167},[33,96917,96918,96921,96923,96926,96929],{"class":35,"line":179},[33,96919,96920],{"class":167},"timestamp  ",[33,96922,242],{"class":163},[33,96924,96925],{"class":167}," 
datetime.now().strftime(",[33,96927,96928],{"class":54},"\"%Y%m\"",[33,96930,221],{"class":167},[33,96932,96933,96936,96939,96941,96943,96945,96948,96950,96953,96955],{"class":35,"line":187},[33,96934,96935],{"class":50},"OUTPUT",[33,96937,96938],{"class":163},"     =",[33,96940,50349],{"class":50},[33,96942,1107],{"class":163},[33,96944,1110],{"class":163},[33,96946,96947],{"class":54},"\"monthly_sales_",[33,96949,1115],{"class":50},[33,96951,96952],{"class":167},"timestamp",[33,96954,1121],{"class":50},[33,96956,6410],{"class":54},[33,96958,96959],{"class":35,"line":201},[33,96960,92],{"emptyLinePlaceholder":91},[33,96962,96963,96966,96968],{"class":35,"line":206},[33,96964,96965],{"class":50},"HEADER_COLOR",[33,96967,212],{"class":163},[33,96969,96970],{"class":54}," \"2563eb\"\n",[33,96972,96973,96975,96977],{"class":35,"line":224},[33,96974,17228],{"class":50},[33,96976,17208],{"class":163},[33,96978,96979],{"class":54}," \"ffffff\"\n",[33,96981,96982],{"class":35,"line":229},[33,96983,92],{"emptyLinePlaceholder":91},[33,96985,96986,96988],{"class":35,"line":235},[33,96987,35574],{"class":163},[33,96989,574],{"class":167},[33,96991,96992,96994,96997,96999,97001,97003,97005,97007,97009,97011],{"class":35,"line":250},[33,96993,1635],{"class":163},[33,96995,96996],{"class":167}," pd.ExcelWriter(",[33,96998,96935],{"class":50},[33,97000,365],{"class":167},[33,97002,17351],{"class":238},[33,97004,242],{"class":163},[33,97006,17356],{"class":54},[33,97008,1649],{"class":167},[33,97010,495],{"class":163},[33,97012,17363],{"class":167},[33,97014,97015,97018,97020,97022,97025,97027,97029,97031,97033],{"class":35,"line":266},[33,97016,97017],{"class":167},"        by_region.to_excel(writer, ",[33,97019,17371],{"class":238},[33,97021,242],{"class":163},[33,97023,97024],{"class":54},"\"Regional 
Summary\"",[33,97026,365],{"class":167},[33,97028,897],{"class":238},[33,97030,242],{"class":163},[33,97032,902],{"class":50},[33,97034,221],{"class":167},[33,97036,97037,97040,97042,97044,97047,97049,97051,97053,97055],{"class":35,"line":290},[33,97038,97039],{"class":167},"        by_month.to_excel( writer, ",[33,97041,17371],{"class":238},[33,97043,242],{"class":163},[33,97045,97046],{"class":54},"\"Monthly Trend\"",[33,97048,38342],{"class":167},[33,97050,897],{"class":238},[33,97052,242],{"class":163},[33,97054,902],{"class":50},[33,97056,221],{"class":167},[33,97058,97059],{"class":35,"line":295},[33,97060,92],{"emptyLinePlaceholder":91},[33,97062,97063,97066,97068],{"class":35,"line":300},[33,97064,97065],{"class":167},"        wb ",[33,97067,242],{"class":163},[33,97069,97070],{"class":167}," writer.book\n",[33,97072,97073],{"class":35,"line":317},[33,97074,92],{"emptyLinePlaceholder":91},[33,97076,97077,97079,97082,97084,97086,97088],{"class":35,"line":332},[33,97078,5973],{"class":163},[33,97080,97081],{"class":167}," sheet_name, frame ",[33,97083,662],{"class":163},[33,97085,79351],{"class":167},[33,97087,97024],{"class":54},[33,97089,97090],{"class":167},", by_region),\n",[33,97092,97093,97096,97098],{"class":35,"line":347},[33,97094,97095],{"class":167},"                                   (",[33,97097,97046],{"class":54},[33,97099,97100],{"class":167},",    by_month)]:\n",[33,97102,97103,97106,97108],{"class":35,"line":374},[33,97104,97105],{"class":167},"            ws ",[33,97107,242],{"class":163},[33,97109,97110],{"class":167}," wb[sheet_name]\n",[33,97112,97113],{"class":35,"line":397},[33,97114,92],{"emptyLinePlaceholder":91},[33,97116,97117],{"class":35,"line":653},[33,97118,97119],{"class":39},"            # Style header row\n",[33,97121,97122,97125,97127,97129,97131,97133,97135,97137,97139],{"class":35,"line":667},[33,97123,97124],{"class":167},"            hdr_fill 
",[33,97126,242],{"class":163},[33,97128,17185],{"class":167},[33,97130,17188],{"class":54},[33,97132,365],{"class":167},[33,97134,17193],{"class":238},[33,97136,242],{"class":163},[33,97138,96965],{"class":50},[33,97140,221],{"class":167},[33,97142,97143,97146,97148,97150,97152,97154,97156,97158,97160,97162,97164,97166,97168,97170,97172],{"class":35,"line":675},[33,97144,97145],{"class":167},"            hdr_font ",[33,97147,242],{"class":163},[33,97149,17233],{"class":167},[33,97151,17236],{"class":238},[33,97153,242],{"class":163},[33,97155,855],{"class":50},[33,97157,365],{"class":167},[33,97159,17245],{"class":238},[33,97161,242],{"class":163},[33,97163,17228],{"class":50},[33,97165,365],{"class":167},[33,97167,17255],{"class":238},[33,97169,242],{"class":163},[33,97171,55650],{"class":50},[33,97173,221],{"class":167},[33,97175,97176,97178,97180,97182,97184,97186],{"class":35,"line":689},[33,97177,1793],{"class":163},[33,97179,17467],{"class":167},[33,97181,662],{"class":163},[33,97183,17472],{"class":167},[33,97185,734],{"class":50},[33,97187,17477],{"class":167},[33,97189,97190,97193,97195],{"class":35,"line":703},[33,97191,97192],{"class":167},"                cell.fill      ",[33,97194,242],{"class":163},[33,97196,97197],{"class":167}," hdr_fill\n",[33,97199,97200,97203,97205],{"class":35,"line":714},[33,97201,97202],{"class":167},"                cell.font      ",[33,97204,242],{"class":163},[33,97206,97207],{"class":167}," hdr_font\n",[33,97209,97210,97213,97215,97217,97219,97221,97223],{"class":35,"line":723},[33,97211,97212],{"class":167},"                cell.alignment ",[33,97214,242],{"class":163},[33,97216,17507],{"class":167},[33,97218,17510],{"class":238},[33,97220,242],{"class":163},[33,97222,17515],{"class":54},[33,97224,221],{"class":167},[33,97226,97227],{"class":35,"line":754},[33,97228,92],{"emptyLinePlaceholder":91},[33,97230,97231],{"class":35,"line":771},[33,97232,97233],{"class":39},"            # Freeze 
header\n",[33,97235,97236,97239,97241],{"class":35,"line":777},[33,97237,97238],{"class":167},"            ws.freeze_panes ",[33,97240,242],{"class":163},[33,97242,97243],{"class":54}," \"A2\"\n",[33,97245,97246],{"class":35,"line":788},[33,97247,92],{"emptyLinePlaceholder":91},[33,97249,97250],{"class":35,"line":804},[33,97251,97252],{"class":39},"            # Auto-width columns (approximate)\n",[33,97254,97255,97257,97259,97261],{"class":35,"line":809},[33,97256,1793],{"class":163},[33,97258,7985],{"class":167},[33,97260,662],{"class":163},[33,97262,97263],{"class":167}," ws.columns:\n",[33,97265,97266,97269,97271,97273],{"class":35,"line":819},[33,97267,97268],{"class":167},"                max_len ",[33,97270,242],{"class":163},[33,97272,45817],{"class":50},[33,97274,7637],{"class":167},[33,97276,97277,97280,97282,97284,97287,97289,97292,97294,97296,97298,97300],{"class":35,"line":829},[33,97278,97279],{"class":50},"                    len",[33,97281,602],{"class":167},[33,97283,1053],{"class":50},[33,97285,97286],{"class":167},"(cell.value)) ",[33,97288,2491],{"class":163},[33,97290,97291],{"class":167}," cell.value ",[33,97293,3847],{"class":163},[33,97295,620],{"class":163},[33,97297,7657],{"class":50},[33,97299,15715],{"class":163},[33,97301,28914],{"class":50},[33,97303,97304,97306,97308,97310],{"class":35,"line":834},[33,97305,13668],{"class":163},[33,97307,17467],{"class":167},[33,97309,662],{"class":163},[33,97311,23872],{"class":167},[33,97313,97314],{"class":35,"line":839},[33,97315,97316],{"class":167},"                )\n",[33,97318,97319,97322,97324,97327,97329,97331,97334,97336,97338,97340,97342],{"class":35,"line":860},[33,97320,97321],{"class":167},"                ws.column_dimensions[col[",[33,97323,748],{"class":50},[33,97325,97326],{"class":167},"].column_letter].width ",[33,97328,242],{"class":163},[33,97330,73775],{"class":50},[33,97332,97333],{"class":167},"(max_len 
",[33,97335,1811],{"class":163},[33,97337,82708],{"class":50},[33,97339,365],{"class":167},[33,97341,1543],{"class":50},[33,97343,221],{"class":167},[33,97345,97346],{"class":35,"line":887},[33,97347,92],{"emptyLinePlaceholder":91},[33,97349,97350],{"class":35,"line":907},[33,97351,97352],{"class":39},"        # Currency format for revenue columns\n",[33,97354,97355,97358,97360,97362,97364],{"class":35,"line":1826},[33,97356,97357],{"class":167},"        ws_r ",[33,97359,242],{"class":163},[33,97361,17447],{"class":167},[33,97363,97024],{"class":54},[33,97365,9202],{"class":167},[33,97367,97368,97371,97373,97375],{"class":35,"line":1844},[33,97369,97370],{"class":167},"        n    ",[33,97372,242],{"class":163},[33,97374,4037],{"class":50},[33,97376,96796],{"class":167},[33,97378,97379,97381,97383,97385,97388,97390,97392,97394,97396,97399,97401,97404,97406,97408,97410,97412,97414,97416,97418,97420,97422,97424],{"class":35,"line":1858},[33,97380,5973],{"class":163},[33,97382,3844],{"class":167},[33,97384,662],{"class":163},[33,97386,97387],{"class":167}," ws_r.iter_rows(",[33,97389,17642],{"class":238},[33,97391,242],{"class":163},[33,97393,1533],{"class":50},[33,97395,365],{"class":167},[33,97397,97398],{"class":238},"max_row",[33,97400,242],{"class":163},[33,97402,97403],{"class":167},"n ",[33,97405,1811],{"class":163},[33,97407,1814],{"class":50},[33,97409,365],{"class":167},[33,97411,17651],{"class":238},[33,97413,242],{"class":163},[33,97415,1533],{"class":50},[33,97417,365],{"class":167},[33,97419,17659],{"class":238},[33,97421,242],{"class":163},[33,97423,1533],{"class":50},[33,97425,1737],{"class":167},[33,97427,97428,97430,97432,97434],{"class":35,"line":1871},[33,97429,1793],{"class":163},[33,97431,17467],{"class":167},[33,97433,662],{"class":163},[33,97435,17675],{"class":167},[33,97437,97438,97440,97442],{"class":35,"line":1877},[33,97439,17680],{"class":167},[33,97441,242],{"class":163},[33,97443,97444],{"class":54}," 
\"#,##0.00\"\n",[33,97446,97447,97449,97451,97453,97455,97457,97459,97461,97463,97465,97467,97469,97471,97473,97475,97477,97479,97481,97483,97485,97487,97489],{"class":35,"line":1883},[33,97448,5973],{"class":163},[33,97450,3844],{"class":167},[33,97452,662],{"class":163},[33,97454,97387],{"class":167},[33,97456,17642],{"class":238},[33,97458,242],{"class":163},[33,97460,1533],{"class":50},[33,97462,365],{"class":167},[33,97464,97398],{"class":238},[33,97466,242],{"class":163},[33,97468,97403],{"class":167},[33,97470,1811],{"class":163},[33,97472,1814],{"class":50},[33,97474,365],{"class":167},[33,97476,17651],{"class":238},[33,97478,242],{"class":163},[33,97480,1153],{"class":50},[33,97482,365],{"class":167},[33,97484,17659],{"class":238},[33,97486,242],{"class":163},[33,97488,1153],{"class":50},[33,97490,1737],{"class":167},[33,97492,97493,97495,97497,97499],{"class":35,"line":1915},[33,97494,1793],{"class":163},[33,97496,17467],{"class":167},[33,97498,662],{"class":163},[33,97500,17675],{"class":167},[33,97502,97503,97505,97507],{"class":35,"line":1926},[33,97504,17680],{"class":167},[33,97506,242],{"class":163},[33,97508,97444],{"class":54},[33,97510,97511],{"class":35,"line":1932},[33,97512,92],{"emptyLinePlaceholder":91},[33,97514,97515],{"class":35,"line":1938},[33,97516,97517],{"class":39},"        # Totals row\n",[33,97519,97520,97523,97525,97527,97529],{"class":35,"line":1950},[33,97521,97522],{"class":167},"        total_row ",[33,97524,242],{"class":163},[33,97526,48941],{"class":167},[33,97528,1811],{"class":163},[33,97530,97531],{"class":50}," 2\n",[33,97533,97534,97537,97539,97542,97544],{"class":35,"line":1958},[33,97535,97536],{"class":167},"        ws_r.cell(total_row, ",[33,97538,734],{"class":50},[33,97540,97541],{"class":167},").value 
",[33,97543,242],{"class":163},[33,97545,17912],{"class":54},[33,97547,97548,97550,97552,97555,97557,97559,97561,97563,97565],{"class":35,"line":4904},[33,97549,97536],{"class":167},[33,97551,734],{"class":50},[33,97553,97554],{"class":167},").font  ",[33,97556,242],{"class":163},[33,97558,17233],{"class":167},[33,97560,17236],{"class":238},[33,97562,242],{"class":163},[33,97564,855],{"class":50},[33,97566,221],{"class":167},[33,97568,97569,97571,97573,97575,97577,97579,97581],{"class":35,"line":4909},[33,97570,97536],{"class":167},[33,97572,1533],{"class":50},[33,97574,97541],{"class":167},[33,97576,242],{"class":163},[33,97578,96657],{"class":167},[33,97580,96615],{"class":54},[33,97582,18333],{"class":167},[33,97584,97585,97587,97589,97591,97593,97595,97597,97599,97601],{"class":35,"line":4915},[33,97586,97536],{"class":167},[33,97588,1533],{"class":50},[33,97590,97554],{"class":167},[33,97592,242],{"class":163},[33,97594,17233],{"class":167},[33,97596,17236],{"class":238},[33,97598,242],{"class":163},[33,97600,855],{"class":50},[33,97602,221],{"class":167},[33,97604,97605,97607,97609,97612,97614],{"class":35,"line":4925},[33,97606,97536],{"class":167},[33,97608,1533],{"class":50},[33,97610,97611],{"class":167},").number_format 
",[33,97613,242],{"class":163},[33,97615,97444],{"class":54},[33,97617,97618,97620,97622,97624,97626,97628,97631,97634],{"class":35,"line":4935},[33,97619,97536],{"class":167},[33,97621,10258],{"class":50},[33,97623,97541],{"class":167},[33,97625,242],{"class":163},[33,97627,3149],{"class":50},[33,97629,97630],{"class":167},"(by_region[",[33,97632,97633],{"class":54},"\"total_units\"",[33,97635,97636],{"class":167},"].sum())\n",[33,97638,97639,97641,97643,97645,97647,97649,97651,97653,97655],{"class":35,"line":4941},[33,97640,97536],{"class":167},[33,97642,10258],{"class":50},[33,97644,97554],{"class":167},[33,97646,242],{"class":163},[33,97648,17233],{"class":167},[33,97650,17236],{"class":238},[33,97652,242],{"class":163},[33,97654,855],{"class":50},[33,97656,221],{"class":167},[33,97658,97659],{"class":35,"line":4950},[33,97660,92],{"emptyLinePlaceholder":91},[33,97662,97663,97665,97667],{"class":35,"line":4960},[33,97664,35726],{"class":163},[33,97666,17393],{"class":50},[33,97668,574],{"class":167},[33,97670,97671,97673,97675,97677,97679,97682,97685,97688],{"class":35,"line":4965},[33,97672,35742],{"class":163},[33,97674,16617],{"class":50},[33,97676,602],{"class":167},[33,97678,4059],{"class":163},[33,97680,97681],{"class":54},"\"Close ",[33,97683,97684],{"class":50},"{OUTPUT}",[33,97686,97687],{"class":54}," in Excel and retry\"",[33,97689,221],{"class":167},[33,97691,97692,97694,97696,97698],{"class":35,"line":4971},[33,97693,35726],{"class":163},[33,97695,783],{"class":50},[33,97697,1852],{"class":163},[33,97699,1855],{"class":167},[33,97701,97702,97704,97706,97708,97710,97713,97715,97717,97719,97721],{"class":35,"line":4983},[33,97703,35742],{"class":163},[33,97705,16617],{"class":50},[33,97707,602],{"class":167},[33,97709,4059],{"class":163},[33,97711,97712],{"class":54},"\"Report failed: 
",[33,97714,1115],{"class":50},[33,97716,6565],{"class":167},[33,97718,1121],{"class":50},[33,97720,274],{"class":54},[33,97722,221],{"class":167},[33,97724,97725],{"class":35,"line":4988},[33,97726,92],{"emptyLinePlaceholder":91},[33,97728,97729,97731,97733,97735,97738,97740,97742],{"class":35,"line":4993},[33,97730,13474],{"class":50},[33,97732,602],{"class":167},[33,97734,4059],{"class":163},[33,97736,97737],{"class":54},"\"Saved: ",[33,97739,97684],{"class":50},[33,97741,274],{"class":54},[33,97743,221],{"class":167},[18,97745,35802],{"id":35801},[424,97747,97749],{"id":97748},"variant-a-mixed-date-formats-across-files","Variant A — Mixed Date Formats Across Files",[14,97751,97752,97753,97756,97757,365,97760,97763,97764,20891],{},"If some files use ",[30,97754,97755],{},"MM\u002FDD\u002FYYYY"," and others ",[30,97758,97759],{},"YYYY-MM-DD",[30,97761,97762],{},"format=\"mixed\""," handles the detection automatically in pandas 2.0+. For older pandas, iterate with ",[30,97765,97766],{},"dateutil",[23,97768,97770],{"className":126,"code":97769,"language":47,"meta":28,"style":28},"# pip install pandas python-dateutil\nfrom dateutil import parser as du\n\nraw[\"date\"] = raw[\"date\"].apply(\n    lambda v: du.parse(str(v)) if pd.notna(v) else pd.NaT\n)\n",[30,97771,97772,97777,97794,97798,97815,97838],{"__ignoreMap":28},[33,97773,97774],{"class":35,"line":36},[33,97775,97776],{"class":39},"# pip install pandas python-dateutil\n",[33,97778,97779,97781,97784,97786,97789,97791],{"class":35,"line":43},[33,97780,190],{"class":163},[33,97782,97783],{"class":167}," dateutil ",[33,97785,164],{"class":163},[33,97787,97788],{"class":167}," parser ",[33,97790,495],{"class":163},[33,97792,97793],{"class":167}," 
du\n",[33,97795,97796],{"class":35,"line":61},[33,97797,92],{"emptyLinePlaceholder":91},[33,97799,97800,97802,97804,97806,97808,97810,97812],{"class":35,"line":73},[33,97801,13789],{"class":167},[33,97803,4101],{"class":54},[33,97805,763],{"class":167},[33,97807,242],{"class":163},[33,97809,51120],{"class":167},[33,97811,4101],{"class":54},[33,97813,97814],{"class":167},"].apply(\n",[33,97816,97817,97820,97823,97825,97828,97830,97833,97835],{"class":35,"line":88},[33,97818,97819],{"class":163},"    lambda",[33,97821,97822],{"class":167}," v: du.parse(",[33,97824,1053],{"class":50},[33,97826,97827],{"class":167},"(v)) ",[33,97829,2491],{"class":163},[33,97831,97832],{"class":167}," pd.notna(v) ",[33,97834,7489],{"class":163},[33,97836,97837],{"class":167}," pd.NaT\n",[33,97839,97840],{"class":35,"line":95},[33,97841,221],{"class":167},[424,97843,97845],{"id":97844},"variant-b-preserving-a-corporate-template","Variant B — Preserving a Corporate Template",[14,97847,97848,97849,75823,97852,97855,97856,97859,97860,97864],{},"Load the template with ",[30,97850,97851],{},"openpyxl.load_workbook",[30,97853,97854],{},"read_only=True",") and write into it. 
If you encounter ",[30,97857,97858],{},"AttributeError: read-only"," errors, the workbook was opened in the wrong mode — the ",[940,97861,97863],{"href":97862},"\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Ffix-openpyxl-read-only-mode-error\u002F","Fix openpyxl Read-Only Mode Error"," guide covers this exactly.",[23,97866,97868],{"className":126,"code":97867,"language":47,"meta":28,"style":28},"# pip install openpyxl pandas\nfrom pathlib import Path\nfrom openpyxl import load_workbook\nfrom openpyxl.utils.dataframe import dataframe_to_rows\n\nTEMPLATE = Path(\"templates\u002Fsales_template.xlsx\")\nOUTPUT   = Path(\"reports\u002Fsales_filled.xlsx\")\n\ntry:\n    wb = load_workbook(TEMPLATE)   # editable mode — no read_only kwarg\n    ws = wb[\"Data\"]\n\n    # Clear previous data rows, keep header\n    for row in ws.iter_rows(min_row=2, max_row=ws.max_row):\n        for cell in row:\n            cell.value = None\n\n    for r_idx, row_data in enumerate(\n        dataframe_to_rows(by_region, index=False, header=False), start=2\n    ):\n        for c_idx, val in enumerate(row_data, start=1):\n            ws.cell(row=r_idx, column=c_idx, value=val)\n\n    wb.save(OUTPUT)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"Template missing: {exc}\")\n\nprint(f\"Template filled: {OUTPUT}\")\n",[30,97869,97870,97875,97885,97895,97907,97911,97925,97938,97942,97948,97964,97976,97980,97985,98010,98020,98028,98032,98045,98073,98078,98100,98128,98132,98141,98151,98174,98178],{"__ignoreMap":28},[33,97871,97872],{"class":35,"line":36},[33,97873,97874],{"class":39},"# pip install openpyxl 
pandas\n",[33,97876,97877,97879,97881,97883],{"class":35,"line":43},[33,97878,190],{"class":163},[33,97880,193],{"class":167},[33,97882,164],{"class":163},[33,97884,198],{"class":167},[33,97886,97887,97889,97891,97893],{"class":35,"line":61},[33,97888,190],{"class":163},[33,97890,17103],{"class":167},[33,97892,164],{"class":163},[33,97894,17108],{"class":167},[33,97896,97897,97899,97902,97904],{"class":35,"line":73},[33,97898,190],{"class":163},[33,97900,97901],{"class":167}," openpyxl.utils.dataframe ",[33,97903,164],{"class":163},[33,97905,97906],{"class":167}," dataframe_to_rows\n",[33,97908,97909],{"class":35,"line":88},[33,97910,92],{"emptyLinePlaceholder":91},[33,97912,97913,97916,97918,97920,97923],{"class":35,"line":95},[33,97914,97915],{"class":50},"TEMPLATE",[33,97917,212],{"class":163},[33,97919,215],{"class":167},[33,97921,97922],{"class":54},"\"templates\u002Fsales_template.xlsx\"",[33,97924,221],{"class":167},[33,97926,97927,97929,97931,97933,97936],{"class":35,"line":101},[33,97928,96935],{"class":50},[33,97930,21012],{"class":163},[33,97932,215],{"class":167},[33,97934,97935],{"class":54},"\"reports\u002Fsales_filled.xlsx\"",[33,97937,221],{"class":167},[33,97939,97940],{"class":35,"line":171},[33,97941,92],{"emptyLinePlaceholder":91},[33,97943,97944,97946],{"class":35,"line":179},[33,97945,35574],{"class":163},[33,97947,574],{"class":167},[33,97949,97950,97952,97954,97957,97959,97961],{"class":35,"line":187},[33,97951,17432],{"class":167},[33,97953,242],{"class":163},[33,97955,97956],{"class":167}," load_workbook(",[33,97958,97915],{"class":50},[33,97960,12000],{"class":167},[33,97962,97963],{"class":39},"# editable mode — no read_only 
kwarg\n",[33,97965,97966,97968,97970,97972,97974],{"class":35,"line":201},[33,97967,17442],{"class":167},[33,97969,242],{"class":163},[33,97971,17447],{"class":167},[33,97973,17376],{"class":54},[33,97975,9202],{"class":167},[33,97977,97978],{"class":35,"line":206},[33,97979,92],{"emptyLinePlaceholder":91},[33,97981,97982],{"class":35,"line":224},[33,97983,97984],{"class":39},"    # Clear previous data rows, keep header\n",[33,97986,97987,97989,97991,97993,97995,97997,97999,98001,98003,98005,98007],{"class":35,"line":229},[33,97988,656],{"class":163},[33,97990,3844],{"class":167},[33,97992,662],{"class":163},[33,97994,17639],{"class":167},[33,97996,17642],{"class":238},[33,97998,242],{"class":163},[33,98000,1533],{"class":50},[33,98002,365],{"class":167},[33,98004,97398],{"class":238},[33,98006,242],{"class":163},[33,98008,98009],{"class":167},"ws.max_row):\n",[33,98011,98012,98014,98016,98018],{"class":35,"line":235},[33,98013,5973],{"class":163},[33,98015,17467],{"class":167},[33,98017,662],{"class":163},[33,98019,17675],{"class":167},[33,98021,98022,98024,98026],{"class":35,"line":250},[33,98023,17807],{"class":167},[33,98025,242],{"class":163},[33,98027,3852],{"class":50},[33,98029,98030],{"class":35,"line":266},[33,98031,92],{"emptyLinePlaceholder":91},[33,98033,98034,98036,98039,98041,98043],{"class":35,"line":290},[33,98035,656],{"class":163},[33,98037,98038],{"class":167}," r_idx, row_data ",[33,98040,662],{"class":163},[33,98042,7403],{"class":50},[33,98044,7637],{"class":167},[33,98046,98047,98050,98052,98054,98056,98058,98060,98062,98064,98066,98068,98070],{"class":35,"line":295},[33,98048,98049],{"class":167},"        dataframe_to_rows(by_region, 
",[33,98051,897],{"class":238},[33,98053,242],{"class":163},[33,98055,902],{"class":50},[33,98057,365],{"class":167},[33,98059,44427],{"class":238},[33,98061,242],{"class":163},[33,98063,902],{"class":50},[33,98065,18525],{"class":167},[33,98067,7409],{"class":238},[33,98069,242],{"class":163},[33,98071,98072],{"class":50},"2\n",[33,98074,98075],{"class":35,"line":300},[33,98076,98077],{"class":167},"    ):\n",[33,98079,98080,98082,98085,98087,98089,98092,98094,98096,98098],{"class":35,"line":317},[33,98081,5973],{"class":163},[33,98083,98084],{"class":167}," c_idx, val ",[33,98086,662],{"class":163},[33,98088,7403],{"class":50},[33,98090,98091],{"class":167},"(row_data, ",[33,98093,7409],{"class":238},[33,98095,242],{"class":163},[33,98097,734],{"class":50},[33,98099,1737],{"class":167},[33,98101,98102,98105,98108,98110,98113,98116,98118,98121,98123,98125],{"class":35,"line":332},[33,98103,98104],{"class":167},"            ws.cell(",[33,98106,98107],{"class":238},"row",[33,98109,242],{"class":163},[33,98111,98112],{"class":167},"r_idx, ",[33,98114,98115],{"class":238},"column",[33,98117,242],{"class":163},[33,98119,98120],{"class":167},"c_idx, ",[33,98122,67110],{"class":238},[33,98124,242],{"class":163},[33,98126,98127],{"class":167},"val)\n",[33,98129,98130],{"class":35,"line":347},[33,98131,92],{"emptyLinePlaceholder":91},[33,98133,98134,98137,98139],{"class":35,"line":374},[33,98135,98136],{"class":167},"    wb.save(",[33,98138,96935],{"class":50},[33,98140,221],{"class":167},[33,98142,98143,98145,98147,98149],{"class":35,"line":397},[33,98144,35726],{"class":163},[33,98146,2945],{"class":50},[33,98148,1852],{"class":163},[33,98150,1855],{"class":167},[33,98152,98153,98155,98157,98159,98161,98164,98166,98168,98170,98172],{"class":35,"line":653},[33,98154,35742],{"class":163},[33,98156,16617],{"class":50},[33,98158,602],{"class":167},[33,98160,4059],{"class":163},[33,98162,98163],{"class":54},"\"Template missing: 
",[33,98165,1115],{"class":50},[33,98167,6565],{"class":167},[33,98169,1121],{"class":50},[33,98171,274],{"class":54},[33,98173,221],{"class":167},[33,98175,98176],{"class":35,"line":667},[33,98177,92],{"emptyLinePlaceholder":91},[33,98179,98180,98182,98184,98186,98189,98191,98193],{"class":35,"line":675},[33,98181,13474],{"class":50},[33,98183,602],{"class":167},[33,98185,4059],{"class":163},[33,98187,98188],{"class":54},"\"Template filled: ",[33,98190,97684],{"class":50},[33,98192,274],{"class":54},[33,98194,221],{"class":167},[18,98196,9247],{"id":9246},[14,98198,98199],{},"Open the file programmatically to confirm structural correctness before treating the run as done:",[23,98201,98203],{"className":126,"code":98202,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\n\nOUTPUT = Path(\"reports\") \u002F f\"monthly_sales_{datetime.now().strftime('%Y%m')}.xlsx\"\n\nwb = load_workbook(OUTPUT, read_only=True)\nassert \"Regional Summary\" in wb.sheetnames, \"Missing Regional Summary sheet\"\nassert \"Monthly Trend\"    in wb.sheetnames, \"Missing Monthly Trend sheet\"\n\nws = wb[\"Regional Summary\"]\nrows = list(ws.iter_rows(values_only=True))\nassert len(rows) >= 2,          \"No data rows\"\nassert rows[0][0] == \"region\",  f\"Unexpected header: {rows[0][0]}\"\nassert isinstance(rows[1][1], (int, float)), \"Revenue not numeric\"\nwb.close()\nprint(\"Verification passed\")\n",[30,98204,98205,98210,98220,98230,98234,98266,98270,98292,98307,98322,98326,98339,98359,98378,98420,98449,98454],{"__ignoreMap":28},[33,98206,98207],{"class":35,"line":36},[33,98208,98209],{"class":39},"# pip install 
openpyxl\n",[33,98211,98212,98214,98216,98218],{"class":35,"line":43},[33,98213,190],{"class":163},[33,98215,193],{"class":167},[33,98217,164],{"class":163},[33,98219,198],{"class":167},[33,98221,98222,98224,98226,98228],{"class":35,"line":61},[33,98223,190],{"class":163},[33,98225,17103],{"class":167},[33,98227,164],{"class":163},[33,98229,17108],{"class":167},[33,98231,98232],{"class":35,"line":73},[33,98233,92],{"emptyLinePlaceholder":91},[33,98235,98236,98238,98240,98242,98244,98246,98248,98250,98252,98254,98257,98260,98262,98264],{"class":35,"line":88},[33,98237,96935],{"class":50},[33,98239,212],{"class":163},[33,98241,215],{"class":167},[33,98243,96899],{"class":54},[33,98245,1649],{"class":167},[33,98247,1351],{"class":163},[33,98249,1110],{"class":163},[33,98251,96947],{"class":54},[33,98253,1115],{"class":50},[33,98255,98256],{"class":167},"datetime.now().strftime(",[33,98258,98259],{"class":54},"'%Y%m'",[33,98261,12027],{"class":167},[33,98263,1121],{"class":50},[33,98265,6410],{"class":54},[33,98267,98268],{"class":35,"line":95},[33,98269,92],{"emptyLinePlaceholder":91},[33,98271,98272,98275,98277,98279,98281,98283,98286,98288,98290],{"class":35,"line":101},[33,98273,98274],{"class":167},"wb ",[33,98276,242],{"class":163},[33,98278,97956],{"class":167},[33,98280,96935],{"class":50},[33,98282,365],{"class":167},[33,98284,98285],{"class":238},"read_only",[33,98287,242],{"class":163},[33,98289,855],{"class":50},[33,98291,221],{"class":167},[33,98293,98294,98296,98299,98301,98304],{"class":35,"line":171},[33,98295,36397],{"class":163},[33,98297,98298],{"class":54}," \"Regional Summary\"",[33,98300,8002],{"class":163},[33,98302,98303],{"class":167}," wb.sheetnames, ",[33,98305,98306],{"class":54},"\"Missing Regional Summary sheet\"\n",[33,98308,98309,98311,98314,98317,98319],{"class":35,"line":179},[33,98310,36397],{"class":163},[33,98312,98313],{"class":54}," \"Monthly Trend\"",[33,98315,98316],{"class":163},"    
in",[33,98318,98303],{"class":167},[33,98320,98321],{"class":54},"\"Missing Monthly Trend sheet\"\n",[33,98323,98324],{"class":35,"line":187},[33,98325,92],{"emptyLinePlaceholder":91},[33,98327,98328,98331,98333,98335,98337],{"class":35,"line":201},[33,98329,98330],{"class":167},"ws ",[33,98332,242],{"class":163},[33,98334,17447],{"class":167},[33,98336,97024],{"class":54},[33,98338,9202],{"class":167},[33,98340,98341,98343,98345,98347,98350,98353,98355,98357],{"class":35,"line":206},[33,98342,59877],{"class":167},[33,98344,242],{"class":163},[33,98346,599],{"class":50},[33,98348,98349],{"class":167},"(ws.iter_rows(",[33,98351,98352],{"class":238},"values_only",[33,98354,242],{"class":163},[33,98356,855],{"class":50},[33,98358,371],{"class":167},[33,98360,98361,98363,98365,98368,98370,98372,98375],{"class":35,"line":224},[33,98362,36397],{"class":163},[33,98364,4037],{"class":50},[33,98366,98367],{"class":167},"(rows) ",[33,98369,43000],{"class":163},[33,98371,7451],{"class":50},[33,98373,98374],{"class":167},",          ",[33,98376,98377],{"class":54},"\"No data rows\"\n",[33,98379,98380,98382,98384,98386,98388,98390,98392,98394,98397,98399,98401,98404,98406,98408,98410,98412,98414,98416,98418],{"class":35,"line":229},[33,98381,36397],{"class":163},[33,98383,13250],{"class":167},[33,98385,748],{"class":50},[33,98387,44179],{"class":167},[33,98389,748],{"class":50},[33,98391,763],{"class":167},[33,98393,1865],{"class":163},[33,98395,98396],{"class":54}," \"region\"",[33,98398,25480],{"class":167},[33,98400,4059],{"class":163},[33,98402,98403],{"class":54},"\"Unexpected header: 
",[33,98405,1115],{"class":50},[33,98407,27235],{"class":167},[33,98409,748],{"class":50},[33,98411,44179],{"class":167},[33,98413,748],{"class":50},[33,98415,9546],{"class":167},[33,98417,1121],{"class":50},[33,98419,7504],{"class":54},[33,98421,98422,98424,98426,98429,98431,98433,98435,98438,98440,98442,98444,98446],{"class":35,"line":235},[33,98423,36397],{"class":163},[33,98425,36538],{"class":50},[33,98427,98428],{"class":167},"(rows[",[33,98430,734],{"class":50},[33,98432,44179],{"class":167},[33,98434,734],{"class":50},[33,98436,98437],{"class":167},"], (",[33,98439,1059],{"class":50},[33,98441,365],{"class":167},[33,98443,1720],{"class":50},[33,98445,77348],{"class":167},[33,98447,98448],{"class":54},"\"Revenue not numeric\"\n",[33,98450,98451],{"class":35,"line":250},[33,98452,98453],{"class":167},"wb.close()\n",[33,98455,98456,98458,98460,98463],{"class":35,"line":266},[33,98457,13474],{"class":50},[33,98459,602],{"class":167},[33,98461,98462],{"class":54},"\"Verification passed\"",[33,98464,221],{"class":167},[18,98466,98468],{"id":98467},"adding-conditional-formatting-to-the-sales-sheet","Adding Conditional Formatting to the Sales Sheet",[14,98470,98471,98472,46332,98474,98477],{},"Once the data is written, applying conditional formatting flags anomalies without requiring readers to sort manually. 
The ",[30,98473,22009],{},[30,98475,98476],{},"ConditionalFormattingList"," API accepts standard Excel rule types.",[23,98479,98481],{"className":126,"code":98480,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\nfrom openpyxl.styles import PatternFill, Font\nfrom openpyxl.formatting.rule import CellIsRule\n\nOUTPUT = Path(\"reports\u002Fmonthly_sales_latest.xlsx\")\n\ntry:\n    wb = load_workbook(OUTPUT)       # open for editing — NOT read_only\n    ws = wb[\"Regional Summary\"]\n\n    n_data = ws.max_row - 1         # rows below the header\n    rev_range = f\"B2:B{n_data + 1}\"\n\n    green_fill = PatternFill(\"solid\", fgColor=\"C6EFCE\")\n    red_fill   = PatternFill(\"solid\", fgColor=\"FFC7CE\")\n    green_font = Font(color=\"006100\")\n    red_font   = Font(color=\"9C0006\")\n\n    ws.conditional_formatting.add(rev_range,\n        CellIsRule(operator=\"greaterThanOrEqual\", formula=[\"10000\"],\n                   fill=green_fill, font=green_font))\n    ws.conditional_formatting.add(rev_range,\n        CellIsRule(operator=\"lessThan\", formula=[\"5000\"],\n                   fill=red_fill, font=red_font))\n\n    wb.save(OUTPUT)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"Run the export step first: {exc}\")\n\nprint(\"Conditional formatting 
applied\")\n",[30,98482,98483,98487,98497,98507,98518,98530,98534,98547,98551,98557,98572,98584,98588,98604,98627,98631,98653,98675,98693,98711,98715,98720,98747,98765,98769,98793,98809,98813,98821,98831,98854,98858],{"__ignoreMap":28},[33,98484,98485],{"class":35,"line":36},[33,98486,98209],{"class":39},[33,98488,98489,98491,98493,98495],{"class":35,"line":43},[33,98490,190],{"class":163},[33,98492,193],{"class":167},[33,98494,164],{"class":163},[33,98496,198],{"class":167},[33,98498,98499,98501,98503,98505],{"class":35,"line":61},[33,98500,190],{"class":163},[33,98502,17103],{"class":167},[33,98504,164],{"class":163},[33,98506,17108],{"class":167},[33,98508,98509,98511,98513,98515],{"class":35,"line":73},[33,98510,190],{"class":163},[33,98512,17115],{"class":167},[33,98514,164],{"class":163},[33,98516,98517],{"class":167}," PatternFill, Font\n",[33,98519,98520,98522,98525,98527],{"class":35,"line":88},[33,98521,190],{"class":163},[33,98523,98524],{"class":167}," openpyxl.formatting.rule ",[33,98526,164],{"class":163},[33,98528,98529],{"class":167}," CellIsRule\n",[33,98531,98532],{"class":35,"line":95},[33,98533,92],{"emptyLinePlaceholder":91},[33,98535,98536,98538,98540,98542,98545],{"class":35,"line":101},[33,98537,96935],{"class":50},[33,98539,212],{"class":163},[33,98541,215],{"class":167},[33,98543,98544],{"class":54},"\"reports\u002Fmonthly_sales_latest.xlsx\"",[33,98546,221],{"class":167},[33,98548,98549],{"class":35,"line":171},[33,98550,92],{"emptyLinePlaceholder":91},[33,98552,98553,98555],{"class":35,"line":179},[33,98554,35574],{"class":163},[33,98556,574],{"class":167},[33,98558,98559,98561,98563,98565,98567,98569],{"class":35,"line":187},[33,98560,17432],{"class":167},[33,98562,242],{"class":163},[33,98564,97956],{"class":167},[33,98566,96935],{"class":50},[33,98568,8815],{"class":167},[33,98570,98571],{"class":39},"# open for editing — NOT 
read_only\n",[33,98573,98574,98576,98578,98580,98582],{"class":35,"line":201},[33,98575,17442],{"class":167},[33,98577,242],{"class":163},[33,98579,17447],{"class":167},[33,98581,97024],{"class":54},[33,98583,9202],{"class":167},[33,98585,98586],{"class":35,"line":206},[33,98587,92],{"emptyLinePlaceholder":91},[33,98589,98590,98593,98595,98597,98599,98601],{"class":35,"line":224},[33,98591,98592],{"class":167},"    n_data ",[33,98594,242],{"class":163},[33,98596,17704],{"class":167},[33,98598,4126],{"class":163},[33,98600,1814],{"class":50},[33,98602,98603],{"class":39},"         # rows below the header\n",[33,98605,98606,98609,98611,98613,98616,98618,98621,98623,98625],{"class":35,"line":229},[33,98607,98608],{"class":167},"    rev_range ",[33,98610,242],{"class":163},[33,98612,1110],{"class":163},[33,98614,98615],{"class":54},"\"B2:B",[33,98617,1115],{"class":50},[33,98619,98620],{"class":167},"n_data ",[33,98622,1811],{"class":163},[33,98624,11022],{"class":50},[33,98626,7504],{"class":54},[33,98628,98629],{"class":35,"line":235},[33,98630,92],{"emptyLinePlaceholder":91},[33,98632,98633,98636,98638,98640,98642,98644,98646,98648,98651],{"class":35,"line":250},[33,98634,98635],{"class":167},"    green_fill ",[33,98637,242],{"class":163},[33,98639,17185],{"class":167},[33,98641,17188],{"class":54},[33,98643,365],{"class":167},[33,98645,17193],{"class":238},[33,98647,242],{"class":163},[33,98649,98650],{"class":54},"\"C6EFCE\"",[33,98652,221],{"class":167},[33,98654,98655,98658,98660,98662,98664,98666,98668,98670,98673],{"class":35,"line":266},[33,98656,98657],{"class":167},"    red_fill   ",[33,98659,242],{"class":163},[33,98661,17185],{"class":167},[33,98663,17188],{"class":54},[33,98665,365],{"class":167},[33,98667,17193],{"class":238},[33,98669,242],{"class":163},[33,98671,98672],{"class":54},"\"FFC7CE\"",[33,98674,221],{"class":167},[33,98676,98677,98680,98682,98684,98686,98688,98691],{"class":35,"line":290},[33,98678,98679],{"class":167},"    green_font 
",[33,98681,242],{"class":163},[33,98683,17233],{"class":167},[33,98685,17245],{"class":238},[33,98687,242],{"class":163},[33,98689,98690],{"class":54},"\"006100\"",[33,98692,221],{"class":167},[33,98694,98695,98698,98700,98702,98704,98706,98709],{"class":35,"line":295},[33,98696,98697],{"class":167},"    red_font   ",[33,98699,242],{"class":163},[33,98701,17233],{"class":167},[33,98703,17245],{"class":238},[33,98705,242],{"class":163},[33,98707,98708],{"class":54},"\"9C0006\"",[33,98710,221],{"class":167},[33,98712,98713],{"class":35,"line":300},[33,98714,92],{"emptyLinePlaceholder":91},[33,98716,98717],{"class":35,"line":317},[33,98718,98719],{"class":167},"    ws.conditional_formatting.add(rev_range,\n",[33,98721,98722,98725,98728,98730,98733,98735,98738,98740,98742,98745],{"class":35,"line":332},[33,98723,98724],{"class":167},"        CellIsRule(",[33,98726,98727],{"class":238},"operator",[33,98729,242],{"class":163},[33,98731,98732],{"class":54},"\"greaterThanOrEqual\"",[33,98734,365],{"class":167},[33,98736,98737],{"class":238},"formula",[33,98739,242],{"class":163},[33,98741,8309],{"class":167},[33,98743,98744],{"class":54},"\"10000\"",[33,98746,8935],{"class":167},[33,98748,98749,98752,98754,98757,98760,98762],{"class":35,"line":347},[33,98750,98751],{"class":238},"                   fill",[33,98753,242],{"class":163},[33,98755,98756],{"class":167},"green_fill, 
",[33,98758,98759],{"class":238},"font",[33,98761,242],{"class":163},[33,98763,98764],{"class":167},"green_font))\n",[33,98766,98767],{"class":35,"line":374},[33,98768,98719],{"class":167},[33,98770,98771,98773,98775,98777,98780,98782,98784,98786,98788,98791],{"class":35,"line":397},[33,98772,98724],{"class":167},[33,98774,98727],{"class":238},[33,98776,242],{"class":163},[33,98778,98779],{"class":54},"\"lessThan\"",[33,98781,365],{"class":167},[33,98783,98737],{"class":238},[33,98785,242],{"class":163},[33,98787,8309],{"class":167},[33,98789,98790],{"class":54},"\"5000\"",[33,98792,8935],{"class":167},[33,98794,98795,98797,98799,98802,98804,98806],{"class":35,"line":653},[33,98796,98751],{"class":238},[33,98798,242],{"class":163},[33,98800,98801],{"class":167},"red_fill, ",[33,98803,98759],{"class":238},[33,98805,242],{"class":163},[33,98807,98808],{"class":167},"red_font))\n",[33,98810,98811],{"class":35,"line":667},[33,98812,92],{"emptyLinePlaceholder":91},[33,98814,98815,98817,98819],{"class":35,"line":675},[33,98816,98136],{"class":167},[33,98818,96935],{"class":50},[33,98820,221],{"class":167},[33,98822,98823,98825,98827,98829],{"class":35,"line":689},[33,98824,35726],{"class":163},[33,98826,2945],{"class":50},[33,98828,1852],{"class":163},[33,98830,1855],{"class":167},[33,98832,98833,98835,98837,98839,98841,98844,98846,98848,98850,98852],{"class":35,"line":703},[33,98834,35742],{"class":163},[33,98836,16617],{"class":50},[33,98838,602],{"class":167},[33,98840,4059],{"class":163},[33,98842,98843],{"class":54},"\"Run the export step first: ",[33,98845,1115],{"class":50},[33,98847,6565],{"class":167},[33,98849,1121],{"class":50},[33,98851,274],{"class":54},[33,98853,221],{"class":167},[33,98855,98856],{"class":35,"line":714},[33,98857,92],{"emptyLinePlaceholder":91},[33,98859,98860,98862,98864,98867],{"class":35,"line":723},[33,98861,13474],{"class":50},[33,98863,602],{"class":167},[33,98865,98866],{"class":54},"\"Conditional formatting 
applied\"",[33,98868,221],{"class":167},[14,98870,39550,98871,98874,98875,10065,98878,98881],{},[30,98872,98873],{},"CellIsRule"," approach mirrors the Excel \"Cell Value Is\" rule. For data-bar or icon-set rules, use ",[30,98876,98877],{},"DataBarRule",[30,98879,98880],{},"IconSetRule"," from the same module.",[18,98883,98885],{"id":98884},"scheduling-and-delivery","Scheduling and Delivery",[14,98887,98888],{},"A report that runs on demand is still a manual process. The value comes from automating delivery on a schedule.",[14,98890,98891,98894],{},[1974,98892,98893],{},"Linux\u002FmacOS cron"," — run at 07:00 on the first day of each month:",[23,98896,98899],{"className":98897,"code":98898,"language":2000},[1998],"0 7 1 * * \u002Fpath\u002Fto\u002F.venv\u002Fbin\u002Fpython \u002Fpath\u002Fto\u002Fgenerate_report.py >> \u002Fvar\u002Flog\u002Fsales_report.log 2>&1\n",[30,98900,98898],{"__ignoreMap":28},[14,98902,98903,98905],{},[1974,98904,42482],{}," — on a cron trigger, useful when the team already uses GitHub for the data pipeline:",[23,98907,98909],{"className":2062,"code":98908,"language":2064,"meta":28,"style":28},"name: Monthly Sales Report\non:\n  schedule:\n    - cron: \"0 7 1 * *\"\njobs:\n  report:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions\u002Fcheckout@v4\n      - uses: actions\u002Fsetup-python@v5\n        with: { python-version: \"3.12\" }\n      - run: pip install pandas openpyxl\n      - run: python generate_report.py\n      - uses: actions\u002Fupload-artifact@v4\n        with:\n          name: monthly-sales-report\n          path: reports\u002F*.xlsx\n",[30,98910,98911,98920,98926,98932,98943,98949,98956,98964,98970,98980,98990,99008,99019,99030,99040,99046,99055],{"__ignoreMap":28},[33,98912,98913,98915,98917],{"class":35,"line":36},[33,98914,1118],{"class":2076},[33,98916,2079],{"class":167},[33,98918,98919],{"class":54},"Monthly Sales 
Report\n",[33,98921,98922,98924],{"class":35,"line":43},[33,98923,2091],{"class":50},[33,98925,574],{"class":167},[33,98927,98928,98930],{"class":35,"line":61},[33,98929,2098],{"class":2076},[33,98931,574],{"class":167},[33,98933,98934,98936,98938,98940],{"class":35,"line":73},[33,98935,2105],{"class":167},[33,98937,2108],{"class":2076},[33,98939,2079],{"class":167},[33,98941,98942],{"class":54},"\"0 7 1 * *\"\n",[33,98944,98945,98947],{"class":35,"line":88},[33,98946,2136],{"class":2076},[33,98948,574],{"class":167},[33,98950,98951,98954],{"class":35,"line":95},[33,98952,98953],{"class":2076},"  report",[33,98955,574],{"class":167},[33,98957,98958,98960,98962],{"class":35,"line":101},[33,98959,2150],{"class":2076},[33,98961,2079],{"class":167},[33,98963,2155],{"class":54},[33,98965,98966,98968],{"class":35,"line":171},[33,98967,2160],{"class":2076},[33,98969,574],{"class":167},[33,98971,98972,98974,98976,98978],{"class":35,"line":179},[33,98973,2167],{"class":167},[33,98975,2170],{"class":2076},[33,98977,2079],{"class":167},[33,98979,2175],{"class":54},[33,98981,98982,98984,98986,98988],{"class":35,"line":187},[33,98983,2167],{"class":167},[33,98985,2170],{"class":2076},[33,98987,2079],{"class":167},[33,98989,2186],{"class":54},[33,98991,98992,98994,98997,99000,99002,99005],{"class":35,"line":201},[33,98993,2191],{"class":2076},[33,98995,98996],{"class":167},": { ",[33,98998,98999],{"class":2076},"python-version",[33,99001,2079],{"class":167},[33,99003,99004],{"class":54},"\"3.12\"",[33,99006,99007],{"class":167}," }\n",[33,99009,99010,99012,99014,99016],{"class":35,"line":206},[33,99011,2167],{"class":167},[33,99013,67355],{"class":2076},[33,99015,2079],{"class":167},[33,99017,99018],{"class":54},"pip install pandas openpyxl\n",[33,99020,99021,99023,99025,99027],{"class":35,"line":224},[33,99022,2167],{"class":167},[33,99024,67355],{"class":2076},[33,99026,2079],{"class":167},[33,99028,99029],{"class":54},"python 
generate_report.py\n",[33,99031,99032,99034,99036,99038],{"class":35,"line":229},[33,99033,2167],{"class":167},[33,99035,2170],{"class":2076},[33,99037,2079],{"class":167},[33,99039,2292],{"class":54},[33,99041,99042,99044],{"class":35,"line":235},[33,99043,2191],{"class":2076},[33,99045,574],{"class":167},[33,99047,99048,99050,99052],{"class":35,"line":250},[33,99049,2303],{"class":2076},[33,99051,2079],{"class":167},[33,99053,99054],{"class":54},"monthly-sales-report\n",[33,99056,99057,99059,99061],{"class":35,"line":266},[33,99058,2313],{"class":2076},[33,99060,2079],{"class":167},[33,99062,99063],{"class":54},"reports\u002F*.xlsx\n",[14,99065,99066,99069],{},[1974,99067,99068],{},"Email delivery"," — attach the generated file and send via SMTP:",[23,99071,99073],{"className":126,"code":99072,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport smtplib\nfrom email.mime.multipart import MIMEMultipart\nfrom email.mime.base import MIMEBase\nfrom email.mime.text import MIMEText\nfrom email import encoders\nfrom pathlib import Path\n\ndef email_report(report_path: Path, recipient: str, smtp_host: str,\n                 smtp_port: int, sender: str, password: str) -> None:\n    msg = MIMEMultipart()\n    msg[\"From\"]    = sender\n    msg[\"To\"]      = recipient\n    msg[\"Subject\"] = f\"Monthly Sales Report — {report_path.stem}\"\n    msg.attach(MIMEText(\"Please find the monthly sales report attached.\", \"plain\"))\n\n    with report_path.open(\"rb\") as f:\n        part = MIMEBase(\"application\", \"octet-stream\")\n        part.set_payload(f.read())\n    encoders.encode_base64(part)\n    part.add_header(\"Content-Disposition\",\n                    f'attachment; filename=\"{report_path.name}\"')\n    msg.attach(part)\n\n    with smtplib.SMTP_SSL(smtp_host, smtp_port) as server:\n        server.login(sender, password)\n        server.sendmail(sender, recipient, msg.as_string())\n    print(f\"Report emailed to 
{recipient}\")\n",[30,99074,99075,99079,99085,99097,99109,99121,99133,99143,99147,99166,99189,99198,99211,99225,99249,99264,99268,99283,99302,99307,99312,99322,99342,99347,99351,99363,99368,99373],{"__ignoreMap":28},[33,99076,99077],{"class":35,"line":36},[33,99078,3952],{"class":39},[33,99080,99081,99083],{"class":35,"line":43},[33,99082,164],{"class":163},[33,99084,3074],{"class":167},[33,99086,99087,99089,99092,99094],{"class":35,"line":61},[33,99088,190],{"class":163},[33,99090,99091],{"class":167}," email.mime.multipart ",[33,99093,164],{"class":163},[33,99095,99096],{"class":167}," MIMEMultipart\n",[33,99098,99099,99101,99104,99106],{"class":35,"line":73},[33,99100,190],{"class":163},[33,99102,99103],{"class":167}," email.mime.base ",[33,99105,164],{"class":163},[33,99107,99108],{"class":167}," MIMEBase\n",[33,99110,99111,99113,99116,99118],{"class":35,"line":88},[33,99112,190],{"class":163},[33,99114,99115],{"class":167}," email.mime.text ",[33,99117,164],{"class":163},[33,99119,99120],{"class":167}," MIMEText\n",[33,99122,99123,99125,99128,99130],{"class":35,"line":95},[33,99124,190],{"class":163},[33,99126,99127],{"class":167}," email ",[33,99129,164],{"class":163},[33,99131,99132],{"class":167}," encoders\n",[33,99134,99135,99137,99139,99141],{"class":35,"line":101},[33,99136,190],{"class":163},[33,99138,193],{"class":167},[33,99140,164],{"class":163},[33,99142,198],{"class":167},[33,99144,99145],{"class":35,"line":171},[33,99146,92],{"emptyLinePlaceholder":91},[33,99148,99149,99151,99154,99157,99159,99162,99164],{"class":35,"line":179},[33,99150,562],{"class":163},[33,99152,99153],{"class":46}," email_report",[33,99155,99156],{"class":167},"(report_path: Path, recipient: ",[33,99158,1053],{"class":50},[33,99160,99161],{"class":167},", smtp_host: ",[33,99163,1053],{"class":50},[33,99165,247],{"class":167},[33,99167,99168,99171,99173,99176,99178,99181,99183,99185,99187],{"class":35,"line":187},[33,99169,99170],{"class":167},"                 smtp_port: 
",[33,99172,1059],{"class":50},[33,99174,99175],{"class":167},", sender: ",[33,99177,1053],{"class":50},[33,99179,99180],{"class":167},", password: ",[33,99182,1053],{"class":50},[33,99184,1617],{"class":167},[33,99186,571],{"class":50},[33,99188,574],{"class":167},[33,99190,99191,99193,99195],{"class":35,"line":201},[33,99192,3273],{"class":167},[33,99194,242],{"class":163},[33,99196,99197],{"class":167}," MIMEMultipart()\n",[33,99199,99200,99202,99204,99206,99208],{"class":35,"line":206},[33,99201,3283],{"class":167},[33,99203,3300],{"class":54},[33,99205,96251],{"class":167},[33,99207,242],{"class":163},[33,99209,99210],{"class":167}," sender\n",[33,99212,99213,99215,99217,99220,99222],{"class":35,"line":224},[33,99214,3283],{"class":167},[33,99216,3314],{"class":54},[33,99218,99219],{"class":167},"]      ",[33,99221,242],{"class":163},[33,99223,99224],{"class":167}," recipient\n",[33,99226,99227,99229,99231,99233,99235,99237,99240,99242,99245,99247],{"class":35,"line":229},[33,99228,3283],{"class":167},[33,99230,3286],{"class":54},[33,99232,763],{"class":167},[33,99234,242],{"class":163},[33,99236,1110],{"class":163},[33,99238,99239],{"class":54},"\"Monthly Sales Report — ",[33,99241,1115],{"class":50},[33,99243,99244],{"class":167},"report_path.stem",[33,99246,1121],{"class":50},[33,99248,7504],{"class":54},[33,99250,99251,99254,99257,99259,99262],{"class":35,"line":235},[33,99252,99253],{"class":167},"    msg.attach(MIMEText(",[33,99255,99256],{"class":54},"\"Please find the monthly sales report attached.\"",[33,99258,365],{"class":167},[33,99260,99261],{"class":54},"\"plain\"",[33,99263,371],{"class":167},[33,99265,99266],{"class":35,"line":250},[33,99267,92],{"emptyLinePlaceholder":91},[33,99269,99270,99272,99275,99277,99279,99281],{"class":35,"line":266},[33,99271,1635],{"class":163},[33,99273,99274],{"class":167}," 
report_path.open(",[33,99276,68219],{"class":54},[33,99278,1649],{"class":167},[33,99280,495],{"class":163},[33,99282,77684],{"class":167},[33,99284,99285,99287,99289,99292,99295,99297,99300],{"class":35,"line":290},[33,99286,75092],{"class":167},[33,99288,242],{"class":163},[33,99290,99291],{"class":167}," MIMEBase(",[33,99293,99294],{"class":54},"\"application\"",[33,99296,365],{"class":167},[33,99298,99299],{"class":54},"\"octet-stream\"",[33,99301,221],{"class":167},[33,99303,99304],{"class":35,"line":295},[33,99305,99306],{"class":167},"        part.set_payload(f.read())\n",[33,99308,99309],{"class":35,"line":300},[33,99310,99311],{"class":167},"    encoders.encode_base64(part)\n",[33,99313,99314,99317,99320],{"class":35,"line":317},[33,99315,99316],{"class":167},"    part.add_header(",[33,99318,99319],{"class":54},"\"Content-Disposition\"",[33,99321,247],{"class":167},[33,99323,99324,99327,99330,99332,99335,99337,99340],{"class":35,"line":332},[33,99325,99326],{"class":163},"                    f",[33,99328,99329],{"class":54},"'attachment; filename=\"",[33,99331,1115],{"class":50},[33,99333,99334],{"class":167},"report_path.name",[33,99336,1121],{"class":50},[33,99338,99339],{"class":54},"\"'",[33,99341,221],{"class":167},[33,99343,99344],{"class":35,"line":347},[33,99345,99346],{"class":167},"    msg.attach(part)\n",[33,99348,99349],{"class":35,"line":374},[33,99350,92],{"emptyLinePlaceholder":91},[33,99352,99353,99355,99358,99360],{"class":35,"line":397},[33,99354,1635],{"class":163},[33,99356,99357],{"class":167}," smtplib.SMTP_SSL(smtp_host, smtp_port) ",[33,99359,495],{"class":163},[33,99361,99362],{"class":167}," server:\n",[33,99364,99365],{"class":35,"line":653},[33,99366,99367],{"class":167},"        server.login(sender, password)\n",[33,99369,99370],{"class":35,"line":667},[33,99371,99372],{"class":167},"        server.sendmail(sender, recipient, 
msg.as_string())\n",[33,99374,99375,99377,99379,99381,99384,99386,99389,99391,99393],{"class":35,"line":675},[33,99376,7268],{"class":50},[33,99378,602],{"class":167},[33,99380,4059],{"class":163},[33,99382,99383],{"class":54},"\"Report emailed to ",[33,99385,1115],{"class":50},[33,99387,99388],{"class":167},"recipient",[33,99390,1121],{"class":50},[33,99392,274],{"class":54},[33,99394,221],{"class":167},[14,99396,99397],{},"Store credentials in environment variables, not in the script itself.",[18,99399,4271],{"id":4270},[4273,99401,99402,99412],{},[4276,99403,99404],{},[4279,99405,99406,99408,99410],{},[4282,99407,14317],{},[4282,99409,4287],{},[4282,99411,4290],{},[4292,99413,99414,99432,99449,99469,99487],{},[4279,99415,99416,99421,99427],{},[4297,99417,99418],{},[30,99419,99420],{},"ValueError: cannot reindex from a duplicate axis",[4297,99422,99423,99424],{},"Overlapping row indices after ",[30,99425,99426],{},"concat",[4297,99428,74566,99429],{},[30,99430,99431],{},"pd.concat(frames, ignore_index=True)",[4279,99433,99434,99439,99442],{},[4297,99435,99436],{},[30,99437,99438],{},"ValueError: time data ... 
does not match format",[4297,99440,99441],{},"CSV mixes date formats",[4297,99443,17059,99444,2012,99446,99448],{},[30,99445,97762],{},[30,99447,27816],{}," with a null drop",[4279,99450,99451,99456,99463],{},[4297,99452,99453],{},[30,99454,99455],{},"openpyxl.utils.exceptions.IllegalCharacterError",[4297,99457,99458,99459,99462],{},"Control characters (",[30,99460,99461],{},"\\x00",") in string cells",[4297,99464,99465,99468],{},[30,99466,99467],{},"df.replace(r\"[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f]\", \"\", regex=True)"," before export",[4279,99470,99471,99476,99479],{},[4297,99472,99473],{},[30,99474,99475],{},"SettingWithCopyWarning",[4297,99477,99478],{},"Chained indexing on a DataFrame slice",[4297,99480,42543,99481,8877,99484],{},[30,99482,99483],{},"df['col'] = val",[30,99485,99486],{},"df.loc[:, 'col'] = val",[4279,99488,99489,99494,99497],{},[4297,99490,99491,99492],{},"Totals row shows ",[30,99493,748],{},[4297,99495,99496],{},"Formula written to wrong row index (off-by-one between Excel 1-indexed and Python 0-indexed)",[4297,99498,91918,99499,99502],{},[30,99500,99501],{},"total_row = len(df) + 2"," (header is row 1, data starts row 2)",[18,99504,36626],{"id":36625},[14,99506,99507,99510,99511,99514,99515,3035],{},[1974,99508,99509],{},"How do I schedule this script to run on the first business day of each month?","\nCron cannot express \"first business day\" directly. On Linux\u002FmacOS, fire on the first three days of the month: ",[30,99512,99513],{},"0 8 1-3 * * \u002Fpath\u002F.venv\u002Fbin\u002Fpython \u002Fpath\u002Fgenerate_report.py",". Then add a guard at the top of the script so it only proceeds on the 1st when that is a weekday, or on the following Monday when the 1st falls on a weekend: ",[30,99516,99517],{},"from datetime import date; d = date.today(); assert d.weekday() == 0 or (d.day == 1 and d.weekday() \u003C 5), \"Not the first business day — skipping\"",[14,99519,99520,99523,99526,99527,36661,99529,99531,99532,99534],{},[1974,99521,99522],{},"Why does my pivot lose the header when exported to Excel?",[30,99524,99525],{},"DataFrame.pivot_table"," returns a DataFrame with a named index. 
Pass ",[30,99528,28142],{},[30,99530,96833],{},", or call ",[30,99533,96809],{}," on the pivot result first.",[14,99536,99537,99540,99541,8877,99544,99547,99548,99550,99551,3035],{},[1974,99538,99539],{},"Can I email the report automatically after generation?","\nYes. After saving the file, use Python's ",[30,99542,99543],{},"smtplib",[30,99545,99546],{},"email.mime.multipart"," to attach the ",[30,99549,26542],{}," and send via SMTP. For a full scheduling-and-delivery pattern, see ",[940,99552,6936],{"href":6935},[18,99554,6918],{"id":6917},[4211,99556,99557,99562,99567,99572],{},[4214,99558,99559,99561],{},[940,99560,6936],{"href":6935}," — full pipeline with conditional formatting, charts, and multi-engine walkthrough",[4214,99563,99564,99566],{},[940,99565,97863],{"href":97862}," — resolve write errors when loading templates",[4214,99568,99569,99571],{},[940,99570,9599],{"href":9598}," — schema normalisation before aggregation",[4214,99573,99574,99578,99579],{},[940,99575,99577],{"href":99576},"\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002F","Reading Excel Files with Python"," — ingestion patterns when source data is already ",[30,99580,26542],{},[14,99582,6947,99583,3035],{},[940,99584,6936],{"href":6935},[6953,99586,99587],{},"html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: 
var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .shJU0, html code.shiki .shJU0{--shiki-default:#22863A}",{"title":28,"searchDepth":43,"depth":43,"links":99589},[99590,99591,99592,99593,99594,99595,99596,99600,99601,99602,99603,99604,99605],{"id":95810,"depth":43,"text":95811},{"id":26618,"depth":43,"text":95849},{"id":95914,"depth":43,"text":95915},{"id":96220,"depth":43,"text":96221},{"id":96492,"depth":43,"text":96493},{"id":96820,"depth":43,"text":96821},{"id":35801,"depth":43,"text":35802,"children":99597},[99598,99599],{"id":97748,"depth":61,"text":97749},{"id":97844,"depth":61,"text":97845},{"id":9246,"depth":43,"text":9247},{"id":98467,"depth":43,"text":98468},{"id":98884,"depth":43,"text":98885},{"id":4270,"depth":43,"text":4271},{"id":36625,"depth":43,"text":36626},{"id":6917,"depth":43,"text":6918},"Monthly Sales Reports","Replace manual VLOOKUP workflows with a Python pipeline using pandas groupby and openpyxl. 
Produce a formatted multi-sheet monthly sales workbook automatically.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Fautomating-monthly-sales-reports-in-excel",{"title":95780,"description":99607},"Automate Monthly Sales Reports in Excel with Python","python-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Fautomating-monthly-sales-reports-in-excel\u002Findex",[99614,47,9630,22009,99615],"excel","sales reporting","ACPjIPTkE9GIfERcmP3KlEniQIo4UR7wR8wLtIppSig",{"id":99618,"title":97863,"body":99619,"breadcrumbTitle":102103,"canonical":6977,"date":6978,"description":102104,"draft":6980,"extension":6981,"image":6977,"meta":102105,"navigation":91,"path":102106,"robots":6977,"seo":102107,"seoTitle":102108,"stem":102109,"tags":102110,"updatedAt":6978,"__hash__":102111},"content\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Ffix-openpyxl-read-only-mode-error\u002Findex.md",{"type":7,"value":99620,"toc":102083},[99621,99624,99646,99649,99651,99656,99736,99768,99784,99787,99790,99959,99964,99971,99974,100259,100268,100355,100365,100385,100555,100561,100570,100778,100786,100797,100802,100917,100926,100930,100943,101102,101124,101135,101141,101310,101313,101319,101343,101737,101748,101752,101883,101885,101888,102039,102047,102049,102076,102080],[10,99622,97863],{"id":99623},"fix-openpyxl-read-only-mode-error",[14,99625,43155,99626,99629,99630,99633,99634,99637,99638,99641,99642,99645],{},[30,99627,99628],{},"load_workbook(path, read_only=True)"," and then trying to write a cell raises an ",[30,99631,99632],{},"AttributeError"," or silently does nothing — the workbook is in an optimised read-only streaming mode that strips the write API entirely. 
The same problem surfaces when you try to access ",[30,99635,99636],{},".active"," on a read-only workbook, iterate cells after calling ",[30,99639,99640],{},"wb.close()",", or use ",[30,99643,99644],{},".value"," on rows after the worksheet iterator is exhausted.",[14,99647,99648],{},"This page covers the root cause, the minimal diagnostic, and the correct fix for each variant.",[18,99650,7021],{"id":7020},[14,99652,99653,99655],{},[30,99654,22009],{}," ships three workbook modes:",[4273,99657,99658,99679],{},[4276,99659,99660],{},[4279,99661,99662,99665,99670,99673,99676],{},[4282,99663,99664],{},"Mode",[4282,99666,99667,99669],{},[30,99668,22404],{}," kwargs",[4282,99671,99672],{},"Can read?",[4282,99674,99675],{},"Can write?",[4282,99677,99678],{},"Memory use",[4292,99680,99681,99698,99716],{},[4279,99682,99683,99686,99691,99693,99695],{},[4297,99684,99685],{},"Normal",[4297,99687,99688],{},[26245,99689,99690],{},"(none)",[4297,99692,38631],{},[4297,99694,38631],{},[4297,99696,99697],{},"Full DOM in RAM",[4279,99699,99700,99703,99707,99709,99713],{},[4297,99701,99702],{},"Read-only",[4297,99704,99705],{},[30,99706,97854],{},[4297,99708,38631],{},[4297,99710,99711],{},[1974,99712,38628],{},[4297,99714,99715],{},"Streaming — low",[4279,99717,99718,99721,99728,99732,99734],{},[4297,99719,99720],{},"Write-only",[4297,99722,99723,42706,99725],{},[30,99724,22013],{},[30,99726,99727],{},"Workbook()",[4297,99729,99730],{},[1974,99731,38628],{},[4297,99733,38631],{},[4297,99735,99715],{},[14,99737,99738,99740,99741,99744,99745,99748,99749,99751,99752,43076,99755,43076,99758,43076,99761,99764,99765,99767],{},[30,99739,97854],{}," loads the XML as a lazy stream. Sheets are ",[30,99742,99743],{},"ReadOnlyWorksheet"," objects, not ",[30,99746,99747],{},"Worksheet"," objects. 
",[30,99750,99743],{}," has no ",[30,99753,99754],{},"append",[30,99756,99757],{},"cell(...).value =",[30,99759,99760],{},"freeze_panes",[30,99762,99763],{},"column_dimensions"," — writing to any of these raises ",[30,99766,99632],{}," or is silently ignored depending on the openpyxl version.",[14,99769,99770,99771,99773,99774,99776,99777,99780,99781,99783],{},"Similarly, ",[30,99772,22013],{}," (on a new ",[30,99775,99727],{},") produces ",[30,99778,99779],{},"WriteOnlyWorksheet"," objects that only accept ",[30,99782,69240],{}," — random-access reads are not supported.",[18,99785,99786],{"id":54445},"Minimal Reproducible Diagnostic",[14,99788,99789],{},"Run this to confirm you are hitting the read-only mode restriction:",[23,99791,99793],{"className":126,"code":99792,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\n\nPATH = Path(\"workbook.xlsx\")   # replace with your file\n\ntry:\n    wb = load_workbook(PATH, read_only=True)\n    ws = wb.active\n    print(type(ws))            # → \u003Cclass 'openpyxl.worksheet._read_only.ReadOnlyWorksheet'>\n    ws[\"A1\"] = \"test\"          # triggers the error\nexcept AttributeError as exc:\n    print(f\"Caught: {exc}\")\n    # → 'ReadOnlyWorksheet' object has no attribute '__setitem__'\nfinally:\n    
wb.close()\n",[30,99794,99795,99799,99809,99819,99823,99839,99843,99849,99869,99878,99892,99910,99921,99942,99947,99954],{"__ignoreMap":28},[33,99796,99797],{"class":35,"line":36},[33,99798,98209],{"class":39},[33,99800,99801,99803,99805,99807],{"class":35,"line":43},[33,99802,190],{"class":163},[33,99804,193],{"class":167},[33,99806,164],{"class":163},[33,99808,198],{"class":167},[33,99810,99811,99813,99815,99817],{"class":35,"line":61},[33,99812,190],{"class":163},[33,99814,17103],{"class":167},[33,99816,164],{"class":163},[33,99818,17108],{"class":167},[33,99820,99821],{"class":35,"line":73},[33,99822,92],{"emptyLinePlaceholder":91},[33,99824,99825,99827,99829,99831,99834,99836],{"class":35,"line":88},[33,99826,122],{"class":50},[33,99828,212],{"class":163},[33,99830,215],{"class":167},[33,99832,99833],{"class":54},"\"workbook.xlsx\"",[33,99835,12000],{"class":167},[33,99837,99838],{"class":39},"# replace with your file\n",[33,99840,99841],{"class":35,"line":95},[33,99842,92],{"emptyLinePlaceholder":91},[33,99844,99845,99847],{"class":35,"line":101},[33,99846,35574],{"class":163},[33,99848,574],{"class":167},[33,99850,99851,99853,99855,99857,99859,99861,99863,99865,99867],{"class":35,"line":171},[33,99852,17432],{"class":167},[33,99854,242],{"class":163},[33,99856,97956],{"class":167},[33,99858,122],{"class":50},[33,99860,365],{"class":167},[33,99862,98285],{"class":238},[33,99864,242],{"class":163},[33,99866,855],{"class":50},[33,99868,221],{"class":167},[33,99870,99871,99873,99875],{"class":35,"line":179},[33,99872,17442],{"class":167},[33,99874,242],{"class":163},[33,99876,99877],{"class":167}," wb.active\n",[33,99879,99880,99882,99884,99886,99889],{"class":35,"line":187},[33,99881,7268],{"class":50},[33,99883,602],{"class":167},[33,99885,6677],{"class":50},[33,99887,99888],{"class":167},"(ws))            ",[33,99890,99891],{"class":39},"# → \u003Cclass 
'openpyxl.worksheet._read_only.ReadOnlyWorksheet'>\n",[33,99893,99894,99897,99900,99902,99904,99907],{"class":35,"line":201},[33,99895,99896],{"class":167},"    ws[",[33,99898,99899],{"class":54},"\"A1\"",[33,99901,763],{"class":167},[33,99903,242],{"class":163},[33,99905,99906],{"class":54}," \"test\"",[33,99908,99909],{"class":39},"          # triggers the error\n",[33,99911,99912,99914,99917,99919],{"class":35,"line":206},[33,99913,35726],{"class":163},[33,99915,99916],{"class":50}," AttributeError",[33,99918,1852],{"class":163},[33,99920,1855],{"class":167},[33,99922,99923,99925,99927,99929,99932,99934,99936,99938,99940],{"class":35,"line":224},[33,99924,7268],{"class":50},[33,99926,602],{"class":167},[33,99928,4059],{"class":163},[33,99930,99931],{"class":54},"\"Caught: ",[33,99933,1115],{"class":50},[33,99935,6565],{"class":167},[33,99937,1121],{"class":50},[33,99939,274],{"class":54},[33,99941,221],{"class":167},[33,99943,99944],{"class":35,"line":229},[33,99945,99946],{"class":39},"    # → 'ReadOnlyWorksheet' object has no attribute '__setitem__'\n",[33,99948,99949,99952],{"class":35,"line":235},[33,99950,99951],{"class":163},"finally",[33,99953,574],{"class":167},[33,99955,99956],{"class":35,"line":250},[33,99957,99958],{"class":167},"    wb.close()\n",[14,99960,99961,99962,3035],{},"If the output matches, the fix is to remove ",[30,99963,97854],{},[18,99965,99967,99968,99970],{"id":99966},"fix-open-without-read_only-for-editing","Fix — Open Without ",[30,99969,98285],{}," for Editing",[14,99972,99973],{},"The correct fix for any workflow that writes to an existing workbook is to open it in normal mode:",[23,99975,99977],{"className":126,"code":99976,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\nfrom openpyxl.styles import Font, PatternFill\n\nPATH   = Path(\"workbook.xlsx\")\nOUTPUT = Path(\"workbook_updated.xlsx\")   # save to a new path to preserve original\n\ntry:\n    wb = 
load_workbook(PATH)             # no read_only kwarg — full read\u002Fwrite mode\n    ws = wb.active\n\n    # Now all write operations work\n    ws[\"A1\"] = \"Report Title\"                              # set cell value\n    ws[\"A1\"].font = Font(bold=True, size=14)              # apply style\n    ws.freeze_panes = \"A2\"                                # freeze header row\n    ws.column_dimensions[\"A\"].width = 24                  # resize column\n\n    wb.save(OUTPUT)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\")\nexcept PermissionError:\n    raise SystemExit(f\"Close {OUTPUT} in Excel before saving\")\n\nprint(f\"Saved: {OUTPUT}\")\n",[30,99978,99979,99983,99993,100003,100014,100018,100030,100046,100050,100056,100071,100079,100083,100088,100104,100136,100149,100168,100172,100180,100190,100212,100220,100239,100243],{"__ignoreMap":28},[33,99980,99981],{"class":35,"line":36},[33,99982,98209],{"class":39},[33,99984,99985,99987,99989,99991],{"class":35,"line":43},[33,99986,190],{"class":163},[33,99988,193],{"class":167},[33,99990,164],{"class":163},[33,99992,198],{"class":167},[33,99994,99995,99997,99999,100001],{"class":35,"line":61},[33,99996,190],{"class":163},[33,99998,17103],{"class":167},[33,100000,164],{"class":163},[33,100002,17108],{"class":167},[33,100004,100005,100007,100009,100011],{"class":35,"line":73},[33,100006,190],{"class":163},[33,100008,17115],{"class":167},[33,100010,164],{"class":163},[33,100012,100013],{"class":167}," Font, 
PatternFill\n",[33,100015,100016],{"class":35,"line":88},[33,100017,92],{"emptyLinePlaceholder":91},[33,100019,100020,100022,100024,100026,100028],{"class":35,"line":95},[33,100021,122],{"class":50},[33,100023,21012],{"class":163},[33,100025,215],{"class":167},[33,100027,99833],{"class":54},[33,100029,221],{"class":167},[33,100031,100032,100034,100036,100038,100041,100043],{"class":35,"line":101},[33,100033,96935],{"class":50},[33,100035,212],{"class":163},[33,100037,215],{"class":167},[33,100039,100040],{"class":54},"\"workbook_updated.xlsx\"",[33,100042,12000],{"class":167},[33,100044,100045],{"class":39},"# save to a new path to preserve original\n",[33,100047,100048],{"class":35,"line":171},[33,100049,92],{"emptyLinePlaceholder":91},[33,100051,100052,100054],{"class":35,"line":179},[33,100053,35574],{"class":163},[33,100055,574],{"class":167},[33,100057,100058,100060,100062,100064,100066,100068],{"class":35,"line":187},[33,100059,17432],{"class":167},[33,100061,242],{"class":163},[33,100063,97956],{"class":167},[33,100065,122],{"class":50},[33,100067,12090],{"class":167},[33,100069,100070],{"class":39},"# no read_only kwarg — full read\u002Fwrite mode\n",[33,100072,100073,100075,100077],{"class":35,"line":201},[33,100074,17442],{"class":167},[33,100076,242],{"class":163},[33,100078,99877],{"class":167},[33,100080,100081],{"class":35,"line":206},[33,100082,92],{"emptyLinePlaceholder":91},[33,100084,100085],{"class":35,"line":224},[33,100086,100087],{"class":39},"    # Now all write operations work\n",[33,100089,100090,100092,100094,100096,100098,100101],{"class":35,"line":229},[33,100091,99896],{"class":167},[33,100093,99899],{"class":54},[33,100095,763],{"class":167},[33,100097,242],{"class":163},[33,100099,100100],{"class":54}," \"Report Title\"",[33,100102,100103],{"class":39},"                              # set cell 
value\n",[33,100105,100106,100108,100110,100113,100115,100117,100119,100121,100123,100125,100127,100129,100131,100133],{"class":35,"line":235},[33,100107,99896],{"class":167},[33,100109,99899],{"class":54},[33,100111,100112],{"class":167},"].font ",[33,100114,242],{"class":163},[33,100116,17233],{"class":167},[33,100118,17236],{"class":238},[33,100120,242],{"class":163},[33,100122,855],{"class":50},[33,100124,365],{"class":167},[33,100126,17255],{"class":238},[33,100128,242],{"class":163},[33,100130,19368],{"class":50},[33,100132,67217],{"class":167},[33,100134,100135],{"class":39},"# apply style\n",[33,100137,100138,100141,100143,100146],{"class":35,"line":250},[33,100139,100140],{"class":167},"    ws.freeze_panes ",[33,100142,242],{"class":163},[33,100144,100145],{"class":54}," \"A2\"",[33,100147,100148],{"class":39},"                                # freeze header row\n",[33,100150,100151,100154,100157,100160,100162,100165],{"class":35,"line":266},[33,100152,100153],{"class":167},"    ws.column_dimensions[",[33,100155,100156],{"class":54},"\"A\"",[33,100158,100159],{"class":167},"].width ",[33,100161,242],{"class":163},[33,100163,100164],{"class":50}," 24",[33,100166,100167],{"class":39},"                  # resize 
column\n",[33,100169,100170],{"class":35,"line":290},[33,100171,92],{"emptyLinePlaceholder":91},[33,100173,100174,100176,100178],{"class":35,"line":295},[33,100175,98136],{"class":167},[33,100177,96935],{"class":50},[33,100179,221],{"class":167},[33,100181,100182,100184,100186,100188],{"class":35,"line":300},[33,100183,35726],{"class":163},[33,100185,2945],{"class":50},[33,100187,1852],{"class":163},[33,100189,1855],{"class":167},[33,100191,100192,100194,100196,100198,100200,100202,100204,100206,100208,100210],{"class":35,"line":317},[33,100193,35742],{"class":163},[33,100195,16617],{"class":50},[33,100197,602],{"class":167},[33,100199,4059],{"class":163},[33,100201,15677],{"class":54},[33,100203,1115],{"class":50},[33,100205,6565],{"class":167},[33,100207,1121],{"class":50},[33,100209,274],{"class":54},[33,100211,221],{"class":167},[33,100213,100214,100216,100218],{"class":35,"line":332},[33,100215,35726],{"class":163},[33,100217,17393],{"class":50},[33,100219,574],{"class":167},[33,100221,100222,100224,100226,100228,100230,100232,100234,100237],{"class":35,"line":347},[33,100223,35742],{"class":163},[33,100225,16617],{"class":50},[33,100227,602],{"class":167},[33,100229,4059],{"class":163},[33,100231,97681],{"class":54},[33,100233,97684],{"class":50},[33,100235,100236],{"class":54}," in Excel before saving\"",[33,100238,221],{"class":167},[33,100240,100241],{"class":35,"line":374},[33,100242,92],{"emptyLinePlaceholder":91},[33,100244,100245,100247,100249,100251,100253,100255,100257],{"class":35,"line":397},[33,100246,13474],{"class":50},[33,100248,602],{"class":167},[33,100250,4059],{"class":163},[33,100252,97737],{"class":54},[33,100254,97684],{"class":50},[33,100256,274],{"class":54},[33,100258,221],{"class":167},[14,100260,100261,100262,49047,100265,3035],{},"The only change from the broken version: ",[30,100263,100264],{},"load_workbook(PATH)",[30,100266,100267],{},"load_workbook(PATH, 
read_only=True)",[2540,100269,2547,100272,2547,100275,2547,100278,2547,2547,100292,2547,100295,2547,100298,2547,2547,100301,2547,100304,2547,2547,100306,2547,100308,2547,100311,2547,2547,100314,2547,100316,2547,100320,2547,100323,2547,100326,2547,2547,100330,2547,100333,2547,100336,2547,100338,2547,100341,2547,100343,2547,2547,100346,2547,100348,2547,100351],{"viewBox":100270,"role":2543,"ariaLabel":100271,"xmlns":2545,"style":2546},"0 0 760 240","Decision tree: choose between read_only, normal, and write_only openpyxl modes",[2549,100273,100274],{},"openpyxl workbook mode decision tree",[2553,100276,100277],{},"Decision tree showing which openpyxl workbook mode to use: read_only for large files you only read; normal for editing existing files; write_only for streaming new files.",[2557,100279,2559,100280,2559,100287,2547],{},[2561,100281,2564,100283,2564,100285,2559],{"id":100282,"x1":748,"y1":748,"x2":734,"y2":748},"openpyxl-ro-grad",[2566,100284],{"offset":748,"style":2568},[2566,100286],{"offset":734,"style":2571},[2573,100288,2564,100290,2559],{"id":100289,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"openpyxl-ro-arrow",[2580,100291],{"d":2582,"fill":2583},[2585,100293],{"x":26446,"y":3545,"width":2701,"height":2680,"rx":2591,"fill":100294,"stroke":2593,"style":2594},"url(#openpyxl-ro-grad)",[2000,100296,100297],{"x":2626,"y":83012,"fill":2599,"style":38718},"Existing .xlsx file?",[2000,100299,100300],{"x":2626,"y":2680,"fill":2599,"style":2685},"load_workbook(path, ...)",[35,100302],{"x1":2626,"y1":2590,"x2":2626,"y2":71541,"stroke":2583,"markerEnd":100303,"style":2594},"url(#openpyxl-ro-arrow)",[2000,100305,38628],{"x":47140,"y":59963,"fill":2583,"style":49873},[2585,100307],{"x":26446,"y":2630,"width":2701,"height":2680,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,100309,100310],{"x":2626,"y":71536,"fill":2599,"style":38718},"Need to write 
cells?",[2000,100312,100313],{"x":2626,"y":11214,"fill":2583,"style":2685},"(modify \u002F format)",[35,100315],{"x1":26446,"y1":2629,"x2":2604,"y2":2629,"stroke":2583,"markerEnd":100303,"style":2594},[2000,100317,38631],{"x":100318,"y":100319,"fill":2583,"style":49873},"215","109",[2585,100321],{"x":2587,"y":71541,"width":2635,"height":100322,"rx":2591,"fill":11165,"stroke":11166,"style":2594},"54",[2000,100324,100325],{"x":16357,"y":38755,"fill":2599,"style":38718},"Normal mode",[2000,100327,100329],{"x":16357,"y":100328,"fill":2583,"style":2605},"129","load_workbook(path)",[35,100331],{"x1":16990,"y1":2629,"x2":100332,"y2":2629,"stroke":2583,"markerEnd":100303,"style":2594},"588",[2000,100334,38628],{"x":100335,"y":100319,"fill":2583,"style":49873},"533",[2585,100337],{"x":2649,"y":71541,"width":2635,"height":100322,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,100339,99702],{"x":100340,"y":11099,"fill":2599,"style":38718},"665",[2000,100342,97854],{"x":100340,"y":11131,"fill":2583,"style":2605},[2000,100344,100345],{"x":100340,"y":2609,"fill":2583,"style":2605},"low memory",[2585,100347],{"x":26446,"y":2639,"width":2701,"height":100322,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,100349,99720],{"x":2626,"y":100350,"fill":2599,"style":38718},"191",[2000,100352,100354],{"x":2626,"y":100353,"fill":2583,"style":2605},"209","Workbook(write_only=True)",[18,100356,100358,100359,100361,100362,100364],{"id":100357},"variant-a-active-returns-none-on-a-read-only-workbook","Variant A — ",[30,100360,99636],{}," Returns ",[30,100363,571],{}," on a Read-Only Workbook",[14,100366,100367,100368,100370,100371,100373,100374,100377,100378,100380,100381,100384],{},"A read-only workbook does have ",[30,100369,99636],{},", but it returns ",[30,100372,571],{}," if the workbook contains only one sheet and that sheet's ",[30,100375,100376],{},"sheetState"," is not ",[30,100379,28642],{},", or if ",[30,100382,100383],{},"active_sheet_index"," is not set in the 
XML. The safe pattern is to access the sheet by name:",[23,100386,100388],{"className":126,"code":100387,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\n\nPATH = Path(\"workbook.xlsx\")\n\nwb = load_workbook(PATH, read_only=True)\n\n# Fragile — may return None\n# ws = wb.active\n\n# Safe — access by name\nsheet_names = wb.sheetnames\nprint(sheet_names)           # inspect available sheets first\nws = wb[sheet_names[0]]      # or wb[\"Sheet1\"] if you know the name\n\nfor row in ws.iter_rows(values_only=True):\n    print(row)\n\nwb.close()   # always close read-only workbooks to release the file handle\n",[30,100389,100390,100394,100404,100414,100418,100430,100434,100454,100458,100463,100468,100472,100477,100487,100497,100514,100518,100536,100543,100547],{"__ignoreMap":28},[33,100391,100392],{"class":35,"line":36},[33,100393,98209],{"class":39},[33,100395,100396,100398,100400,100402],{"class":35,"line":43},[33,100397,190],{"class":163},[33,100399,193],{"class":167},[33,100401,164],{"class":163},[33,100403,198],{"class":167},[33,100405,100406,100408,100410,100412],{"class":35,"line":61},[33,100407,190],{"class":163},[33,100409,17103],{"class":167},[33,100411,164],{"class":163},[33,100413,17108],{"class":167},[33,100415,100416],{"class":35,"line":73},[33,100417,92],{"emptyLinePlaceholder":91},[33,100419,100420,100422,100424,100426,100428],{"class":35,"line":88},[33,100421,122],{"class":50},[33,100423,212],{"class":163},[33,100425,215],{"class":167},[33,100427,99833],{"class":54},[33,100429,221],{"class":167},[33,100431,100432],{"class":35,"line":95},[33,100433,92],{"emptyLinePlaceholder":91},[33,100435,100436,100438,100440,100442,100444,100446,100448,100450,100452],{"class":35,"line":101},[33,100437,98274],{"class":167},[33,100439,242],{"class":163},[33,100441,97956],{"class":167},[33,100443,122],{"class":50},[33,100445,365],{"class":167},[33,100447,98285],{"class":238},[33,100449,242],{"class":
163},[33,100451,855],{"class":50},[33,100453,221],{"class":167},[33,100455,100456],{"class":35,"line":171},[33,100457,92],{"emptyLinePlaceholder":91},[33,100459,100460],{"class":35,"line":179},[33,100461,100462],{"class":39},"# Fragile — may return None\n",[33,100464,100465],{"class":35,"line":187},[33,100466,100467],{"class":39},"# ws = wb.active\n",[33,100469,100470],{"class":35,"line":201},[33,100471,92],{"emptyLinePlaceholder":91},[33,100473,100474],{"class":35,"line":206},[33,100475,100476],{"class":39},"# Safe — access by name\n",[33,100478,100479,100482,100484],{"class":35,"line":224},[33,100480,100481],{"class":167},"sheet_names ",[33,100483,242],{"class":163},[33,100485,100486],{"class":167}," wb.sheetnames\n",[33,100488,100489,100491,100494],{"class":35,"line":229},[33,100490,13474],{"class":50},[33,100492,100493],{"class":167},"(sheet_names)           ",[33,100495,100496],{"class":39},"# inspect available sheets first\n",[33,100498,100499,100501,100503,100506,100508,100511],{"class":35,"line":235},[33,100500,98330],{"class":167},[33,100502,242],{"class":163},[33,100504,100505],{"class":167}," wb[sheet_names[",[33,100507,748],{"class":50},[33,100509,100510],{"class":167},"]]      ",[33,100512,100513],{"class":39},"# or wb[\"Sheet1\"] if you know the 
name\n",[33,100515,100516],{"class":35,"line":250},[33,100517,92],{"emptyLinePlaceholder":91},[33,100519,100520,100522,100524,100526,100528,100530,100532,100534],{"class":35,"line":266},[33,100521,6124],{"class":163},[33,100523,3844],{"class":167},[33,100525,662],{"class":163},[33,100527,17639],{"class":167},[33,100529,98352],{"class":238},[33,100531,242],{"class":163},[33,100533,855],{"class":50},[33,100535,1737],{"class":167},[33,100537,100538,100540],{"class":35,"line":290},[33,100539,7268],{"class":50},[33,100541,100542],{"class":167},"(row)\n",[33,100544,100545],{"class":35,"line":295},[33,100546,92],{"emptyLinePlaceholder":91},[33,100548,100549,100552],{"class":35,"line":300},[33,100550,100551],{"class":167},"wb.close()   ",[33,100553,100554],{"class":39},"# always close read-only workbooks to release the file handle\n",[18,100556,100558,100559],{"id":100557},"variant-b-accessing-cell-values-after-wbclose","Variant B — Accessing Cell Values After ",[30,100560,99640],{},[14,100562,100563,100564,100566,100567,100569],{},"In ",[30,100565,98285],{}," mode, rows are yielded from a lazy XML stream. Once you call ",[30,100568,99640],{},", the stream is closed and any references to cells or rows become unusable. 
Materialise the data before closing:",[23,100571,100573],{"className":126,"code":100572,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\n\nPATH = Path(\"large_file.xlsx\")\n\nwb = load_workbook(PATH, read_only=True)\nws = wb.active\n\n# Collect all rows into memory BEFORE closing\ndata = [row for row in ws.iter_rows(values_only=True)]\n\nwb.close()   # safe to close now — data is already in the list\n\n# Work with the data after close\nheaders = data[0]\nfor row in data[1:]:\n    record = dict(zip(headers, row))\n    # process record...\n\nprint(f\"Loaded {len(data) - 1} records\")\n",[30,100574,100575,100579,100589,100599,100603,100616,100620,100640,100648,100652,100657,100682,100686,100693,100697,100702,100715,100729,100746,100751,100755],{"__ignoreMap":28},[33,100576,100577],{"class":35,"line":36},[33,100578,98209],{"class":39},[33,100580,100581,100583,100585,100587],{"class":35,"line":43},[33,100582,190],{"class":163},[33,100584,193],{"class":167},[33,100586,164],{"class":163},[33,100588,198],{"class":167},[33,100590,100591,100593,100595,100597],{"class":35,"line":61},[33,100592,190],{"class":163},[33,100594,17103],{"class":167},[33,100596,164],{"class":163},[33,100598,17108],{"class":167},[33,100600,100601],{"class":35,"line":73},[33,100602,92],{"emptyLinePlaceholder":91},[33,100604,100605,100607,100609,100611,100614],{"class":35,"line":88},[33,100606,122],{"class":50},[33,100608,212],{"class":163},[33,100610,215],{"class":167},[33,100612,100613],{"class":54},"\"large_file.xlsx\"",[33,100615,221],{"class":167},[33,100617,100618],{"class":35,"line":95},[33,100619,92],{"emptyLinePlaceholder":91},[33,100621,100622,100624,100626,100628,100630,100632,100634,100636,100638],{"class":35,"line":101},[33,100623,98274],{"class":167},[33,100625,242],{"class":163},[33,100627,97956],{"class":167},[33,100629,122],{"class":50},[33,100631,365],{"class":167},[33,100633,98285],{"class":238},[33,100635,242],
{"class":163},[33,100637,855],{"class":50},[33,100639,221],{"class":167},[33,100641,100642,100644,100646],{"class":35,"line":171},[33,100643,98330],{"class":167},[33,100645,242],{"class":163},[33,100647,99877],{"class":167},[33,100649,100650],{"class":35,"line":179},[33,100651,92],{"emptyLinePlaceholder":91},[33,100653,100654],{"class":35,"line":187},[33,100655,100656],{"class":39},"# Collect all rows into memory BEFORE closing\n",[33,100658,100659,100661,100663,100666,100668,100670,100672,100674,100676,100678,100680],{"class":35,"line":201},[33,100660,16260],{"class":167},[33,100662,242],{"class":163},[33,100664,100665],{"class":167}," [row ",[33,100667,6124],{"class":163},[33,100669,3844],{"class":167},[33,100671,662],{"class":163},[33,100673,17639],{"class":167},[33,100675,98352],{"class":238},[33,100677,242],{"class":163},[33,100679,855],{"class":50},[33,100681,7767],{"class":167},[33,100683,100684],{"class":35,"line":206},[33,100685,92],{"emptyLinePlaceholder":91},[33,100687,100688,100690],{"class":35,"line":224},[33,100689,100551],{"class":167},[33,100691,100692],{"class":39},"# safe to close now — data is already in the list\n",[33,100694,100695],{"class":35,"line":229},[33,100696,92],{"emptyLinePlaceholder":91},[33,100698,100699],{"class":35,"line":235},[33,100700,100701],{"class":39},"# Work with the data after close\n",[33,100703,100704,100707,100709,100711,100713],{"class":35,"line":250},[33,100705,100706],{"class":167},"headers ",[33,100708,242],{"class":163},[33,100710,47294],{"class":167},[33,100712,748],{"class":50},[33,100714,9202],{"class":167},[33,100716,100717,100719,100721,100723,100725,100727],{"class":35,"line":266},[33,100718,6124],{"class":163},[33,100720,3844],{"class":167},[33,100722,662],{"class":163},[33,100724,47294],{"class":167},[33,100726,734],{"class":50},[33,100728,43533],{"class":167},[33,100730,100731,100734,100736,100738,100740,100743],{"class":35,"line":290},[33,100732,100733],{"class":167},"    record 
",[33,100735,242],{"class":163},[33,100737,85015],{"class":50},[33,100739,602],{"class":167},[33,100741,100742],{"class":50},"zip",[33,100744,100745],{"class":167},"(headers, row))\n",[33,100747,100748],{"class":35,"line":295},[33,100749,100750],{"class":39},"    # process record...\n",[33,100752,100753],{"class":35,"line":300},[33,100754,92],{"emptyLinePlaceholder":91},[33,100756,100757,100759,100761,100763,100765,100767,100769,100771,100773,100776],{"class":35,"line":317},[33,100758,13474],{"class":50},[33,100760,602],{"class":167},[33,100762,4059],{"class":163},[33,100764,96187],{"class":54},[33,100766,4065],{"class":50},[33,100768,20323],{"class":167},[33,100770,4126],{"class":163},[33,100772,11022],{"class":50},[33,100774,100775],{"class":54}," records\"",[33,100777,221],{"class":167},[14,100779,100780,100781,100783,100784,3035],{},"If the file is so large that materialising it is not feasible, process rows inside the ",[30,100782,22271],{}," block or before calling ",[30,100785,99640],{},[18,100787,100789,100790,100793,100794,100796],{"id":100788},"variant-c-write_only-mode-raises-notimplementederror-on-read","Variant C — ",[30,100791,100792],{},"write_only"," Mode Raises ",[30,100795,86120],{}," on Read",[14,100798,100799,100801],{},[30,100800,100354],{}," is for streaming large files to a new path — you cannot read back cells you already wrote:",[23,100803,100805],{"className":126,"code":100804,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom openpyxl import Workbook\n\nwb = Workbook(write_only=True)\nws = wb.create_sheet(\"Data\")\n\nws.append([\"Name\", \"Revenue\"])   # correct — append only\nws.append([\"North\", 15000])\n\n# ws[\"A1\"].value  →  raises NotImplementedError in write_only 
mode\n\nwb.save(\"output.xlsx\")\nwb.close()\n",[30,100806,100807,100811,100822,100826,100843,100856,100860,100877,100890,100894,100899,100903,100913],{"__ignoreMap":28},[33,100808,100809],{"class":35,"line":36},[33,100810,98209],{"class":39},[33,100812,100813,100815,100817,100819],{"class":35,"line":43},[33,100814,190],{"class":163},[33,100816,17103],{"class":167},[33,100818,164],{"class":163},[33,100820,100821],{"class":167}," Workbook\n",[33,100823,100824],{"class":35,"line":61},[33,100825,92],{"emptyLinePlaceholder":91},[33,100827,100828,100830,100832,100835,100837,100839,100841],{"class":35,"line":73},[33,100829,98274],{"class":167},[33,100831,242],{"class":163},[33,100833,100834],{"class":167}," Workbook(",[33,100836,100792],{"class":238},[33,100838,242],{"class":163},[33,100840,855],{"class":50},[33,100842,221],{"class":167},[33,100844,100845,100847,100849,100852,100854],{"class":35,"line":88},[33,100846,98330],{"class":167},[33,100848,242],{"class":163},[33,100850,100851],{"class":167}," wb.create_sheet(",[33,100853,17376],{"class":54},[33,100855,221],{"class":167},[33,100857,100858],{"class":35,"line":95},[33,100859,92],{"emptyLinePlaceholder":91},[33,100861,100862,100865,100868,100870,100872,100874],{"class":35,"line":101},[33,100863,100864],{"class":167},"ws.append([",[33,100866,100867],{"class":54},"\"Name\"",[33,100869,365],{"class":167},[33,100871,12925],{"class":54},[33,100873,7283],{"class":167},[33,100875,100876],{"class":39},"# correct — append only\n",[33,100878,100879,100881,100883,100885,100888],{"class":35,"line":171},[33,100880,100864],{"class":167},[33,100882,11760],{"class":54},[33,100884,365],{"class":167},[33,100886,100887],{"class":50},"15000",[33,100889,751],{"class":167},[33,100891,100892],{"class":35,"line":179},[33,100893,92],{"emptyLinePlaceholder":91},[33,100895,100896],{"class":35,"line":187},[33,100897,100898],{"class":39},"# ws[\"A1\"].value  →  raises NotImplementedError in write_only 
mode\n",[33,100900,100901],{"class":35,"line":201},[33,100902,92],{"emptyLinePlaceholder":91},[33,100904,100905,100908,100911],{"class":35,"line":206},[33,100906,100907],{"class":167},"wb.save(",[33,100909,100910],{"class":54},"\"output.xlsx\"",[33,100912,221],{"class":167},[33,100914,100915],{"class":35,"line":224},[33,100916,98453],{"class":167},[14,100918,100919,100920,100922,100923,3035],{},"If you need to read back values after writing, use normal mode (",[30,100921,99727],{},") or load the saved file afterwards with ",[30,100924,100925],{},"load_workbook(\"output.xlsx\")",[18,100927,100929],{"id":100928},"variant-d-io-operation-on-closed-file","Variant D — \"I\u002FO operation on closed file\"",[14,100931,100932,100933,100935,100936,100939,100940,100942],{},"This ",[30,100934,95615],{}," appears when you call ",[30,100937,100938],{},"wb.save()"," after ",[30,100941,99640],{},", or when a context manager closes the workbook before you save:",[23,100944,100946],{"className":126,"code":100945,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\n\nPATH   = Path(\"workbook.xlsx\")\nOUTPUT = Path(\"updated.xlsx\")\n\ntry:\n    wb = load_workbook(PATH)\n    ws = wb.active\n    ws[\"A1\"] = \"updated\"\n    # wb.close()  ← DON'T close before saving\n    wb.save(OUTPUT)    # save first\n    wb.close()         # then close\nexcept ValueError as exc:\n    print(f\"Caught: {exc}\")   # \"I\u002FO operation on closed 
file\"\n",[30,100947,100948,100952,100962,100972,100976,100988,101001,101005,101011,101023,101031,101044,101049,101061,101069,101079],{"__ignoreMap":28},[33,100949,100950],{"class":35,"line":36},[33,100951,98209],{"class":39},[33,100953,100954,100956,100958,100960],{"class":35,"line":43},[33,100955,190],{"class":163},[33,100957,193],{"class":167},[33,100959,164],{"class":163},[33,100961,198],{"class":167},[33,100963,100964,100966,100968,100970],{"class":35,"line":61},[33,100965,190],{"class":163},[33,100967,17103],{"class":167},[33,100969,164],{"class":163},[33,100971,17108],{"class":167},[33,100973,100974],{"class":35,"line":73},[33,100975,92],{"emptyLinePlaceholder":91},[33,100977,100978,100980,100982,100984,100986],{"class":35,"line":88},[33,100979,122],{"class":50},[33,100981,21012],{"class":163},[33,100983,215],{"class":167},[33,100985,99833],{"class":54},[33,100987,221],{"class":167},[33,100989,100990,100992,100994,100996,100999],{"class":35,"line":95},[33,100991,96935],{"class":50},[33,100993,212],{"class":163},[33,100995,215],{"class":167},[33,100997,100998],{"class":54},"\"updated.xlsx\"",[33,101000,221],{"class":167},[33,101002,101003],{"class":35,"line":101},[33,101004,92],{"emptyLinePlaceholder":91},[33,101006,101007,101009],{"class":35,"line":171},[33,101008,35574],{"class":163},[33,101010,574],{"class":167},[33,101012,101013,101015,101017,101019,101021],{"class":35,"line":179},[33,101014,17432],{"class":167},[33,101016,242],{"class":163},[33,101018,97956],{"class":167},[33,101020,122],{"class":50},[33,101022,221],{"class":167},[33,101024,101025,101027,101029],{"class":35,"line":187},[33,101026,17442],{"class":167},[33,101028,242],{"class":163},[33,101030,99877],{"class":167},[33,101032,101033,101035,101037,101039,101041],{"class":35,"line":201},[33,101034,99896],{"class":167},[33,101036,99899],{"class":54},[33,101038,763],{"class":167},[33,101040,242],{"class":163},[33,101042,101043],{"class":54}," 
\"updated\"\n",[33,101045,101046],{"class":35,"line":206},[33,101047,101048],{"class":39},"    # wb.close()  ← DON'T close before saving\n",[33,101050,101051,101053,101055,101058],{"class":35,"line":224},[33,101052,98136],{"class":167},[33,101054,96935],{"class":50},[33,101056,101057],{"class":167},")    ",[33,101059,101060],{"class":39},"# save first\n",[33,101062,101063,101066],{"class":35,"line":229},[33,101064,101065],{"class":167},"    wb.close()         ",[33,101067,101068],{"class":39},"# then close\n",[33,101070,101071,101073,101075,101077],{"class":35,"line":235},[33,101072,35726],{"class":163},[33,101074,4054],{"class":50},[33,101076,1852],{"class":163},[33,101078,1855],{"class":167},[33,101080,101081,101083,101085,101087,101089,101091,101093,101095,101097,101099],{"class":35,"line":250},[33,101082,7268],{"class":50},[33,101084,602],{"class":167},[33,101086,4059],{"class":163},[33,101088,99931],{"class":54},[33,101090,1115],{"class":50},[33,101092,6565],{"class":167},[33,101094,1121],{"class":50},[33,101096,274],{"class":54},[33,101098,12000],{"class":167},[33,101100,101101],{"class":39},"# \"I\u002FO operation on closed file\"\n",[14,101103,101104,101105,101107,101108,101110,101111,101113,101114,10065,101117,101120,101121,101123],{},"The pattern: save, then close. Although you might reach for a ",[30,101106,22271],{}," block, ",[30,101109,22009],{}," does not natively support context managers on ",[30,101112,22404],{}," — you must call ",[30,101115,101116],{},"save",[30,101118,101119],{},"close"," explicitly, or use ",[30,101122,96826],{},", which handles this lifecycle automatically.",[18,101125,101127,101128,101130,101131,101134],{"id":101126},"variant-e-read_only-workbook-with-iter_rows-returning-empty-after-re-iteration","Variant E — ",[30,101129,98285],{}," Workbook with ",[30,101132,101133],{},".iter_rows()"," Returning Empty After Re-Iteration",[14,101136,101137,101138,101140],{},"The lazy row iterator in a ",[30,101139,99743],{}," is a one-shot generator. 
Once it is exhausted you cannot iterate it again — the second pass returns nothing:",[23,101142,101144],{"className":126,"code":101143,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\n\nPATH = Path(\"workbook.xlsx\")\n\nwb = load_workbook(PATH, read_only=True)\nws = wb.active\n\n# First pass — works\nfor row in ws.iter_rows(values_only=True):\n    pass   # process rows\n\n# Second pass — silently empty in read_only mode\ncount = sum(1 for _ in ws.iter_rows(values_only=True))\nprint(count)   # prints 0\n\nwb.close()\n",[30,101145,101146,101150,101160,101170,101174,101186,101190,101210,101218,101222,101227,101245,101253,101257,101262,101292,101302,101306],{"__ignoreMap":28},[33,101147,101148],{"class":35,"line":36},[33,101149,98209],{"class":39},[33,101151,101152,101154,101156,101158],{"class":35,"line":43},[33,101153,190],{"class":163},[33,101155,193],{"class":167},[33,101157,164],{"class":163},[33,101159,198],{"class":167},[33,101161,101162,101164,101166,101168],{"class":35,"line":61},[33,101163,190],{"class":163},[33,101165,17103],{"class":167},[33,101167,164],{"class":163},[33,101169,17108],{"class":167},[33,101171,101172],{"class":35,"line":73},[33,101173,92],{"emptyLinePlaceholder":91},[33,101175,101176,101178,101180,101182,101184],{"class":35,"line":88},[33,101177,122],{"class":50},[33,101179,212],{"class":163},[33,101181,215],{"class":167},[33,101183,99833],{"class":54},[33,101185,221],{"class":167},[33,101187,101188],{"class":35,"line":95},[33,101189,92],{"emptyLinePlaceholder":91},[33,101191,101192,101194,101196,101198,101200,101202,101204,101206,101208],{"class":35,"line":101},[33,101193,98274],{"class":167},[33,101195,242],{"class":163},[33,101197,97956],{"class":167},[33,101199,122],{"class":50},[33,101201,365],{"class":167},[33,101203,98285],{"class":238},[33,101205,242],{"class":163},[33,101207,855],{"class":50},[33,101209,221],{"class":167},[33,101211,101212,101214,101216],{"clas
s":35,"line":171},[33,101213,98330],{"class":167},[33,101215,242],{"class":163},[33,101217,99877],{"class":167},[33,101219,101220],{"class":35,"line":179},[33,101221,92],{"emptyLinePlaceholder":91},[33,101223,101224],{"class":35,"line":187},[33,101225,101226],{"class":39},"# First pass — works\n",[33,101228,101229,101231,101233,101235,101237,101239,101241,101243],{"class":35,"line":201},[33,101230,6124],{"class":163},[33,101232,3844],{"class":167},[33,101234,662],{"class":163},[33,101236,17639],{"class":167},[33,101238,98352],{"class":238},[33,101240,242],{"class":163},[33,101242,855],{"class":50},[33,101244,1737],{"class":167},[33,101246,101247,101250],{"class":35,"line":206},[33,101248,101249],{"class":163},"    pass",[33,101251,101252],{"class":39},"   # process rows\n",[33,101254,101255],{"class":35,"line":224},[33,101256,92],{"emptyLinePlaceholder":91},[33,101258,101259],{"class":35,"line":229},[33,101260,101261],{"class":39},"# Second pass — silently empty in read_only mode\n",[33,101263,101264,101267,101269,101271,101273,101275,101277,101280,101282,101284,101286,101288,101290],{"class":35,"line":235},[33,101265,101266],{"class":167},"count ",[33,101268,242],{"class":163},[33,101270,46601],{"class":50},[33,101272,602],{"class":167},[33,101274,734],{"class":50},[33,101276,14766],{"class":163},[33,101278,101279],{"class":167}," _ ",[33,101281,662],{"class":163},[33,101283,17639],{"class":167},[33,101285,98352],{"class":238},[33,101287,242],{"class":163},[33,101289,855],{"class":50},[33,101291,371],{"class":167},[33,101293,101294,101296,101299],{"class":35,"line":250},[33,101295,13474],{"class":50},[33,101297,101298],{"class":167},"(count)   ",[33,101300,101301],{"class":39},"# prints 0\n",[33,101303,101304],{"class":35,"line":266},[33,101305,92],{"emptyLinePlaceholder":91},[33,101307,101308],{"class":35,"line":290},[33,101309,98453],{"class":167},[14,101311,101312],{},"Fix: reload the workbook for the second pass, or materialise into a list on the first pass 
(as shown in Variant B).",[18,101314,101316,101317],{"id":101315},"variant-f-applying-openpyxl-styles-when-writing-with-pdexcelwriter","Variant F — Applying openpyxl Styles When Writing with ",[30,101318,22392],{},[14,101320,101321,101322,101325,101326,101328,101329,101332,101333,101335,101336,101338,101339,101342],{},"A common mistake when using ",[30,101323,101324],{},"pd.ExcelWriter(path, engine=\"openpyxl\")"," is applying ",[30,101327,17066],{},"-style ",[30,101330,101331],{},"add_format"," calls — those belong to the ",[30,101334,17066],{}," API. With the ",[30,101337,22009],{}," engine, use ",[30,101340,101341],{},"openpyxl.styles"," objects instead:",[23,101344,101346],{"className":126,"code":101345,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\nfrom openpyxl.styles import Font, PatternFill, Alignment\n\nOUTPUT = Path(\"output\u002Fstyled.xlsx\")\ndf = pd.DataFrame({\"Region\": [\"North\", \"South\"], \"Revenue\": [12000, 4500]})\n\ntry:\n    with pd.ExcelWriter(OUTPUT, engine=\"openpyxl\") as writer:\n        df.to_excel(writer, sheet_name=\"Data\", index=False)\n\n        wb = writer.book\n        ws = wb[\"Data\"]\n\n        # Style the header row\n        for cell in ws[1]:\n            cell.font      = Font(bold=True, color=\"ffffff\", size=12)\n            cell.fill      = PatternFill(\"solid\", fgColor=\"2563eb\")\n            cell.alignment = Alignment(horizontal=\"center\")\n\n        # Number format on revenue column\n        for row in ws.iter_rows(min_row=2, max_row=ws.max_row,\n                                min_col=2, max_col=2):\n            for cell in row:\n                cell.number_format = \"#,##0.00\"\n\nexcept Exception as exc:\n    raise SystemExit(f\"Export failed: {exc}\")\n\nprint(f\"Styled file written: 
{OUTPUT}\")\n",[30,101347,101348,101352,101362,101372,101382,101386,101399,101434,101438,101444,101466,101487,101491,101499,101512,101516,101521,101535,101569,101591,101608,101612,101617,101642,101661,101671,101679,101683,101693,101716,101720],{"__ignoreMap":28},[33,101349,101350],{"class":35,"line":36},[33,101351,3952],{"class":39},[33,101353,101354,101356,101358,101360],{"class":35,"line":43},[33,101355,190],{"class":163},[33,101357,193],{"class":167},[33,101359,164],{"class":163},[33,101361,198],{"class":167},[33,101363,101364,101366,101368,101370],{"class":35,"line":61},[33,101365,164],{"class":163},[33,101367,492],{"class":167},[33,101369,495],{"class":163},[33,101371,498],{"class":167},[33,101373,101374,101376,101378,101380],{"class":35,"line":73},[33,101375,190],{"class":163},[33,101377,17115],{"class":167},[33,101379,164],{"class":163},[33,101381,22631],{"class":167},[33,101383,101384],{"class":35,"line":88},[33,101385,92],{"emptyLinePlaceholder":91},[33,101387,101388,101390,101392,101394,101397],{"class":35,"line":95},[33,101389,96935],{"class":50},[33,101391,212],{"class":163},[33,101393,215],{"class":167},[33,101395,101396],{"class":54},"\"output\u002Fstyled.xlsx\"",[33,101398,221],{"class":167},[33,101400,101401,101403,101405,101408,101410,101412,101414,101416,101418,101420,101422,101424,101427,101429,101432],{"class":35,"line":101},[33,101402,13459],{"class":167},[33,101404,242],{"class":163},[33,101406,101407],{"class":167}," 
pd.DataFrame({",[33,101409,11865],{"class":54},[33,101411,12426],{"class":167},[33,101413,11760],{"class":54},[33,101415,365],{"class":167},[33,101417,11773],{"class":54},[33,101419,8314],{"class":167},[33,101421,12925],{"class":54},[33,101423,12426],{"class":167},[33,101425,101426],{"class":50},"12000",[33,101428,365],{"class":167},[33,101430,101431],{"class":50},"4500",[33,101433,45051],{"class":167},[33,101435,101436],{"class":35,"line":171},[33,101437,92],{"emptyLinePlaceholder":91},[33,101439,101440,101442],{"class":35,"line":179},[33,101441,35574],{"class":163},[33,101443,574],{"class":167},[33,101445,101446,101448,101450,101452,101454,101456,101458,101460,101462,101464],{"class":35,"line":187},[33,101447,1635],{"class":163},[33,101449,96996],{"class":167},[33,101451,96935],{"class":50},[33,101453,365],{"class":167},[33,101455,17351],{"class":238},[33,101457,242],{"class":163},[33,101459,17356],{"class":54},[33,101461,1649],{"class":167},[33,101463,495],{"class":163},[33,101465,17363],{"class":167},[33,101467,101468,101471,101473,101475,101477,101479,101481,101483,101485],{"class":35,"line":201},[33,101469,101470],{"class":167},"        df.to_excel(writer, ",[33,101472,17371],{"class":238},[33,101474,242],{"class":163},[33,101476,17376],{"class":54},[33,101478,365],{"class":167},[33,101480,897],{"class":238},[33,101482,242],{"class":163},[33,101484,902],{"class":50},[33,101486,221],{"class":167},[33,101488,101489],{"class":35,"line":206},[33,101490,92],{"emptyLinePlaceholder":91},[33,101492,101493,101495,101497],{"class":35,"line":224},[33,101494,97065],{"class":167},[33,101496,242],{"class":163},[33,101498,97070],{"class":167},[33,101500,101501,101504,101506,101508,101510],{"class":35,"line":229},[33,101502,101503],{"class":167},"        ws 
",[33,101505,242],{"class":163},[33,101507,17447],{"class":167},[33,101509,17376],{"class":54},[33,101511,9202],{"class":167},[33,101513,101514],{"class":35,"line":235},[33,101515,92],{"emptyLinePlaceholder":91},[33,101517,101518],{"class":35,"line":250},[33,101519,101520],{"class":39},"        # Style the header row\n",[33,101522,101523,101525,101527,101529,101531,101533],{"class":35,"line":266},[33,101524,5973],{"class":163},[33,101526,17467],{"class":167},[33,101528,662],{"class":163},[33,101530,17472],{"class":167},[33,101532,734],{"class":50},[33,101534,17477],{"class":167},[33,101536,101537,101540,101542,101544,101546,101548,101550,101552,101554,101556,101559,101561,101563,101565,101567],{"class":35,"line":290},[33,101538,101539],{"class":167},"            cell.font      ",[33,101541,242],{"class":163},[33,101543,17233],{"class":167},[33,101545,17236],{"class":238},[33,101547,242],{"class":163},[33,101549,855],{"class":50},[33,101551,365],{"class":167},[33,101553,17245],{"class":238},[33,101555,242],{"class":163},[33,101557,101558],{"class":54},"\"ffffff\"",[33,101560,365],{"class":167},[33,101562,17255],{"class":238},[33,101564,242],{"class":163},[33,101566,55650],{"class":50},[33,101568,221],{"class":167},[33,101570,101571,101574,101576,101578,101580,101582,101584,101586,101589],{"class":35,"line":295},[33,101572,101573],{"class":167},"            cell.fill      ",[33,101575,242],{"class":163},[33,101577,17185],{"class":167},[33,101579,17188],{"class":54},[33,101581,365],{"class":167},[33,101583,17193],{"class":238},[33,101585,242],{"class":163},[33,101587,101588],{"class":54},"\"2563eb\"",[33,101590,221],{"class":167},[33,101592,101593,101596,101598,101600,101602,101604,101606],{"class":35,"line":300},[33,101594,101595],{"class":167},"            cell.alignment 
",[33,101597,242],{"class":163},[33,101599,17507],{"class":167},[33,101601,17510],{"class":238},[33,101603,242],{"class":163},[33,101605,17515],{"class":54},[33,101607,221],{"class":167},[33,101609,101610],{"class":35,"line":317},[33,101611,92],{"emptyLinePlaceholder":91},[33,101613,101614],{"class":35,"line":332},[33,101615,101616],{"class":39},"        # Number format on revenue column\n",[33,101618,101619,101621,101623,101625,101627,101629,101631,101633,101635,101637,101639],{"class":35,"line":347},[33,101620,5973],{"class":163},[33,101622,3844],{"class":167},[33,101624,662],{"class":163},[33,101626,17639],{"class":167},[33,101628,17642],{"class":238},[33,101630,242],{"class":163},[33,101632,1533],{"class":50},[33,101634,365],{"class":167},[33,101636,97398],{"class":238},[33,101638,242],{"class":163},[33,101640,101641],{"class":167},"ws.max_row,\n",[33,101643,101644,101647,101649,101651,101653,101655,101657,101659],{"class":35,"line":374},[33,101645,101646],{"class":238},"                                
min_col",[33,101648,242],{"class":163},[33,101650,1533],{"class":50},[33,101652,365],{"class":167},[33,101654,17659],{"class":238},[33,101656,242],{"class":163},[33,101658,1533],{"class":50},[33,101660,1737],{"class":167},[33,101662,101663,101665,101667,101669],{"class":35,"line":397},[33,101664,1793],{"class":163},[33,101666,17467],{"class":167},[33,101668,662],{"class":163},[33,101670,17675],{"class":167},[33,101672,101673,101675,101677],{"class":35,"line":653},[33,101674,17680],{"class":167},[33,101676,242],{"class":163},[33,101678,97444],{"class":54},[33,101680,101681],{"class":35,"line":667},[33,101682,92],{"emptyLinePlaceholder":91},[33,101684,101685,101687,101689,101691],{"class":35,"line":675},[33,101686,35726],{"class":163},[33,101688,783],{"class":50},[33,101690,1852],{"class":163},[33,101692,1855],{"class":167},[33,101694,101695,101697,101699,101701,101703,101706,101708,101710,101712,101714],{"class":35,"line":689},[33,101696,35742],{"class":163},[33,101698,16617],{"class":50},[33,101700,602],{"class":167},[33,101702,4059],{"class":163},[33,101704,101705],{"class":54},"\"Export failed: ",[33,101707,1115],{"class":50},[33,101709,6565],{"class":167},[33,101711,1121],{"class":50},[33,101713,274],{"class":54},[33,101715,221],{"class":167},[33,101717,101718],{"class":35,"line":703},[33,101719,92],{"emptyLinePlaceholder":91},[33,101721,101722,101724,101726,101728,101731,101733,101735],{"class":35,"line":714},[33,101723,13474],{"class":50},[33,101725,602],{"class":167},[33,101727,4059],{"class":163},[33,101729,101730],{"class":54},"\"Styled file written: ",[33,101732,97684],{"class":50},[33,101734,274],{"class":54},[33,101736,221],{"class":167},[14,101738,101739,101740,101743,101744,101747],{},"If you need the richer format\u002Fchart API, switch to ",[30,101741,101742],{},"engine=\"xlsxwriter\""," and use ",[30,101745,101746],{},"wb.add_format(...)"," instead. 
The two engines are not interchangeable — pick one per workbook.",[18,101749,101751],{"id":101750},"common-mode-confusion-summary","Common Mode Confusion Summary",[4273,101753,101754,101766],{},[4276,101755,101756],{},[4279,101757,101758,101760,101763],{},[4282,101759,4284],{},[4282,101761,101762],{},"Likely cause",[4282,101764,101765],{},"Correct call",[4292,101767,101768,101784,101801,101820,101837,101859],{},[4279,101769,101770,101775,101780],{},[4297,101771,101772],{},[30,101773,101774],{},"AttributeError: 'ReadOnlyWorksheet' has no attribute '__setitem__'",[4297,101776,101777,101779],{},[30,101778,97854],{}," used for a write workflow",[4297,101781,101782],{},[30,101783,100329],{},[4279,101785,101786,101792,101795],{},[4297,101787,101788,95600,101790],{},[30,101789,99636],{},[30,101791,571],{},[4297,101793,101794],{},"read-only workbook with a non-default active sheet",[4297,101796,101797,101798],{},"Access by name: ",[30,101799,101800],{},"wb[\"Sheet1\"]",[4279,101802,101803,101810,101813],{},[4297,101804,101805,101806,101809],{},"Second ",[30,101807,101808],{},"iter_rows()"," loop returns nothing",[4297,101811,101812],{},"One-shot generator exhausted",[4297,101814,101815,101816,101819],{},"Materialise to ",[30,101817,101818],{},"list()"," on first pass",[4279,101821,101822,101827,101834],{},[4297,101823,101824,101826],{},[30,101825,86120],{}," reading a cell",[4297,101828,101829,101831,101832],{},[30,101830,22013],{}," on a ",[30,101833,99727],{},[4297,101835,101836],{},"Load the saved file separately for reading",[4279,101838,101839,101844,101853],{},[4297,101840,101841],{},[30,101842,101843],{},"ValueError: I\u002FO operation on closed file",[4297,101845,101846,101849,101850],{},[30,101847,101848],{},"save()"," called after 
",[30,101851,101852],{},"close()",[4297,101854,74566,101855,83542,101857],{},[30,101856,101848],{},[30,101858,101852],{},[4279,101860,101861,101867,101875],{},[4297,101862,101863,20859,101865],{},[30,101864,101331],{},[30,101866,99632],{},[4297,101868,101869,101871,101872,101874],{},[30,101870,17066],{}," API called on an ",[30,101873,22009],{}," worksheet",[4297,101876,17059,101877,101879,101880,101882],{},[30,101878,101341],{}," objects with ",[30,101881,22009],{}," engine",[18,101884,9247],{"id":9246},[14,101886,101887],{},"After applying the fix, run the following assertion to confirm the file was written successfully:",[23,101889,101891],{"className":126,"code":101890,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\n\nOUTPUT = Path(\"workbook_updated.xlsx\")\n\nwb = load_workbook(OUTPUT, read_only=True)\nws = wb.active\nfirst_row = next(ws.iter_rows(values_only=True))\nwb.close()\n\nassert first_row[0] == \"Report Title\", f\"Unexpected value: {first_row[0]}\"\nprint(\"Write verified 
successfully\")\n",[30,101892,101893,101897,101907,101917,101921,101933,101937,101957,101965,101985,101989,101993,102028],{"__ignoreMap":28},[33,101894,101895],{"class":35,"line":36},[33,101896,98209],{"class":39},[33,101898,101899,101901,101903,101905],{"class":35,"line":43},[33,101900,190],{"class":163},[33,101902,193],{"class":167},[33,101904,164],{"class":163},[33,101906,198],{"class":167},[33,101908,101909,101911,101913,101915],{"class":35,"line":61},[33,101910,190],{"class":163},[33,101912,17103],{"class":167},[33,101914,164],{"class":163},[33,101916,17108],{"class":167},[33,101918,101919],{"class":35,"line":73},[33,101920,92],{"emptyLinePlaceholder":91},[33,101922,101923,101925,101927,101929,101931],{"class":35,"line":88},[33,101924,96935],{"class":50},[33,101926,212],{"class":163},[33,101928,215],{"class":167},[33,101930,100040],{"class":54},[33,101932,221],{"class":167},[33,101934,101935],{"class":35,"line":95},[33,101936,92],{"emptyLinePlaceholder":91},[33,101938,101939,101941,101943,101945,101947,101949,101951,101953,101955],{"class":35,"line":101},[33,101940,98274],{"class":167},[33,101942,242],{"class":163},[33,101944,97956],{"class":167},[33,101946,96935],{"class":50},[33,101948,365],{"class":167},[33,101950,98285],{"class":238},[33,101952,242],{"class":163},[33,101954,855],{"class":50},[33,101956,221],{"class":167},[33,101958,101959,101961,101963],{"class":35,"line":171},[33,101960,98330],{"class":167},[33,101962,242],{"class":163},[33,101964,99877],{"class":167},[33,101966,101967,101970,101972,101975,101977,101979,101981,101983],{"class":35,"line":179},[33,101968,101969],{"class":167},"first_row ",[33,101971,242],{"class":163},[33,101973,101974],{"class":50}," 
next",[33,101976,98349],{"class":167},[33,101978,98352],{"class":238},[33,101980,242],{"class":163},[33,101982,855],{"class":50},[33,101984,371],{"class":167},[33,101986,101987],{"class":35,"line":187},[33,101988,98453],{"class":167},[33,101990,101991],{"class":35,"line":201},[33,101992,92],{"emptyLinePlaceholder":91},[33,101994,101995,101997,102000,102002,102004,102006,102008,102010,102012,102015,102017,102020,102022,102024,102026],{"class":35,"line":206},[33,101996,36397],{"class":163},[33,101998,101999],{"class":167}," first_row[",[33,102001,748],{"class":50},[33,102003,763],{"class":167},[33,102005,1865],{"class":163},[33,102007,100100],{"class":54},[33,102009,365],{"class":167},[33,102011,4059],{"class":163},[33,102013,102014],{"class":54},"\"Unexpected value: ",[33,102016,1115],{"class":50},[33,102018,102019],{"class":167},"first_row[",[33,102021,748],{"class":50},[33,102023,9546],{"class":167},[33,102025,1121],{"class":50},[33,102027,7504],{"class":54},[33,102029,102030,102032,102034,102037],{"class":35,"line":224},[33,102031,13474],{"class":50},[33,102033,602],{"class":167},[33,102035,102036],{"class":54},"\"Write verified successfully\"",[33,102038,221],{"class":167},[14,102040,102041,102042,102044,102045,3035],{},"A passing assertion confirms the cell was written. 
If the assertion fails, check that you saved to ",[30,102043,96935],{}," and not the original ",[30,102046,122],{},[18,102048,6918],{"id":6917},[4211,102050,102051,102056,102061,102069],{},[4214,102052,102053,102055],{},[940,102054,6936],{"href":6935}," — full pipeline using openpyxl and xlsxwriter for styled multi-sheet reports",[4214,102057,102058,102060],{},[940,102059,95780],{"href":21804}," — end-to-end example that loads and fills an existing template",[4214,102062,102063,102065,102066,102068],{},[940,102064,99577],{"href":99576}," — correct patterns for reading ",[30,102067,26542],{}," files, including large-file strategies",[4214,102070,102071,102075],{},[940,102072,102074],{"href":102073},"\u002Fpython-for-excel-csv-data-processing\u002Fwriting-excel-formulas-and-charts-with-openpyxl\u002F","Writing Excel Formulas and Charts with openpyxl"," — advanced write patterns once the workbook is open in normal mode",[14,102077,6947,102078,3035],{},[940,102079,6936],{"href":6935},[6953,102081,102082],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: 
var(--shiki-default-text-decoration);}",{"title":28,"searchDepth":43,"depth":43,"links":102084},[102085,102086,102087,102089,102091,102093,102095,102096,102098,102100,102101,102102],{"id":7020,"depth":43,"text":7021},{"id":54445,"depth":43,"text":99786},{"id":99966,"depth":43,"text":102088},"Fix — Open Without read_only for Editing",{"id":100357,"depth":43,"text":102090},"Variant A — .active Returns None on a Read-Only Workbook",{"id":100557,"depth":43,"text":102092},"Variant B — Accessing Cell Values After wb.close()",{"id":100788,"depth":43,"text":102094},"Variant C — write_only Mode Raises NotImplementedError on Read",{"id":100928,"depth":43,"text":100929},{"id":101126,"depth":43,"text":102097},"Variant E — read_only Workbook with .iter_rows() Returning Empty After Re-Iteration",{"id":101315,"depth":43,"text":102099},"Variant F — Applying openpyxl Styles When Writing with pd.ExcelWriter",{"id":101750,"depth":43,"text":101751},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fix Read-Only Mode Error","openpyxl raises AttributeError or TypeError when you try to write to a workbook opened with read_only=True. 
Here is the exact fix and when to use each mode.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Ffix-openpyxl-read-only-mode-error",{"title":97863,"description":102104},"Fix openpyxl Read-Only Mode Error When Writing","python-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Ffix-openpyxl-read-only-mode-error\u002Findex",[22009,47,99614,81739],"H_QFHePxlCqaUnbkFXBhU9FV77YkWWeddADWYB87ssg",{"id":102113,"title":6936,"body":102114,"breadcrumbTitle":107411,"canonical":6977,"date":6977,"description":107412,"draft":6980,"extension":6981,"image":6977,"meta":107413,"navigation":91,"path":107414,"robots":6977,"seo":107415,"seoTitle":107420,"stem":107421,"tags":6977,"updatedAt":6977,"__hash__":107422},"content\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Findex.md",{"type":7,"value":102115,"toc":107389},[102116,102119,102122,102136,102138,102141,102178,102183,102189,102192,102302,102306,102309,102435,102457,102461,102464,102568,102572,102946,102950,102969,103503,103507,103510,104039,104043,104046,104500,104504,104507,104925,104927,104931,104943,105318,105322,105327,105334,105340,105513,105515,105518,105688,105690,105738,105740,105864,105866,107267,107269,107291,107307,107328,107345,107358,107360,107382,107386],[10,102117,6936],{"id":102118},"automating-excel-report-generation",[14,102120,102121],{},"Generating a formatted Excel report by hand once is manageable. Generating one every Monday, for every region, with conditional highlights, charts, and a summary row, is not. 
Python automates the entire sequence: pull data, aggregate, write styled cells, embed charts, and save — with no manual post-processing.",[14,102123,102124,102125,102127,102128,10065,102130,102132,102133,102135],{},"This guide covers the full workflow: library selection, ",[30,102126,9630],{}," ExcelWriter integration, ",[30,102129,22009],{},[30,102131,17066],{}," styling, multiple sheets, summary rows, conditional formatting, and embedded charts. The techniques here complement the broader ",[940,102134,26258],{"href":26257}," guide.",[18,102137,21],{"id":20},[14,102139,102140],{},"Python 3.9+, a virtual environment, and the following packages:",[23,102142,102144],{"className":25,"code":102143,"language":27,"meta":28,"style":28},"python -m venv .venv\nsource .venv\u002Fbin\u002Factivate   # Windows: .venv\\Scripts\\activate\npip install pandas openpyxl xlsxwriter\n",[30,102145,102146,102156,102165],{"__ignoreMap":28},[33,102147,102148,102150,102152,102154],{"class":35,"line":36},[33,102149,47],{"class":46},[33,102151,51],{"class":50},[33,102153,55],{"class":54},[33,102155,58],{"class":54},[33,102157,102158,102160,102162],{"class":35,"line":43},[33,102159,64],{"class":50},[33,102161,67],{"class":54},[33,102163,102164],{"class":39},"   # Windows: .venv\\Scripts\\activate\n",[33,102166,102167,102169,102171,102173,102175],{"class":35,"line":61},[33,102168,76],{"class":46},[33,102170,79],{"class":54},[33,102172,16183],{"class":54},[33,102174,16186],{"class":54},[33,102176,102177],{"class":54}," xlsxwriter\n",[14,102179,102180,102182],{},[30,102181,26625],{}," for a locked environment:",[23,102184,102187],{"className":102185,"code":102186,"language":2000},[1998],"pandas==2.2.2\nopenpyxl==3.1.2\nxlsxwriter==3.2.0\n",[30,102188,102186],{"__ignoreMap":28},[14,102190,102191],{},"Create a sample CSV to follow along:",[23,102193,102195],{"className":25,"code":102194,"language":27,"meta":28,"style":28},"mkdir -p data output\npython - \u003C\u003C'EOF'\nimport pandas as pd, 
numpy as np\nrng = np.random.default_rng(42)\nregions = [\"North\", \"South\", \"East\", \"West\"]\nproducts = [\"Widget A\", \"Widget B\", \"Gadget X\"]\nrows = []\nfor month in range(1, 7):\n    for _ in range(40):\n        rows.append({\n            \"month\": f\"2026-{month:02d}\",\n            \"region\": rng.choice(regions),\n            \"product\": rng.choice(products),\n            \"units\": int(rng.integers(10, 200)),\n            \"revenue\": round(float(rng.uniform(500, 8000)), 2),\n        })\npd.DataFrame(rows).to_csv(\"data\u002Fsales.csv\", index=False)\nprint(\"data\u002Fsales.csv written\")\nEOF\n",[30,102196,102197,102209,102219,102224,102229,102234,102239,102244,102249,102254,102259,102264,102269,102274,102279,102284,102288,102293,102298],{"__ignoreMap":28},[33,102198,102199,102201,102203,102206],{"class":35,"line":36},[33,102200,59501],{"class":46},[33,102202,59504],{"class":50},[33,102204,102205],{"class":54}," data",[33,102207,102208],{"class":54}," output\n",[33,102210,102211,102213,102215,102217],{"class":35,"line":43},[33,102212,47],{"class":46},[33,102214,39025],{"class":54},[33,102216,53957],{"class":163},[33,102218,53960],{"class":54},[33,102220,102221],{"class":35,"line":61},[33,102222,102223],{"class":54},"import pandas as pd, numpy as np\n",[33,102225,102226],{"class":35,"line":73},[33,102227,102228],{"class":54},"rng = np.random.default_rng(42)\n",[33,102230,102231],{"class":35,"line":88},[33,102232,102233],{"class":54},"regions = [\"North\", \"South\", \"East\", \"West\"]\n",[33,102235,102236],{"class":35,"line":95},[33,102237,102238],{"class":54},"products = [\"Widget A\", \"Widget B\", \"Gadget X\"]\n",[33,102240,102241],{"class":35,"line":101},[33,102242,102243],{"class":54},"rows = []\n",[33,102245,102246],{"class":35,"line":171},[33,102247,102248],{"class":54},"for month in range(1, 7):\n",[33,102250,102251],{"class":35,"line":179},[33,102252,102253],{"class":54},"    for _ in 
range(40):\n",[33,102255,102256],{"class":35,"line":187},[33,102257,102258],{"class":54},"        rows.append({\n",[33,102260,102261],{"class":35,"line":201},[33,102262,102263],{"class":54},"            \"month\": f\"2026-{month:02d}\",\n",[33,102265,102266],{"class":35,"line":206},[33,102267,102268],{"class":54},"            \"region\": rng.choice(regions),\n",[33,102270,102271],{"class":35,"line":224},[33,102272,102273],{"class":54},"            \"product\": rng.choice(products),\n",[33,102275,102276],{"class":35,"line":229},[33,102277,102278],{"class":54},"            \"units\": int(rng.integers(10, 200)),\n",[33,102280,102281],{"class":35,"line":235},[33,102282,102283],{"class":54},"            \"revenue\": round(float(rng.uniform(500, 8000)), 2),\n",[33,102285,102286],{"class":35,"line":250},[33,102287,83823],{"class":54},[33,102289,102290],{"class":35,"line":266},[33,102291,102292],{"class":54},"pd.DataFrame(rows).to_csv(\"data\u002Fsales.csv\", index=False)\n",[33,102294,102295],{"class":35,"line":290},[33,102296,102297],{"class":54},"print(\"data\u002Fsales.csv written\")\n",[33,102299,102300],{"class":35,"line":295},[33,102301,54019],{"class":54},[18,102303,102305],{"id":102304},"inspect-the-input-before-writing","Inspect the Input Before Writing",[14,102307,102308],{},"Always validate the incoming DataFrame before you write anything to Excel. 
Silent type coercions produce wrong number formats.",[23,102310,102312],{"className":126,"code":102311,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\nINPUT = Path(\"data\u002Fsales.csv\")\n\ntry:\n    df = pd.read_csv(INPUT)\n    print(df.dtypes)         # confirm numeric cols are not object\n    print(df.isnull().sum()) # find gaps before aggregation\n    print(df.head(3))\nexcept FileNotFoundError:\n    raise SystemExit(f\"Missing input file: {INPUT}\")\n",[30,102313,102314,102318,102328,102338,102342,102355,102359,102365,102377,102387,102397,102407,102415],{"__ignoreMap":28},[33,102315,102316],{"class":35,"line":36},[33,102317,8895],{"class":39},[33,102319,102320,102322,102324,102326],{"class":35,"line":43},[33,102321,190],{"class":163},[33,102323,193],{"class":167},[33,102325,164],{"class":163},[33,102327,198],{"class":167},[33,102329,102330,102332,102334,102336],{"class":35,"line":61},[33,102331,164],{"class":163},[33,102333,492],{"class":167},[33,102335,495],{"class":163},[33,102337,498],{"class":167},[33,102339,102340],{"class":35,"line":73},[33,102341,92],{"emptyLinePlaceholder":91},[33,102343,102344,102347,102349,102351,102353],{"class":35,"line":88},[33,102345,102346],{"class":50},"INPUT",[33,102348,212],{"class":163},[33,102350,215],{"class":167},[33,102352,59612],{"class":54},[33,102354,221],{"class":167},[33,102356,102357],{"class":35,"line":95},[33,102358,92],{"emptyLinePlaceholder":91},[33,102360,102361,102363],{"class":35,"line":101},[33,102362,35574],{"class":163},[33,102364,574],{"class":167},[33,102366,102367,102369,102371,102373,102375],{"class":35,"line":171},[33,102368,4025],{"class":167},[33,102370,242],{"class":163},[33,102372,9481],{"class":167},[33,102374,102346],{"class":50},[33,102376,221],{"class":167},[33,102378,102379,102381,102384],{"class":35,"line":179},[33,102380,7268],{"class":50},[33,102382,102383],{"class":167},"(df.dtypes)         ",[33,102385,102386],{"class":39},"# 
confirm numeric cols are not object\n",[33,102388,102389,102391,102394],{"class":35,"line":187},[33,102390,7268],{"class":50},[33,102392,102393],{"class":167},"(df.isnull().sum()) ",[33,102395,102396],{"class":39},"# find gaps before aggregation\n",[33,102398,102399,102401,102403,102405],{"class":35,"line":201},[33,102400,7268],{"class":50},[33,102402,35717],{"class":167},[33,102404,10258],{"class":50},[33,102406,371],{"class":167},[33,102408,102409,102411,102413],{"class":35,"line":206},[33,102410,35726],{"class":163},[33,102412,2945],{"class":50},[33,102414,574],{"class":167},[33,102416,102417,102419,102421,102423,102425,102428,102431,102433],{"class":35,"line":224},[33,102418,35742],{"class":163},[33,102420,16617],{"class":50},[33,102422,602],{"class":167},[33,102424,4059],{"class":163},[33,102426,102427],{"class":54},"\"Missing input file: ",[33,102429,102430],{"class":50},"{INPUT}",[33,102432,274],{"class":54},[33,102434,221],{"class":167},[14,102436,102437,102438,10065,102440,102442,102443,1351,102446,102449,102450,102452,102453,102456],{},"Expected: ",[30,102439,18528],{},[30,102441,18511],{}," as ",[30,102444,102445],{},"float64",[30,102447,102448],{},"int64",". 
If they show as ",[30,102451,11888],{},", the CSV contains currency symbols or commas — strip them with ",[30,102454,102455],{},"pd.to_numeric(df[\"revenue\"].str.replace(r\"[$,]\", \"\", regex=True), errors=\"coerce\")"," before proceeding.",[18,102458,102460],{"id":102459},"data-styler-workbook-sheets-the-pipeline","Data → Styler → Workbook → Sheets: The Pipeline",[14,102462,102463],{},"The diagram below shows how raw data flows through the four-stage pipeline this guide implements.",[2540,102465,2547,102468,2547,102471,2547,102474,2547,2547,102488,2547,102490,2547,102493,2547,102496,2547,2547,102499,2547,2547,102502,2547,102505,2547,102507,2547,102510,2547,2547,102513,2547,2547,102516,2547,102518,2547,102521,2547,102525,2547,2547,102529,2547,2547,102531,2547,102533,2547,102537,2547,102540,2547,102544,2547,102547,2547,102550,2547,102552,2547,102555,102558,102561,102564],{"viewBox":102466,"role":2543,"ariaLabel":102467,"xmlns":2545,"style":2546},"0 0 760 220","Four-stage pipeline: CSV input to pandas aggregation to ExcelWriter styling to multi-sheet workbook output",[2549,102469,102470],{},"Excel report generation pipeline",[2553,102472,102473],{},"Shows four stages: CSV\u002FDB Input feeds pandas Aggregation, which feeds ExcelWriter + Styler, which produces a Multi-Sheet Workbook with Summary, Detail, and Chart sheets.",[2557,102475,2559,102476,2559,102483,2547],{},[2561,102477,2564,102479,2564,102481,2559],{"id":102478,"x1":748,"y1":748,"x2":734,"y2":748},"excel-report-grad",[2566,102480],{"offset":748,"style":2568},[2566,102482],{"offset":734,"style":2571},[2573,102484,2564,102486,2559],{"id":102485,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"excel-report-arrow",[2580,102487],{"d":2582,"fill":2583},[2585,102489],{"x":3545,"y":2590,"width":2635,"height":2650,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,102491,102492],{"x":12900,"y":2650,"fill":2599,"style":16979},"CSV \u002F 
DB",[2000,102494,102495],{"x":12900,"y":11095,"fill":2583,"style":2685},"raw sales data",[2000,102497,102498],{"x":12900,"y":26332,"fill":2583,"style":2685},"mixed types",[35,102500],{"x1":2610,"y1":2679,"x2":16986,"y2":2679,"stroke":2583,"markerEnd":102501,"style":2594},"url(#excel-report-arrow)",[2585,102503],{"x":2611,"y":2590,"width":2610,"height":2650,"rx":2591,"fill":102504,"stroke":2593,"style":2594},"url(#excel-report-grad)",[2000,102506,9630],{"x":49839,"y":2650,"fill":2599,"style":16979},[2000,102508,102509],{"x":49839,"y":11095,"fill":2599,"style":2685},"groupby \u002F pivot",[2000,102511,102512],{"x":49839,"y":26332,"fill":2599,"style":2685},"type coercion",[35,102514],{"x1":59952,"y1":2679,"x2":102515,"y2":2679,"stroke":2583,"markerEnd":102501,"style":2594},"398",[2585,102517],{"x":47140,"y":2590,"width":2610,"height":2650,"rx":2591,"fill":102504,"stroke":2593,"style":2594},[2000,102519,102520],{"x":49852,"y":16357,"fill":2599,"style":16979},"ExcelWriter",[2000,102522,102524],{"x":49852,"y":102523,"fill":2599,"style":2685},"113","+ Styler",[2000,102526,102528],{"x":49852,"y":102527,"fill":2599,"style":2685},"131","formats \u002F charts",[35,102530],{"x1":49853,"y1":2679,"x2":89088,"y2":2679,"stroke":2583,"markerEnd":102501,"style":2594},[2585,102532],{"x":58359,"y":1543,"width":11194,"height":58337,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,102534,102536],{"x":102535,"y":2590,"fill":2599,"style":38718},"674","Workbook",[2585,102538],{"x":102539,"y":38748,"width":2589,"height":11104,"rx":1503,"fill":11165,"stroke":11166,"style":11105},"614",[2000,102541,102543],{"x":102535,"y":102542,"fill":2599,"style":2685},"89","Summary sheet",[2585,102545],{"x":102539,"y":102546,"width":2589,"height":11104,"rx":1503,"fill":11165,"stroke":11166,"style":11105},"105",[2000,102548,102549],{"x":102535,"y":26402,"fill":2599,"style":2685},"Detail 
sheet",[2585,102551],{"x":102539,"y":2609,"width":2589,"height":11104,"rx":1503,"fill":11165,"stroke":11166,"style":11105},[2000,102553,102554],{"x":102535,"y":17029,"fill":2599,"style":2685},"Chart sheet",[2000,102556,102557],{"x":12900,"y":16982,"fill":2583,"style":2605},"\n① Ingest\n",[2000,102559,102560],{"x":49839,"y":16982,"fill":2583,"style":2605},"\n② Transform\n",[2000,102562,102563],{"x":49852,"y":16982,"fill":2583,"style":2605},"\n③ Style\n",[2000,102565,102567],{"x":102535,"y":102566,"fill":2583,"style":2605},"208","\n④ Output\n",[18,102569,102571],{"id":102570},"step-1-aggregate-the-data","Step 1 — Aggregate the Data",[23,102573,102575],{"className":126,"code":102574,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\nINPUT = Path(\"data\u002Fsales.csv\")\n\ntry:\n    df = pd.read_csv(INPUT, parse_dates=[\"month\"])\nexcept FileNotFoundError:\n    raise SystemExit(f\"Missing: {INPUT}\")\n\ndf[\"revenue\"] = pd.to_numeric(df[\"revenue\"], errors=\"coerce\")\ndf[\"units\"] = pd.to_numeric(df[\"units\"], errors=\"coerce\")\ndf.dropna(subset=[\"revenue\", \"units\"], inplace=True)\n\n# Regional summary\nby_region = (\n    df.groupby(\"region\", as_index=False)\n    .agg(total_revenue=(\"revenue\", \"sum\"),\n         total_units=(\"units\", \"sum\"),\n         transactions=(\"revenue\", \"count\"))\n)\nby_region[\"avg_order\"] = by_region[\"total_revenue\"] \u002F by_region[\"transactions\"]\n\n# Monthly trend\nby_month = (\n    df.groupby(\"month\", as_index=False)\n    .agg(total_revenue=(\"revenue\", \"sum\"))\n    
.sort_values(\"month\")\n)\n\nprint(by_region)\nprint(by_month)\n",[30,102576,102577,102581,102591,102601,102605,102617,102621,102627,102650,102658,102677,102681,102705,102729,102756,102760,102764,102772,102789,102807,102823,102840,102844,102868,102872,102876,102884,102900,102918,102926,102930,102934,102940],{"__ignoreMap":28},[33,102578,102579],{"class":35,"line":36},[33,102580,8895],{"class":39},[33,102582,102583,102585,102587,102589],{"class":35,"line":43},[33,102584,190],{"class":163},[33,102586,193],{"class":167},[33,102588,164],{"class":163},[33,102590,198],{"class":167},[33,102592,102593,102595,102597,102599],{"class":35,"line":61},[33,102594,164],{"class":163},[33,102596,492],{"class":167},[33,102598,495],{"class":163},[33,102600,498],{"class":167},[33,102602,102603],{"class":35,"line":73},[33,102604,92],{"emptyLinePlaceholder":91},[33,102606,102607,102609,102611,102613,102615],{"class":35,"line":88},[33,102608,102346],{"class":50},[33,102610,212],{"class":163},[33,102612,215],{"class":167},[33,102614,59612],{"class":54},[33,102616,221],{"class":167},[33,102618,102619],{"class":35,"line":95},[33,102620,92],{"emptyLinePlaceholder":91},[33,102622,102623,102625],{"class":35,"line":101},[33,102624,35574],{"class":163},[33,102626,574],{"class":167},[33,102628,102629,102631,102633,102635,102637,102639,102642,102644,102646,102648],{"class":35,"line":171},[33,102630,4025],{"class":167},[33,102632,242],{"class":163},[33,102634,9481],{"class":167},[33,102636,102346],{"class":50},[33,102638,365],{"class":167},[33,102640,102641],{"class":238},"parse_dates",[33,102643,242],{"class":163},[33,102645,8309],{"class":167},[33,102647,96465],{"class":54},[33,102649,751],{"class":167},[33,102651,102652,102654,102656],{"class":35,"line":179},[33,102653,35726],{"class":163},[33,102655,2945],{"class":50},[33,102657,574],{"class":167},[33,102659,102660,102662,102664,102666,102668,102671,102673,102675],{"class":35,"line":187},[33,102661,35742],{"class":163},[33,102663,16617],{"class"
:50},[33,102665,602],{"class":167},[33,102667,4059],{"class":163},[33,102669,102670],{"class":54},"\"Missing: ",[33,102672,102430],{"class":50},[33,102674,274],{"class":54},[33,102676,221],{"class":167},[33,102678,102679],{"class":35,"line":201},[33,102680,92],{"emptyLinePlaceholder":91},[33,102682,102683,102685,102687,102689,102691,102693,102695,102697,102699,102701,102703],{"class":35,"line":206},[33,102684,11038],{"class":167},[33,102686,16465],{"class":54},[33,102688,763],{"class":167},[33,102690,242],{"class":163},[33,102692,27643],{"class":167},[33,102694,16465],{"class":54},[33,102696,8314],{"class":167},[33,102698,8317],{"class":238},[33,102700,242],{"class":163},[33,102702,12107],{"class":54},[33,102704,221],{"class":167},[33,102706,102707,102709,102711,102713,102715,102717,102719,102721,102723,102725,102727],{"class":35,"line":224},[33,102708,11038],{"class":167},[33,102710,16659],{"class":54},[33,102712,763],{"class":167},[33,102714,242],{"class":163},[33,102716,27643],{"class":167},[33,102718,16659],{"class":54},[33,102720,8314],{"class":167},[33,102722,8317],{"class":238},[33,102724,242],{"class":163},[33,102726,12107],{"class":54},[33,102728,221],{"class":167},[33,102730,102731,102734,102736,102738,102740,102742,102744,102746,102748,102750,102752,102754],{"class":35,"line":229},[33,102732,102733],{"class":167},"df.dropna(",[33,102735,28066],{"class":238},[33,102737,242],{"class":163},[33,102739,8309],{"class":167},[33,102741,16465],{"class":54},[33,102743,365],{"class":167},[33,102745,16659],{"class":54},[33,102747,8314],{"class":167},[33,102749,10891],{"class":238},[33,102751,242],{"class":163},[33,102753,855],{"class":50},[33,102755,221],{"class":167},[33,102757,102758],{"class":35,"line":235},[33,102759,92],{"emptyLinePlaceholder":91},[33,102761,102762],{"class":35,"line":250},[33,102763,96519],{"class":39},[33,102765,102766,102768,102770],{"class":35,"line":266},[33,102767,96524],{"class":167},[33,102769,242],{"class":163},[33,102771,1415],{"class"
:167},[33,102773,102774,102777,102779,102781,102783,102785,102787],{"class":35,"line":290},[33,102775,102776],{"class":167},"    df.groupby(",[33,102778,16649],{"class":54},[33,102780,365],{"class":167},[33,102782,96540],{"class":238},[33,102784,242],{"class":163},[33,102786,902],{"class":50},[33,102788,221],{"class":167},[33,102790,102791,102793,102795,102797,102799,102801,102803,102805],{"class":35,"line":295},[33,102792,96709],{"class":167},[33,102794,18407],{"class":238},[33,102796,242],{"class":163},[33,102798,602],{"class":167},[33,102800,16465],{"class":54},[33,102802,365],{"class":167},[33,102804,18522],{"class":54},[33,102806,1506],{"class":167},[33,102808,102809,102811,102813,102815,102817,102819,102821],{"class":35,"line":300},[33,102810,96728],{"class":238},[33,102812,242],{"class":163},[33,102814,602],{"class":167},[33,102816,16659],{"class":54},[33,102818,365],{"class":167},[33,102820,18522],{"class":54},[33,102822,1506],{"class":167},[33,102824,102825,102828,102830,102832,102834,102836,102838],{"class":35,"line":317},[33,102826,102827],{"class":238},"         
transactions",[33,102829,242],{"class":163},[33,102831,602],{"class":167},[33,102833,16465],{"class":54},[33,102835,365],{"class":167},[33,102837,96601],{"class":54},[33,102839,371],{"class":167},[33,102841,102842],{"class":35,"line":332},[33,102843,221],{"class":167},[33,102845,102846,102848,102850,102852,102854,102856,102858,102860,102862,102864,102866],{"class":35,"line":347},[33,102847,96634],{"class":167},[33,102849,96637],{"class":54},[33,102851,763],{"class":167},[33,102853,242],{"class":163},[33,102855,96657],{"class":167},[33,102857,96615],{"class":54},[33,102859,763],{"class":167},[33,102861,1351],{"class":163},[33,102863,96657],{"class":167},[33,102865,96660],{"class":54},[33,102867,9202],{"class":167},[33,102869,102870],{"class":35,"line":374},[33,102871,92],{"emptyLinePlaceholder":91},[33,102873,102874],{"class":35,"line":397},[33,102875,96679],{"class":39},[33,102877,102878,102880,102882],{"class":35,"line":653},[33,102879,96684],{"class":167},[33,102881,242],{"class":163},[33,102883,1415],{"class":167},[33,102885,102886,102888,102890,102892,102894,102896,102898],{"class":35,"line":667},[33,102887,102776],{"class":167},[33,102889,96465],{"class":54},[33,102891,365],{"class":167},[33,102893,96540],{"class":238},[33,102895,242],{"class":163},[33,102897,902],{"class":50},[33,102899,221],{"class":167},[33,102901,102902,102904,102906,102908,102910,102912,102914,102916],{"class":35,"line":675},[33,102903,96709],{"class":167},[33,102905,18407],{"class":238},[33,102907,242],{"class":163},[33,102909,602],{"class":167},[33,102911,16465],{"class":54},[33,102913,365],{"class":167},[33,102915,18522],{"class":54},[33,102917,371],{"class":167},[33,102919,102920,102922,102924],{"class":35,"line":689},[33,102921,96612],{"class":167},[33,102923,96465],{"class":54},[33,102925,221],{"class":167},[33,102927,102928],{"class":35,"line":703},[33,102929,221],{"class":167},[33,102931,102932],{"class":35,"line":714},[33,102933,92],{"emptyLinePlaceholder":91},[33,102935,102936,10
2938],{"class":35,"line":723},[33,102937,13474],{"class":50},[33,102939,96796],{"class":167},[33,102941,102942,102944],{"class":35,"line":754},[33,102943,13474],{"class":50},[33,102945,96803],{"class":167},[18,102947,102949],{"id":102948},"step-2-write-multiple-sheets-with-styled-headers","Step 2 — Write Multiple Sheets with Styled Headers",[14,102951,102952,102954,102955,2012,102957,102959,102960,102962,102963,102965,102966,102968],{},[30,102953,96826],{}," delegates to ",[30,102956,17066],{},[30,102958,22009],{}," as the engine. Use ",[30,102961,17066],{}," when creating a new file with heavy formatting; use ",[30,102964,22009],{}," when you need to modify an existing file (see ",[940,102967,99577],{"href":99576}," for that path).",[23,102970,102972],{"className":126,"code":102971,"language":47,"meta":28,"style":28},"# pip install pandas xlsxwriter\nfrom pathlib import Path\nimport pandas as pd\n\nOUTPUT = Path(\"output\u002Fsales_report.xlsx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\n# Re-use by_region and by_month from Step 1\ntry:\n    with pd.ExcelWriter(OUTPUT, engine=\"xlsxwriter\") as writer:\n        by_region.to_excel(writer, sheet_name=\"Summary\", index=False, startrow=1)\n        by_month.to_excel(writer, sheet_name=\"Monthly Trend\", index=False, startrow=1)\n\n        wb = writer.book\n\n        # --- define reusable formats ---\n        hdr_fmt = wb.add_format({\n            \"bold\": True, \"bg_color\": \"#2563eb\", \"font_color\": \"#ffffff\",\n            \"border\": 1, \"font_size\": 12,\n        })\n        currency_fmt = wb.add_format({\"num_format\": \"#,##0.00\", \"border\": 1})\n        int_fmt      = wb.add_format({\"num_format\": \"#,##0\",    \"border\": 1})\n        text_fmt     = wb.add_format({\"border\": 1})\n\n        for sheet_name, frame in [(\"Summary\", by_region), (\"Monthly Trend\", by_month)]:\n            ws = writer.sheets[sheet_name]\n            # Write styled header row (row 0, overwriting the blank 
startrow)\n            for col_idx, col_name in enumerate(frame.columns):\n                ws.write(1, col_idx, col_name, hdr_fmt)\n            ws.set_row(1, 20)\n\n        # Column widths for Summary sheet\n        ws_sum = writer.sheets[\"Summary\"]\n        ws_sum.set_column(\"A:A\", 14, text_fmt)\n        ws_sum.set_column(\"B:D\", 16, currency_fmt)\n\nexcept PermissionError:\n    raise SystemExit(f\"Close {OUTPUT} in Excel before running this script\")\nexcept Exception as exc:\n    raise SystemExit(f\"Export failed: {exc}\")\n\nprint(f\"Written: {OUTPUT}\")\n",[30,102973,102974,102979,102989,102999,103003,103016,103038,103042,103047,103053,103076,103106,103135,103139,103147,103151,103156,103166,103196,103216,103220,103250,103276,103293,103297,103317,103326,103331,103345,103355,103368,103372,103377,103391,103406,103420,103424,103432,103451,103461,103483,103487],{"__ignoreMap":28},[33,102975,102976],{"class":35,"line":36},[33,102977,102978],{"class":39},"# pip install pandas xlsxwriter\n",[33,102980,102981,102983,102985,102987],{"class":35,"line":43},[33,102982,190],{"class":163},[33,102984,193],{"class":167},[33,102986,164],{"class":163},[33,102988,198],{"class":167},[33,102990,102991,102993,102995,102997],{"class":35,"line":61},[33,102992,164],{"class":163},[33,102994,492],{"class":167},[33,102996,495],{"class":163},[33,102998,498],{"class":167},[33,103000,103001],{"class":35,"line":73},[33,103002,92],{"emptyLinePlaceholder":91},[33,103004,103005,103007,103009,103011,103014],{"class":35,"line":88},[33,103006,96935],{"class":50},[33,103008,212],{"class":163},[33,103010,215],{"class":167},[33,103012,103013],{"class":54},"\"output\u002Fsales_report.xlsx\"",[33,103015,221],{"class":167},[33,103017,103018,103020,103022,103024,103026,103028,103030,103032,103034,103036],{"class":35,"line":95},[33,103019,96935],{"class":50},[33,103021,866],{"class":167},[33,103023,869],{"class":238},[33,103025,242],{"class":163},[33,103027,855],{"class":50},[33,103029,365],{"class":16
7},[33,103031,878],{"class":238},[33,103033,242],{"class":163},[33,103035,855],{"class":50},[33,103037,221],{"class":167},[33,103039,103040],{"class":35,"line":101},[33,103041,92],{"emptyLinePlaceholder":91},[33,103043,103044],{"class":35,"line":171},[33,103045,103046],{"class":39},"# Re-use by_region and by_month from Step 1\n",[33,103048,103049,103051],{"class":35,"line":179},[33,103050,35574],{"class":163},[33,103052,574],{"class":167},[33,103054,103055,103057,103059,103061,103063,103065,103067,103070,103072,103074],{"class":35,"line":187},[33,103056,1635],{"class":163},[33,103058,96996],{"class":167},[33,103060,96935],{"class":50},[33,103062,365],{"class":167},[33,103064,17351],{"class":238},[33,103066,242],{"class":163},[33,103068,103069],{"class":54},"\"xlsxwriter\"",[33,103071,1649],{"class":167},[33,103073,495],{"class":163},[33,103075,17363],{"class":167},[33,103077,103078,103080,103082,103084,103087,103089,103091,103093,103095,103097,103100,103102,103104],{"class":35,"line":201},[33,103079,97017],{"class":167},[33,103081,17371],{"class":238},[33,103083,242],{"class":163},[33,103085,103086],{"class":54},"\"Summary\"",[33,103088,365],{"class":167},[33,103090,897],{"class":238},[33,103092,242],{"class":163},[33,103094,902],{"class":50},[33,103096,365],{"class":167},[33,103098,103099],{"class":238},"startrow",[33,103101,242],{"class":163},[33,103103,734],{"class":50},[33,103105,221],{"class":167},[33,103107,103108,103111,103113,103115,103117,103119,103121,103123,103125,103127,103129,103131,103133],{"class":35,"line":206},[33,103109,103110],{"class":167},"        by_month.to_excel(writer, 
",[33,103112,17371],{"class":238},[33,103114,242],{"class":163},[33,103116,97046],{"class":54},[33,103118,365],{"class":167},[33,103120,897],{"class":238},[33,103122,242],{"class":163},[33,103124,902],{"class":50},[33,103126,365],{"class":167},[33,103128,103099],{"class":238},[33,103130,242],{"class":163},[33,103132,734],{"class":50},[33,103134,221],{"class":167},[33,103136,103137],{"class":35,"line":224},[33,103138,92],{"emptyLinePlaceholder":91},[33,103140,103141,103143,103145],{"class":35,"line":229},[33,103142,97065],{"class":167},[33,103144,242],{"class":163},[33,103146,97070],{"class":167},[33,103148,103149],{"class":35,"line":235},[33,103150,92],{"emptyLinePlaceholder":91},[33,103152,103153],{"class":35,"line":250},[33,103154,103155],{"class":39},"        # --- define reusable formats ---\n",[33,103157,103158,103161,103163],{"class":35,"line":266},[33,103159,103160],{"class":167},"        hdr_fmt ",[33,103162,242],{"class":163},[33,103164,103165],{"class":167}," wb.add_format({\n",[33,103167,103168,103171,103173,103175,103177,103180,103182,103184,103186,103189,103191,103194],{"class":35,"line":290},[33,103169,103170],{"class":54},"            \"bold\"",[33,103172,2079],{"class":167},[33,103174,855],{"class":50},[33,103176,365],{"class":167},[33,103178,103179],{"class":54},"\"bg_color\"",[33,103181,2079],{"class":167},[33,103183,55362],{"class":54},[33,103185,365],{"class":167},[33,103187,103188],{"class":54},"\"font_color\"",[33,103190,2079],{"class":167},[33,103192,103193],{"class":54},"\"#ffffff\"",[33,103195,247],{"class":167},[33,103197,103198,103201,103203,103205,103207,103210,103212,103214],{"class":35,"line":295},[33,103199,103200],{"class":54},"            
\"border\"",[33,103202,2079],{"class":167},[33,103204,734],{"class":50},[33,103206,365],{"class":167},[33,103208,103209],{"class":54},"\"font_size\"",[33,103211,2079],{"class":167},[33,103213,55650],{"class":50},[33,103215,247],{"class":167},[33,103217,103218],{"class":35,"line":300},[33,103219,83823],{"class":167},[33,103221,103222,103225,103227,103230,103233,103235,103238,103240,103243,103245,103247],{"class":35,"line":317},[33,103223,103224],{"class":167},"        currency_fmt ",[33,103226,242],{"class":163},[33,103228,103229],{"class":167}," wb.add_format({",[33,103231,103232],{"class":54},"\"num_format\"",[33,103234,2079],{"class":167},[33,103236,103237],{"class":54},"\"#,##0.00\"",[33,103239,365],{"class":167},[33,103241,103242],{"class":54},"\"border\"",[33,103244,2079],{"class":167},[33,103246,734],{"class":50},[33,103248,103249],{"class":167},"})\n",[33,103251,103252,103255,103257,103259,103261,103263,103266,103268,103270,103272,103274],{"class":35,"line":332},[33,103253,103254],{"class":167},"        int_fmt      ",[33,103256,242],{"class":163},[33,103258,103229],{"class":167},[33,103260,103232],{"class":54},[33,103262,2079],{"class":167},[33,103264,103265],{"class":54},"\"#,##0\"",[33,103267,38342],{"class":167},[33,103269,103242],{"class":54},[33,103271,2079],{"class":167},[33,103273,734],{"class":50},[33,103275,103249],{"class":167},[33,103277,103278,103281,103283,103285,103287,103289,103291],{"class":35,"line":347},[33,103279,103280],{"class":167},"        text_fmt     
",[33,103282,242],{"class":163},[33,103284,103229],{"class":167},[33,103286,103242],{"class":54},[33,103288,2079],{"class":167},[33,103290,734],{"class":50},[33,103292,103249],{"class":167},[33,103294,103295],{"class":35,"line":374},[33,103296,92],{"emptyLinePlaceholder":91},[33,103298,103299,103301,103303,103305,103307,103309,103312,103314],{"class":35,"line":397},[33,103300,5973],{"class":163},[33,103302,97081],{"class":167},[33,103304,662],{"class":163},[33,103306,79351],{"class":167},[33,103308,103086],{"class":54},[33,103310,103311],{"class":167},", by_region), (",[33,103313,97046],{"class":54},[33,103315,103316],{"class":167},", by_month)]:\n",[33,103318,103319,103321,103323],{"class":35,"line":653},[33,103320,97105],{"class":167},[33,103322,242],{"class":163},[33,103324,103325],{"class":167}," writer.sheets[sheet_name]\n",[33,103327,103328],{"class":35,"line":667},[33,103329,103330],{"class":39},"            # Write styled header row (row 0, overwriting the blank startrow)\n",[33,103332,103333,103335,103338,103340,103342],{"class":35,"line":675},[33,103334,1793],{"class":163},[33,103336,103337],{"class":167}," col_idx, col_name ",[33,103339,662],{"class":163},[33,103341,7403],{"class":50},[33,103343,103344],{"class":167},"(frame.columns):\n",[33,103346,103347,103350,103352],{"class":35,"line":689},[33,103348,103349],{"class":167},"                ws.write(",[33,103351,734],{"class":50},[33,103353,103354],{"class":167},", col_idx, col_name, hdr_fmt)\n",[33,103356,103357,103360,103362,103364,103366],{"class":35,"line":703},[33,103358,103359],{"class":167},"            ws.set_row(",[33,103361,734],{"class":50},[33,103363,365],{"class":167},[33,103365,2587],{"class":50},[33,103367,221],{"class":167},[33,103369,103370],{"class":35,"line":714},[33,103371,92],{"emptyLinePlaceholder":91},[33,103373,103374],{"class":35,"line":723},[33,103375,103376],{"class":39},"        # Column widths for Summary 
sheet\n",[33,103378,103379,103382,103384,103387,103389],{"class":35,"line":754},[33,103380,103381],{"class":167},"        ws_sum ",[33,103383,242],{"class":163},[33,103385,103386],{"class":167}," writer.sheets[",[33,103388,103086],{"class":54},[33,103390,9202],{"class":167},[33,103392,103393,103396,103399,103401,103403],{"class":35,"line":771},[33,103394,103395],{"class":167},"        ws_sum.set_column(",[33,103397,103398],{"class":54},"\"A:A\"",[33,103400,365],{"class":167},[33,103402,19368],{"class":50},[33,103404,103405],{"class":167},", text_fmt)\n",[33,103407,103408,103410,103413,103415,103417],{"class":35,"line":777},[33,103409,103395],{"class":167},[33,103411,103412],{"class":54},"\"B:D\"",[33,103414,365],{"class":167},[33,103416,24213],{"class":50},[33,103418,103419],{"class":167},", currency_fmt)\n",[33,103421,103422],{"class":35,"line":788},[33,103423,92],{"emptyLinePlaceholder":91},[33,103425,103426,103428,103430],{"class":35,"line":804},[33,103427,35726],{"class":163},[33,103429,17393],{"class":50},[33,103431,574],{"class":167},[33,103433,103434,103436,103438,103440,103442,103444,103446,103449],{"class":35,"line":809},[33,103435,35742],{"class":163},[33,103437,16617],{"class":50},[33,103439,602],{"class":167},[33,103441,4059],{"class":163},[33,103443,97681],{"class":54},[33,103445,97684],{"class":50},[33,103447,103448],{"class":54}," in Excel before running this 
script\"",[33,103450,221],{"class":167},[33,103452,103453,103455,103457,103459],{"class":35,"line":819},[33,103454,35726],{"class":163},[33,103456,783],{"class":50},[33,103458,1852],{"class":163},[33,103460,1855],{"class":167},[33,103462,103463,103465,103467,103469,103471,103473,103475,103477,103479,103481],{"class":35,"line":829},[33,103464,35742],{"class":163},[33,103466,16617],{"class":50},[33,103468,602],{"class":167},[33,103470,4059],{"class":163},[33,103472,101705],{"class":54},[33,103474,1115],{"class":50},[33,103476,6565],{"class":167},[33,103478,1121],{"class":50},[33,103480,274],{"class":54},[33,103482,221],{"class":167},[33,103484,103485],{"class":35,"line":834},[33,103486,92],{"emptyLinePlaceholder":91},[33,103488,103489,103491,103493,103495,103497,103499,103501],{"class":35,"line":839},[33,103490,13474],{"class":50},[33,103492,602],{"class":167},[33,103494,4059],{"class":163},[33,103496,58214],{"class":54},[33,103498,97684],{"class":50},[33,103500,274],{"class":54},[33,103502,221],{"class":167},[18,103504,103506],{"id":103505},"step-3-number-formats-and-summary-rows","Step 3 — Number Formats and Summary Rows",[14,103508,103509],{},"A summary row at the bottom of the data gives stakeholders a quick total without opening a formula bar.",[23,103511,103513],{"className":126,"code":103512,"language":47,"meta":28,"style":28},"# pip install pandas xlsxwriter\nfrom pathlib import Path\nimport pandas as pd\n\nOUTPUT = Path(\"output\u002Fsales_report.xlsx\")\n\ntry:\n    with pd.ExcelWriter(OUTPUT, engine=\"xlsxwriter\") as writer:\n        by_region.to_excel(writer, sheet_name=\"Summary\", index=False, startrow=1)\n        wb = writer.book\n        ws = writer.sheets[\"Summary\"]\n\n        currency_fmt = wb.add_format({\"num_format\": \"#,##0.00\", \"bold\": False})\n        total_fmt    = wb.add_format({\"num_format\": \"#,##0.00\", \"bold\": True,\n                                      \"top\": 2, \"bg_color\": \"#f1f5f9\"})\n        label_fmt    = 
wb.add_format({\"bold\": True, \"bg_color\": \"#f1f5f9\"})\n\n        n_rows = len(by_region)\n        # data starts at Excel row 3 (0-indexed row 2) because startrow=1 + 1 header\n        data_start = 3\n        data_end   = data_start + n_rows - 1\n\n        # Apply number format to revenue and avg_order columns\n        for r in range(data_start, data_end + 1):\n            ws.write(r - 1, 1, by_region.iloc[r - data_start][\"total_revenue\"], currency_fmt)\n            ws.write(r - 1, 3, by_region.iloc[r - data_start][\"avg_order\"],     currency_fmt)\n\n        # Summary \u002F totals row\n        summary_row = data_end  # 0-indexed\n        ws.write(summary_row, 0, \"TOTAL\", label_fmt)\n        ws.write_formula(summary_row, 1,\n                         f\"=SUM(B{data_start}:B{data_end})\", total_fmt)\n        ws.write_formula(summary_row, 2,\n                         f\"=SUM(C{data_start}:C{data_end})\", total_fmt)\n        ws.write(summary_row, 3, \"\", total_fmt)\n\nexcept Exception as exc:\n    raise SystemExit(f\"Export failed: {exc}\")\n\nprint(f\"Written with summary row: 
{OUTPUT}\")\n",[30,103514,103515,103519,103529,103539,103543,103555,103559,103565,103587,103615,103623,103635,103639,103664,103689,103709,103734,103738,103749,103754,103764,103783,103787,103792,103811,103837,103860,103864,103869,103882,103897,103906,103936,103944,103970,103982,103986,103996,104018,104022],{"__ignoreMap":28},[33,103516,103517],{"class":35,"line":36},[33,103518,102978],{"class":39},[33,103520,103521,103523,103525,103527],{"class":35,"line":43},[33,103522,190],{"class":163},[33,103524,193],{"class":167},[33,103526,164],{"class":163},[33,103528,198],{"class":167},[33,103530,103531,103533,103535,103537],{"class":35,"line":61},[33,103532,164],{"class":163},[33,103534,492],{"class":167},[33,103536,495],{"class":163},[33,103538,498],{"class":167},[33,103540,103541],{"class":35,"line":73},[33,103542,92],{"emptyLinePlaceholder":91},[33,103544,103545,103547,103549,103551,103553],{"class":35,"line":88},[33,103546,96935],{"class":50},[33,103548,212],{"class":163},[33,103550,215],{"class":167},[33,103552,103013],{"class":54},[33,103554,221],{"class":167},[33,103556,103557],{"class":35,"line":95},[33,103558,92],{"emptyLinePlaceholder":91},[33,103560,103561,103563],{"class":35,"line":101},[33,103562,35574],{"class":163},[33,103564,574],{"class":167},[33,103566,103567,103569,103571,103573,103575,103577,103579,103581,103583,103585],{"class":35,"line":171},[33,103568,1635],{"class":163},[33,103570,96996],{"class":167},[33,103572,96935],{"class":50},[33,103574,365],{"class":167},[33,103576,17351],{"class":238},[33,103578,242],{"class":163},[33,103580,103069],{"class":54},[33,103582,1649],{"class":167},[33,103584,495],{"class":163},[33,103586,17363],{"class":167},[33,103588,103589,103591,103593,103595,103597,103599,103601,103603,103605,103607,103609,103611,103613],{"class":35,"line":179},[33,103590,97017],{"class":167},[33,103592,17371],{"class":238},[33,103594,242],{"class":163},[33,103596,103086],{"class":54},[33,103598,365],{"class":167},[33,103600,897],{"class":238}
,[33,103602,242],{"class":163},[33,103604,902],{"class":50},[33,103606,365],{"class":167},[33,103608,103099],{"class":238},[33,103610,242],{"class":163},[33,103612,734],{"class":50},[33,103614,221],{"class":167},[33,103616,103617,103619,103621],{"class":35,"line":187},[33,103618,97065],{"class":167},[33,103620,242],{"class":163},[33,103622,97070],{"class":167},[33,103624,103625,103627,103629,103631,103633],{"class":35,"line":201},[33,103626,101503],{"class":167},[33,103628,242],{"class":163},[33,103630,103386],{"class":167},[33,103632,103086],{"class":54},[33,103634,9202],{"class":167},[33,103636,103637],{"class":35,"line":206},[33,103638,92],{"emptyLinePlaceholder":91},[33,103640,103641,103643,103645,103647,103649,103651,103653,103655,103658,103660,103662],{"class":35,"line":224},[33,103642,103224],{"class":167},[33,103644,242],{"class":163},[33,103646,103229],{"class":167},[33,103648,103232],{"class":54},[33,103650,2079],{"class":167},[33,103652,103237],{"class":54},[33,103654,365],{"class":167},[33,103656,103657],{"class":54},"\"bold\"",[33,103659,2079],{"class":167},[33,103661,902],{"class":50},[33,103663,103249],{"class":167},[33,103665,103666,103669,103671,103673,103675,103677,103679,103681,103683,103685,103687],{"class":35,"line":229},[33,103667,103668],{"class":167},"        total_fmt    ",[33,103670,242],{"class":163},[33,103672,103229],{"class":167},[33,103674,103232],{"class":54},[33,103676,2079],{"class":167},[33,103678,103237],{"class":54},[33,103680,365],{"class":167},[33,103682,103657],{"class":54},[33,103684,2079],{"class":167},[33,103686,855],{"class":50},[33,103688,247],{"class":167},[33,103690,103691,103694,103696,103698,103700,103702,103704,103707],{"class":35,"line":235},[33,103692,103693],{"class":54},"                                      
\"top\"",[33,103695,2079],{"class":167},[33,103697,1533],{"class":50},[33,103699,365],{"class":167},[33,103701,103179],{"class":54},[33,103703,2079],{"class":167},[33,103705,103706],{"class":54},"\"#f1f5f9\"",[33,103708,103249],{"class":167},[33,103710,103711,103714,103716,103718,103720,103722,103724,103726,103728,103730,103732],{"class":35,"line":250},[33,103712,103713],{"class":167},"        label_fmt    ",[33,103715,242],{"class":163},[33,103717,103229],{"class":167},[33,103719,103657],{"class":54},[33,103721,2079],{"class":167},[33,103723,855],{"class":50},[33,103725,365],{"class":167},[33,103727,103179],{"class":54},[33,103729,2079],{"class":167},[33,103731,103706],{"class":54},[33,103733,103249],{"class":167},[33,103735,103736],{"class":35,"line":266},[33,103737,92],{"emptyLinePlaceholder":91},[33,103739,103740,103743,103745,103747],{"class":35,"line":290},[33,103741,103742],{"class":167},"        n_rows ",[33,103744,242],{"class":163},[33,103746,4037],{"class":50},[33,103748,96796],{"class":167},[33,103750,103751],{"class":35,"line":295},[33,103752,103753],{"class":39},"        # data starts at Excel row 3 (0-indexed row 2) because startrow=1 + 1 header\n",[33,103755,103756,103759,103761],{"class":35,"line":300},[33,103757,103758],{"class":167},"        data_start ",[33,103760,242],{"class":163},[33,103762,103763],{"class":50}," 3\n",[33,103765,103766,103769,103771,103774,103776,103779,103781],{"class":35,"line":317},[33,103767,103768],{"class":167},"        data_end   ",[33,103770,242],{"class":163},[33,103772,103773],{"class":167}," data_start ",[33,103775,1811],{"class":163},[33,103777,103778],{"class":167}," n_rows ",[33,103780,4126],{"class":163},[33,103782,17709],{"class":50},[33,103784,103785],{"class":35,"line":332},[33,103786,92],{"emptyLinePlaceholder":91},[33,103788,103789],{"class":35,"line":347},[33,103790,103791],{"class":39},"        # Apply number format to revenue and avg_order 
columns\n",[33,103793,103794,103796,103798,103800,103802,103805,103807,103809],{"class":35,"line":374},[33,103795,5973],{"class":163},[33,103797,45721],{"class":167},[33,103799,662],{"class":163},[33,103801,1801],{"class":50},[33,103803,103804],{"class":167},"(data_start, data_end ",[33,103806,1811],{"class":163},[33,103808,1814],{"class":50},[33,103810,1737],{"class":167},[33,103812,103813,103816,103818,103820,103822,103824,103827,103829,103832,103834],{"class":35,"line":397},[33,103814,103815],{"class":167},"            ws.write(r ",[33,103817,4126],{"class":163},[33,103819,1814],{"class":50},[33,103821,365],{"class":167},[33,103823,734],{"class":50},[33,103825,103826],{"class":167},", by_region.iloc[r ",[33,103828,4126],{"class":163},[33,103830,103831],{"class":167}," data_start][",[33,103833,96615],{"class":54},[33,103835,103836],{"class":167},"], currency_fmt)\n",[33,103838,103839,103841,103843,103845,103847,103849,103851,103853,103855,103857],{"class":35,"line":653},[33,103840,103815],{"class":167},[33,103842,4126],{"class":163},[33,103844,1814],{"class":50},[33,103846,365],{"class":167},[33,103848,10258],{"class":50},[33,103850,103826],{"class":167},[33,103852,4126],{"class":163},[33,103854,103831],{"class":167},[33,103856,96637],{"class":54},[33,103858,103859],{"class":167},"],     currency_fmt)\n",[33,103861,103862],{"class":35,"line":667},[33,103863,92],{"emptyLinePlaceholder":91},[33,103865,103866],{"class":35,"line":675},[33,103867,103868],{"class":39},"        # Summary \u002F totals row\n",[33,103870,103871,103874,103876,103879],{"class":35,"line":689},[33,103872,103873],{"class":167},"        summary_row ",[33,103875,242],{"class":163},[33,103877,103878],{"class":167}," data_end  ",[33,103880,103881],{"class":39},"# 0-indexed\n",[33,103883,103884,103887,103889,103891,103894],{"class":35,"line":703},[33,103885,103886],{"class":167},"        ws.write(summary_row, 
",[33,103888,748],{"class":50},[33,103890,365],{"class":167},[33,103892,103893],{"class":54},"\"TOTAL\"",[33,103895,103896],{"class":167},", label_fmt)\n",[33,103898,103899,103902,103904],{"class":35,"line":714},[33,103900,103901],{"class":167},"        ws.write_formula(summary_row, ",[33,103903,734],{"class":50},[33,103905,247],{"class":167},[33,103907,103908,103911,103914,103916,103919,103921,103924,103926,103929,103931,103933],{"class":35,"line":723},[33,103909,103910],{"class":163},"                         f",[33,103912,103913],{"class":54},"\"=SUM(B",[33,103915,1115],{"class":50},[33,103917,103918],{"class":167},"data_start",[33,103920,1121],{"class":50},[33,103922,103923],{"class":54},":B",[33,103925,1115],{"class":50},[33,103927,103928],{"class":167},"data_end",[33,103930,1121],{"class":50},[33,103932,72406],{"class":54},[33,103934,103935],{"class":167},", total_fmt)\n",[33,103937,103938,103940,103942],{"class":35,"line":754},[33,103939,103901],{"class":167},[33,103941,1533],{"class":50},[33,103943,247],{"class":167},[33,103945,103946,103948,103951,103953,103955,103957,103960,103962,103964,103966,103968],{"class":35,"line":771},[33,103947,103910],{"class":163},[33,103949,103950],{"class":54},"\"=SUM(C",[33,103952,1115],{"class":50},[33,103954,103918],{"class":167},[33,103956,1121],{"class":50},[33,103958,103959],{"class":54},":C",[33,103961,1115],{"class":50},[33,103963,103928],{"class":167},[33,103965,1121],{"class":50},[33,103967,72406],{"class":54},[33,103969,103935],{"class":167},[33,103971,103972,103974,103976,103978,103980],{"class":35,"line":777},[33,103973,103886],{"class":167},[33,103975,10258],{"class":50},[33,103977,365],{"class":167},[33,103979,3198],{"class":54},[33,103981,103935],{"class":167},[33,103983,103984],{"class":35,"line":788},[33,103985,92],{"emptyLinePlaceholder":91},[33,103987,103988,103990,103992,103994],{"class":35,"line":804},[33,103989,35726],{"class":163},[33,103991,783],{"class":50},[33,103993,1852],{"class":163},[33,103995,18
55],{"class":167},[33,103997,103998,104000,104002,104004,104006,104008,104010,104012,104014,104016],{"class":35,"line":809},[33,103999,35742],{"class":163},[33,104001,16617],{"class":50},[33,104003,602],{"class":167},[33,104005,4059],{"class":163},[33,104007,101705],{"class":54},[33,104009,1115],{"class":50},[33,104011,6565],{"class":167},[33,104013,1121],{"class":50},[33,104015,274],{"class":54},[33,104017,221],{"class":167},[33,104019,104020],{"class":35,"line":819},[33,104021,92],{"emptyLinePlaceholder":91},[33,104023,104024,104026,104028,104030,104033,104035,104037],{"class":35,"line":829},[33,104025,13474],{"class":50},[33,104027,602],{"class":167},[33,104029,4059],{"class":163},[33,104031,104032],{"class":54},"\"Written with summary row: ",[33,104034,97684],{"class":50},[33,104036,274],{"class":54},[33,104038,221],{"class":167},[18,104040,104042],{"id":104041},"step-4-conditional-formatting","Step 4 — Conditional Formatting",[14,104044,104045],{},"Highlight underperforming regions (revenue below a threshold) in red and top performers in green — without touching a single cell manually.",[23,104047,104049],{"className":126,"code":104048,"language":47,"meta":28,"style":28},"# pip install pandas xlsxwriter\nfrom pathlib import Path\nimport pandas as pd\n\nOUTPUT = Path(\"output\u002Fsales_report_cf.xlsx\")\n\ntry:\n    with pd.ExcelWriter(OUTPUT, engine=\"xlsxwriter\") as writer:\n        by_region.to_excel(writer, sheet_name=\"Summary\", index=False)\n        wb = writer.book\n        ws = writer.sheets[\"Summary\"]\n\n        green_fmt = wb.add_format({\"bg_color\": \"#C6EFCE\", \"font_color\": \"#006100\"})\n        red_fmt   = wb.add_format({\"bg_color\": \"#FFC7CE\", \"font_color\": \"#9C0006\"})\n\n        n = len(by_region)\n        rev_range = f\"B2:B{n + 1}\"   # revenue column, skip header\n\n        # Values above 10 000 → green\n        ws.conditional_format(rev_range, {\n            \"type\": \"cell\", \"criteria\": \">=\", \"value\": 10_000, 
\"format\": green_fmt\n        })\n        # Values below 5 000 → red\n        ws.conditional_format(rev_range, {\n            \"type\": \"cell\", \"criteria\": \"\u003C\", \"value\": 5_000, \"format\": red_fmt\n        })\n\n        # 3-color scale on units column\n        ws.conditional_format(f\"C2:C{n + 1}\", {\n            \"type\": \"3_color_scale\",\n            \"min_color\": \"#FFC7CE\",\n            \"mid_color\": \"#FFEB9C\",\n            \"max_color\": \"#C6EFCE\",\n        })\n\nexcept Exception as exc:\n    raise SystemExit(f\"Export failed: {exc}\")\n\nprint(f\"Written: {OUTPUT}\")\n",[30,104050,104051,104055,104065,104075,104079,104092,104096,104102,104124,104144,104152,104164,104168,104195,104222,104226,104237,104261,104265,104270,104275,104313,104317,104322,104326,104359,104363,104367,104372,104395,104406,104417,104429,104440,104444,104448,104458,104480,104484],{"__ignoreMap":28},[33,104052,104053],{"class":35,"line":36},[33,104054,102978],{"class":39},[33,104056,104057,104059,104061,104063],{"class":35,"line":43},[33,104058,190],{"class":163},[33,104060,193],{"class":167},[33,104062,164],{"class":163},[33,104064,198],{"class":167},[33,104066,104067,104069,104071,104073],{"class":35,"line":61},[33,104068,164],{"class":163},[33,104070,492],{"class":167},[33,104072,495],{"class":163},[33,104074,498],{"class":167},[33,104076,104077],{"class":35,"line":73},[33,104078,92],{"emptyLinePlaceholder":91},[33,104080,104081,104083,104085,104087,104090],{"class":35,"line":88},[33,104082,96935],{"class":50},[33,104084,212],{"class":163},[33,104086,215],{"class":167},[33,104088,104089],{"class":54},"\"output\u002Fsales_report_cf.xlsx\"",[33,104091,221],{"class":167},[33,104093,104094],{"class":35,"line":95},[33,104095,92],{"emptyLinePlaceholder":91},[33,104097,104098,104100],{"class":35,"line":101},[33,104099,35574],{"class":163},[33,104101,574],{"class":167},[33,104103,104104,104106,104108,104110,104112,104114,104116,104118,104120,104122],{"class":35,"line":171}
,[33,104105,1635],{"class":163},[33,104107,96996],{"class":167},[33,104109,96935],{"class":50},[33,104111,365],{"class":167},[33,104113,17351],{"class":238},[33,104115,242],{"class":163},[33,104117,103069],{"class":54},[33,104119,1649],{"class":167},[33,104121,495],{"class":163},[33,104123,17363],{"class":167},[33,104125,104126,104128,104130,104132,104134,104136,104138,104140,104142],{"class":35,"line":179},[33,104127,97017],{"class":167},[33,104129,17371],{"class":238},[33,104131,242],{"class":163},[33,104133,103086],{"class":54},[33,104135,365],{"class":167},[33,104137,897],{"class":238},[33,104139,242],{"class":163},[33,104141,902],{"class":50},[33,104143,221],{"class":167},[33,104145,104146,104148,104150],{"class":35,"line":187},[33,104147,97065],{"class":167},[33,104149,242],{"class":163},[33,104151,97070],{"class":167},[33,104153,104154,104156,104158,104160,104162],{"class":35,"line":201},[33,104155,101503],{"class":167},[33,104157,242],{"class":163},[33,104159,103386],{"class":167},[33,104161,103086],{"class":54},[33,104163,9202],{"class":167},[33,104165,104166],{"class":35,"line":206},[33,104167,92],{"emptyLinePlaceholder":91},[33,104169,104170,104173,104175,104177,104179,104181,104184,104186,104188,104190,104193],{"class":35,"line":224},[33,104171,104172],{"class":167},"        green_fmt ",[33,104174,242],{"class":163},[33,104176,103229],{"class":167},[33,104178,103179],{"class":54},[33,104180,2079],{"class":167},[33,104182,104183],{"class":54},"\"#C6EFCE\"",[33,104185,365],{"class":167},[33,104187,103188],{"class":54},[33,104189,2079],{"class":167},[33,104191,104192],{"class":54},"\"#006100\"",[33,104194,103249],{"class":167},[33,104196,104197,104200,104202,104204,104206,104208,104211,104213,104215,104217,104220],{"class":35,"line":229},[33,104198,104199],{"class":167},"        red_fmt   
",[33,104201,242],{"class":163},[33,104203,103229],{"class":167},[33,104205,103179],{"class":54},[33,104207,2079],{"class":167},[33,104209,104210],{"class":54},"\"#FFC7CE\"",[33,104212,365],{"class":167},[33,104214,103188],{"class":54},[33,104216,2079],{"class":167},[33,104218,104219],{"class":54},"\"#9C0006\"",[33,104221,103249],{"class":167},[33,104223,104224],{"class":35,"line":235},[33,104225,92],{"emptyLinePlaceholder":91},[33,104227,104228,104231,104233,104235],{"class":35,"line":250},[33,104229,104230],{"class":167},"        n ",[33,104232,242],{"class":163},[33,104234,4037],{"class":50},[33,104236,96796],{"class":167},[33,104238,104239,104242,104244,104246,104248,104250,104252,104254,104256,104258],{"class":35,"line":266},[33,104240,104241],{"class":167},"        rev_range ",[33,104243,242],{"class":163},[33,104245,1110],{"class":163},[33,104247,98615],{"class":54},[33,104249,1115],{"class":50},[33,104251,97403],{"class":167},[33,104253,1811],{"class":163},[33,104255,11022],{"class":50},[33,104257,274],{"class":54},[33,104259,104260],{"class":39},"   # revenue column, skip header\n",[33,104262,104263],{"class":35,"line":290},[33,104264,92],{"emptyLinePlaceholder":91},[33,104266,104267],{"class":35,"line":295},[33,104268,104269],{"class":39},"        # Values above 10 000 → green\n",[33,104271,104272],{"class":35,"line":300},[33,104273,104274],{"class":167},"        ws.conditional_format(rev_range, {\n",[33,104276,104277,104280,104282,104285,104287,104290,104292,104295,104297,104300,104302,104305,104307,104310],{"class":35,"line":317},[33,104278,104279],{"class":54},"            
\"type\"",[33,104281,2079],{"class":167},[33,104283,104284],{"class":54},"\"cell\"",[33,104286,365],{"class":167},[33,104288,104289],{"class":54},"\"criteria\"",[33,104291,2079],{"class":167},[33,104293,104294],{"class":54},"\">=\"",[33,104296,365],{"class":167},[33,104298,104299],{"class":54},"\"value\"",[33,104301,2079],{"class":167},[33,104303,104304],{"class":50},"10_000",[33,104306,365],{"class":167},[33,104308,104309],{"class":54},"\"format\"",[33,104311,104312],{"class":167},": green_fmt\n",[33,104314,104315],{"class":35,"line":332},[33,104316,83823],{"class":167},[33,104318,104319],{"class":35,"line":347},[33,104320,104321],{"class":39},"        # Values below 5 000 → red\n",[33,104323,104324],{"class":35,"line":374},[33,104325,104274],{"class":167},[33,104327,104328,104330,104332,104334,104336,104338,104340,104343,104345,104347,104349,104352,104354,104356],{"class":35,"line":397},[33,104329,104279],{"class":54},[33,104331,2079],{"class":167},[33,104333,104284],{"class":54},[33,104335,365],{"class":167},[33,104337,104289],{"class":54},[33,104339,2079],{"class":167},[33,104341,104342],{"class":54},"\"\u003C\"",[33,104344,365],{"class":167},[33,104346,104299],{"class":54},[33,104348,2079],{"class":167},[33,104350,104351],{"class":50},"5_000",[33,104353,365],{"class":167},[33,104355,104309],{"class":54},[33,104357,104358],{"class":167},": red_fmt\n",[33,104360,104361],{"class":35,"line":653},[33,104362,83823],{"class":167},[33,104364,104365],{"class":35,"line":667},[33,104366,92],{"emptyLinePlaceholder":91},[33,104368,104369],{"class":35,"line":675},[33,104370,104371],{"class":39},"        # 3-color scale on units column\n",[33,104373,104374,104377,104379,104382,104384,104386,104388,104390,104392],{"class":35,"line":689},[33,104375,104376],{"class":167},"        
ws.conditional_format(",[33,104378,4059],{"class":163},[33,104380,104381],{"class":54},"\"C2:C",[33,104383,1115],{"class":50},[33,104385,97403],{"class":167},[33,104387,1811],{"class":163},[33,104389,11022],{"class":50},[33,104391,274],{"class":54},[33,104393,104394],{"class":167},", {\n",[33,104396,104397,104399,104401,104404],{"class":35,"line":703},[33,104398,104279],{"class":54},[33,104400,2079],{"class":167},[33,104402,104403],{"class":54},"\"3_color_scale\"",[33,104405,247],{"class":167},[33,104407,104408,104411,104413,104415],{"class":35,"line":714},[33,104409,104410],{"class":54},"            \"min_color\"",[33,104412,2079],{"class":167},[33,104414,104210],{"class":54},[33,104416,247],{"class":167},[33,104418,104419,104422,104424,104427],{"class":35,"line":723},[33,104420,104421],{"class":54},"            \"mid_color\"",[33,104423,2079],{"class":167},[33,104425,104426],{"class":54},"\"#FFEB9C\"",[33,104428,247],{"class":167},[33,104430,104431,104434,104436,104438],{"class":35,"line":754},[33,104432,104433],{"class":54},"            
\"max_color\"",[33,104435,2079],{"class":167},[33,104437,104183],{"class":54},[33,104439,247],{"class":167},[33,104441,104442],{"class":35,"line":771},[33,104443,83823],{"class":167},[33,104445,104446],{"class":35,"line":777},[33,104447,92],{"emptyLinePlaceholder":91},[33,104449,104450,104452,104454,104456],{"class":35,"line":788},[33,104451,35726],{"class":163},[33,104453,783],{"class":50},[33,104455,1852],{"class":163},[33,104457,1855],{"class":167},[33,104459,104460,104462,104464,104466,104468,104470,104472,104474,104476,104478],{"class":35,"line":804},[33,104461,35742],{"class":163},[33,104463,16617],{"class":50},[33,104465,602],{"class":167},[33,104467,4059],{"class":163},[33,104469,101705],{"class":54},[33,104471,1115],{"class":50},[33,104473,6565],{"class":167},[33,104475,1121],{"class":50},[33,104477,274],{"class":54},[33,104479,221],{"class":167},[33,104481,104482],{"class":35,"line":809},[33,104483,92],{"emptyLinePlaceholder":91},[33,104485,104486,104488,104490,104492,104494,104496,104498],{"class":35,"line":819},[33,104487,13474],{"class":50},[33,104489,602],{"class":167},[33,104491,4059],{"class":163},[33,104493,58214],{"class":54},[33,104495,97684],{"class":50},[33,104497,274],{"class":54},[33,104499,221],{"class":167},[18,104501,104503],{"id":104502},"step-5-embed-a-bar-chart","Step 5 — Embed a Bar Chart",[14,104505,104506],{},"Charts embedded in the workbook are generated from the data already on the sheet — no separate image file required.",[23,104508,104510],{"className":126,"code":104509,"language":47,"meta":28,"style":28},"# pip install pandas xlsxwriter\nfrom pathlib import Path\nimport pandas as pd\n\nOUTPUT = Path(\"output\u002Fsales_report_chart.xlsx\")\n\ntry:\n    with pd.ExcelWriter(OUTPUT, engine=\"xlsxwriter\") as writer:\n        by_region.to_excel(writer, sheet_name=\"Summary\", index=False)\n        wb = writer.book\n        ws = writer.sheets[\"Summary\"]\n\n        n = len(by_region)\n\n        chart = wb.add_chart({\"type\": 
\"column\"})\n        chart.add_series({\n            \"name\":       \"Total Revenue\",\n            \"categories\": [\"Summary\", 1, 0, n, 0],   # region labels (col A)\n            \"values\":     [\"Summary\", 1, 1, n, 1],   # revenue values (col B)\n            \"fill\":       {\"color\": \"#2563eb\"},\n            \"gap\":        50,\n        })\n        chart.set_title({\"name\": \"Revenue by Region\"})\n        chart.set_x_axis({\"name\": \"Region\"})\n        chart.set_y_axis({\"name\": \"Revenue ($)\", \"num_format\": \"#,##0\"})\n        chart.set_legend({\"none\": True})\n        chart.set_size({\"width\": 480, \"height\": 288})\n\n        ws.insert_chart(\"F2\", chart)\n\nexcept Exception as exc:\n    raise SystemExit(f\"Export failed: {exc}\")\n\nprint(f\"Written: {OUTPUT}\")\n",[30,104511,104512,104516,104526,104536,104540,104553,104557,104563,104585,104605,104613,104625,104629,104639,104643,104663,104668,104681,104708,104735,104752,104764,104768,104783,104796,104818,104832,104854,104858,104869,104873,104883,104905,104909],{"__ignoreMap":28},[33,104513,104514],{"class":35,"line":36},[33,104515,102978],{"class":39},[33,104517,104518,104520,104522,104524],{"class":35,"line":43},[33,104519,190],{"class":163},[33,104521,193],{"class":167},[33,104523,164],{"class":163},[33,104525,198],{"class":167},[33,104527,104528,104530,104532,104534],{"class":35,"line":61},[33,104529,164],{"class":163},[33,104531,492],{"class":167},[33,104533,495],{"class":163},[33,104535,498],{"class":167},[33,104537,104538],{"class":35,"line":73},[33,104539,92],{"emptyLinePlaceholder":91},[33,104541,104542,104544,104546,104548,104551],{"class":35,"line":88},[33,104543,96935],{"class":50},[33,104545,212],{"class":163},[33,104547,215],{"class":167},[33,104549,104550],{"class":54},"\"output\u002Fsales_report_chart.xlsx\"",[33,104552,221],{"class":167},[33,104554,104555],{"class":35,"line":95},[33,104556,92],{"emptyLinePlaceholder":91},[33,104558,104559,104561],{"class":35,"line":101},[3
3,104560,35574],{"class":163},[33,104562,574],{"class":167},[33,104564,104565,104567,104569,104571,104573,104575,104577,104579,104581,104583],{"class":35,"line":171},[33,104566,1635],{"class":163},[33,104568,96996],{"class":167},[33,104570,96935],{"class":50},[33,104572,365],{"class":167},[33,104574,17351],{"class":238},[33,104576,242],{"class":163},[33,104578,103069],{"class":54},[33,104580,1649],{"class":167},[33,104582,495],{"class":163},[33,104584,17363],{"class":167},[33,104586,104587,104589,104591,104593,104595,104597,104599,104601,104603],{"class":35,"line":179},[33,104588,97017],{"class":167},[33,104590,17371],{"class":238},[33,104592,242],{"class":163},[33,104594,103086],{"class":54},[33,104596,365],{"class":167},[33,104598,897],{"class":238},[33,104600,242],{"class":163},[33,104602,902],{"class":50},[33,104604,221],{"class":167},[33,104606,104607,104609,104611],{"class":35,"line":187},[33,104608,97065],{"class":167},[33,104610,242],{"class":163},[33,104612,97070],{"class":167},[33,104614,104615,104617,104619,104621,104623],{"class":35,"line":201},[33,104616,101503],{"class":167},[33,104618,242],{"class":163},[33,104620,103386],{"class":167},[33,104622,103086],{"class":54},[33,104624,9202],{"class":167},[33,104626,104627],{"class":35,"line":206},[33,104628,92],{"emptyLinePlaceholder":91},[33,104630,104631,104633,104635,104637],{"class":35,"line":224},[33,104632,104230],{"class":167},[33,104634,242],{"class":163},[33,104636,4037],{"class":50},[33,104638,96796],{"class":167},[33,104640,104641],{"class":35,"line":229},[33,104642,92],{"emptyLinePlaceholder":91},[33,104644,104645,104648,104650,104653,104656,104658,104661],{"class":35,"line":235},[33,104646,104647],{"class":167},"        chart ",[33,104649,242],{"class":163},[33,104651,104652],{"class":167}," 
wb.add_chart({",[33,104654,104655],{"class":54},"\"type\"",[33,104657,2079],{"class":167},[33,104659,104660],{"class":54},"\"column\"",[33,104662,103249],{"class":167},[33,104664,104665],{"class":35,"line":250},[33,104666,104667],{"class":167},"        chart.add_series({\n",[33,104669,104670,104673,104676,104679],{"class":35,"line":266},[33,104671,104672],{"class":54},"            \"name\"",[33,104674,104675],{"class":167},":       ",[33,104677,104678],{"class":54},"\"Total Revenue\"",[33,104680,247],{"class":167},[33,104682,104683,104686,104688,104690,104692,104694,104696,104698,104701,104703,104705],{"class":35,"line":290},[33,104684,104685],{"class":54},"            \"categories\"",[33,104687,12426],{"class":167},[33,104689,103086],{"class":54},[33,104691,365],{"class":167},[33,104693,734],{"class":50},[33,104695,365],{"class":167},[33,104697,748],{"class":50},[33,104699,104700],{"class":167},", n, ",[33,104702,748],{"class":50},[33,104704,13424],{"class":167},[33,104706,104707],{"class":39},"# region labels (col A)\n",[33,104709,104710,104713,104716,104718,104720,104722,104724,104726,104728,104730,104732],{"class":35,"line":295},[33,104711,104712],{"class":54},"            \"values\"",[33,104714,104715],{"class":167},":     [",[33,104717,103086],{"class":54},[33,104719,365],{"class":167},[33,104721,734],{"class":50},[33,104723,365],{"class":167},[33,104725,734],{"class":50},[33,104727,104700],{"class":167},[33,104729,734],{"class":50},[33,104731,13424],{"class":167},[33,104733,104734],{"class":39},"# revenue values (col B)\n",[33,104736,104737,104740,104743,104746,104748,104750],{"class":35,"line":300},[33,104738,104739],{"class":54},"            \"fill\"",[33,104741,104742],{"class":167},":       {",[33,104744,104745],{"class":54},"\"color\"",[33,104747,2079],{"class":167},[33,104749,55362],{"class":54},[33,104751,3509],{"class":167},[33,104753,104754,104757,104760,104762],{"class":35,"line":317},[33,104755,104756],{"class":54},"            
\"gap\"",[33,104758,104759],{"class":167},":        ",[33,104761,2680],{"class":50},[33,104763,247],{"class":167},[33,104765,104766],{"class":35,"line":332},[33,104767,83823],{"class":167},[33,104769,104770,104773,104776,104778,104781],{"class":35,"line":347},[33,104771,104772],{"class":167},"        chart.set_title({",[33,104774,104775],{"class":54},"\"name\"",[33,104777,2079],{"class":167},[33,104779,104780],{"class":54},"\"Revenue by Region\"",[33,104782,103249],{"class":167},[33,104784,104785,104788,104790,104792,104794],{"class":35,"line":374},[33,104786,104787],{"class":167},"        chart.set_x_axis({",[33,104789,104775],{"class":54},[33,104791,2079],{"class":167},[33,104793,11865],{"class":54},[33,104795,103249],{"class":167},[33,104797,104798,104801,104803,104805,104808,104810,104812,104814,104816],{"class":35,"line":397},[33,104799,104800],{"class":167},"        chart.set_y_axis({",[33,104802,104775],{"class":54},[33,104804,2079],{"class":167},[33,104806,104807],{"class":54},"\"Revenue ($)\"",[33,104809,365],{"class":167},[33,104811,103232],{"class":54},[33,104813,2079],{"class":167},[33,104815,103265],{"class":54},[33,104817,103249],{"class":167},[33,104819,104820,104823,104826,104828,104830],{"class":35,"line":653},[33,104821,104822],{"class":167},"        chart.set_legend({",[33,104824,104825],{"class":54},"\"none\"",[33,104827,2079],{"class":167},[33,104829,855],{"class":50},[33,104831,103249],{"class":167},[33,104833,104834,104837,104839,104841,104843,104845,104847,104849,104852],{"class":35,"line":667},[33,104835,104836],{"class":167},"        
chart.set_size({",[33,104838,83804],{"class":54},[33,104840,2079],{"class":167},[33,104842,49852],{"class":50},[33,104844,365],{"class":167},[33,104846,83816],{"class":54},[33,104848,2079],{"class":167},[33,104850,104851],{"class":50},"288",[33,104853,103249],{"class":167},[33,104855,104856],{"class":35,"line":675},[33,104857,92],{"emptyLinePlaceholder":91},[33,104859,104860,104863,104866],{"class":35,"line":689},[33,104861,104862],{"class":167},"        ws.insert_chart(",[33,104864,104865],{"class":54},"\"F2\"",[33,104867,104868],{"class":167},", chart)\n",[33,104870,104871],{"class":35,"line":703},[33,104872,92],{"emptyLinePlaceholder":91},[33,104874,104875,104877,104879,104881],{"class":35,"line":714},[33,104876,35726],{"class":163},[33,104878,783],{"class":50},[33,104880,1852],{"class":163},[33,104882,1855],{"class":167},[33,104884,104885,104887,104889,104891,104893,104895,104897,104899,104901,104903],{"class":35,"line":723},[33,104886,35742],{"class":163},[33,104888,16617],{"class":50},[33,104890,602],{"class":167},[33,104892,4059],{"class":163},[33,104894,101705],{"class":54},[33,104896,1115],{"class":50},[33,104898,6565],{"class":167},[33,104900,1121],{"class":50},[33,104902,274],{"class":54},[33,104904,221],{"class":167},[33,104906,104907],{"class":35,"line":754},[33,104908,92],{"emptyLinePlaceholder":91},[33,104910,104911,104913,104915,104917,104919,104921,104923],{"class":35,"line":771},[33,104912,13474],{"class":50},[33,104914,602],{"class":167},[33,104916,4059],{"class":163},[33,104918,58214],{"class":54},[33,104920,97684],{"class":50},[33,104922,274],{"class":54},[33,104924,221],{"class":167},[18,104926,2709],{"id":2708},[424,104928,104930],{"id":104929},"modifying-an-existing-template-with-openpyxl","Modifying an Existing Template with openpyxl",[14,104932,104933,104934,104936,104937,104939,104940,104942],{},"When you need to inject data into a pre-branded template, ",[30,104935,22009],{}," is the right tool. 
Note that opening in ",[30,104938,98285],{}," mode prevents writes — see ",[940,104941,97863],{"href":97862}," if you hit that error.",[23,104944,104946],{"className":126,"code":104945,"language":47,"meta":28,"style":28},"# pip install openpyxl pandas\nfrom pathlib import Path\nimport pandas as pd\nfrom openpyxl import load_workbook\nfrom openpyxl.utils.dataframe import dataframe_to_rows\nfrom openpyxl.styles import Font, PatternFill, numbers\n\nTEMPLATE = Path(\"templates\u002Freport_template.xlsx\")\nOUTPUT   = Path(\"output\u002Freport_filled.xlsx\")\n\ntry:\n    wb = load_workbook(TEMPLATE)           # NOT read_only — we need to write\n    ws = wb[\"Data\"]\n\n    # Clear old data below header\n    for row in ws.iter_rows(min_row=2, max_row=ws.max_row):\n        for cell in row:\n            cell.value = None\n\n    # Write new data\n    for r_idx, row in enumerate(dataframe_to_rows(by_region, index=False, header=False), start=2):\n        for c_idx, value in enumerate(row, start=1):\n            ws.cell(row=r_idx, column=c_idx, value=value)\n\n    # Apply currency format to column B\n    for cell in ws[\"B\"][1:]:\n        cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED2\n\n    wb.save(OUTPUT)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"Template not found: {exc}\")\n\nprint(f\"Template filled: 
{OUTPUT}\")\n",[30,104947,104948,104952,104962,104972,104982,104992,105003,105007,105020,105033,105037,105043,105058,105070,105074,105079,105103,105113,105121,105125,105130,105168,105190,105213,105217,105222,105241,105254,105258,105266,105276,105298,105302],{"__ignoreMap":28},[33,104949,104950],{"class":35,"line":36},[33,104951,97874],{"class":39},[33,104953,104954,104956,104958,104960],{"class":35,"line":43},[33,104955,190],{"class":163},[33,104957,193],{"class":167},[33,104959,164],{"class":163},[33,104961,198],{"class":167},[33,104963,104964,104966,104968,104970],{"class":35,"line":61},[33,104965,164],{"class":163},[33,104967,492],{"class":167},[33,104969,495],{"class":163},[33,104971,498],{"class":167},[33,104973,104974,104976,104978,104980],{"class":35,"line":73},[33,104975,190],{"class":163},[33,104977,17103],{"class":167},[33,104979,164],{"class":163},[33,104981,17108],{"class":167},[33,104983,104984,104986,104988,104990],{"class":35,"line":88},[33,104985,190],{"class":163},[33,104987,97901],{"class":167},[33,104989,164],{"class":163},[33,104991,97906],{"class":167},[33,104993,104994,104996,104998,105000],{"class":35,"line":95},[33,104995,190],{"class":163},[33,104997,17115],{"class":167},[33,104999,164],{"class":163},[33,105001,105002],{"class":167}," Font, PatternFill, 
numbers\n",[33,105004,105005],{"class":35,"line":101},[33,105006,92],{"emptyLinePlaceholder":91},[33,105008,105009,105011,105013,105015,105018],{"class":35,"line":171},[33,105010,97915],{"class":50},[33,105012,212],{"class":163},[33,105014,215],{"class":167},[33,105016,105017],{"class":54},"\"templates\u002Freport_template.xlsx\"",[33,105019,221],{"class":167},[33,105021,105022,105024,105026,105028,105031],{"class":35,"line":179},[33,105023,96935],{"class":50},[33,105025,21012],{"class":163},[33,105027,215],{"class":167},[33,105029,105030],{"class":54},"\"output\u002Freport_filled.xlsx\"",[33,105032,221],{"class":167},[33,105034,105035],{"class":35,"line":187},[33,105036,92],{"emptyLinePlaceholder":91},[33,105038,105039,105041],{"class":35,"line":201},[33,105040,35574],{"class":163},[33,105042,574],{"class":167},[33,105044,105045,105047,105049,105051,105053,105055],{"class":35,"line":206},[33,105046,17432],{"class":167},[33,105048,242],{"class":163},[33,105050,97956],{"class":167},[33,105052,97915],{"class":50},[33,105054,28335],{"class":167},[33,105056,105057],{"class":39},"# NOT read_only — we need to write\n",[33,105059,105060,105062,105064,105066,105068],{"class":35,"line":224},[33,105061,17442],{"class":167},[33,105063,242],{"class":163},[33,105065,17447],{"class":167},[33,105067,17376],{"class":54},[33,105069,9202],{"class":167},[33,105071,105072],{"class":35,"line":229},[33,105073,92],{"emptyLinePlaceholder":91},[33,105075,105076],{"class":35,"line":235},[33,105077,105078],{"class":39},"    # Clear old data below 
header\n",[33,105080,105081,105083,105085,105087,105089,105091,105093,105095,105097,105099,105101],{"class":35,"line":250},[33,105082,656],{"class":163},[33,105084,3844],{"class":167},[33,105086,662],{"class":163},[33,105088,17639],{"class":167},[33,105090,17642],{"class":238},[33,105092,242],{"class":163},[33,105094,1533],{"class":50},[33,105096,365],{"class":167},[33,105098,97398],{"class":238},[33,105100,242],{"class":163},[33,105102,98009],{"class":167},[33,105104,105105,105107,105109,105111],{"class":35,"line":266},[33,105106,5973],{"class":163},[33,105108,17467],{"class":167},[33,105110,662],{"class":163},[33,105112,17675],{"class":167},[33,105114,105115,105117,105119],{"class":35,"line":290},[33,105116,17807],{"class":167},[33,105118,242],{"class":163},[33,105120,3852],{"class":50},[33,105122,105123],{"class":35,"line":295},[33,105124,92],{"emptyLinePlaceholder":91},[33,105126,105127],{"class":35,"line":300},[33,105128,105129],{"class":39},"    # Write new data\n",[33,105131,105132,105134,105137,105139,105141,105144,105146,105148,105150,105152,105154,105156,105158,105160,105162,105164,105166],{"class":35,"line":317},[33,105133,656],{"class":163},[33,105135,105136],{"class":167}," r_idx, row ",[33,105138,662],{"class":163},[33,105140,7403],{"class":50},[33,105142,105143],{"class":167},"(dataframe_to_rows(by_region, ",[33,105145,897],{"class":238},[33,105147,242],{"class":163},[33,105149,902],{"class":50},[33,105151,365],{"class":167},[33,105153,44427],{"class":238},[33,105155,242],{"class":163},[33,105157,902],{"class":50},[33,105159,18525],{"class":167},[33,105161,7409],{"class":238},[33,105163,242],{"class":163},[33,105165,1533],{"class":50},[33,105167,1737],{"class":167},[33,105169,105170,105172,105175,105177,105179,105182,105184,105186,105188],{"class":35,"line":332},[33,105171,5973],{"class":163},[33,105173,105174],{"class":167}," c_idx, value ",[33,105176,662],{"class":163},[33,105178,7403],{"class":50},[33,105180,105181],{"class":167},"(row, 
",[33,105183,7409],{"class":238},[33,105185,242],{"class":163},[33,105187,734],{"class":50},[33,105189,1737],{"class":167},[33,105191,105192,105194,105196,105198,105200,105202,105204,105206,105208,105210],{"class":35,"line":347},[33,105193,98104],{"class":167},[33,105195,98107],{"class":238},[33,105197,242],{"class":163},[33,105199,98112],{"class":167},[33,105201,98115],{"class":238},[33,105203,242],{"class":163},[33,105205,98120],{"class":167},[33,105207,67110],{"class":238},[33,105209,242],{"class":163},[33,105211,105212],{"class":167},"value)\n",[33,105214,105215],{"class":35,"line":374},[33,105216,92],{"emptyLinePlaceholder":91},[33,105218,105219],{"class":35,"line":397},[33,105220,105221],{"class":39},"    # Apply currency format to column B\n",[33,105223,105224,105226,105228,105230,105232,105235,105237,105239],{"class":35,"line":653},[33,105225,656],{"class":163},[33,105227,17467],{"class":167},[33,105229,662],{"class":163},[33,105231,17472],{"class":167},[33,105233,105234],{"class":54},"\"B\"",[33,105236,44179],{"class":167},[33,105238,734],{"class":50},[33,105240,43533],{"class":167},[33,105242,105243,105246,105248,105251],{"class":35,"line":667},[33,105244,105245],{"class":167},"        cell.number_format ",[33,105247,242],{"class":163},[33,105249,105250],{"class":167}," 
numbers.",[33,105252,105253],{"class":50},"FORMAT_NUMBER_COMMA_SEPARATED2\n",[33,105255,105256],{"class":35,"line":675},[33,105257,92],{"emptyLinePlaceholder":91},[33,105259,105260,105262,105264],{"class":35,"line":689},[33,105261,98136],{"class":167},[33,105263,96935],{"class":50},[33,105265,221],{"class":167},[33,105267,105268,105270,105272,105274],{"class":35,"line":703},[33,105269,35726],{"class":163},[33,105271,2945],{"class":50},[33,105273,1852],{"class":163},[33,105275,1855],{"class":167},[33,105277,105278,105280,105282,105284,105286,105288,105290,105292,105294,105296],{"class":35,"line":714},[33,105279,35742],{"class":163},[33,105281,16617],{"class":50},[33,105283,602],{"class":167},[33,105285,4059],{"class":163},[33,105287,20538],{"class":54},[33,105289,1115],{"class":50},[33,105291,6565],{"class":167},[33,105293,1121],{"class":50},[33,105295,274],{"class":54},[33,105297,221],{"class":167},[33,105299,105300],{"class":35,"line":723},[33,105301,92],{"emptyLinePlaceholder":91},[33,105303,105304,105306,105308,105310,105312,105314,105316],{"class":35,"line":754},[33,105305,13474],{"class":50},[33,105307,602],{"class":167},[33,105309,4059],{"class":163},[33,105311,98188],{"class":54},[33,105313,97684],{"class":50},[33,105315,274],{"class":54},[33,105317,221],{"class":167},[424,105319,105321],{"id":105320},"writing-excel-formulas-and-charts-directly","Writing Excel Formulas and Charts Directly",[14,105323,105324,105325,102135],{},"For advanced formula patterns (array formulas, named ranges, dynamic arrays), see the ",[940,105326,102074],{"href":102073},[424,105328,105330,105331],{"id":105329},"memory-efficient-writes-with-use_constant_memory","Memory-Efficient Writes with ",[30,105332,105333],{},"use_constant_memory",[14,105335,105336,105337,105339],{},"For DataFrames with more than 500 000 rows, ",[30,105338,17066],{},"'s constant-memory mode streams rows without buffering the whole sheet in RAM. 
The trade-off: you cannot go back and set cell formats after writing.",[23,105341,105343],{"className":126,"code":105342,"language":47,"meta":28,"style":28},"# pip install pandas xlsxwriter\nfrom pathlib import Path\nimport pandas as pd\n\nOUTPUT = Path(\"output\u002Flarge_report.xlsx\")\n\noptions = {\"constant_memory\": True}\n\ntry:\n    with pd.ExcelWriter(OUTPUT, engine=\"xlsxwriter\",\n                        engine_kwargs={\"options\": options}) as writer:\n        # Write in chunks if the DataFrame is chunked from a database\n        big_df.to_excel(writer, sheet_name=\"Data\", index=False)\nexcept Exception as exc:\n    raise SystemExit(f\"Export failed: {exc}\")\n",[30,105344,105345,105349,105359,105369,105373,105386,105390,105408,105412,105418,105436,105455,105460,105481,105491],{"__ignoreMap":28},[33,105346,105347],{"class":35,"line":36},[33,105348,102978],{"class":39},[33,105350,105351,105353,105355,105357],{"class":35,"line":43},[33,105352,190],{"class":163},[33,105354,193],{"class":167},[33,105356,164],{"class":163},[33,105358,198],{"class":167},[33,105360,105361,105363,105365,105367],{"class":35,"line":61},[33,105362,164],{"class":163},[33,105364,492],{"class":167},[33,105366,495],{"class":163},[33,105368,498],{"class":167},[33,105370,105371],{"class":35,"line":73},[33,105372,92],{"emptyLinePlaceholder":91},[33,105374,105375,105377,105379,105381,105384],{"class":35,"line":88},[33,105376,96935],{"class":50},[33,105378,212],{"class":163},[33,105380,215],{"class":167},[33,105382,105383],{"class":54},"\"output\u002Flarge_report.xlsx\"",[33,105385,221],{"class":167},[33,105387,105388],{"class":35,"line":95},[33,105389,92],{"emptyLinePlaceholder":91},[33,105391,105392,105395,105397,105399,105402,105404,105406],{"class":35,"line":101},[33,105393,105394],{"class":167},"options 
",[33,105396,242],{"class":163},[33,105398,4098],{"class":167},[33,105400,105401],{"class":54},"\"constant_memory\"",[33,105403,2079],{"class":167},[33,105405,855],{"class":50},[33,105407,4113],{"class":167},[33,105409,105410],{"class":35,"line":171},[33,105411,92],{"emptyLinePlaceholder":91},[33,105413,105414,105416],{"class":35,"line":179},[33,105415,35574],{"class":163},[33,105417,574],{"class":167},[33,105419,105420,105422,105424,105426,105428,105430,105432,105434],{"class":35,"line":187},[33,105421,1635],{"class":163},[33,105423,96996],{"class":167},[33,105425,96935],{"class":50},[33,105427,365],{"class":167},[33,105429,17351],{"class":238},[33,105431,242],{"class":163},[33,105433,103069],{"class":54},[33,105435,247],{"class":167},[33,105437,105438,105441,105443,105445,105448,105451,105453],{"class":35,"line":201},[33,105439,105440],{"class":238},"                        engine_kwargs",[33,105442,242],{"class":163},[33,105444,1115],{"class":167},[33,105446,105447],{"class":54},"\"options\"",[33,105449,105450],{"class":167},": options}) ",[33,105452,495],{"class":163},[33,105454,17363],{"class":167},[33,105456,105457],{"class":35,"line":206},[33,105458,105459],{"class":39},"        # Write in chunks if the DataFrame is chunked from a database\n",[33,105461,105462,105465,105467,105469,105471,105473,105475,105477,105479],{"class":35,"line":224},[33,105463,105464],{"class":167},"        big_df.to_excel(writer, 
",[33,105466,17371],{"class":238},[33,105468,242],{"class":163},[33,105470,17376],{"class":54},[33,105472,365],{"class":167},[33,105474,897],{"class":238},[33,105476,242],{"class":163},[33,105478,902],{"class":50},[33,105480,221],{"class":167},[33,105482,105483,105485,105487,105489],{"class":35,"line":229},[33,105484,35726],{"class":163},[33,105486,783],{"class":50},[33,105488,1852],{"class":163},[33,105490,1855],{"class":167},[33,105492,105493,105495,105497,105499,105501,105503,105505,105507,105509,105511],{"class":35,"line":235},[33,105494,35742],{"class":163},[33,105496,16617],{"class":50},[33,105498,602],{"class":167},[33,105500,4059],{"class":163},[33,105502,101705],{"class":54},[33,105504,1115],{"class":50},[33,105506,6565],{"class":167},[33,105508,1121],{"class":50},[33,105510,274],{"class":54},[33,105512,221],{"class":167},[18,105514,52030],{"id":52029},[14,105516,105517],{},"Assert the output is structurally correct before treating the run as successful:",[23,105519,105521],{"className":126,"code":105520,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nfrom openpyxl import load_workbook\n\nOUTPUT = Path(\"output\u002Fsales_report.xlsx\")\n\nwb = load_workbook(OUTPUT, read_only=True)\nassert \"Summary\" in wb.sheetnames, \"Summary sheet missing\"\nws = wb[\"Summary\"]\nrows = list(ws.iter_rows(values_only=True))\nassert len(rows) >= 2, \"No data rows written\"\nassert rows[0][1] is not None, \"Column B header missing\"\nwb.close()\nprint(\"Validation 
passed\")\n",[30,105522,105523,105527,105537,105547,105551,105563,105567,105587,105601,105613,105631,105648,105673,105677],{"__ignoreMap":28},[33,105524,105525],{"class":35,"line":36},[33,105526,98209],{"class":39},[33,105528,105529,105531,105533,105535],{"class":35,"line":43},[33,105530,190],{"class":163},[33,105532,193],{"class":167},[33,105534,164],{"class":163},[33,105536,198],{"class":167},[33,105538,105539,105541,105543,105545],{"class":35,"line":61},[33,105540,190],{"class":163},[33,105542,17103],{"class":167},[33,105544,164],{"class":163},[33,105546,17108],{"class":167},[33,105548,105549],{"class":35,"line":73},[33,105550,92],{"emptyLinePlaceholder":91},[33,105552,105553,105555,105557,105559,105561],{"class":35,"line":88},[33,105554,96935],{"class":50},[33,105556,212],{"class":163},[33,105558,215],{"class":167},[33,105560,103013],{"class":54},[33,105562,221],{"class":167},[33,105564,105565],{"class":35,"line":95},[33,105566,92],{"emptyLinePlaceholder":91},[33,105568,105569,105571,105573,105575,105577,105579,105581,105583,105585],{"class":35,"line":101},[33,105570,98274],{"class":167},[33,105572,242],{"class":163},[33,105574,97956],{"class":167},[33,105576,96935],{"class":50},[33,105578,365],{"class":167},[33,105580,98285],{"class":238},[33,105582,242],{"class":163},[33,105584,855],{"class":50},[33,105586,221],{"class":167},[33,105588,105589,105591,105594,105596,105598],{"class":35,"line":171},[33,105590,36397],{"class":163},[33,105592,105593],{"class":54}," \"Summary\"",[33,105595,8002],{"class":163},[33,105597,98303],{"class":167},[33,105599,105600],{"class":54},"\"Summary sheet 
missing\"\n",[33,105602,105603,105605,105607,105609,105611],{"class":35,"line":179},[33,105604,98330],{"class":167},[33,105606,242],{"class":163},[33,105608,17447],{"class":167},[33,105610,103086],{"class":54},[33,105612,9202],{"class":167},[33,105614,105615,105617,105619,105621,105623,105625,105627,105629],{"class":35,"line":187},[33,105616,59877],{"class":167},[33,105618,242],{"class":163},[33,105620,599],{"class":50},[33,105622,98349],{"class":167},[33,105624,98352],{"class":238},[33,105626,242],{"class":163},[33,105628,855],{"class":50},[33,105630,371],{"class":167},[33,105632,105633,105635,105637,105639,105641,105643,105645],{"class":35,"line":201},[33,105634,36397],{"class":163},[33,105636,4037],{"class":50},[33,105638,98367],{"class":167},[33,105640,43000],{"class":163},[33,105642,7451],{"class":50},[33,105644,365],{"class":167},[33,105646,105647],{"class":54},"\"No data rows written\"\n",[33,105649,105650,105652,105654,105656,105658,105660,105662,105664,105666,105668,105670],{"class":35,"line":206},[33,105651,36397],{"class":163},[33,105653,13250],{"class":167},[33,105655,748],{"class":50},[33,105657,44179],{"class":167},[33,105659,734],{"class":50},[33,105661,763],{"class":167},[33,105663,3847],{"class":163},[33,105665,620],{"class":163},[33,105667,7657],{"class":50},[33,105669,365],{"class":167},[33,105671,105672],{"class":54},"\"Column B header missing\"\n",[33,105674,105675],{"class":35,"line":224},[33,105676,98453],{"class":167},[33,105678,105679,105681,105683,105686],{"class":35,"line":229},[33,105680,13474],{"class":50},[33,105682,602],{"class":167},[33,105684,105685],{"class":54},"\"Validation passed\"",[33,105687,221],{"class":167},[18,105689,21810],{"id":21809},[4211,105691,105692,105698,105710,105732],{},[4214,105693,105694,105697],{},[1974,105695,105696],{},"Row limit:"," Excel supports ~1 048 576 rows per sheet. 
For datasets beyond 100 000 rows consider writing to CSV and referencing it via Power Query instead of embedding all data in the workbook.",[4214,105699,105700,91835,105703,105706,105707,105709],{},[1974,105701,105702],{},"Multiple sheets:",[30,105704,105705],{},"to_excel(..., sheet_name=...)"," call within the same ",[30,105708,102520],{}," context adds a sheet without re-opening the file.",[4214,105711,105712,46332,105715,105717,105718,43180,105720,49047,105723,99641,105726,8877,105728,105731],{},[1974,105713,105714],{},"Formula recalculation:",[30,105716,17066],{}," writes formulas as strings; Excel recalculates on open. If you need values pre-calculated, write the computed value with ",[30,105719,17066],{},[30,105721,105722],{},"write()",[30,105724,105725],{},"write_formula()",[30,105727,22009],{},[30,105729,105730],{},"data_only=True"," after the file has been opened and saved by Excel once.",[4214,105733,105734,105737],{},[1974,105735,105736],{},"Parquet intermediary:"," If the ETL step is expensive, save the aggregated DataFrames to Parquet and load from there for the styling step — decouples compute from presentation.",[18,105739,4271],{"id":4270},[4273,105741,105742,105752],{},[4276,105743,105744],{},[4279,105745,105746,105748,105750],{},[4282,105747,14317],{},[4282,105749,4287],{},[4282,105751,4290],{},[4292,105753,105754,105767,105796,105815,105831],{},[4279,105755,105756,105761,105764],{},[4297,105757,105758],{},[30,105759,105760],{},"zipfile.BadZipFile",[4297,105762,105763],{},"Output file was left open by Excel when the script ran",[4297,105765,105766],{},"Close the file in Excel first, or write to a temp path then rename",[4279,105768,105769,105777,105786],{},[4297,105770,105771,95575,105774],{},[30,105772,105773],{},"KeyError: 'Summary'",[30,105775,105776],{},"writer.sheets",[4297,105778,105779,105780,105782,105783],{},"Sheet name not yet created — ",[30,105781,96833],{}," must be called before accessing 
",[30,105784,105785],{},"writer.sheets[name]",[4297,105787,105788,105789,105792,105793,105795],{},"Confirm ",[30,105790,105791],{},"to_excel(..., sheet_name=\"Summary\")"," executes before the ",[30,105794,105776],{}," lookup",[4279,105797,105798,105803,105809],{},[4297,105799,105800],{},[30,105801,105802],{},"IllegalCharacterError",[4297,105804,105805,105806,105808],{},"Cell value contains control characters (e.g. ",[30,105807,99461],{},") from a database",[4297,105810,105811,105812,70980],{},"Strip with ",[30,105813,105814],{},"df.replace(r\"[\\x00-\\x1f]\", \"\", regex=True)",[4279,105816,105817,105822,105828],{},[4297,105818,105819,105821],{},[30,105820,68035],{}," on save",[4297,105823,105824,105825,105827],{},"Target ",[30,105826,26542],{}," is open in another application",[4297,105829,105830],{},"Write to a new timestamped filename or close the file",[4279,105832,105833,105838,105851],{},[4297,105834,105835],{},[30,105836,105837],{},"AttributeError: 'Worksheet' object has no attribute 'add_format'",[4297,105839,105840,105841,1351,105843,105845,105846,4348,105848,105850],{},"Mixed ",[30,105842,22009],{},[30,105844,17066],{}," API — ",[30,105847,101331],{},[30,105849,17066],{},"-only",[4297,105852,17059,105853,43180,105855,1351,105858,105861,105862],{},[30,105854,22009],{},[30,105856,105857],{},"PatternFill",[30,105859,105860],{},"Font"," objects when the engine is ",[30,105863,22009],{},[18,105865,4402],{"id":4401},[23,105867,105869],{"className":126,"code":105868,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n# pip install pandas xlsxwriter openpyxl\n\"\"\"\ngenerate_sales_report.py — produce a formatted multi-sheet Excel report.\nUsage: python generate_sales_report.py --input data\u002Fsales.csv --output output\u002Freport.xlsx\n\"\"\"\nimport argparse\nfrom pathlib import Path\nimport pandas as pd\n\n\ndef load_and_aggregate(csv_path: Path):\n    df = pd.read_csv(csv_path)\n    df[\"revenue\"] = pd.to_numeric(df[\"revenue\"], 
errors=\"coerce\")\n    df[\"units\"]   = pd.to_numeric(df[\"units\"],   errors=\"coerce\")\n    df.dropna(subset=[\"revenue\", \"units\"], inplace=True)\n\n    by_region = (\n        df.groupby(\"region\", as_index=False)\n        .agg(total_revenue=(\"revenue\", \"sum\"),\n             total_units=(\"units\", \"sum\"),\n             transactions=(\"revenue\", \"count\"))\n    )\n    by_region[\"avg_order\"] = (\n        by_region[\"total_revenue\"] \u002F by_region[\"transactions\"]\n    ).round(2)\n\n    by_month = (\n        df.groupby(\"month\", as_index=False)\n        .agg(total_revenue=(\"revenue\", \"sum\"))\n        .sort_values(\"month\")\n    )\n    return by_region, by_month\n\n\ndef write_report(by_region: \"pd.DataFrame\", by_month: \"pd.DataFrame\",\n                 output: Path) -> None:\n    output.parent.mkdir(parents=True, exist_ok=True)\n\n    with pd.ExcelWriter(output, engine=\"xlsxwriter\") as writer:\n        by_region.to_excel(writer, sheet_name=\"Summary\",       index=False)\n        by_month.to_excel( writer, sheet_name=\"Monthly Trend\", index=False)\n\n        wb = writer.book\n\n        hdr_fmt      = wb.add_format({\"bold\": True, \"bg_color\": \"#2563eb\",\n                                      \"font_color\": \"#ffffff\", \"font_size\": 12})\n        currency_fmt = wb.add_format({\"num_format\": \"#,##0.00\"})\n        green_fmt    = wb.add_format({\"bg_color\": \"#C6EFCE\", \"font_color\": \"#006100\"})\n        red_fmt      = wb.add_format({\"bg_color\": \"#FFC7CE\", \"font_color\": \"#9C0006\"})\n\n        # --- Summary sheet ---\n        ws = writer.sheets[\"Summary\"]\n        for c, name in enumerate(by_region.columns):\n            ws.write(0, c, name, hdr_fmt)\n        n = len(by_region)\n        ws.set_column(\"A:A\", 14)\n        ws.set_column(\"B:D\", 16, currency_fmt)\n        ws.conditional_format(f\"B2:B{n+1}\", {\n            \"type\": \"cell\", \"criteria\": \">=\", \"value\": 10_000, \"format\": green_fmt\n       
 })\n        ws.conditional_format(f\"B2:B{n+1}\", {\n            \"type\": \"cell\", \"criteria\": \"\u003C\",  \"value\": 5_000,  \"format\": red_fmt\n        })\n        # Total row\n        ws.write(n + 1, 0, \"TOTAL\",\n                 wb.add_format({\"bold\": True, \"bg_color\": \"#f1f5f9\"}))\n        ws.write_formula(n + 1, 1, f\"=SUM(B2:B{n+1})\",\n                         wb.add_format({\"num_format\": \"#,##0.00\", \"bold\": True,\n                                        \"bg_color\": \"#f1f5f9\"}))\n\n        # Bar chart\n        chart = wb.add_chart({\"type\": \"column\"})\n        chart.add_series({\n            \"name\":       \"Revenue\",\n            \"categories\": [\"Summary\", 1, 0, n, 0],\n            \"values\":     [\"Summary\", 1, 1, n, 1],\n            \"fill\":       {\"color\": \"#2563eb\"},\n        })\n        chart.set_title({\"name\": \"Revenue by Region\"})\n        chart.set_y_axis({\"num_format\": \"#,##0\"})\n        chart.set_legend({\"none\": True})\n        chart.set_size({\"width\": 400, \"height\": 240})\n        ws.insert_chart(\"F2\", chart)\n\n        # --- Monthly Trend sheet ---\n        ws2 = writer.sheets[\"Monthly Trend\"]\n        for c, name in enumerate(by_month.columns):\n            ws2.write(0, c, name, hdr_fmt)\n        ws2.set_column(\"A:B\", 18, currency_fmt)\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Generate formatted Excel sales report\")\n    parser.add_argument(\"--input\",  default=\"data\u002Fsales.csv\",     help=\"Input CSV path\")\n    parser.add_argument(\"--output\", default=\"output\u002Freport.xlsx\", help=\"Output .xlsx path\")\n    args = parser.parse_args()\n\n    csv_path = Path(args.input)\n    out_path = Path(args.output)\n\n    if not csv_path.exists():\n        raise SystemExit(f\"Input not found: {csv_path}\")\n\n    try:\n        by_region, by_month = load_and_aggregate(csv_path)\n        write_report(by_region, by_month, out_path)\n        print(f\"Report 
written to {out_path}\")\n    except PermissionError:\n        raise SystemExit(f\"Cannot write {out_path} — close it in Excel first\")\n    except Exception as exc:\n        raise SystemExit(f\"Report generation failed: {exc}\")\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,105870,105871,105875,105880,105884,105889,105894,105898,105904,105914,105924,105928,105932,105942,105951,105975,105999,106026,106030,106039,106055,106074,106091,106108,106112,106124,106141,106150,106154,106163,106179,106197,106206,106210,106217,106221,106225,106245,106254,106274,106278,106297,106317,106337,106341,106349,106353,106378,106397,106413,106438,106463,106467,106472,106484,106498,106508,106518,106531,106543,106563,106593,106597,106617,106647,106651,106656,106675,106697,106729,106750,106761,106765,106770,106786,106790,106800,106822,106844,106858,106862,106874,106886,106898,106918,106926,106930,106935,106948,106961,106970,106984,106988,106992,107000,107017,107042,107068,107076,107080,107089,107097,107101,107110,107133,107137,107143,107153,107158,107178,107186,107210,107220,107243,107247,107251,107263],{"__ignoreMap":28},[33,105872,105873],{"class":35,"line":36},[33,105874,14447],{"class":39},[33,105876,105877],{"class":35,"line":43},[33,105878,105879],{"class":39},"# pip install pandas xlsxwriter openpyxl\n",[33,105881,105882],{"class":35,"line":61},[33,105883,139],{"class":54},[33,105885,105886],{"class":35,"line":73},[33,105887,105888],{"class":54},"generate_sales_report.py — produce a formatted multi-sheet Excel report.\n",[33,105890,105891],{"class":35,"line":88},[33,105892,105893],{"class":54},"Usage: python generate_sales_report.py --input data\u002Fsales.csv --output 
output\u002Freport.xlsx\n",[33,105895,105896],{"class":35,"line":95},[33,105897,139],{"class":54},[33,105899,105900,105902],{"class":35,"line":101},[33,105901,164],{"class":163},[33,105903,4461],{"class":167},[33,105905,105906,105908,105910,105912],{"class":35,"line":171},[33,105907,190],{"class":163},[33,105909,193],{"class":167},[33,105911,164],{"class":163},[33,105913,198],{"class":167},[33,105915,105916,105918,105920,105922],{"class":35,"line":179},[33,105917,164],{"class":163},[33,105919,492],{"class":167},[33,105921,495],{"class":163},[33,105923,498],{"class":167},[33,105925,105926],{"class":35,"line":187},[33,105927,92],{"emptyLinePlaceholder":91},[33,105929,105930],{"class":35,"line":201},[33,105931,92],{"emptyLinePlaceholder":91},[33,105933,105934,105936,105939],{"class":35,"line":206},[33,105935,562],{"class":163},[33,105937,105938],{"class":46}," load_and_aggregate",[33,105940,105941],{"class":167},"(csv_path: Path):\n",[33,105943,105944,105946,105948],{"class":35,"line":224},[33,105945,4025],{"class":167},[33,105947,242],{"class":163},[33,105949,105950],{"class":167}," 
pd.read_csv(csv_path)\n",[33,105952,105953,105955,105957,105959,105961,105963,105965,105967,105969,105971,105973],{"class":35,"line":229},[33,105954,27581],{"class":167},[33,105956,16465],{"class":54},[33,105958,763],{"class":167},[33,105960,242],{"class":163},[33,105962,27643],{"class":167},[33,105964,16465],{"class":54},[33,105966,8314],{"class":167},[33,105968,8317],{"class":238},[33,105970,242],{"class":163},[33,105972,12107],{"class":54},[33,105974,221],{"class":167},[33,105976,105977,105979,105981,105983,105985,105987,105989,105991,105993,105995,105997],{"class":35,"line":235},[33,105978,27581],{"class":167},[33,105980,16659],{"class":54},[33,105982,48135],{"class":167},[33,105984,242],{"class":163},[33,105986,27643],{"class":167},[33,105988,16659],{"class":54},[33,105990,13424],{"class":167},[33,105992,8317],{"class":238},[33,105994,242],{"class":163},[33,105996,12107],{"class":54},[33,105998,221],{"class":167},[33,106000,106001,106004,106006,106008,106010,106012,106014,106016,106018,106020,106022,106024],{"class":35,"line":250},[33,106002,106003],{"class":167},"    df.dropna(",[33,106005,28066],{"class":238},[33,106007,242],{"class":163},[33,106009,8309],{"class":167},[33,106011,16465],{"class":54},[33,106013,365],{"class":167},[33,106015,16659],{"class":54},[33,106017,8314],{"class":167},[33,106019,10891],{"class":238},[33,106021,242],{"class":163},[33,106023,855],{"class":50},[33,106025,221],{"class":167},[33,106027,106028],{"class":35,"line":266},[33,106029,92],{"emptyLinePlaceholder":91},[33,106031,106032,106035,106037],{"class":35,"line":290},[33,106033,106034],{"class":167},"    by_region 
",[33,106036,242],{"class":163},[33,106038,1415],{"class":167},[33,106040,106041,106043,106045,106047,106049,106051,106053],{"class":35,"line":295},[33,106042,18499],{"class":167},[33,106044,16649],{"class":54},[33,106046,365],{"class":167},[33,106048,96540],{"class":238},[33,106050,242],{"class":163},[33,106052,902],{"class":50},[33,106054,221],{"class":167},[33,106056,106057,106060,106062,106064,106066,106068,106070,106072],{"class":35,"line":300},[33,106058,106059],{"class":167},"        .agg(",[33,106061,18407],{"class":238},[33,106063,242],{"class":163},[33,106065,602],{"class":167},[33,106067,16465],{"class":54},[33,106069,365],{"class":167},[33,106071,18522],{"class":54},[33,106073,1506],{"class":167},[33,106075,106076,106079,106081,106083,106085,106087,106089],{"class":35,"line":317},[33,106077,106078],{"class":238},"             total_units",[33,106080,242],{"class":163},[33,106082,602],{"class":167},[33,106084,16659],{"class":54},[33,106086,365],{"class":167},[33,106088,18522],{"class":54},[33,106090,1506],{"class":167},[33,106092,106093,106096,106098,106100,106102,106104,106106],{"class":35,"line":332},[33,106094,106095],{"class":238},"             transactions",[33,106097,242],{"class":163},[33,106099,602],{"class":167},[33,106101,16465],{"class":54},[33,106103,365],{"class":167},[33,106105,96601],{"class":54},[33,106107,371],{"class":167},[33,106109,106110],{"class":35,"line":347},[33,106111,1202],{"class":167},[33,106113,106114,106116,106118,106120,106122],{"class":35,"line":374},[33,106115,96648],{"class":167},[33,106117,96637],{"class":54},[33,106119,763],{"class":167},[33,106121,242],{"class":163},[33,106123,1415],{"class":167},[33,106125,106126,106129,106131,106133,106135,106137,106139],{"class":35,"line":397},[33,106127,106128],{"class":167},"        
by_region[",[33,106130,96615],{"class":54},[33,106132,763],{"class":167},[33,106134,1351],{"class":163},[33,106136,96657],{"class":167},[33,106138,96660],{"class":54},[33,106140,9202],{"class":167},[33,106142,106143,106146,106148],{"class":35,"line":653},[33,106144,106145],{"class":167},"    ).round(",[33,106147,1533],{"class":50},[33,106149,221],{"class":167},[33,106151,106152],{"class":35,"line":667},[33,106153,92],{"emptyLinePlaceholder":91},[33,106155,106156,106159,106161],{"class":35,"line":675},[33,106157,106158],{"class":167},"    by_month ",[33,106160,242],{"class":163},[33,106162,1415],{"class":167},[33,106164,106165,106167,106169,106171,106173,106175,106177],{"class":35,"line":689},[33,106166,18499],{"class":167},[33,106168,96465],{"class":54},[33,106170,365],{"class":167},[33,106172,96540],{"class":238},[33,106174,242],{"class":163},[33,106176,902],{"class":50},[33,106178,221],{"class":167},[33,106180,106181,106183,106185,106187,106189,106191,106193,106195],{"class":35,"line":703},[33,106182,106059],{"class":167},[33,106184,18407],{"class":238},[33,106186,242],{"class":163},[33,106188,602],{"class":167},[33,106190,16465],{"class":54},[33,106192,365],{"class":167},[33,106194,18522],{"class":54},[33,106196,371],{"class":167},[33,106198,106199,106202,106204],{"class":35,"line":714},[33,106200,106201],{"class":167},"        .sort_values(",[33,106203,96465],{"class":54},[33,106205,221],{"class":167},[33,106207,106208],{"class":35,"line":723},[33,106209,1202],{"class":167},[33,106211,106212,106214],{"class":35,"line":754},[33,106213,1332],{"class":163},[33,106215,106216],{"class":167}," by_region, by_month\n",[33,106218,106219],{"class":35,"line":771},[33,106220,92],{"emptyLinePlaceholder":91},[33,106222,106223],{"class":35,"line":777},[33,106224,92],{"emptyLinePlaceholder":91},[33,106226,106227,106229,106232,106235,106238,106241,106243],{"class":35,"line":788},[33,106228,562],{"class":163},[33,106230,106231],{"class":46}," 
write_report",[33,106233,106234],{"class":167},"(by_region: ",[33,106236,106237],{"class":54},"\"pd.DataFrame\"",[33,106239,106240],{"class":167},", by_month: ",[33,106242,106237],{"class":54},[33,106244,247],{"class":167},[33,106246,106247,106250,106252],{"class":35,"line":804},[33,106248,106249],{"class":167},"                 output: Path) -> ",[33,106251,571],{"class":50},[33,106253,574],{"class":167},[33,106255,106256,106258,106260,106262,106264,106266,106268,106270,106272],{"class":35,"line":809},[33,106257,74932],{"class":167},[33,106259,869],{"class":238},[33,106261,242],{"class":163},[33,106263,855],{"class":50},[33,106265,365],{"class":167},[33,106267,878],{"class":238},[33,106269,242],{"class":163},[33,106271,855],{"class":50},[33,106273,221],{"class":167},[33,106275,106276],{"class":35,"line":819},[33,106277,92],{"emptyLinePlaceholder":91},[33,106279,106280,106282,106285,106287,106289,106291,106293,106295],{"class":35,"line":829},[33,106281,1635],{"class":163},[33,106283,106284],{"class":167}," pd.ExcelWriter(output, 
",[33,106286,17351],{"class":238},[33,106288,242],{"class":163},[33,106290,103069],{"class":54},[33,106292,1649],{"class":167},[33,106294,495],{"class":163},[33,106296,17363],{"class":167},[33,106298,106299,106301,106303,106305,106307,106309,106311,106313,106315],{"class":35,"line":834},[33,106300,97017],{"class":167},[33,106302,17371],{"class":238},[33,106304,242],{"class":163},[33,106306,103086],{"class":54},[33,106308,25445],{"class":167},[33,106310,897],{"class":238},[33,106312,242],{"class":163},[33,106314,902],{"class":50},[33,106316,221],{"class":167},[33,106318,106319,106321,106323,106325,106327,106329,106331,106333,106335],{"class":35,"line":839},[33,106320,97039],{"class":167},[33,106322,17371],{"class":238},[33,106324,242],{"class":163},[33,106326,97046],{"class":54},[33,106328,365],{"class":167},[33,106330,897],{"class":238},[33,106332,242],{"class":163},[33,106334,902],{"class":50},[33,106336,221],{"class":167},[33,106338,106339],{"class":35,"line":860},[33,106340,92],{"emptyLinePlaceholder":91},[33,106342,106343,106345,106347],{"class":35,"line":887},[33,106344,97065],{"class":167},[33,106346,242],{"class":163},[33,106348,97070],{"class":167},[33,106350,106351],{"class":35,"line":907},[33,106352,92],{"emptyLinePlaceholder":91},[33,106354,106355,106358,106360,106362,106364,106366,106368,106370,106372,106374,106376],{"class":35,"line":1826},[33,106356,106357],{"class":167},"        hdr_fmt      ",[33,106359,242],{"class":163},[33,106361,103229],{"class":167},[33,106363,103657],{"class":54},[33,106365,2079],{"class":167},[33,106367,855],{"class":50},[33,106369,365],{"class":167},[33,106371,103179],{"class":54},[33,106373,2079],{"class":167},[33,106375,55362],{"class":54},[33,106377,247],{"class":167},[33,106379,106380,106383,106385,106387,106389,106391,106393,106395],{"class":35,"line":1844},[33,106381,106382],{"class":54},"                                      
\"font_color\"",[33,106384,2079],{"class":167},[33,106386,103193],{"class":54},[33,106388,365],{"class":167},[33,106390,103209],{"class":54},[33,106392,2079],{"class":167},[33,106394,55650],{"class":50},[33,106396,103249],{"class":167},[33,106398,106399,106401,106403,106405,106407,106409,106411],{"class":35,"line":1858},[33,106400,103224],{"class":167},[33,106402,242],{"class":163},[33,106404,103229],{"class":167},[33,106406,103232],{"class":54},[33,106408,2079],{"class":167},[33,106410,103237],{"class":54},[33,106412,103249],{"class":167},[33,106414,106415,106418,106420,106422,106424,106426,106428,106430,106432,106434,106436],{"class":35,"line":1871},[33,106416,106417],{"class":167},"        green_fmt    ",[33,106419,242],{"class":163},[33,106421,103229],{"class":167},[33,106423,103179],{"class":54},[33,106425,2079],{"class":167},[33,106427,104183],{"class":54},[33,106429,365],{"class":167},[33,106431,103188],{"class":54},[33,106433,2079],{"class":167},[33,106435,104192],{"class":54},[33,106437,103249],{"class":167},[33,106439,106440,106443,106445,106447,106449,106451,106453,106455,106457,106459,106461],{"class":35,"line":1877},[33,106441,106442],{"class":167},"        red_fmt      ",[33,106444,242],{"class":163},[33,106446,103229],{"class":167},[33,106448,103179],{"class":54},[33,106450,2079],{"class":167},[33,106452,104210],{"class":54},[33,106454,365],{"class":167},[33,106456,103188],{"class":54},[33,106458,2079],{"class":167},[33,106460,104219],{"class":54},[33,106462,103249],{"class":167},[33,106464,106465],{"class":35,"line":1883},[33,106466,92],{"emptyLinePlaceholder":91},[33,106468,106469],{"class":35,"line":1915},[33,106470,106471],{"class":39},"        # --- Summary sheet 
---\n",[33,106473,106474,106476,106478,106480,106482],{"class":35,"line":1926},[33,106475,101503],{"class":167},[33,106477,242],{"class":163},[33,106479,103386],{"class":167},[33,106481,103086],{"class":54},[33,106483,9202],{"class":167},[33,106485,106486,106488,106491,106493,106495],{"class":35,"line":1932},[33,106487,5973],{"class":163},[33,106489,106490],{"class":167}," c, name ",[33,106492,662],{"class":163},[33,106494,7403],{"class":50},[33,106496,106497],{"class":167},"(by_region.columns):\n",[33,106499,106500,106503,106505],{"class":35,"line":1938},[33,106501,106502],{"class":167},"            ws.write(",[33,106504,748],{"class":50},[33,106506,106507],{"class":167},", c, name, hdr_fmt)\n",[33,106509,106510,106512,106514,106516],{"class":35,"line":1950},[33,106511,104230],{"class":167},[33,106513,242],{"class":163},[33,106515,4037],{"class":50},[33,106517,96796],{"class":167},[33,106519,106520,106523,106525,106527,106529],{"class":35,"line":1958},[33,106521,106522],{"class":167},"        
ws.set_column(",[33,106524,103398],{"class":54},[33,106526,365],{"class":167},[33,106528,19368],{"class":50},[33,106530,221],{"class":167},[33,106532,106533,106535,106537,106539,106541],{"class":35,"line":4904},[33,106534,106522],{"class":167},[33,106536,103412],{"class":54},[33,106538,365],{"class":167},[33,106540,24213],{"class":50},[33,106542,103419],{"class":167},[33,106544,106545,106547,106549,106551,106553,106555,106557,106559,106561],{"class":35,"line":4909},[33,106546,104376],{"class":167},[33,106548,4059],{"class":163},[33,106550,98615],{"class":54},[33,106552,1115],{"class":50},[33,106554,22354],{"class":167},[33,106556,1811],{"class":163},[33,106558,40161],{"class":50},[33,106560,274],{"class":54},[33,106562,104394],{"class":167},[33,106564,106565,106567,106569,106571,106573,106575,106577,106579,106581,106583,106585,106587,106589,106591],{"class":35,"line":4915},[33,106566,104279],{"class":54},[33,106568,2079],{"class":167},[33,106570,104284],{"class":54},[33,106572,365],{"class":167},[33,106574,104289],{"class":54},[33,106576,2079],{"class":167},[33,106578,104294],{"class":54},[33,106580,365],{"class":167},[33,106582,104299],{"class":54},[33,106584,2079],{"class":167},[33,106586,104304],{"class":50},[33,106588,365],{"class":167},[33,106590,104309],{"class":54},[33,106592,104312],{"class":167},[33,106594,106595],{"class":35,"line":4925},[33,106596,83823],{"class":167},[33,106598,106599,106601,106603,106605,106607,106609,106611,106613,106615],{"class":35,"line":4935},[33,106600,104376],{"class":167},[33,106602,4059],{"class":163},[33,106604,98615],{"class":54},[33,106606,1115],{"class":50},[33,106608,22354],{"class":167},[33,106610,1811],{"class":163},[33,106612,40161],{"class":50},[33,106614,274],{"class":54},[33,106616,104394],{"class":167},[33,106618,106619,106621,106623,106625,106627,106629,106631,106633,106635,106637,106639,106641,106643,106645],{"class":35,"line":4941},[33,106620,104279],{"class":54},[33,106622,2079],{"class":167},[33,106624,104284],
{"class":54},[33,106626,365],{"class":167},[33,106628,104289],{"class":54},[33,106630,2079],{"class":167},[33,106632,104342],{"class":54},[33,106634,25480],{"class":167},[33,106636,104299],{"class":54},[33,106638,2079],{"class":167},[33,106640,104351],{"class":50},[33,106642,25480],{"class":167},[33,106644,104309],{"class":54},[33,106646,104358],{"class":167},[33,106648,106649],{"class":35,"line":4950},[33,106650,83823],{"class":167},[33,106652,106653],{"class":35,"line":4960},[33,106654,106655],{"class":39},"        # Total row\n",[33,106657,106658,106661,106663,106665,106667,106669,106671,106673],{"class":35,"line":4965},[33,106659,106660],{"class":167},"        ws.write(n ",[33,106662,1811],{"class":163},[33,106664,1814],{"class":50},[33,106666,365],{"class":167},[33,106668,748],{"class":50},[33,106670,365],{"class":167},[33,106672,103893],{"class":54},[33,106674,247],{"class":167},[33,106676,106677,106680,106682,106684,106686,106688,106690,106692,106694],{"class":35,"line":4971},[33,106678,106679],{"class":167},"                 wb.add_format({",[33,106681,103657],{"class":54},[33,106683,2079],{"class":167},[33,106685,855],{"class":50},[33,106687,365],{"class":167},[33,106689,103179],{"class":54},[33,106691,2079],{"class":167},[33,106693,103706],{"class":54},[33,106695,106696],{"class":167},"}))\n",[33,106698,106699,106702,106704,106706,106708,106710,106712,106714,106717,106719,106721,106723,106725,106727],{"class":35,"line":4983},[33,106700,106701],{"class":167},"        ws.write_formula(n 
",[33,106703,1811],{"class":163},[33,106705,1814],{"class":50},[33,106707,365],{"class":167},[33,106709,734],{"class":50},[33,106711,365],{"class":167},[33,106713,4059],{"class":163},[33,106715,106716],{"class":54},"\"=SUM(B2:B",[33,106718,1115],{"class":50},[33,106720,22354],{"class":167},[33,106722,1811],{"class":163},[33,106724,40161],{"class":50},[33,106726,72406],{"class":54},[33,106728,247],{"class":167},[33,106730,106731,106734,106736,106738,106740,106742,106744,106746,106748],{"class":35,"line":4988},[33,106732,106733],{"class":167},"                         wb.add_format({",[33,106735,103232],{"class":54},[33,106737,2079],{"class":167},[33,106739,103237],{"class":54},[33,106741,365],{"class":167},[33,106743,103657],{"class":54},[33,106745,2079],{"class":167},[33,106747,855],{"class":50},[33,106749,247],{"class":167},[33,106751,106752,106755,106757,106759],{"class":35,"line":4993},[33,106753,106754],{"class":54},"                                        \"bg_color\"",[33,106756,2079],{"class":167},[33,106758,103706],{"class":54},[33,106760,106696],{"class":167},[33,106762,106763],{"class":35,"line":5003},[33,106764,92],{"emptyLinePlaceholder":91},[33,106766,106767],{"class":35,"line":5008},[33,106768,106769],{"class":39},"        # Bar 
chart\n",[33,106771,106772,106774,106776,106778,106780,106782,106784],{"class":35,"line":5014},[33,106773,104647],{"class":167},[33,106775,242],{"class":163},[33,106777,104652],{"class":167},[33,106779,104655],{"class":54},[33,106781,2079],{"class":167},[33,106783,104660],{"class":54},[33,106785,103249],{"class":167},[33,106787,106788],{"class":35,"line":5019},[33,106789,104667],{"class":167},[33,106791,106792,106794,106796,106798],{"class":35,"line":5032},[33,106793,104672],{"class":54},[33,106795,104675],{"class":167},[33,106797,12925],{"class":54},[33,106799,247],{"class":167},[33,106801,106802,106804,106806,106808,106810,106812,106814,106816,106818,106820],{"class":35,"line":5039},[33,106803,104685],{"class":54},[33,106805,12426],{"class":167},[33,106807,103086],{"class":54},[33,106809,365],{"class":167},[33,106811,734],{"class":50},[33,106813,365],{"class":167},[33,106815,748],{"class":50},[33,106817,104700],{"class":167},[33,106819,748],{"class":50},[33,106821,8935],{"class":167},[33,106823,106824,106826,106828,106830,106832,106834,106836,106838,106840,106842],{"class":35,"line":5068},[33,106825,104712],{"class":54},[33,106827,104715],{"class":167},[33,106829,103086],{"class":54},[33,106831,365],{"class":167},[33,106833,734],{"class":50},[33,106835,365],{"class":167},[33,106837,734],{"class":50},[33,106839,104700],{"class":167},[33,106841,734],{"class":50},[33,106843,8935],{"class":167},[33,106845,106846,106848,106850,106852,106854,106856],{"class":35,"line":5077},[33,106847,104739],{"class":54},[33,106849,104742],{"class":167},[33,106851,104745],{"class":54},[33,106853,2079],{"class":167},[33,106855,55362],{"class":54},[33,106857,3509],{"class":167},[33,106859,106860],{"class":35,"line":5082},[33,106861,83823],{"class":167},[33,106863,106864,106866,106868,106870,106872],{"class":35,"line":5089},[33,106865,104772],{"class":167},[33,106867,104775],{"class":54},[33,106869,2079],{"class":167},[33,106871,104780],{"class":54},[33,106873,103249],{"class":167},[33,10
6875,106876,106878,106880,106882,106884],{"class":35,"line":5098},[33,106877,104800],{"class":167},[33,106879,103232],{"class":54},[33,106881,2079],{"class":167},[33,106883,103265],{"class":54},[33,106885,103249],{"class":167},[33,106887,106888,106890,106892,106894,106896],{"class":35,"line":5105},[33,106889,104822],{"class":167},[33,106891,104825],{"class":54},[33,106893,2079],{"class":167},[33,106895,855],{"class":50},[33,106897,103249],{"class":167},[33,106899,106900,106902,106904,106906,106908,106910,106912,106914,106916],{"class":35,"line":5110},[33,106901,104836],{"class":167},[33,106903,83804],{"class":54},[33,106905,2079],{"class":167},[33,106907,47140],{"class":50},[33,106909,365],{"class":167},[33,106911,83816],{"class":54},[33,106913,2079],{"class":167},[33,106915,17008],{"class":50},[33,106917,103249],{"class":167},[33,106919,106920,106922,106924],{"class":35,"line":5115},[33,106921,104862],{"class":167},[33,106923,104865],{"class":54},[33,106925,104868],{"class":167},[33,106927,106928],{"class":35,"line":5128},[33,106929,92],{"emptyLinePlaceholder":91},[33,106931,106932],{"class":35,"line":5135},[33,106933,106934],{"class":39},"        # --- Monthly Trend sheet ---\n",[33,106936,106937,106940,106942,106944,106946],{"class":35,"line":5142},[33,106938,106939],{"class":167},"        ws2 ",[33,106941,242],{"class":163},[33,106943,103386],{"class":167},[33,106945,97046],{"class":54},[33,106947,9202],{"class":167},[33,106949,106950,106952,106954,106956,106958],{"class":35,"line":5151},[33,106951,5973],{"class":163},[33,106953,106490],{"class":167},[33,106955,662],{"class":163},[33,106957,7403],{"class":50},[33,106959,106960],{"class":167},"(by_month.columns):\n",[33,106962,106963,106966,106968],{"class":35,"line":5156},[33,106964,106965],{"class":167},"            ws2.write(",[33,106967,748],{"class":50},[33,106969,106507],{"class":167},[33,106971,106972,106975,106978,106980,106982],{"class":35,"line":5161},[33,106973,106974],{"class":167},"        
ws2.set_column(",[33,106976,106977],{"class":54},"\"A:B\"",[33,106979,365],{"class":167},[33,106981,19300],{"class":50},[33,106983,103419],{"class":167},[33,106985,106986],{"class":35,"line":5167},[33,106987,92],{"emptyLinePlaceholder":91},[33,106989,106990],{"class":35,"line":5172},[33,106991,92],{"emptyLinePlaceholder":91},[33,106993,106994,106996,106998],{"class":35,"line":5182},[33,106995,562],{"class":163},[33,106997,6636],{"class":46},[33,106999,25419],{"class":167},[33,107001,107002,107004,107006,107008,107010,107012,107015],{"class":35,"line":5195},[33,107003,6648],{"class":167},[33,107005,242],{"class":163},[33,107007,6653],{"class":167},[33,107009,6656],{"class":238},[33,107011,242],{"class":163},[33,107013,107014],{"class":54},"\"Generate formatted Excel sales report\"",[33,107016,221],{"class":167},[33,107018,107019,107021,107023,107025,107027,107029,107031,107033,107035,107037,107040],{"class":35,"line":5200},[33,107020,6669],{"class":167},[33,107022,6672],{"class":54},[33,107024,25480],{"class":167},[33,107026,6685],{"class":238},[33,107028,242],{"class":163},[33,107030,59612],{"class":54},[33,107032,25539],{"class":167},[33,107034,25463],{"class":238},[33,107036,242],{"class":163},[33,107038,107039],{"class":54},"\"Input CSV path\"",[33,107041,221],{"class":167},[33,107043,107044,107046,107048,107050,107052,107054,107057,107059,107061,107063,107066],{"class":35,"line":5205},[33,107045,6669],{"class":167},[33,107047,6699],{"class":54},[33,107049,365],{"class":167},[33,107051,6685],{"class":238},[33,107053,242],{"class":163},[33,107055,107056],{"class":54},"\"output\u002Freport.xlsx\"",[33,107058,365],{"class":167},[33,107060,25463],{"class":238},[33,107062,242],{"class":163},[33,107064,107065],{"class":54},"\"Output .xlsx 
path\"",[33,107067,221],{"class":167},[33,107069,107070,107072,107074],{"class":35,"line":5210},[33,107071,6766],{"class":167},[33,107073,242],{"class":163},[33,107075,6771],{"class":167},[33,107077,107078],{"class":35,"line":5215},[33,107079,92],{"emptyLinePlaceholder":91},[33,107081,107082,107085,107087],{"class":35,"line":5220},[33,107083,107084],{"class":167},"    csv_path ",[33,107086,242],{"class":163},[33,107088,69442],{"class":167},[33,107090,107091,107093,107095],{"class":35,"line":5227},[33,107092,6388],{"class":167},[33,107094,242],{"class":163},[33,107096,69452],{"class":167},[33,107098,107099],{"class":35,"line":5232},[33,107100,92],{"emptyLinePlaceholder":91},[33,107102,107103,107105,107107],{"class":35,"line":5237},[33,107104,617],{"class":163},[33,107106,620],{"class":163},[33,107108,107109],{"class":167}," csv_path.exists():\n",[33,107111,107112,107114,107116,107118,107120,107122,107124,107127,107129,107131],{"class":35,"line":5251},[33,107113,4051],{"class":163},[33,107115,16617],{"class":50},[33,107117,602],{"class":167},[33,107119,4059],{"class":163},[33,107121,16624],{"class":54},[33,107123,1115],{"class":50},[33,107125,107126],{"class":167},"csv_path",[33,107128,1121],{"class":50},[33,107130,274],{"class":54},[33,107132,221],{"class":167},[33,107134,107135],{"class":35,"line":5259},[33,107136,92],{"emptyLinePlaceholder":91},[33,107138,107139,107141],{"class":35,"line":5264},[33,107140,2424],{"class":163},[33,107142,574],{"class":167},[33,107144,107145,107148,107150],{"class":35,"line":5269},[33,107146,107147],{"class":167},"        by_region, by_month ",[33,107149,242],{"class":163},[33,107151,107152],{"class":167}," load_and_aggregate(csv_path)\n",[33,107154,107155],{"class":35,"line":5283},[33,107156,107157],{"class":167},"        write_report(by_region, by_month, 
out_path)\n",[33,107159,107160,107162,107164,107166,107168,107170,107172,107174,107176],{"class":35,"line":5293},[33,107161,9414],{"class":50},[33,107163,602],{"class":167},[33,107165,4059],{"class":163},[33,107167,67189],{"class":54},[33,107169,1115],{"class":50},[33,107171,40722],{"class":167},[33,107173,1121],{"class":50},[33,107175,274],{"class":54},[33,107177,221],{"class":167},[33,107179,107180,107182,107184],{"class":35,"line":5303},[33,107181,2449],{"class":163},[33,107183,17393],{"class":50},[33,107185,574],{"class":167},[33,107187,107188,107190,107192,107194,107196,107199,107201,107203,107205,107208],{"class":35,"line":5313},[33,107189,4051],{"class":163},[33,107191,16617],{"class":50},[33,107193,602],{"class":167},[33,107195,4059],{"class":163},[33,107197,107198],{"class":54},"\"Cannot write ",[33,107200,1115],{"class":50},[33,107202,40722],{"class":167},[33,107204,1121],{"class":50},[33,107206,107207],{"class":54}," — close it in Excel first\"",[33,107209,221],{"class":167},[33,107211,107212,107214,107216,107218],{"class":35,"line":5320},[33,107213,2449],{"class":163},[33,107215,783],{"class":50},[33,107217,1852],{"class":163},[33,107219,1855],{"class":167},[33,107221,107222,107224,107226,107228,107230,107233,107235,107237,107239,107241],{"class":35,"line":5325},[33,107223,4051],{"class":163},[33,107225,16617],{"class":50},[33,107227,602],{"class":167},[33,107229,4059],{"class":163},[33,107231,107232],{"class":54},"\"Report generation failed: 
",[33,107234,1115],{"class":50},[33,107236,6565],{"class":167},[33,107238,1121],{"class":50},[33,107240,274],{"class":54},[33,107242,221],{"class":167},[33,107244,107245],{"class":35,"line":5330},[33,107246,92],{"emptyLinePlaceholder":91},[33,107248,107249],{"class":35,"line":5344},[33,107250,92],{"emptyLinePlaceholder":91},[33,107252,107253,107255,107257,107259,107261],{"class":35,"line":5349},[33,107254,2491],{"class":163},[33,107256,2494],{"class":50},[33,107258,2497],{"class":163},[33,107260,2500],{"class":54},[33,107262,574],{"class":167},[33,107264,107265],{"class":35,"line":5354},[33,107266,6914],{"class":167},[18,107268,36626],{"id":36625},[14,107270,107271,107278,107279,107281,107282,107284,107285,107287,107288,107290],{},[1974,107272,107273,107274,2012,107276,36637],{},"Which engine should I use — ",[30,107275,22009],{},[30,107277,17066],{},"\nUse ",[30,107280,17066],{}," when creating a new file with heavy formatting and charts — it is faster and has a richer format API. Use ",[30,107283,22009],{}," when loading and modifying an existing ",[30,107286,26542],{}," file. You cannot use ",[30,107289,17066],{}," to read or modify existing files.",[14,107292,107293,107296,107297,36661,107300,107302,107303,107306],{},[1974,107294,107295],{},"How do I add a second header row (e.g. a report title above the column headers)?","\nPass ",[30,107298,107299],{},"startrow=2",[30,107301,96833],{},", then use ",[30,107304,107305],{},"ws.merge_range(\"A1:D1\", \"Monthly Report\", title_fmt)"," before writing the data. The column header row lands at row 2 (0-indexed row 1).",[14,107308,107309,107315,107317,107318,107321,107322,8877,107324,107327],{},[1974,107310,107311,107312,107314],{},"Why do my formulas show as ",[30,107313,748],{}," when I open the file?",[30,107316,17066],{}," writes formulas as strings and relies on Excel to calculate on open. 
If you need pre-calculated values, write the Python-computed value directly with ",[30,107319,107320],{},"ws.write(row, col, value, fmt)",". Alternatively, open the file in ",[30,107323,22009],{},[30,107325,107326],{},"data_only=False",", then save — Excel will recalculate on next open.",[14,107329,107330,107333,107335,107336,10065,107339,107342,107343,102135],{},[1974,107331,107332],{},"Can I password-protect the generated report?",[30,107334,17066],{}," supports workbook-level protection: ",[30,107337,107338],{},"wb.set_properties(...)",[30,107340,107341],{},"ws.protect(\"password\")",". For PDF output with password protection, see the ",[940,107344,65967],{"href":65966},[14,107346,107347,107350,107351,107354,107355,107357],{},[1974,107348,107349],{},"How do I schedule this to run automatically?","\nOn Linux\u002FmacOS, add a cron entry: ",[30,107352,107353],{},"0 7 1 * * \u002Fpath\u002F.venv\u002Fbin\u002Fpython \u002Fpath\u002Fgenerate_sales_report.py",". See ",[940,107356,95780],{"href":21804}," for a full scheduling and delivery walkthrough.",[18,107359,6918],{"id":6917},[4211,107361,107362,107367,107372,107377],{},[4214,107363,107364,107366],{},[940,107365,95780],{"href":21804}," — end-to-end scheduled report with multi-sheet groupby output",[4214,107368,107369,107371],{},[940,107370,102074],{"href":102073}," — advanced formula patterns and chart types",[4214,107373,107374,107376],{},[940,107375,99577],{"href":99576}," — ingestion patterns before the report generation step",[4214,107378,107379,107381],{},[940,107380,97863],{"href":97862}," — resolve write errors when loading an existing workbook",[14,107383,6947,107384,3035],{},[940,107385,26258],{"href":26257},[6953,107387,107388],{},"html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sAwPA, html code.shiki 
.sAwPA{--shiki-default:#6A737D}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":107390},[107391,107392,107393,107394,107395,107396,107397,107398,107399,107405,107406,107407,107408,107409,107410],{"id":20,"depth":43,"text":21},{"id":102304,"depth":43,"text":102305},{"id":102459,"depth":43,"text":102460},{"id":102570,"depth":43,"text":102571},{"id":102948,"depth":43,"text":102949},{"id":103505,"depth":43,"text":103506},{"id":104041,"depth":43,"text":104042},{"id":104502,"depth":43,"text":104503},{"id":2708,"depth":43,"text":2709,"children":107400},[107401,107402,107403],{"id":104929,"depth":61,"text":104930},{"id":105320,"depth":61,"text":105321},{"id":105329,"depth":61,"text":107404},"Memory-Efficient Writes with use_constant_memory",{"id":52029,"depth":43,"text":52030},{"id":21809,"depth":43,"text":21810},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":36625,"depth":43,"text":36626},{"id":6917,"depth":43,"text":6918},"Excel Report Generation","[object Object]",{},"\u002Fpython-for-excel-csv-data-processing\u002Fautomating-excel-report-generation",{"title":6936,"description":107416},{"Build formatted multi-sheet Excel reports with openpyxl and xlsxwriter":107417,"date":46387,"updatedAt":6978,"tags":107418},"styled headers, number formats, conditional formatting, 
charts, and summary rows.",[99614,47,22009,17066,107419],"reporting","Automate Excel Report Generation with Python","python-for-excel-csv-data-processing\u002Fautomating-excel-report-generation\u002Findex","khu5uco1rIo_DPqw0vdzqCeepZ8asJHab_75SOdrl9Q",{"id":107424,"title":107425,"body":107426,"breadcrumbTitle":110374,"canonical":6977,"date":46387,"description":110375,"draft":6980,"extension":6981,"image":6977,"meta":110376,"navigation":91,"path":110377,"robots":6977,"seo":110378,"seoTitle":110379,"stem":110380,"tags":110381,"updatedAt":6978,"__hash__":110382},"content\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Fbest-python-libraries-for-csv-parsing\u002Findex.md","Best Python Libraries for CSV Parsing",{"type":7,"value":107427,"toc":110355},[107428,107431,107449,107453,107477,107479,107483,107648,107650,107654,107657,108273,108275,108279,108286,108301,108626,108630,108642,108856,108867,108871,108887,109144,109147,109151,109161,109599,109617,109619,109621,109627,109633,109822,109826,109829,110001,110003,110007,110010,110312,110323,110325,110327,110349,110353],[10,107429,107425],{"id":107430},"best-python-libraries-for-csv-parsing",[14,107432,107433,107434,365,107437,365,107439,71132,107442,107444,107445,107448],{},"Choosing the wrong CSV parser is the fastest way to blow your RAM budget, stall an ETL job, or discover at 2 AM that type inference silently coerced an integer ID to a float. This guide compares the four libraries that cover 95 % of real-world CSV work — Python's stdlib ",[30,107435,107436],{},"csv",[30,107438,9630],{},[30,107440,107441],{},"polars",[30,107443,14295],{}," — so you can match parser to problem before writing a single line of code. 
For what comes after ingestion, ",[940,107446,107447],{"href":9598},"Cleaning Messy CSV Data with Pandas"," walks through the full cleaning workflow.",[18,107450,107452],{"id":107451},"why-picking-the-wrong-parser-causes-problems","Why Picking the Wrong Parser Causes Problems",[14,107454,107455,107456,107459,107460,107462,107463,107466,107467,107470,107471,107473,107474,107476],{},"Each library loads CSV data with a fundamentally different memory model. ",[30,107457,107458],{},"pandas.read_csv()"," is eager: it materialises the entire file as a DataFrame before your code touches a single row. On a 2 GB file with mixed object columns that can mean 8–12 GB of resident memory, and a ",[30,107461,70953],{}," you can't recover from mid-process. ",[30,107464,107465],{},"polars.scan_csv()"," is lazy by default — it builds a query plan and only pulls data when you call ",[30,107468,107469],{},".collect()",", and its streaming engine can process files larger than RAM in chunks. The stdlib ",[30,107472,107436],{}," module reads one row at a time with near-zero overhead, but gives you raw strings; every type cast is manual. ",[30,107475,14295],{}," reads into columnar Arrow buffers — ideal when the next step is writing Parquet or passing data to DuckDB, but it adds an unfamiliar API if you only need a quick DataFrame. 
Picking the wrong one means either rewriting the loader later or patching around out-of-memory crashes in production.",[2537,107478],{},[18,107480,107482],{"id":107481},"quick-selection-matrix","Quick Selection Matrix",[2540,107484,2547,107487,2547,107490,2547,107493,2547,2547,107504,2547,107508,2547,107513,2547,107516,2547,107518,2547,107522,2547,107526,2547,2547,107529,2547,107532,2547,107537,2547,107540,2547,107545,2547,107548,2547,107551,2547,107554,2547,2547,107557,2547,107559,2547,107561,2547,107564,2547,107566,2547,107569,2547,107572,2547,107575,2547,2547,107578,2547,107580,2547,107583,2547,107587,2547,107590,2547,107593,2547,107595,2547,107598,2547,2547,107600,2547,107602,2547,107605,2547,107609,2547,107612,2547,107615,2547,107618,2547,107621,2547,2547,107623,2547,107626,2547,107629,2547,107634,2547,107636,2547,107639,2547,107641,2547,107645],{"viewBox":107485,"role":2543,"ariaLabel":107486,"xmlns":2545,"style":2546},"0 0 760 336","Comparison matrix of four CSV parsing libraries: csv stdlib, pandas, polars, and pyarrow",[2549,107488,107489],{},"CSV Library Comparison Matrix",[2553,107491,107492],{},"A grid comparing csv stdlib, pandas, polars, and pyarrow across five criteria: Speed, Memory, API richness, Streaming, and Type 
inference.",[2557,107494,2559,107495,2547],{},[2561,107496,2564,107498,2564,107501,2559],{"id":107497,"x1":748,"y1":748,"x2":734,"y2":748},"csv-libs-hdr-grad",[2566,107499],{"offset":748,"style":107500},"stop-color:#1e293b",[2566,107502],{"offset":734,"style":107503},"stop-color:#334155",[2585,107505],{"x":748,"y":748,"width":107506,"height":58404,"fill":107507},"760","url(#csv-libs-hdr-grad)",[2000,107509,26485],{"x":2597,"y":107510,"fill":107511,"style":107512},"23","#f8fafc","font-size:13px;font-weight:bold;text-anchor:middle",[2000,107514,107515],{"x":58401,"y":107510,"fill":107511,"style":107512},"Speed",[2000,107517,4218],{"x":2625,"y":107510,"fill":107511,"style":107512},[2000,107519,107521],{"x":107520,"y":107510,"fill":107511,"style":107512},"440","API richness",[2000,107523,107525],{"x":107524,"y":107510,"fill":107511,"style":107512},"570","Streaming",[2000,107527,107528],{"x":59986,"y":107510,"fill":107511,"style":107512},"Type infer",[2585,107530],{"x":748,"y":58404,"width":107506,"height":38740,"fill":2615,"stroke":2593,"style":107531},"stroke-width:0.5",[2000,107533,107536],{"x":3545,"y":107534,"fill":2599,"style":107535},"59","font-size:14px;font-weight:bold","csv (stdlib)",[2000,107538,107539],{"x":3545,"y":16991,"fill":2583,"style":49873},"no install needed",[2000,107541,107544],{"x":58401,"y":38749,"fill":107542,"style":107543},"#d97706","font-size:13px;text-anchor:middle","Medium",[2000,107546,107547],{"x":2625,"y":38749,"fill":17010,"style":107512},"O(1)",[2000,107549,107550],{"x":107520,"y":38749,"fill":17041,"style":107543},"Minimal",[2000,107552,107553],{"x":107524,"y":38749,"fill":17010,"style":107512},"Native",[2000,107555,107556],{"x":59986,"y":38749,"fill":17041,"style":107543},"None (str)",[2585,107558],{"x":748,"y":26349,"width":107506,"height":38740,"fill":2592,"stroke":2593,"style":107531},[2000,107560,9630],{"x":3545,"y":2629,"fill":2599,"style":107535},[2000,107562,107563],{"x":3545,"y":102527,"fill":2583,"style":49873},"\u003C 500 
MB analytical",[2000,107565,38523],{"x":58401,"y":26402,"fill":107542,"style":107543},[2000,107567,107568],{"x":2625,"y":26402,"fill":17041,"style":107543},"Eager \u002F High",[2000,107570,107571],{"x":107520,"y":26402,"fill":17010,"style":107512},"Rich",[2000,107573,107574],{"x":107524,"y":26402,"fill":107542,"style":107543},"chunksize only",[2000,107576,107577],{"x":59986,"y":26402,"fill":17010,"style":107512},"Auto",[2585,107579],{"x":748,"y":11194,"width":107506,"height":38740,"fill":2615,"stroke":2593,"style":107531},[2000,107581,107441],{"x":3545,"y":107582,"fill":2599,"style":107535},"171",[2000,107584,107586],{"x":3545,"y":107585,"fill":2583,"style":49873},"187","500 MB – 5 GB+",[2000,107588,107589],{"x":58401,"y":58337,"fill":17010,"style":107512},"3–5× pandas",[2000,107591,107592],{"x":2625,"y":58337,"fill":17010,"style":107512},"Lazy \u002F Low",[2000,107594,107571],{"x":107520,"y":58337,"fill":17010,"style":107512},[2000,107596,107597],{"x":107524,"y":58337,"fill":17010,"style":107512},"scan_csv native",[2000,107599,107577],{"x":59986,"y":58337,"fill":17010,"style":107512},[2585,107601],{"x":748,"y":11122,"width":107506,"height":38740,"fill":2592,"stroke":2593,"style":107531},[2000,107603,14295],{"x":3545,"y":107604,"fill":2599,"style":107535},"227",[2000,107606,107608],{"x":3545,"y":107607,"fill":2583,"style":49873},"243","Parquet \u002F DuckDB",[2000,107610,107611],{"x":58401,"y":38847,"fill":17010,"style":107512},"Fast (C++)",[2000,107613,107614],{"x":2625,"y":38847,"fill":107542,"style":107543},"Columnar",[2000,107616,107617],{"x":107520,"y":38847,"fill":107542,"style":107543},"Low-level",[2000,107619,107620],{"x":107524,"y":38847,"fill":107542,"style":107543},"open_csv 
batch",[2000,107622,107577],{"x":59986,"y":38847,"fill":17010,"style":107512},[2585,107624],{"x":748,"y":107625,"width":107506,"height":49813,"fill":2615,"stroke":2593,"style":107531},"268",[2585,107627],{"x":3545,"y":107628,"width":55650,"height":55650,"fill":17010},"284",[2000,107630,107633],{"x":11164,"y":107631,"fill":2583,"style":107632},"295","font-size:12px","Strong",[2585,107635],{"x":2679,"y":107628,"width":55650,"height":55650,"fill":107542},[2000,107637,107638],{"x":11131,"y":107631,"fill":2583,"style":107632},"Partial",[2585,107640],{"x":58401,"y":107628,"width":55650,"height":55650,"fill":17041},[2000,107642,107644],{"x":107643,"y":107631,"fill":2583,"style":107632},"226","Weak \u002F None",[2000,107646,107647],{"x":2626,"y":107631,"fill":2583,"style":107632},"Bold green = strong recommendation for that criterion",[2537,107649],{},[18,107651,107653],{"id":107652},"minimal-diagnostic-side-by-side-parser-comparison","Minimal Diagnostic: Side-by-Side Parser Comparison",[14,107655,107656],{},"Run this snippet on any CSV you're evaluating. 
It loads the same file with all four parsers and prints elapsed time and peak RSS memory so you can see the trade-offs concretely.",[23,107658,107660],{"className":126,"code":107659,"language":47,"meta":28,"style":28},"# pip install pandas polars pyarrow psutil\nimport csv\nimport time\nimport tracemalloc\nfrom pathlib import Path\n\nimport pandas as pd\nimport polars as pl\nimport pyarrow.csv as pa_csv\n\nDATA = Path(\"sample.csv\")  # replace with your file\n\ndef measure(label: str, fn):\n    tracemalloc.start()\n    t0 = time.perf_counter()\n    result = fn()\n    elapsed = time.perf_counter() - t0\n    _, peak = tracemalloc.get_traced_memory()\n    tracemalloc.stop()\n    print(f\"{label:20s}  {elapsed:.2f}s  peak={peak \u002F 1_048_576:.1f} MB\")\n    return result\n\n# stdlib csv — row-by-row, no type inference\ndef load_csv_stdlib():\n    try:\n        with DATA.open(newline=\"\", encoding=\"utf-8\") as fh:\n            return list(csv.DictReader(fh))\n    except OSError as exc:\n        raise SystemExit(f\"Cannot open {DATA}: {exc}\") from exc\n\n# pandas — eager, vectorised\ndef load_pandas():\n    try:\n        return pd.read_csv(DATA, low_memory=False)\n    except (OSError, pd.errors.ParserError) as exc:\n        raise SystemExit(f\"pandas failed: {exc}\") from exc\n\n# polars — lazy, multi-threaded\ndef load_polars():\n    try:\n        return pl.scan_csv(DATA).collect()\n    except Exception as exc:\n        raise SystemExit(f\"polars failed: {exc}\") from exc\n\n# pyarrow — columnar Arrow table\ndef load_pyarrow():\n    try:\n        return pa_csv.read_csv(DATA)\n    except (OSError, Exception) as exc:\n        raise SystemExit(f\"pyarrow failed: {exc}\") from exc\n\nmeasure(\"csv (stdlib)\", load_csv_stdlib)\nmeasure(\"pandas\",       load_pandas)\nmeasure(\"polars\",       load_polars)\nmeasure(\"pyarrow\",      
load_pyarrow)\n",[30,107661,107662,107667,107674,107680,107687,107697,107701,107711,107723,107735,107739,107754,107758,107773,107778,107788,107797,107812,107822,107827,107877,107883,107887,107892,107901,107907,107938,107947,107958,107989,107993,107998,108007,108013,108032,108047,108074,108078,108083,108092,108098,108110,108120,108147,108151,108156,108165,108171,108182,108201,108228,108232,108243,108253,108263],{"__ignoreMap":28},[33,107663,107664],{"class":35,"line":36},[33,107665,107666],{"class":39},"# pip install pandas polars pyarrow psutil\n",[33,107668,107669,107671],{"class":35,"line":43},[33,107670,164],{"class":163},[33,107672,107673],{"class":167}," csv\n",[33,107675,107676,107678],{"class":35,"line":61},[33,107677,164],{"class":163},[33,107679,1689],{"class":167},[33,107681,107682,107684],{"class":35,"line":73},[33,107683,164],{"class":163},[33,107685,107686],{"class":167}," tracemalloc\n",[33,107688,107689,107691,107693,107695],{"class":35,"line":88},[33,107690,190],{"class":163},[33,107692,193],{"class":167},[33,107694,164],{"class":163},[33,107696,198],{"class":167},[33,107698,107699],{"class":35,"line":95},[33,107700,92],{"emptyLinePlaceholder":91},[33,107702,107703,107705,107707,107709],{"class":35,"line":101},[33,107704,164],{"class":163},[33,107706,492],{"class":167},[33,107708,495],{"class":163},[33,107710,498],{"class":167},[33,107712,107713,107715,107718,107720],{"class":35,"line":171},[33,107714,164],{"class":163},[33,107716,107717],{"class":167}," polars ",[33,107719,495],{"class":163},[33,107721,107722],{"class":167}," pl\n",[33,107724,107725,107727,107730,107732],{"class":35,"line":179},[33,107726,164],{"class":163},[33,107728,107729],{"class":167}," pyarrow.csv ",[33,107731,495],{"class":163},[33,107733,107734],{"class":167}," 
pa_csv\n",[33,107736,107737],{"class":35,"line":187},[33,107738,92],{"emptyLinePlaceholder":91},[33,107740,107741,107743,107745,107747,107750,107752],{"class":35,"line":201},[33,107742,59605],{"class":50},[33,107744,212],{"class":163},[33,107746,215],{"class":167},[33,107748,107749],{"class":54},"\"sample.csv\"",[33,107751,10922],{"class":167},[33,107753,99838],{"class":39},[33,107755,107756],{"class":35,"line":206},[33,107757,92],{"emptyLinePlaceholder":91},[33,107759,107760,107762,107765,107768,107770],{"class":35,"line":224},[33,107761,562],{"class":163},[33,107763,107764],{"class":46}," measure",[33,107766,107767],{"class":167},"(label: ",[33,107769,1053],{"class":50},[33,107771,107772],{"class":167},", fn):\n",[33,107774,107775],{"class":35,"line":229},[33,107776,107777],{"class":167},"    tracemalloc.start()\n",[33,107779,107780,107783,107785],{"class":35,"line":235},[33,107781,107782],{"class":167},"    t0 ",[33,107784,242],{"class":163},[33,107786,107787],{"class":167}," time.perf_counter()\n",[33,107789,107790,107792,107794],{"class":35,"line":250},[33,107791,8842],{"class":167},[33,107793,242],{"class":163},[33,107795,107796],{"class":167}," fn()\n",[33,107798,107799,107802,107804,107807,107809],{"class":35,"line":266},[33,107800,107801],{"class":167},"    elapsed ",[33,107803,242],{"class":163},[33,107805,107806],{"class":167}," time.perf_counter() ",[33,107808,4126],{"class":163},[33,107810,107811],{"class":167}," t0\n",[33,107813,107814,107817,107819],{"class":35,"line":290},[33,107815,107816],{"class":167},"    _, peak ",[33,107818,242],{"class":163},[33,107820,107821],{"class":167}," tracemalloc.get_traced_memory()\n",[33,107823,107824],{"class":35,"line":295},[33,107825,107826],{"class":167},"    
tracemalloc.stop()\n",[33,107828,107829,107831,107833,107835,107837,107839,107841,107844,107846,107848,107851,107853,107855,107858,107860,107863,107865,107868,107870,107872,107875],{"class":35,"line":300},[33,107830,7268],{"class":50},[33,107832,602],{"class":167},[33,107834,4059],{"class":163},[33,107836,274],{"class":54},[33,107838,1115],{"class":50},[33,107840,61755],{"class":167},[33,107842,107843],{"class":163},":20s",[33,107845,1121],{"class":50},[33,107847,54867],{"class":50},[33,107849,107850],{"class":167},"elapsed",[33,107852,55819],{"class":163},[33,107854,1121],{"class":50},[33,107856,107857],{"class":54},"s  peak=",[33,107859,1115],{"class":50},[33,107861,107862],{"class":167},"peak ",[33,107864,1351],{"class":163},[33,107866,107867],{"class":50}," 1_048_576",[33,107869,18438],{"class":163},[33,107871,1121],{"class":50},[33,107873,107874],{"class":54}," MB\"",[33,107876,221],{"class":167},[33,107878,107879,107881],{"class":35,"line":317},[33,107880,1332],{"class":163},[33,107882,49632],{"class":167},[33,107884,107885],{"class":35,"line":332},[33,107886,92],{"emptyLinePlaceholder":91},[33,107888,107889],{"class":35,"line":347},[33,107890,107891],{"class":39},"# stdlib csv — row-by-row, no type inference\n",[33,107893,107894,107896,107899],{"class":35,"line":374},[33,107895,562],{"class":163},[33,107897,107898],{"class":46}," load_csv_stdlib",[33,107900,25419],{"class":167},[33,107902,107903,107905],{"class":35,"line":397},[33,107904,2424],{"class":163},[33,107906,574],{"class":167},[33,107908,107909,107911,107914,107917,107920,107922,107924,107926,107928,107930,107932,107934,107936],{"class":35,"line":653},[33,107910,2191],{"class":163},[33,107912,107913],{"class":50}," 
DATA",[33,107915,107916],{"class":167},".open(",[33,107918,107919],{"class":238},"newline",[33,107921,242],{"class":163},[33,107923,3198],{"class":54},[33,107925,365],{"class":167},[33,107927,27249],{"class":238},[33,107929,242],{"class":163},[33,107931,1195],{"class":54},[33,107933,1649],{"class":167},[33,107935,495],{"class":163},[33,107937,67176],{"class":167},[33,107939,107940,107942,107944],{"class":35,"line":667},[33,107941,28782],{"class":163},[33,107943,599],{"class":50},[33,107945,107946],{"class":167},"(csv.DictReader(fh))\n",[33,107948,107949,107951,107954,107956],{"class":35,"line":675},[33,107950,2449],{"class":163},[33,107952,107953],{"class":50}," OSError",[33,107955,1852],{"class":163},[33,107957,1855],{"class":167},[33,107959,107960,107962,107964,107966,107968,107970,107973,107975,107977,107979,107981,107983,107985,107987],{"class":35,"line":689},[33,107961,4051],{"class":163},[33,107963,16617],{"class":50},[33,107965,602],{"class":167},[33,107967,4059],{"class":163},[33,107969,9935],{"class":54},[33,107971,107972],{"class":50},"{DATA}",[33,107974,2079],{"class":54},[33,107976,1115],{"class":50},[33,107978,6565],{"class":167},[33,107980,1121],{"class":50},[33,107982,274],{"class":54},[33,107984,1649],{"class":167},[33,107986,190],{"class":163},[33,107988,20843],{"class":167},[33,107990,107991],{"class":35,"line":703},[33,107992,92],{"emptyLinePlaceholder":91},[33,107994,107995],{"class":35,"line":714},[33,107996,107997],{"class":39},"# pandas — eager, vectorised\n",[33,107999,108000,108002,108005],{"class":35,"line":723},[33,108001,562],{"class":163},[33,108003,108004],{"class":46}," 
load_pandas",[33,108006,25419],{"class":167},[33,108008,108009,108011],{"class":35,"line":754},[33,108010,2424],{"class":163},[33,108012,574],{"class":167},[33,108014,108015,108017,108019,108021,108023,108026,108028,108030],{"class":35,"line":771},[33,108016,1659],{"class":163},[33,108018,9481],{"class":167},[33,108020,59605],{"class":50},[33,108022,365],{"class":167},[33,108024,108025],{"class":238},"low_memory",[33,108027,242],{"class":163},[33,108029,902],{"class":50},[33,108031,221],{"class":167},[33,108033,108034,108036,108038,108040,108043,108045],{"class":35,"line":777},[33,108035,2449],{"class":163},[33,108037,17583],{"class":167},[33,108039,43079],{"class":50},[33,108041,108042],{"class":167},", pd.errors.ParserError) ",[33,108044,495],{"class":163},[33,108046,1855],{"class":167},[33,108048,108049,108051,108053,108055,108057,108060,108062,108064,108066,108068,108070,108072],{"class":35,"line":788},[33,108050,4051],{"class":163},[33,108052,16617],{"class":50},[33,108054,602],{"class":167},[33,108056,4059],{"class":163},[33,108058,108059],{"class":54},"\"pandas failed: ",[33,108061,1115],{"class":50},[33,108063,6565],{"class":167},[33,108065,1121],{"class":50},[33,108067,274],{"class":54},[33,108069,1649],{"class":167},[33,108071,190],{"class":163},[33,108073,20843],{"class":167},[33,108075,108076],{"class":35,"line":804},[33,108077,92],{"emptyLinePlaceholder":91},[33,108079,108080],{"class":35,"line":809},[33,108081,108082],{"class":39},"# polars — lazy, multi-threaded\n",[33,108084,108085,108087,108090],{"class":35,"line":819},[33,108086,562],{"class":163},[33,108088,108089],{"class":46}," load_polars",[33,108091,25419],{"class":167},[33,108093,108094,108096],{"class":35,"line":829},[33,108095,2424],{"class":163},[33,108097,574],{"class":167},[33,108099,108100,108102,108105,108107],{"class":35,"line":834},[33,108101,1659],{"class":163},[33,108103,108104],{"class":167}," 
pl.scan_csv(",[33,108106,59605],{"class":50},[33,108108,108109],{"class":167},").collect()\n",[33,108111,108112,108114,108116,108118],{"class":35,"line":839},[33,108113,2449],{"class":163},[33,108115,783],{"class":50},[33,108117,1852],{"class":163},[33,108119,1855],{"class":167},[33,108121,108122,108124,108126,108128,108130,108133,108135,108137,108139,108141,108143,108145],{"class":35,"line":860},[33,108123,4051],{"class":163},[33,108125,16617],{"class":50},[33,108127,602],{"class":167},[33,108129,4059],{"class":163},[33,108131,108132],{"class":54},"\"polars failed: ",[33,108134,1115],{"class":50},[33,108136,6565],{"class":167},[33,108138,1121],{"class":50},[33,108140,274],{"class":54},[33,108142,1649],{"class":167},[33,108144,190],{"class":163},[33,108146,20843],{"class":167},[33,108148,108149],{"class":35,"line":887},[33,108150,92],{"emptyLinePlaceholder":91},[33,108152,108153],{"class":35,"line":907},[33,108154,108155],{"class":39},"# pyarrow — columnar Arrow table\n",[33,108157,108158,108160,108163],{"class":35,"line":1826},[33,108159,562],{"class":163},[33,108161,108162],{"class":46}," load_pyarrow",[33,108164,25419],{"class":167},[33,108166,108167,108169],{"class":35,"line":1844},[33,108168,2424],{"class":163},[33,108170,574],{"class":167},[33,108172,108173,108175,108178,108180],{"class":35,"line":1858},[33,108174,1659],{"class":163},[33,108176,108177],{"class":167}," 
pa_csv.read_csv(",[33,108179,59605],{"class":50},[33,108181,221],{"class":167},[33,108183,108184,108186,108188,108190,108192,108195,108197,108199],{"class":35,"line":1871},[33,108185,2449],{"class":163},[33,108187,17583],{"class":167},[33,108189,43079],{"class":50},[33,108191,365],{"class":167},[33,108193,108194],{"class":50},"Exception",[33,108196,1649],{"class":167},[33,108198,495],{"class":163},[33,108200,1855],{"class":167},[33,108202,108203,108205,108207,108209,108211,108214,108216,108218,108220,108222,108224,108226],{"class":35,"line":1877},[33,108204,4051],{"class":163},[33,108206,16617],{"class":50},[33,108208,602],{"class":167},[33,108210,4059],{"class":163},[33,108212,108213],{"class":54},"\"pyarrow failed: ",[33,108215,1115],{"class":50},[33,108217,6565],{"class":167},[33,108219,1121],{"class":50},[33,108221,274],{"class":54},[33,108223,1649],{"class":167},[33,108225,190],{"class":163},[33,108227,20843],{"class":167},[33,108229,108230],{"class":35,"line":1883},[33,108231,92],{"emptyLinePlaceholder":91},[33,108233,108234,108237,108240],{"class":35,"line":1915},[33,108235,108236],{"class":167},"measure(",[33,108238,108239],{"class":54},"\"csv (stdlib)\"",[33,108241,108242],{"class":167},", load_csv_stdlib)\n",[33,108244,108245,108247,108250],{"class":35,"line":1926},[33,108246,108236],{"class":167},[33,108248,108249],{"class":54},"\"pandas\"",[33,108251,108252],{"class":167},",       load_pandas)\n",[33,108254,108255,108257,108260],{"class":35,"line":1932},[33,108256,108236],{"class":167},[33,108258,108259],{"class":54},"\"polars\"",[33,108261,108262],{"class":167},",       load_polars)\n",[33,108264,108265,108267,108270],{"class":35,"line":1938},[33,108266,108236],{"class":167},[33,108268,108269],{"class":54},"\"pyarrow\"",[33,108271,108272],{"class":167},",      load_pyarrow)\n",[2537,108274],{},[18,108276,108278],{"id":108277},"library-by-library-fix-implementation","Library-by-Library Fix 
Implementation",[424,108280,108282,108283,108285],{"id":108281},"_1-stdlib-csv-streaming-row-by-row","1. stdlib ",[30,108284,107436],{}," — Streaming Row-by-Row",[14,108287,39550,108288,108290,108291,108293,108294,10065,108296,108298,108299,3035],{},[30,108289,107436],{}," module is the right tool when memory is the constraint: log rotation scripts, microservices that process one record at a time, or pipelines where each row triggers a database write. There is no type inference — every field comes out as a string. When you also need to handle ",[940,108292,27254],{"href":27253},", pass ",[30,108295,27249],{},[30,108297,8317],{}," directly to ",[30,108300,70995],{},[23,108302,108304],{"className":126,"code":108303,"language":47,"meta":28,"style":28},"# pip install chardet  (stdlib csv needs no install)\nimport csv\nfrom pathlib import Path\n\nDATA = Path(\"transactions.csv\")\n\ndef stream_csv(path: Path):\n    \"\"\"Yield rows one at a time — O(1) memory regardless of file size.\"\"\"\n    try:\n        with path.open(newline=\"\", encoding=\"utf-8-sig\", errors=\"replace\") as fh:\n            # Sniff the dialect from the first 4 KB to handle ; and \\t delimiters\n            sample = fh.read(4096)\n            fh.seek(0)\n            try:\n                dialect = csv.Sniffer().sniff(sample, delimiters=\",;\\t|\")\n            except csv.Error:\n                dialect = csv.excel  # safe fallback\n            reader = csv.DictReader(fh, dialect=dialect)\n            for row in reader:\n                yield row\n    except OSError as exc:\n        raise SystemExit(f\"Cannot open {path}: {exc}\") from exc\n\nfor record in stream_csv(DATA):\n    amount = float(record.get(\"amount\", 0) or 0)  # manual cast\n    print(record[\"id\"], 
amount)\n",[30,108305,108306,108311,108317,108327,108331,108344,108348,108358,108363,108369,108406,108411,108426,108435,108441,108466,108473,108485,108502,108513,108521,108531,108565,108569,108585,108614],{"__ignoreMap":28},[33,108307,108308],{"class":35,"line":36},[33,108309,108310],{"class":39},"# pip install chardet  (stdlib csv needs no install)\n",[33,108312,108313,108315],{"class":35,"line":43},[33,108314,164],{"class":163},[33,108316,107673],{"class":167},[33,108318,108319,108321,108323,108325],{"class":35,"line":61},[33,108320,190],{"class":163},[33,108322,193],{"class":167},[33,108324,164],{"class":163},[33,108326,198],{"class":167},[33,108328,108329],{"class":35,"line":73},[33,108330,92],{"emptyLinePlaceholder":91},[33,108332,108333,108335,108337,108339,108342],{"class":35,"line":88},[33,108334,59605],{"class":50},[33,108336,212],{"class":163},[33,108338,215],{"class":167},[33,108340,108341],{"class":54},"\"transactions.csv\"",[33,108343,221],{"class":167},[33,108345,108346],{"class":35,"line":95},[33,108347,92],{"emptyLinePlaceholder":91},[33,108349,108350,108352,108355],{"class":35,"line":101},[33,108351,562],{"class":163},[33,108353,108354],{"class":46}," stream_csv",[33,108356,108357],{"class":167},"(path: Path):\n",[33,108359,108360],{"class":35,"line":171},[33,108361,108362],{"class":54},"    \"\"\"Yield rows one at a time — O(1) memory regardless of file size.\"\"\"\n",[33,108364,108365,108367],{"class":35,"line":179},[33,108366,2424],{"class":163},[33,108368,574],{"class":167},[33,108370,108371,108373,108376,108378,108380,108382,108384,108386,108388,108391,108393,108395,108397,108400,108402,108404],{"class":35,"line":187},[33,108372,2191],{"class":163},[33,108374,108375],{"class":167}," 
path.open(",[33,108377,107919],{"class":238},[33,108379,242],{"class":163},[33,108381,3198],{"class":54},[33,108383,365],{"class":167},[33,108385,27249],{"class":238},[33,108387,242],{"class":163},[33,108389,108390],{"class":54},"\"utf-8-sig\"",[33,108392,365],{"class":167},[33,108394,8317],{"class":238},[33,108396,242],{"class":163},[33,108398,108399],{"class":54},"\"replace\"",[33,108401,1649],{"class":167},[33,108403,495],{"class":163},[33,108405,67176],{"class":167},[33,108407,108408],{"class":35,"line":201},[33,108409,108410],{"class":39},"            # Sniff the dialect from the first 4 KB to handle ; and \\t delimiters\n",[33,108412,108413,108416,108418,108421,108424],{"class":35,"line":206},[33,108414,108415],{"class":167},"            sample ",[33,108417,242],{"class":163},[33,108419,108420],{"class":167}," fh.read(",[33,108422,108423],{"class":50},"4096",[33,108425,221],{"class":167},[33,108427,108428,108431,108433],{"class":35,"line":224},[33,108429,108430],{"class":167},"            fh.seek(",[33,108432,748],{"class":50},[33,108434,221],{"class":167},[33,108436,108437,108439],{"class":35,"line":229},[33,108438,14151],{"class":163},[33,108440,574],{"class":167},[33,108442,108443,108446,108448,108451,108454,108456,108459,108461,108464],{"class":35,"line":235},[33,108444,108445],{"class":167},"                dialect ",[33,108447,242],{"class":163},[33,108449,108450],{"class":167}," csv.Sniffer().sniff(sample, ",[33,108452,108453],{"class":238},"delimiters",[33,108455,242],{"class":163},[33,108457,108458],{"class":54},"\",;",[33,108460,80208],{"class":50},[33,108462,108463],{"class":54},"|\"",[33,108465,221],{"class":167},[33,108467,108468,108470],{"class":35,"line":250},[33,108469,14168],{"class":163},[33,108471,108472],{"class":167}," csv.Error:\n",[33,108474,108475,108477,108479,108482],{"class":35,"line":266},[33,108476,108445],{"class":167},[33,108478,242],{"class":163},[33,108480,108481],{"class":167}," csv.excel  ",[33,108483,108484],{"class":39},"# 
safe fallback\n",[33,108486,108487,108489,108491,108494,108497,108499],{"class":35,"line":290},[33,108488,72722],{"class":167},[33,108490,242],{"class":163},[33,108492,108493],{"class":167}," csv.DictReader(fh, ",[33,108495,108496],{"class":238},"dialect",[33,108498,242],{"class":163},[33,108500,108501],{"class":167},"dialect)\n",[33,108503,108504,108506,108508,108510],{"class":35,"line":295},[33,108505,1793],{"class":163},[33,108507,3844],{"class":167},[33,108509,662],{"class":163},[33,108511,108512],{"class":167}," reader:\n",[33,108514,108515,108518],{"class":35,"line":300},[33,108516,108517],{"class":163},"                yield",[33,108519,108520],{"class":167}," row\n",[33,108522,108523,108525,108527,108529],{"class":35,"line":317},[33,108524,2449],{"class":163},[33,108526,107953],{"class":50},[33,108528,1852],{"class":163},[33,108530,1855],{"class":167},[33,108532,108533,108535,108537,108539,108541,108543,108545,108547,108549,108551,108553,108555,108557,108559,108561,108563],{"class":35,"line":332},[33,108534,4051],{"class":163},[33,108536,16617],{"class":50},[33,108538,602],{"class":167},[33,108540,4059],{"class":163},[33,108542,9935],{"class":54},[33,108544,1115],{"class":50},[33,108546,2580],{"class":167},[33,108548,1121],{"class":50},[33,108550,2079],{"class":54},[33,108552,1115],{"class":50},[33,108554,6565],{"class":167},[33,108556,1121],{"class":50},[33,108558,274],{"class":54},[33,108560,1649],{"class":167},[33,108562,190],{"class":163},[33,108564,20843],{"class":167},[33,108566,108567],{"class":35,"line":347},[33,108568,92],{"emptyLinePlaceholder":91},[33,108570,108571,108573,108576,108578,108581,108583],{"class":35,"line":374},[33,108572,6124],{"class":163},[33,108574,108575],{"class":167}," record ",[33,108577,662],{"class":163},[33,108579,108580],{"class":167}," 
stream_csv(",[33,108582,59605],{"class":50},[33,108584,1737],{"class":167},[33,108586,108587,108590,108592,108594,108597,108599,108601,108603,108605,108607,108609,108611],{"class":35,"line":397},[33,108588,108589],{"class":167},"    amount ",[33,108591,242],{"class":163},[33,108593,54311],{"class":50},[33,108595,108596],{"class":167},"(record.get(",[33,108598,4106],{"class":54},[33,108600,365],{"class":167},[33,108602,748],{"class":50},[33,108604,1649],{"class":167},[33,108606,7162],{"class":163},[33,108608,10791],{"class":50},[33,108610,10922],{"class":167},[33,108612,108613],{"class":39},"# manual cast\n",[33,108615,108616,108618,108621,108623],{"class":35,"line":653},[33,108617,7268],{"class":50},[33,108619,108620],{"class":167},"(record[",[33,108622,57101],{"class":54},[33,108624,108625],{"class":167},"], amount)\n",[424,108627,108629],{"id":108628},"_2-pandas-analytical-work-under-500-mb","2. pandas — Analytical Work Under 500 MB",[14,108631,108632,108634,108635,108638,108639,108641],{},[30,108633,9630],{}," is the workhorse for interactive analysis, group-by summaries, and merges. Its ",[30,108636,108637],{},"read_csv()"," covers the widest surface area of quirky real-world files, and most ",[940,108640,26258],{"href":26257}," tutorials assume you already have a DataFrame. 
Keep it for files where the loaded size stays comfortably under half your available RAM.",[23,108643,108645],{"className":126,"code":108644,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nDATA = Path(\"sales_q1.csv\")\n\ntry:\n    df = pd.read_csv(\n        DATA,\n        encoding=\"utf-8-sig\",   # handles Excel BOM\n        on_bad_lines=\"warn\",     # log malformed rows, don't crash\n        low_memory=False,        # avoid mixed-type column warnings\n        parse_dates=[\"order_date\"],\n    )\nexcept (OSError, pd.errors.ParserError) as exc:\n    raise SystemExit(f\"Failed to load {DATA}: {exc}\") from exc\n\n# Verify the load looked right\nprint(df.dtypes)\nprint(df.shape)\nassert df[\"order_id\"].notna().all(), \"order_id column has unexpected nulls\"\n",[30,108646,108647,108651,108661,108671,108675,108688,108692,108698,108707,108714,108727,108742,108756,108770,108774,108788,108819,108823,108828,108835,108841],{"__ignoreMap":28},[33,108648,108649],{"class":35,"line":36},[33,108650,8895],{"class":39},[33,108652,108653,108655,108657,108659],{"class":35,"line":43},[33,108654,164],{"class":163},[33,108656,492],{"class":167},[33,108658,495],{"class":163},[33,108660,498],{"class":167},[33,108662,108663,108665,108667,108669],{"class":35,"line":61},[33,108664,190],{"class":163},[33,108666,193],{"class":167},[33,108668,164],{"class":163},[33,108670,198],{"class":167},[33,108672,108673],{"class":35,"line":73},[33,108674,92],{"emptyLinePlaceholder":91},[33,108676,108677,108679,108681,108683,108686],{"class":35,"line":88},[33,108678,59605],{"class":50},[33,108680,212],{"class":163},[33,108682,215],{"class":167},[33,108684,108685],{"class":54},"\"sales_q1.csv\"",[33,108687,221],{"class":167},[33,108689,108690],{"class":35,"line":95},[33,108691,92],{"emptyLinePlaceholder":91},[33,108693,108694,108696],{"class":35,"line":101},[33,108695,35574],{"class":163},[33,108697,574],{"class":167},[33,108699,108700,108702,108
704],{"class":35,"line":171},[33,108701,4025],{"class":167},[33,108703,242],{"class":163},[33,108705,108706],{"class":167}," pd.read_csv(\n",[33,108708,108709,108712],{"class":35,"line":179},[33,108710,108711],{"class":50},"        DATA",[33,108713,247],{"class":167},[33,108715,108716,108718,108720,108722,108724],{"class":35,"line":187},[33,108717,1190],{"class":238},[33,108719,242],{"class":163},[33,108721,108390],{"class":54},[33,108723,1166],{"class":167},[33,108725,108726],{"class":39},"# handles Excel BOM\n",[33,108728,108729,108732,108734,108737,108739],{"class":35,"line":201},[33,108730,108731],{"class":238},"        on_bad_lines",[33,108733,242],{"class":163},[33,108735,108736],{"class":54},"\"warn\"",[33,108738,25539],{"class":167},[33,108740,108741],{"class":39},"# log malformed rows, don't crash\n",[33,108743,108744,108747,108749,108751,108753],{"class":35,"line":206},[33,108745,108746],{"class":238},"        low_memory",[33,108748,242],{"class":163},[33,108750,902],{"class":50},[33,108752,89262],{"class":167},[33,108754,108755],{"class":39},"# avoid mixed-type column warnings\n",[33,108757,108758,108761,108763,108765,108768],{"class":35,"line":224},[33,108759,108760],{"class":238},"        
parse_dates",[33,108762,242],{"class":163},[33,108764,8309],{"class":167},[33,108766,108767],{"class":54},"\"order_date\"",[33,108769,8935],{"class":167},[33,108771,108772],{"class":35,"line":229},[33,108773,1202],{"class":167},[33,108775,108776,108778,108780,108782,108784,108786],{"class":35,"line":235},[33,108777,35726],{"class":163},[33,108779,17583],{"class":167},[33,108781,43079],{"class":50},[33,108783,108042],{"class":167},[33,108785,495],{"class":163},[33,108787,1855],{"class":167},[33,108789,108790,108792,108794,108796,108798,108801,108803,108805,108807,108809,108811,108813,108815,108817],{"class":35,"line":250},[33,108791,35742],{"class":163},[33,108793,16617],{"class":50},[33,108795,602],{"class":167},[33,108797,4059],{"class":163},[33,108799,108800],{"class":54},"\"Failed to load ",[33,108802,107972],{"class":50},[33,108804,2079],{"class":54},[33,108806,1115],{"class":50},[33,108808,6565],{"class":167},[33,108810,1121],{"class":50},[33,108812,274],{"class":54},[33,108814,1649],{"class":167},[33,108816,190],{"class":163},[33,108818,20843],{"class":167},[33,108820,108821],{"class":35,"line":266},[33,108822,92],{"emptyLinePlaceholder":91},[33,108824,108825],{"class":35,"line":290},[33,108826,108827],{"class":39},"# Verify the load looked right\n",[33,108829,108830,108832],{"class":35,"line":295},[33,108831,13474],{"class":50},[33,108833,108834],{"class":167},"(df.dtypes)\n",[33,108836,108837,108839],{"class":35,"line":300},[33,108838,13474],{"class":50},[33,108840,39529],{"class":167},[33,108842,108843,108845,108847,108850,108853],{"class":35,"line":317},[33,108844,36397],{"class":163},[33,108846,7935],{"class":167},[33,108848,108849],{"class":54},"\"order_id\"",[33,108851,108852],{"class":167},"].notna().all(), ",[33,108854,108855],{"class":54},"\"order_id column has unexpected nulls\"\n",[14,108857,108858,108859,108861,108862,108866],{},"For large pandas loads that approach your RAM ceiling, use ",[30,108860,21944],{}," to iterate in batches. 
The ",[940,108863,108865],{"href":108864},"\u002Fpython-for-excel-csv-data-processing\u002Fexporting-data-to-csv-formats\u002F","Exporting Data to CSV Formats"," guide shows how to reassemble those chunks and write them back out cleanly.",[424,108868,108870],{"id":108869},"_3-polars-large-files-and-multi-threaded-etl","3. polars — Large Files and Multi-Threaded ETL",[14,108872,108873,108875,108876,108879,108880,108882,108883,108886],{},[30,108874,107465],{}," returns a lazy ",[30,108877,108878],{},"LazyFrame",". Nothing reads from disk until ",[30,108881,107469],{}," is called, and predicate pushdown means polars can skip rows and columns it doesn't need. Use ",[30,108884,108885],{},".collect(engine=\"streaming\")"," (polars ≥ 0.20) to process files that exceed available RAM entirely out-of-core — the Rust scheduler pages data through in fixed-size chunks.",[23,108888,108890],{"className":126,"code":108889,"language":47,"meta":28,"style":28},"# pip install polars\nimport polars as pl\nfrom pathlib import Path\n\nDATA = Path(\"events_2025.csv\")\n\ntry:\n    # scan_csv is lazy — no I\u002FO happens here\n    lf = pl.scan_csv(\n        DATA,\n        infer_schema_length=10_000,  # sample more rows for schema\n        ignore_errors=True,          # skip structurally malformed rows\n        try_parse_dates=True,\n    )\n\n    # Push filters before collect so polars skips irrelevant rows\n    result = (\n        lf\n        .filter(pl.col(\"status\") == \"COMPLETE\")\n        .select([\"event_id\", \"user_id\", \"amount\", \"status\"])\n        .collect(engine=\"streaming\")   # out-of-core for large files\n    )\nexcept Exception as exc:\n    raise SystemExit(f\"polars scan failed: {exc}\") from 
exc\n\nprint(result.shape)\nprint(result.head())\n",[30,108891,108892,108897,108907,108917,108921,108934,108938,108944,108949,108959,108965,108979,108993,109004,109008,109012,109017,109025,109030,109046,109069,109086,109090,109100,109127,109131,109137],{"__ignoreMap":28},[33,108893,108894],{"class":35,"line":36},[33,108895,108896],{"class":39},"# pip install polars\n",[33,108898,108899,108901,108903,108905],{"class":35,"line":43},[33,108900,164],{"class":163},[33,108902,107717],{"class":167},[33,108904,495],{"class":163},[33,108906,107722],{"class":167},[33,108908,108909,108911,108913,108915],{"class":35,"line":61},[33,108910,190],{"class":163},[33,108912,193],{"class":167},[33,108914,164],{"class":163},[33,108916,198],{"class":167},[33,108918,108919],{"class":35,"line":73},[33,108920,92],{"emptyLinePlaceholder":91},[33,108922,108923,108925,108927,108929,108932],{"class":35,"line":88},[33,108924,59605],{"class":50},[33,108926,212],{"class":163},[33,108928,215],{"class":167},[33,108930,108931],{"class":54},"\"events_2025.csv\"",[33,108933,221],{"class":167},[33,108935,108936],{"class":35,"line":95},[33,108937,92],{"emptyLinePlaceholder":91},[33,108939,108940,108942],{"class":35,"line":101},[33,108941,35574],{"class":163},[33,108943,574],{"class":167},[33,108945,108946],{"class":35,"line":171},[33,108947,108948],{"class":39},"    # scan_csv is lazy — no I\u002FO happens here\n",[33,108950,108951,108954,108956],{"class":35,"line":179},[33,108952,108953],{"class":167},"    lf ",[33,108955,242],{"class":163},[33,108957,108958],{"class":167}," pl.scan_csv(\n",[33,108960,108961,108963],{"class":35,"line":187},[33,108962,108711],{"class":50},[33,108964,247],{"class":167},[33,108966,108967,108970,108972,108974,108976],{"class":35,"line":201},[33,108968,108969],{"class":238},"        infer_schema_length",[33,108971,242],{"class":163},[33,108973,104304],{"class":50},[33,108975,25480],{"class":167},[33,108977,108978],{"class":39},"# sample more rows for 
schema\n",[33,108980,108981,108984,108986,108988,108990],{"class":35,"line":206},[33,108982,108983],{"class":238},"        ignore_errors",[33,108985,242],{"class":163},[33,108987,855],{"class":50},[33,108989,98374],{"class":167},[33,108991,108992],{"class":39},"# skip structurally malformed rows\n",[33,108994,108995,108998,109000,109002],{"class":35,"line":224},[33,108996,108997],{"class":238},"        try_parse_dates",[33,108999,242],{"class":163},[33,109001,855],{"class":50},[33,109003,247],{"class":167},[33,109005,109006],{"class":35,"line":229},[33,109007,1202],{"class":167},[33,109009,109010],{"class":35,"line":235},[33,109011,92],{"emptyLinePlaceholder":91},[33,109013,109014],{"class":35,"line":250},[33,109015,109016],{"class":39},"    # Push filters before collect so polars skips irrelevant rows\n",[33,109018,109019,109021,109023],{"class":35,"line":266},[33,109020,8842],{"class":167},[33,109022,242],{"class":163},[33,109024,1415],{"class":167},[33,109026,109027],{"class":35,"line":290},[33,109028,109029],{"class":167},"        lf\n",[33,109031,109032,109035,109037,109039,109041,109044],{"class":35,"line":295},[33,109033,109034],{"class":167},"        .filter(pl.col(",[33,109036,43379],{"class":54},[33,109038,1649],{"class":167},[33,109040,1865],{"class":163},[33,109042,109043],{"class":54}," \"COMPLETE\"",[33,109045,221],{"class":167},[33,109047,109048,109051,109054,109056,109059,109061,109063,109065,109067],{"class":35,"line":300},[33,109049,109050],{"class":167},"        .select([",[33,109052,109053],{"class":54},"\"event_id\"",[33,109055,365],{"class":167},[33,109057,109058],{"class":54},"\"user_id\"",[33,109060,365],{"class":167},[33,109062,4106],{"class":54},[33,109064,365],{"class":167},[33,109066,43379],{"class":54},[33,109068,751],{"class":167},[33,109070,109071,109074,109076,109078,109081,109083],{"class":35,"line":317},[33,109072,109073],{"class":167},"        
.collect(",[33,109075,17351],{"class":238},[33,109077,242],{"class":163},[33,109079,109080],{"class":54},"\"streaming\"",[33,109082,12000],{"class":167},[33,109084,109085],{"class":39},"# out-of-core for large files\n",[33,109087,109088],{"class":35,"line":332},[33,109089,1202],{"class":167},[33,109091,109092,109094,109096,109098],{"class":35,"line":347},[33,109093,35726],{"class":163},[33,109095,783],{"class":50},[33,109097,1852],{"class":163},[33,109099,1855],{"class":167},[33,109101,109102,109104,109106,109108,109110,109113,109115,109117,109119,109121,109123,109125],{"class":35,"line":374},[33,109103,35742],{"class":163},[33,109105,16617],{"class":50},[33,109107,602],{"class":167},[33,109109,4059],{"class":163},[33,109111,109112],{"class":54},"\"polars scan failed: ",[33,109114,1115],{"class":50},[33,109116,6565],{"class":167},[33,109118,1121],{"class":50},[33,109120,274],{"class":54},[33,109122,1649],{"class":167},[33,109124,190],{"class":163},[33,109126,20843],{"class":167},[33,109128,109129],{"class":35,"line":397},[33,109130,92],{"emptyLinePlaceholder":91},[33,109132,109133,109135],{"class":35,"line":653},[33,109134,13474],{"class":50},[33,109136,9223],{"class":167},[33,109138,109139,109141],{"class":35,"line":667},[33,109140,13474],{"class":50},[33,109142,109143],{"class":167},"(result.head())\n",[14,109145,109146],{},"polars is typically 3–5× faster than pandas on the same hardware because it parallelises column operations across all CPU cores. For any file over 500 MB, the switch pays for itself on the first run.",[424,109148,109150],{"id":109149},"_4-pyarrow-columnar-pipelines-and-parquet-integration","4. pyarrow — Columnar Pipelines and Parquet Integration",[14,109152,109153,109156,109157,109160],{},[30,109154,109155],{},"pyarrow.csv.read_csv()"," loads data directly into an Apache Arrow table — the same in-memory format used by Parquet, DuckDB, and Spark. 
If your pipeline ends in ",[30,109158,109159],{},".parquet"," or feeds a DuckDB query, reading with pyarrow avoids a conversion step and enables genuine zero-copy hand-off between components. It's also the fastest option for multi-gigabyte files when you don't need polars' lazy query planner.",[23,109162,109164],{"className":126,"code":109163,"language":47,"meta":28,"style":28},"# pip install pyarrow\nimport pyarrow.csv as pa_csv\nimport pyarrow.parquet as pq\nfrom pathlib import Path\n\nDATA    = Path(\"large_export.csv\")\nOUTPUT  = Path(\"large_export.parquet\")\n\ntry:\n    # ConvertOptions lets you control null tokens and type overrides\n    convert_opts = pa_csv.ConvertOptions(\n        null_values=[\"\", \"NULL\", \"N\u002FA\", \"n\u002Fa\"],\n        strings_can_be_null=True,\n    )\n    read_opts = pa_csv.ReadOptions(\n        block_size=32 * 1024 * 1024,  # 32 MB read blocks\n    )\n    table = pa_csv.read_csv(\n        DATA,\n        read_options=read_opts,\n        convert_options=convert_opts,\n    )\nexcept (OSError, Exception) as exc:\n    raise SystemExit(f\"pyarrow read failed: {exc}\") from exc\n\n# Write directly to Parquet — no pandas round-trip needed\ntry:\n    pq.write_table(table, OUTPUT, compression=\"snappy\")\n    print(f\"Wrote {OUTPUT} ({OUTPUT.stat().st_size \u002F 1_048_576:.1f} MB)\")\nexcept OSError as exc:\n    raise SystemExit(f\"Parquet write failed: {exc}\") from exc\n\n# Or query with DuckDB without copying memory\ntry:\n    import duckdb\n    con = duckdb.connect()\n    con.register(\"events\", table)\n    print(con.execute(\"SELECT status, COUNT(*) FROM events GROUP BY 1\").df())\nexcept ImportError:\n    print(\"duckdb not installed — skipping query 
example\")\n",[30,109165,109166,109171,109181,109193,109203,109207,109220,109233,109237,109243,109248,109258,109284,109295,109299,109309,109332,109336,109345,109351,109361,109371,109375,109393,109420,109424,109429,109435,109454,109487,109497,109524,109528,109533,109539,109546,109556,109567,109580,109588],{"__ignoreMap":28},[33,109167,109168],{"class":35,"line":36},[33,109169,109170],{"class":39},"# pip install pyarrow\n",[33,109172,109173,109175,109177,109179],{"class":35,"line":43},[33,109174,164],{"class":163},[33,109176,107729],{"class":167},[33,109178,495],{"class":163},[33,109180,107734],{"class":167},[33,109182,109183,109185,109188,109190],{"class":35,"line":61},[33,109184,164],{"class":163},[33,109186,109187],{"class":167}," pyarrow.parquet ",[33,109189,495],{"class":163},[33,109191,109192],{"class":167}," pq\n",[33,109194,109195,109197,109199,109201],{"class":35,"line":73},[33,109196,190],{"class":163},[33,109198,193],{"class":167},[33,109200,164],{"class":163},[33,109202,198],{"class":167},[33,109204,109205],{"class":35,"line":88},[33,109206,92],{"emptyLinePlaceholder":91},[33,109208,109209,109211,109213,109215,109218],{"class":35,"line":95},[33,109210,59605],{"class":50},[33,109212,20470],{"class":163},[33,109214,215],{"class":167},[33,109216,109217],{"class":54},"\"large_export.csv\"",[33,109219,221],{"class":167},[33,109221,109222,109224,109226,109228,109231],{"class":35,"line":101},[33,109223,96935],{"class":50},[33,109225,17208],{"class":163},[33,109227,215],{"class":167},[33,109229,109230],{"class":54},"\"large_export.parquet\"",[33,109232,221],{"class":167},[33,109234,109235],{"class":35,"line":171},[33,109236,92],{"emptyLinePlaceholder":91},[33,109238,109239,109241],{"class":35,"line":179},[33,109240,35574],{"class":163},[33,109242,574],{"class":167},[33,109244,109245],{"class":35,"line":187},[33,109246,109247],{"class":39},"    # ConvertOptions lets you control null tokens and type 
overrides\n",[33,109249,109250,109253,109255],{"class":35,"line":201},[33,109251,109252],{"class":167},"    convert_opts ",[33,109254,242],{"class":163},[33,109256,109257],{"class":167}," pa_csv.ConvertOptions(\n",[33,109259,109260,109263,109265,109267,109269,109271,109274,109276,109278,109280,109282],{"class":35,"line":206},[33,109261,109262],{"class":238},"        null_values",[33,109264,242],{"class":163},[33,109266,8309],{"class":167},[33,109268,3198],{"class":54},[33,109270,365],{"class":167},[33,109272,109273],{"class":54},"\"NULL\"",[33,109275,365],{"class":167},[33,109277,27824],{"class":54},[33,109279,365],{"class":167},[33,109281,12438],{"class":54},[33,109283,8935],{"class":167},[33,109285,109286,109289,109291,109293],{"class":35,"line":224},[33,109287,109288],{"class":238},"        strings_can_be_null",[33,109290,242],{"class":163},[33,109292,855],{"class":50},[33,109294,247],{"class":167},[33,109296,109297],{"class":35,"line":229},[33,109298,1202],{"class":167},[33,109300,109301,109304,109306],{"class":35,"line":235},[33,109302,109303],{"class":167},"    read_opts ",[33,109305,242],{"class":163},[33,109307,109308],{"class":167}," pa_csv.ReadOptions(\n",[33,109310,109311,109314,109316,109319,109321,109323,109325,109327,109329],{"class":35,"line":250},[33,109312,109313],{"class":238},"        block_size",[33,109315,242],{"class":163},[33,109317,109318],{"class":50},"32",[33,109320,1156],{"class":163},[33,109322,1159],{"class":50},[33,109324,1156],{"class":163},[33,109326,1159],{"class":50},[33,109328,25480],{"class":167},[33,109330,109331],{"class":39},"# 32 MB read blocks\n",[33,109333,109334],{"class":35,"line":266},[33,109335,1202],{"class":167},[33,109337,109338,109340,109342],{"class":35,"line":290},[33,109339,18621],{"class":167},[33,109341,242],{"class":163},[33,109343,109344],{"class":167}," 
pa_csv.read_csv(\n",[33,109346,109347,109349],{"class":35,"line":295},[33,109348,108711],{"class":50},[33,109350,247],{"class":167},[33,109352,109353,109356,109358],{"class":35,"line":300},[33,109354,109355],{"class":238},"        read_options",[33,109357,242],{"class":163},[33,109359,109360],{"class":167},"read_opts,\n",[33,109362,109363,109366,109368],{"class":35,"line":317},[33,109364,109365],{"class":238},"        convert_options",[33,109367,242],{"class":163},[33,109369,109370],{"class":167},"convert_opts,\n",[33,109372,109373],{"class":35,"line":332},[33,109374,1202],{"class":167},[33,109376,109377,109379,109381,109383,109385,109387,109389,109391],{"class":35,"line":347},[33,109378,35726],{"class":163},[33,109380,17583],{"class":167},[33,109382,43079],{"class":50},[33,109384,365],{"class":167},[33,109386,108194],{"class":50},[33,109388,1649],{"class":167},[33,109390,495],{"class":163},[33,109392,1855],{"class":167},[33,109394,109395,109397,109399,109401,109403,109406,109408,109410,109412,109414,109416,109418],{"class":35,"line":374},[33,109396,35742],{"class":163},[33,109398,16617],{"class":50},[33,109400,602],{"class":167},[33,109402,4059],{"class":163},[33,109404,109405],{"class":54},"\"pyarrow read failed: ",[33,109407,1115],{"class":50},[33,109409,6565],{"class":167},[33,109411,1121],{"class":50},[33,109413,274],{"class":54},[33,109415,1649],{"class":167},[33,109417,190],{"class":163},[33,109419,20843],{"class":167},[33,109421,109422],{"class":35,"line":397},[33,109423,92],{"emptyLinePlaceholder":91},[33,109425,109426],{"class":35,"line":653},[33,109427,109428],{"class":39},"# Write directly to Parquet — no pandas round-trip needed\n",[33,109430,109431,109433],{"class":35,"line":667},[33,109432,35574],{"class":163},[33,109434,574],{"class":167},[33,109436,109437,109440,109442,109444,109447,109449,109452],{"class":35,"line":675},[33,109438,109439],{"class":167},"    pq.write_table(table, 
",[33,109441,96935],{"class":50},[33,109443,365],{"class":167},[33,109445,109446],{"class":238},"compression",[33,109448,242],{"class":163},[33,109450,109451],{"class":54},"\"snappy\"",[33,109453,221],{"class":167},[33,109455,109456,109458,109460,109462,109464,109466,109468,109471,109474,109476,109478,109480,109482,109485],{"class":35,"line":689},[33,109457,7268],{"class":50},[33,109459,602],{"class":167},[33,109461,4059],{"class":163},[33,109463,913],{"class":54},[33,109465,97684],{"class":50},[33,109467,17583],{"class":54},[33,109469,109470],{"class":50},"{OUTPUT",[33,109472,109473],{"class":167},".stat().st_size ",[33,109475,1351],{"class":163},[33,109477,107867],{"class":50},[33,109479,18438],{"class":163},[33,109481,1121],{"class":50},[33,109483,109484],{"class":54}," MB)\"",[33,109486,221],{"class":167},[33,109488,109489,109491,109493,109495],{"class":35,"line":703},[33,109490,35726],{"class":163},[33,109492,107953],{"class":50},[33,109494,1852],{"class":163},[33,109496,1855],{"class":167},[33,109498,109499,109501,109503,109505,109507,109510,109512,109514,109516,109518,109520,109522],{"class":35,"line":714},[33,109500,35742],{"class":163},[33,109502,16617],{"class":50},[33,109504,602],{"class":167},[33,109506,4059],{"class":163},[33,109508,109509],{"class":54},"\"Parquet write failed: ",[33,109511,1115],{"class":50},[33,109513,6565],{"class":167},[33,109515,1121],{"class":50},[33,109517,274],{"class":54},[33,109519,1649],{"class":167},[33,109521,190],{"class":163},[33,109523,20843],{"class":167},[33,109525,109526],{"class":35,"line":723},[33,109527,92],{"emptyLinePlaceholder":91},[33,109529,109530],{"class":35,"line":754},[33,109531,109532],{"class":39},"# Or query with DuckDB without copying memory\n",[33,109534,109535,109537],{"class":35,"line":771},[33,109536,35574],{"class":163},[33,109538,574],{"class":167},[33,109540,109541,109543],{"class":35,"line":777},[33,109542,1627],{"class":163},[33,109544,109545],{"class":167}," 
duckdb\n",[33,109547,109548,109551,109553],{"class":35,"line":788},[33,109549,109550],{"class":167},"    con ",[33,109552,242],{"class":163},[33,109554,109555],{"class":167}," duckdb.connect()\n",[33,109557,109558,109561,109564],{"class":35,"line":804},[33,109559,109560],{"class":167},"    con.register(",[33,109562,109563],{"class":54},"\"events\"",[33,109565,109566],{"class":167},", table)\n",[33,109568,109569,109571,109574,109577],{"class":35,"line":809},[33,109570,7268],{"class":50},[33,109572,109573],{"class":167},"(con.execute(",[33,109575,109576],{"class":54},"\"SELECT status, COUNT(*) FROM events GROUP BY 1\"",[33,109578,109579],{"class":167},").df())\n",[33,109581,109582,109584,109586],{"class":35,"line":819},[33,109583,35726],{"class":163},[33,109585,40488],{"class":50},[33,109587,574],{"class":167},[33,109589,109590,109592,109594,109597],{"class":35,"line":829},[33,109591,7268],{"class":50},[33,109593,602],{"class":167},[33,109595,109596],{"class":54},"\"duckdb not installed — skipping query example\"",[33,109598,221],{"class":167},[14,109600,109601,109602,109605,109606,109609,109610,109613,109614,109616],{},"When you need pyarrow's streaming equivalent for very large files, use ",[30,109603,109604],{},"pyarrow.csv.open_csv()"," which returns a ",[30,109607,109608],{},"CSVStreamingReader"," that yields ",[30,109611,109612],{},"RecordBatch"," objects — Arrow's equivalent of polars' streaming collect. This is especially useful when extracting structured data from documents before passing them into an Arrow pipeline; see ",[940,109615,948],{"href":947}," for how that upstream step works.",[2537,109618],{},[18,109620,35802],{"id":35801},[424,109622,109624,109625],{"id":109623},"large-file-with-pandas-chunksize","Large File with pandas ",[30,109626,21944],{},[14,109628,109629,109630,109632],{},"When you cannot switch libraries but the file is too large to load at once, iterate in chunks and aggregate progressively. 
Memory stays bounded to ",[30,109631,21944],{}," rows at a time.",[23,109634,109636],{"className":126,"code":109635,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nDATA = Path(\"huge_report.csv\")\n\ntotals = {}\ntry:\n    for chunk in pd.read_csv(DATA, chunksize=200_000, low_memory=False):\n        for key, val in chunk.groupby(\"region\")[\"revenue\"].sum().items():\n            totals[key] = totals.get(key, 0) + val\nexcept (OSError, pd.errors.ParserError) as exc:\n    raise SystemExit(f\"Chunked read failed: {exc}\") from exc\n\nprint(totals)\n",[30,109637,109638,109642,109652,109662,109666,109679,109683,109692,109698,109729,109751,109770,109784,109811,109815],{"__ignoreMap":28},[33,109639,109640],{"class":35,"line":36},[33,109641,8895],{"class":39},[33,109643,109644,109646,109648,109650],{"class":35,"line":43},[33,109645,164],{"class":163},[33,109647,492],{"class":167},[33,109649,495],{"class":163},[33,109651,498],{"class":167},[33,109653,109654,109656,109658,109660],{"class":35,"line":61},[33,109655,190],{"class":163},[33,109657,193],{"class":167},[33,109659,164],{"class":163},[33,109661,198],{"class":167},[33,109663,109664],{"class":35,"line":73},[33,109665,92],{"emptyLinePlaceholder":91},[33,109667,109668,109670,109672,109674,109677],{"class":35,"line":88},[33,109669,59605],{"class":50},[33,109671,212],{"class":163},[33,109673,215],{"class":167},[33,109675,109676],{"class":54},"\"huge_report.csv\"",[33,109678,221],{"class":167},[33,109680,109681],{"class":35,"line":95},[33,109682,92],{"emptyLinePlaceholder":91},[33,109684,109685,109688,109690],{"class":35,"line":101},[33,109686,109687],{"class":167},"totals 
",[33,109689,242],{"class":163},[33,109691,14093],{"class":167},[33,109693,109694,109696],{"class":35,"line":171},[33,109695,35574],{"class":163},[33,109697,574],{"class":167},[33,109699,109700,109702,109704,109706,109708,109710,109712,109714,109716,109719,109721,109723,109725,109727],{"class":35,"line":179},[33,109701,656],{"class":163},[33,109703,21937],{"class":167},[33,109705,662],{"class":163},[33,109707,9481],{"class":167},[33,109709,59605],{"class":50},[33,109711,365],{"class":167},[33,109713,21944],{"class":238},[33,109715,242],{"class":163},[33,109717,109718],{"class":50},"200_000",[33,109720,365],{"class":167},[33,109722,108025],{"class":238},[33,109724,242],{"class":163},[33,109726,902],{"class":50},[33,109728,1737],{"class":167},[33,109730,109731,109733,109736,109738,109741,109743,109746,109748],{"class":35,"line":187},[33,109732,5973],{"class":163},[33,109734,109735],{"class":167}," key, val ",[33,109737,662],{"class":163},[33,109739,109740],{"class":167}," chunk.groupby(",[33,109742,16649],{"class":54},[33,109744,109745],{"class":167},")[",[33,109747,16465],{"class":54},[33,109749,109750],{"class":167},"].sum().items():\n",[33,109752,109753,109756,109758,109761,109763,109765,109767],{"class":35,"line":201},[33,109754,109755],{"class":167},"            totals[key] ",[33,109757,242],{"class":163},[33,109759,109760],{"class":167}," totals.get(key, ",[33,109762,748],{"class":50},[33,109764,1649],{"class":167},[33,109766,1811],{"class":163},[33,109768,109769],{"class":167}," 
val\n",[33,109771,109772,109774,109776,109778,109780,109782],{"class":35,"line":206},[33,109773,35726],{"class":163},[33,109775,17583],{"class":167},[33,109777,43079],{"class":50},[33,109779,108042],{"class":167},[33,109781,495],{"class":163},[33,109783,1855],{"class":167},[33,109785,109786,109788,109790,109792,109794,109797,109799,109801,109803,109805,109807,109809],{"class":35,"line":224},[33,109787,35742],{"class":163},[33,109789,16617],{"class":50},[33,109791,602],{"class":167},[33,109793,4059],{"class":163},[33,109795,109796],{"class":54},"\"Chunked read failed: ",[33,109798,1115],{"class":50},[33,109800,6565],{"class":167},[33,109802,1121],{"class":50},[33,109804,274],{"class":54},[33,109806,1649],{"class":167},[33,109808,190],{"class":163},[33,109810,20843],{"class":167},[33,109812,109813],{"class":35,"line":229},[33,109814,92],{"emptyLinePlaceholder":91},[33,109816,109817,109819],{"class":35,"line":235},[33,109818,13474],{"class":50},[33,109820,109821],{"class":167},"(totals)\n",[424,109823,109825],{"id":109824},"polars-out-of-core-streaming-aggregate","polars Out-of-Core Streaming Aggregate",[14,109827,109828],{},"For the same pattern on files exceeding RAM, polars' lazy engine does the work for you — no manual chunk management.",[23,109830,109832],{"className":126,"code":109831,"language":47,"meta":28,"style":28},"# pip install polars\nimport polars as pl\nfrom pathlib import Path\n\nDATA = Path(\"huge_report.csv\")\n\ntry:\n    result = (\n        pl.scan_csv(DATA, ignore_errors=True)\n        .group_by(\"region\")\n        .agg(pl.col(\"revenue\").sum())\n        .collect(engine=\"streaming\")\n    )\n    print(result.sort(\"revenue\", descending=True))\nexcept Exception as exc:\n    raise SystemExit(f\"polars streaming aggregate failed: {exc}\") from 
exc\n",[30,109833,109834,109838,109848,109858,109862,109874,109878,109884,109892,109909,109918,109928,109940,109944,109964,109974],{"__ignoreMap":28},[33,109835,109836],{"class":35,"line":36},[33,109837,108896],{"class":39},[33,109839,109840,109842,109844,109846],{"class":35,"line":43},[33,109841,164],{"class":163},[33,109843,107717],{"class":167},[33,109845,495],{"class":163},[33,109847,107722],{"class":167},[33,109849,109850,109852,109854,109856],{"class":35,"line":61},[33,109851,190],{"class":163},[33,109853,193],{"class":167},[33,109855,164],{"class":163},[33,109857,198],{"class":167},[33,109859,109860],{"class":35,"line":73},[33,109861,92],{"emptyLinePlaceholder":91},[33,109863,109864,109866,109868,109870,109872],{"class":35,"line":88},[33,109865,59605],{"class":50},[33,109867,212],{"class":163},[33,109869,215],{"class":167},[33,109871,109676],{"class":54},[33,109873,221],{"class":167},[33,109875,109876],{"class":35,"line":95},[33,109877,92],{"emptyLinePlaceholder":91},[33,109879,109880,109882],{"class":35,"line":101},[33,109881,35574],{"class":163},[33,109883,574],{"class":167},[33,109885,109886,109888,109890],{"class":35,"line":171},[33,109887,8842],{"class":167},[33,109889,242],{"class":163},[33,109891,1415],{"class":167},[33,109893,109894,109897,109899,109901,109903,109905,109907],{"class":35,"line":179},[33,109895,109896],{"class":167},"        pl.scan_csv(",[33,109898,59605],{"class":50},[33,109900,365],{"class":167},[33,109902,70586],{"class":238},[33,109904,242],{"class":163},[33,109906,855],{"class":50},[33,109908,221],{"class":167},[33,109910,109911,109914,109916],{"class":35,"line":187},[33,109912,109913],{"class":167},"        .group_by(",[33,109915,16649],{"class":54},[33,109917,221],{"class":167},[33,109919,109920,109923,109925],{"class":35,"line":201},[33,109921,109922],{"class":167},"        
.agg(pl.col(",[33,109924,16465],{"class":54},[33,109926,109927],{"class":167},").sum())\n",[33,109929,109930,109932,109934,109936,109938],{"class":35,"line":206},[33,109931,109073],{"class":167},[33,109933,17351],{"class":238},[33,109935,242],{"class":163},[33,109937,109080],{"class":54},[33,109939,221],{"class":167},[33,109941,109942],{"class":35,"line":224},[33,109943,1202],{"class":167},[33,109945,109946,109948,109951,109953,109955,109958,109960,109962],{"class":35,"line":229},[33,109947,7268],{"class":50},[33,109949,109950],{"class":167},"(result.sort(",[33,109952,16465],{"class":54},[33,109954,365],{"class":167},[33,109956,109957],{"class":238},"descending",[33,109959,242],{"class":163},[33,109961,855],{"class":50},[33,109963,371],{"class":167},[33,109965,109966,109968,109970,109972],{"class":35,"line":235},[33,109967,35726],{"class":163},[33,109969,783],{"class":50},[33,109971,1852],{"class":163},[33,109973,1855],{"class":167},[33,109975,109976,109978,109980,109982,109984,109987,109989,109991,109993,109995,109997,109999],{"class":35,"line":250},[33,109977,35742],{"class":163},[33,109979,16617],{"class":50},[33,109981,602],{"class":167},[33,109983,4059],{"class":163},[33,109985,109986],{"class":54},"\"polars streaming aggregate failed: ",[33,109988,1115],{"class":50},[33,109990,6565],{"class":167},[33,109992,1121],{"class":50},[33,109994,274],{"class":54},[33,109996,1649],{"class":167},[33,109998,190],{"class":163},[33,110000,20843],{"class":167},[2537,110002],{},[18,110004,110006],{"id":110005},"verification-confirming-the-right-parser-worked","Verification: Confirming the Right Parser Worked",[14,110008,110009],{},"After loading, run these three assertions to confirm the parser produced a usable result:",[23,110011,110013],{"className":126,"code":110012,"language":47,"meta":28,"style":28},"# pip install pandas polars pyarrow\n# Run whichever block matches your chosen library\n\n# --- pandas ---\nimport pandas as pd\nfrom pathlib import Path\n\ndf = 
pd.read_csv(Path(\"output.csv\"))\n\nassert not df.empty, \"DataFrame is empty — check delimiter or encoding\"\nassert df.isnull().mean().max() \u003C 0.5, \"More than 50% nulls in at least one column\"\nassert df.select_dtypes(\"object\").shape[1] \u003C df.shape[1], (\n    \"All columns are object dtype — type inference may have failed\"\n)\nprint(\"pandas load verified:\", df.shape)\n\n# --- polars ---\nimport polars as pl\n\ndf_pl = pl.read_csv(Path(\"output.csv\"))\nassert df_pl.height > 0\nassert df_pl.null_count().row(0) != tuple([df_pl.height] * df_pl.width), (\n    \"All-null column detected\"\n)\nprint(\"polars load verified:\", df_pl.shape)\n\n# --- pyarrow ---\nimport pyarrow.csv as pa_csv\n\ntbl = pa_csv.read_csv(Path(\"output.csv\"))\nassert tbl.num_rows > 0\nassert tbl.num_columns > 1\nprint(\"pyarrow load verified:\", tbl.shape)\n",[30,110014,110015,110020,110025,110029,110034,110044,110054,110058,110071,110075,110086,110103,110128,110133,110137,110149,110153,110158,110168,110172,110186,110197,110221,110226,110230,110242,110246,110251,110261,110265,110278,110289,110300],{"__ignoreMap":28},[33,110016,110017],{"class":35,"line":36},[33,110018,110019],{"class":39},"# pip install pandas polars pyarrow\n",[33,110021,110022],{"class":35,"line":43},[33,110023,110024],{"class":39},"# Run whichever block matches your chosen library\n",[33,110026,110027],{"class":35,"line":61},[33,110028,92],{"emptyLinePlaceholder":91},[33,110030,110031],{"class":35,"line":73},[33,110032,110033],{"class":39},"# --- pandas 
---\n",[33,110035,110036,110038,110040,110042],{"class":35,"line":88},[33,110037,164],{"class":163},[33,110039,492],{"class":167},[33,110041,495],{"class":163},[33,110043,498],{"class":167},[33,110045,110046,110048,110050,110052],{"class":35,"line":95},[33,110047,190],{"class":163},[33,110049,193],{"class":167},[33,110051,164],{"class":163},[33,110053,198],{"class":167},[33,110055,110056],{"class":35,"line":101},[33,110057,92],{"emptyLinePlaceholder":91},[33,110059,110060,110062,110064,110066,110069],{"class":35,"line":171},[33,110061,13459],{"class":167},[33,110063,242],{"class":163},[33,110065,46182],{"class":167},[33,110067,110068],{"class":54},"\"output.csv\"",[33,110070,371],{"class":167},[33,110072,110073],{"class":35,"line":179},[33,110074,92],{"emptyLinePlaceholder":91},[33,110076,110077,110079,110081,110083],{"class":35,"line":187},[33,110078,36397],{"class":163},[33,110080,620],{"class":163},[33,110082,39770],{"class":167},[33,110084,110085],{"class":54},"\"DataFrame is empty — check delimiter or encoding\"\n",[33,110087,110088,110090,110093,110095,110098,110100],{"class":35,"line":201},[33,110089,36397],{"class":163},[33,110091,110092],{"class":167}," df.isnull().mean().max() ",[33,110094,4043],{"class":163},[33,110096,110097],{"class":50}," 0.5",[33,110099,365],{"class":167},[33,110101,110102],{"class":54},"\"More than 50% nulls in at least one column\"\n",[33,110104,110105,110107,110109,110112,110115,110117,110119,110121,110123,110125],{"class":35,"line":206},[33,110106,36397],{"class":163},[33,110108,23604],{"class":167},[33,110110,110111],{"class":54},"\"object\"",[33,110113,110114],{"class":167},").shape[",[33,110116,734],{"class":50},[33,110118,763],{"class":167},[33,110120,4043],{"class":163},[33,110122,9516],{"class":167},[33,110124,734],{"class":50},[33,110126,110127],{"class":167},"], (\n",[33,110129,110130],{"class":35,"line":224},[33,110131,110132],{"class":54},"    \"All columns are object dtype — type inference may have 
failed\"\n",[33,110134,110135],{"class":35,"line":229},[33,110136,221],{"class":167},[33,110138,110139,110141,110143,110146],{"class":35,"line":235},[33,110140,13474],{"class":50},[33,110142,602],{"class":167},[33,110144,110145],{"class":54},"\"pandas load verified:\"",[33,110147,110148],{"class":167},", df.shape)\n",[33,110150,110151],{"class":35,"line":250},[33,110152,92],{"emptyLinePlaceholder":91},[33,110154,110155],{"class":35,"line":266},[33,110156,110157],{"class":39},"# --- polars ---\n",[33,110159,110160,110162,110164,110166],{"class":35,"line":290},[33,110161,164],{"class":163},[33,110163,107717],{"class":167},[33,110165,495],{"class":163},[33,110167,107722],{"class":167},[33,110169,110170],{"class":35,"line":295},[33,110171,92],{"emptyLinePlaceholder":91},[33,110173,110174,110177,110179,110182,110184],{"class":35,"line":300},[33,110175,110176],{"class":167},"df_pl ",[33,110178,242],{"class":163},[33,110180,110181],{"class":167}," pl.read_csv(Path(",[33,110183,110068],{"class":54},[33,110185,371],{"class":167},[33,110187,110188,110190,110193,110195],{"class":35,"line":317},[33,110189,36397],{"class":163},[33,110191,110192],{"class":167}," df_pl.height ",[33,110194,6009],{"class":163},[33,110196,28914],{"class":50},[33,110198,110199,110201,110204,110206,110208,110210,110213,110216,110218],{"class":35,"line":332},[33,110200,36397],{"class":163},[33,110202,110203],{"class":167}," df_pl.null_count().row(",[33,110205,748],{"class":50},[33,110207,1649],{"class":167},[33,110209,17877],{"class":163},[33,110211,110212],{"class":50}," tuple",[33,110214,110215],{"class":167},"([df_pl.height] ",[33,110217,1769],{"class":163},[33,110219,110220],{"class":167}," df_pl.width), (\n",[33,110222,110223],{"class":35,"line":347},[33,110224,110225],{"class":54},"    \"All-null column 
detected\"\n",[33,110227,110228],{"class":35,"line":374},[33,110229,221],{"class":167},[33,110231,110232,110234,110236,110239],{"class":35,"line":397},[33,110233,13474],{"class":50},[33,110235,602],{"class":167},[33,110237,110238],{"class":54},"\"polars load verified:\"",[33,110240,110241],{"class":167},", df_pl.shape)\n",[33,110243,110244],{"class":35,"line":653},[33,110245,92],{"emptyLinePlaceholder":91},[33,110247,110248],{"class":35,"line":667},[33,110249,110250],{"class":39},"# --- pyarrow ---\n",[33,110252,110253,110255,110257,110259],{"class":35,"line":675},[33,110254,164],{"class":163},[33,110256,107729],{"class":167},[33,110258,495],{"class":163},[33,110260,107734],{"class":167},[33,110262,110263],{"class":35,"line":689},[33,110264,92],{"emptyLinePlaceholder":91},[33,110266,110267,110269,110271,110274,110276],{"class":35,"line":703},[33,110268,62048],{"class":167},[33,110270,242],{"class":163},[33,110272,110273],{"class":167}," pa_csv.read_csv(Path(",[33,110275,110068],{"class":54},[33,110277,371],{"class":167},[33,110279,110280,110282,110285,110287],{"class":35,"line":714},[33,110281,36397],{"class":163},[33,110283,110284],{"class":167}," tbl.num_rows ",[33,110286,6009],{"class":163},[33,110288,28914],{"class":50},[33,110290,110291,110293,110296,110298],{"class":35,"line":723},[33,110292,36397],{"class":163},[33,110294,110295],{"class":167}," tbl.num_columns ",[33,110297,6009],{"class":163},[33,110299,17709],{"class":50},[33,110301,110302,110304,110306,110309],{"class":35,"line":754},[33,110303,13474],{"class":50},[33,110305,602],{"class":167},[33,110307,110308],{"class":54},"\"pyarrow load verified:\"",[33,110310,110311],{"class":167},", tbl.shape)\n",[14,110313,110314,110315,110318,110319,110322],{},"If the pandas assertion about object dtype fires, your delimiter was not detected correctly — pass ",[30,110316,110317],{},"sep=None, engine=\"python\""," to trigger auto-detection, or use ",[30,110320,110321],{},"csv.Sniffer"," first as shown in the stdlib 
snippet above.",[2537,110324],{},[18,110326,6918],{"id":6917},[4211,110328,110329,110334,110339,110344],{},[4214,110330,110331,110333],{},[940,110332,27254],{"href":27253}," — resolve UnicodeDecodeError before any parser can load the file",[4214,110335,110336,110338],{},[940,110337,107447],{"href":9598}," — what to do with your DataFrame after the parser succeeds",[4214,110340,110341,110343],{},[940,110342,108865],{"href":108864}," — write processed data back to CSV, Parquet, or Excel",[4214,110345,110346,110348],{},[940,110347,948],{"href":947}," — parse tables from PDFs into the same DataFrame pipeline",[14,110350,6947,110351,3035],{},[940,110352,107447],{"href":9598},[6953,110354,9614],{},{"title":28,"searchDepth":43,"depth":43,"links":110356},[110357,110358,110359,110360,110367,110372,110373],{"id":107451,"depth":43,"text":107452},{"id":107481,"depth":43,"text":107482},{"id":107652,"depth":43,"text":107653},{"id":108277,"depth":43,"text":108278,"children":110361},[110362,110364,110365,110366],{"id":108281,"depth":61,"text":110363},"1. stdlib csv — Streaming Row-by-Row",{"id":108628,"depth":61,"text":108629},{"id":108869,"depth":61,"text":108870},{"id":109149,"depth":61,"text":109150},{"id":35801,"depth":43,"text":35802,"children":110368},[110369,110371],{"id":109623,"depth":61,"text":110370},"Large File with pandas chunksize",{"id":109824,"depth":61,"text":109825},{"id":110005,"depth":43,"text":110006},{"id":6917,"depth":43,"text":6918},"Best Libraries for CSV Parsing","Compare Python's csv module, pandas, polars, and pyarrow for CSV parsing. 
Pick the right tool by file size, memory budget, and downstream pipeline needs.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Fbest-python-libraries-for-csv-parsing",{"title":107425,"description":110375},"Best Python Libraries for CSV Parsing in 2026 — csv vs pandas vs polars vs pyarrow","python-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Fbest-python-libraries-for-csv-parsing\u002Findex",[47,107436,9630,107441],"-9Ri5_bLphHjMZnEvkbSa9jw4cODPcHI7sFQOJpybDs",{"id":110384,"title":110385,"body":110386,"breadcrumbTitle":27254,"canonical":6977,"date":46387,"description":112811,"draft":6980,"extension":6981,"image":6977,"meta":112812,"navigation":91,"path":112813,"robots":6977,"seo":112814,"seoTitle":110385,"stem":112815,"tags":112816,"updatedAt":6978,"__hash__":112817},"content\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Ffixing-encoding-errors-in-csv-files\u002Findex.md","Fixing UnicodeDecodeError in CSV Files: Encoding Errors with Pandas",{"type":7,"value":110387,"toc":112791},[110388,110395,110402,110404,110424,110428,110435,110575,110585,110589,110595,110787,110794,110798,110887,110891,110895,110907,111055,111059,111080,111203,111207,111232,111358,111368,111372,111379,111383,111399,111594,111598,111605,111813,111829,111833,111836,112244,112257,112261,112267,112538,112542,112654,112656,112744,112750,112752,112754,112784,112788],[10,110389,110391,110392],{"id":110390},"fixing-unicodedecodeerror-in-csv-files-utf-8-codec-cant-decode-byte-0x96","Fixing UnicodeDecodeError in CSV Files: ",[30,110393,110394],{},"'utf-8' codec can't decode byte 0x96",[14,110396,110397,110398,110401],{},"The error ",[30,110399,110400],{},"UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 142: invalid start byte"," stops your script cold every time you try to load a legacy or Windows-exported CSV with pandas. 
This page gives you a deterministic workflow: detect the real encoding, apply the right fix, handle edge cases like BOM and mixed encodings, and verify the result.",[18,110403,7021],{"id":7020},[14,110405,110406,110407,110410,110411,110413,110414,72819,110417,110420,110421,3035],{},"Pandas defaults to UTF-8. Byte ",[30,110408,110409],{},"0x96"," is a valid Windows-1252 character — the en-dash ",[30,110412,72819],{}," — but it is an illegal start byte in UTF-8. The moment the C parser hits that byte it raises and halts; there is no partial load. The same pattern applies to ",[30,110415,110416],{},"0x91",[30,110418,110419],{},"0x9F"," (curly quotes, bullet, em-dash) and to other cp1252 punctuation bytes such as the ellipsis at 0x85, all of which Windows applications write as single bytes that UTF-8 rejects. Regional ERP exports, older Excel CSV saves, and accounting software from any era are the common culprits. For a broader look at which parser handles which file quirks, see ",[940,110422,107425],{"href":110423},"\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Fbest-python-libraries-for-csv-parsing\u002F",[18,110425,110427],{"id":110426},"step-1-detect-the-encoding-with-chardet","Step 1 — Detect the Encoding with chardet",[14,110429,110430,110431,110434],{},"Before guessing, measure. 
",[30,110432,110433],{},"chardet"," reads a binary sample and returns a codec name and a confidence score.",[23,110436,110438],{"className":126,"code":110437,"language":47,"meta":28,"style":28},"# pip install chardet pandas\nfrom pathlib import Path\nimport chardet\nimport pandas as pd\n\ncsv_path = Path(\"data\u002Fexport.csv\")\n\ntry:\n    raw_sample = csv_path.read_bytes()[:50_000]  # first 50 KB is enough\n    result = chardet.detect(raw_sample)\n    print(result)  # {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}\nexcept OSError as e:\n    raise SystemExit(f\"Cannot read file: {e}\") from e\n",[30,110439,110440,110445,110455,110462,110472,110476,110490,110494,110500,110519,110528,110538,110548],{"__ignoreMap":28},[33,110441,110442],{"class":35,"line":36},[33,110443,110444],{"class":39},"# pip install chardet pandas\n",[33,110446,110447,110449,110451,110453],{"class":35,"line":43},[33,110448,190],{"class":163},[33,110450,193],{"class":167},[33,110452,164],{"class":163},[33,110454,198],{"class":167},[33,110456,110457,110459],{"class":35,"line":61},[33,110458,164],{"class":163},[33,110460,110461],{"class":167}," chardet\n",[33,110463,110464,110466,110468,110470],{"class":35,"line":73},[33,110465,164],{"class":163},[33,110467,492],{"class":167},[33,110469,495],{"class":163},[33,110471,498],{"class":167},[33,110473,110474],{"class":35,"line":88},[33,110475,92],{"emptyLinePlaceholder":91},[33,110477,110478,110481,110483,110485,110488],{"class":35,"line":95},[33,110479,110480],{"class":167},"csv_path 
",[33,110482,242],{"class":163},[33,110484,215],{"class":167},[33,110486,110487],{"class":54},"\"data\u002Fexport.csv\"",[33,110489,221],{"class":167},[33,110491,110492],{"class":35,"line":101},[33,110493,92],{"emptyLinePlaceholder":91},[33,110495,110496,110498],{"class":35,"line":171},[33,110497,35574],{"class":163},[33,110499,574],{"class":167},[33,110501,110502,110505,110507,110510,110513,110516],{"class":35,"line":179},[33,110503,110504],{"class":167},"    raw_sample ",[33,110506,242],{"class":163},[33,110508,110509],{"class":167}," csv_path.read_bytes()[:",[33,110511,110512],{"class":50},"50_000",[33,110514,110515],{"class":167},"]  ",[33,110517,110518],{"class":39},"# first 50 KB is enough\n",[33,110520,110521,110523,110525],{"class":35,"line":187},[33,110522,8842],{"class":167},[33,110524,242],{"class":163},[33,110526,110527],{"class":167}," chardet.detect(raw_sample)\n",[33,110529,110530,110532,110535],{"class":35,"line":201},[33,110531,7268],{"class":50},[33,110533,110534],{"class":167},"(result)  ",[33,110536,110537],{"class":39},"# {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}\n",[33,110539,110540,110542,110544,110546],{"class":35,"line":206},[33,110541,35726],{"class":163},[33,110543,107953],{"class":50},[33,110545,1852],{"class":163},[33,110547,7583],{"class":167},[33,110549,110550,110552,110554,110556,110558,110561,110563,110565,110567,110569,110571,110573],{"class":35,"line":224},[33,110551,35742],{"class":163},[33,110553,16617],{"class":50},[33,110555,602],{"class":167},[33,110557,4059],{"class":163},[33,110559,110560],{"class":54},"\"Cannot read file: ",[33,110562,1115],{"class":50},[33,110564,7602],{"class":167},[33,110566,1121],{"class":50},[33,110568,274],{"class":54},[33,110570,1649],{"class":167},[33,110572,190],{"class":163},[33,110574,7613],{"class":167},[14,110576,110577,110578,36608,110581,110584],{},"A confidence score above 0.7 is reliable enough to use directly. 
Below 0.7 the detector is uncertain — fall back to the manual ladder in the next section. Always read in binary mode (",[30,110579,110580],{},"read_bytes()",[30,110582,110583],{},"'rb'",") so no premature decoding occurs before chardet has seen the raw bytes.",[18,110586,110588],{"id":110587},"step-2-detect-with-charset-normalizer-alternative","Step 2 — Detect with charset-normalizer (Alternative)",[14,110590,110591,110594],{},[30,110592,110593],{},"charset-normalizer"," is a pure-Python alternative with no C extension dependency, making it friendlier in restricted environments. Its API mirrors chardet intentionally.",[23,110596,110598],{"className":126,"code":110597,"language":47,"meta":28,"style":28},"# pip install charset-normalizer pandas\nfrom pathlib import Path\nfrom charset_normalizer import detect\nimport pandas as pd\n\ncsv_path = Path(\"data\u002Fexport.csv\")\n\ntry:\n    raw_sample = csv_path.read_bytes()[:50_000]\n    result = detect(raw_sample)\n    detected_encoding = result.get(\"encoding\")\n    confidence = result.get(\"confidence\", 0.0)\n    print(f\"Detected: {detected_encoding}  confidence: {confidence:.2f}\")\nexcept OSError as e:\n    raise SystemExit(f\"Cannot read file: {e}\") from e\n",[30,110599,110600,110605,110615,110627,110637,110641,110653,110657,110663,110675,110684,110699,110717,110751,110761],{"__ignoreMap":28},[33,110601,110602],{"class":35,"line":36},[33,110603,110604],{"class":39},"# pip install charset-normalizer pandas\n",[33,110606,110607,110609,110611,110613],{"class":35,"line":43},[33,110608,190],{"class":163},[33,110610,193],{"class":167},[33,110612,164],{"class":163},[33,110614,198],{"class":167},[33,110616,110617,110619,110622,110624],{"class":35,"line":61},[33,110618,190],{"class":163},[33,110620,110621],{"class":167}," charset_normalizer ",[33,110623,164],{"class":163},[33,110625,110626],{"class":167}," 
detect\n",[33,110628,110629,110631,110633,110635],{"class":35,"line":73},[33,110630,164],{"class":163},[33,110632,492],{"class":167},[33,110634,495],{"class":163},[33,110636,498],{"class":167},[33,110638,110639],{"class":35,"line":88},[33,110640,92],{"emptyLinePlaceholder":91},[33,110642,110643,110645,110647,110649,110651],{"class":35,"line":95},[33,110644,110480],{"class":167},[33,110646,242],{"class":163},[33,110648,215],{"class":167},[33,110650,110487],{"class":54},[33,110652,221],{"class":167},[33,110654,110655],{"class":35,"line":101},[33,110656,92],{"emptyLinePlaceholder":91},[33,110658,110659,110661],{"class":35,"line":171},[33,110660,35574],{"class":163},[33,110662,574],{"class":167},[33,110664,110665,110667,110669,110671,110673],{"class":35,"line":179},[33,110666,110504],{"class":167},[33,110668,242],{"class":163},[33,110670,110509],{"class":167},[33,110672,110512],{"class":50},[33,110674,9202],{"class":167},[33,110676,110677,110679,110681],{"class":35,"line":187},[33,110678,8842],{"class":167},[33,110680,242],{"class":163},[33,110682,110683],{"class":167}," detect(raw_sample)\n",[33,110685,110686,110689,110691,110694,110697],{"class":35,"line":201},[33,110687,110688],{"class":167},"    detected_encoding ",[33,110690,242],{"class":163},[33,110692,110693],{"class":167}," result.get(",[33,110695,110696],{"class":54},"\"encoding\"",[33,110698,221],{"class":167},[33,110700,110701,110704,110706,110708,110711,110713,110715],{"class":35,"line":206},[33,110702,110703],{"class":167},"    confidence ",[33,110705,242],{"class":163},[33,110707,110693],{"class":167},[33,110709,110710],{"class":54},"\"confidence\"",[33,110712,365],{"class":167},[33,110714,84626],{"class":50},[33,110716,221],{"class":167},[33,110718,110719,110721,110723,110725,110728,110730,110733,110735,110738,110740,110743,110745,110747,110749],{"class":35,"line":224},[33,110720,7268],{"class":50},[33,110722,602],{"class":167},[33,110724,4059],{"class":163},[33,110726,110727],{"class":54},"\"Detected: 
",[33,110729,1115],{"class":50},[33,110731,110732],{"class":167},"detected_encoding",[33,110734,1121],{"class":50},[33,110736,110737],{"class":54},"  confidence: ",[33,110739,1115],{"class":50},[33,110741,110742],{"class":167},"confidence",[33,110744,55819],{"class":163},[33,110746,1121],{"class":50},[33,110748,274],{"class":54},[33,110750,221],{"class":167},[33,110752,110753,110755,110757,110759],{"class":35,"line":229},[33,110754,35726],{"class":163},[33,110756,107953],{"class":50},[33,110758,1852],{"class":163},[33,110760,7583],{"class":167},[33,110762,110763,110765,110767,110769,110771,110773,110775,110777,110779,110781,110783,110785],{"class":35,"line":235},[33,110764,35742],{"class":163},[33,110766,16617],{"class":50},[33,110768,602],{"class":167},[33,110770,4059],{"class":163},[33,110772,110560],{"class":54},[33,110774,1115],{"class":50},[33,110776,7602],{"class":167},[33,110778,1121],{"class":50},[33,110780,274],{"class":54},[33,110782,1649],{"class":167},[33,110784,190],{"class":163},[33,110786,7613],{"class":167},[14,110788,110789,110790,110793],{},"Both libraries return the same dictionary shape: ",[30,110791,110792],{},"{\"encoding\": str | None, \"confidence\": float}",". 
Use either; the rest of the workflow is identical.",[18,110795,110797],{"id":110796},"step-3-encoding-detection-flow","Step 3 — Encoding Detection Flow",[2540,110799,2547,110802,2547,110805,2547,2547,110808,2547,110811,2547,2547,110816,2547,2547,110819,2547,110822,2547,2547,110825,2547,2547,110827,2547,110831,2547,110833,2547,2547,110837,2547,110839,2547,110843,2547,110846,2547,110850,2547,2547,110853,2547,110855,2547,2547,110857,2547,110860,2547,110864,2547,2547,110867,2547,110870,2547,2547,110874,2547,2547,110878],{"viewBox":110800,"role":2543,"ariaLabel":110801,"style":2546},"0 0 700 380","Encoding detection decision flowchart",[2549,110803,110804],{},"Encoding Detection Decision Flow",[2553,110806,110807],{},"Flowchart showing: open file in binary mode, run chardet.detect(), check confidence above 0.7, if yes use detected encoding, if no try utf-8-sig then cp1252 then latin-1.",[2585,110809],{"id":110810,"x":49869,"y":2587,"width":2611,"height":26341,"rx":11103,"fill":11166,"stroke":11166,"style":2594},"enc-fix-start",[2000,110812,110815],{"x":110813,"y":110814,"fill":2592,"style":2600},"350","47","Open file in binary mode",[35,110817],{"x1":110813,"y1":17014,"x2":110813,"y2":2630,"stroke":2583,"markerEnd":110818,"style":2594},"url(#enc-fix-arrow)",[2585,110820],{"id":110821,"x":2701,"y":2630,"width":2618,"height":26341,"rx":2681,"fill":11165,"stroke":11166,"style":2594},"enc-fix-detect",[2000,110823,110824],{"x":110813,"y":71523,"fill":2599,"style":2600},"chardet.detect() on sample",[35,110826],{"x1":110813,"y1":26332,"x2":110813,"y2":2610,"stroke":2583,"markerEnd":110818,"style":2594},[49826,110828],{"id":110829,"points":110830,"fill":2592,"stroke":2593,"style":2594},"enc-fix-decision","350,160 490,200 350,240 210,200",[2000,110832,110742],{"x":110813,"y":38843,"fill":2599,"style":2600},[2000,110834,110836],{"x":110813,"y":110835,"fill":2599,"style":2600},"213","> 
0.7?",[35,110838],{"x1":16990,"y1":2611,"x2":49894,"y2":2611,"stroke":2583,"markerEnd":110818,"style":2594},[2000,110840,38631],{"x":110841,"y":110842,"fill":2583,"style":2605},"530","193",[2585,110844],{"id":110845,"x":49894,"y":16982,"width":2650,"height":26341,"rx":2681,"fill":11165,"stroke":11166,"style":2594},"enc-fix-yes",[2000,110847,110849],{"x":110848,"y":16986,"fill":2599,"style":2685},"630","Use detected",[2000,110851,27249],{"x":110848,"y":110852,"fill":2599,"style":2685},"212",[35,110854],{"x1":110813,"y1":17008,"x2":110813,"y2":107625,"stroke":2583,"markerEnd":110818,"style":2594},[2000,110856,38628],{"x":59952,"y":64929,"fill":2583,"style":58361},[2585,110858],{"id":110859,"x":2701,"y":107625,"width":2618,"height":58404,"rx":2681,"fill":2592,"stroke":2593,"style":2594},"enc-fix-fb1",[2000,110861,110863],{"x":110813,"y":110862,"fill":2599,"style":2685},"291","Try encoding='utf-8-sig'",[35,110865],{"x1":110813,"y1":110866,"x2":110813,"y2":89071,"stroke":2583,"markerEnd":110818,"style":2594},"304",[2585,110868],{"id":110869,"x":2701,"y":89071,"width":2618,"height":58404,"rx":2681,"fill":2592,"stroke":2593,"style":2594},"enc-fix-fb2",[2000,110871,110873],{"x":110813,"y":110872,"fill":2599,"style":2685},"341","Try encoding='cp1252'",[2000,110875,110877],{"x":110813,"y":110876,"fill":2583,"style":2605},"372","Final fallback: encoding='latin-1' (never raises)",[2557,110879,2559,110880,2547],{},[2573,110881,2564,110884,2559],{"id":110882,"markerWidth":2591,"markerHeight":2591,"refX":2681,"refY":10258,"orient":110883},"enc-fix-arrow","auto",[2580,110885],{"d":110886,"fill":2583},"M0,0 L0,6 L8,3 z",[18,110888,110890],{"id":110889},"step-4-fix-with-explicit-encoding","Step 4 — Fix with Explicit Encoding",[424,110892,110894],{"id":110893},"cp1252-windows-and-excel-exports","cp1252 — Windows and Excel Exports",[14,110896,110897,110900,110901,72819,110904,110906],{},[30,110898,110899],{},"cp1252"," is the right first choice for any file that originated on a 
Windows machine or was saved by Excel's \"Save As CSV\" option. It maps ",[30,110902,110903],{},"0x80",[30,110905,110419],{}," to typographic characters such as curly quotes and dashes (plus the euro sign); as standalone bytes those values are not valid UTF-8, which is why a UTF-8 read fails on them.",[23,110908,110910],{"className":126,"code":110909,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\ncsv_path = Path(\"data\u002Fexport.csv\")\n\ntry:\n    df = pd.read_csv(csv_path, encoding=\"cp1252\", engine=\"python\")\n    print(f\"Loaded {len(df):,} rows\")\nexcept (UnicodeDecodeError, OSError) as e:\n    raise SystemExit(f\"Load failed: {e}\") from e\n",[30,110911,110912,110916,110926,110936,110940,110952,110956,110962,110988,111010,111028],{"__ignoreMap":28},[33,110913,110914],{"class":35,"line":36},[33,110915,8895],{"class":39},[33,110917,110918,110920,110922,110924],{"class":35,"line":43},[33,110919,190],{"class":163},[33,110921,193],{"class":167},[33,110923,164],{"class":163},[33,110925,198],{"class":167},[33,110927,110928,110930,110932,110934],{"class":35,"line":61},[33,110929,164],{"class":163},[33,110931,492],{"class":167},[33,110933,495],{"class":163},[33,110935,498],{"class":167},[33,110937,110938],{"class":35,"line":73},[33,110939,92],{"emptyLinePlaceholder":91},[33,110941,110942,110944,110946,110948,110950],{"class":35,"line":88},[33,110943,110480],{"class":167},[33,110945,242],{"class":163},[33,110947,215],{"class":167},[33,110949,110487],{"class":54},[33,110951,221],{"class":167},[33,110953,110954],{"class":35,"line":95},[33,110955,92],{"emptyLinePlaceholder":91},[33,110957,110958,110960],{"class":35,"line":101},[33,110959,35574],{"class":163},[33,110961,574],{"class":167},[33,110963,110964,110966,110968,110970,110972,110974,110977,110979,110981,110983,110986],{"class":35,"line":171},[33,110965,4025],{"class":167},[33,110967,242],{"class":163},[33,110969,21901],{"class":167},[33,110971,27249],{"class":238},[33,110973,242],{"class":163},[33,110975,110976],{"class":54},"\"cp1252\"",[33,110
978,365],{"class":167},[33,110980,17351],{"class":238},[33,110982,242],{"class":163},[33,110984,110985],{"class":54},"\"python\"",[33,110987,221],{"class":167},[33,110989,110990,110992,110994,110996,110998,111000,111002,111004,111006,111008],{"class":35,"line":179},[33,110991,7268],{"class":50},[33,110993,602],{"class":167},[33,110995,4059],{"class":163},[33,110997,96187],{"class":54},[33,110999,4065],{"class":50},[33,111001,4068],{"class":167},[33,111003,18801],{"class":163},[33,111005,1121],{"class":50},[33,111007,65937],{"class":54},[33,111009,221],{"class":167},[33,111011,111012,111014,111016,111018,111020,111022,111024,111026],{"class":35,"line":187},[33,111013,35726],{"class":163},[33,111015,17583],{"class":167},[33,111017,53911],{"class":50},[33,111019,365],{"class":167},[33,111021,43079],{"class":50},[33,111023,1649],{"class":167},[33,111025,495],{"class":163},[33,111027,7583],{"class":167},[33,111029,111030,111032,111034,111036,111038,111041,111043,111045,111047,111049,111051,111053],{"class":35,"line":201},[33,111031,35742],{"class":163},[33,111033,16617],{"class":50},[33,111035,602],{"class":167},[33,111037,4059],{"class":163},[33,111039,111040],{"class":54},"\"Load failed: ",[33,111042,1115],{"class":50},[33,111044,7602],{"class":167},[33,111046,1121],{"class":50},[33,111048,274],{"class":54},[33,111050,1649],{"class":167},[33,111052,190],{"class":163},[33,111054,7613],{"class":167},[424,111056,111058],{"id":111057},"latin-1-universal-single-byte-fallback","latin-1 — Universal Single-Byte Fallback",[14,111060,111061,111064,111065,72819,111068,111071,111072,111074,111075,72819,111077,111079],{},[30,111062,111063],{},"latin-1"," (ISO-8859-1) maps every byte ",[30,111066,111067],{},"0x00",[30,111069,111070],{},"0xFF"," directly to the first 256 Unicode code points. It never raises a ",[30,111073,53911],{},", making it the final safety net when you have no idea what encoding was used. 
The trade-off: cp1252 characters in ",[30,111076,110903],{},[30,111078,110419],{}," come through as control characters rather than typographic symbols, so use it only after cp1252 fails.",[23,111081,111083],{"className":126,"code":111082,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\ncsv_path = Path(\"data\u002Fexport.csv\")\n\ntry:\n    df = pd.read_csv(csv_path, encoding=\"latin-1\", engine=\"python\")\n    print(df.dtypes)\nexcept OSError as e:\n    raise SystemExit(f\"Cannot open file: {e}\") from e\n",[30,111084,111085,111089,111099,111109,111113,111125,111129,111135,111160,111166,111176],{"__ignoreMap":28},[33,111086,111087],{"class":35,"line":36},[33,111088,8895],{"class":39},[33,111090,111091,111093,111095,111097],{"class":35,"line":43},[33,111092,190],{"class":163},[33,111094,193],{"class":167},[33,111096,164],{"class":163},[33,111098,198],{"class":167},[33,111100,111101,111103,111105,111107],{"class":35,"line":61},[33,111102,164],{"class":163},[33,111104,492],{"class":167},[33,111106,495],{"class":163},[33,111108,498],{"class":167},[33,111110,111111],{"class":35,"line":73},[33,111112,92],{"emptyLinePlaceholder":91},[33,111114,111115,111117,111119,111121,111123],{"class":35,"line":88},[33,111116,110480],{"class":167},[33,111118,242],{"class":163},[33,111120,215],{"class":167},[33,111122,110487],{"class":54},[33,111124,221],{"class":167},[33,111126,111127],{"class":35,"line":95},[33,111128,92],{"emptyLinePlaceholder":91},[33,111130,111131,111133],{"class":35,"line":101},[33,111132,35574],{"class":163},[33,111134,574],{"class":167},[33,111136,111137,111139,111141,111143,111145,111147,111150,111152,111154,111156,111158],{"class":35,"line":171},[33,111138,4025],{"class":167},[33,111140,242],{"class":163},[33,111142,21901],{"class":167},[33,111144,27249],{"class":238},[33,111146,242],{"class":163},[33,111148,111149],{"class":54},"\"latin-1\"",[33,111151,365],{"class":167},[33,111153,17351],{"class":238
},[33,111155,242],{"class":163},[33,111157,110985],{"class":54},[33,111159,221],{"class":167},[33,111161,111162,111164],{"class":35,"line":179},[33,111163,7268],{"class":50},[33,111165,108834],{"class":167},[33,111167,111168,111170,111172,111174],{"class":35,"line":187},[33,111169,35726],{"class":163},[33,111171,107953],{"class":50},[33,111173,1852],{"class":163},[33,111175,7583],{"class":167},[33,111177,111178,111180,111182,111184,111186,111189,111191,111193,111195,111197,111199,111201],{"class":35,"line":201},[33,111179,35742],{"class":163},[33,111181,16617],{"class":50},[33,111183,602],{"class":167},[33,111185,4059],{"class":163},[33,111187,111188],{"class":54},"\"Cannot open file: ",[33,111190,1115],{"class":50},[33,111192,7602],{"class":167},[33,111194,1121],{"class":50},[33,111196,274],{"class":54},[33,111198,1649],{"class":167},[33,111200,190],{"class":163},[33,111202,7613],{"class":167},[424,111204,111206],{"id":111205},"utf-8-sig-bom-stripping","utf-8-sig — BOM Stripping",[14,111208,111209,111210,111213,111214,111216,111217,111220,111221,49047,111224,111227,111228,111231],{},"Files exported from Excel's UTF-8 CSV option often carry a three-byte BOM (",[30,111211,111212],{},"EF BB BF",") at the start. When pandas reads these with ",[30,111215,53907],{},", the first column name gains a leading ",[30,111218,111219],{},"﻿"," — your DataFrame has a column named ",[30,111222,111223],{},"﻿order_id",[30,111225,111226],{},"order_id",". Using ",[30,111229,111230],{},"encoding='utf-8-sig'"," strips the BOM automatically before parsing.",[23,111233,111235],{"className":126,"code":111234,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\ncsv_path = Path(\"data\u002Fexcel_utf8_export.csv\")\n\ntry:\n    df = pd.read_csv(csv_path, encoding=\"utf-8-sig\")\n    print(df.columns.tolist())  # ['order_id', 'amount', ...] 
— no BOM prefix\nexcept (UnicodeDecodeError, OSError) as e:\n    raise SystemExit(f\"Load failed: {e}\") from e\n",[30,111236,111237,111241,111251,111261,111265,111278,111282,111288,111304,111314,111332],{"__ignoreMap":28},[33,111238,111239],{"class":35,"line":36},[33,111240,8895],{"class":39},[33,111242,111243,111245,111247,111249],{"class":35,"line":43},[33,111244,190],{"class":163},[33,111246,193],{"class":167},[33,111248,164],{"class":163},[33,111250,198],{"class":167},[33,111252,111253,111255,111257,111259],{"class":35,"line":61},[33,111254,164],{"class":163},[33,111256,492],{"class":167},[33,111258,495],{"class":163},[33,111260,498],{"class":167},[33,111262,111263],{"class":35,"line":73},[33,111264,92],{"emptyLinePlaceholder":91},[33,111266,111267,111269,111271,111273,111276],{"class":35,"line":88},[33,111268,110480],{"class":167},[33,111270,242],{"class":163},[33,111272,215],{"class":167},[33,111274,111275],{"class":54},"\"data\u002Fexcel_utf8_export.csv\"",[33,111277,221],{"class":167},[33,111279,111280],{"class":35,"line":95},[33,111281,92],{"emptyLinePlaceholder":91},[33,111283,111284,111286],{"class":35,"line":101},[33,111285,35574],{"class":163},[33,111287,574],{"class":167},[33,111289,111290,111292,111294,111296,111298,111300,111302],{"class":35,"line":171},[33,111291,4025],{"class":167},[33,111293,242],{"class":163},[33,111295,21901],{"class":167},[33,111297,27249],{"class":238},[33,111299,242],{"class":163},[33,111301,108390],{"class":54},[33,111303,221],{"class":167},[33,111305,111306,111308,111311],{"class":35,"line":179},[33,111307,7268],{"class":50},[33,111309,111310],{"class":167},"(df.columns.tolist())  ",[33,111312,111313],{"class":39},"# ['order_id', 'amount', ...] 
— no BOM prefix\n",[33,111315,111316,111318,111320,111322,111324,111326,111328,111330],{"class":35,"line":187},[33,111317,35726],{"class":163},[33,111319,17583],{"class":167},[33,111321,53911],{"class":50},[33,111323,365],{"class":167},[33,111325,43079],{"class":50},[33,111327,1649],{"class":167},[33,111329,495],{"class":163},[33,111331,7583],{"class":167},[33,111333,111334,111336,111338,111340,111342,111344,111346,111348,111350,111352,111354,111356],{"class":35,"line":201},[33,111335,35742],{"class":163},[33,111337,16617],{"class":50},[33,111339,602],{"class":167},[33,111341,4059],{"class":163},[33,111343,111040],{"class":54},[33,111345,1115],{"class":50},[33,111347,7602],{"class":167},[33,111349,1121],{"class":50},[33,111351,274],{"class":54},[33,111353,1649],{"class":167},[33,111355,190],{"class":163},[33,111357,7613],{"class":167},[14,111359,12951,111360,111363,111364,111367],{},[940,111361,111362],{"href":108864},"exporting data to CSV formats"," from pandas you can write a BOM-free UTF-8 file with ",[30,111365,111366],{},"df.to_csv(path, encoding='utf-8', index=False)"," — this prevents the BOM problem entirely for downstream consumers.",[18,111369,111371],{"id":111370},"step-5-handle-residual-bad-bytes-with-encoding_errors","Step 5 — Handle Residual Bad Bytes with encoding_errors",[14,111373,111374,111375,111378],{},"Even with the correct encoding, some files contain isolated corrupted bytes — a copy-paste artifact, a transmission glitch. Use ",[30,111376,111377],{},"encoding_errors"," to control what happens when the parser hits them.",[424,111380,111382],{"id":111381},"encoding_errorsreplace","encoding_errors='replace'",[14,111384,111385,111386,111389,111390,111393,111394,111396,111397,3035],{},"Substitutes each undecodable byte with ",[30,111387,111388],{},"U+FFFD"," (the replacement character ",[30,111391,111392],{},"�",", displayed as ",[30,111395,111392],{},"). 
The file loads completely; you can then find affected cells and convert them to ",[30,111398,8373],{},[23,111400,111402],{"className":126,"code":111401,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\ncsv_path = Path(\"data\u002Fmixed.csv\")\n\ntry:\n    df = pd.read_csv(\n        csv_path,\n        encoding=\"utf-8\",\n        encoding_errors=\"replace\",\n        engine=\"python\",\n    )\n    # Replace replacement characters with proper NA\n    df = df.replace(\"�\", pd.NA)\n    print(f\"Rows with any NA after replacement: {df.isna().any(axis=1).sum()}\")\nexcept OSError as e:\n    raise SystemExit(f\"Cannot open file: {e}\") from e\n",[30,111403,111404,111408,111418,111428,111432,111445,111449,111455,111463,111468,111478,111489,111500,111504,111509,111527,111558,111568],{"__ignoreMap":28},[33,111405,111406],{"class":35,"line":36},[33,111407,8895],{"class":39},[33,111409,111410,111412,111414,111416],{"class":35,"line":43},[33,111411,190],{"class":163},[33,111413,193],{"class":167},[33,111415,164],{"class":163},[33,111417,198],{"class":167},[33,111419,111420,111422,111424,111426],{"class":35,"line":61},[33,111421,164],{"class":163},[33,111423,492],{"class":167},[33,111425,495],{"class":163},[33,111427,498],{"class":167},[33,111429,111430],{"class":35,"line":73},[33,111431,92],{"emptyLinePlaceholder":91},[33,111433,111434,111436,111438,111440,111443],{"class":35,"line":88},[33,111435,110480],{"class":167},[33,111437,242],{"class":163},[33,111439,215],{"class":167},[33,111441,111442],{"class":54},"\"data\u002Fmixed.csv\"",[33,111444,221],{"class":167},[33,111446,111447],{"class":35,"line":95},[33,111448,92],{"emptyLinePlaceholder":91},[33,111450,111451,111453],{"class":35,"line":101},[33,111452,35574],{"class":163},[33,111454,574],{"class":167},[33,111456,111457,111459,111461],{"class":35,"line":171},[33,111458,4025],{"class":167},[33,111460,242],{"class":163},[33,111462,108706],{"class":167},[33,111464,1114
65],{"class":35,"line":179},[33,111466,111467],{"class":167},"        csv_path,\n",[33,111469,111470,111472,111474,111476],{"class":35,"line":187},[33,111471,1190],{"class":238},[33,111473,242],{"class":163},[33,111475,1195],{"class":54},[33,111477,247],{"class":167},[33,111479,111480,111483,111485,111487],{"class":35,"line":201},[33,111481,111482],{"class":238},"        encoding_errors",[33,111484,242],{"class":163},[33,111486,108399],{"class":54},[33,111488,247],{"class":167},[33,111490,111491,111494,111496,111498],{"class":35,"line":206},[33,111492,111493],{"class":238},"        engine",[33,111495,242],{"class":163},[33,111497,110985],{"class":54},[33,111499,247],{"class":167},[33,111501,111502],{"class":35,"line":224},[33,111503,1202],{"class":167},[33,111505,111506],{"class":35,"line":229},[33,111507,111508],{"class":39},"    # Replace replacement characters with proper NA\n",[33,111510,111511,111513,111515,111518,111521,111523,111525],{"class":35,"line":235},[33,111512,4025],{"class":167},[33,111514,242],{"class":163},[33,111516,111517],{"class":167}," df.replace(",[33,111519,111520],{"class":54},"\"�\"",[33,111522,10884],{"class":167},[33,111524,8018],{"class":50},[33,111526,221],{"class":167},[33,111528,111529,111531,111533,111535,111538,111540,111543,111545,111547,111549,111552,111554,111556],{"class":35,"line":250},[33,111530,7268],{"class":50},[33,111532,602],{"class":167},[33,111534,4059],{"class":163},[33,111536,111537],{"class":54},"\"Rows with any NA after replacement: 
",[33,111539,1115],{"class":50},[33,111541,111542],{"class":167},"df.isna().any(",[33,111544,4177],{"class":238},[33,111546,242],{"class":163},[33,111548,734],{"class":50},[33,111550,111551],{"class":167},").sum()",[33,111553,1121],{"class":50},[33,111555,274],{"class":54},[33,111557,221],{"class":167},[33,111559,111560,111562,111564,111566],{"class":35,"line":266},[33,111561,35726],{"class":163},[33,111563,107953],{"class":50},[33,111565,1852],{"class":163},[33,111567,7583],{"class":167},[33,111569,111570,111572,111574,111576,111578,111580,111582,111584,111586,111588,111590,111592],{"class":35,"line":290},[33,111571,35742],{"class":163},[33,111573,16617],{"class":50},[33,111575,602],{"class":167},[33,111577,4059],{"class":163},[33,111579,111188],{"class":54},[33,111581,1115],{"class":50},[33,111583,7602],{"class":167},[33,111585,1121],{"class":50},[33,111587,274],{"class":54},[33,111589,1649],{"class":167},[33,111591,190],{"class":163},[33,111593,7613],{"class":167},[424,111595,111597],{"id":111596},"encoding_errorsbackslashreplace","encoding_errors='backslashreplace'",[14,111599,111600,111601,111604],{},"Converts each bad byte to a Python escape sequence like ",[30,111602,111603],{},"\\x96",". 
Useful for debugging — you can see exactly which bytes are causing trouble — but use it only as a diagnostic mode, not in production pipelines.",[23,111606,111608],{"className":126,"code":111607,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\ncsv_path = Path(\"data\u002Fmixed.csv\")\n\ntry:\n    df = pd.read_csv(\n        csv_path,\n        encoding=\"utf-8\",\n        encoding_errors=\"backslashreplace\",\n        engine=\"python\",\n    )\n    # Find cells containing escape sequences to spot bad bytes\n    mask = df.apply(lambda col: col.astype(str).str.contains(r\"\\\\x\", regex=True))\n    print(\"Cells with escaped bytes:\")\n    print(df[mask.any(axis=1)])\nexcept OSError as e:\n    raise SystemExit(f\"Cannot open file: {e}\") from e\n",[30,111609,111610,111614,111624,111634,111638,111650,111654,111660,111668,111672,111682,111693,111703,111707,111712,111750,111761,111777,111787],{"__ignoreMap":28},[33,111611,111612],{"class":35,"line":36},[33,111613,8895],{"class":39},[33,111615,111616,111618,111620,111622],{"class":35,"line":43},[33,111617,190],{"class":163},[33,111619,193],{"class":167},[33,111621,164],{"class":163},[33,111623,198],{"class":167},[33,111625,111626,111628,111630,111632],{"class":35,"line":61},[33,111627,164],{"class":163},[33,111629,492],{"class":167},[33,111631,495],{"class":163},[33,111633,498],{"class":167},[33,111635,111636],{"class":35,"line":73},[33,111637,92],{"emptyLinePlaceholder":91},[33,111639,111640,111642,111644,111646,111648],{"class":35,"line":88},[33,111641,110480],{"class":167},[33,111643,242],{"class":163},[33,111645,215],{"class":167},[33,111647,111442],{"class":54},[33,111649,221],{"class":167},[33,111651,111652],{"class":35,"line":95},[33,111653,92],{"emptyLinePlaceholder":91},[33,111655,111656,111658],{"class":35,"line":101},[33,111657,35574],{"class":163},[33,111659,574],{"class":167},[33,111661,111662,111664,111666],{"class":35,"line":171},[33,111663,4025],
{"class":167},[33,111665,242],{"class":163},[33,111667,108706],{"class":167},[33,111669,111670],{"class":35,"line":179},[33,111671,111467],{"class":167},[33,111673,111674,111676,111678,111680],{"class":35,"line":187},[33,111675,1190],{"class":238},[33,111677,242],{"class":163},[33,111679,1195],{"class":54},[33,111681,247],{"class":167},[33,111683,111684,111686,111688,111691],{"class":35,"line":201},[33,111685,111482],{"class":238},[33,111687,242],{"class":163},[33,111689,111690],{"class":54},"\"backslashreplace\"",[33,111692,247],{"class":167},[33,111694,111695,111697,111699,111701],{"class":35,"line":206},[33,111696,111493],{"class":238},[33,111698,242],{"class":163},[33,111700,110985],{"class":54},[33,111702,247],{"class":167},[33,111704,111705],{"class":35,"line":224},[33,111706,1202],{"class":167},[33,111708,111709],{"class":35,"line":229},[33,111710,111711],{"class":39},"    # Find cells containing escape sequences to spot bad bytes\n",[33,111713,111714,111717,111719,111721,111723,111725,111727,111730,111732,111734,111737,111740,111742,111744,111746,111748],{"class":35,"line":235},[33,111715,111716],{"class":167},"    mask ",[33,111718,242],{"class":163},[33,111720,39836],{"class":167},[33,111722,39839],{"class":163},[33,111724,39842],{"class":167},[33,111726,1053],{"class":50},[33,111728,111729],{"class":167},").str.contains(",[33,111731,11977],{"class":163},[33,111733,274],{"class":54},[33,111735,111736],{"class":12018},"\\\\",[33,111738,111739],{"class":54},"x\"",[33,111741,365],{"class":167},[33,111743,11993],{"class":238},[33,111745,242],{"class":163},[33,111747,855],{"class":50},[33,111749,371],{"class":167},[33,111751,111752,111754,111756,111759],{"class":35,"line":250},[33,111753,7268],{"class":50},[33,111755,602],{"class":167},[33,111757,111758],{"class":54},"\"Cells with escaped 
bytes:\"",[33,111760,221],{"class":167},[33,111762,111763,111765,111768,111770,111772,111774],{"class":35,"line":266},[33,111764,7268],{"class":50},[33,111766,111767],{"class":167},"(df[mask.any(",[33,111769,4177],{"class":238},[33,111771,242],{"class":163},[33,111773,734],{"class":50},[33,111775,111776],{"class":167},")])\n",[33,111778,111779,111781,111783,111785],{"class":35,"line":290},[33,111780,35726],{"class":163},[33,111782,107953],{"class":50},[33,111784,1852],{"class":163},[33,111786,7583],{"class":167},[33,111788,111789,111791,111793,111795,111797,111799,111801,111803,111805,111807,111809,111811],{"class":35,"line":295},[33,111790,35742],{"class":163},[33,111792,16617],{"class":50},[33,111794,602],{"class":167},[33,111796,4059],{"class":163},[33,111798,111188],{"class":54},[33,111800,1115],{"class":50},[33,111802,7602],{"class":167},[33,111804,1121],{"class":50},[33,111806,274],{"class":54},[33,111808,1649],{"class":167},[33,111810,190],{"class":163},[33,111812,7613],{"class":167},[14,111814,111815,111821,111822,27825,111825,111828],{},[1974,111816,111817,111818,3035],{},"Never use ",[30,111819,111820],{},"encoding_errors='ignore'"," It silently drops the offending bytes entirely. A value like ",[30,111823,111824],{},"caf\\xe9",[30,111826,111827],{},"caf"," — a shorter string — which can cause column misalignment across rows, truncated values, and data loss that has no visible signal in the DataFrame.",[18,111830,111832],{"id":111831},"step-6-automated-fallback-ladder","Step 6 — Automated Fallback Ladder",[14,111834,111835],{},"Combine detection with a ranked fallback sequence. 
This covers the full workflow: detect → high-confidence shortcut → manual ladder → give up gracefully.",[23,111837,111839],{"className":126,"code":111838,"language":47,"meta":28,"style":28},"# pip install chardet pandas\nfrom pathlib import Path\nimport chardet\nimport pandas as pd\n\nFALLBACK_ENCODINGS = [\"utf-8-sig\", \"cp1252\", \"latin-1\"]\n\ndef load_csv_robust(csv_path: Path) -> pd.DataFrame:\n    \"\"\"Load a CSV file, auto-detecting encoding with a manual fallback ladder.\"\"\"\n    try:\n        raw_sample = csv_path.read_bytes()[:50_000]\n    except OSError as e:\n        raise SystemExit(f\"Cannot read file: {e}\") from e\n\n    detected = chardet.detect(raw_sample)\n    enc = detected.get(\"encoding\")\n    confidence = detected.get(\"confidence\", 0.0)\n\n    if enc and confidence > 0.7:\n        try:\n            return pd.read_csv(csv_path, encoding=enc, engine=\"python\")\n        except (UnicodeDecodeError, LookupError):\n            pass  # fall through to manual ladder\n\n    for encoding in FALLBACK_ENCODINGS:\n        try:\n            df = pd.read_csv(csv_path, encoding=encoding, engine=\"python\")\n            print(f\"Loaded with fallback encoding: {encoding}\")\n            return df\n        except UnicodeDecodeError:\n            continue\n\n    raise ValueError(\n        f\"Could not decode {csv_path.name} with any known encoding. 
\"\n        \"Inspect with a hex editor.\"\n    )\n\nif __name__ == \"__main__\":\n    df = load_csv_robust(Path(\"data\u002Fexport.csv\"))\n    print(df.head())\n",[30,111840,111841,111845,111855,111861,111871,111875,111896,111900,111910,111915,111921,111934,111944,111970,111974,111983,111996,112012,112016,112034,112040,112061,112076,112084,112088,112102,112108,112131,112152,112158,112167,112171,112175,112183,112200,112205,112209,112213,112225,112238],{"__ignoreMap":28},[33,111842,111843],{"class":35,"line":36},[33,111844,110444],{"class":39},[33,111846,111847,111849,111851,111853],{"class":35,"line":43},[33,111848,190],{"class":163},[33,111850,193],{"class":167},[33,111852,164],{"class":163},[33,111854,198],{"class":167},[33,111856,111857,111859],{"class":35,"line":61},[33,111858,164],{"class":163},[33,111860,110461],{"class":167},[33,111862,111863,111865,111867,111869],{"class":35,"line":73},[33,111864,164],{"class":163},[33,111866,492],{"class":167},[33,111868,495],{"class":163},[33,111870,498],{"class":167},[33,111872,111873],{"class":35,"line":88},[33,111874,92],{"emptyLinePlaceholder":91},[33,111876,111877,111880,111882,111884,111886,111888,111890,111892,111894],{"class":35,"line":95},[33,111878,111879],{"class":50},"FALLBACK_ENCODINGS",[33,111881,212],{"class":163},[33,111883,9178],{"class":167},[33,111885,108390],{"class":54},[33,111887,365],{"class":167},[33,111889,110976],{"class":54},[33,111891,365],{"class":167},[33,111893,111149],{"class":54},[33,111895,9202],{"class":167},[33,111897,111898],{"class":35,"line":101},[33,111899,92],{"emptyLinePlaceholder":91},[33,111901,111902,111904,111907],{"class":35,"line":171},[33,111903,562],{"class":163},[33,111905,111906],{"class":46}," load_csv_robust",[33,111908,111909],{"class":167},"(csv_path: Path) -> pd.DataFrame:\n",[33,111911,111912],{"class":35,"line":179},[33,111913,111914],{"class":54},"    \"\"\"Load a CSV file, auto-detecting encoding with a manual fallback 
ladder.\"\"\"\n",[33,111916,111917,111919],{"class":35,"line":187},[33,111918,2424],{"class":163},[33,111920,574],{"class":167},[33,111922,111923,111926,111928,111930,111932],{"class":35,"line":201},[33,111924,111925],{"class":167},"        raw_sample ",[33,111927,242],{"class":163},[33,111929,110509],{"class":167},[33,111931,110512],{"class":50},[33,111933,9202],{"class":167},[33,111935,111936,111938,111940,111942],{"class":35,"line":206},[33,111937,2449],{"class":163},[33,111939,107953],{"class":50},[33,111941,1852],{"class":163},[33,111943,7583],{"class":167},[33,111945,111946,111948,111950,111952,111954,111956,111958,111960,111962,111964,111966,111968],{"class":35,"line":224},[33,111947,4051],{"class":163},[33,111949,16617],{"class":50},[33,111951,602],{"class":167},[33,111953,4059],{"class":163},[33,111955,110560],{"class":54},[33,111957,1115],{"class":50},[33,111959,7602],{"class":167},[33,111961,1121],{"class":50},[33,111963,274],{"class":54},[33,111965,1649],{"class":167},[33,111967,190],{"class":163},[33,111969,7613],{"class":167},[33,111971,111972],{"class":35,"line":229},[33,111973,92],{"emptyLinePlaceholder":91},[33,111975,111976,111979,111981],{"class":35,"line":235},[33,111977,111978],{"class":167},"    detected ",[33,111980,242],{"class":163},[33,111982,110527],{"class":167},[33,111984,111985,111987,111989,111992,111994],{"class":35,"line":250},[33,111986,93844],{"class":167},[33,111988,242],{"class":163},[33,111990,111991],{"class":167}," 
detected.get(",[33,111993,110696],{"class":54},[33,111995,221],{"class":167},[33,111997,111998,112000,112002,112004,112006,112008,112010],{"class":35,"line":266},[33,111999,110703],{"class":167},[33,112001,242],{"class":163},[33,112003,111991],{"class":167},[33,112005,110710],{"class":54},[33,112007,365],{"class":167},[33,112009,84626],{"class":50},[33,112011,221],{"class":167},[33,112013,112014],{"class":35,"line":290},[33,112015,92],{"emptyLinePlaceholder":91},[33,112017,112018,112020,112023,112025,112028,112030,112032],{"class":35,"line":295},[33,112019,617],{"class":163},[33,112021,112022],{"class":167}," enc ",[33,112024,6001],{"class":163},[33,112026,112027],{"class":167}," confidence ",[33,112029,6009],{"class":163},[33,112031,48946],{"class":50},[33,112033,574],{"class":167},[33,112035,112036,112038],{"class":35,"line":300},[33,112037,670],{"class":163},[33,112039,574],{"class":167},[33,112041,112042,112044,112046,112048,112050,112053,112055,112057,112059],{"class":35,"line":317},[33,112043,28782],{"class":163},[33,112045,21901],{"class":167},[33,112047,27249],{"class":238},[33,112049,242],{"class":163},[33,112051,112052],{"class":167},"enc, ",[33,112054,17351],{"class":238},[33,112056,242],{"class":163},[33,112058,110985],{"class":54},[33,112060,221],{"class":167},[33,112062,112063,112065,112067,112069,112071,112074],{"class":35,"line":332},[33,112064,780],{"class":163},[33,112066,17583],{"class":167},[33,112068,53911],{"class":50},[33,112070,365],{"class":167},[33,112072,112073],{"class":50},"LookupError",[33,112075,1737],{"class":167},[33,112077,112078,112081],{"class":35,"line":347},[33,112079,112080],{"class":163},"            pass",[33,112082,112083],{"class":39},"  # fall through to manual ladder\n",[33,112085,112086],{"class":35,"line":374},[33,112087,92],{"emptyLinePlaceholder":91},[33,112089,112090,112092,112095,112097,112100],{"class":35,"line":397},[33,112091,656],{"class":163},[33,112093,112094],{"class":167}," encoding 
",[33,112096,662],{"class":163},[33,112098,112099],{"class":50}," FALLBACK_ENCODINGS",[33,112101,574],{"class":167},[33,112103,112104,112106],{"class":35,"line":653},[33,112105,670],{"class":163},[33,112107,574],{"class":167},[33,112109,112110,112112,112114,112116,112118,112120,112123,112125,112127,112129],{"class":35,"line":667},[33,112111,51528],{"class":167},[33,112113,242],{"class":163},[33,112115,21901],{"class":167},[33,112117,27249],{"class":238},[33,112119,242],{"class":163},[33,112121,112122],{"class":167},"encoding, ",[33,112124,17351],{"class":238},[33,112126,242],{"class":163},[33,112128,110985],{"class":54},[33,112130,221],{"class":167},[33,112132,112133,112135,112137,112139,112142,112144,112146,112148,112150],{"class":35,"line":675},[33,112134,9364],{"class":50},[33,112136,602],{"class":167},[33,112138,4059],{"class":163},[33,112140,112141],{"class":54},"\"Loaded with fallback encoding: ",[33,112143,1115],{"class":50},[33,112145,27249],{"class":167},[33,112147,1121],{"class":50},[33,112149,274],{"class":54},[33,112151,221],{"class":167},[33,112153,112154,112156],{"class":35,"line":689},[33,112155,28782],{"class":163},[33,112157,11719],{"class":167},[33,112159,112160,112162,112165],{"class":35,"line":703},[33,112161,780],{"class":163},[33,112163,112164],{"class":50}," UnicodeDecodeError",[33,112166,574],{"class":167},[33,112168,112169],{"class":35,"line":714},[33,112170,9330],{"class":163},[33,112172,112173],{"class":35,"line":723},[33,112174,92],{"emptyLinePlaceholder":91},[33,112176,112177,112179,112181],{"class":35,"line":754},[33,112178,35742],{"class":163},[33,112180,4054],{"class":50},[33,112182,7637],{"class":167},[33,112184,112185,112187,112190,112192,112195,112197],{"class":35,"line":771},[33,112186,9533],{"class":163},[33,112188,112189],{"class":54},"\"Could not decode ",[33,112191,1115],{"class":50},[33,112193,112194],{"class":167},"csv_path.name",[33,112196,1121],{"class":50},[33,112198,112199],{"class":54}," with any known encoding. 
\"\n",[33,112201,112202],{"class":35,"line":777},[33,112203,112204],{"class":54},"        \"Inspect with a hex editor.\"\n",[33,112206,112207],{"class":35,"line":788},[33,112208,1202],{"class":167},[33,112210,112211],{"class":35,"line":804},[33,112212,92],{"emptyLinePlaceholder":91},[33,112214,112215,112217,112219,112221,112223],{"class":35,"line":809},[33,112216,2491],{"class":163},[33,112218,2494],{"class":50},[33,112220,2497],{"class":163},[33,112222,2500],{"class":54},[33,112224,574],{"class":167},[33,112226,112227,112229,112231,112234,112236],{"class":35,"line":819},[33,112228,4025],{"class":167},[33,112230,242],{"class":163},[33,112232,112233],{"class":167}," load_csv_robust(Path(",[33,112235,110487],{"class":54},[33,112237,371],{"class":167},[33,112239,112240,112242],{"class":35,"line":829},[33,112241,7268],{"class":50},[33,112243,13311],{"class":167},[14,112245,112246,112247,6242,112250,112252,112253,112256],{},"The same pattern applies when ",[940,112248,112249],{"href":99576},"reading Excel files with Python",[30,112251,26542],{}," files are always UTF-8 internally, but legacy ",[30,112254,112255],{},".xls"," and CSV saves from Excel carry cp1252 or regional codecs.",[18,112258,112260],{"id":112259},"step-7-verify-the-fix","Step 7 — Verify the Fix",[14,112262,112263,112264,112266],{},"A row-count assertion and a spot-check catch silent failures before data reaches downstream code. 
If the file loaded with ",[30,112265,111382],{}," but you expected clean data, the assertion surfaces the problem.",[23,112268,112270],{"className":126,"code":112269,"language":47,"meta":28,"style":28},"# pip install pandas\nfrom pathlib import Path\nimport pandas as pd\n\ncsv_path = Path(\"data\u002Fexport.csv\")\nEXPECTED_ROWS = 10_000  # set from source system record count\n\ntry:\n    df = pd.read_csv(csv_path, encoding=\"cp1252\", engine=\"python\")\nexcept (UnicodeDecodeError, OSError) as e:\n    raise SystemExit(f\"Load failed: {e}\") from e\n\n# Row count assertion\nassert len(df) == EXPECTED_ROWS, (\n    f\"Expected {EXPECTED_ROWS} rows, got {len(df)}. \"\n    \"Check for header\u002Ffooter rows or encoding truncation.\"\n)\n\n# Spot-check for replacement characters (indicates residual bad bytes)\nreplacement_count = df.apply(lambda c: c.astype(str).str.contains(\"�\")).sum().sum()\nif replacement_count > 0:\n    print(f\"WARNING: {replacement_count} cells contain replacement characters.\")\n\nprint(\"Verification 
passed.\")\n",[30,112271,112272,112276,112286,112296,112300,112312,112325,112329,112335,112359,112377,112403,112407,112412,112426,112447,112452,112456,112460,112465,112488,112501,112524,112528],{"__ignoreMap":28},[33,112273,112274],{"class":35,"line":36},[33,112275,8895],{"class":39},[33,112277,112278,112280,112282,112284],{"class":35,"line":43},[33,112279,190],{"class":163},[33,112281,193],{"class":167},[33,112283,164],{"class":163},[33,112285,198],{"class":167},[33,112287,112288,112290,112292,112294],{"class":35,"line":61},[33,112289,164],{"class":163},[33,112291,492],{"class":167},[33,112293,495],{"class":163},[33,112295,498],{"class":167},[33,112297,112298],{"class":35,"line":73},[33,112299,92],{"emptyLinePlaceholder":91},[33,112301,112302,112304,112306,112308,112310],{"class":35,"line":88},[33,112303,110480],{"class":167},[33,112305,242],{"class":163},[33,112307,215],{"class":167},[33,112309,110487],{"class":54},[33,112311,221],{"class":167},[33,112313,112314,112317,112319,112322],{"class":35,"line":95},[33,112315,112316],{"class":50},"EXPECTED_ROWS",[33,112318,212],{"class":163},[33,112320,112321],{"class":50}," 10_000",[33,112323,112324],{"class":39},"  # set from source system record 
count\n",[33,112326,112327],{"class":35,"line":101},[33,112328,92],{"emptyLinePlaceholder":91},[33,112330,112331,112333],{"class":35,"line":171},[33,112332,35574],{"class":163},[33,112334,574],{"class":167},[33,112336,112337,112339,112341,112343,112345,112347,112349,112351,112353,112355,112357],{"class":35,"line":179},[33,112338,4025],{"class":167},[33,112340,242],{"class":163},[33,112342,21901],{"class":167},[33,112344,27249],{"class":238},[33,112346,242],{"class":163},[33,112348,110976],{"class":54},[33,112350,365],{"class":167},[33,112352,17351],{"class":238},[33,112354,242],{"class":163},[33,112356,110985],{"class":54},[33,112358,221],{"class":167},[33,112360,112361,112363,112365,112367,112369,112371,112373,112375],{"class":35,"line":187},[33,112362,35726],{"class":163},[33,112364,17583],{"class":167},[33,112366,53911],{"class":50},[33,112368,365],{"class":167},[33,112370,43079],{"class":50},[33,112372,1649],{"class":167},[33,112374,495],{"class":163},[33,112376,7583],{"class":167},[33,112378,112379,112381,112383,112385,112387,112389,112391,112393,112395,112397,112399,112401],{"class":35,"line":201},[33,112380,35742],{"class":163},[33,112382,16617],{"class":50},[33,112384,602],{"class":167},[33,112386,4059],{"class":163},[33,112388,111040],{"class":54},[33,112390,1115],{"class":50},[33,112392,7602],{"class":167},[33,112394,1121],{"class":50},[33,112396,274],{"class":54},[33,112398,1649],{"class":167},[33,112400,190],{"class":163},[33,112402,7613],{"class":167},[33,112404,112405],{"class":35,"line":206},[33,112406,92],{"emptyLinePlaceholder":91},[33,112408,112409],{"class":35,"line":224},[33,112410,112411],{"class":39},"# Row count 
assertion\n",[33,112413,112414,112416,112418,112420,112422,112424],{"class":35,"line":229},[33,112415,36397],{"class":163},[33,112417,4037],{"class":50},[33,112419,4040],{"class":167},[33,112421,1865],{"class":163},[33,112423,9525],{"class":50},[33,112425,9528],{"class":167},[33,112427,112428,112431,112434,112436,112439,112441,112443,112445],{"class":35,"line":235},[33,112429,112430],{"class":163},"    f",[33,112432,112433],{"class":54},"\"Expected ",[33,112435,9554],{"class":50},[33,112437,112438],{"class":54}," rows, got ",[33,112440,4065],{"class":50},[33,112442,4068],{"class":167},[33,112444,1121],{"class":50},[33,112446,52129],{"class":54},[33,112448,112449],{"class":35,"line":250},[33,112450,112451],{"class":54},"    \"Check for header\u002Ffooter rows or encoding truncation.\"\n",[33,112453,112454],{"class":35,"line":266},[33,112455,221],{"class":167},[33,112457,112458],{"class":35,"line":290},[33,112459,92],{"emptyLinePlaceholder":91},[33,112461,112462],{"class":35,"line":295},[33,112463,112464],{"class":39},"# Spot-check for replacement characters (indicates residual bad bytes)\n",[33,112466,112467,112470,112472,112474,112476,112479,112481,112483,112485],{"class":35,"line":300},[33,112468,112469],{"class":167},"replacement_count ",[33,112471,242],{"class":163},[33,112473,39836],{"class":167},[33,112475,39839],{"class":163},[33,112477,112478],{"class":167}," c: c.astype(",[33,112480,1053],{"class":50},[33,112482,111729],{"class":167},[33,112484,111520],{"class":54},[33,112486,112487],{"class":167},")).sum().sum()\n",[33,112489,112490,112492,112495,112497,112499],{"class":35,"line":317},[33,112491,2491],{"class":163},[33,112493,112494],{"class":167}," replacement_count 
",[33,112496,6009],{"class":163},[33,112498,10791],{"class":50},[33,112500,574],{"class":167},[33,112502,112503,112505,112507,112509,112512,112514,112517,112519,112522],{"class":35,"line":332},[33,112504,7268],{"class":50},[33,112506,602],{"class":167},[33,112508,4059],{"class":163},[33,112510,112511],{"class":54},"\"WARNING: ",[33,112513,1115],{"class":50},[33,112515,112516],{"class":167},"replacement_count",[33,112518,1121],{"class":50},[33,112520,112521],{"class":54}," cells contain replacement characters.\"",[33,112523,221],{"class":167},[33,112525,112526],{"class":35,"line":347},[33,112527,92],{"emptyLinePlaceholder":91},[33,112529,112530,112532,112534,112536],{"class":35,"line":374},[33,112531,13474],{"class":50},[33,112533,602],{"class":167},[33,112535,9569],{"class":54},[33,112537,221],{"class":167},[18,112539,112541],{"id":112540},"quick-reference-encoding-choices","Quick-Reference: Encoding Choices",[4273,112543,112544,112557],{},[4276,112545,112546],{},[4279,112547,112548,112551,112554],{},[4282,112549,112550],{},"Situation",[4282,112552,112553],{},"Recommended encoding",[4282,112555,112556],{},"Notes",[4292,112558,112559,112570,112582,112598,112610,112625,112640],{},[4279,112560,112561,112564,112567],{},[4297,112562,112563],{},"chardet confidence > 0.7",[4297,112565,112566],{},"Use detected encoding",[4297,112568,112569],{},"Read 50 KB minimum",[4279,112571,112572,112575,112579],{},[4297,112573,112574],{},"Excel \"Save As CSV\" on Windows",[4297,112576,112577],{},[30,112578,110899],{},[4297,112580,112581],{},"Covers smart quotes, en\u002Fem-dash",[4279,112583,112584,112587,112592],{},[4297,112585,112586],{},"Excel \"UTF-8 CSV\" with BOM",[4297,112588,112589],{},[30,112590,112591],{},"utf-8-sig",[4297,112593,112594,112595,112597],{},"Strips ",[30,112596,111212],{}," automatically",[4279,112599,112600,112603,112607],{},[4297,112601,112602],{},"Unknown legacy file, any platform",[4297,112604,112605],{},[30,112606,111063],{},[4297,112608,112609],{},"Never 
raises; use as last resort",[4279,112611,112612,112615,112619],{},[4297,112613,112614],{},"Isolated corrupted bytes",[4297,112616,112617],{},[30,112618,111382],{},[4297,112620,42543,112621,8877,112623],{},[30,112622,111392],{},[30,112624,8373],{},[4279,112626,112627,112630,112634],{},[4297,112628,112629],{},"Debugging bad bytes",[4297,112631,112632],{},[30,112633,111597],{},[4297,112635,112636,112637,112639],{},"Shows ",[30,112638,111603],{}," escapes — diagnostic only",[4279,112641,112642,112647,112651],{},[4297,112643,112644],{},[1974,112645,112646],{},"Avoid",[4297,112648,112649],{},[30,112650,111820],{},[4297,112652,112653],{},"Drops bytes silently → column shifts",[18,112655,48994],{"id":29070},[4273,112657,112658,112670],{},[4276,112659,112660],{},[4279,112661,112662,112664,112667],{},[4282,112663,79442],{},[4282,112665,112666],{},"Impact",[4282,112668,112669],{},"Resolution",[4292,112671,112672,112691,112702,112717,112728],{},[4279,112673,112674,112678,112681],{},[4297,112675,112676],{},[30,112677,111820],{},[4297,112679,112680],{},"Silently drops bytes, causes column misalignment",[4297,112682,17059,112683,112686,112687,36661,112689],{},[30,112684,112685],{},"'replace'"," and convert ",[30,112688,111392],{},[30,112690,8373],{},[4279,112692,112693,112696,112699],{},[4297,112694,112695],{},"Assuming all CSVs are UTF-8",[4297,112697,112698],{},"Immediate crash on legacy exports",[4297,112700,112701],{},"Detect with chardet or try cp1252 first",[4279,112703,112704,112707,112710],{},[4297,112705,112706],{},"Not reading a binary sample for chardet",[4297,112708,112709],{},"Chardet decodes the sample before detecting",[4297,112711,74585,112712,2012,112714,112716],{},[30,112713,110580],{},[30,112715,110583],{}," mode",[4279,112718,112719,112722,112725],{},[4297,112720,112721],{},"Ignoring low-confidence detections",[4297,112723,112724],{},"Wrong encoding applied, garbled strings",[4297,112726,112727],{},"Treat confidence \u003C 0.7 as unknown; use fallback 
ladder",[4279,112729,112730,112736,112739],{},[4297,112731,112732,112733],{},"Skipping ",[30,112734,112735],{},"engine='python'",[4297,112737,112738],{},"C engine has limited codec fallback support",[4297,112740,4358,112741,112743],{},[30,112742,112735],{}," for non-UTF-8 codecs",[14,112745,112746,112747,112749],{},"Encoding issues are one facet of a larger ingestion problem. The ",[940,112748,107447],{"href":9598}," guide continues from a successfully loaded DataFrame into type coercion, whitespace normalization, and duplicate handling.",[2537,112751],{},[18,112753,6918],{"id":6917},[4211,112755,112756,112761,112766,112774,112779],{},[4214,112757,112758,112760],{},[940,112759,107447],{"href":9598}," — post-ingestion cleaning workflows",[4214,112762,112763,112765],{},[940,112764,107425],{"href":110423}," — chardet, charset-normalizer, and parser comparisons",[4214,112767,112768,112770,112771,112773],{},[940,112769,99577],{"href":99576}," — encoding considerations for ",[30,112772,112255],{}," and legacy Excel formats",[4214,112775,112776,112778],{},[940,112777,108865],{"href":108864}," — write BOM-free UTF-8 to prevent downstream encoding errors",[4214,112780,112781,112783],{},[940,112782,26258],{"href":26257}," — full pipeline overview",[14,112785,6947,112786,3035],{},[940,112787,107447],{"href":9598},[6953,112789,112790],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: 
var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .s691h, html code.shiki .s691h{--shiki-default:#22863A;--shiki-default-font-weight:bold}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}",{"title":28,"searchDepth":43,"depth":43,"links":112792},[112793,112794,112795,112796,112797,112802,112806,112807,112808,112809,112810],{"id":7020,"depth":43,"text":7021},{"id":110426,"depth":43,"text":110427},{"id":110587,"depth":43,"text":110588},{"id":110796,"depth":43,"text":110797},{"id":110889,"depth":43,"text":110890,"children":112798},[112799,112800,112801],{"id":110893,"depth":61,"text":110894},{"id":111057,"depth":61,"text":111058},{"id":111205,"depth":61,"text":111206},{"id":111370,"depth":43,"text":111371,"children":112803},[112804,112805],{"id":111381,"depth":61,"text":111382},{"id":111596,"depth":61,"text":111597},{"id":111831,"depth":43,"text":111832},{"id":112259,"depth":43,"text":112260},{"id":112540,"depth":43,"text":112541},{"id":29070,"depth":43,"text":48994},{"id":6917,"depth":43,"text":6918},"Resolve UnicodeDecodeError 'utf-8' codec can't decode byte 0x96 in CSV files with chardet, charset-normalizer, BOM handling, and pandas encoding 
parameters.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Ffixing-encoding-errors-in-csv-files",{"title":110385,"description":112811},"python-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Ffixing-encoding-errors-in-csv-files\u002Findex",[47,107436,27249,9630],"CbgkKh4V58sNnvGjRkHaXmN40ZxUk-xUxWIuotafcpI",{"id":112819,"title":107447,"body":112820,"breadcrumbTitle":117963,"canonical":6977,"date":46387,"description":117964,"draft":6980,"extension":6981,"image":6977,"meta":117965,"navigation":91,"path":117966,"robots":6977,"seo":117967,"seoTitle":117968,"stem":117969,"tags":117970,"updatedAt":6978,"__hash__":117974},"content\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Findex.md",{"type":7,"value":112821,"toc":117941},[112822,112825,112837,112843,112845,112847,112853,112868,112871,112939,112941,112945,112948,113413,113427,113429,113431,113434,113438,113684,113696,113700,113703,113821,113825,113828,114193,114199,114203,114206,114546,114552,114554,114558,114561,114692,114694,114696,114700,114714,114857,114861,114874,115043,115047,115066,115142,115148,115150,115152,115155,115544,115546,115548,115557,115565,116053,116066,116082,116084,116086,116236,116238,116240,116248,117826,117828,117858,117860,117864,117879,117897,117903,117905,117907,117934,117938],[10,112823,107447],{"id":112824},"cleaning-messy-csv-data-with-pandas",[14,112826,112827,112828,2012,112830,112832,112833,112836],{},"Raw CSV exports from CRMs, ERPs, and accounting tools rarely arrive clean. Columns have inconsistent capitalisation, numeric fields contain stray currency symbols, date strings span five different formats, and \"empty\" rows are packed with whitespace or placeholder strings like ",[30,112829,27824],{},[30,112831,75122],{},". 
The default ",[30,112834,112835],{},"pd.read_csv()"," call silently mishandles all of it.",[14,112838,112839,112840,112842],{},"The broader ",[940,112841,26258],{"href":26257}," workflow covers multi-format ingestion — XLSX, ODS, and PDF sources — but this page focuses exclusively on flat-file remediation: diagnosing what is wrong before writing a single line of cleaning code, then applying targeted fixes in a reproducible order.",[2537,112844],{},[18,112846,21],{"id":20},[14,112848,112849,112850,112852],{},"You need Python 3.9+ and the following packages. ",[30,112851,110433],{}," is optional but helps when the file encoding is completely unknown.",[23,112854,112856],{"className":25,"code":112855,"language":27,"meta":28,"style":28},"pip install pandas chardet\n",[30,112857,112858],{"__ignoreMap":28},[33,112859,112860,112862,112864,112866],{"class":35,"line":36},[33,112861,76],{"class":46},[33,112863,79],{"class":54},[33,112865,16183],{"class":54},[33,112867,110461],{"class":54},[14,112869,112870],{},"To follow the examples, create a small test file that mimics real-world mess:",[23,112872,112874],{"className":25,"code":112873,"language":27,"meta":28,"style":28},"python - \u003C\u003C'EOF'\nfrom pathlib import Path\nPath(\"dirty.csv\").write_text(\n    \" Order_ID , Product Name , Quantity , Unit Price , Created_At , Status \\n\"\n    \"  ORD-001 , Widget A , 5 , $12.50 , 2024\u002F01\u002F15 , confirmed \\n\"\n    \"  ORD-002 , Widget B , , $8.00 , 15-Jan-2024 , N\u002FA \\n\"\n    \"  ORD-001 , Widget A , 5 , $12.50 , 2024\u002F01\u002F15 , confirmed \\n\"  # duplicate\n    \"  ORD-003 , Widget C , 2 , - , 2024-01-17 , CONFIRMED \\n\",\n    encoding=\"utf-8\",\n)\nprint(\"dirty.csv 
created\")\nEOF\n",[30,112875,112876,112886,112891,112896,112901,112906,112911,112916,112921,112926,112930,112935],{"__ignoreMap":28},[33,112877,112878,112880,112882,112884],{"class":35,"line":36},[33,112879,47],{"class":46},[33,112881,39025],{"class":54},[33,112883,53957],{"class":163},[33,112885,53960],{"class":54},[33,112887,112888],{"class":35,"line":43},[33,112889,112890],{"class":54},"from pathlib import Path\n",[33,112892,112893],{"class":35,"line":61},[33,112894,112895],{"class":54},"Path(\"dirty.csv\").write_text(\n",[33,112897,112898],{"class":35,"line":73},[33,112899,112900],{"class":54},"    \" Order_ID , Product Name , Quantity , Unit Price , Created_At , Status \\n\"\n",[33,112902,112903],{"class":35,"line":88},[33,112904,112905],{"class":54},"    \"  ORD-001 , Widget A , 5 , $12.50 , 2024\u002F01\u002F15 , confirmed \\n\"\n",[33,112907,112908],{"class":35,"line":95},[33,112909,112910],{"class":54},"    \"  ORD-002 , Widget B , , $8.00 , 15-Jan-2024 , N\u002FA \\n\"\n",[33,112912,112913],{"class":35,"line":101},[33,112914,112915],{"class":54},"    \"  ORD-001 , Widget A , 5 , $12.50 , 2024\u002F01\u002F15 , confirmed \\n\"  # duplicate\n",[33,112917,112918],{"class":35,"line":171},[33,112919,112920],{"class":54},"    \"  ORD-003 , Widget C , 2 , - , 2024-01-17 , CONFIRMED \\n\",\n",[33,112922,112923],{"class":35,"line":179},[33,112924,112925],{"class":54},"    encoding=\"utf-8\",\n",[33,112927,112928],{"class":35,"line":187},[33,112929,221],{"class":54},[33,112931,112932],{"class":35,"line":201},[33,112933,112934],{"class":54},"print(\"dirty.csv created\")\n",[33,112936,112937],{"class":35,"line":206},[33,112938,54019],{"class":54},[2537,112940],{},[18,112942,112944],{"id":112943},"diagnostic-step-inspect-before-you-clean","Diagnostic Step: Inspect Before You Clean",[14,112946,112947],{},"Blindly applying a cleaning function to an unknown file introduces silent errors. 
Spend thirty seconds classifying the file first: delimiter, encoding, row count, null density, and dtype inference failures.",[23,112949,112951],{"className":126,"code":112950,"language":47,"meta":28,"style":28},"# pip install pandas chardet\nimport pandas as pd\nimport chardet\nfrom pathlib import Path\n\n\ndef diagnose_csv(path: Path) -> dict:\n    \"\"\"Return a diagnostic summary without committing to a specific parse strategy.\"\"\"\n    raw_bytes = path.read_bytes()\n    detected = chardet.detect(raw_bytes[:10_000])\n\n    # Peek at the first line to guess delimiter\n    first_line = raw_bytes[:500].decode(detected[\"encoding\"] or \"utf-8\", errors=\"replace\")\n    guessed_sep = (\n        \";\" if first_line.count(\";\") > first_line.count(\",\")\n        else \"\\t\" if first_line.count(\"\\t\") > first_line.count(\",\")\n        else \",\"\n    )\n\n    try:\n        df = pd.read_csv(\n            path,\n            encoding=detected[\"encoding\"] or \"utf-8-sig\",\n            sep=None,\n            engine=\"python\",\n            nrows=500,\n        )\n        null_pct = df.isnull().mean().round(3).to_dict()\n        inferred_dtypes = df.dtypes.astype(str).to_dict()\n    except Exception as exc:\n        return {\"error\": str(exc)}\n\n    return {\n        \"encoding\": detected,\n        \"guessed_sep\": guessed_sep,\n        \"shape_sample\": df.shape,\n        \"columns\": list(df.columns),\n        \"null_pct\": null_pct,\n        \"inferred_dtypes\": inferred_dtypes,\n    }\n\n\nif __name__ == \"__main__\":\n    import json\n    report = diagnose_csv(Path(\"dirty.csv\"))\n    print(json.dumps(report, indent=2, 
default=str))\n",[30,112952,112953,112958,112968,112974,112984,112988,112992,113005,113010,113020,113033,113037,113042,113076,113085,113108,113138,113145,113149,113153,113159,113167,113172,113193,113204,113215,113226,113230,113245,113259,113269,113283,113287,113293,113301,113309,113317,113329,113337,113345,113349,113353,113357,113369,113375,113390],{"__ignoreMap":28},[33,112954,112955],{"class":35,"line":36},[33,112956,112957],{"class":39},"# pip install pandas chardet\n",[33,112959,112960,112962,112964,112966],{"class":35,"line":43},[33,112961,164],{"class":163},[33,112963,492],{"class":167},[33,112965,495],{"class":163},[33,112967,498],{"class":167},[33,112969,112970,112972],{"class":35,"line":61},[33,112971,164],{"class":163},[33,112973,110461],{"class":167},[33,112975,112976,112978,112980,112982],{"class":35,"line":73},[33,112977,190],{"class":163},[33,112979,193],{"class":167},[33,112981,164],{"class":163},[33,112983,198],{"class":167},[33,112985,112986],{"class":35,"line":88},[33,112987,92],{"emptyLinePlaceholder":91},[33,112989,112990],{"class":35,"line":95},[33,112991,92],{"emptyLinePlaceholder":91},[33,112993,112994,112996,112999,113001,113003],{"class":35,"line":101},[33,112995,562],{"class":163},[33,112997,112998],{"class":46}," diagnose_csv",[33,113000,3743],{"class":167},[33,113002,37100],{"class":50},[33,113004,574],{"class":167},[33,113006,113007],{"class":35,"line":171},[33,113008,113009],{"class":54},"    \"\"\"Return a diagnostic summary without committing to a specific parse strategy.\"\"\"\n",[33,113011,113012,113015,113017],{"class":35,"line":179},[33,113013,113014],{"class":167},"    raw_bytes ",[33,113016,242],{"class":163},[33,113018,113019],{"class":167}," path.read_bytes()\n",[33,113021,113022,113024,113026,113029,113031],{"class":35,"line":187},[33,113023,111978],{"class":167},[33,113025,242],{"class":163},[33,113027,113028],{"class":167}," 
chardet.detect(raw_bytes[:",[33,113030,104304],{"class":50},[33,113032,751],{"class":167},[33,113034,113035],{"class":35,"line":201},[33,113036,92],{"emptyLinePlaceholder":91},[33,113038,113039],{"class":35,"line":206},[33,113040,113041],{"class":39},"    # Peek at the first line to guess delimiter\n",[33,113043,113044,113047,113049,113052,113054,113057,113059,113061,113063,113066,113068,113070,113072,113074],{"class":35,"line":224},[33,113045,113046],{"class":167},"    first_line ",[33,113048,242],{"class":163},[33,113050,113051],{"class":167}," raw_bytes[:",[33,113053,13437],{"class":50},[33,113055,113056],{"class":167},"].decode(detected[",[33,113058,110696],{"class":54},[33,113060,763],{"class":167},[33,113062,7162],{"class":163},[33,113064,113065],{"class":54}," \"utf-8\"",[33,113067,365],{"class":167},[33,113069,8317],{"class":238},[33,113071,242],{"class":163},[33,113073,108399],{"class":54},[33,113075,221],{"class":167},[33,113077,113078,113081,113083],{"class":35,"line":229},[33,113079,113080],{"class":167},"    guessed_sep ",[33,113082,242],{"class":163},[33,113084,1415],{"class":167},[33,113086,113087,113090,113092,113095,113098,113100,113102,113104,113106],{"class":35,"line":235},[33,113088,113089],{"class":54},"        \";\"",[33,113091,9994],{"class":163},[33,113093,113094],{"class":167}," 
first_line.count(",[33,113096,113097],{"class":54},"\";\"",[33,113099,1649],{"class":167},[33,113101,6009],{"class":163},[33,113103,113094],{"class":167},[33,113105,15900],{"class":54},[33,113107,221],{"class":167},[33,113109,113110,113112,113114,113116,113118,113120,113122,113124,113126,113128,113130,113132,113134,113136],{"class":35,"line":250},[33,113111,41290],{"class":163},[33,113113,44625],{"class":54},[33,113115,80208],{"class":50},[33,113117,274],{"class":54},[33,113119,9994],{"class":163},[33,113121,113094],{"class":167},[33,113123,274],{"class":54},[33,113125,80208],{"class":50},[33,113127,274],{"class":54},[33,113129,1649],{"class":167},[33,113131,6009],{"class":163},[33,113133,113094],{"class":167},[33,113135,15900],{"class":54},[33,113137,221],{"class":167},[33,113139,113140,113142],{"class":35,"line":266},[33,113141,41290],{"class":163},[33,113143,113144],{"class":54}," \",\"\n",[33,113146,113147],{"class":35,"line":290},[33,113148,1202],{"class":167},[33,113150,113151],{"class":35,"line":295},[33,113152,92],{"emptyLinePlaceholder":91},[33,113154,113155,113157],{"class":35,"line":300},[33,113156,2424],{"class":163},[33,113158,574],{"class":167},[33,113160,113161,113163,113165],{"class":35,"line":317},[33,113162,7930],{"class":167},[33,113164,242],{"class":163},[33,113166,108706],{"class":167},[33,113168,113169],{"class":35,"line":332},[33,113170,113171],{"class":167},"            path,\n",[33,113173,113174,113177,113179,113182,113184,113186,113188,113191],{"class":35,"line":347},[33,113175,113176],{"class":238},"            encoding",[33,113178,242],{"class":163},[33,113180,113181],{"class":167},"detected[",[33,113183,110696],{"class":54},[33,113185,763],{"class":167},[33,113187,7162],{"class":163},[33,113189,113190],{"class":54}," \"utf-8-sig\"",[33,113192,247],{"class":167},[33,113194,113195,113198,113200,113202],{"class":35,"line":374},[33,113196,113197],{"class":238},"            
sep",[33,113199,242],{"class":163},[33,113201,571],{"class":50},[33,113203,247],{"class":167},[33,113205,113206,113209,113211,113213],{"class":35,"line":397},[33,113207,113208],{"class":238},"            engine",[33,113210,242],{"class":163},[33,113212,110985],{"class":54},[33,113214,247],{"class":167},[33,113216,113217,113220,113222,113224],{"class":35,"line":653},[33,113218,113219],{"class":238},"            nrows",[33,113221,242],{"class":163},[33,113223,13437],{"class":50},[33,113225,247],{"class":167},[33,113227,113228],{"class":35,"line":667},[33,113229,5867],{"class":167},[33,113231,113232,113235,113237,113240,113242],{"class":35,"line":675},[33,113233,113234],{"class":167},"        null_pct ",[33,113236,242],{"class":163},[33,113238,113239],{"class":167}," df.isnull().mean().round(",[33,113241,10258],{"class":50},[33,113243,113244],{"class":167},").to_dict()\n",[33,113246,113247,113250,113252,113255,113257],{"class":35,"line":689},[33,113248,113249],{"class":167},"        inferred_dtypes ",[33,113251,242],{"class":163},[33,113253,113254],{"class":167}," df.dtypes.astype(",[33,113256,1053],{"class":50},[33,113258,113244],{"class":167},[33,113260,113261,113263,113265,113267],{"class":35,"line":703},[33,113262,2449],{"class":163},[33,113264,783],{"class":50},[33,113266,1852],{"class":163},[33,113268,1855],{"class":167},[33,113270,113271,113273,113275,113277,113279,113281],{"class":35,"line":714},[33,113272,1659],{"class":163},[33,113274,4098],{"class":167},[33,113276,37333],{"class":54},[33,113278,2079],{"class":167},[33,113280,1053],{"class":50},[33,113282,71424],{"class":167},[33,113284,113285],{"class":35,"line":723},[33,113286,92],{"emptyLinePlaceholder":91},[33,113288,113289,113291],{"class":35,"line":754},[33,113290,1332],{"class":163},[33,113292,16265],{"class":167},[33,113294,113295,113298],{"class":35,"line":771},[33,113296,113297],{"class":54},"        \"encoding\"",[33,113299,113300],{"class":167},": 
detected,\n",[33,113302,113303,113306],{"class":35,"line":777},[33,113304,113305],{"class":54},"        \"guessed_sep\"",[33,113307,113308],{"class":167},": guessed_sep,\n",[33,113310,113311,113314],{"class":35,"line":788},[33,113312,113313],{"class":54},"        \"shape_sample\"",[33,113315,113316],{"class":167},": df.shape,\n",[33,113318,113319,113322,113324,113326],{"class":35,"line":804},[33,113320,113321],{"class":54},"        \"columns\"",[33,113323,2079],{"class":167},[33,113325,25066],{"class":50},[33,113327,113328],{"class":167},"(df.columns),\n",[33,113330,113331,113334],{"class":35,"line":809},[33,113332,113333],{"class":54},"        \"null_pct\"",[33,113335,113336],{"class":167},": null_pct,\n",[33,113338,113339,113342],{"class":35,"line":819},[33,113340,113341],{"class":54},"        \"inferred_dtypes\"",[33,113343,113344],{"class":167},": inferred_dtypes,\n",[33,113346,113347],{"class":35,"line":829},[33,113348,20781],{"class":167},[33,113350,113351],{"class":35,"line":834},[33,113352,92],{"emptyLinePlaceholder":91},[33,113354,113355],{"class":35,"line":839},[33,113356,92],{"emptyLinePlaceholder":91},[33,113358,113359,113361,113363,113365,113367],{"class":35,"line":860},[33,113360,2491],{"class":163},[33,113362,2494],{"class":50},[33,113364,2497],{"class":163},[33,113366,2500],{"class":54},[33,113368,574],{"class":167},[33,113370,113371,113373],{"class":35,"line":887},[33,113372,1627],{"class":163},[33,113374,3081],{"class":167},[33,113376,113377,113380,113382,113385,113388],{"class":35,"line":907},[33,113378,113379],{"class":167},"    report ",[33,113381,242],{"class":163},[33,113383,113384],{"class":167}," diagnose_csv(Path(",[33,113386,113387],{"class":54},"\"dirty.csv\"",[33,113389,371],{"class":167},[33,113391,113392,113394,113397,113399,113401,113403,113405,113407,113409,113411],{"class":35,"line":1826},[33,113393,7268],{"class":50},[33,113395,113396],{"class":167},"(json.dumps(report, 
",[33,113398,37382],{"class":238},[33,113400,242],{"class":163},[33,113402,1533],{"class":50},[33,113404,365],{"class":167},[33,113406,6685],{"class":238},[33,113408,242],{"class":163},[33,113410,1053],{"class":50},[33,113412,371],{"class":167},[14,113414,113415,113416,113419,113420,113423,113424,113426],{},"Run this before anything else. If ",[30,113417,113418],{},"shape_sample"," shows one column when you expected ten, the delimiter is wrong. If ",[30,113421,113422],{},"inferred_dtypes"," shows ",[30,113425,11888],{}," for a price column, there are non-numeric characters to strip. Use these findings to drive every subsequent decision.",[2537,113428],{},[18,113430,422],{"id":421},[14,113432,113433],{},"The steps below form an ordered pipeline. Each function is self-contained and can be unit-tested independently.",[424,113435,113437],{"id":113436},"step-1-load-with-encoding-fallback","Step 1 — Load with Encoding Fallback",[23,113439,113441],{"className":126,"code":113440,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nimport sys\nfrom pathlib import Path\n\n\ndef load_robust_csv(path: Path) -> pd.DataFrame:\n    \"\"\"Ingest CSV with automatic delimiter detection and encoding fallback.\"\"\"\n    try:\n        df = pd.read_csv(\n            path,\n            encoding=\"utf-8-sig\",\n            sep=None,\n            engine=\"python\",\n        )\n        print(\"[OK] Loaded with UTF-8-SIG encoding.\")\n        return df\n    except UnicodeDecodeError:\n        try:\n            df = pd.read_csv(\n                path,\n                encoding=\"latin-1\",\n                sep=None,\n                engine=\"python\",\n            )\n            print(\"[WARN] Fallback to Latin-1 encoding applied.\")\n            return df\n        except Exception as exc:\n            print(f\"[ERROR] Ingestion failed: {exc}\")\n            
sys.exit(1)\n",[30,113442,113443,113447,113457,113463,113473,113477,113481,113490,113495,113501,113509,113513,113523,113533,113543,113547,113558,113564,113572,113578,113586,113591,113602,113613,113624,113628,113639,113645,113655,113676],{"__ignoreMap":28},[33,113444,113445],{"class":35,"line":36},[33,113446,8895],{"class":39},[33,113448,113449,113451,113453,113455],{"class":35,"line":43},[33,113450,164],{"class":163},[33,113452,492],{"class":167},[33,113454,495],{"class":163},[33,113456,498],{"class":167},[33,113458,113459,113461],{"class":35,"line":61},[33,113460,164],{"class":163},[33,113462,168],{"class":167},[33,113464,113465,113467,113469,113471],{"class":35,"line":73},[33,113466,190],{"class":163},[33,113468,193],{"class":167},[33,113470,164],{"class":163},[33,113472,198],{"class":167},[33,113474,113475],{"class":35,"line":88},[33,113476,92],{"emptyLinePlaceholder":91},[33,113478,113479],{"class":35,"line":95},[33,113480,92],{"emptyLinePlaceholder":91},[33,113482,113483,113485,113488],{"class":35,"line":101},[33,113484,562],{"class":163},[33,113486,113487],{"class":46}," load_robust_csv",[33,113489,7103],{"class":167},[33,113491,113492],{"class":35,"line":171},[33,113493,113494],{"class":54},"    \"\"\"Ingest CSV with automatic delimiter detection and encoding 
fallback.\"\"\"\n",[33,113496,113497,113499],{"class":35,"line":179},[33,113498,2424],{"class":163},[33,113500,574],{"class":167},[33,113502,113503,113505,113507],{"class":35,"line":187},[33,113504,7930],{"class":167},[33,113506,242],{"class":163},[33,113508,108706],{"class":167},[33,113510,113511],{"class":35,"line":201},[33,113512,113171],{"class":167},[33,113514,113515,113517,113519,113521],{"class":35,"line":206},[33,113516,113176],{"class":238},[33,113518,242],{"class":163},[33,113520,108390],{"class":54},[33,113522,247],{"class":167},[33,113524,113525,113527,113529,113531],{"class":35,"line":224},[33,113526,113197],{"class":238},[33,113528,242],{"class":163},[33,113530,571],{"class":50},[33,113532,247],{"class":167},[33,113534,113535,113537,113539,113541],{"class":35,"line":229},[33,113536,113208],{"class":238},[33,113538,242],{"class":163},[33,113540,110985],{"class":54},[33,113542,247],{"class":167},[33,113544,113545],{"class":35,"line":235},[33,113546,5867],{"class":167},[33,113548,113549,113551,113553,113556],{"class":35,"line":250},[33,113550,9414],{"class":50},[33,113552,602],{"class":167},[33,113554,113555],{"class":54},"\"[OK] Loaded with UTF-8-SIG encoding.\"",[33,113557,221],{"class":167},[33,113559,113560,113562],{"class":35,"line":266},[33,113561,1659],{"class":163},[33,113563,11719],{"class":167},[33,113565,113566,113568,113570],{"class":35,"line":290},[33,113567,2449],{"class":163},[33,113569,112164],{"class":50},[33,113571,574],{"class":167},[33,113573,113574,113576],{"class":35,"line":295},[33,113575,670],{"class":163},[33,113577,574],{"class":167},[33,113579,113580,113582,113584],{"class":35,"line":300},[33,113581,51528],{"class":167},[33,113583,242],{"class":163},[33,113585,108706],{"class":167},[33,113587,113588],{"class":35,"line":317},[33,113589,113590],{"class":167},"                path,\n",[33,113592,113593,113596,113598,113600],{"class":35,"line":332},[33,113594,113595],{"class":238},"                
encoding",[33,113597,242],{"class":163},[33,113599,111149],{"class":54},[33,113601,247],{"class":167},[33,113603,113604,113607,113609,113611],{"class":35,"line":347},[33,113605,113606],{"class":238},"                sep",[33,113608,242],{"class":163},[33,113610,571],{"class":50},[33,113612,247],{"class":167},[33,113614,113615,113618,113620,113622],{"class":35,"line":374},[33,113616,113617],{"class":238},"                engine",[33,113619,242],{"class":163},[33,113621,110985],{"class":54},[33,113623,247],{"class":167},[33,113625,113626],{"class":35,"line":397},[33,113627,24021],{"class":167},[33,113629,113630,113632,113634,113637],{"class":35,"line":653},[33,113631,9364],{"class":50},[33,113633,602],{"class":167},[33,113635,113636],{"class":54},"\"[WARN] Fallback to Latin-1 encoding applied.\"",[33,113638,221],{"class":167},[33,113640,113641,113643],{"class":35,"line":667},[33,113642,28782],{"class":163},[33,113644,11719],{"class":167},[33,113646,113647,113649,113651,113653],{"class":35,"line":675},[33,113648,780],{"class":163},[33,113650,783],{"class":50},[33,113652,1852],{"class":163},[33,113654,1855],{"class":167},[33,113656,113657,113659,113661,113663,113666,113668,113670,113672,113674],{"class":35,"line":689},[33,113658,9364],{"class":50},[33,113660,602],{"class":167},[33,113662,4059],{"class":163},[33,113664,113665],{"class":54},"\"[ERROR] Ingestion failed: ",[33,113667,1115],{"class":50},[33,113669,6565],{"class":167},[33,113671,1121],{"class":50},[33,113673,274],{"class":54},[33,113675,221],{"class":167},[33,113677,113678,113680,113682],{"class":35,"line":703},[33,113679,41280],{"class":167},[33,113681,734],{"class":50},[33,113683,221],{"class":167},[14,113685,113686,113687,113689,113690,113692,113693,113695],{},"For persistent ",[30,113688,53911],{}," failures from legacy system exports, ",[940,113691,27254],{"href":27253}," covers BOM stripping, ",[30,113694,110433],{}," fallback chains, and multi-encoding detection in 
depth.",[424,113697,113699],{"id":113698},"step-2-normalize-headers","Step 2 — Normalize Headers",[14,113701,113702],{},"Column names with leading\u002Ftrailing whitespace silently break column lookups and merges. Fix them immediately after load, before any dtype work.",[23,113704,113706],{"className":126,"code":113705,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef normalize_headers(df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"Strip whitespace, lowercase, and snake_case column names.\"\"\"\n    df.columns = (\n        df.columns\n        .str.strip()\n        .str.lower()\n        .str.replace(r\"[^\\w]+\", \"_\", regex=True)\n        .str.strip(\"_\")\n    )\n    return df\n",[30,113707,113708,113712,113722,113726,113730,113739,113744,113752,113757,113762,113767,113802,113811,113815],{"__ignoreMap":28},[33,113709,113710],{"class":35,"line":36},[33,113711,8895],{"class":39},[33,113713,113714,113716,113718,113720],{"class":35,"line":43},[33,113715,164],{"class":163},[33,113717,492],{"class":167},[33,113719,495],{"class":163},[33,113721,498],{"class":167},[33,113723,113724],{"class":35,"line":61},[33,113725,92],{"emptyLinePlaceholder":91},[33,113727,113728],{"class":35,"line":73},[33,113729,92],{"emptyLinePlaceholder":91},[33,113731,113732,113734,113737],{"class":35,"line":88},[33,113733,562],{"class":163},[33,113735,113736],{"class":46}," normalize_headers",[33,113738,12127],{"class":167},[33,113740,113741],{"class":35,"line":95},[33,113742,113743],{"class":54},"    \"\"\"Strip whitespace, lowercase, and snake_case column names.\"\"\"\n",[33,113745,113746,113748,113750],{"class":35,"line":101},[33,113747,27546],{"class":167},[33,113749,242],{"class":163},[33,113751,1415],{"class":167},[33,113753,113754],{"class":35,"line":171},[33,113755,113756],{"class":167},"        df.columns\n",[33,113758,113759],{"class":35,"line":179},[33,113760,113761],{"class":167},"        
.str.strip()\n",[33,113763,113764],{"class":35,"line":187},[33,113765,113766],{"class":167},"        .str.lower()\n",[33,113768,113769,113772,113774,113776,113778,113781,113784,113786,113788,113790,113792,113794,113796,113798,113800],{"class":35,"line":201},[33,113770,113771],{"class":167},"        .str.replace(",[33,113773,11977],{"class":163},[33,113775,274],{"class":54},[33,113777,8309],{"class":50},[33,113779,113780],{"class":163},"^",[33,113782,113783],{"class":50},"\\w]",[33,113785,1811],{"class":163},[33,113787,274],{"class":54},[33,113789,365],{"class":167},[33,113791,7764],{"class":54},[33,113793,365],{"class":167},[33,113795,11993],{"class":238},[33,113797,242],{"class":163},[33,113799,855],{"class":50},[33,113801,221],{"class":167},[33,113803,113804,113807,113809],{"class":35,"line":206},[33,113805,113806],{"class":167},"        .str.strip(",[33,113808,7764],{"class":54},[33,113810,221],{"class":167},[33,113812,113813],{"class":35,"line":224},[33,113814,1202],{"class":167},[33,113816,113817,113819],{"class":35,"line":229},[33,113818,1332],{"class":163},[33,113820,11719],{"class":167},[424,113822,113824],{"id":113823},"step-3-coerce-data-types","Step 3 — Coerce Data Types",[14,113826,113827],{},"CSVs have no schema. Pandas infers types row-by-row, which is slow and wrong whenever a column contains mixed content. 
Define explicit types and let coercion errors surface rather than silently convert.",[23,113829,113831],{"className":126,"code":113830,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nimport re\n\n\ndef coerce_types(df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"Enforce explicit dtypes for known columns; parse dates separately.\"\"\"\n    # Strip currency symbols and commas from price-like columns\n    for col in df.columns:\n        if \"price\" in col or \"amount\" in col or \"cost\" in col:\n            df[col] = (\n                df[col]\n                .astype(str)\n                .str.replace(r\"[^\\d.\\-]\", \"\", regex=True)\n                .replace(\"\", pd.NA)\n            )\n\n    type_map = {\n        \"order_id\": \"string\",\n        \"quantity\": \"Int64\",       # Nullable integer — survives NaN\n        \"unit_price\": \"float64\",\n        \"status\": \"string\",\n    }\n    existing = {col: dtype for col, dtype in type_map.items() if col in df.columns}\n    df = df.astype(existing, errors=\"ignore\")\n\n    # Dates require pd.to_datetime, not astype\n    for col in (\"created_at\", \"updated_at\", \"order_date\"):\n        if col in df.columns:\n            df[col] = pd.to_datetime(df[col], format=\"mixed\", dayfirst=False, errors=\"coerce\")\n\n    return 
df\n",[30,113832,113833,113837,113847,113853,113857,113861,113870,113875,113880,113890,113920,113928,113933,113942,113979,113992,113996,114000,114009,114020,114035,114047,114058,114062,114091,114108,114112,114117,114141,114151,114183,114187],{"__ignoreMap":28},[33,113834,113835],{"class":35,"line":36},[33,113836,8895],{"class":39},[33,113838,113839,113841,113843,113845],{"class":35,"line":43},[33,113840,164],{"class":163},[33,113842,492],{"class":167},[33,113844,495],{"class":163},[33,113846,498],{"class":167},[33,113848,113849,113851],{"class":35,"line":61},[33,113850,164],{"class":163},[33,113852,11917],{"class":167},[33,113854,113855],{"class":35,"line":73},[33,113856,92],{"emptyLinePlaceholder":91},[33,113858,113859],{"class":35,"line":88},[33,113860,92],{"emptyLinePlaceholder":91},[33,113862,113863,113865,113868],{"class":35,"line":95},[33,113864,562],{"class":163},[33,113866,113867],{"class":46}," coerce_types",[33,113869,12127],{"class":167},[33,113871,113872],{"class":35,"line":101},[33,113873,113874],{"class":54},"    \"\"\"Enforce explicit dtypes for known columns; parse dates separately.\"\"\"\n",[33,113876,113877],{"class":35,"line":171},[33,113878,113879],{"class":39},"    # Strip currency symbols and commas from price-like columns\n",[33,113881,113882,113884,113886,113888],{"class":35,"line":179},[33,113883,656],{"class":163},[33,113885,7985],{"class":167},[33,113887,662],{"class":163},[33,113889,8005],{"class":167},[33,113891,113892,113894,113897,113899,113901,113903,113906,113908,113910,113912,113915,113917],{"class":35,"line":187},[33,113893,8221],{"class":163},[33,113895,113896],{"class":54}," \"price\"",[33,113898,8002],{"class":163},[33,113900,7985],{"class":167},[33,113902,7162],{"class":163},[33,113904,113905],{"class":54}," \"amount\"",[33,113907,8002],{"class":163},[33,113909,7985],{"class":167},[33,113911,7162],{"class":163},[33,113913,113914],{"class":54}," \"cost\"",[33,113916,8002],{"class":163},[33,113918,113919],{"class":167}," 
col:\n",[33,113921,113922,113924,113926],{"class":35,"line":201},[33,113923,11690],{"class":167},[33,113925,242],{"class":163},[33,113927,1415],{"class":167},[33,113929,113930],{"class":35,"line":206},[33,113931,113932],{"class":167},"                df[col]\n",[33,113934,113935,113938,113940],{"class":35,"line":224},[33,113936,113937],{"class":167},"                .astype(",[33,113939,1053],{"class":50},[33,113941,221],{"class":167},[33,113943,113944,113947,113949,113951,113953,113955,113958,113961,113963,113965,113967,113969,113971,113973,113975,113977],{"class":35,"line":229},[33,113945,113946],{"class":167},"                .str.replace(",[33,113948,11977],{"class":163},[33,113950,274],{"class":54},[33,113952,8309],{"class":50},[33,113954,113780],{"class":163},[33,113956,113957],{"class":50},"\\d.",[33,113959,113960],{"class":12018},"\\-",[33,113962,9546],{"class":50},[33,113964,274],{"class":54},[33,113966,365],{"class":167},[33,113968,3198],{"class":54},[33,113970,365],{"class":167},[33,113972,11993],{"class":238},[33,113974,242],{"class":163},[33,113976,855],{"class":50},[33,113978,221],{"class":167},[33,113980,113981,113984,113986,113988,113990],{"class":35,"line":235},[33,113982,113983],{"class":167},"                .replace(",[33,113985,3198],{"class":54},[33,113987,10884],{"class":167},[33,113989,8018],{"class":50},[33,113991,221],{"class":167},[33,113993,113994],{"class":35,"line":250},[33,113995,24021],{"class":167},[33,113997,113998],{"class":35,"line":266},[33,113999,92],{"emptyLinePlaceholder":91},[33,114001,114002,114005,114007],{"class":35,"line":290},[33,114003,114004],{"class":167},"    type_map ",[33,114006,242],{"class":163},[33,114008,16265],{"class":167},[33,114010,114011,114014,114016,114018],{"class":35,"line":295},[33,114012,114013],{"class":54},"        
\"order_id\"",[33,114015,2079],{"class":167},[33,114017,27358],{"class":54},[33,114019,247],{"class":167},[33,114021,114022,114025,114027,114030,114032],{"class":35,"line":300},[33,114023,114024],{"class":54},"        \"quantity\"",[33,114026,2079],{"class":167},[33,114028,114029],{"class":54},"\"Int64\"",[33,114031,25445],{"class":167},[33,114033,114034],{"class":39},"# Nullable integer — survives NaN\n",[33,114036,114037,114040,114042,114045],{"class":35,"line":317},[33,114038,114039],{"class":54},"        \"unit_price\"",[33,114041,2079],{"class":167},[33,114043,114044],{"class":54},"\"float64\"",[33,114046,247],{"class":167},[33,114048,114049,114052,114054,114056],{"class":35,"line":332},[33,114050,114051],{"class":54},"        \"status\"",[33,114053,2079],{"class":167},[33,114055,27358],{"class":54},[33,114057,247],{"class":167},[33,114059,114060],{"class":35,"line":347},[33,114061,20781],{"class":167},[33,114063,114064,114067,114069,114072,114074,114077,114079,114082,114084,114086,114088],{"class":35,"line":374},[33,114065,114066],{"class":167},"    existing ",[33,114068,242],{"class":163},[33,114070,114071],{"class":167}," {col: dtype ",[33,114073,6124],{"class":163},[33,114075,114076],{"class":167}," col, dtype ",[33,114078,662],{"class":163},[33,114080,114081],{"class":167}," type_map.items() ",[33,114083,2491],{"class":163},[33,114085,7985],{"class":167},[33,114087,662],{"class":163},[33,114089,114090],{"class":167}," df.columns}\n",[33,114092,114093,114095,114097,114100,114102,114104,114106],{"class":35,"line":397},[33,114094,4025],{"class":167},[33,114096,242],{"class":163},[33,114098,114099],{"class":167}," df.astype(existing, ",[33,114101,8317],{"class":238},[33,114103,242],{"class":163},[33,114105,8322],{"class":54},[33,114107,221],{"class":167},[33,114109,114110],{"class":35,"line":653},[33,114111,92],{"emptyLinePlaceholder":91},[33,114113,114114],{"class":35,"line":667},[33,114115,114116],{"class":39},"    # Dates require pd.to_datetime, not 
astype\n",[33,114118,114119,114121,114123,114125,114127,114130,114132,114135,114137,114139],{"class":35,"line":675},[33,114120,656],{"class":163},[33,114122,7985],{"class":167},[33,114124,662],{"class":163},[33,114126,17583],{"class":167},[33,114128,114129],{"class":54},"\"created_at\"",[33,114131,365],{"class":167},[33,114133,114134],{"class":54},"\"updated_at\"",[33,114136,365],{"class":167},[33,114138,108767],{"class":54},[33,114140,1737],{"class":167},[33,114142,114143,114145,114147,114149],{"class":35,"line":689},[33,114144,8221],{"class":163},[33,114146,7985],{"class":167},[33,114148,662],{"class":163},[33,114150,8005],{"class":167},[33,114152,114153,114155,114157,114159,114161,114163,114165,114167,114169,114171,114173,114175,114177,114179,114181],{"class":35,"line":703},[33,114154,11690],{"class":167},[33,114156,242],{"class":163},[33,114158,15392],{"class":167},[33,114160,61926],{"class":238},[33,114162,242],{"class":163},[33,114164,96267],{"class":54},[33,114166,365],{"class":167},[33,114168,27683],{"class":238},[33,114170,242],{"class":163},[33,114172,902],{"class":50},[33,114174,365],{"class":167},[33,114176,8317],{"class":238},[33,114178,242],{"class":163},[33,114180,12107],{"class":54},[33,114182,221],{"class":167},[33,114184,114185],{"class":35,"line":714},[33,114186,92],{"emptyLinePlaceholder":91},[33,114188,114189,114191],{"class":35,"line":723},[33,114190,1332],{"class":163},[33,114192,11719],{"class":167},[14,114194,114195,114196,114198],{},"Unlike workbook-based sources — where reading data is covered in ",[940,114197,99577],{"href":99576}," — CSVs carry no cell-level format metadata, so all type inference must be explicit.",[424,114200,114202],{"id":114201},"step-4-handle-missing-values-and-duplicates","Step 4 — Handle Missing Values and Duplicates",[14,114204,114205],{},"Blank rows, placeholder strings, and duplicate records corrupt aggregations. 
The order matters: replace placeholders first, then impute or drop, then deduplicate.",[23,114207,114209],{"className":126,"code":114208,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef remediate_records(df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"Replace placeholder strings, impute categorical gaps, and deduplicate.\"\"\"\n    PLACEHOLDERS = [\"\", \"N\u002FA\", \"n\u002Fa\", \"unknown\", \"-\", \"none\", \"null\", \"na\"]\n\n    # Replace placeholders with proper NA across the whole frame\n    df = df.replace(PLACEHOLDERS, pd.NA)\n\n    # Forward-fill categorical columns where a gap means \"same as previous\"\n    for col in (\"status\", \"region\", \"shipping_method\"):\n        if col in df.columns:\n            df[col] = df[col].ffill()\n\n    # Drop rows missing any mandatory key\n    mandatory = [c for c in (\"order_id\", \"quantity\") if c in df.columns]\n    if mandatory:\n        df = df.dropna(subset=mandatory, how=\"any\")\n\n    # Sort before dedup so the latest record wins\n    if \"created_at\" in df.columns:\n        df = df.sort_values(\"created_at\", na_position=\"last\")\n\n    if \"order_id\" in df.columns:\n        df = df.drop_duplicates(subset=[\"order_id\"], keep=\"last\")\n\n    return 
df.reset_index(drop=True)\n",[30,114210,114211,114215,114225,114229,114233,114242,114247,114291,114295,114300,114317,114321,114326,114349,114359,114367,114371,114376,114410,114417,114442,114446,114451,114462,114485,114489,114500,114527,114531],{"__ignoreMap":28},[33,114212,114213],{"class":35,"line":36},[33,114214,8895],{"class":39},[33,114216,114217,114219,114221,114223],{"class":35,"line":43},[33,114218,164],{"class":163},[33,114220,492],{"class":167},[33,114222,495],{"class":163},[33,114224,498],{"class":167},[33,114226,114227],{"class":35,"line":61},[33,114228,92],{"emptyLinePlaceholder":91},[33,114230,114231],{"class":35,"line":73},[33,114232,92],{"emptyLinePlaceholder":91},[33,114234,114235,114237,114240],{"class":35,"line":88},[33,114236,562],{"class":163},[33,114238,114239],{"class":46}," remediate_records",[33,114241,12127],{"class":167},[33,114243,114244],{"class":35,"line":95},[33,114245,114246],{"class":54},"    \"\"\"Replace placeholder strings, impute categorical gaps, and deduplicate.\"\"\"\n",[33,114248,114249,114252,114254,114256,114258,114260,114262,114264,114266,114268,114271,114273,114275,114277,114279,114281,114284,114286,114289],{"class":35,"line":101},[33,114250,114251],{"class":50},"    PLACEHOLDERS",[33,114253,212],{"class":163},[33,114255,9178],{"class":167},[33,114257,3198],{"class":54},[33,114259,365],{"class":167},[33,114261,27824],{"class":54},[33,114263,365],{"class":167},[33,114265,12438],{"class":54},[33,114267,365],{"class":167},[33,114269,114270],{"class":54},"\"unknown\"",[33,114272,365],{"class":167},[33,114274,75122],{"class":54},[33,114276,365],{"class":167},[33,114278,104825],{"class":54},[33,114280,365],{"class":167},[33,114282,114283],{"class":54},"\"null\"",[33,114285,365],{"class":167},[33,114287,114288],{"class":54},"\"na\"",[33,114290,9202],{"class":167},[33,114292,114293],{"class":35,"line":171},[33,114294,92],{"emptyLinePlaceholder":91},[33,114296,114297],{"class":35,"line":179},[33,114298,114299],{"class":39},"    # 
Replace placeholders with proper NA across the whole frame\n",[33,114301,114302,114304,114306,114308,114311,114313,114315],{"class":35,"line":187},[33,114303,4025],{"class":167},[33,114305,242],{"class":163},[33,114307,111517],{"class":167},[33,114309,114310],{"class":50},"PLACEHOLDERS",[33,114312,10884],{"class":167},[33,114314,8018],{"class":50},[33,114316,221],{"class":167},[33,114318,114319],{"class":35,"line":201},[33,114320,92],{"emptyLinePlaceholder":91},[33,114322,114323],{"class":35,"line":206},[33,114324,114325],{"class":39},"    # Forward-fill categorical columns where a gap means \"same as previous\"\n",[33,114327,114328,114330,114332,114334,114336,114338,114340,114342,114344,114347],{"class":35,"line":224},[33,114329,656],{"class":163},[33,114331,7985],{"class":167},[33,114333,662],{"class":163},[33,114335,17583],{"class":167},[33,114337,43379],{"class":54},[33,114339,365],{"class":167},[33,114341,16649],{"class":54},[33,114343,365],{"class":167},[33,114345,114346],{"class":54},"\"shipping_method\"",[33,114348,1737],{"class":167},[33,114350,114351,114353,114355,114357],{"class":35,"line":229},[33,114352,8221],{"class":163},[33,114354,7985],{"class":167},[33,114356,662],{"class":163},[33,114358,8005],{"class":167},[33,114360,114361,114363,114365],{"class":35,"line":235},[33,114362,11690],{"class":167},[33,114364,242],{"class":163},[33,114366,11712],{"class":167},[33,114368,114369],{"class":35,"line":250},[33,114370,92],{"emptyLinePlaceholder":91},[33,114372,114373],{"class":35,"line":266},[33,114374,114375],{"class":39},"    # Drop rows missing any mandatory key\n",[33,114377,114378,114381,114383,114385,114387,114389,114391,114393,114395,114397,114400,114402,114404,114406,114408],{"class":35,"line":290},[33,114379,114380],{"class":167},"    mandatory 
",[33,114382,242],{"class":163},[33,114384,7740],{"class":167},[33,114386,6124],{"class":163},[33,114388,7486],{"class":167},[33,114390,662],{"class":163},[33,114392,17583],{"class":167},[33,114394,108849],{"class":54},[33,114396,365],{"class":167},[33,114398,114399],{"class":54},"\"quantity\"",[33,114401,1649],{"class":167},[33,114403,2491],{"class":163},[33,114405,7486],{"class":167},[33,114407,662],{"class":163},[33,114409,12624],{"class":167},[33,114411,114412,114414],{"class":35,"line":295},[33,114413,617],{"class":163},[33,114415,114416],{"class":167}," mandatory:\n",[33,114418,114419,114421,114423,114426,114428,114430,114433,114435,114437,114440],{"class":35,"line":300},[33,114420,7930],{"class":167},[33,114422,242],{"class":163},[33,114424,114425],{"class":167}," df.dropna(",[33,114427,28066],{"class":238},[33,114429,242],{"class":163},[33,114431,114432],{"class":167},"mandatory, ",[33,114434,28045],{"class":238},[33,114436,242],{"class":163},[33,114438,114439],{"class":54},"\"any\"",[33,114441,221],{"class":167},[33,114443,114444],{"class":35,"line":317},[33,114445,92],{"emptyLinePlaceholder":91},[33,114447,114448],{"class":35,"line":332},[33,114449,114450],{"class":39},"    # Sort before dedup so the latest record wins\n",[33,114452,114453,114455,114458,114460],{"class":35,"line":347},[33,114454,617],{"class":163},[33,114456,114457],{"class":54}," \"created_at\"",[33,114459,8002],{"class":163},[33,114461,8005],{"class":167},[33,114463,114464,114466,114468,114471,114473,114475,114478,114480,114483],{"class":35,"line":374},[33,114465,7930],{"class":167},[33,114467,242],{"class":163},[33,114469,114470],{"class":167}," 
df.sort_values(",[33,114472,114129],{"class":54},[33,114474,365],{"class":167},[33,114476,114477],{"class":238},"na_position",[33,114479,242],{"class":163},[33,114481,114482],{"class":54},"\"last\"",[33,114484,221],{"class":167},[33,114486,114487],{"class":35,"line":397},[33,114488,92],{"emptyLinePlaceholder":91},[33,114490,114491,114493,114496,114498],{"class":35,"line":653},[33,114492,617],{"class":163},[33,114494,114495],{"class":54}," \"order_id\"",[33,114497,8002],{"class":163},[33,114499,8005],{"class":167},[33,114501,114502,114504,114506,114509,114511,114513,114515,114517,114519,114521,114523,114525],{"class":35,"line":667},[33,114503,7930],{"class":167},[33,114505,242],{"class":163},[33,114507,114508],{"class":167}," df.drop_duplicates(",[33,114510,28066],{"class":238},[33,114512,242],{"class":163},[33,114514,8309],{"class":167},[33,114516,108849],{"class":54},[33,114518,8314],{"class":167},[33,114520,28077],{"class":238},[33,114522,242],{"class":163},[33,114524,114482],{"class":54},[33,114526,221],{"class":167},[33,114528,114529],{"class":35,"line":675},[33,114530,92],{"emptyLinePlaceholder":91},[33,114532,114533,114535,114538,114540,114542,114544],{"class":35,"line":689},[33,114534,1332],{"class":163},[33,114536,114537],{"class":167}," df.reset_index(",[33,114539,10868],{"class":238},[33,114541,242],{"class":163},[33,114543,855],{"class":50},[33,114545,221],{"class":167},[14,114547,114548,114549,114551],{},"This cleaned frame is the reliable foundation for downstream tasks such as ",[940,114550,108865],{"href":108864},", where strict type alignment is required for correct serialisation.",[2537,114553],{},[18,114555,114557],{"id":114556},"dirty-clean-pipeline","Dirty → Clean Pipeline",[14,114559,114560],{},"The diagram below shows how a raw CSV file flows through each transformation stage before reaching a clean 
output.",[2540,114562,2547,114564,2547,114567,2547,114570,2547,2547,114584,2547,2547,114586,2547,114589,2547,114593,2547,114596,2547,2547,114600,2547,2547,114603,2547,114606,2547,114609,2547,114612,2547,2547,114616,2547,2547,114618,2547,114621,2547,114624,2547,114627,2547,2547,114630,2547,2547,114634,2547,114638,2547,114641,2547,114644,2547,2547,114647,2547,2547,114651,2547,114655,2547,114659,2547,114662,2547,114665,2547,2547,114668,2547,2547,114670,2547,114675,2547,114679,2547,114682,2547,114685,114689],{"viewBox":58288,"role":2543,"ariaLabel":114563,"xmlns":2545,"style":2546},"Dirty to clean CSV transformation pipeline with six stages",[2549,114565,114566],{},"CSV Cleaning Pipeline",[2553,114568,114569],{},"A left-to-right flow diagram showing a raw CSV file passing through five transformation stages — Detect Encoding, Normalize Headers, Coerce Types, Handle Missing & Dedup — to produce a clean CSV output.",[2557,114571,2559,114572,2559,114579,2547],{},[2561,114573,2564,114575,2564,114577,2559],{"id":114574,"x1":748,"y1":748,"x2":734,"y2":748},"clean-csv-grad",[2566,114576],{"offset":748,"style":2568},[2566,114578],{"offset":734,"style":2571},[2573,114580,2564,114582,2559],{"id":114581,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"clean-csv-arrow",[2580,114583],{"d":2582,"fill":2583},[2585,114585],{"width":107506,"height":2618,"fill":2615,"rx":2591},[2585,114587],{"id":114588,"x":19368,"y":16357,"width":38741,"height":38748,"rx":2681,"fill":2592,"stroke":2593,"style":2594},"clean-csv-n1",[2000,114590,114592],{"x":89047,"y":26402,"fill":2599,"style":114591},"text-anchor:middle;font-size:11px;font-weight:600","Raw CSV",[2000,114594,114595],{"x":89047,"y":2609,"fill":2583,"style":11127},"Mixed encoding",[2000,114597,114599],{"x":89047,"y":114598,"fill":2583,"style":11127},"153","bad 
delimiters",[35,114601],{"x1":2679,"y1":2588,"x2":11214,"y2":2588,"stroke":2583,"markerEnd":114602,"style":2594},"url(#clean-csv-arrow)",[2585,114604],{"id":114605,"x":2588,"y":16357,"width":11099,"height":38748,"rx":2681,"fill":2592,"stroke":2593,"style":2594},"clean-csv-n2",[2000,114607,114608],{"x":2643,"y":26402,"fill":2599,"style":114591},"Detect",[2000,114610,114611],{"x":2643,"y":49842,"fill":2599,"style":114591},"Encoding",[2000,114613,114615],{"x":2643,"y":114614,"fill":2583,"style":11127},"154","chardet + fallback",[35,114617],{"x1":64880,"y1":2588,"x2":26415,"y2":2588,"stroke":2583,"markerEnd":114602,"style":2594},[2585,114619],{"id":114620,"x":64929,"y":16357,"width":11099,"height":38748,"rx":2681,"fill":2592,"stroke":2593,"style":2594},"clean-csv-n3",[2000,114622,114623],{"x":89111,"y":26402,"fill":2599,"style":114591},"Normalize",[2000,114625,114626],{"x":89111,"y":49842,"fill":2599,"style":114591},"Headers",[2000,114628,114629],{"x":89111,"y":114614,"fill":2583,"style":11127},"strip · lower · snake",[35,114631],{"x1":114632,"y1":2588,"x2":114633,"y2":2588,"stroke":2583,"markerEnd":114602,"style":2594},"366","384",[2585,114635],{"id":114636,"x":114637,"y":16357,"width":11099,"height":38748,"rx":2681,"fill":2592,"stroke":2593,"style":2594},"clean-csv-n4","386",[2000,114639,114640],{"x":107520,"y":26402,"fill":2599,"style":114591},"Coerce",[2000,114642,114643],{"x":107520,"y":49842,"fill":2599,"style":114591},"Types",[2000,114645,114646],{"x":107520,"y":114614,"fill":2583,"style":11127},"int · float · 
datetime",[35,114648],{"x1":114649,"y1":2588,"x2":114650,"y2":2588,"stroke":2583,"markerEnd":114602,"style":2594},"494","512",[2585,114652],{"id":114653,"x":114654,"y":16357,"width":11099,"height":38748,"rx":2681,"fill":2592,"stroke":2593,"style":2594},"clean-csv-n5","514",[2000,114656,114658],{"x":114657,"y":11095,"fill":2599,"style":114591},"568","Handle",[2000,114660,114661],{"x":114657,"y":26345,"fill":2599,"style":114591},"Missing",[2000,114663,114664],{"x":114657,"y":11218,"fill":2599,"style":114591},"& Dedup",[2000,114666,114667],{"x":114657,"y":2610,"fill":2583,"style":11127},"ffill · dropna · dedup",[35,114669],{"x1":64915,"y1":2588,"x2":71573,"y2":2588,"stroke":2583,"markerEnd":114602,"style":2594},[2585,114671],{"id":114672,"x":114673,"y":16357,"width":2650,"height":38748,"rx":2681,"fill":114674,"stroke":11166,"style":2594},"clean-csv-n6","642","url(#clean-csv-grad)",[2000,114676,114678],{"x":114677,"y":26402,"fill":2592,"style":114591},"692","Clean CSV",[2000,114680,114681],{"x":114677,"y":2609,"fill":11165,"style":11127},"typed · deduped",[2000,114683,114684],{"x":114677,"y":114598,"fill":11165,"style":11127},"validated",[2000,114686,114688],{"x":89047,"y":17018,"fill":2583,"style":114687},"text-anchor:middle;font-size:9px","\nINPUT\n",[2000,114690,114691],{"x":114677,"y":17018,"fill":11166,"style":114687},"\nOUTPUT\n",[2537,114693],{},[18,114695,2709],{"id":2708},[424,114697,114699],{"id":114698},"variant-1-semi-colon-or-tab-delimiters","Variant 1 — Semi-colon or Tab Delimiters",[14,114701,114702,114703,114706,114707,114709,114710,114713],{},"European locale exports frequently use ",[30,114704,114705],{},";"," as a delimiter because ",[30,114708,63503],{}," is the decimal separator. 
Sniff it explicitly rather than relying on ",[30,114711,114712],{},"sep=None",", which can misfire on files with unquoted commas inside fields.",[23,114715,114717],{"className":126,"code":114716,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\n\ndef load_with_known_sep(path: Path, sep: str = \";\") -> pd.DataFrame:\n    \"\"\"Load a CSV with an explicitly known delimiter.\"\"\"\n    try:\n        return pd.read_csv(path, sep=sep, encoding=\"utf-8-sig\", dtype_backend=\"numpy_nullable\")\n    except Exception as exc:\n        raise RuntimeError(f\"Failed to load {path}: {exc}\") from exc\n",[30,114718,114719,114723,114733,114743,114747,114751,114770,114775,114781,114813,114823],{"__ignoreMap":28},[33,114720,114721],{"class":35,"line":36},[33,114722,8895],{"class":39},[33,114724,114725,114727,114729,114731],{"class":35,"line":43},[33,114726,164],{"class":163},[33,114728,492],{"class":167},[33,114730,495],{"class":163},[33,114732,498],{"class":167},[33,114734,114735,114737,114739,114741],{"class":35,"line":61},[33,114736,190],{"class":163},[33,114738,193],{"class":167},[33,114740,164],{"class":163},[33,114742,198],{"class":167},[33,114744,114745],{"class":35,"line":73},[33,114746,92],{"emptyLinePlaceholder":91},[33,114748,114749],{"class":35,"line":88},[33,114750,92],{"emptyLinePlaceholder":91},[33,114752,114753,114755,114758,114761,114763,114765,114768],{"class":35,"line":95},[33,114754,562],{"class":163},[33,114756,114757],{"class":46}," load_with_known_sep",[33,114759,114760],{"class":167},"(path: Path, sep: ",[33,114762,1053],{"class":50},[33,114764,212],{"class":163},[33,114766,114767],{"class":54}," \";\"",[33,114769,7668],{"class":167},[33,114771,114772],{"class":35,"line":101},[33,114773,114774],{"class":54},"    \"\"\"Load a CSV with an explicitly known 
delimiter.\"\"\"\n",[33,114776,114777,114779],{"class":35,"line":171},[33,114778,2424],{"class":163},[33,114780,574],{"class":167},[33,114782,114783,114785,114787,114790,114792,114795,114797,114799,114801,114803,114806,114808,114811],{"class":35,"line":179},[33,114784,1659],{"class":163},[33,114786,27411],{"class":167},[33,114788,114789],{"class":238},"sep",[33,114791,242],{"class":163},[33,114793,114794],{"class":167},"sep, ",[33,114796,27249],{"class":238},[33,114798,242],{"class":163},[33,114800,108390],{"class":54},[33,114802,365],{"class":167},[33,114804,114805],{"class":238},"dtype_backend",[33,114807,242],{"class":163},[33,114809,114810],{"class":54},"\"numpy_nullable\"",[33,114812,221],{"class":167},[33,114814,114815,114817,114819,114821],{"class":35,"line":187},[33,114816,2449],{"class":163},[33,114818,783],{"class":50},[33,114820,1852],{"class":163},[33,114822,1855],{"class":167},[33,114824,114825,114827,114829,114831,114833,114835,114837,114839,114841,114843,114845,114847,114849,114851,114853,114855],{"class":35,"line":201},[33,114826,4051],{"class":163},[33,114828,7590],{"class":50},[33,114830,602],{"class":167},[33,114832,4059],{"class":163},[33,114834,108800],{"class":54},[33,114836,1115],{"class":50},[33,114838,2580],{"class":167},[33,114840,1121],{"class":50},[33,114842,2079],{"class":54},[33,114844,1115],{"class":50},[33,114846,6565],{"class":167},[33,114848,1121],{"class":50},[33,114850,274],{"class":54},[33,114852,1649],{"class":167},[33,114854,190],{"class":163},[33,114856,20843],{"class":167},[424,114858,114860],{"id":114859},"variant-2-quoted-fields-containing-newlines","Variant 2 — Quoted Fields Containing Newlines",[14,114862,114863,114864,114866,114867,10065,114870,114873],{},"Some export tools wrap multi-line text in double quotes without escaping embedded newlines, causing ",[30,114865,57237],{}," to split rows mid-record. 
The ",[30,114868,114869],{},"quoting",[30,114871,114872],{},"quotechar"," parameters restore correct row boundaries.",[23,114875,114877],{"className":126,"code":114876,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nimport csv\nfrom pathlib import Path\n\n\ndef load_multiline_csv(path: Path) -> pd.DataFrame:\n    \"\"\"Handle CSVs where fields contain embedded newlines.\"\"\"\n    try:\n        return pd.read_csv(\n            path,\n            quoting=csv.QUOTE_ALL,\n            quotechar='\"',\n            encoding=\"utf-8-sig\",\n            engine=\"python\",\n        )\n    except Exception as exc:\n        raise RuntimeError(f\"Could not parse {path}: {exc}\") from exc\n",[30,114878,114879,114883,114893,114899,114909,114913,114917,114926,114931,114937,114943,114947,114962,114974,114984,114994,114998,115008],{"__ignoreMap":28},[33,114880,114881],{"class":35,"line":36},[33,114882,8895],{"class":39},[33,114884,114885,114887,114889,114891],{"class":35,"line":43},[33,114886,164],{"class":163},[33,114888,492],{"class":167},[33,114890,495],{"class":163},[33,114892,498],{"class":167},[33,114894,114895,114897],{"class":35,"line":61},[33,114896,164],{"class":163},[33,114898,107673],{"class":167},[33,114900,114901,114903,114905,114907],{"class":35,"line":73},[33,114902,190],{"class":163},[33,114904,193],{"class":167},[33,114906,164],{"class":163},[33,114908,198],{"class":167},[33,114910,114911],{"class":35,"line":88},[33,114912,92],{"emptyLinePlaceholder":91},[33,114914,114915],{"class":35,"line":95},[33,114916,92],{"emptyLinePlaceholder":91},[33,114918,114919,114921,114924],{"class":35,"line":101},[33,114920,562],{"class":163},[33,114922,114923],{"class":46}," load_multiline_csv",[33,114925,7103],{"class":167},[33,114927,114928],{"class":35,"line":171},[33,114929,114930],{"class":54},"    \"\"\"Handle CSVs where fields contain embedded 
newlines.\"\"\"\n",[33,114932,114933,114935],{"class":35,"line":179},[33,114934,2424],{"class":163},[33,114936,574],{"class":167},[33,114938,114939,114941],{"class":35,"line":187},[33,114940,1659],{"class":163},[33,114942,108706],{"class":167},[33,114944,114945],{"class":35,"line":201},[33,114946,113171],{"class":167},[33,114948,114949,114952,114954,114957,114960],{"class":35,"line":206},[33,114950,114951],{"class":238},"            quoting",[33,114953,242],{"class":163},[33,114955,114956],{"class":167},"csv.",[33,114958,114959],{"class":50},"QUOTE_ALL",[33,114961,247],{"class":167},[33,114963,114964,114967,114969,114972],{"class":35,"line":224},[33,114965,114966],{"class":238},"            quotechar",[33,114968,242],{"class":163},[33,114970,114971],{"class":54},"'\"'",[33,114973,247],{"class":167},[33,114975,114976,114978,114980,114982],{"class":35,"line":229},[33,114977,113176],{"class":238},[33,114979,242],{"class":163},[33,114981,108390],{"class":54},[33,114983,247],{"class":167},[33,114985,114986,114988,114990,114992],{"class":35,"line":235},[33,114987,113208],{"class":238},[33,114989,242],{"class":163},[33,114991,110985],{"class":54},[33,114993,247],{"class":167},[33,114995,114996],{"class":35,"line":250},[33,114997,5867],{"class":167},[33,114999,115000,115002,115004,115006],{"class":35,"line":266},[33,115001,2449],{"class":163},[33,115003,783],{"class":50},[33,115005,1852],{"class":163},[33,115007,1855],{"class":167},[33,115009,115010,115012,115014,115016,115018,115021,115023,115025,115027,115029,115031,115033,115035,115037,115039,115041],{"class":35,"line":290},[33,115011,4051],{"class":163},[33,115013,7590],{"class":50},[33,115015,602],{"class":167},[33,115017,4059],{"class":163},[33,115019,115020],{"class":54},"\"Could not parse 
",[33,115022,1115],{"class":50},[33,115024,2580],{"class":167},[33,115026,1121],{"class":50},[33,115028,2079],{"class":54},[33,115030,1115],{"class":50},[33,115032,6565],{"class":167},[33,115034,1121],{"class":50},[33,115036,274],{"class":54},[33,115038,1649],{"class":167},[33,115040,190],{"class":163},[33,115042,20843],{"class":167},[424,115044,115046],{"id":115045},"variant-3-mixed-date-formats-in-a-single-column","Variant 3 — Mixed Date Formats in a Single Column",[14,115048,115049,115050,365,115053,71132,115056,115059,115060,8877,115063,115065],{},"When a date column contains ",[30,115051,115052],{},"2024\u002F01\u002F15",[30,115054,115055],{},"15-Jan-2024",[30,115057,115058],{},"01-15-2024"," in the same file — a common outcome of merging exports from different regions — ",[30,115061,115062],{},"pd.to_datetime",[30,115064,97762],{}," handles it without a custom parser.",[23,115067,115069],{"className":126,"code":115068,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\ndef parse_mixed_dates(series: pd.Series) -> pd.Series:\n    \"\"\"Parse a column containing multiple date format strings.\"\"\"\n    return pd.to_datetime(series, format=\"mixed\", dayfirst=False, errors=\"coerce\")\n\n# Usage: df[\"created_at\"] = parse_mixed_dates(df[\"created_at\"])\n",[30,115070,115071,115075,115085,115089,115098,115103,115133,115137],{"__ignoreMap":28},[33,115072,115073],{"class":35,"line":36},[33,115074,8895],{"class":39},[33,115076,115077,115079,115081,115083],{"class":35,"line":43},[33,115078,164],{"class":163},[33,115080,492],{"class":167},[33,115082,495],{"class":163},[33,115084,498],{"class":167},[33,115086,115087],{"class":35,"line":61},[33,115088,92],{"emptyLinePlaceholder":91},[33,115090,115091,115093,115096],{"class":35,"line":73},[33,115092,562],{"class":163},[33,115094,115095],{"class":46}," parse_mixed_dates",[33,115097,11945],{"class":167},[33,115099,115100],{"class":35,"line":88},[33,115101,115102],{"class":54},"    \"\"\"Parse 
a column containing multiple date format strings.\"\"\"\n",[33,115104,115105,115107,115109,115111,115113,115115,115117,115119,115121,115123,115125,115127,115129,115131],{"class":35,"line":95},[33,115106,1332],{"class":163},[33,115108,12271],{"class":167},[33,115110,61926],{"class":238},[33,115112,242],{"class":163},[33,115114,96267],{"class":54},[33,115116,365],{"class":167},[33,115118,27683],{"class":238},[33,115120,242],{"class":163},[33,115122,902],{"class":50},[33,115124,365],{"class":167},[33,115126,8317],{"class":238},[33,115128,242],{"class":163},[33,115130,12107],{"class":54},[33,115132,221],{"class":167},[33,115134,115135],{"class":35,"line":101},[33,115136,92],{"emptyLinePlaceholder":91},[33,115138,115139],{"class":35,"line":171},[33,115140,115141],{"class":39},"# Usage: df[\"created_at\"] = parse_mixed_dates(df[\"created_at\"])\n",[14,115143,115144,115145,115147],{},"Data arriving from non-tabular sources such as PDFs often needs this same treatment — ",[940,115146,948],{"href":947}," shows how to handle that upstream step.",[2537,115149],{},[18,115151,52030],{"id":52029},[14,115153,115154],{},"Assert correctness after cleaning. 
Assertions that fail loudly are better than a silently corrupted output file.",[23,115156,115158],{"className":126,"code":115157,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\n\ndef validate_dataframe(df: pd.DataFrame, min_rows: int = 10) -> None:\n    \"\"\"Assert row count, dtype expectations, and null thresholds.\"\"\"\n    assert len(df) >= min_rows, f\"Only {len(df)} rows — expected at least {min_rows}\"\n\n    # Datetime column must not be all-null after parsing\n    if \"created_at\" in df.columns:\n        null_date_pct = df[\"created_at\"].isnull().mean()\n        assert null_date_pct \u003C 0.2, f\"created_at is {null_date_pct:.0%} null — date parsing likely failed\"\n\n    # Numeric columns must not be object dtype\n    for col in (\"quantity\", \"unit_price\"):\n        if col in df.columns:\n            assert df[col].dtype != object, f\"Column {col!r} is still object dtype — coercion failed\"\n\n    # No fully duplicate rows should survive\n    dup_count = df.duplicated().sum()\n    assert dup_count == 0, f\"{dup_count} fully duplicate rows remain\"\n\n    # Spot-sample: at least one non-null order_id\n    if \"order_id\" in df.columns:\n        assert df[\"order_id\"].notna().any(), \"order_id column is entirely null\"\n\n    print(f\"[PASS] Validation passed — {len(df)} rows, {df.shape[1]} columns\")\n    print(f\"       Memory: {df.memory_usage(deep=True).sum() \u002F 1024**2:.2f} 
MB\")\n",[30,115159,115160,115164,115174,115184,115188,115192,115213,115218,115252,115256,115261,115271,115285,115315,115319,115324,115343,115353,115384,115388,115393,115403,115430,115434,115439,115449,115463,115467,115502],{"__ignoreMap":28},[33,115161,115162],{"class":35,"line":36},[33,115163,8895],{"class":39},[33,115165,115166,115168,115170,115172],{"class":35,"line":43},[33,115167,164],{"class":163},[33,115169,492],{"class":167},[33,115171,495],{"class":163},[33,115173,498],{"class":167},[33,115175,115176,115178,115180,115182],{"class":35,"line":61},[33,115177,190],{"class":163},[33,115179,193],{"class":167},[33,115181,164],{"class":163},[33,115183,198],{"class":167},[33,115185,115186],{"class":35,"line":73},[33,115187,92],{"emptyLinePlaceholder":91},[33,115189,115190],{"class":35,"line":88},[33,115191,92],{"emptyLinePlaceholder":91},[33,115193,115194,115196,115198,115201,115203,115205,115207,115209,115211],{"class":35,"line":95},[33,115195,562],{"class":163},[33,115197,12540],{"class":46},[33,115199,115200],{"class":167},"(df: pd.DataFrame, min_rows: ",[33,115202,1059],{"class":50},[33,115204,212],{"class":163},[33,115206,37265],{"class":50},[33,115208,1617],{"class":167},[33,115210,571],{"class":50},[33,115212,574],{"class":167},[33,115214,115215],{"class":35,"line":101},[33,115216,115217],{"class":54},"    \"\"\"Assert row count, dtype expectations, and null thresholds.\"\"\"\n",[33,115219,115220,115222,115224,115226,115228,115231,115233,115236,115238,115240,115242,115244,115246,115248,115250],{"class":35,"line":171},[33,115221,9228],{"class":163},[33,115223,4037],{"class":50},[33,115225,4040],{"class":167},[33,115227,43000],{"class":163},[33,115229,115230],{"class":167}," min_rows, ",[33,115232,4059],{"class":163},[33,115234,115235],{"class":54},"\"Only 
",[33,115237,4065],{"class":50},[33,115239,4068],{"class":167},[33,115241,1121],{"class":50},[33,115243,4073],{"class":54},[33,115245,1115],{"class":50},[33,115247,4078],{"class":167},[33,115249,1121],{"class":50},[33,115251,7504],{"class":54},[33,115253,115254],{"class":35,"line":179},[33,115255,92],{"emptyLinePlaceholder":91},[33,115257,115258],{"class":35,"line":187},[33,115259,115260],{"class":39},"    # Datetime column must not be all-null after parsing\n",[33,115262,115263,115265,115267,115269],{"class":35,"line":201},[33,115264,617],{"class":163},[33,115266,114457],{"class":54},[33,115268,8002],{"class":163},[33,115270,8005],{"class":167},[33,115272,115273,115276,115278,115280,115282],{"class":35,"line":206},[33,115274,115275],{"class":167},"        null_date_pct ",[33,115277,242],{"class":163},[33,115279,7935],{"class":167},[33,115281,114129],{"class":54},[33,115283,115284],{"class":167},"].isnull().mean()\n",[33,115286,115287,115289,115292,115294,115296,115298,115300,115303,115305,115308,115310,115312],{"class":35,"line":224},[33,115288,21485],{"class":163},[33,115290,115291],{"class":167}," null_date_pct ",[33,115293,4043],{"class":163},[33,115295,46243],{"class":50},[33,115297,365],{"class":167},[33,115299,4059],{"class":163},[33,115301,115302],{"class":54},"\"created_at is ",[33,115304,1115],{"class":50},[33,115306,115307],{"class":167},"null_date_pct",[33,115309,12775],{"class":163},[33,115311,1121],{"class":50},[33,115313,115314],{"class":54}," null — date parsing likely failed\"\n",[33,115316,115317],{"class":35,"line":229},[33,115318,92],{"emptyLinePlaceholder":91},[33,115320,115321],{"class":35,"line":235},[33,115322,115323],{"class":39},"    # Numeric columns must not be object 
dtype\n",[33,115325,115326,115328,115330,115332,115334,115336,115338,115341],{"class":35,"line":250},[33,115327,656],{"class":163},[33,115329,7985],{"class":167},[33,115331,662],{"class":163},[33,115333,17583],{"class":167},[33,115335,114399],{"class":54},[33,115337,365],{"class":167},[33,115339,115340],{"class":54},"\"unit_price\"",[33,115342,1737],{"class":167},[33,115344,115345,115347,115349,115351],{"class":35,"line":266},[33,115346,8221],{"class":163},[33,115348,7985],{"class":167},[33,115350,662],{"class":163},[33,115352,8005],{"class":167},[33,115354,115355,115358,115361,115363,115366,115368,115370,115373,115375,115377,115379,115381],{"class":35,"line":290},[33,115356,115357],{"class":163},"            assert",[33,115359,115360],{"class":167}," df[col].dtype ",[33,115362,17877],{"class":163},[33,115364,115365],{"class":50}," object",[33,115367,365],{"class":167},[33,115369,4059],{"class":163},[33,115371,115372],{"class":54},"\"Column ",[33,115374,1115],{"class":50},[33,115376,8276],{"class":167},[33,115378,76954],{"class":163},[33,115380,1121],{"class":50},[33,115382,115383],{"class":54}," is still object dtype — coercion failed\"\n",[33,115385,115386],{"class":35,"line":295},[33,115387,92],{"emptyLinePlaceholder":91},[33,115389,115390],{"class":35,"line":300},[33,115391,115392],{"class":39},"    # No fully duplicate rows should survive\n",[33,115394,115395,115398,115400],{"class":35,"line":317},[33,115396,115397],{"class":167},"    dup_count ",[33,115399,242],{"class":163},[33,115401,115402],{"class":167}," df.duplicated().sum()\n",[33,115404,115405,115407,115410,115412,115414,115416,115418,115420,115422,115425,115427],{"class":35,"line":332},[33,115406,9228],{"class":163},[33,115408,115409],{"class":167}," dup_count 
",[33,115411,1865],{"class":163},[33,115413,10791],{"class":50},[33,115415,365],{"class":167},[33,115417,4059],{"class":163},[33,115419,274],{"class":54},[33,115421,1115],{"class":50},[33,115423,115424],{"class":167},"dup_count",[33,115426,1121],{"class":50},[33,115428,115429],{"class":54}," fully duplicate rows remain\"\n",[33,115431,115432],{"class":35,"line":347},[33,115433,92],{"emptyLinePlaceholder":91},[33,115435,115436],{"class":35,"line":374},[33,115437,115438],{"class":39},"    # Spot-sample: at least one non-null order_id\n",[33,115440,115441,115443,115445,115447],{"class":35,"line":397},[33,115442,617],{"class":163},[33,115444,114495],{"class":54},[33,115446,8002],{"class":163},[33,115448,8005],{"class":167},[33,115450,115451,115453,115455,115457,115460],{"class":35,"line":653},[33,115452,21485],{"class":163},[33,115454,7935],{"class":167},[33,115456,108849],{"class":54},[33,115458,115459],{"class":167},"].notna().any(), ",[33,115461,115462],{"class":54},"\"order_id column is entirely null\"\n",[33,115464,115465],{"class":35,"line":667},[33,115466,92],{"emptyLinePlaceholder":91},[33,115468,115469,115471,115473,115475,115478,115480,115482,115484,115487,115489,115491,115493,115495,115497,115500],{"class":35,"line":675},[33,115470,7268],{"class":50},[33,115472,602],{"class":167},[33,115474,4059],{"class":163},[33,115476,115477],{"class":54},"\"[PASS] Validation passed — ",[33,115479,4065],{"class":50},[33,115481,4068],{"class":167},[33,115483,1121],{"class":50},[33,115485,115486],{"class":54}," rows, ",[33,115488,1115],{"class":50},[33,115490,9541],{"class":167},[33,115492,734],{"class":50},[33,115494,9546],{"class":167},[33,115496,1121],{"class":50},[33,115498,115499],{"class":54}," 
columns\"",[33,115501,221],{"class":167},[33,115503,115504,115506,115508,115510,115513,115515,115518,115521,115523,115525,115528,115530,115532,115534,115536,115538,115540,115542],{"class":35,"line":689},[33,115505,7268],{"class":50},[33,115507,602],{"class":167},[33,115509,4059],{"class":163},[33,115511,115512],{"class":54},"\"       Memory: ",[33,115514,1115],{"class":50},[33,115516,115517],{"class":167},"df.memory_usage(",[33,115519,115520],{"class":238},"deep",[33,115522,242],{"class":163},[33,115524,855],{"class":50},[33,115526,115527],{"class":167},").sum() ",[33,115529,1351],{"class":163},[33,115531,1159],{"class":50},[33,115533,1775],{"class":163},[33,115535,1533],{"class":50},[33,115537,55819],{"class":163},[33,115539,1121],{"class":50},[33,115541,107874],{"class":54},[33,115543,221],{"class":167},[2537,115545],{},[18,115547,21810],{"id":21809},[14,115549,115550,115551,115553,115554,115556],{},"For files under ~500 MB, a direct ",[30,115552,112835],{}," call with an explicit ",[30,115555,23262],{}," dict is fastest. Beyond that, RAM becomes the constraint.",[14,115558,115559,80730,115562,115564],{},[1974,115560,115561],{},"Chunked processing",[30,115563,21944],{}," keeps memory flat at the cost of losing global operations (sort, dedup across chunks):",[23,115566,115568],{"className":126,"code":115567,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\n\ndef process_large_csv(path: Path, chunk_size: int = 50_000) -> pd.DataFrame:\n    \"\"\"Memory-efficient chunked ingestion. 
Dedup happens per-chunk only.\"\"\"\n    if not path.exists():\n        raise FileNotFoundError(f\"File not found: {path}\")\n\n    dtype_map = {\"region\": \"category\", \"product_sku\": \"category\", \"status\": \"category\"}\n    cleaned_chunks = []\n\n    try:\n        reader = pd.read_csv(\n            path,\n            chunksize=chunk_size,\n            dtype=dtype_map,\n            encoding=\"utf-8-sig\",\n            sep=None,\n            engine=\"python\",\n        )\n        for i, chunk in enumerate(reader):\n            chunk.columns = (\n                chunk.columns.str.strip().str.lower().str.replace(r\"[^\\w]+\", \"_\", regex=True)\n            )\n            chunk = chunk.dropna(subset=[\"order_id\"], how=\"any\")\n            chunk = chunk.drop_duplicates(subset=[\"order_id\"], keep=\"last\")\n            cleaned_chunks.append(chunk)\n            print(f\"[PROGRESS] Chunk {i + 1}: {len(chunk)} rows\")\n    except Exception as exc:\n        raise RuntimeError(f\"Chunked read failed: {exc}\") from exc\n\n    result = pd.concat(cleaned_chunks, ignore_index=True) if cleaned_chunks else pd.DataFrame()\n    # Global dedup after concat — catches cross-chunk duplicates\n    if \"order_id\" in result.columns:\n        result = result.drop_duplicates(subset=[\"order_id\"], keep=\"last\")\n    return 
result.reset_index(drop=True)\n",[30,115569,115570,115574,115584,115594,115598,115602,115620,115625,115633,115655,115659,115694,115703,115707,115713,115721,115725,115735,115745,115755,115765,115775,115779,115793,115802,115835,115839,115866,115893,115898,115929,115939,115965,115969,115995,116000,116011,116038],{"__ignoreMap":28},[33,115571,115572],{"class":35,"line":36},[33,115573,8895],{"class":39},[33,115575,115576,115578,115580,115582],{"class":35,"line":43},[33,115577,164],{"class":163},[33,115579,492],{"class":167},[33,115581,495],{"class":163},[33,115583,498],{"class":167},[33,115585,115586,115588,115590,115592],{"class":35,"line":61},[33,115587,190],{"class":163},[33,115589,193],{"class":167},[33,115591,164],{"class":163},[33,115593,198],{"class":167},[33,115595,115596],{"class":35,"line":73},[33,115597,92],{"emptyLinePlaceholder":91},[33,115599,115600],{"class":35,"line":88},[33,115601,92],{"emptyLinePlaceholder":91},[33,115603,115604,115606,115609,115612,115614,115616,115618],{"class":35,"line":95},[33,115605,562],{"class":163},[33,115607,115608],{"class":46}," process_large_csv",[33,115610,115611],{"class":167},"(path: Path, chunk_size: ",[33,115613,1059],{"class":50},[33,115615,212],{"class":163},[33,115617,21880],{"class":50},[33,115619,7668],{"class":167},[33,115621,115622],{"class":35,"line":101},[33,115623,115624],{"class":54},"    \"\"\"Memory-efficient chunked ingestion. 
Dedup happens per-chunk only.\"\"\"\n",[33,115626,115627,115629,115631],{"class":35,"line":171},[33,115628,617],{"class":163},[33,115630,620],{"class":163},[33,115632,27318],{"class":167},[33,115634,115635,115637,115639,115641,115643,115645,115647,115649,115651,115653],{"class":35,"line":179},[33,115636,4051],{"class":163},[33,115638,2945],{"class":50},[33,115640,602],{"class":167},[33,115642,4059],{"class":163},[33,115644,15677],{"class":54},[33,115646,1115],{"class":50},[33,115648,2580],{"class":167},[33,115650,1121],{"class":50},[33,115652,274],{"class":54},[33,115654,221],{"class":167},[33,115656,115657],{"class":35,"line":187},[33,115658,92],{"emptyLinePlaceholder":91},[33,115660,115661,115664,115666,115668,115670,115672,115675,115677,115680,115682,115684,115686,115688,115690,115692],{"class":35,"line":201},[33,115662,115663],{"class":167},"    dtype_map ",[33,115665,242],{"class":163},[33,115667,4098],{"class":167},[33,115669,16649],{"class":54},[33,115671,2079],{"class":167},[33,115673,115674],{"class":54},"\"category\"",[33,115676,365],{"class":167},[33,115678,115679],{"class":54},"\"product_sku\"",[33,115681,2079],{"class":167},[33,115683,115674],{"class":54},[33,115685,365],{"class":167},[33,115687,43379],{"class":54},[33,115689,2079],{"class":167},[33,115691,115674],{"class":54},[33,115693,4113],{"class":167},[33,115695,115696,115699,115701],{"class":35,"line":206},[33,115697,115698],{"class":167},"    cleaned_chunks 
",[33,115700,242],{"class":163},[33,115702,589],{"class":167},[33,115704,115705],{"class":35,"line":224},[33,115706,92],{"emptyLinePlaceholder":91},[33,115708,115709,115711],{"class":35,"line":229},[33,115710,2424],{"class":163},[33,115712,574],{"class":167},[33,115714,115715,115717,115719],{"class":35,"line":235},[33,115716,62484],{"class":167},[33,115718,242],{"class":163},[33,115720,108706],{"class":167},[33,115722,115723],{"class":35,"line":250},[33,115724,113171],{"class":167},[33,115726,115727,115730,115732],{"class":35,"line":266},[33,115728,115729],{"class":238},"            chunksize",[33,115731,242],{"class":163},[33,115733,115734],{"class":167},"chunk_size,\n",[33,115736,115737,115740,115742],{"class":35,"line":290},[33,115738,115739],{"class":238},"            dtype",[33,115741,242],{"class":163},[33,115743,115744],{"class":167},"dtype_map,\n",[33,115746,115747,115749,115751,115753],{"class":35,"line":295},[33,115748,113176],{"class":238},[33,115750,242],{"class":163},[33,115752,108390],{"class":54},[33,115754,247],{"class":167},[33,115756,115757,115759,115761,115763],{"class":35,"line":300},[33,115758,113197],{"class":238},[33,115760,242],{"class":163},[33,115762,571],{"class":50},[33,115764,247],{"class":167},[33,115766,115767,115769,115771,115773],{"class":35,"line":317},[33,115768,113208],{"class":238},[33,115770,242],{"class":163},[33,115772,110985],{"class":54},[33,115774,247],{"class":167},[33,115776,115777],{"class":35,"line":332},[33,115778,5867],{"class":167},[33,115780,115781,115783,115786,115788,115790],{"class":35,"line":347},[33,115782,5973],{"class":163},[33,115784,115785],{"class":167}," i, chunk ",[33,115787,662],{"class":163},[33,115789,7403],{"class":50},[33,115791,115792],{"class":167},"(reader):\n",[33,115794,115795,115798,115800],{"class":35,"line":374},[33,115796,115797],{"class":167},"            chunk.columns 
",[33,115799,242],{"class":163},[33,115801,1415],{"class":167},[33,115803,115804,115807,115809,115811,115813,115815,115817,115819,115821,115823,115825,115827,115829,115831,115833],{"class":35,"line":397},[33,115805,115806],{"class":167},"                chunk.columns.str.strip().str.lower().str.replace(",[33,115808,11977],{"class":163},[33,115810,274],{"class":54},[33,115812,8309],{"class":50},[33,115814,113780],{"class":163},[33,115816,113783],{"class":50},[33,115818,1811],{"class":163},[33,115820,274],{"class":54},[33,115822,365],{"class":167},[33,115824,7764],{"class":54},[33,115826,365],{"class":167},[33,115828,11993],{"class":238},[33,115830,242],{"class":163},[33,115832,855],{"class":50},[33,115834,221],{"class":167},[33,115836,115837],{"class":35,"line":653},[33,115838,24021],{"class":167},[33,115840,115841,115843,115845,115848,115850,115852,115854,115856,115858,115860,115862,115864],{"class":35,"line":667},[33,115842,70220],{"class":167},[33,115844,242],{"class":163},[33,115846,115847],{"class":167}," chunk.dropna(",[33,115849,28066],{"class":238},[33,115851,242],{"class":163},[33,115853,8309],{"class":167},[33,115855,108849],{"class":54},[33,115857,8314],{"class":167},[33,115859,28045],{"class":238},[33,115861,242],{"class":163},[33,115863,114439],{"class":54},[33,115865,221],{"class":167},[33,115867,115868,115870,115872,115875,115877,115879,115881,115883,115885,115887,115889,115891],{"class":35,"line":675},[33,115869,70220],{"class":167},[33,115871,242],{"class":163},[33,115873,115874],{"class":167}," chunk.drop_duplicates(",[33,115876,28066],{"class":238},[33,115878,242],{"class":163},[33,115880,8309],{"class":167},[33,115882,108849],{"class":54},[33,115884,8314],{"class":167},[33,115886,28077],{"class":238},[33,115888,242],{"class":163},[33,115890,114482],{"class":54},[33,115892,221],{"class":167},[33,115894,115895],{"class":35,"line":689},[33,115896,115897],{"class":167},"            
cleaned_chunks.append(chunk)\n",[33,115899,115900,115902,115904,115906,115909,115911,115913,115915,115917,115919,115921,115923,115925,115927],{"class":35,"line":703},[33,115901,9364],{"class":50},[33,115903,602],{"class":167},[33,115905,4059],{"class":163},[33,115907,115908],{"class":54},"\"[PROGRESS] Chunk ",[33,115910,1115],{"class":50},[33,115912,11017],{"class":167},[33,115914,1811],{"class":163},[33,115916,11022],{"class":50},[33,115918,2079],{"class":54},[33,115920,4065],{"class":50},[33,115922,70435],{"class":167},[33,115924,1121],{"class":50},[33,115926,65937],{"class":54},[33,115928,221],{"class":167},[33,115930,115931,115933,115935,115937],{"class":35,"line":714},[33,115932,2449],{"class":163},[33,115934,783],{"class":50},[33,115936,1852],{"class":163},[33,115938,1855],{"class":167},[33,115940,115941,115943,115945,115947,115949,115951,115953,115955,115957,115959,115961,115963],{"class":35,"line":723},[33,115942,4051],{"class":163},[33,115944,7590],{"class":50},[33,115946,602],{"class":167},[33,115948,4059],{"class":163},[33,115950,109796],{"class":54},[33,115952,1115],{"class":50},[33,115954,6565],{"class":167},[33,115956,1121],{"class":50},[33,115958,274],{"class":54},[33,115960,1649],{"class":167},[33,115962,190],{"class":163},[33,115964,20843],{"class":167},[33,115966,115967],{"class":35,"line":754},[33,115968,92],{"emptyLinePlaceholder":91},[33,115970,115971,115973,115975,115978,115980,115982,115984,115986,115988,115991,115993],{"class":35,"line":771},[33,115972,8842],{"class":167},[33,115974,242],{"class":163},[33,115976,115977],{"class":167}," pd.concat(cleaned_chunks, ",[33,115979,850],{"class":238},[33,115981,242],{"class":163},[33,115983,855],{"class":50},[33,115985,1649],{"class":167},[33,115987,2491],{"class":163},[33,115989,115990],{"class":167}," cleaned_chunks ",[33,115992,7489],{"class":163},[33,115994,7721],{"class":167},[33,115996,115997],{"class":35,"line":777},[33,115998,115999],{"class":39},"    # Global dedup after concat — catches 
cross-chunk duplicates\n",[33,116001,116002,116004,116006,116008],{"class":35,"line":788},[33,116003,617],{"class":163},[33,116005,114495],{"class":54},[33,116007,8002],{"class":163},[33,116009,116010],{"class":167}," result.columns:\n",[33,116012,116013,116015,116017,116020,116022,116024,116026,116028,116030,116032,116034,116036],{"class":35,"line":804},[33,116014,87961],{"class":167},[33,116016,242],{"class":163},[33,116018,116019],{"class":167}," result.drop_duplicates(",[33,116021,28066],{"class":238},[33,116023,242],{"class":163},[33,116025,8309],{"class":167},[33,116027,108849],{"class":54},[33,116029,8314],{"class":167},[33,116031,28077],{"class":238},[33,116033,242],{"class":163},[33,116035,114482],{"class":54},[33,116037,221],{"class":167},[33,116039,116040,116042,116045,116047,116049,116051],{"class":35,"line":809},[33,116041,1332],{"class":163},[33,116043,116044],{"class":167}," result.reset_index(",[33,116046,10868],{"class":238},[33,116048,242],{"class":163},[33,116050,855],{"class":50},[33,116052,221],{"class":167},[14,116054,116055,116058,116059,116062,116063,116065],{},[1974,116056,116057],{},"Category dtype"," reduces memory for low-cardinality string columns (region codes, SKUs, status values) by up to 80%. Pass ",[30,116060,116061],{},"dtype={\"col\": \"category\"}"," in the initial ",[30,116064,57237],{}," call rather than converting afterwards — conversion after load does not reclaim the original object memory in the same GC cycle.",[14,116067,116068,116071,116072,116074,116075,116078,116079,116081],{},[1974,116069,116070],{},"Out-of-core alternatives",": For files that exceed available RAM even with chunking, consider ",[30,116073,107441],{}," (lazy evaluation, columnar memory layout) or ",[30,116076,116077],{},"dask.dataframe",", which mirrors the pandas API while deferring computation. 
The ",[940,116080,107425],{"href":110423}," page compares these options across file sizes and operation types.",[2537,116083],{},[18,116085,4271],{"id":4270},[4273,116087,116088,116100],{},[4276,116089,116090],{},[4279,116091,116092,116095,116098],{},[4282,116093,116094],{},"Error \u002F Symptom",[4282,116096,116097],{},"Likely Cause",[4282,116099,4290],{},[4292,116101,116102,116129,116150,116175,116195,116218],{},[4279,116103,116104,116109,116121],{},[4297,116105,116106,116108],{},[30,116107,11219],{}," has one column containing all fields joined by delimiter",[4297,116110,116111,116113,116114,116116,116117,2012,116119],{},[30,116112,114789],{}," defaulted to ",[30,116115,63503],{}," but file uses ",[30,116118,114705],{},[30,116120,80208],{},[4297,116122,17059,116123,116126,116127,29132],{},[30,116124,116125],{},"sep=None, engine='python'"," or pass the correct ",[30,116128,114789],{},[4279,116130,116131,116136,116139],{},[4297,116132,116133],{},[30,116134,116135],{},"UnicodeDecodeError: codec can't decode byte",[4297,116137,116138],{},"File is not UTF-8 (often Latin-1 or Windows-1252)",[4297,116140,40261,116141,116143,116144,14391,116147,116149],{},[30,116142,111230],{}," then ",[30,116145,116146],{},"encoding='latin-1'",[30,116148,110433],{}," to detect",[4279,116151,116152,116158,116168],{},[4297,116153,116154,116155,116157],{},"Price column dtype is ",[30,116156,11888],{}," after load",[4297,116159,116160,116161,365,116163,365,116165,116167],{},"Currency symbols (",[30,116162,12073],{},[30,116164,53873],{},[30,116166,63503],{},") prevent numeric coercion",[4297,116169,116170,116171,116174],{},"Strip non-numeric characters with ",[30,116172,116173],{},".str.replace(r'[^\\d.\\-]', '', regex=True)"," before casting",[4279,116176,116177,116182,116185],{},[4297,116178,116179],{},[30,116180,116181],{},"ParserError: Error tokenizing data",[4297,116183,116184],{},"Rows with different column counts, or unquoted commas inside 
fields",[4297,116186,4358,116187,116190,116191,116194],{},[30,116188,116189],{},"on_bad_lines='warn'"," to skip; or ",[30,116192,116193],{},"quoting=csv.QUOTE_ALL"," for quoted fields",[4279,116196,116197,116204,116213],{},[4297,116198,116199,116200,116203],{},"Dates parsed as ",[30,116201,116202],{},"NaT"," throughout",[4297,116205,116206,116207,71066,116210,12027],{},"Mixed format strings (",[30,116208,116209],{},"YYYY\u002FMM\u002FDD",[30,116211,116212],{},"DD-Mon-YYYY",[4297,116214,17059,116215],{},[30,116216,116217],{},"pd.to_datetime(col, format='mixed', errors='coerce')",[4279,116219,116220,116225,116228],{},[4297,116221,116222,116224],{},[30,116223,70953],{}," on large file",[4297,116226,116227],{},"Full file loaded into RAM at once",[4297,116229,14337,116230,116232,116233,116235],{},[30,116231,21944],{}," iteration or ",[30,116234,116061],{}," for string columns",[2537,116237],{},[18,116239,4402],{"id":4401},[14,116241,116242,116243,10065,116245,116247],{},"This script wires all steps together with ",[30,116244,40372],{},[30,116246,80336],{},". 
Copy it, drop it alongside your CSV, and run it.",[23,116249,116251],{"className":126,"code":116250,"language":47,"meta":28,"style":28},"# pip install pandas chardet\n\"\"\"clean_csv.py — end-to-end CSV cleaning pipeline with argparse.\"\"\"\n\nimport argparse\nimport csv\nimport sys\nimport pandas as pd\nimport chardet\nfrom pathlib import Path\n\n\n# ── Helpers ──────────────────────────────────────────────────────────────────\n\ndef detect_encoding(path: Path) -> str:\n    raw = path.read_bytes()[:10_000]\n    result = chardet.detect(raw)\n    return result.get(\"encoding\") or \"utf-8-sig\"\n\n\ndef load_csv(path: Path) -> pd.DataFrame:\n    enc = detect_encoding(path)\n    try:\n        df = pd.read_csv(path, encoding=enc, sep=None, engine=\"python\")\n        print(f\"[OK] Loaded {path.name} ({enc}) — {df.shape[0]} rows\")\n        return df\n    except UnicodeDecodeError:\n        df = pd.read_csv(path, encoding=\"latin-1\", sep=None, engine=\"python\")\n        print(f\"[WARN] Latin-1 fallback applied for {path.name}\")\n        return df\n    except Exception as exc:\n        print(f\"[ERROR] {exc}\")\n        sys.exit(1)\n\n\ndef normalize_headers(df: pd.DataFrame) -> pd.DataFrame:\n    df.columns = (\n        df.columns\n        .str.strip()\n        .str.lower()\n        .str.replace(r\"[^\\w]+\", \"_\", regex=True)\n        .str.strip(\"_\")\n    )\n    return df\n\n\ndef coerce_types(df: pd.DataFrame, date_cols: list[str]) -> pd.DataFrame:\n    # Strip currency \u002F formatting from numeric-looking columns\n    for col in df.columns:\n        if any(kw in col for kw in (\"price\", \"amount\", \"cost\", \"total\")):\n            df[col] = (\n                df[col].astype(str)\n                .str.replace(r\"[^\\d.\\-]\", \"\", regex=True)\n                .replace(\"\", pd.NA)\n            )\n\n    # Numeric coercion\n    for col in df.columns:\n        if df[col].dtype == object:\n            coerced = pd.to_numeric(df[col], errors=\"coerce\")\n    
        if coerced.notna().sum() > df[col].notna().sum() * 0.8:\n                df[col] = coerced\n\n    # Datetime columns\n    for col in date_cols:\n        if col in df.columns:\n            df[col] = pd.to_datetime(df[col], format=\"mixed\", dayfirst=False, errors=\"coerce\")\n\n    return df\n\n\ndef remediate_records(df: pd.DataFrame, key_col: str) -> pd.DataFrame:\n    PLACEHOLDERS = [\"\", \"N\u002FA\", \"n\u002Fa\", \"unknown\", \"-\", \"none\", \"null\", \"na\"]\n    df = df.replace(PLACEHOLDERS, pd.NA)\n\n    for col in (\"status\", \"region\", \"shipping_method\"):\n        if col in df.columns:\n            df[col] = df[col].ffill()\n\n    if key_col in df.columns:\n        df = df.dropna(subset=[key_col])\n        df = df.drop_duplicates(subset=[key_col], keep=\"last\")\n\n    return df.reset_index(drop=True)\n\n\ndef validate(df: pd.DataFrame, min_rows: int) -> None:\n    if len(df) \u003C min_rows:\n        print(f\"[WARN] Only {len(df)} rows after cleaning — expected >= {min_rows}\")\n    null_pct = df.isnull().mean()\n    bad_cols = null_pct[null_pct > 0.3].index.tolist()\n    if bad_cols:\n        print(f\"[WARN] Columns >30% null after cleaning: {bad_cols}\")\n    print(f\"[PASS] {len(df)} rows · {df.shape[1]} cols · \"\n          f\"{df.memory_usage(deep=True).sum() \u002F 1024**2:.2f} MB\")\n\n\ndef export(df: pd.DataFrame, out_path: Path) -> None:\n    out_path.parent.mkdir(parents=True, exist_ok=True)\n    try:\n        df.to_csv(out_path, index=False, encoding=\"utf-8\")\n        print(f\"[OK] Written to {out_path}\")\n    except OSError as exc:\n        print(f\"[ERROR] Export failed: {exc}\")\n        sys.exit(1)\n\n\n# ── CLI ───────────────────────────────────────────────────────────────────────\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Clean a messy CSV file.\")\n    parser.add_argument(\"input\", type=Path, help=\"Path to the raw CSV file\")\n    parser.add_argument(\"output\", type=Path, help=\"Path 
for the cleaned output CSV\")\n    parser.add_argument(\"--key-col\", default=\"order_id\",\n                        help=\"Primary key column name for dedup (default: order_id)\")\n    parser.add_argument(\"--date-cols\", nargs=\"*\", default=[\"created_at\", \"updated_at\"],\n                        help=\"Columns to parse as datetime (space-separated)\")\n    parser.add_argument(\"--min-rows\", type=int, default=1,\n                        help=\"Warn if cleaned row count is below this value\")\n    args = parser.parse_args()\n\n    if not args.input.exists():\n        print(f\"[ERROR] Input file not found: {args.input}\")\n        sys.exit(1)\n\n    df = load_csv(args.input)\n    df = normalize_headers(df)\n    df = coerce_types(df, date_cols=args.date_cols)\n    df = remediate_records(df, key_col=args.key_col)\n    validate(df, min_rows=args.min_rows)\n    export(df, args.output)\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,116252,116253,116257,116262,116266,116272,116278,116284,116294,116300,116310,116314,116318,116323,116327,116340,116354,116363,116378,116382,116386,116395,116404,116410,116440,116483,116489,116497,116529,116550,116556,116566,116587,116595,116599,116603,116611,116619,116623,116627,116631,116663,116671,116675,116681,116685,116689,116702,116707,116717,116757,116765,116774,116808,116820,116824,116828,116833,116843,116855,116872,116890,116899,116903,116908,116919,116929,116961,116965,116971,116975,116979,116992,117032,117048,117052,117074,117084,117092,117096,117107,117122,117145,117149,117163,117167,117171,117187,117199,117229,117239,117256,117263,117285,117318,117353,117357,117361,117375,117395,117401,117422,117443,117453,117474,117482,117486,117490,117495,117499,117511,117528,117551,117574,117591,117602,117634,117645,117670,117681,117689,117693,117701,117722,117730,117734,117743,117752,117769,117785,117797,117802,117806,117810,117822],{"__ignoreMap":28},[33,116254,116255],{"class":35,"line":36},[33,116256,112957],{"class":39},[33,116258
,116259],{"class":35,"line":43},[33,116260,116261],{"class":54},"\"\"\"clean_csv.py — end-to-end CSV cleaning pipeline with argparse.\"\"\"\n",[33,116263,116264],{"class":35,"line":61},[33,116265,92],{"emptyLinePlaceholder":91},[33,116267,116268,116270],{"class":35,"line":73},[33,116269,164],{"class":163},[33,116271,4461],{"class":167},[33,116273,116274,116276],{"class":35,"line":88},[33,116275,164],{"class":163},[33,116277,107673],{"class":167},[33,116279,116280,116282],{"class":35,"line":95},[33,116281,164],{"class":163},[33,116283,168],{"class":167},[33,116285,116286,116288,116290,116292],{"class":35,"line":101},[33,116287,164],{"class":163},[33,116289,492],{"class":167},[33,116291,495],{"class":163},[33,116293,498],{"class":167},[33,116295,116296,116298],{"class":35,"line":171},[33,116297,164],{"class":163},[33,116299,110461],{"class":167},[33,116301,116302,116304,116306,116308],{"class":35,"line":179},[33,116303,190],{"class":163},[33,116305,193],{"class":167},[33,116307,164],{"class":163},[33,116309,198],{"class":167},[33,116311,116312],{"class":35,"line":187},[33,116313,92],{"emptyLinePlaceholder":91},[33,116315,116316],{"class":35,"line":201},[33,116317,92],{"emptyLinePlaceholder":91},[33,116319,116320],{"class":35,"line":206},[33,116321,116322],{"class":39},"# ── Helpers ──────────────────────────────────────────────────────────────────\n",[33,116324,116325],{"class":35,"line":224},[33,116326,92],{"emptyLinePlaceholder":91},[33,116328,116329,116331,116334,116336,116338],{"class":35,"line":229},[33,116330,562],{"class":163},[33,116332,116333],{"class":46}," detect_encoding",[33,116335,3743],{"class":167},[33,116337,1053],{"class":50},[33,116339,574],{"class":167},[33,116341,116342,116345,116347,116350,116352],{"class":35,"line":235},[33,116343,116344],{"class":167},"    raw ",[33,116346,242],{"class":163},[33,116348,116349],{"class":167}," 
path.read_bytes()[:",[33,116351,104304],{"class":50},[33,116353,9202],{"class":167},[33,116355,116356,116358,116360],{"class":35,"line":250},[33,116357,8842],{"class":167},[33,116359,242],{"class":163},[33,116361,116362],{"class":167}," chardet.detect(raw)\n",[33,116364,116365,116367,116369,116371,116373,116375],{"class":35,"line":266},[33,116366,1332],{"class":163},[33,116368,110693],{"class":167},[33,116370,110696],{"class":54},[33,116372,1649],{"class":167},[33,116374,7162],{"class":163},[33,116376,116377],{"class":54}," \"utf-8-sig\"\n",[33,116379,116380],{"class":35,"line":290},[33,116381,92],{"emptyLinePlaceholder":91},[33,116383,116384],{"class":35,"line":295},[33,116385,92],{"emptyLinePlaceholder":91},[33,116387,116388,116390,116393],{"class":35,"line":300},[33,116389,562],{"class":163},[33,116391,116392],{"class":46}," load_csv",[33,116394,7103],{"class":167},[33,116396,116397,116399,116401],{"class":35,"line":317},[33,116398,93844],{"class":167},[33,116400,242],{"class":163},[33,116402,116403],{"class":167}," 
detect_encoding(path)\n",[33,116405,116406,116408],{"class":35,"line":332},[33,116407,2424],{"class":163},[33,116409,574],{"class":167},[33,116411,116412,116414,116416,116418,116420,116422,116424,116426,116428,116430,116432,116434,116436,116438],{"class":35,"line":347},[33,116413,7930],{"class":167},[33,116415,242],{"class":163},[33,116417,27411],{"class":167},[33,116419,27249],{"class":238},[33,116421,242],{"class":163},[33,116423,112052],{"class":167},[33,116425,114789],{"class":238},[33,116427,242],{"class":163},[33,116429,571],{"class":50},[33,116431,365],{"class":167},[33,116433,17351],{"class":238},[33,116435,242],{"class":163},[33,116437,110985],{"class":54},[33,116439,221],{"class":167},[33,116441,116442,116444,116446,116448,116451,116453,116455,116457,116459,116461,116464,116466,116469,116471,116473,116475,116477,116479,116481],{"class":35,"line":374},[33,116443,9414],{"class":50},[33,116445,602],{"class":167},[33,116447,4059],{"class":163},[33,116449,116450],{"class":54},"\"[OK] Loaded ",[33,116452,1115],{"class":50},[33,116454,57398],{"class":167},[33,116456,1121],{"class":50},[33,116458,17583],{"class":54},[33,116460,1115],{"class":50},[33,116462,116463],{"class":167},"enc",[33,116465,1121],{"class":50},[33,116467,116468],{"class":54},") — 
",[33,116470,1115],{"class":50},[33,116472,9541],{"class":167},[33,116474,748],{"class":50},[33,116476,9546],{"class":167},[33,116478,1121],{"class":50},[33,116480,65937],{"class":54},[33,116482,221],{"class":167},[33,116484,116485,116487],{"class":35,"line":397},[33,116486,1659],{"class":163},[33,116488,11719],{"class":167},[33,116490,116491,116493,116495],{"class":35,"line":653},[33,116492,2449],{"class":163},[33,116494,112164],{"class":50},[33,116496,574],{"class":167},[33,116498,116499,116501,116503,116505,116507,116509,116511,116513,116515,116517,116519,116521,116523,116525,116527],{"class":35,"line":667},[33,116500,7930],{"class":167},[33,116502,242],{"class":163},[33,116504,27411],{"class":167},[33,116506,27249],{"class":238},[33,116508,242],{"class":163},[33,116510,111149],{"class":54},[33,116512,365],{"class":167},[33,116514,114789],{"class":238},[33,116516,242],{"class":163},[33,116518,571],{"class":50},[33,116520,365],{"class":167},[33,116522,17351],{"class":238},[33,116524,242],{"class":163},[33,116526,110985],{"class":54},[33,116528,221],{"class":167},[33,116530,116531,116533,116535,116537,116540,116542,116544,116546,116548],{"class":35,"line":675},[33,116532,9414],{"class":50},[33,116534,602],{"class":167},[33,116536,4059],{"class":163},[33,116538,116539],{"class":54},"\"[WARN] Latin-1 fallback applied for 
",[33,116541,1115],{"class":50},[33,116543,57398],{"class":167},[33,116545,1121],{"class":50},[33,116547,274],{"class":54},[33,116549,221],{"class":167},[33,116551,116552,116554],{"class":35,"line":689},[33,116553,1659],{"class":163},[33,116555,11719],{"class":167},[33,116557,116558,116560,116562,116564],{"class":35,"line":703},[33,116559,2449],{"class":163},[33,116561,783],{"class":50},[33,116563,1852],{"class":163},[33,116565,1855],{"class":167},[33,116567,116568,116570,116572,116574,116577,116579,116581,116583,116585],{"class":35,"line":714},[33,116569,9414],{"class":50},[33,116571,602],{"class":167},[33,116573,4059],{"class":163},[33,116575,116576],{"class":54},"\"[ERROR] ",[33,116578,1115],{"class":50},[33,116580,6565],{"class":167},[33,116582,1121],{"class":50},[33,116584,274],{"class":54},[33,116586,221],{"class":167},[33,116588,116589,116591,116593],{"class":35,"line":723},[33,116590,2995],{"class":167},[33,116592,734],{"class":50},[33,116594,221],{"class":167},[33,116596,116597],{"class":35,"line":754},[33,116598,92],{"emptyLinePlaceholder":91},[33,116600,116601],{"class":35,"line":771},[33,116602,92],{"emptyLinePlaceholder":91},[33,116604,116605,116607,116609],{"class":35,"line":777},[33,116606,562],{"class":163},[33,116608,113736],{"class":46},[33,116610,12127],{"class":167},[33,116612,116613,116615,116617],{"class":35,"line":788},[33,116614,27546],{"class":167},[33,116616,242],{"class":163},[33,116618,1415],{"class":167},[33,116620,116621],{"class":35,"line":804},[33,116622,113756],{"class":167},[33,116624,116625],{"class":35,"line":809},[33,116626,113761],{"class":167},[33,116628,116629],{"class":35,"line":819},[33,116630,113766],{"class":167},[33,116632,116633,116635,116637,116639,116641,116643,116645,116647,116649,116651,116653,116655,116657,116659,116661],{"class":35,"line":829},[33,116634,113771],{"class":167},[33,116636,11977],{"class":163},[33,116638,274],{"class":54},[33,116640,8309],{"class":50},[33,116642,113780],{"class":163},[33,116644,113783
],{"class":50},[33,116646,1811],{"class":163},[33,116648,274],{"class":54},[33,116650,365],{"class":167},[33,116652,7764],{"class":54},[33,116654,365],{"class":167},[33,116656,11993],{"class":238},[33,116658,242],{"class":163},[33,116660,855],{"class":50},[33,116662,221],{"class":167},[33,116664,116665,116667,116669],{"class":35,"line":834},[33,116666,113806],{"class":167},[33,116668,7764],{"class":54},[33,116670,221],{"class":167},[33,116672,116673],{"class":35,"line":839},[33,116674,1202],{"class":167},[33,116676,116677,116679],{"class":35,"line":860},[33,116678,1332],{"class":163},[33,116680,11719],{"class":167},[33,116682,116683],{"class":35,"line":887},[33,116684,92],{"emptyLinePlaceholder":91},[33,116686,116687],{"class":35,"line":907},[33,116688,92],{"emptyLinePlaceholder":91},[33,116690,116691,116693,116695,116698,116700],{"class":35,"line":1826},[33,116692,562],{"class":163},[33,116694,113867],{"class":46},[33,116696,116697],{"class":167},"(df: pd.DataFrame, date_cols: list[",[33,116699,1053],{"class":50},[33,116701,11632],{"class":167},[33,116703,116704],{"class":35,"line":1844},[33,116705,116706],{"class":39},"    # Strip currency \u002F formatting from numeric-looking columns\n",[33,116708,116709,116711,116713,116715],{"class":35,"line":1858},[33,116710,656],{"class":163},[33,116712,7985],{"class":167},[33,116714,662],{"class":163},[33,116716,8005],{"class":167},[33,116718,116719,116721,116724,116727,116729,116731,116733,116736,116738,116740,116743,116745,116747,116749,116751,116753,116755],{"class":35,"line":1871},[33,116720,8221],{"class":163},[33,116722,116723],{"class":50}," any",[33,116725,116726],{"class":167},"(kw ",[33,116728,662],{"class":163},[33,116730,7985],{"class":167},[33,116732,6124],{"class":163},[33,116734,116735],{"class":167}," kw 
",[33,116737,662],{"class":163},[33,116739,17583],{"class":167},[33,116741,116742],{"class":54},"\"price\"",[33,116744,365],{"class":167},[33,116746,4106],{"class":54},[33,116748,365],{"class":167},[33,116750,16474],{"class":54},[33,116752,365],{"class":167},[33,116754,54220],{"class":54},[33,116756,8687],{"class":167},[33,116758,116759,116761,116763],{"class":35,"line":1877},[33,116760,11690],{"class":167},[33,116762,242],{"class":163},[33,116764,1415],{"class":167},[33,116766,116767,116770,116772],{"class":35,"line":1883},[33,116768,116769],{"class":167},"                df[col].astype(",[33,116771,1053],{"class":50},[33,116773,221],{"class":167},[33,116775,116776,116778,116780,116782,116784,116786,116788,116790,116792,116794,116796,116798,116800,116802,116804,116806],{"class":35,"line":1915},[33,116777,113946],{"class":167},[33,116779,11977],{"class":163},[33,116781,274],{"class":54},[33,116783,8309],{"class":50},[33,116785,113780],{"class":163},[33,116787,113957],{"class":50},[33,116789,113960],{"class":12018},[33,116791,9546],{"class":50},[33,116793,274],{"class":54},[33,116795,365],{"class":167},[33,116797,3198],{"class":54},[33,116799,365],{"class":167},[33,116801,11993],{"class":238},[33,116803,242],{"class":163},[33,116805,855],{"class":50},[33,116807,221],{"class":167},[33,116809,116810,116812,116814,116816,116818],{"class":35,"line":1926},[33,116811,113983],{"class":167},[33,116813,3198],{"class":54},[33,116815,10884],{"class":167},[33,116817,8018],{"class":50},[33,116819,221],{"class":167},[33,116821,116822],{"class":35,"line":1932},[33,116823,24021],{"class":167},[33,116825,116826],{"class":35,"line":1938},[33,116827,92],{"emptyLinePlaceholder":91},[33,116829,116830],{"class":35,"line":1950},[33,116831,116832],{"class":39},"    # Numeric 
coercion\n",[33,116834,116835,116837,116839,116841],{"class":35,"line":1958},[33,116836,656],{"class":163},[33,116838,7985],{"class":167},[33,116840,662],{"class":163},[33,116842,8005],{"class":167},[33,116844,116845,116847,116849,116851,116853],{"class":35,"line":4904},[33,116846,8221],{"class":163},[33,116848,115360],{"class":167},[33,116850,1865],{"class":163},[33,116852,115365],{"class":50},[33,116854,574],{"class":167},[33,116856,116857,116860,116862,116864,116866,116868,116870],{"class":35,"line":4909},[33,116858,116859],{"class":167},"            coerced ",[33,116861,242],{"class":163},[33,116863,16774],{"class":167},[33,116865,8317],{"class":238},[33,116867,242],{"class":163},[33,116869,12107],{"class":54},[33,116871,221],{"class":167},[33,116873,116874,116876,116879,116881,116884,116886,116888],{"class":35,"line":4915},[33,116875,5995],{"class":163},[33,116877,116878],{"class":167}," coerced.notna().sum() ",[33,116880,6009],{"class":163},[33,116882,116883],{"class":167}," df[col].notna().sum() ",[33,116885,1769],{"class":163},[33,116887,51635],{"class":50},[33,116889,574],{"class":167},[33,116891,116892,116894,116896],{"class":35,"line":4925},[33,116893,8010],{"class":167},[33,116895,242],{"class":163},[33,116897,116898],{"class":167}," coerced\n",[33,116900,116901],{"class":35,"line":4935},[33,116902,92],{"emptyLinePlaceholder":91},[33,116904,116905],{"class":35,"line":4941},[33,116906,116907],{"class":39},"    # Datetime columns\n",[33,116909,116910,116912,116914,116916],{"class":35,"line":4950},[33,116911,656],{"class":163},[33,116913,7985],{"class":167},[33,116915,662],{"class":163},[33,116917,116918],{"class":167}," 
date_cols:\n",[33,116920,116921,116923,116925,116927],{"class":35,"line":4960},[33,116922,8221],{"class":163},[33,116924,7985],{"class":167},[33,116926,662],{"class":163},[33,116928,8005],{"class":167},[33,116930,116931,116933,116935,116937,116939,116941,116943,116945,116947,116949,116951,116953,116955,116957,116959],{"class":35,"line":4965},[33,116932,11690],{"class":167},[33,116934,242],{"class":163},[33,116936,15392],{"class":167},[33,116938,61926],{"class":238},[33,116940,242],{"class":163},[33,116942,96267],{"class":54},[33,116944,365],{"class":167},[33,116946,27683],{"class":238},[33,116948,242],{"class":163},[33,116950,902],{"class":50},[33,116952,365],{"class":167},[33,116954,8317],{"class":238},[33,116956,242],{"class":163},[33,116958,12107],{"class":54},[33,116960,221],{"class":167},[33,116962,116963],{"class":35,"line":4971},[33,116964,92],{"emptyLinePlaceholder":91},[33,116966,116967,116969],{"class":35,"line":4983},[33,116968,1332],{"class":163},[33,116970,11719],{"class":167},[33,116972,116973],{"class":35,"line":4988},[33,116974,92],{"emptyLinePlaceholder":91},[33,116976,116977],{"class":35,"line":4993},[33,116978,92],{"emptyLinePlaceholder":91},[33,116980,116981,116983,116985,116988,116990],{"class":35,"line":5003},[33,116982,562],{"class":163},[33,116984,114239],{"class":46},[33,116986,116987],{"class":167},"(df: pd.DataFrame, key_col: 
",[33,116989,1053],{"class":50},[33,116991,7668],{"class":167},[33,116993,116994,116996,116998,117000,117002,117004,117006,117008,117010,117012,117014,117016,117018,117020,117022,117024,117026,117028,117030],{"class":35,"line":5008},[33,116995,114251],{"class":50},[33,116997,212],{"class":163},[33,116999,9178],{"class":167},[33,117001,3198],{"class":54},[33,117003,365],{"class":167},[33,117005,27824],{"class":54},[33,117007,365],{"class":167},[33,117009,12438],{"class":54},[33,117011,365],{"class":167},[33,117013,114270],{"class":54},[33,117015,365],{"class":167},[33,117017,75122],{"class":54},[33,117019,365],{"class":167},[33,117021,104825],{"class":54},[33,117023,365],{"class":167},[33,117025,114283],{"class":54},[33,117027,365],{"class":167},[33,117029,114288],{"class":54},[33,117031,9202],{"class":167},[33,117033,117034,117036,117038,117040,117042,117044,117046],{"class":35,"line":5014},[33,117035,4025],{"class":167},[33,117037,242],{"class":163},[33,117039,111517],{"class":167},[33,117041,114310],{"class":50},[33,117043,10884],{"class":167},[33,117045,8018],{"class":50},[33,117047,221],{"class":167},[33,117049,117050],{"class":35,"line":5019},[33,117051,92],{"emptyLinePlaceholder":91},[33,117053,117054,117056,117058,117060,117062,117064,117066,117068,117070,117072],{"class":35,"line":5032},[33,117055,656],{"class":163},[33,117057,7985],{"class":167},[33,117059,662],{"class":163},[33,117061,17583],{"class":167},[33,117063,43379],{"class":54},[33,117065,365],{"class":167},[33,117067,16649],{"class":54},[33,117069,365],{"class":167},[33,117071,114346],{"class":54},[33,117073,1737],{"class":167},[33,117075,117076,117078,117080,117082],{"class":35,"line":5039},[33,117077,8221],{"class":163},[33,117079,7985],{"class":167},[33,117081,662],{"class":163},[33,117083,8005],{"class":167},[33,117085,117086,117088,117090],{"class":35,"line":5068},[33,117087,11690],{"class":167},[33,117089,242],{"class":163},[33,117091,11712],{"class":167},[33,117093,117094],{"class":35,"line
":5077},[33,117095,92],{"emptyLinePlaceholder":91},[33,117097,117098,117100,117103,117105],{"class":35,"line":5082},[33,117099,617],{"class":163},[33,117101,117102],{"class":167}," key_col ",[33,117104,662],{"class":163},[33,117106,8005],{"class":167},[33,117108,117109,117111,117113,117115,117117,117119],{"class":35,"line":5089},[33,117110,7930],{"class":167},[33,117112,242],{"class":163},[33,117114,114425],{"class":167},[33,117116,28066],{"class":238},[33,117118,242],{"class":163},[33,117120,117121],{"class":167},"[key_col])\n",[33,117123,117124,117126,117128,117130,117132,117134,117137,117139,117141,117143],{"class":35,"line":5098},[33,117125,7930],{"class":167},[33,117127,242],{"class":163},[33,117129,114508],{"class":167},[33,117131,28066],{"class":238},[33,117133,242],{"class":163},[33,117135,117136],{"class":167},"[key_col], ",[33,117138,28077],{"class":238},[33,117140,242],{"class":163},[33,117142,114482],{"class":54},[33,117144,221],{"class":167},[33,117146,117147],{"class":35,"line":5105},[33,117148,92],{"emptyLinePlaceholder":91},[33,117150,117151,117153,117155,117157,117159,117161],{"class":35,"line":5110},[33,117152,1332],{"class":163},[33,117154,114537],{"class":167},[33,117156,10868],{"class":238},[33,117158,242],{"class":163},[33,117160,855],{"class":50},[33,117162,221],{"class":167},[33,117164,117165],{"class":35,"line":5115},[33,117166,92],{"emptyLinePlaceholder":91},[33,117168,117169],{"class":35,"line":5128},[33,117170,92],{"emptyLinePlaceholder":91},[33,117172,117173,117175,117177,117179,117181,117183,117185],{"class":35,"line":5135},[33,117174,562],{"class":163},[33,117176,25052],{"class":46},[33,117178,115200],{"class":167},[33,117180,1059],{"class":50},[33,117182,1617],{"class":167},[33,117184,571],{"class":50},[33,117186,574],{"class":167},[33,117188,117189,117191,117193,117195,117197],{"class":35,"line":5142},[33,117190,617],{"class":163},[33,117192,4037],{"class":50},[33,117194,4040],{"class":167},[33,117196,4043],{"class":163},[33,117198,4
046],{"class":167},[33,117200,117201,117203,117205,117207,117210,117212,117214,117216,117219,117221,117223,117225,117227],{"class":35,"line":5151},[33,117202,9414],{"class":50},[33,117204,602],{"class":167},[33,117206,4059],{"class":163},[33,117208,117209],{"class":54},"\"[WARN] Only ",[33,117211,4065],{"class":50},[33,117213,4068],{"class":167},[33,117215,1121],{"class":50},[33,117217,117218],{"class":54}," rows after cleaning — expected >= ",[33,117220,1115],{"class":50},[33,117222,4078],{"class":167},[33,117224,1121],{"class":50},[33,117226,274],{"class":54},[33,117228,221],{"class":167},[33,117230,117231,117234,117236],{"class":35,"line":5156},[33,117232,117233],{"class":167},"    null_pct ",[33,117235,242],{"class":163},[33,117237,117238],{"class":167}," df.isnull().mean()\n",[33,117240,117241,117244,117246,117249,117251,117253],{"class":35,"line":5161},[33,117242,117243],{"class":167},"    bad_cols ",[33,117245,242],{"class":163},[33,117247,117248],{"class":167}," null_pct[null_pct ",[33,117250,6009],{"class":163},[33,117252,52204],{"class":50},[33,117254,117255],{"class":167},"].index.tolist()\n",[33,117257,117258,117260],{"class":35,"line":5167},[33,117259,617],{"class":163},[33,117261,117262],{"class":167}," bad_cols:\n",[33,117264,117265,117267,117269,117271,117274,117276,117279,117281,117283],{"class":35,"line":5172},[33,117266,9414],{"class":50},[33,117268,602],{"class":167},[33,117270,4059],{"class":163},[33,117272,117273],{"class":54},"\"[WARN] Columns >30% null after cleaning: ",[33,117275,1115],{"class":50},[33,117277,117278],{"class":167},"bad_cols",[33,117280,1121],{"class":50},[33,117282,274],{"class":54},[33,117284,221],{"class":167},[33,117286,117287,117289,117291,117293,117296,117298,117300,117302,117305,117307,117309,117311,117313,117315],{"class":35,"line":5182},[33,117288,7268],{"class":50},[33,117290,602],{"class":167},[33,117292,4059],{"class":163},[33,117294,117295],{"class":54},"\"[PASS] 
",[33,117297,4065],{"class":50},[33,117299,4068],{"class":167},[33,117301,1121],{"class":50},[33,117303,117304],{"class":54}," rows · ",[33,117306,1115],{"class":50},[33,117308,9541],{"class":167},[33,117310,734],{"class":50},[33,117312,9546],{"class":167},[33,117314,1121],{"class":50},[33,117316,117317],{"class":54}," cols · \"\n",[33,117319,117320,117323,117325,117327,117329,117331,117333,117335,117337,117339,117341,117343,117345,117347,117349,117351],{"class":35,"line":5195},[33,117321,117322],{"class":163},"          f",[33,117324,274],{"class":54},[33,117326,1115],{"class":50},[33,117328,115517],{"class":167},[33,117330,115520],{"class":238},[33,117332,242],{"class":163},[33,117334,855],{"class":50},[33,117336,115527],{"class":167},[33,117338,1351],{"class":163},[33,117340,1159],{"class":50},[33,117342,1775],{"class":163},[33,117344,1533],{"class":50},[33,117346,55819],{"class":163},[33,117348,1121],{"class":50},[33,117350,107874],{"class":54},[33,117352,221],{"class":167},[33,117354,117355],{"class":35,"line":5200},[33,117356,92],{"emptyLinePlaceholder":91},[33,117358,117359],{"class":35,"line":5205},[33,117360,92],{"emptyLinePlaceholder":91},[33,117362,117363,117365,117368,117371,117373],{"class":35,"line":5210},[33,117364,562],{"class":163},[33,117366,117367],{"class":46}," export",[33,117369,117370],{"class":167},"(df: pd.DataFrame, out_path: Path) -> 
",[33,117372,571],{"class":50},[33,117374,574],{"class":167},[33,117376,117377,117379,117381,117383,117385,117387,117389,117391,117393],{"class":35,"line":5215},[33,117378,64564],{"class":167},[33,117380,869],{"class":238},[33,117382,242],{"class":163},[33,117384,855],{"class":50},[33,117386,365],{"class":167},[33,117388,878],{"class":238},[33,117390,242],{"class":163},[33,117392,855],{"class":50},[33,117394,221],{"class":167},[33,117396,117397,117399],{"class":35,"line":5220},[33,117398,2424],{"class":163},[33,117400,574],{"class":167},[33,117402,117403,117406,117408,117410,117412,117414,117416,117418,117420],{"class":35,"line":5227},[33,117404,117405],{"class":167},"        df.to_csv(out_path, ",[33,117407,897],{"class":238},[33,117409,242],{"class":163},[33,117411,902],{"class":50},[33,117413,365],{"class":167},[33,117415,27249],{"class":238},[33,117417,242],{"class":163},[33,117419,1195],{"class":54},[33,117421,221],{"class":167},[33,117423,117424,117426,117428,117430,117433,117435,117437,117439,117441],{"class":35,"line":5232},[33,117425,9414],{"class":50},[33,117427,602],{"class":167},[33,117429,4059],{"class":163},[33,117431,117432],{"class":54},"\"[OK] Written to ",[33,117434,1115],{"class":50},[33,117436,40722],{"class":167},[33,117438,1121],{"class":50},[33,117440,274],{"class":54},[33,117442,221],{"class":167},[33,117444,117445,117447,117449,117451],{"class":35,"line":5237},[33,117446,2449],{"class":163},[33,117448,107953],{"class":50},[33,117450,1852],{"class":163},[33,117452,1855],{"class":167},[33,117454,117455,117457,117459,117461,117464,117466,117468,117470,117472],{"class":35,"line":5251},[33,117456,9414],{"class":50},[33,117458,602],{"class":167},[33,117460,4059],{"class":163},[33,117462,117463],{"class":54},"\"[ERROR] Export failed: 
",[33,117465,1115],{"class":50},[33,117467,6565],{"class":167},[33,117469,1121],{"class":50},[33,117471,274],{"class":54},[33,117473,221],{"class":167},[33,117475,117476,117478,117480],{"class":35,"line":5259},[33,117477,2995],{"class":167},[33,117479,734],{"class":50},[33,117481,221],{"class":167},[33,117483,117484],{"class":35,"line":5264},[33,117485,92],{"emptyLinePlaceholder":91},[33,117487,117488],{"class":35,"line":5269},[33,117489,92],{"emptyLinePlaceholder":91},[33,117491,117492],{"class":35,"line":5283},[33,117493,117494],{"class":39},"# ── CLI ───────────────────────────────────────────────────────────────────────\n",[33,117496,117497],{"class":35,"line":5293},[33,117498,92],{"emptyLinePlaceholder":91},[33,117500,117501,117503,117505,117507,117509],{"class":35,"line":5303},[33,117502,562],{"class":163},[33,117504,6636],{"class":46},[33,117506,568],{"class":167},[33,117508,571],{"class":50},[33,117510,574],{"class":167},[33,117512,117513,117515,117517,117519,117521,117523,117526],{"class":35,"line":5313},[33,117514,6648],{"class":167},[33,117516,242],{"class":163},[33,117518,6653],{"class":167},[33,117520,6656],{"class":238},[33,117522,242],{"class":163},[33,117524,117525],{"class":54},"\"Clean a messy CSV file.\"",[33,117527,221],{"class":167},[33,117529,117530,117532,117534,117536,117538,117540,117542,117544,117546,117549],{"class":35,"line":5320},[33,117531,6669],{"class":167},[33,117533,85805],{"class":54},[33,117535,365],{"class":167},[33,117537,6677],{"class":238},[33,117539,242],{"class":163},[33,117541,6682],{"class":167},[33,117543,25463],{"class":238},[33,117545,242],{"class":163},[33,117547,117548],{"class":54},"\"Path to the raw CSV 
file\"",[33,117550,221],{"class":167},[33,117552,117553,117555,117557,117559,117561,117563,117565,117567,117569,117572],{"class":35,"line":5325},[33,117554,6669],{"class":167},[33,117556,41169],{"class":54},[33,117558,365],{"class":167},[33,117560,6677],{"class":238},[33,117562,242],{"class":163},[33,117564,6682],{"class":167},[33,117566,25463],{"class":238},[33,117568,242],{"class":163},[33,117570,117571],{"class":54},"\"Path for the cleaned output CSV\"",[33,117573,221],{"class":167},[33,117575,117576,117578,117581,117583,117585,117587,117589],{"class":35,"line":5330},[33,117577,6669],{"class":167},[33,117579,117580],{"class":54},"\"--key-col\"",[33,117582,365],{"class":167},[33,117584,6685],{"class":238},[33,117586,242],{"class":163},[33,117588,108849],{"class":54},[33,117590,247],{"class":167},[33,117592,117593,117595,117597,117600],{"class":35,"line":5344},[33,117594,53388],{"class":238},[33,117596,242],{"class":163},[33,117598,117599],{"class":54},"\"Primary key column name for dedup (default: order_id)\"",[33,117601,221],{"class":167},[33,117603,117604,117606,117609,117611,117613,117615,117618,117620,117622,117624,117626,117628,117630,117632],{"class":35,"line":5349},[33,117605,6669],{"class":167},[33,117607,117608],{"class":54},"\"--date-cols\"",[33,117610,365],{"class":167},[33,117612,25542],{"class":238},[33,117614,242],{"class":163},[33,117616,117617],{"class":54},"\"*\"",[33,117619,365],{"class":167},[33,117621,6685],{"class":238},[33,117623,242],{"class":163},[33,117625,8309],{"class":167},[33,117627,114129],{"class":54},[33,117629,365],{"class":167},[33,117631,114134],{"class":54},[33,117633,8935],{"class":167},[33,117635,117636,117638,117640,117643],{"class":35,"line":5354},[33,117637,53388],{"class":238},[33,117639,242],{"class":163},[33,117641,117642],{"class":54},"\"Columns to parse as datetime 
(space-separated)\"",[33,117644,221],{"class":167},[33,117646,117647,117649,117652,117654,117656,117658,117660,117662,117664,117666,117668],{"class":35,"line":5368},[33,117648,6669],{"class":167},[33,117650,117651],{"class":54},"\"--min-rows\"",[33,117653,365],{"class":167},[33,117655,6677],{"class":238},[33,117657,242],{"class":163},[33,117659,1059],{"class":50},[33,117661,365],{"class":167},[33,117663,6685],{"class":238},[33,117665,242],{"class":163},[33,117667,734],{"class":50},[33,117669,247],{"class":167},[33,117671,117672,117674,117676,117679],{"class":35,"line":5377},[33,117673,53388],{"class":238},[33,117675,242],{"class":163},[33,117677,117678],{"class":54},"\"Warn if cleaned row count is below this value\"",[33,117680,221],{"class":167},[33,117682,117683,117685,117687],{"class":35,"line":5382},[33,117684,6766],{"class":167},[33,117686,242],{"class":163},[33,117688,6771],{"class":167},[33,117690,117691],{"class":35,"line":5389},[33,117692,92],{"emptyLinePlaceholder":91},[33,117694,117695,117697,117699],{"class":35,"line":5399},[33,117696,617],{"class":163},[33,117698,620],{"class":163},[33,117700,25620],{"class":167},[33,117702,117703,117705,117707,117709,117712,117714,117716,117718,117720],{"class":35,"line":5404},[33,117704,9414],{"class":50},[33,117706,602],{"class":167},[33,117708,4059],{"class":163},[33,117710,117711],{"class":54},"\"[ERROR] Input file not found: ",[33,117713,1115],{"class":50},[33,117715,25634],{"class":167},[33,117717,1121],{"class":50},[33,117719,274],{"class":54},[33,117721,221],{"class":167},[33,117723,117724,117726,117728],{"class":35,"line":5409},[33,117725,2995],{"class":167},[33,117727,734],{"class":50},[33,117729,221],{"class":167},[33,117731,117732],{"class":35,"line":5414},[33,117733,92],{"emptyLinePlaceholder":91},[33,117735,117736,117738,117740],{"class":35,"line":5419},[33,117737,4025],{"class":167},[33,117739,242],{"class":163},[33,117741,117742],{"class":167}," 
load_csv(args.input)\n",[33,117744,117745,117747,117749],{"class":35,"line":5425},[33,117746,4025],{"class":167},[33,117748,242],{"class":163},[33,117750,117751],{"class":167}," normalize_headers(df)\n",[33,117753,117754,117756,117758,117761,117764,117766],{"class":35,"line":5430},[33,117755,4025],{"class":167},[33,117757,242],{"class":163},[33,117759,117760],{"class":167}," coerce_types(df, ",[33,117762,117763],{"class":238},"date_cols",[33,117765,242],{"class":163},[33,117767,117768],{"class":167},"args.date_cols)\n",[33,117770,117771,117773,117775,117778,117780,117782],{"class":35,"line":5440},[33,117772,4025],{"class":167},[33,117774,242],{"class":163},[33,117776,117777],{"class":167}," remediate_records(df, ",[33,117779,8850],{"class":238},[33,117781,242],{"class":163},[33,117783,117784],{"class":167},"args.key_col)\n",[33,117786,117787,117790,117792,117794],{"class":35,"line":5451},[33,117788,117789],{"class":167},"    validate(df, ",[33,117791,4078],{"class":238},[33,117793,242],{"class":163},[33,117795,117796],{"class":167},"args.min_rows)\n",[33,117798,117799],{"class":35,"line":5464},[33,117800,117801],{"class":167},"    export(df, args.output)\n",[33,117803,117804],{"class":35,"line":5497},[33,117805,92],{"emptyLinePlaceholder":91},[33,117807,117808],{"class":35,"line":5514},[33,117809,92],{"emptyLinePlaceholder":91},[33,117811,117812,117814,117816,117818,117820],{"class":35,"line":5527},[33,117813,2491],{"class":163},[33,117815,2494],{"class":50},[33,117817,2497],{"class":163},[33,117819,2500],{"class":54},[33,117821,574],{"class":167},[33,117823,117824],{"class":35,"line":5532},[33,117825,6914],{"class":167},[14,117827,41347],{},[23,117829,117831],{"className":25,"code":117830,"language":27,"meta":28,"style":28},"python clean_csv.py dirty.csv output\u002Fclean.csv --key-col order_id --date-cols 
created_at\n",[30,117832,117833],{"__ignoreMap":28},[33,117834,117835,117837,117840,117843,117846,117849,117852,117855],{"class":35,"line":36},[33,117836,47],{"class":46},[33,117838,117839],{"class":54}," clean_csv.py",[33,117841,117842],{"class":54}," dirty.csv",[33,117844,117845],{"class":54}," output\u002Fclean.csv",[33,117847,117848],{"class":50}," --key-col",[33,117850,117851],{"class":54}," order_id",[33,117853,117854],{"class":50}," --date-cols",[33,117856,117857],{"class":54}," created_at\n",[2537,117859],{},[18,117861,117863],{"id":117862},"troubleshooting-faq","Troubleshooting FAQ",[14,117865,117866,117869,117870,36661,117872,117874,117875,117878],{},[1974,117867,117868],{},"How do I handle CSV files with inconsistent row lengths?","\nAdd ",[30,117871,116189],{},[30,117873,108637],{}," to skip malformed rows and log their line offsets. Avoid ",[30,117876,117877],{},"on_bad_lines='skip'"," in production — silent data loss is harder to debug than a warning flood.",[14,117880,117881,107296,117884,36661,117886,117889,117890,117892,117893,117896],{},[1974,117882,117883],{},"Can pandas automatically detect and fix date formats across mixed locales?",[30,117885,97762],{},[30,117887,117888],{},"pd.to_datetime()"," (pandas 2.0+). For pre-2.0 installations, use ",[30,117891,14414],{}," as a fallback, or apply a custom parser via ",[30,117894,117895],{},".apply()"," when formats are too irregular for the built-in heuristics.",[14,117898,117899,117902],{},[1974,117900,117901],{},"When should I switch from pandas to Polars or Dask?","\nSwitch when source files consistently exceed available RAM, when wall-clock time on vectorised string operations becomes a bottleneck, or when you need true parallel execution across CPU cores. 
Polars is usually the first step up; Dask adds distributed scheduling for cluster environments.",[2537,117904],{},[18,117906,6918],{"id":6917},[4211,117908,117909,117914,117919,117924,117929],{},[4214,117910,117911,117913],{},[940,117912,107425],{"href":110423}," — compare csv stdlib, pandas, polars, and pyarrow before writing the loader",[4214,117915,117916,117918],{},[940,117917,27254],{"href":27253}," — fix UnicodeDecodeError before cleaning starts",[4214,117920,117921,117923],{},[940,117922,108865],{"href":108864}," — write cleaned data back to CSV, Excel, or Parquet without index artifacts",[4214,117925,117926,117928],{},[940,117927,99577],{"href":99576}," — the same cleaning techniques apply to DataFrames from .xlsx files",[4214,117930,117931,117933],{},[940,117932,948],{"href":947}," — clean PDF-extracted tables using the same pipeline",[14,117935,6947,117936,3035],{},[940,117937,26258],{"href":26257},[6953,117939,117940],{},"html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .s691h, html code.shiki 
.s691h{--shiki-default:#22863A;--shiki-default-font-weight:bold}",{"title":28,"searchDepth":43,"depth":43,"links":117942},[117943,117944,117945,117951,117952,117957,117958,117959,117960,117961,117962],{"id":20,"depth":43,"text":21},{"id":112943,"depth":43,"text":112944},{"id":421,"depth":43,"text":422,"children":117946},[117947,117948,117949,117950],{"id":113436,"depth":61,"text":113437},{"id":113698,"depth":61,"text":113699},{"id":113823,"depth":61,"text":113824},{"id":114201,"depth":61,"text":114202},{"id":114556,"depth":43,"text":114557},{"id":2708,"depth":43,"text":2709,"children":117953},[117954,117955,117956],{"id":114698,"depth":61,"text":114699},{"id":114859,"depth":61,"text":114860},{"id":115045,"depth":61,"text":115046},{"id":52029,"depth":43,"text":52030},{"id":21809,"depth":43,"text":21810},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":117862,"depth":43,"text":117863},{"id":6917,"depth":43,"text":6918},"Cleaning Messy CSV Data","Fix inconsistent delimiters, encoding mismatches, dtype coercion, missing values, and duplicates in CSV exports using a systematic pandas pipeline.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas",{"title":107447,"description":117964},"Cleaning Messy CSV Data with Pandas — Whitespace, Types, Missing Values & 
Dedup","python-for-excel-csv-data-processing\u002Fcleaning-messy-csv-data-with-pandas\u002Findex",[47,9630,117971,117972,117973],"csv-cleaning","data-wrangling","dtype-coercion","pywsYG2n4Wz2OmS-7hZkjRnjWrLy-90qIx5d-GgArDY",{"id":117976,"title":28147,"body":117977,"breadcrumbTitle":6977,"canonical":6977,"date":6977,"description":120151,"draft":6980,"extension":6981,"image":6977,"meta":120152,"navigation":91,"path":120153,"robots":6977,"seo":120154,"seoTitle":107412,"stem":120155,"tags":6977,"updatedAt":6977,"__hash__":120156},"content\u002Fpython-for-excel-csv-data-processing\u002Fexporting-data-to-csv-formats\u002Ffix-pandas-to-csv-extra-index-column\u002Findex.md",{"type":7,"value":117978,"toc":120136},[117979,117982,118000,118006,118008,118010,118031,118034,118040,118050,118052,118054,118057,118235,118237,118243,118251,118253,118258,118269,118486,118488,118494,118502,118504,118508,118511,118519,118527,118797,118801,118804,119005,119008,119010,119014,119026,119119,119478,119495,119497,119501,119513,119780,119783,119785,119787,119790,120108,120110,120112,120129,120133],[10,117980,28147],{"id":117981},"fix-pandas-to_csv-adding-an-extra-index-column",[14,117983,117984,117987,117988,117991,117992,117995,117996,117999],{},[30,117985,117986],{},"df.to_csv(\"out.csv\")"," writes an extra unnamed leading column. When you re-read that file with ",[30,117989,117990],{},"pd.read_csv(\"out.csv\")",", that column appears as ",[30,117993,117994],{},"Unnamed: 0",". 
It breaks column-count assertions, corrupts SQL ",[30,117997,117998],{},"COPY"," loads, and confuses every downstream tool that expected only your data columns.",[14,118001,118002,118003,3035],{},"The root cause is a single default parameter: ",[30,118004,118005],{},"index=True",[2537,118007],{},[18,118009,7021],{"id":7020},[14,118011,118012,118013,118016,118017,118020,118021,118024,118025,118027,118028,118030],{},"pandas ",[30,118014,118015],{},"DataFrame.to_csv()"," writes the DataFrame index as the first column by default. A freshly created DataFrame has a ",[30,118018,118019],{},"RangeIndex"," — integers ",[30,118022,118023],{},"0, 1, 2, …"," — with no name. When serialised to CSV, the index occupies the first column with an empty header. When that file is re-read with ",[30,118026,112835],{},", pandas assigns the header ",[30,118029,117994],{}," to any column whose header is an empty string.",[14,118032,118033],{},"The default signature is:",[23,118035,118038],{"className":118036,"code":118037,"language":2000},[1998],"DataFrame.to_csv(path_or_buf, sep=',', index=True, ...)\n",[30,118039,118037],{"__ignoreMap":28},[14,118041,118042,118044,118045,46332,118047,118049],{},[30,118043,118005],{}," is not a mistake in general — a meaningful, named index (a date series, a primary key) is worth writing. 
The problem is that the ",[26245,118046,6685],{},[30,118048,118019],{}," is meaningless and pollutes every consumer.",[2537,118051],{},[18,118053,99786],{"id":54445},[14,118055,118056],{},"Run this to confirm the symptom:",[23,118058,118060],{"className":126,"code":118059,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nOUT = Path(\"out.csv\")\n\ndf = pd.DataFrame({\"a\": [1, 2, 3], \"b\": [\"x\", \"y\", \"z\"]})\ndf.to_csv(OUT)                       # index=True is the default\n\nraw = OUT.read_text()\nprint(\"--- raw CSV ---\")\nprint(raw)\n\ndf_back = pd.read_csv(OUT)\nprint(\"--- re-read columns ---\")\nprint(df_back.columns.tolist())      # ['Unnamed: 0', 'a', 'b']\n",[30,118061,118062,118066,118076,118086,118090,118103,118107,118152,118164,118168,118180,118191,118197,118201,118214,118225],{"__ignoreMap":28},[33,118063,118064],{"class":35,"line":36},[33,118065,8895],{"class":39},[33,118067,118068,118070,118072,118074],{"class":35,"line":43},[33,118069,164],{"class":163},[33,118071,492],{"class":167},[33,118073,495],{"class":163},[33,118075,498],{"class":167},[33,118077,118078,118080,118082,118084],{"class":35,"line":61},[33,118079,190],{"class":163},[33,118081,193],{"class":167},[33,118083,164],{"class":163},[33,118085,198],{"class":167},[33,118087,118088],{"class":35,"line":73},[33,118089,92],{"emptyLinePlaceholder":91},[33,118091,118092,118094,118096,118098,118101],{"class":35,"line":88},[33,118093,57716],{"class":50},[33,118095,212],{"class":163},[33,118097,215],{"class":167},[33,118099,118100],{"class":54},"\"out.csv\"",[33,118102,221],{"class":167},[33,118104,118105],{"class":35,"line":95},[33,118106,92],{"emptyLinePlaceholder":91},[33,118108,118109,118111,118113,118115,118118,118120,118122,118124,118126,118128,118130,118132,118135,118137,118140,118142,118145,118147,118150],{"class":35,"line":101},[33,118110,13459],{"class":167},[33,118112,242],{"class":163},[33,118114,101407],{"class":167},[33,118
116,118117],{"class":54},"\"a\"",[33,118119,12426],{"class":167},[33,118121,734],{"class":50},[33,118123,365],{"class":167},[33,118125,1533],{"class":50},[33,118127,365],{"class":167},[33,118129,10258],{"class":50},[33,118131,8314],{"class":167},[33,118133,118134],{"class":54},"\"b\"",[33,118136,12426],{"class":167},[33,118138,118139],{"class":54},"\"x\"",[33,118141,365],{"class":167},[33,118143,118144],{"class":54},"\"y\"",[33,118146,365],{"class":167},[33,118148,118149],{"class":54},"\"z\"",[33,118151,45051],{"class":167},[33,118153,118154,118156,118158,118161],{"class":35,"line":171},[33,118155,16503],{"class":167},[33,118157,57716],{"class":50},[33,118159,118160],{"class":167},")                       ",[33,118162,118163],{"class":39},"# index=True is the default\n",[33,118165,118166],{"class":35,"line":179},[33,118167,92],{"emptyLinePlaceholder":91},[33,118169,118170,118172,118174,118177],{"class":35,"line":187},[33,118171,96164],{"class":167},[33,118173,242],{"class":163},[33,118175,118176],{"class":50}," OUT",[33,118178,118179],{"class":167},".read_text()\n",[33,118181,118182,118184,118186,118189],{"class":35,"line":201},[33,118183,13474],{"class":50},[33,118185,602],{"class":167},[33,118187,118188],{"class":54},"\"--- raw CSV ---\"",[33,118190,221],{"class":167},[33,118192,118193,118195],{"class":35,"line":206},[33,118194,13474],{"class":50},[33,118196,96377],{"class":167},[33,118198,118199],{"class":35,"line":224},[33,118200,92],{"emptyLinePlaceholder":91},[33,118202,118203,118206,118208,118210,118212],{"class":35,"line":229},[33,118204,118205],{"class":167},"df_back ",[33,118207,242],{"class":163},[33,118209,9481],{"class":167},[33,118211,57716],{"class":50},[33,118213,221],{"class":167},[33,118215,118216,118218,118220,118223],{"class":35,"line":235},[33,118217,13474],{"class":50},[33,118219,602],{"class":167},[33,118221,118222],{"class":54},"\"--- re-read columns 
---\"",[33,118224,221],{"class":167},[33,118226,118227,118229,118232],{"class":35,"line":250},[33,118228,13474],{"class":50},[33,118230,118231],{"class":167},"(df_back.columns.tolist())      ",[33,118233,118234],{"class":39},"# ['Unnamed: 0', 'a', 'b']\n",[14,118236,36588],{},[23,118238,118241],{"className":118239,"code":118240,"language":2000},[1998],"--- raw CSV ---\n,a,b\n0,1,x\n1,2,y\n2,3,z\n\n--- re-read columns ---\n['Unnamed: 0', 'a', 'b']\n",[30,118242,118240],{"__ignoreMap":28},[14,118244,118245,118246,118248,118249,3035],{},"The leading ",[30,118247,63503],{}," on the header line is the serialised empty index name. Every downstream tool sees it differently: Excel shows a blank column A, Redshift rejects the file with a column-count error, and pandas names it ",[30,118250,117994],{},[2537,118252],{},[18,118254,58405,118256],{"id":118255},"fix-indexfalse",[30,118257,28142],{},[14,118259,4358,118260,118262,118263,118266,118267,20891],{},[30,118261,28142],{}," to every ",[30,118264,118265],{},"to_csv"," call that uses the default ",[30,118268,118019],{},[23,118270,118272],{"className":126,"code":118271,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nOUT = Path(\"out_fixed.csv\")\n\ndf = pd.DataFrame({\"a\": [1, 2, 3], \"b\": [\"x\", \"y\", \"z\"]})\n\ntry:\n    df.to_csv(OUT, index=False)      # suppress the RangeIndex column\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: {e}\")\n\nraw = OUT.read_text()\nprint(\"--- raw CSV ---\")\nprint(raw)\n\ndf_back = pd.read_csv(OUT)\nprint(\"--- re-read columns ---\")\nprint(df_back.columns.tolist())      # ['a', 'b'] — 
clean\n",[30,118273,118274,118278,118288,118298,118302,118315,118319,118359,118363,118369,118388,118398,118421,118425,118435,118445,118451,118455,118467,118477],{"__ignoreMap":28},[33,118275,118276],{"class":35,"line":36},[33,118277,8895],{"class":39},[33,118279,118280,118282,118284,118286],{"class":35,"line":43},[33,118281,164],{"class":163},[33,118283,492],{"class":167},[33,118285,495],{"class":163},[33,118287,498],{"class":167},[33,118289,118290,118292,118294,118296],{"class":35,"line":61},[33,118291,190],{"class":163},[33,118293,193],{"class":167},[33,118295,164],{"class":163},[33,118297,198],{"class":167},[33,118299,118300],{"class":35,"line":73},[33,118301,92],{"emptyLinePlaceholder":91},[33,118303,118304,118306,118308,118310,118313],{"class":35,"line":88},[33,118305,57716],{"class":50},[33,118307,212],{"class":163},[33,118309,215],{"class":167},[33,118311,118312],{"class":54},"\"out_fixed.csv\"",[33,118314,221],{"class":167},[33,118316,118317],{"class":35,"line":95},[33,118318,92],{"emptyLinePlaceholder":91},[33,118320,118321,118323,118325,118327,118329,118331,118333,118335,118337,118339,118341,118343,118345,118347,118349,118351,118353,118355,118357],{"class":35,"line":101},[33,118322,13459],{"class":167},[33,118324,242],{"class":163},[33,118326,101407],{"class":167},[33,118328,118117],{"class":54},[33,118330,12426],{"class":167},[33,118332,734],{"class":50},[33,118334,365],{"class":167},[33,118336,1533],{"class":50},[33,118338,365],{"class":167},[33,118340,10258],{"class":50},[33,118342,8314],{"class":167},[33,118344,118134],{"class":54},[33,118346,12426],{"class":167},[33,118348,118139],{"class":54},[33,118350,365],{"class":167},[33,118352,118144],{"class":54},[33,118354,365],{"class":167},[33,118356,118149],{"class":54},[33,118358,45051],{"class":167},[33,118360,118361],{"class":35,"line":171},[33,118362,92],{"emptyLinePlaceholder":91},[33,118364,118365,118367],{"class":35,"line":179},[33,118366,35574],{"class":163},[33,118368,574],{"class":167},[33,118370
,118371,118373,118375,118377,118379,118381,118383,118385],{"class":35,"line":187},[33,118372,39534],{"class":167},[33,118374,57716],{"class":50},[33,118376,365],{"class":167},[33,118378,897],{"class":238},[33,118380,242],{"class":163},[33,118382,902],{"class":50},[33,118384,54109],{"class":167},[33,118386,118387],{"class":39},"# suppress the RangeIndex column\n",[33,118389,118390,118392,118394,118396],{"class":35,"line":201},[33,118391,35726],{"class":163},[33,118393,107953],{"class":50},[33,118395,1852],{"class":163},[33,118397,7583],{"class":167},[33,118399,118400,118402,118404,118406,118408,118411,118413,118415,118417,118419],{"class":35,"line":206},[33,118401,35742],{"class":163},[33,118403,16617],{"class":50},[33,118405,602],{"class":167},[33,118407,4059],{"class":163},[33,118409,118410],{"class":54},"\"Write failed: ",[33,118412,1115],{"class":50},[33,118414,7602],{"class":167},[33,118416,1121],{"class":50},[33,118418,274],{"class":54},[33,118420,221],{"class":167},[33,118422,118423],{"class":35,"line":224},[33,118424,92],{"emptyLinePlaceholder":91},[33,118426,118427,118429,118431,118433],{"class":35,"line":229},[33,118428,96164],{"class":167},[33,118430,242],{"class":163},[33,118432,118176],{"class":50},[33,118434,118179],{"class":167},[33,118436,118437,118439,118441,118443],{"class":35,"line":235},[33,118438,13474],{"class":50},[33,118440,602],{"class":167},[33,118442,118188],{"class":54},[33,118444,221],{"class":167},[33,118446,118447,118449],{"class":35,"line":250},[33,118448,13474],{"class":50},[33,118450,96377],{"class":167},[33,118452,118453],{"class":35,"line":266},[33,118454,92],{"emptyLinePlaceholder":91},[33,118456,118457,118459,118461,118463,118465],{"class":35,"line":290},[33,118458,118205],{"class":167},[33,118460,242],{"class":163},[33,118462,9481],{"class":167},[33,118464,57716],{"class":50},[33,118466,221],{"class":167},[33,118468,118469,118471,118473,118475],{"class":35,"line":295},[33,118470,13474],{"class":50},[33,118472,602],{"class":167},
[33,118474,118222],{"class":54},[33,118476,221],{"class":167},[33,118478,118479,118481,118483],{"class":35,"line":300},[33,118480,13474],{"class":50},[33,118482,118231],{"class":167},[33,118484,118485],{"class":39},"# ['a', 'b'] — clean\n",[14,118487,36588],{},[23,118489,118492],{"className":118490,"code":118491,"language":2000},[1998],"--- raw CSV ---\na,b\n1,x\n2,y\n3,z\n\n--- re-read columns ---\n['a', 'b']\n",[30,118493,118491],{"__ignoreMap":28},[14,118495,118496,118497,118499,118500,102135],{},"One changed line — ",[30,118498,28142],{}," — eliminates the extra column entirely. This is the canonical fix described in the ",[940,118501,108865],{"href":108864},[2537,118503],{},[18,118505,118507],{"id":118506},"variant-fix-file-already-written-with-the-index","Variant Fix: File Already Written With the Index",[14,118509,118510],{},"If the file already exists on disk with the extra column, you have two options.",[424,118512,118514,118515,118518],{"id":118513},"option-a-re-read-with-index_col0-then-re-export","Option A — Re-read with ",[30,118516,118517],{},"index_col=0",", then re-export",[14,118520,17059,118521,118523,118524,118526],{},[30,118522,118517],{}," to tell pandas that the first column ",[26245,118525,3847],{}," the index, absorbing it back into the DataFrame object and out of the column list:",[23,118528,118530],{"className":126,"code":118529,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nBAD_FILE = Path(\"out.csv\")        # written with index=True by mistake\nFIXED = Path(\"out_fixed.csv\")\n\ntry:\n    df = pd.read_csv(BAD_FILE, index_col=0)   # absorb the leading column as index\nexcept FileNotFoundError as e:\n    raise SystemExit(f\"File not found: {e}\")\n\nprint(\"Columns after absorb:\", df.columns.tolist())   # ['a', 'b']\nprint(\"Index:\", df.index.tolist())                     # [0, 1, 2] — the absorbed RangeIndex\n\ntry:\n    df.to_csv(FIXED, index=False)   # now export without 
the index\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: {e}\")\n\ncheck = pd.read_csv(FIXED)\nassert \"Unnamed: 0\" not in check.columns, \"Still has the extra column!\"\nprint(\"Fixed file columns:\", check.columns.tolist())\n",[30,118531,118532,118536,118546,118556,118560,118577,118590,118594,118600,118624,118634,118656,118660,118675,118690,118694,118700,118719,118729,118751,118755,118768,118785],{"__ignoreMap":28},[33,118533,118534],{"class":35,"line":36},[33,118535,8895],{"class":39},[33,118537,118538,118540,118542,118544],{"class":35,"line":43},[33,118539,164],{"class":163},[33,118541,492],{"class":167},[33,118543,495],{"class":163},[33,118545,498],{"class":167},[33,118547,118548,118550,118552,118554],{"class":35,"line":61},[33,118549,190],{"class":163},[33,118551,193],{"class":167},[33,118553,164],{"class":163},[33,118555,198],{"class":167},[33,118557,118558],{"class":35,"line":73},[33,118559,92],{"emptyLinePlaceholder":91},[33,118561,118562,118565,118567,118569,118571,118574],{"class":35,"line":88},[33,118563,118564],{"class":50},"BAD_FILE",[33,118566,212],{"class":163},[33,118568,215],{"class":167},[33,118570,118100],{"class":54},[33,118572,118573],{"class":167},")        ",[33,118575,118576],{"class":39},"# written with index=True by 
mistake\n",[33,118578,118579,118582,118584,118586,118588],{"class":35,"line":95},[33,118580,118581],{"class":50},"FIXED",[33,118583,212],{"class":163},[33,118585,215],{"class":167},[33,118587,118312],{"class":54},[33,118589,221],{"class":167},[33,118591,118592],{"class":35,"line":101},[33,118593,92],{"emptyLinePlaceholder":91},[33,118595,118596,118598],{"class":35,"line":171},[33,118597,35574],{"class":163},[33,118599,574],{"class":167},[33,118601,118602,118604,118606,118608,118610,118612,118615,118617,118619,118621],{"class":35,"line":179},[33,118603,4025],{"class":167},[33,118605,242],{"class":163},[33,118607,9481],{"class":167},[33,118609,118564],{"class":50},[33,118611,365],{"class":167},[33,118613,118614],{"class":238},"index_col",[33,118616,242],{"class":163},[33,118618,748],{"class":50},[33,118620,12000],{"class":167},[33,118622,118623],{"class":39},"# absorb the leading column as index\n",[33,118625,118626,118628,118630,118632],{"class":35,"line":187},[33,118627,35726],{"class":163},[33,118629,2945],{"class":50},[33,118631,1852],{"class":163},[33,118633,7583],{"class":167},[33,118635,118636,118638,118640,118642,118644,118646,118648,118650,118652,118654],{"class":35,"line":201},[33,118637,35742],{"class":163},[33,118639,16617],{"class":50},[33,118641,602],{"class":167},[33,118643,4059],{"class":163},[33,118645,15677],{"class":54},[33,118647,1115],{"class":50},[33,118649,7602],{"class":167},[33,118651,1121],{"class":50},[33,118653,274],{"class":54},[33,118655,221],{"class":167},[33,118657,118658],{"class":35,"line":206},[33,118659,92],{"emptyLinePlaceholder":91},[33,118661,118662,118664,118666,118669,118672],{"class":35,"line":224},[33,118663,13474],{"class":50},[33,118665,602],{"class":167},[33,118667,118668],{"class":54},"\"Columns after absorb:\"",[33,118670,118671],{"class":167},", df.columns.tolist())   ",[33,118673,118674],{"class":39},"# ['a', 
'b']\n",[33,118676,118677,118679,118681,118684,118687],{"class":35,"line":229},[33,118678,13474],{"class":50},[33,118680,602],{"class":167},[33,118682,118683],{"class":54},"\"Index:\"",[33,118685,118686],{"class":167},", df.index.tolist())                     ",[33,118688,118689],{"class":39},"# [0, 1, 2] — the absorbed RangeIndex\n",[33,118691,118692],{"class":35,"line":235},[33,118693,92],{"emptyLinePlaceholder":91},[33,118695,118696,118698],{"class":35,"line":250},[33,118697,35574],{"class":163},[33,118699,574],{"class":167},[33,118701,118702,118704,118706,118708,118710,118712,118714,118716],{"class":35,"line":266},[33,118703,39534],{"class":167},[33,118705,118581],{"class":50},[33,118707,365],{"class":167},[33,118709,897],{"class":238},[33,118711,242],{"class":163},[33,118713,902],{"class":50},[33,118715,12000],{"class":167},[33,118717,118718],{"class":39},"# now export without the index\n",[33,118720,118721,118723,118725,118727],{"class":35,"line":290},[33,118722,35726],{"class":163},[33,118724,107953],{"class":50},[33,118726,1852],{"class":163},[33,118728,7583],{"class":167},[33,118730,118731,118733,118735,118737,118739,118741,118743,118745,118747,118749],{"class":35,"line":295},[33,118732,35742],{"class":163},[33,118734,16617],{"class":50},[33,118736,602],{"class":167},[33,118738,4059],{"class":163},[33,118740,118410],{"class":54},[33,118742,1115],{"class":50},[33,118744,7602],{"class":167},[33,118746,1121],{"class":50},[33,118748,274],{"class":54},[33,118750,221],{"class":167},[33,118752,118753],{"class":35,"line":300},[33,118754,92],{"emptyLinePlaceholder":91},[33,118756,118757,118760,118762,118764,118766],{"class":35,"line":317},[33,118758,118759],{"class":167},"check ",[33,118761,242],{"class":163},[33,118763,9481],{"class":167},[33,118765,118581],{"class":50},[33,118767,221],{"class":167},[33,118769,118770,118772,118775,118777,118779,118782],{"class":35,"line":332},[33,118771,36397],{"class":163},[33,118773,118774],{"class":54}," \"Unnamed: 
0\"",[33,118776,620],{"class":163},[33,118778,8002],{"class":163},[33,118780,118781],{"class":167}," check.columns, ",[33,118783,118784],{"class":54},"\"Still has the extra column!\"\n",[33,118786,118787,118789,118791,118794],{"class":35,"line":347},[33,118788,13474],{"class":50},[33,118790,602],{"class":167},[33,118792,118793],{"class":54},"\"Fixed file columns:\"",[33,118795,118796],{"class":167},", check.columns.tolist())\n",[424,118798,118800],{"id":118799},"option-b-drop-the-column-after-re-reading","Option B — Drop the column after re-reading",[14,118802,118803],{},"If you have no control over how the file was written and it may or may not have the extra column:",[23,118805,118807],{"className":126,"code":118806,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nFILE = Path(\"out.csv\")   # may or may not have Unnamed: 0\n\ntry:\n    df = pd.read_csv(FILE)\nexcept FileNotFoundError as e:\n    raise SystemExit(f\"File not found: {e}\")\n\n# Drop any leading unnamed columns produced by a serialised RangeIndex\nunnamed_cols = [c for c in df.columns if str(c).startswith(\"Unnamed:\")]\nif unnamed_cols:\n    df = df.drop(columns=unnamed_cols)\n    print(f\"Dropped columns: {unnamed_cols}\")\n\nprint(\"Clean columns:\", 
df.columns.tolist())\n",[30,118808,118809,118813,118823,118833,118837,118853,118857,118863,118875,118885,118907,118911,118916,118945,118952,118967,118989,118993],{"__ignoreMap":28},[33,118810,118811],{"class":35,"line":36},[33,118812,8895],{"class":39},[33,118814,118815,118817,118819,118821],{"class":35,"line":43},[33,118816,164],{"class":163},[33,118818,492],{"class":167},[33,118820,495],{"class":163},[33,118822,498],{"class":167},[33,118824,118825,118827,118829,118831],{"class":35,"line":61},[33,118826,190],{"class":163},[33,118828,193],{"class":167},[33,118830,164],{"class":163},[33,118832,198],{"class":167},[33,118834,118835],{"class":35,"line":73},[33,118836,92],{"emptyLinePlaceholder":91},[33,118838,118839,118842,118844,118846,118848,118850],{"class":35,"line":88},[33,118840,118841],{"class":50},"FILE",[33,118843,212],{"class":163},[33,118845,215],{"class":167},[33,118847,118100],{"class":54},[33,118849,12000],{"class":167},[33,118851,118852],{"class":39},"# may or may not have Unnamed: 
0\n",[33,118854,118855],{"class":35,"line":95},[33,118856,92],{"emptyLinePlaceholder":91},[33,118858,118859,118861],{"class":35,"line":101},[33,118860,35574],{"class":163},[33,118862,574],{"class":167},[33,118864,118865,118867,118869,118871,118873],{"class":35,"line":171},[33,118866,4025],{"class":167},[33,118868,242],{"class":163},[33,118870,9481],{"class":167},[33,118872,118841],{"class":50},[33,118874,221],{"class":167},[33,118876,118877,118879,118881,118883],{"class":35,"line":179},[33,118878,35726],{"class":163},[33,118880,2945],{"class":50},[33,118882,1852],{"class":163},[33,118884,7583],{"class":167},[33,118886,118887,118889,118891,118893,118895,118897,118899,118901,118903,118905],{"class":35,"line":187},[33,118888,35742],{"class":163},[33,118890,16617],{"class":50},[33,118892,602],{"class":167},[33,118894,4059],{"class":163},[33,118896,15677],{"class":54},[33,118898,1115],{"class":50},[33,118900,7602],{"class":167},[33,118902,1121],{"class":50},[33,118904,274],{"class":54},[33,118906,221],{"class":167},[33,118908,118909],{"class":35,"line":201},[33,118910,92],{"emptyLinePlaceholder":91},[33,118912,118913],{"class":35,"line":206},[33,118914,118915],{"class":39},"# Drop any leading unnamed columns produced by a serialised RangeIndex\n",[33,118917,118918,118921,118923,118925,118927,118929,118931,118933,118935,118937,118940,118943],{"class":35,"line":224},[33,118919,118920],{"class":167},"unnamed_cols ",[33,118922,242],{"class":163},[33,118924,7740],{"class":167},[33,118926,6124],{"class":163},[33,118928,7486],{"class":167},[33,118930,662],{"class":163},[33,118932,7837],{"class":167},[33,118934,2491],{"class":163},[33,118936,7887],{"class":50},[33,118938,118939],{"class":167},"(c).startswith(",[33,118941,118942],{"class":54},"\"Unnamed:\"",[33,118944,7767],{"class":167},[33,118946,118947,118949],{"class":35,"line":229},[33,118948,2491],{"class":163},[33,118950,118951],{"class":167}," 
unnamed_cols:\n",[33,118953,118954,118956,118958,118960,118962,118964],{"class":35,"line":235},[33,118955,4025],{"class":167},[33,118957,242],{"class":163},[33,118959,9027],{"class":167},[33,118961,740],{"class":238},[33,118963,242],{"class":163},[33,118965,118966],{"class":167},"unnamed_cols)\n",[33,118968,118969,118971,118973,118975,118978,118980,118983,118985,118987],{"class":35,"line":250},[33,118970,7268],{"class":50},[33,118972,602],{"class":167},[33,118974,4059],{"class":163},[33,118976,118977],{"class":54},"\"Dropped columns: ",[33,118979,1115],{"class":50},[33,118981,118982],{"class":167},"unnamed_cols",[33,118984,1121],{"class":50},[33,118986,274],{"class":54},[33,118988,221],{"class":167},[33,118990,118991],{"class":35,"line":266},[33,118992,92],{"emptyLinePlaceholder":91},[33,118994,118995,118997,118999,119002],{"class":35,"line":290},[33,118996,13474],{"class":50},[33,118998,602],{"class":167},[33,119000,119001],{"class":54},"\"Clean columns:\"",[33,119003,119004],{"class":167},", df.columns.tolist())\n",[14,119006,119007],{},"This is defensive code suitable for pipelines that ingest CSVs from external sources you cannot control.",[2537,119009],{},[18,119011,119013],{"id":119012},"variant-fix-meaningful-index-you-want-to-keep","Variant Fix: Meaningful Index You Want to Keep",[14,119015,119016,119017,119019,119020,119023,119024,3035],{},"Not every index is a ",[30,119018,118019],{},". 
When the index carries real information — a date series, a primary-key column, a category name — you ",[26245,119021,119022],{},"should"," write it, but you must name it so it does not come back as ",[30,119025,117994],{},[2540,119027,2547,119030,2547,119033,2547,119036,2547,2547,119043,2547,119046,2547,119049,2547,119052,2547,119055,2547,119057,2547,119060,2547,119062,2547,119066,2547,119069,2547,119071,2547,119073,2547,119076,2547,119079,2547,2547,119082,2547,119084,2547,119088,2547,119091,2547,119093,2547,119095,2547,119098,2547,119100,2547,119103,2547,119105,2547,119107,2547,119109,2547,119112,2547,119116],{"viewBox":119028,"role":2543,"ariaLabel":119029,"xmlns":2545,"style":2546},"0 0 760 200","Two paths: unnamed RangeIndex produces Unnamed: 0 on re-read; named index produces a proper column header",[2549,119031,119032],{},"Named vs unnamed index round-trip",[2553,119034,119035],{},"Shows that an unnamed RangeIndex written to CSV becomes Unnamed: 0 on re-read, while a named index writes and re-reads with its correct column header.",[2557,119037,2559,119038,2547],{},[2573,119039,2564,119041,2559],{"id":119040,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":2681,"markerHeight":2681,"orient":2578},"fix-index-arrow",[2580,119042],{"d":2582,"fill":2583},[2585,119044],{"x":2587,"y":2587,"width":11115,"height":2680,"rx":2591,"fill":119045,"stroke":2593,"style":2594},"#fff",[2000,119047,119048],{"x":102546,"y":49816,"fill":2599,"style":2600},"RangeIndex (unnamed)",[2000,119050,119051],{"x":102546,"y":58360,"fill":2583,"style":2685},"index.name = 
None",[35,119053],{"x1":2697,"y1":82765,"x2":26446,"y2":82765,"stroke":2583,"markerEnd":119054,"style":2594},"url(#fix-index-arrow)",[2000,119056,118265],{"x":26354,"y":26411,"fill":2583,"style":2605},[2000,119058,119059],{"x":26354,"y":2680,"fill":2583,"style":2605},"(default)",[2585,119061],{"x":26446,"y":2587,"width":2635,"height":2680,"rx":2591,"fill":119045,"stroke":2593,"style":2594},[2000,119063,119065],{"x":119064,"y":49816,"fill":2599,"style":2685},"345","header = \"\" (blank)",[2000,119067,119068],{"x":119064,"y":58360,"fill":2583,"style":2685},"CSV first column",[35,119070],{"x1":59959,"y1":82765,"x2":13437,"y2":82765,"stroke":2583,"markerEnd":119054,"style":2594},[2000,119072,57237],{"x":64900,"y":26411,"fill":2583,"style":2605},[2585,119074],{"x":13437,"y":2587,"width":17008,"height":2680,"rx":2591,"fill":119075,"stroke":58349,"style":2594},"#fef2f2",[2000,119077,117994],{"x":49863,"y":49816,"fill":119078,"style":2600},"#b91c1c",[2000,119080,119081],{"x":49863,"y":58360,"fill":2583,"style":2685},"spurious column",[2585,119083],{"x":2587,"y":2589,"width":11115,"height":2680,"rx":2591,"fill":119045,"stroke":2593,"style":2594},[2000,119085,119087],{"x":102546,"y":119086,"fill":2599,"style":2600},"142","Named index",[2000,119089,119090],{"x":102546,"y":11132,"fill":2583,"style":2685},"index.name = \"date\"",[35,119092],{"x1":2697,"y1":2648,"x2":26446,"y2":2648,"stroke":2583,"markerEnd":119054,"style":2594},[2000,119094,118265],{"x":26354,"y":49842,"fill":2583,"style":2605},[2000,119096,119097],{"x":26354,"y":2635,"fill":2583,"style":2605},"(index=True)",[2585,119099],{"x":26446,"y":2589,"width":2635,"height":2680,"rx":2591,"fill":119045,"stroke":2593,"style":2594},[2000,119101,119102],{"x":119064,"y":119086,"fill":2599,"style":2685},"header = 
\"date\"",[2000,119104,119068],{"x":119064,"y":11132,"fill":2583,"style":2685},[35,119106],{"x1":59959,"y1":2648,"x2":13437,"y2":2648,"stroke":2583,"markerEnd":119054,"style":2594},[2000,119108,57237],{"x":64900,"y":49842,"fill":2583,"style":2605},[2585,119110],{"x":13437,"y":2589,"width":17008,"height":2680,"rx":2591,"fill":119111,"stroke":58377,"style":2594},"#f0fdf4",[2000,119113,119115],{"x":49863,"y":119086,"fill":119114,"style":2600},"#166534","date (proper column)",[2000,119117,119118],{"x":49863,"y":11132,"fill":2583,"style":2685},"clean round-trip",[23,119120,119122],{"className":126,"code":119121,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\n# Monthly sales with a meaningful DatetimeIndex\ndf = pd.DataFrame(\n    {\"revenue\": [10_000, 12_500, 9_800]},\n    index=pd.date_range(\"2024-01-01\", periods=3, freq=\"MS\"),\n)\ndf.index.name = \"month\"          # name the index before writing\n\nOUT = Path(\"exports\u002Fmonthly_sales.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df.to_csv(OUT, index=True, date_format=\"%Y-%m-%d\")\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: {e}\")\n\n# Re-read: specify which column is the index\ntry:\n    df_back = pd.read_csv(OUT, index_col=\"month\", parse_dates=True)\nexcept FileNotFoundError as e:\n    raise SystemExit(f\"File not found: {e}\")\n\nassert \"Unnamed: 0\" not in df_back.columns, \"Spurious column present!\"\nassert df_back.index.name == \"month\"\nprint(\"Round-trip OK. 
Index name:\", df_back.index.name)\nprint(df_back)\n",[30,119123,119124,119128,119138,119148,119152,119157,119166,119190,119224,119228,119241,119245,119258,119280,119284,119290,119319,119329,119351,119355,119360,119366,119395,119405,119427,119431,119447,119459,119471],{"__ignoreMap":28},[33,119125,119126],{"class":35,"line":36},[33,119127,8895],{"class":39},[33,119129,119130,119132,119134,119136],{"class":35,"line":43},[33,119131,164],{"class":163},[33,119133,492],{"class":167},[33,119135,495],{"class":163},[33,119137,498],{"class":167},[33,119139,119140,119142,119144,119146],{"class":35,"line":61},[33,119141,190],{"class":163},[33,119143,193],{"class":167},[33,119145,164],{"class":163},[33,119147,198],{"class":167},[33,119149,119150],{"class":35,"line":73},[33,119151,92],{"emptyLinePlaceholder":91},[33,119153,119154],{"class":35,"line":88},[33,119155,119156],{"class":39},"# Monthly sales with a meaningful DatetimeIndex\n",[33,119158,119159,119161,119163],{"class":35,"line":95},[33,119160,13459],{"class":167},[33,119162,242],{"class":163},[33,119164,119165],{"class":167}," pd.DataFrame(\n",[33,119167,119168,119171,119173,119175,119177,119179,119182,119184,119187],{"class":35,"line":101},[33,119169,119170],{"class":167},"    {",[33,119172,16465],{"class":54},[33,119174,12426],{"class":167},[33,119176,104304],{"class":50},[33,119178,365],{"class":167},[33,119180,119181],{"class":50},"12_500",[33,119183,365],{"class":167},[33,119185,119186],{"class":50},"9_800",[33,119188,119189],{"class":167},"]},\n",[33,119191,119192,119195,119197,119200,119203,119205,119208,119210,119212,119214,119217,119219,119222],{"class":35,"line":171},[33,119193,119194],{"class":238},"    
index",[33,119196,242],{"class":163},[33,119198,119199],{"class":167},"pd.date_range(",[33,119201,119202],{"class":54},"\"2024-01-01\"",[33,119204,365],{"class":167},[33,119206,119207],{"class":238},"periods",[33,119209,242],{"class":163},[33,119211,10258],{"class":50},[33,119213,365],{"class":167},[33,119215,119216],{"class":238},"freq",[33,119218,242],{"class":163},[33,119220,119221],{"class":54},"\"MS\"",[33,119223,1506],{"class":167},[33,119225,119226],{"class":35,"line":179},[33,119227,221],{"class":167},[33,119229,119230,119233,119235,119238],{"class":35,"line":187},[33,119231,119232],{"class":167},"df.index.name ",[33,119234,242],{"class":163},[33,119236,119237],{"class":54}," \"month\"",[33,119239,119240],{"class":39},"          # name the index before writing\n",[33,119242,119243],{"class":35,"line":201},[33,119244,92],{"emptyLinePlaceholder":91},[33,119246,119247,119249,119251,119253,119256],{"class":35,"line":206},[33,119248,57716],{"class":50},[33,119250,212],{"class":163},[33,119252,215],{"class":167},[33,119254,119255],{"class":54},"\"exports\u002Fmonthly_sales.csv\"",[33,119257,221],{"class":167},[33,119259,119260,119262,119264,119266,119268,119270,119272,119274,119276,119278],{"class":35,"line":224},[33,119261,57716],{"class":50},[33,119263,866],{"class":167},[33,119265,869],{"class":238},[33,119267,242],{"class":163},[33,119269,855],{"class":50},[33,119271,365],{"class":167},[33,119273,878],{"class":238},[33,119275,242],{"class":163},[33,119277,855],{"class":50},[33,119279,221],{"class":167},[33,119281,119282],{"class":35,"line":229},[33,119283,92],{"emptyLinePlaceholder":91},[33,119285,119286,119288],{"class":35,"line":235},[33,119287,35574],{"class":163},[33,119289,574],{"class":167},[33,119291,119292,119294,119296,119298,119300,119302,119304,119306,119309,119311,119313,119315,119317],{"class":35,"line":250},[33,119293,39534],{"class":167},[33,119295,57716],{"class":50},[33,119297,365],{"class":167},[33,119299,897],{"class":238},[33,119301,242],{"
class":163},[33,119303,855],{"class":50},[33,119305,365],{"class":167},[33,119307,119308],{"class":238},"date_format",[33,119310,242],{"class":163},[33,119312,1244],{"class":54},[33,119314,916],{"class":50},[33,119316,274],{"class":54},[33,119318,221],{"class":167},[33,119320,119321,119323,119325,119327],{"class":35,"line":266},[33,119322,35726],{"class":163},[33,119324,107953],{"class":50},[33,119326,1852],{"class":163},[33,119328,7583],{"class":167},[33,119330,119331,119333,119335,119337,119339,119341,119343,119345,119347,119349],{"class":35,"line":290},[33,119332,35742],{"class":163},[33,119334,16617],{"class":50},[33,119336,602],{"class":167},[33,119338,4059],{"class":163},[33,119340,118410],{"class":54},[33,119342,1115],{"class":50},[33,119344,7602],{"class":167},[33,119346,1121],{"class":50},[33,119348,274],{"class":54},[33,119350,221],{"class":167},[33,119352,119353],{"class":35,"line":295},[33,119354,92],{"emptyLinePlaceholder":91},[33,119356,119357],{"class":35,"line":300},[33,119358,119359],{"class":39},"# Re-read: specify which column is the index\n",[33,119361,119362,119364],{"class":35,"line":317},[33,119363,35574],{"class":163},[33,119365,574],{"class":167},[33,119367,119368,119371,119373,119375,119377,119379,119381,119383,119385,119387,119389,119391,119393],{"class":35,"line":332},[33,119369,119370],{"class":167},"    df_back 
",[33,119372,242],{"class":163},[33,119374,9481],{"class":167},[33,119376,57716],{"class":50},[33,119378,365],{"class":167},[33,119380,118614],{"class":238},[33,119382,242],{"class":163},[33,119384,96465],{"class":54},[33,119386,365],{"class":167},[33,119388,102641],{"class":238},[33,119390,242],{"class":163},[33,119392,855],{"class":50},[33,119394,221],{"class":167},[33,119396,119397,119399,119401,119403],{"class":35,"line":347},[33,119398,35726],{"class":163},[33,119400,2945],{"class":50},[33,119402,1852],{"class":163},[33,119404,7583],{"class":167},[33,119406,119407,119409,119411,119413,119415,119417,119419,119421,119423,119425],{"class":35,"line":374},[33,119408,35742],{"class":163},[33,119410,16617],{"class":50},[33,119412,602],{"class":167},[33,119414,4059],{"class":163},[33,119416,15677],{"class":54},[33,119418,1115],{"class":50},[33,119420,7602],{"class":167},[33,119422,1121],{"class":50},[33,119424,274],{"class":54},[33,119426,221],{"class":167},[33,119428,119429],{"class":35,"line":397},[33,119430,92],{"emptyLinePlaceholder":91},[33,119432,119433,119435,119437,119439,119441,119444],{"class":35,"line":653},[33,119434,36397],{"class":163},[33,119436,118774],{"class":54},[33,119438,620],{"class":163},[33,119440,8002],{"class":163},[33,119442,119443],{"class":167}," df_back.columns, ",[33,119445,119446],{"class":54},"\"Spurious column present!\"\n",[33,119448,119449,119451,119454,119456],{"class":35,"line":667},[33,119450,36397],{"class":163},[33,119452,119453],{"class":167}," df_back.index.name ",[33,119455,1865],{"class":163},[33,119457,119458],{"class":54}," \"month\"\n",[33,119460,119461,119463,119465,119468],{"class":35,"line":675},[33,119462,13474],{"class":50},[33,119464,602],{"class":167},[33,119466,119467],{"class":54},"\"Round-trip OK. 
Index name:\"",[33,119469,119470],{"class":167},", df_back.index.name)\n",[33,119472,119473,119475],{"class":35,"line":689},[33,119474,13474],{"class":50},[33,119476,119477],{"class":167},"(df_back)\n",[14,119479,119480,119481,4348,119484,119486,119487,34992,119489,119491,119492,119494],{},"The rule is simple: if ",[30,119482,119483],{},"index.name",[30,119485,571],{},", write with ",[30,119488,28142],{},[30,119490,119483],{}," is set to a meaningful string, writing with ",[30,119493,118005],{}," is correct and safe.",[2537,119496],{},[18,119498,119500],{"id":119499},"variant-fix-reset-a-rangeindex-you-want-as-a-column","Variant Fix: Reset a RangeIndex You Want as a Column",[14,119502,119503,119504,119507,119508,119510,119511,20891],{},"Sometimes you genuinely want the integer row numbers in the output — as a ",[30,119505,119506],{},"row_id"," column, for example. The right approach is to reset the index into a named column ",[26245,119509,59039],{}," calling ",[30,119512,118265],{},[23,119514,119516],{"className":126,"code":119515,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.DataFrame({\"sku\": [\"A1\", \"B2\", \"C3\"], \"qty\": [10, 20, 30]})\n\n# Promote the RangeIndex to a named column, then export\ndf_with_id = df.reset_index().rename(columns={\"index\": \"row_id\"})\n\nOUT = Path(\"exports\u002Fwith_row_id.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df_with_id.to_csv(OUT, index=False)   # still index=False — the column is now in the data\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: {e}\")\n\ncheck = pd.read_csv(OUT)\nassert \"row_id\" in check.columns\nassert \"Unnamed: 0\" not in check.columns\nprint(\"Columns:\", check.columns.tolist())   # ['row_id', 'sku', 
'qty']\n",[30,119517,119518,119522,119532,119542,119546,119589,119593,119598,119624,119628,119641,119663,119667,119673,119693,119703,119725,119729,119741,119753,119765],{"__ignoreMap":28},[33,119519,119520],{"class":35,"line":36},[33,119521,8895],{"class":39},[33,119523,119524,119526,119528,119530],{"class":35,"line":43},[33,119525,164],{"class":163},[33,119527,492],{"class":167},[33,119529,495],{"class":163},[33,119531,498],{"class":167},[33,119533,119534,119536,119538,119540],{"class":35,"line":61},[33,119535,190],{"class":163},[33,119537,193],{"class":167},[33,119539,164],{"class":163},[33,119541,198],{"class":167},[33,119543,119544],{"class":35,"line":73},[33,119545,92],{"emptyLinePlaceholder":91},[33,119547,119548,119550,119552,119554,119557,119559,119561,119563,119566,119568,119571,119573,119575,119577,119579,119581,119583,119585,119587],{"class":35,"line":88},[33,119549,13459],{"class":167},[33,119551,242],{"class":163},[33,119553,101407],{"class":167},[33,119555,119556],{"class":54},"\"sku\"",[33,119558,12426],{"class":167},[33,119560,99899],{"class":54},[33,119562,365],{"class":167},[33,119564,119565],{"class":54},"\"B2\"",[33,119567,365],{"class":167},[33,119569,119570],{"class":54},"\"C3\"",[33,119572,8314],{"class":167},[33,119574,54232],{"class":54},[33,119576,12426],{"class":167},[33,119578,3545],{"class":50},[33,119580,365],{"class":167},[33,119582,2587],{"class":50},[33,119584,365],{"class":167},[33,119586,1543],{"class":50},[33,119588,45051],{"class":167},[33,119590,119591],{"class":35,"line":95},[33,119592,92],{"emptyLinePlaceholder":91},[33,119594,119595],{"class":35,"line":101},[33,119596,119597],{"class":39},"# Promote the RangeIndex to a named column, then export\n",[33,119599,119600,119603,119605,119608,119610,119612,119614,119617,119619,119622],{"class":35,"line":171},[33,119601,119602],{"class":167},"df_with_id ",[33,119604,242],{"class":163},[33,119606,119607],{"class":167}," 
df.reset_index().rename(",[33,119609,740],{"class":238},[33,119611,242],{"class":163},[33,119613,1115],{"class":167},[33,119615,119616],{"class":54},"\"index\"",[33,119618,2079],{"class":167},[33,119620,119621],{"class":54},"\"row_id\"",[33,119623,103249],{"class":167},[33,119625,119626],{"class":35,"line":179},[33,119627,92],{"emptyLinePlaceholder":91},[33,119629,119630,119632,119634,119636,119639],{"class":35,"line":187},[33,119631,57716],{"class":50},[33,119633,212],{"class":163},[33,119635,215],{"class":167},[33,119637,119638],{"class":54},"\"exports\u002Fwith_row_id.csv\"",[33,119640,221],{"class":167},[33,119642,119643,119645,119647,119649,119651,119653,119655,119657,119659,119661],{"class":35,"line":201},[33,119644,57716],{"class":50},[33,119646,866],{"class":167},[33,119648,869],{"class":238},[33,119650,242],{"class":163},[33,119652,855],{"class":50},[33,119654,365],{"class":167},[33,119656,878],{"class":238},[33,119658,242],{"class":163},[33,119660,855],{"class":50},[33,119662,221],{"class":167},[33,119664,119665],{"class":35,"line":206},[33,119666,92],{"emptyLinePlaceholder":91},[33,119668,119669,119671],{"class":35,"line":224},[33,119670,35574],{"class":163},[33,119672,574],{"class":167},[33,119674,119675,119678,119680,119682,119684,119686,119688,119690],{"class":35,"line":229},[33,119676,119677],{"class":167},"    df_with_id.to_csv(",[33,119679,57716],{"class":50},[33,119681,365],{"class":167},[33,119683,897],{"class":238},[33,119685,242],{"class":163},[33,119687,902],{"class":50},[33,119689,12000],{"class":167},[33,119691,119692],{"class":39},"# still index=False — the column is now in the 
data\n",[33,119694,119695,119697,119699,119701],{"class":35,"line":235},[33,119696,35726],{"class":163},[33,119698,107953],{"class":50},[33,119700,1852],{"class":163},[33,119702,7583],{"class":167},[33,119704,119705,119707,119709,119711,119713,119715,119717,119719,119721,119723],{"class":35,"line":250},[33,119706,35742],{"class":163},[33,119708,16617],{"class":50},[33,119710,602],{"class":167},[33,119712,4059],{"class":163},[33,119714,118410],{"class":54},[33,119716,1115],{"class":50},[33,119718,7602],{"class":167},[33,119720,1121],{"class":50},[33,119722,274],{"class":54},[33,119724,221],{"class":167},[33,119726,119727],{"class":35,"line":266},[33,119728,92],{"emptyLinePlaceholder":91},[33,119730,119731,119733,119735,119737,119739],{"class":35,"line":290},[33,119732,118759],{"class":167},[33,119734,242],{"class":163},[33,119736,9481],{"class":167},[33,119738,57716],{"class":50},[33,119740,221],{"class":167},[33,119742,119743,119745,119748,119750],{"class":35,"line":295},[33,119744,36397],{"class":163},[33,119746,119747],{"class":54}," \"row_id\"",[33,119749,8002],{"class":163},[33,119751,119752],{"class":167}," check.columns\n",[33,119754,119755,119757,119759,119761,119763],{"class":35,"line":300},[33,119756,36397],{"class":163},[33,119758,118774],{"class":54},[33,119760,620],{"class":163},[33,119762,8002],{"class":163},[33,119764,119752],{"class":167},[33,119766,119767,119769,119771,119774,119777],{"class":35,"line":317},[33,119768,13474],{"class":50},[33,119770,602],{"class":167},[33,119772,119773],{"class":54},"\"Columns:\"",[33,119775,119776],{"class":167},", check.columns.tolist())   ",[33,119778,119779],{"class":39},"# ['row_id', 'sku', 'qty']\n",[14,119781,119782],{},"This pattern works for any scenario where you want a positional ID in the output without relying on pandas index serialisation behaviour.",[2537,119784],{},[18,119786,9247],{"id":9246},[14,119788,119789],{},"Confirm the fix with a round-trip 
assertion:",[23,119791,119793],{"className":126,"code":119792,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nORIGINAL = pd.DataFrame({\"a\": [1, 2, 3], \"b\": [\"x\", \"y\", \"z\"]})\nOUT = Path(\"verify_out.csv\")\n\ntry:\n    ORIGINAL.to_csv(OUT, index=False)\n    restored = pd.read_csv(OUT)\n    assert list(restored.columns) == list(ORIGINAL.columns), \\\n        f\"Column mismatch: {restored.columns.tolist()} vs {ORIGINAL.columns.tolist()}\"\n    assert len(restored) == len(ORIGINAL), \\\n        f\"Row count mismatch: {len(restored)} vs {len(ORIGINAL)}\"\n    assert \"Unnamed: 0\" not in restored.columns, \"Spurious column still present\"\n    print(\"Verification passed:\", restored.columns.tolist())\nexcept OSError as e:\n    raise SystemExit(f\"I\u002FO error: {e}\")\nexcept AssertionError as e:\n    raise SystemExit(f\"Assertion failed: {e}\")\n",[30,119794,119795,119799,119809,119819,119823,119864,119877,119881,119887,119907,119920,119940,119966,119986,120014,120030,120042,120052,120075,120085],{"__ignoreMap":28},[33,119796,119797],{"class":35,"line":36},[33,119798,8895],{"class":39},[33,119800,119801,119803,119805,119807],{"class":35,"line":43},[33,119802,164],{"class":163},[33,119804,492],{"class":167},[33,119806,495],{"class":163},[33,119808,498],{"class":167},[33,119810,119811,119813,119815,119817],{"class":35,"line":61},[33,119812,190],{"class":163},[33,119814,193],{"class":167},[33,119816,164],{"class":163},[33,119818,198],{"class":167},[33,119820,119821],{"class":35,"line":73},[33,119822,92],{"emptyLinePlaceholder":91},[33,119824,119825,119828,119830,119832,119834,119836,119838,119840,119842,119844,119846,119848,119850,119852,119854,119856,119858,119860,119862],{"class":35,"line":88},[33,119826,119827],{"class":50},"ORIGINAL",[33,119829,212],{"class":163},[33,119831,101407],{"class":167},[33,119833,118117],{"class":54},[33,119835,12426],{"class":167},[33,119837,734],{"class":50},[33,119839
,365],{"class":167},[33,119841,1533],{"class":50},[33,119843,365],{"class":167},[33,119845,10258],{"class":50},[33,119847,8314],{"class":167},[33,119849,118134],{"class":54},[33,119851,12426],{"class":167},[33,119853,118139],{"class":54},[33,119855,365],{"class":167},[33,119857,118144],{"class":54},[33,119859,365],{"class":167},[33,119861,118149],{"class":54},[33,119863,45051],{"class":167},[33,119865,119866,119868,119870,119872,119875],{"class":35,"line":95},[33,119867,57716],{"class":50},[33,119869,212],{"class":163},[33,119871,215],{"class":167},[33,119873,119874],{"class":54},"\"verify_out.csv\"",[33,119876,221],{"class":167},[33,119878,119879],{"class":35,"line":101},[33,119880,92],{"emptyLinePlaceholder":91},[33,119882,119883,119885],{"class":35,"line":171},[33,119884,35574],{"class":163},[33,119886,574],{"class":167},[33,119888,119889,119892,119895,119897,119899,119901,119903,119905],{"class":35,"line":179},[33,119890,119891],{"class":50},"    ORIGINAL",[33,119893,119894],{"class":167},".to_csv(",[33,119896,57716],{"class":50},[33,119898,365],{"class":167},[33,119900,897],{"class":238},[33,119902,242],{"class":163},[33,119904,902],{"class":50},[33,119906,221],{"class":167},[33,119908,119909,119912,119914,119916,119918],{"class":35,"line":187},[33,119910,119911],{"class":167},"    restored ",[33,119913,242],{"class":163},[33,119915,9481],{"class":167},[33,119917,57716],{"class":50},[33,119919,221],{"class":167},[33,119921,119922,119924,119926,119929,119931,119933,119935,119937],{"class":35,"line":201},[33,119923,9228],{"class":163},[33,119925,599],{"class":50},[33,119927,119928],{"class":167},"(restored.columns) ",[33,119930,1865],{"class":163},[33,119932,599],{"class":50},[33,119934,602],{"class":167},[33,119936,119827],{"class":50},[33,119938,119939],{"class":167},".columns), \\\n",[33,119941,119942,119944,119947,119949,119952,119954,119956,119959,119962,119964],{"class":35,"line":206},[33,119943,9533],{"class":163},[33,119945,119946],{"class":54},"\"Column 
mismatch: ",[33,119948,1115],{"class":50},[33,119950,119951],{"class":167},"restored.columns.tolist()",[33,119953,1121],{"class":50},[33,119955,71066],{"class":54},[33,119957,119958],{"class":50},"{ORIGINAL",[33,119960,119961],{"class":167},".columns.tolist()",[33,119963,1121],{"class":50},[33,119965,7504],{"class":54},[33,119967,119968,119970,119972,119975,119977,119979,119981,119983],{"class":35,"line":224},[33,119969,9228],{"class":163},[33,119971,4037],{"class":50},[33,119973,119974],{"class":167},"(restored) ",[33,119976,1865],{"class":163},[33,119978,4037],{"class":50},[33,119980,602],{"class":167},[33,119982,119827],{"class":50},[33,119984,119985],{"class":167},"), \\\n",[33,119987,119988,119990,119993,119995,119998,120000,120002,120004,120006,120008,120010,120012],{"class":35,"line":229},[33,119989,9533],{"class":163},[33,119991,119992],{"class":54},"\"Row count mismatch: ",[33,119994,4065],{"class":50},[33,119996,119997],{"class":167},"(restored)",[33,119999,1121],{"class":50},[33,120001,71066],{"class":54},[33,120003,4065],{"class":50},[33,120005,602],{"class":167},[33,120007,119827],{"class":50},[33,120009,12027],{"class":167},[33,120011,1121],{"class":50},[33,120013,7504],{"class":54},[33,120015,120016,120018,120020,120022,120024,120027],{"class":35,"line":235},[33,120017,9228],{"class":163},[33,120019,118774],{"class":54},[33,120021,620],{"class":163},[33,120023,8002],{"class":163},[33,120025,120026],{"class":167}," restored.columns, ",[33,120028,120029],{"class":54},"\"Spurious column still present\"\n",[33,120031,120032,120034,120036,120039],{"class":35,"line":250},[33,120033,7268],{"class":50},[33,120035,602],{"class":167},[33,120037,120038],{"class":54},"\"Verification passed:\"",[33,120040,120041],{"class":167},", 
restored.columns.tolist())\n",[33,120043,120044,120046,120048,120050],{"class":35,"line":266},[33,120045,35726],{"class":163},[33,120047,107953],{"class":50},[33,120049,1852],{"class":163},[33,120051,7583],{"class":167},[33,120053,120054,120056,120058,120060,120062,120065,120067,120069,120071,120073],{"class":35,"line":290},[33,120055,35742],{"class":163},[33,120057,16617],{"class":50},[33,120059,602],{"class":167},[33,120061,4059],{"class":163},[33,120063,120064],{"class":54},"\"I\u002FO error: ",[33,120066,1115],{"class":50},[33,120068,7602],{"class":167},[33,120070,1121],{"class":50},[33,120072,274],{"class":54},[33,120074,221],{"class":167},[33,120076,120077,120079,120081,120083],{"class":35,"line":295},[33,120078,35726],{"class":163},[33,120080,9445],{"class":50},[33,120082,1852],{"class":163},[33,120084,7583],{"class":167},[33,120086,120087,120089,120091,120093,120095,120098,120100,120102,120104,120106],{"class":35,"line":300},[33,120088,35742],{"class":163},[33,120090,16617],{"class":50},[33,120092,602],{"class":167},[33,120094,4059],{"class":163},[33,120096,120097],{"class":54},"\"Assertion failed: ",[33,120099,1115],{"class":50},[33,120101,7602],{"class":167},[33,120103,1121],{"class":50},[33,120105,274],{"class":54},[33,120107,221],{"class":167},[2537,120109],{},[18,120111,6918],{"id":6917},[4211,120113,120114,120119,120124],{},[4214,120115,120116,120118],{},[940,120117,108865],{"href":108864}," — full guide covering encoding, delimiters, compression, and BI conventions",[4214,120120,120121,120123],{},[940,120122,9599],{"href":9598}," — fixing structural problems in CSVs you ingest",[4214,120125,120126,120128],{},[940,120127,99577],{"href":99576}," — reading the source before you export it",[14,120130,6947,120131,3035],{},[940,120132,108865],{"href":108864},[6953,120134,120135],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code 
.sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":120137},[120138,120139,120140,120142,120147,120148,120149,120150],{"id":7020,"depth":43,"text":7021},{"id":54445,"depth":43,"text":99786},{"id":118255,"depth":43,"text":120141},"Fix: index=False",{"id":118506,"depth":43,"text":118507,"children":120143},[120144,120146],{"id":118513,"depth":61,"text":120145},"Option A — Re-read with index_col=0, then re-export",{"id":118799,"depth":61,"text":118800},{"id":119012,"depth":43,"text":119013},{"id":119499,"depth":43,"text":119500},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"df.to_csv(\"out.csv\") writes an extra unnamed leading column. When you re-read that file with pd.read_csv(\"out.csv\"), that column appears as Unnamed: 0. 
It breaks column-count assertions, corrupts SQL COPY loads, and confuses every downstream tool that expected only your data columns.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fexporting-data-to-csv-formats\u002Ffix-pandas-to-csv-extra-index-column",{"title":28147,"description":120151},"python-for-excel-csv-data-processing\u002Fexporting-data-to-csv-formats\u002Ffix-pandas-to-csv-extra-index-column\u002Findex","fZsjlfiY6ZYwxQ2SuDMcfHGDLVsXmQIWAssucB3xuQ4",{"id":120158,"title":108865,"body":120159,"breadcrumbTitle":125343,"canonical":6977,"date":6977,"description":107412,"draft":6980,"extension":6981,"image":6977,"meta":125344,"navigation":91,"path":125345,"robots":6977,"seo":125346,"seoTitle":125351,"stem":125352,"tags":6977,"updatedAt":6977,"__hash__":125353},"content\u002Fpython-for-excel-csv-data-processing\u002Fexporting-data-to-csv-formats\u002Findex.md",{"type":7,"value":120160,"toc":125314},[120161,120164,120170,120185,120187,120249,120252,120485,120487,120491,120494,120682,120685,120724,120726,120730,120827,120830,120901,120903,120910,120916,121260,121265,121321,121323,121327,121331,121553,121557,121821,121830,121832,121836,121840,121849,122097,122101,122109,122319,122321,122325,122329,122341,122543,122547,122564,122761,122765,122768,122954,122956,122960,122978,123293,123299,123316,123318,123322,123331,123680,123698,123700,123704,123707,123841,123854,123856,123858,123861,124111,124113,124115,124177,124179,124181,124314,124316,124318,125199,125201,125203,125211,125224,125236,125260,125276,125278,125280,125307,125311],[10,120162,108865],{"id":120163},"exporting-data-to-csv-formats",[14,120165,120166,120167,120169],{},"Every automated pipeline eventually emits a CSV — a report handed to an analyst, a feed uploaded to a CRM, a nightly extract dropped into a BI tool. 
Getting that file wrong is invisible until someone opens it in Excel and sees a column of ",[30,120168,117994],{}," garbage, garbled accents, or a float that rounds itself to four unexpected decimal places. Generic tutorials skip the edge cases. This guide does not.",[14,120171,120172,120173,120175,120176,120178,120179,120181,120182,120184],{},"Prerequisites are light: Python 3.9+, ",[30,120174,9630],{},", and optionally ",[30,120177,14295],{}," for large-file work. If your data is still dirty, run it through ",[940,120180,9599],{"href":9598}," before exporting; garbage-in means garbage-out regardless of serialization parameters. If your source is an Excel file, the ",[940,120183,99577],{"href":99576}," guide covers ingestion so you arrive at a clean DataFrame ready to export.",[18,120186,21],{"id":20},[23,120188,120190],{"className":25,"code":120189,"language":27,"meta":28,"style":28},"# pip install pandas pyarrow\npython -m venv .venv && source .venv\u002Fbin\u002Factivate\npip install pandas pyarrow\npython - \u003C\u003C'EOF'\nimport pandas as pd, pathlib\nprint(pd.__version__)\nEOF\n",[30,120191,120192,120196,120214,120225,120235,120240,120245],{"__ignoreMap":28},[33,120193,120194],{"class":35,"line":36},[33,120195,66726],{"class":39},[33,120197,120198,120200,120202,120204,120207,120209,120211],{"class":35,"line":43},[33,120199,47],{"class":46},[33,120201,51],{"class":50},[33,120203,55],{"class":54},[33,120205,120206],{"class":54}," .venv",[33,120208,35214],{"class":167},[33,120210,64],{"class":50},[33,120212,120213],{"class":54}," .venv\u002Fbin\u002Factivate\n",[33,120215,120216,120218,120220,120222],{"class":35,"line":61},[33,120217,76],{"class":46},[33,120219,79],{"class":54},[33,120221,16183],{"class":54},[33,120223,120224],{"class":54}," 
pyarrow\n",[33,120226,120227,120229,120231,120233],{"class":35,"line":73},[33,120228,47],{"class":46},[33,120230,39025],{"class":54},[33,120232,53957],{"class":163},[33,120234,53960],{"class":54},[33,120236,120237],{"class":35,"line":88},[33,120238,120239],{"class":54},"import pandas as pd, pathlib\n",[33,120241,120242],{"class":35,"line":95},[33,120243,120244],{"class":54},"print(pd.__version__)\n",[33,120246,120247],{"class":35,"line":101},[33,120248,54019],{"class":54},[14,120250,120251],{},"You also need a sample file for the diagnostic step:",[23,120253,120255],{"className":126,"code":120254,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nSAMPLE = Path(\"data\u002Fsample_export.csv\")\nSAMPLE.parent.mkdir(parents=True, exist_ok=True)\n\ndf = pd.DataFrame({\n    \"order_id\": [1001, 1002, 1003],\n    \"customer\": [\"Café Müller\", \"O'Brien & Sons\", 'Zhao, \"Alex\"'],\n    \"amount\": [1234.5678, 99.0, 0.1 + 0.2],   # float precision bait\n    \"shipped\": pd.to_datetime([\"2024-03-01\", \"2024-03-15\", \"2024-03-31\"]),\n    \"region\": [\"EU\", \"US\", \"APAC\"],\n})\ndf.to_csv(SAMPLE, index=False)\nprint(\"Sample written:\", 
SAMPLE)\n",[30,120256,120257,120261,120271,120281,120285,120299,120321,120325,120333,120355,120377,120406,120429,120450,120454,120470],{"__ignoreMap":28},[33,120258,120259],{"class":35,"line":36},[33,120260,8895],{"class":39},[33,120262,120263,120265,120267,120269],{"class":35,"line":43},[33,120264,164],{"class":163},[33,120266,492],{"class":167},[33,120268,495],{"class":163},[33,120270,498],{"class":167},[33,120272,120273,120275,120277,120279],{"class":35,"line":61},[33,120274,190],{"class":163},[33,120276,193],{"class":167},[33,120278,164],{"class":163},[33,120280,198],{"class":167},[33,120282,120283],{"class":35,"line":73},[33,120284,92],{"emptyLinePlaceholder":91},[33,120286,120287,120290,120292,120294,120297],{"class":35,"line":88},[33,120288,120289],{"class":50},"SAMPLE",[33,120291,212],{"class":163},[33,120293,215],{"class":167},[33,120295,120296],{"class":54},"\"data\u002Fsample_export.csv\"",[33,120298,221],{"class":167},[33,120300,120301,120303,120305,120307,120309,120311,120313,120315,120317,120319],{"class":35,"line":95},[33,120302,120289],{"class":50},[33,120304,866],{"class":167},[33,120306,869],{"class":238},[33,120308,242],{"class":163},[33,120310,855],{"class":50},[33,120312,365],{"class":167},[33,120314,878],{"class":238},[33,120316,242],{"class":163},[33,120318,855],{"class":50},[33,120320,221],{"class":167},[33,120322,120323],{"class":35,"line":101},[33,120324,92],{"emptyLinePlaceholder":91},[33,120326,120327,120329,120331],{"class":35,"line":171},[33,120328,13459],{"class":167},[33,120330,242],{"class":163},[33,120332,11749],{"class":167},[33,120334,120335,120338,120340,120343,120345,120348,120350,120353],{"class":35,"line":179},[33,120336,120337],{"class":54},"    
\"order_id\"",[33,120339,12426],{"class":167},[33,120341,120342],{"class":50},"1001",[33,120344,365],{"class":167},[33,120346,120347],{"class":50},"1002",[33,120349,365],{"class":167},[33,120351,120352],{"class":50},"1003",[33,120354,8935],{"class":167},[33,120356,120357,120360,120362,120365,120367,120370,120372,120375],{"class":35,"line":187},[33,120358,120359],{"class":54},"    \"customer\"",[33,120361,12426],{"class":167},[33,120363,120364],{"class":54},"\"Café Müller\"",[33,120366,365],{"class":167},[33,120368,120369],{"class":54},"\"O'Brien & Sons\"",[33,120371,365],{"class":167},[33,120373,120374],{"class":54},"'Zhao, \"Alex\"'",[33,120376,8935],{"class":167},[33,120378,120379,120382,120384,120387,120389,120392,120394,120397,120399,120401,120403],{"class":35,"line":201},[33,120380,120381],{"class":54},"    \"amount\"",[33,120383,12426],{"class":167},[33,120385,120386],{"class":50},"1234.5678",[33,120388,365],{"class":167},[33,120390,120391],{"class":50},"99.0",[33,120393,365],{"class":167},[33,120395,120396],{"class":50},"0.1",[33,120398,82634],{"class":163},[33,120400,46243],{"class":50},[33,120402,13424],{"class":167},[33,120404,120405],{"class":39},"# float precision bait\n",[33,120407,120408,120411,120414,120417,120419,120422,120424,120427],{"class":35,"line":206},[33,120409,120410],{"class":54},"    \"shipped\"",[33,120412,120413],{"class":167},": 
pd.to_datetime([",[33,120415,120416],{"class":54},"\"2024-03-01\"",[33,120418,365],{"class":167},[33,120420,120421],{"class":54},"\"2024-03-15\"",[33,120423,365],{"class":167},[33,120425,120426],{"class":54},"\"2024-03-31\"",[33,120428,12871],{"class":167},[33,120430,120431,120433,120435,120438,120440,120443,120445,120448],{"class":35,"line":224},[33,120432,16270],{"class":54},[33,120434,12426],{"class":167},[33,120436,120437],{"class":54},"\"EU\"",[33,120439,365],{"class":167},[33,120441,120442],{"class":54},"\"US\"",[33,120444,365],{"class":167},[33,120446,120447],{"class":54},"\"APAC\"",[33,120449,8935],{"class":167},[33,120451,120452],{"class":35,"line":229},[33,120453,103249],{"class":167},[33,120455,120456,120458,120460,120462,120464,120466,120468],{"class":35,"line":235},[33,120457,16503],{"class":167},[33,120459,120289],{"class":50},[33,120461,365],{"class":167},[33,120463,897],{"class":238},[33,120465,242],{"class":163},[33,120467,902],{"class":50},[33,120469,221],{"class":167},[33,120471,120472,120474,120476,120479,120481,120483],{"class":35,"line":250},[33,120473,13474],{"class":50},[33,120475,602],{"class":167},[33,120477,120478],{"class":54},"\"Sample written:\"",[33,120480,365],{"class":167},[33,120482,120289],{"class":50},[33,120484,221],{"class":167},[2537,120486],{},[18,120488,120490],{"id":120489},"step-1-inspect-before-you-export","Step 1 — Inspect Before You Export",[14,120492,120493],{},"Before choosing parameters, inspect the DataFrame's dtypes and look for the three things that silently break CSV exports: mixed-type columns, timezone-aware datetimes, and floating-point noise.",[23,120495,120497],{"className":126,"code":120496,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nSOURCE = Path(\"data\u002Fsample_export.csv\")\n\ntry:\n    df = pd.read_csv(SOURCE)\nexcept FileNotFoundError as e:\n    raise SystemExit(f\"Source not found: {e}\")\n\nprint(df.dtypes)\nprint(\"\\nNull 
counts:\\n\", df.isnull().sum())\nprint(\"\\nFloat sample (raw repr):\")\nfor col in df.select_dtypes(\"float\"):\n    print(f\"  {col}: {df[col].tolist()}\")\n",[30,120498,120499,120503,120513,120523,120527,120539,120543,120549,120561,120571,120593,120597,120603,120623,120638,120653],{"__ignoreMap":28},[33,120500,120501],{"class":35,"line":36},[33,120502,8895],{"class":39},[33,120504,120505,120507,120509,120511],{"class":35,"line":43},[33,120506,164],{"class":163},[33,120508,492],{"class":167},[33,120510,495],{"class":163},[33,120512,498],{"class":167},[33,120514,120515,120517,120519,120521],{"class":35,"line":61},[33,120516,190],{"class":163},[33,120518,193],{"class":167},[33,120520,164],{"class":163},[33,120522,198],{"class":167},[33,120524,120525],{"class":35,"line":73},[33,120526,92],{"emptyLinePlaceholder":91},[33,120528,120529,120531,120533,120535,120537],{"class":35,"line":88},[33,120530,86272],{"class":50},[33,120532,212],{"class":163},[33,120534,215],{"class":167},[33,120536,120296],{"class":54},[33,120538,221],{"class":167},[33,120540,120541],{"class":35,"line":95},[33,120542,92],{"emptyLinePlaceholder":91},[33,120544,120545,120547],{"class":35,"line":101},[33,120546,35574],{"class":163},[33,120548,574],{"class":167},[33,120550,120551,120553,120555,120557,120559],{"class":35,"line":171},[33,120552,4025],{"class":167},[33,120554,242],{"class":163},[33,120556,9481],{"class":167},[33,120558,86272],{"class":50},[33,120560,221],{"class":167},[33,120562,120563,120565,120567,120569],{"class":35,"line":179},[33,120564,35726],{"class":163},[33,120566,2945],{"class":50},[33,120568,1852],{"class":163},[33,120570,7583],{"class":167},[33,120572,120573,120575,120577,120579,120581,120583,120585,120587,120589,120591],{"class":35,"line":187},[33,120574,35742],{"class":163},[33,120576,16617],{"class":50},[33,120578,602],{"class":167},[33,120580,4059],{"class":163},[33,120582,90279],{"class":54},[33,120584,1115],{"class":50},[33,120586,7602],{"class":167},[33,120588,1121],{
"class":50},[33,120590,274],{"class":54},[33,120592,221],{"class":167},[33,120594,120595],{"class":35,"line":201},[33,120596,92],{"emptyLinePlaceholder":91},[33,120598,120599,120601],{"class":35,"line":206},[33,120600,13474],{"class":50},[33,120602,108834],{"class":167},[33,120604,120605,120607,120609,120611,120613,120616,120618,120620],{"class":35,"line":224},[33,120606,13474],{"class":50},[33,120608,602],{"class":167},[33,120610,274],{"class":54},[33,120612,25830],{"class":50},[33,120614,120615],{"class":54},"Null counts:",[33,120617,25830],{"class":50},[33,120619,274],{"class":54},[33,120621,120622],{"class":167},", df.isnull().sum())\n",[33,120624,120625,120627,120629,120631,120633,120636],{"class":35,"line":229},[33,120626,13474],{"class":50},[33,120628,602],{"class":167},[33,120630,274],{"class":54},[33,120632,25830],{"class":50},[33,120634,120635],{"class":54},"Float sample (raw repr):\"",[33,120637,221],{"class":167},[33,120639,120640,120642,120644,120646,120648,120651],{"class":35,"line":235},[33,120641,6124],{"class":163},[33,120643,7985],{"class":167},[33,120645,662],{"class":163},[33,120647,23604],{"class":167},[33,120649,120650],{"class":54},"\"float\"",[33,120652,1737],{"class":167},[33,120654,120655,120657,120659,120661,120663,120665,120667,120669,120671,120673,120676,120678,120680],{"class":35,"line":250},[33,120656,7268],{"class":50},[33,120658,602],{"class":167},[33,120660,4059],{"class":163},[33,120662,48959],{"class":54},[33,120664,1115],{"class":50},[33,120666,8276],{"class":167},[33,120668,1121],{"class":50},[33,120670,2079],{"class":54},[33,120672,1115],{"class":50},[33,120674,120675],{"class":167},"df[col].tolist()",[33,120677,1121],{"class":50},[33,120679,274],{"class":54},[33,120681,221],{"class":167},[14,120683,120684],{},"Key things to look for:",[4211,120686,120687,120700,120713],{},[4214,120688,120689,120691,120692,120695,120696,120699],{},[30,120690,11888],{}," columns that ",[26245,120693,120694],{},"look"," numeric — they will 
export as strings, then surprise downstream ",[30,120697,120698],{},"SUM()"," formulas.",[4214,120701,120702,120705,120706,120708,120709,120712],{},[30,120703,120704],{},"datetime64[ns, UTC]"," columns — ",[30,120707,118265],{}," serialises them with timezone offset by default; most BI tools choke on ",[30,120710,120711],{},"+00:00"," suffixes.",[4214,120714,120715,120716,120719,120720,120723],{},"Floats like ",[30,120717,120718],{},"0.30000000000000004"," — set ",[30,120721,120722],{},"float_format"," to tame them.",[2537,120725],{},[18,120727,120729],{"id":120728},"step-2-choose-your-serialization-path","Step 2 — Choose Your Serialization Path",[2540,120731,2547,120733,2547,120736,2547,120739,2547,2547,120753,2547,120756,2547,120759,2547,2547,120761,2547,2547,120764,2547,120767,2547,120770,2547,2547,2547,120773,2547,120775,2547,120777,2547,120781,2547,2547,120784,2547,120786,2547,120788,2547,120791,2547,2547,120794,2547,120796,2547,120799,2547,120803,2547,2547,120806,2547,120808,2547,120810,2547,120813,2547,2547,120817,2547,120819,2547,120821,2547,120824],{"viewBox":2542,"role":2543,"ariaLabel":120732,"xmlns":2545,"style":2546},"Decision diagram: DataFrame to serialization options to CSV variants",[2549,120734,120735],{},"Serialization path decision diagram",[2553,120737,120738],{},"Shows how a DataFrame flows through serialization option choices to produce different CSV variants: standard UTF-8, UTF-8-sig for Excel, gzip-compressed, chunked, or 
pipe-delimited.",[2557,120740,2559,120741,2559,120748,2547],{},[2561,120742,2564,120744,2564,120746,2559],{"id":120743,"x1":748,"y1":748,"x2":734,"y2":748},"export-csv-grad",[2566,120745],{"offset":748,"style":2568},[2566,120747],{"offset":734,"style":2571},[2573,120749,2564,120751,2559],{"id":120750,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"export-csv-arrow",[2580,120752],{"d":2582,"fill":2583},[2585,120754],{"x":2587,"y":2588,"width":2609,"height":2680,"rx":2591,"fill":120755,"stroke":2593,"style":2594},"url(#export-csv-grad)",[2000,120757,11219],{"x":2630,"y":120758,"fill":2599,"style":38718},"151",[2000,120760,64],{"x":2630,"y":2639,"fill":2599,"style":2685},[35,120762],{"x1":2610,"y1":2598,"x2":2701,"y2":2598,"stroke":2583,"markerEnd":120763,"style":2594},"url(#export-csv-arrow)",[49826,120765],{"points":120766,"fill":2592,"stroke":11166,"style":2594},"310,120 390,155 310,190 230,155",[2000,120768,120769],{"x":2698,"y":120758,"fill":2599,"style":2685},"Serialization",[2000,120771,120772],{"x":2698,"y":11198,"fill":2599,"style":2685},"Options",[35,120774],{"x1":2698,"y1":2589,"x2":2701,"y2":58333,"stroke":2583,"markerEnd":120763,"style":2594},[2585,120776],{"x":2590,"y":2587,"width":2598,"height":26341,"rx":1179,"fill":2615,"stroke":2593,"style":2594},[2000,120778,120780],{"x":49842,"y":120779,"fill":2599,"style":2685},"39","Standard UTF-8",[2000,120782,120783],{"x":49842,"y":100322,"fill":2583,"style":2605},"Linux\u002FAPI pipelines",[35,120785],{"x1":2698,"y1":2589,"x2":2698,"y2":2653,"stroke":2583,"markerEnd":120763,"style":2594},[2585,120787],{"x":26354,"y":2587,"width":2598,"height":26341,"rx":1179,"fill":2615,"stroke":2593,"style":2594},[2000,120789,120790],{"x":11149,"y":120779,"fill":2599,"style":2685},"UTF-8-sig",[2000,120792,120793],{"x":11149,"y":100322,"fill":2583,"style":2605},"Excel \u002F 
Windows",[35,120795],{"x1":2677,"y1":2589,"x2":16990,"y2":58333,"stroke":2583,"markerEnd":120763,"style":2594},[2585,120797],{"x":120798,"y":2587,"width":2598,"height":26341,"rx":1179,"fill":2615,"stroke":2593,"style":2594},"415",[2000,120800,120802],{"x":120801,"y":120779,"fill":2599,"style":2685},"493","Gzip-compressed",[2000,120804,120805],{"x":120801,"y":100322,"fill":2583,"style":2605},"Large files \u002F S3",[35,120807],{"x1":2698,"y1":2697,"x2":2701,"y2":2678,"stroke":2583,"markerEnd":120763,"style":2594},[2585,120809],{"x":2590,"y":2678,"width":2598,"height":26341,"rx":1179,"fill":2615,"stroke":2593,"style":2594},[2000,120811,120812],{"x":49842,"y":17048,"fill":2599,"style":2685},"Chunked write",[2000,120814,120816],{"x":49842,"y":120815,"fill":2583,"style":2605},"289","Out-of-core \u002F 1 GB+",[35,120818],{"x1":2677,"y1":2697,"x2":16990,"y2":2678,"stroke":2583,"markerEnd":120763,"style":2594},[2585,120820],{"x":120798,"y":2678,"width":2598,"height":26341,"rx":1179,"fill":2615,"stroke":2593,"style":2594},[2000,120822,120823],{"x":120801,"y":17048,"fill":2599,"style":2685},"Pipe \u002F TSV",[2000,120825,120826],{"x":120801,"y":120815,"fill":2583,"style":2605},"BI tools \u002F SQL COPY",[14,120828,120829],{},"Use the table below to pick the right branch:",[4273,120831,120832,120842],{},[4276,120833,120834],{},[4279,120835,120836,120839],{},[4282,120837,120838],{},"Scenario",[4282,120840,120841],{},"Recommended approach",[4292,120843,120844,120854,120864,120874,120886],{},[4279,120845,120846,120849],{},[4297,120847,120848],{},"\u003C 500 MB, going to Excel",[4297,120850,120851],{},[30,120852,120853],{},"to_csv(path, index=False, encoding=\"utf-8-sig\")",[4279,120855,120856,120859],{},[4297,120857,120858],{},"\u003C 500 MB, going to Linux pipeline \u002F API",[4297,120860,120861],{},[30,120862,120863],{},"to_csv(path, index=False, encoding=\"utf-8\")",[4279,120865,120866,120869],{},[4297,120867,120868],{},"Large file, S3 or GCS 
upload",[4297,120870,120871],{},[30,120872,120873],{},"to_csv(path, compression=\"gzip\")",[4279,120875,120876,120879],{},[4297,120877,120878],{},"> 1 GB, can't load into RAM",[4297,120880,120881,120882,120885],{},"Generator + ",[30,120883,120884],{},"csv.DictWriter",", chunked flush",[4279,120887,120888,120894],{},[4297,120889,120890,120891,120893],{},"SQL ",[30,120892,117998],{},", Postgres, Redshift",[4297,120895,120896,120897,120900],{},"TSV (",[30,120898,120899],{},"sep=\"\\t\"",") or pipe (sep=\"|\")",[2537,120902],{},[18,120904,120906,120907,120909],{"id":120905},"step-3-core-to_csv-configuration","Step 3 — Core ",[30,120908,118265],{}," Configuration",[14,120911,120912,120913,120915],{},"Every ",[30,120914,118265],{}," call in production should spell out these five parameters. Relying on defaults causes the bugs catalogued in the troubleshooting section.",[23,120917,120919],{"className":126,"code":120918,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nSOURCE = Path(\"data\u002Fsample_export.csv\")\nOUT = Path(\"exports\u002Freport.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df = pd.read_csv(SOURCE, parse_dates=[\"shipped\"])\nexcept FileNotFoundError as e:\n    raise SystemExit(f\"Source not found: {e}\")\n\ntry:\n    df.to_csv(\n        OUT,\n        index=False,              # never write the RangeIndex as a column\n        encoding=\"utf-8-sig\",     # BOM so Excel opens without garbling accents\n        sep=\",\",                  # explicit; change to \";\" for EU locales\n        float_format=\"%.2f\",      # tame 0.30000000000000004 noise\n        date_format=\"%Y-%m-%d\",   # ISO 8601; avoids locale-dependent M\u002FD\u002FY\n        lineterminator=\"\\r\\n\",    # CRLF = RFC 4180 compliant\n        na_rep=\"\",                # empty string for nulls (Excel-safe)\n    )\n    print(f\"Exported {len(df):,} rows → {OUT}\")\nexcept OSError as e:\n    raise 
SystemExit(f\"Write failed: {e}\")\n",[30,120920,120921,120925,120935,120945,120949,120961,120974,120996,121000,121006,121029,121039,121061,121065,121071,121076,121083,121098,121111,121126,121145,121163,121182,121197,121201,121228,121238],{"__ignoreMap":28},[33,120922,120923],{"class":35,"line":36},[33,120924,8895],{"class":39},[33,120926,120927,120929,120931,120933],{"class":35,"line":43},[33,120928,164],{"class":163},[33,120930,492],{"class":167},[33,120932,495],{"class":163},[33,120934,498],{"class":167},[33,120936,120937,120939,120941,120943],{"class":35,"line":61},[33,120938,190],{"class":163},[33,120940,193],{"class":167},[33,120942,164],{"class":163},[33,120944,198],{"class":167},[33,120946,120947],{"class":35,"line":73},[33,120948,92],{"emptyLinePlaceholder":91},[33,120950,120951,120953,120955,120957,120959],{"class":35,"line":88},[33,120952,86272],{"class":50},[33,120954,212],{"class":163},[33,120956,215],{"class":167},[33,120958,120296],{"class":54},[33,120960,221],{"class":167},[33,120962,120963,120965,120967,120969,120972],{"class":35,"line":95},[33,120964,57716],{"class":50},[33,120966,212],{"class":163},[33,120968,215],{"class":167},[33,120970,120971],{"class":54},"\"exports\u002Freport.csv\"",[33,120973,221],{"class":167},[33,120975,120976,120978,120980,120982,120984,120986,120988,120990,120992,120994],{"class":35,"line":101},[33,120977,57716],{"class":50},[33,120979,866],{"class":167},[33,120981,869],{"class":238},[33,120983,242],{"class":163},[33,120985,855],{"class":50},[33,120987,365],{"class":167},[33,120989,878],{"class":238},[33,120991,242],{"class":163},[33,120993,855],{"class":50},[33,120995,221],{"class":167},[33,120997,120998],{"class":35,"line":171},[33,120999,92],{"emptyLinePlaceholder":91},[33,121001,121002,121004],{"class":35,"line":179},[33,121003,35574],{"class":163},[33,121005,574],{"class":167},[33,121007,121008,121010,121012,121014,121016,121018,121020,121022,121024,121027],{"class":35,"line":187},[33,121009,4025],{"class":167},[33
,121011,242],{"class":163},[33,121013,9481],{"class":167},[33,121015,86272],{"class":50},[33,121017,365],{"class":167},[33,121019,102641],{"class":238},[33,121021,242],{"class":163},[33,121023,8309],{"class":167},[33,121025,121026],{"class":54},"\"shipped\"",[33,121028,751],{"class":167},[33,121030,121031,121033,121035,121037],{"class":35,"line":201},[33,121032,35726],{"class":163},[33,121034,2945],{"class":50},[33,121036,1852],{"class":163},[33,121038,7583],{"class":167},[33,121040,121041,121043,121045,121047,121049,121051,121053,121055,121057,121059],{"class":35,"line":206},[33,121042,35742],{"class":163},[33,121044,16617],{"class":50},[33,121046,602],{"class":167},[33,121048,4059],{"class":163},[33,121050,90279],{"class":54},[33,121052,1115],{"class":50},[33,121054,7602],{"class":167},[33,121056,1121],{"class":50},[33,121058,274],{"class":54},[33,121060,221],{"class":167},[33,121062,121063],{"class":35,"line":224},[33,121064,92],{"emptyLinePlaceholder":91},[33,121066,121067,121069],{"class":35,"line":229},[33,121068,35574],{"class":163},[33,121070,574],{"class":167},[33,121072,121073],{"class":35,"line":235},[33,121074,121075],{"class":167},"    df.to_csv(\n",[33,121077,121078,121081],{"class":35,"line":250},[33,121079,121080],{"class":50},"        OUT",[33,121082,247],{"class":167},[33,121084,121085,121088,121090,121092,121095],{"class":35,"line":266},[33,121086,121087],{"class":238},"        index",[33,121089,242],{"class":163},[33,121091,902],{"class":50},[33,121093,121094],{"class":167},",              ",[33,121096,121097],{"class":39},"# never write the RangeIndex as a column\n",[33,121099,121100,121102,121104,121106,121108],{"class":35,"line":290},[33,121101,1190],{"class":238},[33,121103,242],{"class":163},[33,121105,108390],{"class":54},[33,121107,25539],{"class":167},[33,121109,121110],{"class":39},"# BOM so Excel opens without garbling accents\n",[33,121112,121113,121116,121118,121120,121123],{"class":35,"line":295},[33,121114,121115],{"class":238},"   
     sep",[33,121117,242],{"class":163},[33,121119,15900],{"class":54},[33,121121,121122],{"class":167},",                  ",[33,121124,121125],{"class":39},"# explicit; change to \";\" for EU locales\n",[33,121127,121128,121131,121133,121135,121137,121139,121142],{"class":35,"line":300},[33,121129,121130],{"class":238},"        float_format",[33,121132,242],{"class":163},[33,121134,274],{"class":54},[33,121136,54896],{"class":50},[33,121138,274],{"class":54},[33,121140,121141],{"class":167},",      ",[33,121143,121144],{"class":39},"# tame 0.30000000000000004 noise\n",[33,121146,121147,121150,121152,121154,121156,121158,121160],{"class":35,"line":317},[33,121148,121149],{"class":238},"        date_format",[33,121151,242],{"class":163},[33,121153,1244],{"class":54},[33,121155,916],{"class":50},[33,121157,274],{"class":54},[33,121159,1166],{"class":167},[33,121161,121162],{"class":39},"# ISO 8601; avoids locale-dependent M\u002FD\u002FY\n",[33,121164,121165,121168,121170,121172,121175,121177,121179],{"class":35,"line":332},[33,121166,121167],{"class":238},"        lineterminator",[33,121169,242],{"class":163},[33,121171,274],{"class":54},[33,121173,121174],{"class":50},"\\r\\n",[33,121176,274],{"class":54},[33,121178,38342],{"class":167},[33,121180,121181],{"class":39},"# CRLF = RFC 4180 compliant\n",[33,121183,121184,121187,121189,121191,121194],{"class":35,"line":347},[33,121185,121186],{"class":238},"        na_rep",[33,121188,242],{"class":163},[33,121190,3198],{"class":54},[33,121192,121193],{"class":167},",                ",[33,121195,121196],{"class":39},"# empty string for nulls 
(Excel-safe)\n",[33,121198,121199],{"class":35,"line":374},[33,121200,1202],{"class":167},[33,121202,121203,121205,121207,121209,121211,121213,121215,121217,121219,121222,121224,121226],{"class":35,"line":397},[33,121204,7268],{"class":50},[33,121206,602],{"class":167},[33,121208,4059],{"class":163},[33,121210,44444],{"class":54},[33,121212,4065],{"class":50},[33,121214,4068],{"class":167},[33,121216,18801],{"class":163},[33,121218,1121],{"class":50},[33,121220,121221],{"class":54}," rows → ",[33,121223,58217],{"class":50},[33,121225,274],{"class":54},[33,121227,221],{"class":167},[33,121229,121230,121232,121234,121236],{"class":35,"line":653},[33,121231,35726],{"class":163},[33,121233,107953],{"class":50},[33,121235,1852],{"class":163},[33,121237,7583],{"class":167},[33,121239,121240,121242,121244,121246,121248,121250,121252,121254,121256,121258],{"class":35,"line":667},[33,121241,35742],{"class":163},[33,121243,16617],{"class":50},[33,121245,602],{"class":167},[33,121247,4059],{"class":163},[33,121249,118410],{"class":54},[33,121251,1115],{"class":50},[33,121253,7602],{"class":167},[33,121255,1121],{"class":50},[33,121257,274],{"class":54},[33,121259,221],{"class":167},[14,121261,121262],{},[1974,121263,121264],{},"Parameter-by-parameter rationale:",[4211,121266,121267,121275,121284,121301,121311],{},[4214,121268,121269,121271,121272,121274],{},[30,121270,28142],{}," — by far the most common export mistake; see ",[940,121273,28147],{"href":28146}," for a full diagnosis.",[4214,121276,121277,121279,121280,121283],{},[30,121278,59124],{}," — the BOM (",[30,121281,121282],{},"\\xef\\xbb\\xbf",") costs three bytes and saves hours of \"why are my accents broken in Excel?\" tickets.",[4214,121285,121286,121289,121290,121293,121294,2012,121297,121300],{},[30,121287,121288],{},"float_format=\"%.2f\""," — format string passed to Python's ",[30,121291,121292],{},"%"," operator per cell; ",[30,121295,121296],{},"\"%.4f\"",[30,121298,121299],{},"\"{:.4f}\".format"," also 
work.",[4214,121302,121303,121306,121307,121310],{},[30,121304,121305],{},"date_format=\"%Y-%m-%d\""," — without this, pandas emits ",[30,121308,121309],{},"2024-03-01 00:00:00"," and time-stripping logic lands in downstream pipelines.",[4214,121312,121313,121316,121317,121320],{},[30,121314,121315],{},"lineterminator=\"\\r\\n\""," — RFC 4180 mandates CRLF; some strict parsers (SQL Server ",[30,121318,121319],{},"BULK INSERT",") reject LF-only files.",[2537,121322],{},[18,121324,121326],{"id":121325},"step-4-index-handling-in-depth","Step 4 — Index Handling in Depth",[424,121328,121330],{"id":121329},"when-you-want-no-index-most-reports","When you want no index (most reports)",[23,121332,121334],{"className":126,"code":121333,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.DataFrame({\"sku\": [\"A1\", \"B2\"], \"qty\": [10, 20]})\nOUT = Path(\"exports\u002Fno_index.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df.to_csv(OUT, index=False)\n    # Verify: re-read and check columns\n    check = pd.read_csv(OUT)\n    assert list(check.columns) == [\"sku\", \"qty\"], f\"Unexpected columns: {check.columns.tolist()}\"\n    print(\"OK — columns:\", check.columns.tolist())\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: 
{e}\")\n",[30,121335,121336,121340,121350,121360,121364,121396,121409,121431,121435,121441,121457,121462,121475,121510,121521,121531],{"__ignoreMap":28},[33,121337,121338],{"class":35,"line":36},[33,121339,8895],{"class":39},[33,121341,121342,121344,121346,121348],{"class":35,"line":43},[33,121343,164],{"class":163},[33,121345,492],{"class":167},[33,121347,495],{"class":163},[33,121349,498],{"class":167},[33,121351,121352,121354,121356,121358],{"class":35,"line":61},[33,121353,190],{"class":163},[33,121355,193],{"class":167},[33,121357,164],{"class":163},[33,121359,198],{"class":167},[33,121361,121362],{"class":35,"line":73},[33,121363,92],{"emptyLinePlaceholder":91},[33,121365,121366,121368,121370,121372,121374,121376,121378,121380,121382,121384,121386,121388,121390,121392,121394],{"class":35,"line":88},[33,121367,13459],{"class":167},[33,121369,242],{"class":163},[33,121371,101407],{"class":167},[33,121373,119556],{"class":54},[33,121375,12426],{"class":167},[33,121377,99899],{"class":54},[33,121379,365],{"class":167},[33,121381,119565],{"class":54},[33,121383,8314],{"class":167},[33,121385,54232],{"class":54},[33,121387,12426],{"class":167},[33,121389,3545],{"class":50},[33,121391,365],{"class":167},[33,121393,2587],{"class":50},[33,121395,45051],{"class":167},[33,121397,121398,121400,121402,121404,121407],{"class":35,"line":95},[33,121399,57716],{"class":50},[33,121401,212],{"class":163},[33,121403,215],{"class":167},[33,121405,121406],{"class":54},"\"exports\u002Fno_index.csv\"",[33,121408,221],{"class":167},[33,121410,121411,121413,121415,121417,121419,121421,121423,121425,121427,121429],{"class":35,"line":101},[33,121412,57716],{"class":50},[33,121414,866],{"class":167},[33,121416,869],{"class":238},[33,121418,242],{"class":163},[33,121420,855],{"class":50},[33,121422,365],{"class":167},[33,121424,878],{"class":238},[33,121426,242],{"class":163},[33,121428,855],{"class":50},[33,121430,221],{"class":167},[33,121432,121433],{"class":35,"line":171},[33,121434,92
],{"emptyLinePlaceholder":91},[33,121436,121437,121439],{"class":35,"line":179},[33,121438,35574],{"class":163},[33,121440,574],{"class":167},[33,121442,121443,121445,121447,121449,121451,121453,121455],{"class":35,"line":187},[33,121444,39534],{"class":167},[33,121446,57716],{"class":50},[33,121448,365],{"class":167},[33,121450,897],{"class":238},[33,121452,242],{"class":163},[33,121454,902],{"class":50},[33,121456,221],{"class":167},[33,121458,121459],{"class":35,"line":201},[33,121460,121461],{"class":39},"    # Verify: re-read and check columns\n",[33,121463,121464,121467,121469,121471,121473],{"class":35,"line":206},[33,121465,121466],{"class":167},"    check ",[33,121468,242],{"class":163},[33,121470,9481],{"class":167},[33,121472,57716],{"class":50},[33,121474,221],{"class":167},[33,121476,121477,121479,121481,121484,121486,121488,121490,121492,121494,121496,121498,121501,121503,121506,121508],{"class":35,"line":224},[33,121478,9228],{"class":163},[33,121480,599],{"class":50},[33,121482,121483],{"class":167},"(check.columns) ",[33,121485,1865],{"class":163},[33,121487,9178],{"class":167},[33,121489,119556],{"class":54},[33,121491,365],{"class":167},[33,121493,54232],{"class":54},[33,121495,8314],{"class":167},[33,121497,4059],{"class":163},[33,121499,121500],{"class":54},"\"Unexpected columns: ",[33,121502,1115],{"class":50},[33,121504,121505],{"class":167},"check.columns.tolist()",[33,121507,1121],{"class":50},[33,121509,7504],{"class":54},[33,121511,121512,121514,121516,121519],{"class":35,"line":229},[33,121513,7268],{"class":50},[33,121515,602],{"class":167},[33,121517,121518],{"class":54},"\"OK — 
columns:\"",[33,121520,118796],{"class":167},[33,121522,121523,121525,121527,121529],{"class":35,"line":235},[33,121524,35726],{"class":163},[33,121526,107953],{"class":50},[33,121528,1852],{"class":163},[33,121530,7583],{"class":167},[33,121532,121533,121535,121537,121539,121541,121543,121545,121547,121549,121551],{"class":35,"line":250},[33,121534,35742],{"class":163},[33,121536,16617],{"class":50},[33,121538,602],{"class":167},[33,121540,4059],{"class":163},[33,121542,118410],{"class":54},[33,121544,1115],{"class":50},[33,121546,7602],{"class":167},[33,121548,1121],{"class":50},[33,121550,274],{"class":54},[33,121552,221],{"class":167},[424,121554,121556],{"id":121555},"when-the-index-is-meaningful-eg-named-date-index","When the index is meaningful (e.g., named date index)",[23,121558,121560],{"className":126,"code":121559,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.DataFrame(\n    {\"sales\": [100, 200, 300]},\n    index=pd.date_range(\"2024-01-01\", periods=3, freq=\"MS\"),\n)\ndf.index.name = \"month\"   # name it so it re-reads as a column, not \"Unnamed: 0\"\n\nOUT = Path(\"exports\u002Fnamed_index.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df.to_csv(OUT, index=True, date_format=\"%Y-%m-%d\")\n    check = pd.read_csv(OUT, index_col=\"month\", parse_dates=True)\n    assert \"month\" in check.index.name\n    print(\"Named index round-trips correctly.\")\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: 
{e}\")\n",[30,121561,121562,121566,121576,121586,121590,121598,121619,121647,121651,121662,121666,121679,121701,121705,121711,121739,121767,121778,121789,121799],{"__ignoreMap":28},[33,121563,121564],{"class":35,"line":36},[33,121565,8895],{"class":39},[33,121567,121568,121570,121572,121574],{"class":35,"line":43},[33,121569,164],{"class":163},[33,121571,492],{"class":167},[33,121573,495],{"class":163},[33,121575,498],{"class":167},[33,121577,121578,121580,121582,121584],{"class":35,"line":61},[33,121579,190],{"class":163},[33,121581,193],{"class":167},[33,121583,164],{"class":163},[33,121585,198],{"class":167},[33,121587,121588],{"class":35,"line":73},[33,121589,92],{"emptyLinePlaceholder":91},[33,121591,121592,121594,121596],{"class":35,"line":88},[33,121593,13459],{"class":167},[33,121595,242],{"class":163},[33,121597,119165],{"class":167},[33,121599,121600,121602,121605,121607,121609,121611,121613,121615,121617],{"class":35,"line":95},[33,121601,119170],{"class":167},[33,121603,121604],{"class":54},"\"sales\"",[33,121606,12426],{"class":167},[33,121608,2650],{"class":50},[33,121610,365],{"class":167},[33,121612,2611],{"class":50},[33,121614,365],{"class":167},[33,121616,26433],{"class":50},[33,121618,119189],{"class":167},[33,121620,121621,121623,121625,121627,121629,121631,121633,121635,121637,121639,121641,121643,121645],{"class":35,"line":101},[33,121622,119194],{"class":238},[33,121624,242],{"class":163},[33,121626,119199],{"class":167},[33,121628,119202],{"class":54},[33,121630,365],{"class":167},[33,121632,119207],{"class":238},[33,121634,242],{"class":163},[33,121636,10258],{"class":50},[33,121638,365],{"class":167},[33,121640,119216],{"class":238},[33,121642,242],{"class":163},[33,121644,119221],{"class":54},[33,121646,1506],{"class":167},[33,121648,121649],{"class":35,"line":171},[33,121650,221],{"class":167},[33,121652,121653,121655,121657,121659],{"class":35,"line":179},[33,121654,119232],{"class":167},[33,121656,242],{"class":163},[33,121658,119237],
{"class":54},[33,121660,121661],{"class":39},"   # name it so it re-reads as a column, not \"Unnamed: 0\"\n",[33,121663,121664],{"class":35,"line":187},[33,121665,92],{"emptyLinePlaceholder":91},[33,121667,121668,121670,121672,121674,121677],{"class":35,"line":201},[33,121669,57716],{"class":50},[33,121671,212],{"class":163},[33,121673,215],{"class":167},[33,121675,121676],{"class":54},"\"exports\u002Fnamed_index.csv\"",[33,121678,221],{"class":167},[33,121680,121681,121683,121685,121687,121689,121691,121693,121695,121697,121699],{"class":35,"line":206},[33,121682,57716],{"class":50},[33,121684,866],{"class":167},[33,121686,869],{"class":238},[33,121688,242],{"class":163},[33,121690,855],{"class":50},[33,121692,365],{"class":167},[33,121694,878],{"class":238},[33,121696,242],{"class":163},[33,121698,855],{"class":50},[33,121700,221],{"class":167},[33,121702,121703],{"class":35,"line":224},[33,121704,92],{"emptyLinePlaceholder":91},[33,121706,121707,121709],{"class":35,"line":229},[33,121708,35574],{"class":163},[33,121710,574],{"class":167},[33,121712,121713,121715,121717,121719,121721,121723,121725,121727,121729,121731,121733,121735,121737],{"class":35,"line":235},[33,121714,39534],{"class":167},[33,121716,57716],{"class":50},[33,121718,365],{"class":167},[33,121720,897],{"class":238},[33,121722,242],{"class":163},[33,121724,855],{"class":50},[33,121726,365],{"class":167},[33,121728,119308],{"class":238},[33,121730,242],{"class":163},[33,121732,1244],{"class":54},[33,121734,916],{"class":50},[33,121736,274],{"class":54},[33,121738,221],{"class":167},[33,121740,121741,121743,121745,121747,121749,121751,121753,121755,121757,121759,121761,121763,121765],{"class":35,"line":250},[33,121742,121466],{"class":167},[33,121744,242],{"class":163},[33,121746,9481],{"class":167},[33,121748,57716],{"class":50},[33,121750,365],{"class":167},[33,121752,118614],{"class":238},[33,121754,242],{"class":163},[33,121756,96465],{"class":54},[33,121758,365],{"class":167},[33,121760,102641
],{"class":238},[33,121762,242],{"class":163},[33,121764,855],{"class":50},[33,121766,221],{"class":167},[33,121768,121769,121771,121773,121775],{"class":35,"line":266},[33,121770,9228],{"class":163},[33,121772,119237],{"class":54},[33,121774,8002],{"class":163},[33,121776,121777],{"class":167}," check.index.name\n",[33,121779,121780,121782,121784,121787],{"class":35,"line":290},[33,121781,7268],{"class":50},[33,121783,602],{"class":167},[33,121785,121786],{"class":54},"\"Named index round-trips correctly.\"",[33,121788,221],{"class":167},[33,121790,121791,121793,121795,121797],{"class":35,"line":295},[33,121792,35726],{"class":163},[33,121794,107953],{"class":50},[33,121796,1852],{"class":163},[33,121798,7583],{"class":167},[33,121800,121801,121803,121805,121807,121809,121811,121813,121815,121817,121819],{"class":35,"line":300},[33,121802,35742],{"class":163},[33,121804,16617],{"class":50},[33,121806,602],{"class":167},[33,121808,4059],{"class":163},[33,121810,118410],{"class":54},[33,121812,1115],{"class":50},[33,121814,7602],{"class":167},[33,121816,1121],{"class":50},[33,121818,274],{"class":54},[33,121820,221],{"class":167},[14,121822,121823,121824,121826,121827,121829],{},"A named index writes as a proper header column. An unnamed ",[30,121825,118019],{}," writes as a blank column header that pandas re-reads as ",[30,121828,117994],{}," — the root cause of one of the most-Googled pandas questions.",[2537,121831],{},[18,121833,121835],{"id":121834},"step-5-float-format-and-date-serialization","Step 5 — Float Format and Date Serialization",[424,121837,121839],{"id":121838},"float-precision","Float precision",[14,121841,121842,121844,121845,121848],{},[30,121843,120722],{}," applies globally across all float columns. 
If you need ",[26245,121846,121847],{},"per-column"," precision, round before exporting:",[23,121850,121852],{"className":126,"code":121851,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.DataFrame({\n    \"price\": [9.999, 14.001, 100.0],\n    \"weight_kg\": [0.12345, 2.0, 55.123456],\n})\n\n# Round per-column before export\ndf[\"price\"] = df[\"price\"].round(2)\ndf[\"weight_kg\"] = df[\"weight_kg\"].round(4)\n\nOUT = Path(\"exports\u002Fprecise.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df.to_csv(OUT, index=False)\n    print(OUT.read_text())\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: {e}\")\n",[30,121853,121854,121858,121868,121878,121882,121890,121912,121934,121938,121942,121947,121968,121989,121993,122006,122028,122032,122038,122054,122065,122075],{"__ignoreMap":28},[33,121855,121856],{"class":35,"line":36},[33,121857,8895],{"class":39},[33,121859,121860,121862,121864,121866],{"class":35,"line":43},[33,121861,164],{"class":163},[33,121863,492],{"class":167},[33,121865,495],{"class":163},[33,121867,498],{"class":167},[33,121869,121870,121872,121874,121876],{"class":35,"line":61},[33,121871,190],{"class":163},[33,121873,193],{"class":167},[33,121875,164],{"class":163},[33,121877,198],{"class":167},[33,121879,121880],{"class":35,"line":73},[33,121881,92],{"emptyLinePlaceholder":91},[33,121883,121884,121886,121888],{"class":35,"line":88},[33,121885,13459],{"class":167},[33,121887,242],{"class":163},[33,121889,11749],{"class":167},[33,121891,121892,121895,121897,121900,121902,121905,121907,121910],{"class":35,"line":95},[33,121893,121894],{"class":54},"    
\"price\"",[33,121896,12426],{"class":167},[33,121898,121899],{"class":50},"9.999",[33,121901,365],{"class":167},[33,121903,121904],{"class":50},"14.001",[33,121906,365],{"class":167},[33,121908,121909],{"class":50},"100.0",[33,121911,8935],{"class":167},[33,121913,121914,121917,121919,121922,121924,121927,121929,121932],{"class":35,"line":101},[33,121915,121916],{"class":54},"    \"weight_kg\"",[33,121918,12426],{"class":167},[33,121920,121921],{"class":50},"0.12345",[33,121923,365],{"class":167},[33,121925,121926],{"class":50},"2.0",[33,121928,365],{"class":167},[33,121930,121931],{"class":50},"55.123456",[33,121933,8935],{"class":167},[33,121935,121936],{"class":35,"line":171},[33,121937,103249],{"class":167},[33,121939,121940],{"class":35,"line":179},[33,121941,92],{"emptyLinePlaceholder":91},[33,121943,121944],{"class":35,"line":187},[33,121945,121946],{"class":39},"# Round per-column before export\n",[33,121948,121949,121951,121953,121955,121957,121959,121961,121964,121966],{"class":35,"line":201},[33,121950,11038],{"class":167},[33,121952,116742],{"class":54},[33,121954,763],{"class":167},[33,121956,242],{"class":163},[33,121958,7935],{"class":167},[33,121960,116742],{"class":54},[33,121962,121963],{"class":167},"].round(",[33,121965,1533],{"class":50},[33,121967,221],{"class":167},[33,121969,121970,121972,121975,121977,121979,121981,121983,121985,121987],{"class":35,"line":206},[33,121971,11038],{"class":167},[33,121973,121974],{"class":54},"\"weight_kg\"",[33,121976,763],{"class":167},[33,121978,242],{"class":163},[33,121980,7935],{"class":167},[33,121982,121974],{"class":54},[33,121984,121963],{"class":167},[33,121986,1503],{"class":50},[33,121988,221],{"class":167},[33,121990,121991],{"class":35,"line":224},[33,121992,92],{"emptyLinePlaceholder":91},[33,121994,121995,121997,121999,122001,122004],{"class":35,"line":229},[33,121996,57716],{"class":50},[33,121998,212],{"class":163},[33,122000,215],{"class":167},[33,122002,122003],{"class":54},"\"exports\u002
Fprecise.csv\"",[33,122005,221],{"class":167},[33,122007,122008,122010,122012,122014,122016,122018,122020,122022,122024,122026],{"class":35,"line":235},[33,122009,57716],{"class":50},[33,122011,866],{"class":167},[33,122013,869],{"class":238},[33,122015,242],{"class":163},[33,122017,855],{"class":50},[33,122019,365],{"class":167},[33,122021,878],{"class":238},[33,122023,242],{"class":163},[33,122025,855],{"class":50},[33,122027,221],{"class":167},[33,122029,122030],{"class":35,"line":250},[33,122031,92],{"emptyLinePlaceholder":91},[33,122033,122034,122036],{"class":35,"line":266},[33,122035,35574],{"class":163},[33,122037,574],{"class":167},[33,122039,122040,122042,122044,122046,122048,122050,122052],{"class":35,"line":290},[33,122041,39534],{"class":167},[33,122043,57716],{"class":50},[33,122045,365],{"class":167},[33,122047,897],{"class":238},[33,122049,242],{"class":163},[33,122051,902],{"class":50},[33,122053,221],{"class":167},[33,122055,122056,122058,122060,122062],{"class":35,"line":295},[33,122057,7268],{"class":50},[33,122059,602],{"class":167},[33,122061,57716],{"class":50},[33,122063,122064],{"class":167},".read_text())\n",[33,122066,122067,122069,122071,122073],{"class":35,"line":300},[33,122068,35726],{"class":163},[33,122070,107953],{"class":50},[33,122072,1852],{"class":163},[33,122074,7583],{"class":167},[33,122076,122077,122079,122081,122083,122085,122087,122089,122091,122093,122095],{"class":35,"line":317},[33,122078,35742],{"class":163},[33,122080,16617],{"class":50},[33,122082,602],{"class":167},[33,122084,4059],{"class":163},[33,122086,118410],{"class":54},[33,122088,1115],{"class":50},[33,122090,7602],{"class":167},[33,122092,1121],{"class":50},[33,122094,274],{"class":54},[33,122096,221],{"class":167},[424,122098,122100],{"id":122099},"timezone-aware-datetimes","Timezone-aware datetimes",[14,122102,122103,122104,122106,122107,20891],{},"Strip the timezone ",[26245,122105,59039],{}," writing if your downstream tool cannot parse 
",[30,122108,120711],{},[23,122110,122112],{"className":126,"code":122111,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.DataFrame({\n    \"ts\": pd.to_datetime([\"2024-03-01T12:00:00Z\", \"2024-03-15T08:30:00Z\"])\n                .tz_localize(None),   # remove tz awareness → naive UTC\n    \"value\": [42, 99],\n})\n\nOUT = Path(\"exports\u002Fno_tz.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df.to_csv(OUT, index=False, date_format=\"%Y-%m-%dT%H:%M:%S\")\n    print(OUT.read_text())\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: {e}\")\n",[30,122113,122114,122118,122128,122138,122142,122150,122167,122180,122196,122200,122204,122217,122239,122243,122249,122277,122287,122297],{"__ignoreMap":28},[33,122115,122116],{"class":35,"line":36},[33,122117,8895],{"class":39},[33,122119,122120,122122,122124,122126],{"class":35,"line":43},[33,122121,164],{"class":163},[33,122123,492],{"class":167},[33,122125,495],{"class":163},[33,122127,498],{"class":167},[33,122129,122130,122132,122134,122136],{"class":35,"line":61},[33,122131,190],{"class":163},[33,122133,193],{"class":167},[33,122135,164],{"class":163},[33,122137,198],{"class":167},[33,122139,122140],{"class":35,"line":73},[33,122141,92],{"emptyLinePlaceholder":91},[33,122143,122144,122146,122148],{"class":35,"line":88},[33,122145,13459],{"class":167},[33,122147,242],{"class":163},[33,122149,11749],{"class":167},[33,122151,122152,122155,122157,122160,122162,122165],{"class":35,"line":95},[33,122153,122154],{"class":54},"    \"ts\"",[33,122156,120413],{"class":167},[33,122158,122159],{"class":54},"\"2024-03-01T12:00:00Z\"",[33,122161,365],{"class":167},[33,122163,122164],{"class":54},"\"2024-03-15T08:30:00Z\"",[33,122166,751],{"class":167},[33,122168,122169,122172,122174,122177],{"class":35,"line":101},[33,122170,122171],{"class":167},"                
.tz_localize(",[33,122173,571],{"class":50},[33,122175,122176],{"class":167},"),   ",[33,122178,122179],{"class":39},"# remove tz awareness → naive UTC\n",[33,122181,122182,122185,122187,122189,122191,122194],{"class":35,"line":171},[33,122183,122184],{"class":54},"    \"value\"",[33,122186,12426],{"class":167},[33,122188,49816],{"class":50},[33,122190,365],{"class":167},[33,122192,122193],{"class":50},"99",[33,122195,8935],{"class":167},[33,122197,122198],{"class":35,"line":179},[33,122199,103249],{"class":167},[33,122201,122202],{"class":35,"line":187},[33,122203,92],{"emptyLinePlaceholder":91},[33,122205,122206,122208,122210,122212,122215],{"class":35,"line":201},[33,122207,57716],{"class":50},[33,122209,212],{"class":163},[33,122211,215],{"class":167},[33,122213,122214],{"class":54},"\"exports\u002Fno_tz.csv\"",[33,122216,221],{"class":167},[33,122218,122219,122221,122223,122225,122227,122229,122231,122233,122235,122237],{"class":35,"line":206},[33,122220,57716],{"class":50},[33,122222,866],{"class":167},[33,122224,869],{"class":238},[33,122226,242],{"class":163},[33,122228,855],{"class":50},[33,122230,365],{"class":167},[33,122232,878],{"class":238},[33,122234,242],{"class":163},[33,122236,855],{"class":50},[33,122238,221],{"class":167},[33,122240,122241],{"class":35,"line":224},[33,122242,92],{"emptyLinePlaceholder":91},[33,122244,122245,122247],{"class":35,"line":229},[33,122246,35574],{"class":163},[33,122248,574],{"class":167},[33,122250,122251,122253,122255,122257,122259,122261,122263,122265,122267,122269,122271,122273,122275],{"class":35,"line":235},[33,122252,39534],{"class":167},[33,122254,57716],{"class":50},[33,122256,365],{"class":167},[33,122258,897],{"class":238},[33,122260,242],{"class":163},[33,122262,902],{"class":50},[33,122264,365],{"class":167},[33,122266,119308],{"class":238},[33,122268,242],{"class":163},[33,122270,1244],{"class":54},[33,122272,916],{"class":50},[33,122274,1249],{"class":54},[33,122276,221],{"class":167},[33,122278,122279,1
22281,122283,122285],{"class":35,"line":250},[33,122280,7268],{"class":50},[33,122282,602],{"class":167},[33,122284,57716],{"class":50},[33,122286,122064],{"class":167},[33,122288,122289,122291,122293,122295],{"class":35,"line":266},[33,122290,35726],{"class":163},[33,122292,107953],{"class":50},[33,122294,1852],{"class":163},[33,122296,7583],{"class":167},[33,122298,122299,122301,122303,122305,122307,122309,122311,122313,122315,122317],{"class":35,"line":290},[33,122300,35742],{"class":163},[33,122302,16617],{"class":50},[33,122304,602],{"class":167},[33,122306,4059],{"class":163},[33,122308,118410],{"class":54},[33,122310,1115],{"class":50},[33,122312,7602],{"class":167},[33,122314,1121],{"class":50},[33,122316,274],{"class":54},[33,122318,221],{"class":167},[2537,122320],{},[18,122322,122324],{"id":122323},"step-6-delimiter-and-quoting-variants","Step 6 — Delimiter and Quoting Variants",[424,122326,122328],{"id":122327},"eu-semicolon-convention","EU semicolon convention",[14,122330,122331,122332,122334,122335,122337,122338,122340],{},"Most EU locale systems use ",[30,122333,63503],{}," as the decimal separator, so CSVs use ",[30,122336,114705],{}," as the field delimiter. 
Excel on a European Windows machine auto-detects ",[30,122339,114705],{},"-separated files correctly.",[23,122342,122344],{"className":126,"code":122343,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.DataFrame({\"article\": [\"Bürostuhl\", \"Schreibtisch\"], \"preis\": [149.99, 349.00]})\nOUT = Path(\"exports\u002Feu_report.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df.to_csv(OUT, index=False, sep=\";\", encoding=\"utf-8-sig\", float_format=\"%.2f\")\n    print(OUT.read_text())\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: {e}\")\n",[30,122345,122346,122350,122360,122370,122374,122412,122425,122447,122451,122457,122501,122511,122521],{"__ignoreMap":28},[33,122347,122348],{"class":35,"line":36},[33,122349,8895],{"class":39},[33,122351,122352,122354,122356,122358],{"class":35,"line":43},[33,122353,164],{"class":163},[33,122355,492],{"class":167},[33,122357,495],{"class":163},[33,122359,498],{"class":167},[33,122361,122362,122364,122366,122368],{"class":35,"line":61},[33,122363,190],{"class":163},[33,122365,193],{"class":167},[33,122367,164],{"class":163},[33,122369,198],{"class":167},[33,122371,122372],{"class":35,"line":73},[33,122373,92],{"emptyLinePlaceholder":91},[33,122375,122376,122378,122380,122382,122385,122387,122390,122392,122395,122397,122400,122402,122405,122407,122410],{"class":35,"line":88},[33,122377,13459],{"class":167},[33,122379,242],{"class":163},[33,122381,101407],{"class":167},[33,122383,122384],{"class":54},"\"article\"",[33,122386,12426],{"class":167},[33,122388,122389],{"class":54},"\"Bürostuhl\"",[33,122391,365],{"class":167},[33,122393,122394],{"class":54},"\"Schreibtisch\"",[33,122396,8314],{"class":167},[33,122398,122399],{"class":54},"\"preis\"",[33,122401,12426],{"class":167},[33,122403,122404],{"class":50},"149.99",[33,122406,365],{"class":167},[33,122408,122409],{"class":50},"349.00",[33,122411,45051],{"class":167},[33,122413,1224
14,122416,122418,122420,122423],{"class":35,"line":95},[33,122415,57716],{"class":50},[33,122417,212],{"class":163},[33,122419,215],{"class":167},[33,122421,122422],{"class":54},"\"exports\u002Feu_report.csv\"",[33,122424,221],{"class":167},[33,122426,122427,122429,122431,122433,122435,122437,122439,122441,122443,122445],{"class":35,"line":101},[33,122428,57716],{"class":50},[33,122430,866],{"class":167},[33,122432,869],{"class":238},[33,122434,242],{"class":163},[33,122436,855],{"class":50},[33,122438,365],{"class":167},[33,122440,878],{"class":238},[33,122442,242],{"class":163},[33,122444,855],{"class":50},[33,122446,221],{"class":167},[33,122448,122449],{"class":35,"line":171},[33,122450,92],{"emptyLinePlaceholder":91},[33,122452,122453,122455],{"class":35,"line":179},[33,122454,35574],{"class":163},[33,122456,574],{"class":167},[33,122458,122459,122461,122463,122465,122467,122469,122471,122473,122475,122477,122479,122481,122483,122485,122487,122489,122491,122493,122495,122497,122499],{"class":35,"line":187},[33,122460,39534],{"class":167},[33,122462,57716],{"class":50},[33,122464,365],{"class":167},[33,122466,897],{"class":238},[33,122468,242],{"class":163},[33,122470,902],{"class":50},[33,122472,365],{"class":167},[33,122474,114789],{"class":238},[33,122476,242],{"class":163},[33,122478,113097],{"class":54},[33,122480,365],{"class":167},[33,122482,27249],{"class":238},[33,122484,242],{"class":163},[33,122486,108390],{"class":54},[33,122488,365],{"class":167},[33,122490,120722],{"class":238},[33,122492,242],{"class":163},[33,122494,274],{"class":54},[33,122496,54896],{"class":50},[33,122498,274],{"class":54},[33,122500,221],{"class":167},[33,122502,122503,122505,122507,122509],{"class":35,"line":201},[33,122504,7268],{"class":50},[33,122506,602],{"class":167},[33,122508,57716],{"class":50},[33,122510,122064],{"class":167},[33,122512,122513,122515,122517,122519],{"class":35,"line":206},[33,122514,35726],{"class":163},[33,122516,107953],{"class":50},[33,122518,185
2],{"class":163},[33,122520,7583],{"class":167},[33,122522,122523,122525,122527,122529,122531,122533,122535,122537,122539,122541],{"class":35,"line":224},[33,122524,35742],{"class":163},[33,122526,16617],{"class":50},[33,122528,602],{"class":167},[33,122530,4059],{"class":163},[33,122532,118410],{"class":54},[33,122534,1115],{"class":50},[33,122536,7602],{"class":167},[33,122538,1121],{"class":50},[33,122540,274],{"class":54},[33,122542,221],{"class":167},[424,122544,122546],{"id":122545},"pipe-delimited-for-sql-loaders","Pipe-delimited for SQL loaders",[14,122548,122549,122550,122552,122553,122555,122556,122559,122560,122563],{},"Pipe (",[30,122551,7654],{},") avoids conflicts when text fields contain commas and semicolons. Redshift ",[30,122554,117998],{},", Snowflake ",[30,122557,122558],{},"COPY INTO",", and Postgres ",[30,122561,122562],{},"COPY FROM"," all support custom delimiters.",[23,122565,122567],{"className":126,"code":122566,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.DataFrame({\n    \"id\": [1, 2],\n    \"note\": [\"Ships to: London, UK\", \"Ref: A\u002FB; C\u002FD\"],\n})\nOUT = Path(\"exports\u002Fpipe_delimited.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df.to_csv(OUT, index=False, sep=\"|\", encoding=\"utf-8\")\n    print(OUT.read_text())\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: 
{e}\")\n",[30,122568,122569,122573,122583,122593,122597,122605,122620,122637,122641,122654,122676,122680,122686,122719,122729,122739],{"__ignoreMap":28},[33,122570,122571],{"class":35,"line":36},[33,122572,8895],{"class":39},[33,122574,122575,122577,122579,122581],{"class":35,"line":43},[33,122576,164],{"class":163},[33,122578,492],{"class":167},[33,122580,495],{"class":163},[33,122582,498],{"class":167},[33,122584,122585,122587,122589,122591],{"class":35,"line":61},[33,122586,190],{"class":163},[33,122588,193],{"class":167},[33,122590,164],{"class":163},[33,122592,198],{"class":167},[33,122594,122595],{"class":35,"line":73},[33,122596,92],{"emptyLinePlaceholder":91},[33,122598,122599,122601,122603],{"class":35,"line":88},[33,122600,13459],{"class":167},[33,122602,242],{"class":163},[33,122604,11749],{"class":167},[33,122606,122607,122610,122612,122614,122616,122618],{"class":35,"line":95},[33,122608,122609],{"class":54},"    \"id\"",[33,122611,12426],{"class":167},[33,122613,734],{"class":50},[33,122615,365],{"class":167},[33,122617,1533],{"class":50},[33,122619,8935],{"class":167},[33,122621,122622,122625,122627,122630,122632,122635],{"class":35,"line":101},[33,122623,122624],{"class":54},"    \"note\"",[33,122626,12426],{"class":167},[33,122628,122629],{"class":54},"\"Ships to: London, UK\"",[33,122631,365],{"class":167},[33,122633,122634],{"class":54},"\"Ref: A\u002FB; 
C\u002FD\"",[33,122636,8935],{"class":167},[33,122638,122639],{"class":35,"line":171},[33,122640,103249],{"class":167},[33,122642,122643,122645,122647,122649,122652],{"class":35,"line":179},[33,122644,57716],{"class":50},[33,122646,212],{"class":163},[33,122648,215],{"class":167},[33,122650,122651],{"class":54},"\"exports\u002Fpipe_delimited.csv\"",[33,122653,221],{"class":167},[33,122655,122656,122658,122660,122662,122664,122666,122668,122670,122672,122674],{"class":35,"line":187},[33,122657,57716],{"class":50},[33,122659,866],{"class":167},[33,122661,869],{"class":238},[33,122663,242],{"class":163},[33,122665,855],{"class":50},[33,122667,365],{"class":167},[33,122669,878],{"class":238},[33,122671,242],{"class":163},[33,122673,855],{"class":50},[33,122675,221],{"class":167},[33,122677,122678],{"class":35,"line":201},[33,122679,92],{"emptyLinePlaceholder":91},[33,122681,122682,122684],{"class":35,"line":206},[33,122683,35574],{"class":163},[33,122685,574],{"class":167},[33,122687,122688,122690,122692,122694,122696,122698,122700,122702,122704,122706,122709,122711,122713,122715,122717],{"class":35,"line":224},[33,122689,39534],{"class":167},[33,122691,57716],{"class":50},[33,122693,365],{"class":167},[33,122695,897],{"class":238},[33,122697,242],{"class":163},[33,122699,902],{"class":50},[33,122701,365],{"class":167},[33,122703,114789],{"class":238},[33,122705,242],{"class":163},[33,122707,122708],{"class":54},"\"|\"",[33,122710,365],{"class":167},[33,122712,27249],{"class":238},[33,122714,242],{"class":163},[33,122716,1195],{"class":54},[33,122718,221],{"class":167},[33,122720,122721,122723,122725,122727],{"class":35,"line":229},[33,122722,7268],{"class":50},[33,122724,602],{"class":167},[33,122726,57716],{"class":50},[33,122728,122064],{"class":167},[33,122730,122731,122733,122735,122737],{"class":35,"line":235},[33,122732,35726],{"class":163},[33,122734,107953],{"class":50},[33,122736,1852],{"class":163},[33,122738,7583],{"class":167},[33,122740,122741,122743,12274
5,122747,122749,122751,122753,122755,122757,122759],{"class":35,"line":250},[33,122742,35742],{"class":163},[33,122744,16617],{"class":50},[33,122746,602],{"class":167},[33,122748,4059],{"class":163},[33,122750,118410],{"class":54},[33,122752,1115],{"class":50},[33,122754,7602],{"class":167},[33,122756,1121],{"class":50},[33,122758,274],{"class":54},[33,122760,221],{"class":167},[424,122762,122764],{"id":122763},"quoting-all-fields","Quoting all fields",[14,122766,122767],{},"When passing CSVs to fragile parsers that do not handle RFC 4180 escaping, quote everything:",[23,122769,122771],{"className":126,"code":122770,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nimport csv\nfrom pathlib import Path\n\ndf = pd.DataFrame({\"name\": ['Smith, \"Bob\"'], \"score\": [98]})\nOUT = Path(\"exports\u002Fquoted_all.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df.to_csv(OUT, index=False, quoting=csv.QUOTE_ALL, encoding=\"utf-8\")\n    print(OUT.read_text())\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: 
{e}\")\n",[30,122772,122773,122777,122787,122793,122803,122807,122833,122846,122868,122872,122878,122912,122922,122932],{"__ignoreMap":28},[33,122774,122775],{"class":35,"line":36},[33,122776,8895],{"class":39},[33,122778,122779,122781,122783,122785],{"class":35,"line":43},[33,122780,164],{"class":163},[33,122782,492],{"class":167},[33,122784,495],{"class":163},[33,122786,498],{"class":167},[33,122788,122789,122791],{"class":35,"line":61},[33,122790,164],{"class":163},[33,122792,107673],{"class":167},[33,122794,122795,122797,122799,122801],{"class":35,"line":73},[33,122796,190],{"class":163},[33,122798,193],{"class":167},[33,122800,164],{"class":163},[33,122802,198],{"class":167},[33,122804,122805],{"class":35,"line":88},[33,122806,92],{"emptyLinePlaceholder":91},[33,122808,122809,122811,122813,122815,122817,122819,122822,122824,122827,122829,122831],{"class":35,"line":95},[33,122810,13459],{"class":167},[33,122812,242],{"class":163},[33,122814,101407],{"class":167},[33,122816,104775],{"class":54},[33,122818,12426],{"class":167},[33,122820,122821],{"class":54},"'Smith, 
\"Bob\"'",[33,122823,8314],{"class":167},[33,122825,122826],{"class":54},"\"score\"",[33,122828,12426],{"class":167},[33,122830,82416],{"class":50},[33,122832,45051],{"class":167},[33,122834,122835,122837,122839,122841,122844],{"class":35,"line":101},[33,122836,57716],{"class":50},[33,122838,212],{"class":163},[33,122840,215],{"class":167},[33,122842,122843],{"class":54},"\"exports\u002Fquoted_all.csv\"",[33,122845,221],{"class":167},[33,122847,122848,122850,122852,122854,122856,122858,122860,122862,122864,122866],{"class":35,"line":171},[33,122849,57716],{"class":50},[33,122851,866],{"class":167},[33,122853,869],{"class":238},[33,122855,242],{"class":163},[33,122857,855],{"class":50},[33,122859,365],{"class":167},[33,122861,878],{"class":238},[33,122863,242],{"class":163},[33,122865,855],{"class":50},[33,122867,221],{"class":167},[33,122869,122870],{"class":35,"line":179},[33,122871,92],{"emptyLinePlaceholder":91},[33,122873,122874,122876],{"class":35,"line":187},[33,122875,35574],{"class":163},[33,122877,574],{"class":167},[33,122879,122880,122882,122884,122886,122888,122890,122892,122894,122896,122898,122900,122902,122904,122906,122908,122910],{"class":35,"line":201},[33,122881,39534],{"class":167},[33,122883,57716],{"class":50},[33,122885,365],{"class":167},[33,122887,897],{"class":238},[33,122889,242],{"class":163},[33,122891,902],{"class":50},[33,122893,365],{"class":167},[33,122895,114869],{"class":238},[33,122897,242],{"class":163},[33,122899,114956],{"class":167},[33,122901,114959],{"class":50},[33,122903,365],{"class":167},[33,122905,27249],{"class":238},[33,122907,242],{"class":163},[33,122909,1195],{"class":54},[33,122911,221],{"class":167},[33,122913,122914,122916,122918,122920],{"class":35,"line":206},[33,122915,7268],{"class":50},[33,122917,602],{"class":167},[33,122919,57716],{"class":50},[33,122921,122064],{"class":167},[33,122923,122924,122926,122928,122930],{"class":35,"line":224},[33,122925,35726],{"class":163},[33,122927,107953],{"class":50},[33
,122929,1852],{"class":163},[33,122931,7583],{"class":167},[33,122933,122934,122936,122938,122940,122942,122944,122946,122948,122950,122952],{"class":35,"line":229},[33,122935,35742],{"class":163},[33,122937,16617],{"class":50},[33,122939,602],{"class":167},[33,122941,4059],{"class":163},[33,122943,118410],{"class":54},[33,122945,1115],{"class":50},[33,122947,7602],{"class":167},[33,122949,1121],{"class":50},[33,122951,274],{"class":54},[33,122953,221],{"class":167},[2537,122955],{},[18,122957,122959],{"id":122958},"step-7-gzip-compression","Step 7 — Gzip Compression",[14,122961,122962,122964,122965,122967,122968,122970,122971,122974,122975,3035],{},[30,122963,9630],{}," passes a ",[30,122966,109446],{}," argument directly to ",[30,122969,118265],{},". The output file must end in ",[30,122972,122973],{},".gz"," for downstream tools to auto-detect the format, though you can also use ",[30,122976,122977],{},"compression={\"method\": \"gzip\", \"compresslevel\": 9}",[23,122979,122981],{"className":126,"code":122980,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nSOURCE = Path(\"data\u002Fsample_export.csv\")\nOUT = Path(\"exports\u002Freport.csv.gz\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df = pd.read_csv(SOURCE)\nexcept FileNotFoundError as e:\n    raise SystemExit(f\"Source not found: {e}\")\n\ntry:\n    df.to_csv(\n        OUT,\n        index=False,\n        encoding=\"utf-8\",\n        compression={\"method\": \"gzip\", \"compresslevel\": 6},\n    )\n    raw_size = Path(SOURCE).stat().st_size\n    gz_size = OUT.stat().st_size\n    print(f\"Compressed {raw_size:,} B → {gz_size:,} B ({gz_size\u002Fraw_size:.0%})\")\nexcept OSError as e:\n    raise SystemExit(f\"Write failed: 
{e}\")\n",[30,122982,122983,122987,122997,123007,123011,123023,123036,123058,123062,123068,123080,123090,123112,123116,123122,123126,123132,123142,123152,123180,123184,123198,123210,123261,123271],{"__ignoreMap":28},[33,122984,122985],{"class":35,"line":36},[33,122986,8895],{"class":39},[33,122988,122989,122991,122993,122995],{"class":35,"line":43},[33,122990,164],{"class":163},[33,122992,492],{"class":167},[33,122994,495],{"class":163},[33,122996,498],{"class":167},[33,122998,122999,123001,123003,123005],{"class":35,"line":61},[33,123000,190],{"class":163},[33,123002,193],{"class":167},[33,123004,164],{"class":163},[33,123006,198],{"class":167},[33,123008,123009],{"class":35,"line":73},[33,123010,92],{"emptyLinePlaceholder":91},[33,123012,123013,123015,123017,123019,123021],{"class":35,"line":88},[33,123014,86272],{"class":50},[33,123016,212],{"class":163},[33,123018,215],{"class":167},[33,123020,120296],{"class":54},[33,123022,221],{"class":167},[33,123024,123025,123027,123029,123031,123034],{"class":35,"line":95},[33,123026,57716],{"class":50},[33,123028,212],{"class":163},[33,123030,215],{"class":167},[33,123032,123033],{"class":54},"\"exports\u002Freport.csv.gz\"",[33,123035,221],{"class":167},[33,123037,123038,123040,123042,123044,123046,123048,123050,123052,123054,123056],{"class":35,"line":101},[33,123039,57716],{"class":50},[33,123041,866],{"class":167},[33,123043,869],{"class":238},[33,123045,242],{"class":163},[33,123047,855],{"class":50},[33,123049,365],{"class":167},[33,123051,878],{"class":238},[33,123053,242],{"class":163},[33,123055,855],{"class":50},[33,123057,221],{"class":167},[33,123059,123060],{"class":35,"line":171},[33,123061,92],{"emptyLinePlaceholder":91},[33,123063,123064,123066],{"class":35,"line":179},[33,123065,35574],{"class":163},[33,123067,574],{"class":167},[33,123069,123070,123072,123074,123076,123078],{"class":35,"line":187},[33,123071,4025],{"class":167},[33,123073,242],{"class":163},[33,123075,9481],{"class":167},[33,123077,86272
],{"class":50},[33,123079,221],{"class":167},[33,123081,123082,123084,123086,123088],{"class":35,"line":201},[33,123083,35726],{"class":163},[33,123085,2945],{"class":50},[33,123087,1852],{"class":163},[33,123089,7583],{"class":167},[33,123091,123092,123094,123096,123098,123100,123102,123104,123106,123108,123110],{"class":35,"line":206},[33,123093,35742],{"class":163},[33,123095,16617],{"class":50},[33,123097,602],{"class":167},[33,123099,4059],{"class":163},[33,123101,90279],{"class":54},[33,123103,1115],{"class":50},[33,123105,7602],{"class":167},[33,123107,1121],{"class":50},[33,123109,274],{"class":54},[33,123111,221],{"class":167},[33,123113,123114],{"class":35,"line":224},[33,123115,92],{"emptyLinePlaceholder":91},[33,123117,123118,123120],{"class":35,"line":229},[33,123119,35574],{"class":163},[33,123121,574],{"class":167},[33,123123,123124],{"class":35,"line":235},[33,123125,121075],{"class":167},[33,123127,123128,123130],{"class":35,"line":250},[33,123129,121080],{"class":50},[33,123131,247],{"class":167},[33,123133,123134,123136,123138,123140],{"class":35,"line":266},[33,123135,121087],{"class":238},[33,123137,242],{"class":163},[33,123139,902],{"class":50},[33,123141,247],{"class":167},[33,123143,123144,123146,123148,123150],{"class":35,"line":290},[33,123145,1190],{"class":238},[33,123147,242],{"class":163},[33,123149,1195],{"class":54},[33,123151,247],{"class":167},[33,123153,123154,123157,123159,123161,123164,123166,123169,123171,123174,123176,123178],{"class":35,"line":295},[33,123155,123156],{"class":238},"        
compression",[33,123158,242],{"class":163},[33,123160,1115],{"class":167},[33,123162,123163],{"class":54},"\"method\"",[33,123165,2079],{"class":167},[33,123167,123168],{"class":54},"\"gzip\"",[33,123170,365],{"class":167},[33,123172,123173],{"class":54},"\"compresslevel\"",[33,123175,2079],{"class":167},[33,123177,2681],{"class":50},[33,123179,3509],{"class":167},[33,123181,123182],{"class":35,"line":300},[33,123183,1202],{"class":167},[33,123185,123186,123189,123191,123193,123195],{"class":35,"line":317},[33,123187,123188],{"class":167},"    raw_size ",[33,123190,242],{"class":163},[33,123192,215],{"class":167},[33,123194,86272],{"class":50},[33,123196,123197],{"class":167},").stat().st_size\n",[33,123199,123200,123203,123205,123207],{"class":35,"line":332},[33,123201,123202],{"class":167},"    gz_size ",[33,123204,242],{"class":163},[33,123206,118176],{"class":50},[33,123208,123209],{"class":167},".stat().st_size\n",[33,123211,123212,123214,123216,123218,123221,123223,123226,123228,123230,123233,123235,123238,123240,123242,123245,123247,123249,123251,123253,123255,123257,123259],{"class":35,"line":347},[33,123213,7268],{"class":50},[33,123215,602],{"class":167},[33,123217,4059],{"class":163},[33,123219,123220],{"class":54},"\"Compressed ",[33,123222,1115],{"class":50},[33,123224,123225],{"class":167},"raw_size",[33,123227,18801],{"class":163},[33,123229,1121],{"class":50},[33,123231,123232],{"class":54}," B → ",[33,123234,1115],{"class":50},[33,123236,123237],{"class":167},"gz_size",[33,123239,18801],{"class":163},[33,123241,1121],{"class":50},[33,123243,123244],{"class":54}," B 
(",[33,123246,1115],{"class":50},[33,123248,123237],{"class":167},[33,123250,1351],{"class":163},[33,123252,123225],{"class":167},[33,123254,12775],{"class":163},[33,123256,1121],{"class":50},[33,123258,72406],{"class":54},[33,123260,221],{"class":167},[33,123262,123263,123265,123267,123269],{"class":35,"line":374},[33,123264,35726],{"class":163},[33,123266,107953],{"class":50},[33,123268,1852],{"class":163},[33,123270,7583],{"class":167},[33,123272,123273,123275,123277,123279,123281,123283,123285,123287,123289,123291],{"class":35,"line":397},[33,123274,35742],{"class":163},[33,123276,16617],{"class":50},[33,123278,602],{"class":167},[33,123280,4059],{"class":163},[33,123282,118410],{"class":54},[33,123284,1115],{"class":50},[33,123286,7602],{"class":167},[33,123288,1121],{"class":50},[33,123290,274],{"class":54},[33,123292,221],{"class":167},[14,123294,123295,123296,123298],{},"Re-reading a ",[30,123297,122973],{}," file needs no extra code — pandas detects compression automatically:",[23,123300,123302],{"className":126,"code":123301,"language":47,"meta":28,"style":28},"df_back = pd.read_csv(\"exports\u002Freport.csv.gz\")\n",[30,123303,123304],{"__ignoreMap":28},[33,123305,123306,123308,123310,123312,123314],{"class":35,"line":36},[33,123307,118205],{"class":167},[33,123309,242],{"class":163},[33,123311,9481],{"class":167},[33,123313,123033],{"class":54},[33,123315,221],{"class":167},[2537,123317],{},[18,123319,123321],{"id":123320},"step-8-chunked-writing-for-large-data","Step 8 — Chunked Writing for Large Data",[14,123323,123324,123325,8877,123328,123330],{},"When a DataFrame exceeds available RAM, load and write in chunks using ",[30,123326,123327],{},"pd.read_csv",[30,123329,21944],{},", or drive it from a database cursor.",[23,123332,123334],{"className":126,"code":123333,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nimport csv\nfrom pathlib import Path\n\nSOURCE = Path(\"data\u002Flarge_source.csv\")   # assume this is too 
big to load whole\nOUT = Path(\"exports\u002Fchunked_output.csv\")\nOUT.parent.mkdir(parents=True, exist_ok=True)\nCHUNK_SIZE = 50_000\n\ntry:\n    with open(OUT, \"w\", newline=\"\", encoding=\"utf-8\") as fout:\n        writer = None\n        for i, chunk in enumerate(pd.read_csv(SOURCE, chunksize=CHUNK_SIZE)):\n            # Apply any transform here, e.g.: chunk = chunk.dropna()\n            if writer is None:\n                writer = csv.DictWriter(fout, fieldnames=chunk.columns.tolist())\n                writer.writeheader()\n            writer.writerows(chunk.to_dict(\"records\"))\n            print(f\"Chunk {i+1}: wrote {len(chunk):,} rows\")\n            fout.flush()\n    print(f\"Done → {OUT}\")\nexcept FileNotFoundError as e:\n    raise SystemExit(f\"Source not found: {e}\")\nexcept OSError as e:\n    raise SystemExit(f\"Write error: {e}\")\n",[30,123335,123336,123340,123350,123356,123366,123370,123386,123399,123421,123431,123435,123441,123479,123487,123512,123517,123530,123547,123552,123561,123594,123599,123615,123625,123647,123657],{"__ignoreMap":28},[33,123337,123338],{"class":35,"line":36},[33,123339,8895],{"class":39},[33,123341,123342,123344,123346,123348],{"class":35,"line":43},[33,123343,164],{"class":163},[33,123345,492],{"class":167},[33,123347,495],{"class":163},[33,123349,498],{"class":167},[33,123351,123352,123354],{"class":35,"line":61},[33,123353,164],{"class":163},[33,123355,107673],{"class":167},[33,123357,123358,123360,123362,123364],{"class":35,"line":73},[33,123359,190],{"class":163},[33,123361,193],{"class":167},[33,123363,164],{"class":163},[33,123365,198],{"class":167},[33,123367,123368],{"class":35,"line":88},[33,123369,92],{"emptyLinePlaceholder":91},[33,123371,123372,123374,123376,123378,123381,123383],{"class":35,"line":95},[33,123373,86272],{"class":50},[33,123375,212],{"class":163},[33,123377,215],{"class":167},[33,123379,123380],{"class":54},"\"data\u002Flarge_source.csv\"",[33,123382,12000],{"class":167},[33,123384,123385],{
"class":39},"# assume this is too big to load whole\n",[33,123387,123388,123390,123392,123394,123397],{"class":35,"line":101},[33,123389,57716],{"class":50},[33,123391,212],{"class":163},[33,123393,215],{"class":167},[33,123395,123396],{"class":54},"\"exports\u002Fchunked_output.csv\"",[33,123398,221],{"class":167},[33,123400,123401,123403,123405,123407,123409,123411,123413,123415,123417,123419],{"class":35,"line":171},[33,123402,57716],{"class":50},[33,123404,866],{"class":167},[33,123406,869],{"class":238},[33,123408,242],{"class":163},[33,123410,855],{"class":50},[33,123412,365],{"class":167},[33,123414,878],{"class":238},[33,123416,242],{"class":163},[33,123418,855],{"class":50},[33,123420,221],{"class":167},[33,123422,123423,123426,123428],{"class":35,"line":179},[33,123424,123425],{"class":50},"CHUNK_SIZE",[33,123427,212],{"class":163},[33,123429,123430],{"class":50}," 50_000\n",[33,123432,123433],{"class":35,"line":187},[33,123434,92],{"emptyLinePlaceholder":91},[33,123436,123437,123439],{"class":35,"line":201},[33,123438,35574],{"class":163},[33,123440,574],{"class":167},[33,123442,123443,123445,123447,123449,123451,123453,123456,123458,123460,123462,123464,123466,123468,123470,123472,123474,123476],{"class":35,"line":206},[33,123444,1635],{"class":163},[33,123446,68213],{"class":50},[33,123448,602],{"class":167},[33,123450,57716],{"class":50},[33,123452,365],{"class":167},[33,123454,123455],{"class":54},"\"w\"",[33,123457,365],{"class":167},[33,123459,107919],{"class":238},[33,123461,242],{"class":163},[33,123463,3198],{"class":54},[33,123465,365],{"class":167},[33,123467,27249],{"class":238},[33,123469,242],{"class":163},[33,123471,1195],{"class":54},[33,123473,1649],{"class":167},[33,123475,495],{"class":163},[33,123477,123478],{"class":167}," 
fout:\n",[33,123480,123481,123483,123485],{"class":35,"line":224},[33,123482,67149],{"class":167},[33,123484,242],{"class":163},[33,123486,3852],{"class":50},[33,123488,123489,123491,123493,123495,123497,123500,123502,123504,123506,123508,123510],{"class":35,"line":229},[33,123490,5973],{"class":163},[33,123492,115785],{"class":167},[33,123494,662],{"class":163},[33,123496,7403],{"class":50},[33,123498,123499],{"class":167},"(pd.read_csv(",[33,123501,86272],{"class":50},[33,123503,365],{"class":167},[33,123505,21944],{"class":238},[33,123507,242],{"class":163},[33,123509,123425],{"class":50},[33,123511,8687],{"class":167},[33,123513,123514],{"class":35,"line":235},[33,123515,123516],{"class":39},"            # Apply any transform here, e.g.: chunk = chunk.dropna()\n",[33,123518,123519,123521,123524,123526,123528],{"class":35,"line":250},[33,123520,5995],{"class":163},[33,123522,123523],{"class":167}," writer ",[33,123525,3847],{"class":163},[33,123527,7657],{"class":50},[33,123529,574],{"class":167},[33,123531,123532,123534,123536,123539,123542,123544],{"class":35,"line":266},[33,123533,72847],{"class":167},[33,123535,242],{"class":163},[33,123537,123538],{"class":167}," csv.DictWriter(fout, ",[33,123540,123541],{"class":238},"fieldnames",[33,123543,242],{"class":163},[33,123545,123546],{"class":167},"chunk.columns.tolist())\n",[33,123548,123549],{"class":35,"line":290},[33,123550,123551],{"class":167},"                writer.writeheader()\n",[33,123553,123554,123557,123559],{"class":35,"line":295},[33,123555,123556],{"class":167},"            
writer.writerows(chunk.to_dict(",[33,123558,21222],{"class":54},[33,123560,371],{"class":167},[33,123562,123563,123565,123567,123569,123571,123573,123575,123577,123579,123582,123584,123586,123588,123590,123592],{"class":35,"line":300},[33,123564,9364],{"class":50},[33,123566,602],{"class":167},[33,123568,4059],{"class":163},[33,123570,70409],{"class":54},[33,123572,1115],{"class":50},[33,123574,7499],{"class":167},[33,123576,1811],{"class":163},[33,123578,40161],{"class":50},[33,123580,123581],{"class":54},": wrote ",[33,123583,4065],{"class":50},[33,123585,70435],{"class":167},[33,123587,18801],{"class":163},[33,123589,1121],{"class":50},[33,123591,65937],{"class":54},[33,123593,221],{"class":167},[33,123595,123596],{"class":35,"line":317},[33,123597,123598],{"class":167},"            fout.flush()\n",[33,123600,123601,123603,123605,123607,123609,123611,123613],{"class":35,"line":332},[33,123602,7268],{"class":50},[33,123604,602],{"class":167},[33,123606,4059],{"class":163},[33,123608,85751],{"class":54},[33,123610,58217],{"class":50},[33,123612,274],{"class":54},[33,123614,221],{"class":167},[33,123616,123617,123619,123621,123623],{"class":35,"line":347},[33,123618,35726],{"class":163},[33,123620,2945],{"class":50},[33,123622,1852],{"class":163},[33,123624,7583],{"class":167},[33,123626,123627,123629,123631,123633,123635,123637,123639,123641,123643,123645],{"class":35,"line":374},[33,123628,35742],{"class":163},[33,123630,16617],{"class":50},[33,123632,602],{"class":167},[33,123634,4059],{"class":163},[33,123636,90279],{"class":54},[33,123638,1115],{"class":50},[33,123640,7602],{"class":167},[33,123642,1121],{"class":50},[33,123644,274],{"class":54},[33,123646,221],{"class":167},[33,123648,123649,123651,123653,123655],{"class":35,"line":397},[33,123650,35726],{"class":163},[33,123652,107953],{"class":50},[33,123654,1852],{"class":163},[33,123656,7583],{"class":167},[33,123658,123659,123661,123663,123665,123667,123670,123672,123674,123676,123678],{"class":35,"line":
653},[33,123660,35742],{"class":163},[33,123662,16617],{"class":50},[33,123664,602],{"class":167},[33,123666,4059],{"class":163},[33,123668,123669],{"class":54},"\"Write error: ",[33,123671,1115],{"class":50},[33,123673,7602],{"class":167},[33,123675,1121],{"class":50},[33,123677,274],{"class":54},[33,123679,221],{"class":167},[14,123681,123682,123683,123686,123687,123690,123691,123694,123695,3035],{},"For append-mode pipelines (new data arrives hourly), open with ",[30,123684,123685],{},"mode=\"a\""," and skip ",[30,123688,123689],{},"writeheader()"," after the first run. Store a ",[30,123692,123693],{},"header_written"," sentinel file or check ",[30,123696,123697],{},"OUT.stat().st_size > 0",[2537,123699],{},[18,123701,123703],{"id":123702},"step-9-bi-tool-conventions","Step 9 — BI-Tool Conventions",[14,123705,123706],{},"Different BI tools have specific expectations:",[4273,123708,123709,123726],{},[4276,123710,123711],{},[4279,123712,123713,123716,123719,123721,123724],{},[4282,123714,123715],{},"Tool",[4282,123717,123718],{},"Delimiter",[4282,123720,114611],{},[4282,123722,123723],{},"Dates",[4282,123725,112556],{},[4292,123727,123728,123748,123767,123788,123805,123823],{},[4279,123729,123730,123733,123740,123742,123745],{},[4297,123731,123732],{},"Excel (Windows)",[4297,123734,123735,2012,123737,123739],{},[30,123736,63503],{},[30,123738,114705],{}," (locale)",[4297,123741,120790],{},[4297,123743,123744],{},"Any",[4297,123746,123747],{},"BOM required to avoid ANSI decode",[4279,123749,123750,123753,123757,123760,123764],{},[4297,123751,123752],{},"Tableau",[4297,123754,123755],{},[30,123756,63503],{},[4297,123758,123759],{},"UTF-8",[4297,123761,123762],{},[30,123763,97759],{},[4297,123765,123766],{},"No BOM needed; strip timezone",[4279,123768,123769,123772,123776,123779,123782],{},[4297,123770,123771],{},"Power BI",[4297,123773,123774],{},[30,123775,63503],{},[4297,123777,123778],{},"UTF-8 or UTF-16",[4297,123780,123781],{},"ISO 
8601",[4297,123783,123784,123785,123787],{},"Handles ",[30,123786,122973],{},"; prefers no index",[4279,123789,123790,123793,123796,123801,123803],{},[4297,123791,123792],{},"Redshift COPY",[4297,123794,123795],{},"`",[4297,123797,123798,123800],{},[30,123799,7162],{},"\\t`",[4297,123802,123759],{},[4297,123804,123781],{},[4279,123806,123807,123810,123814,123816,123818],{},[4297,123808,123809],{},"Snowflake COPY INTO",[4297,123811,123812],{},[30,123813,63503],{},[4297,123815,123759],{},[4297,123817,123781],{},[4297,123819,17059,123820],{},[30,123821,123822],{},"FIELD_OPTIONALLY_ENCLOSED_BY='\"'",[4279,123824,123825,123828,123832,123834,123838],{},[4297,123826,123827],{},"Google Sheets",[4297,123829,123830],{},[30,123831,63503],{},[4297,123833,123759],{},[4297,123835,123836],{},[30,123837,97759],{},[4297,123839,123840],{},"Handles BOM; strips it silently",[14,123842,123843,123844,365,123847,365,123849,365,123851,123853],{},"The cleanest universal export is ",[30,123845,123846],{},"sep=\",\"",[30,123848,46120],{},[30,123850,121305],{},[30,123852,28142],{}," — add the BOM only when Excel is the guaranteed consumer.",[2537,123855],{},[18,123857,52030],{"id":52029},[14,123859,123860],{},"After writing, always re-read and assert:",[23,123862,123864],{"className":126,"code":123863,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nOUT = Path(\"exports\u002Freport.csv\")\n\ntry:\n    original = pd.read_csv(\"data\u002Fsample_export.csv\")\n    exported = pd.read_csv(OUT)\n    assert len(exported) == len(original), \\\n        f\"Row count mismatch: {len(exported)} vs {len(original)}\"\n    assert list(exported.columns) == list(original.columns), \\\n        f\"Column mismatch: {exported.columns.tolist()}\"\n    print(f\"Validation passed — {len(exported):,} rows, {len(exported.columns)} columns.\")\nexcept FileNotFoundError as e:\n    raise SystemExit(f\"File not found: {e}\")\nexcept AssertionError as e:\n    raise 
SystemExit(f\"Validation failed: {e}\")\n",[30,123865,123866,123870,123880,123890,123894,123906,123910,123916,123929,123942,123958,123982,123998,124013,124046,124056,124078,124088],{"__ignoreMap":28},[33,123867,123868],{"class":35,"line":36},[33,123869,8895],{"class":39},[33,123871,123872,123874,123876,123878],{"class":35,"line":43},[33,123873,164],{"class":163},[33,123875,492],{"class":167},[33,123877,495],{"class":163},[33,123879,498],{"class":167},[33,123881,123882,123884,123886,123888],{"class":35,"line":61},[33,123883,190],{"class":163},[33,123885,193],{"class":167},[33,123887,164],{"class":163},[33,123889,198],{"class":167},[33,123891,123892],{"class":35,"line":73},[33,123893,92],{"emptyLinePlaceholder":91},[33,123895,123896,123898,123900,123902,123904],{"class":35,"line":88},[33,123897,57716],{"class":50},[33,123899,212],{"class":163},[33,123901,215],{"class":167},[33,123903,120971],{"class":54},[33,123905,221],{"class":167},[33,123907,123908],{"class":35,"line":95},[33,123909,92],{"emptyLinePlaceholder":91},[33,123911,123912,123914],{"class":35,"line":101},[33,123913,35574],{"class":163},[33,123915,574],{"class":167},[33,123917,123918,123921,123923,123925,123927],{"class":35,"line":171},[33,123919,123920],{"class":167},"    original ",[33,123922,242],{"class":163},[33,123924,9481],{"class":167},[33,123926,120296],{"class":54},[33,123928,221],{"class":167},[33,123930,123931,123934,123936,123938,123940],{"class":35,"line":179},[33,123932,123933],{"class":167},"    exported ",[33,123935,242],{"class":163},[33,123937,9481],{"class":167},[33,123939,57716],{"class":50},[33,123941,221],{"class":167},[33,123943,123944,123946,123948,123951,123953,123955],{"class":35,"line":187},[33,123945,9228],{"class":163},[33,123947,4037],{"class":50},[33,123949,123950],{"class":167},"(exported) ",[33,123952,1865],{"class":163},[33,123954,4037],{"class":50},[33,123956,123957],{"class":167},"(original), 
\\\n",[33,123959,123960,123962,123964,123966,123969,123971,123973,123975,123978,123980],{"class":35,"line":201},[33,123961,9533],{"class":163},[33,123963,119992],{"class":54},[33,123965,4065],{"class":50},[33,123967,123968],{"class":167},"(exported)",[33,123970,1121],{"class":50},[33,123972,71066],{"class":54},[33,123974,4065],{"class":50},[33,123976,123977],{"class":167},"(original)",[33,123979,1121],{"class":50},[33,123981,7504],{"class":54},[33,123983,123984,123986,123988,123991,123993,123995],{"class":35,"line":206},[33,123985,9228],{"class":163},[33,123987,599],{"class":50},[33,123989,123990],{"class":167},"(exported.columns) ",[33,123992,1865],{"class":163},[33,123994,599],{"class":50},[33,123996,123997],{"class":167},"(original.columns), \\\n",[33,123999,124000,124002,124004,124006,124009,124011],{"class":35,"line":224},[33,124001,9533],{"class":163},[33,124003,119946],{"class":54},[33,124005,1115],{"class":50},[33,124007,124008],{"class":167},"exported.columns.tolist()",[33,124010,1121],{"class":50},[33,124012,7504],{"class":54},[33,124014,124015,124017,124019,124021,124024,124026,124028,124030,124032,124034,124036,124039,124041,124044],{"class":35,"line":229},[33,124016,7268],{"class":50},[33,124018,602],{"class":167},[33,124020,4059],{"class":163},[33,124022,124023],{"class":54},"\"Validation passed — ",[33,124025,4065],{"class":50},[33,124027,123968],{"class":167},[33,124029,18801],{"class":163},[33,124031,1121],{"class":50},[33,124033,115486],{"class":54},[33,124035,4065],{"class":50},[33,124037,124038],{"class":167},"(exported.columns)",[33,124040,1121],{"class":50},[33,124042,124043],{"class":54}," 
columns.\"",[33,124045,221],{"class":167},[33,124047,124048,124050,124052,124054],{"class":35,"line":235},[33,124049,35726],{"class":163},[33,124051,2945],{"class":50},[33,124053,1852],{"class":163},[33,124055,7583],{"class":167},[33,124057,124058,124060,124062,124064,124066,124068,124070,124072,124074,124076],{"class":35,"line":250},[33,124059,35742],{"class":163},[33,124061,16617],{"class":50},[33,124063,602],{"class":167},[33,124065,4059],{"class":163},[33,124067,15677],{"class":54},[33,124069,1115],{"class":50},[33,124071,7602],{"class":167},[33,124073,1121],{"class":50},[33,124075,274],{"class":54},[33,124077,221],{"class":167},[33,124079,124080,124082,124084,124086],{"class":35,"line":266},[33,124081,35726],{"class":163},[33,124083,9445],{"class":50},[33,124085,1852],{"class":163},[33,124087,7583],{"class":167},[33,124089,124090,124092,124094,124096,124098,124101,124103,124105,124107,124109],{"class":35,"line":290},[33,124091,35742],{"class":163},[33,124093,16617],{"class":50},[33,124095,602],{"class":167},[33,124097,4059],{"class":163},[33,124099,124100],{"class":54},"\"Validation failed: ",[33,124102,1115],{"class":50},[33,124104,7602],{"class":167},[33,124106,1121],{"class":50},[33,124108,274],{"class":54},[33,124110,221],{"class":167},[2537,124112],{},[18,124114,4209],{"id":4208},[4211,124116,124117,124126,124136,124147,124162],{},[4214,124118,124119,2079,124122,124125],{},[1974,124120,124121],{},"Under 100 MB",[30,124123,124124],{},"df.to_csv()"," is fine. The bottleneck is usually disk I\u002FO, not pandas.",[4214,124127,124128,124131,124132,124135],{},[1974,124129,124130],{},"100 MB – 1 GB",": Add ",[30,124133,124134],{},"compression=\"gzip\""," to cut disk writes by 60–80%. If the DataFrame fits in RAM, in-memory compression is faster than two-step compress.",[4214,124137,124138,124141,124142,69863,124144,124146],{},[1974,124139,124140],{},"Over 1 GB",": Do not load the whole file into RAM. 
Use the chunked ",[30,124143,57237],{},[30,124145,120884],{}," pattern from Step 8, or export directly from a database cursor.",[4214,124148,124149,2079,124152,124154,124155,124158,124159,124161],{},[1974,124150,124151],{},"pyarrow backend",[30,124153,124124],{}," is single-threaded. For very large DataFrames, export to Parquet (",[30,124156,124157],{},"df.to_parquet()",") and convert with ",[30,124160,14295],{}," if the downstream tool supports it; Parquet round-trips are 3–10× faster.",[4214,124163,124164,124167,124168,65087,124170,124172,124173,124176],{},[1974,124165,124166],{},"Multiple files",": If splitting a large export by date or region, ",[30,124169,21820],{},[30,124171,118265],{}," in a loop is fine up to ~50 groups. Beyond that, switch to ",[30,124174,124175],{},"pyarrow.dataset"," partitioned writes.",[2537,124178],{},[18,124180,4271],{"id":4270},[4273,124182,124183,124193],{},[4276,124184,124185],{},[4279,124186,124187,124189,124191],{},[4282,124188,85029],{},[4282,124190,4287],{},[4282,124192,4290],{},[4292,124194,124195,124213,124225,124242,124263,124280,124296],{},[4279,124196,124197,124203,124208],{},[4297,124198,124199,124200,124202],{},"Re-read shows ",[30,124201,117994],{}," column",[4297,124204,124205,124207],{},[30,124206,118005],{}," (default) was used",[4297,124209,124210],{},[30,124211,124212],{},"df.to_csv(path, index=False)",[4279,124214,124215,124218,124221],{},[4297,124216,124217],{},"Accented chars garbled in Excel",[4297,124219,124220],{},"UTF-8 without BOM",[4297,124222,124223],{},[30,124224,59124],{},[4279,124226,124227,124232,124235],{},[4297,124228,120715,124229,124231],{},[30,124230,120718],{}," in output",[4297,124233,124234],{},"IEEE 754 float noise",[4297,124236,124237,2012,124239],{},[30,124238,121288],{},[30,124240,124241],{},"df[\"col\"].round(n)",[4279,124243,124244,124247,124254],{},[4297,124245,124246],{},"Double blank lines on 
Windows",[4297,124248,124249,70939,124251],{},[30,124250,70995],{},[30,124252,124253],{},"newline=\"\"",[4297,124255,14408,124256,36661,124258,22506,124260,124262],{},[30,124257,124253],{},[30,124259,70995],{},[30,124261,118265],{}," handles this internally",[4279,124264,124265,124271,124276],{},[4297,124266,124267,124268],{},"Dates include time ",[30,124269,124270],{},"00:00:00",[4297,124272,124273,124274,4129],{},"No ",[30,124275,119308],{},[4297,124277,124278],{},[30,124279,121305],{},[4279,124281,124282,124287,124290],{},[4297,124283,124284,124286],{},[30,124285,70953],{}," on large export",[4297,124288,124289],{},"Entire DataFrame loaded at once",[4297,124291,124292,124293,124295],{},"Chunked write with ",[30,124294,21944],{}," iterator",[4279,124297,124298,124303,124306],{},[4297,124299,120890,124300,124302],{},[30,124301,117998],{}," rejects the file",[4297,124304,124305],{},"Embedded commas unquoted, or wrong line ending",[4297,124307,124308,124309,124311,124312],{},"Use pipe delimiter + ",[30,124310,114959],{},"; set ",[30,124313,121315],{},[2537,124315],{},[18,124317,4402],{"id":4401},[23,124319,124321],{"className":126,"code":124320,"language":47,"meta":28,"style":28},"# pip install pandas\n\"\"\"\nexport_csv.py — export a DataFrame to CSV with full parameter control.\nUsage: python export_csv.py --source data\u002Finput.csv --out exports\u002Foutput.csv\n                            [--sep ,] [--encoding utf-8-sig] [--gzip]\n                            [--float-format %.2f] [--date-format %Y-%m-%d]\n                            [--no-index]\n\"\"\"\nimport argparse\nimport csv\nimport sys\nfrom pathlib import Path\n\nimport pandas as pd\n\n\ndef build_parser() -> argparse.ArgumentParser:\n    p = argparse.ArgumentParser(description=\"Export DataFrame to CSV\")\n    p.add_argument(\"--source\", required=True, type=Path, help=\"Input CSV path\")\n    p.add_argument(\"--out\", required=True, type=Path, help=\"Output CSV path\")\n    p.add_argument(\"--sep\", 
default=\",\", help=\"Field delimiter (default: ,)\")\n    p.add_argument(\"--encoding\", default=\"utf-8-sig\", help=\"Output encoding\")\n    p.add_argument(\"--gzip\", action=\"store_true\", help=\"Compress output with gzip\")\n    p.add_argument(\"--float-format\", default=\"%.2f\", dest=\"float_format\")\n    p.add_argument(\"--date-format\", default=\"%Y-%m-%d\", dest=\"date_format\")\n    p.add_argument(\"--no-index\", action=\"store_true\", dest=\"no_index\",\n                   help=\"Exclude DataFrame index from output (recommended)\")\n    return p\n\n\ndef main() -> None:\n    args = build_parser().parse_args()\n\n    try:\n        df = pd.read_csv(args.source, parse_dates=True, infer_datetime_format=True)\n    except FileNotFoundError:\n        sys.exit(f\"Source not found: {args.source}\")\n    except pd.errors.ParserError as e:\n        sys.exit(f\"Parse error: {e}\")\n\n    args.out.parent.mkdir(parents=True, exist_ok=True)\n\n    compression = \"gzip\" if args.gzip else None\n    out_path = args.out.with_suffix(args.out.suffix + \".gz\") if args.gzip else args.out\n\n    try:\n        df.to_csv(\n            out_path,\n            index=not args.no_index,\n            sep=args.sep,\n            encoding=args.encoding,\n            float_format=args.float_format,\n            date_format=args.date_format,\n            lineterminator=\"\\r\\n\",\n            na_rep=\"\",\n            compression=compression,\n        )\n    except OSError as e:\n        sys.exit(f\"Write failed: {e}\")\n\n    # Validation round-trip\n    try:\n        check = pd.read_csv(out_path, compression=compression or \"infer\")\n        expected_rows = len(df)\n        if len(check) != expected_rows:\n            sys.exit(f\"Row count mismatch after export: {len(check)} vs {expected_rows}\")\n        print(f\"Exported {len(df):,} rows × {len(df.columns)} cols → {out_path}\")\n    except Exception as e:\n        sys.exit(f\"Validation failed: {e}\")\n\n\nif __name__ == 
\"__main__\":\n    main()\n",[30,124322,124323,124327,124331,124336,124341,124346,124351,124356,124360,124366,124372,124378,124388,124392,124402,124406,124410,124420,124437,124469,124500,124526,124552,124579,124610,124640,124666,124678,124685,124689,124693,124705,124714,124718,124724,124749,124757,124776,124787,124806,124810,124831,124835,124854,124879,124883,124889,124894,124899,124910,124919,124928,124938,124948,124963,124974,124984,124988,124998,125016,125020,125025,125031,125055,125066,125080,125108,125147,125157,125175,125179,125183,125195],{"__ignoreMap":28},[33,124324,124325],{"class":35,"line":36},[33,124326,8895],{"class":39},[33,124328,124329],{"class":35,"line":43},[33,124330,139],{"class":54},[33,124332,124333],{"class":35,"line":61},[33,124334,124335],{"class":54},"export_csv.py — export a DataFrame to CSV with full parameter control.\n",[33,124337,124338],{"class":35,"line":73},[33,124339,124340],{"class":54},"Usage: python export_csv.py --source data\u002Finput.csv --out exports\u002Foutput.csv\n",[33,124342,124343],{"class":35,"line":88},[33,124344,124345],{"class":54},"                            [--sep ,] [--encoding utf-8-sig] [--gzip]\n",[33,124347,124348],{"class":35,"line":95},[33,124349,124350],{"class":54},"                            [--float-format %.2f] [--date-format %Y-%m-%d]\n",[33,124352,124353],{"class":35,"line":101},[33,124354,124355],{"class":54},"                            
[--no-index]\n",[33,124357,124358],{"class":35,"line":171},[33,124359,139],{"class":54},[33,124361,124362,124364],{"class":35,"line":179},[33,124363,164],{"class":163},[33,124365,4461],{"class":167},[33,124367,124368,124370],{"class":35,"line":187},[33,124369,164],{"class":163},[33,124371,107673],{"class":167},[33,124373,124374,124376],{"class":35,"line":201},[33,124375,164],{"class":163},[33,124377,168],{"class":167},[33,124379,124380,124382,124384,124386],{"class":35,"line":206},[33,124381,190],{"class":163},[33,124383,193],{"class":167},[33,124385,164],{"class":163},[33,124387,198],{"class":167},[33,124389,124390],{"class":35,"line":224},[33,124391,92],{"emptyLinePlaceholder":91},[33,124393,124394,124396,124398,124400],{"class":35,"line":229},[33,124395,164],{"class":163},[33,124397,492],{"class":167},[33,124399,495],{"class":163},[33,124401,498],{"class":167},[33,124403,124404],{"class":35,"line":235},[33,124405,92],{"emptyLinePlaceholder":91},[33,124407,124408],{"class":35,"line":250},[33,124409,92],{"emptyLinePlaceholder":91},[33,124411,124412,124414,124417],{"class":35,"line":266},[33,124413,562],{"class":163},[33,124415,124416],{"class":46}," build_parser",[33,124418,124419],{"class":167},"() -> argparse.ArgumentParser:\n",[33,124421,124422,124424,124426,124428,124430,124432,124435],{"class":35,"line":290},[33,124423,18452],{"class":167},[33,124425,242],{"class":163},[33,124427,6653],{"class":167},[33,124429,6656],{"class":238},[33,124431,242],{"class":163},[33,124433,124434],{"class":54},"\"Export DataFrame to CSV\"",[33,124436,221],{"class":167},[33,124438,124439,124442,124445,124447,124449,124451,124453,124455,124457,124459,124461,124463,124465,124467],{"class":35,"line":295},[33,124440,124441],{"class":167},"    
p.add_argument(",[33,124443,124444],{"class":54},"\"--source\"",[33,124446,365],{"class":167},[33,124448,25448],{"class":238},[33,124450,242],{"class":163},[33,124452,855],{"class":50},[33,124454,365],{"class":167},[33,124456,6677],{"class":238},[33,124458,242],{"class":163},[33,124460,6682],{"class":167},[33,124462,25463],{"class":238},[33,124464,242],{"class":163},[33,124466,107039],{"class":54},[33,124468,221],{"class":167},[33,124470,124471,124473,124475,124477,124479,124481,124483,124485,124487,124489,124491,124493,124495,124498],{"class":35,"line":300},[33,124472,124441],{"class":167},[33,124474,41152],{"class":54},[33,124476,365],{"class":167},[33,124478,25448],{"class":238},[33,124480,242],{"class":163},[33,124482,855],{"class":50},[33,124484,365],{"class":167},[33,124486,6677],{"class":238},[33,124488,242],{"class":163},[33,124490,6682],{"class":167},[33,124492,25463],{"class":238},[33,124494,242],{"class":163},[33,124496,124497],{"class":54},"\"Output CSV path\"",[33,124499,221],{"class":167},[33,124501,124502,124504,124507,124509,124511,124513,124515,124517,124519,124521,124524],{"class":35,"line":317},[33,124503,124441],{"class":167},[33,124505,124506],{"class":54},"\"--sep\"",[33,124508,365],{"class":167},[33,124510,6685],{"class":238},[33,124512,242],{"class":163},[33,124514,15900],{"class":54},[33,124516,365],{"class":167},[33,124518,25463],{"class":238},[33,124520,242],{"class":163},[33,124522,124523],{"class":54},"\"Field delimiter (default: ,)\"",[33,124525,221],{"class":167},[33,124527,124528,124530,124533,124535,124537,124539,124541,124543,124545,124547,124550],{"class":35,"line":332},[33,124529,124441],{"class":167},[33,124531,124532],{"class":54},"\"--encoding\"",[33,124534,365],{"class":167},[33,124536,6685],{"class":238},[33,124538,242],{"class":163},[33,124540,108390],{"class":54},[33,124542,365],{"class":167},[33,124544,25463],{"class":238},[33,124546,242],{"class":163},[33,124548,124549],{"class":54},"\"Output 
encoding\"",[33,124551,221],{"class":167},[33,124553,124554,124556,124559,124561,124564,124566,124568,124570,124572,124574,124577],{"class":35,"line":347},[33,124555,124441],{"class":167},[33,124557,124558],{"class":54},"\"--gzip\"",[33,124560,365],{"class":167},[33,124562,124563],{"class":238},"action",[33,124565,242],{"class":163},[33,124567,6740],{"class":54},[33,124569,365],{"class":167},[33,124571,25463],{"class":238},[33,124573,242],{"class":163},[33,124575,124576],{"class":54},"\"Compress output with gzip\"",[33,124578,221],{"class":167},[33,124580,124581,124583,124586,124588,124590,124592,124594,124596,124598,124600,124603,124605,124608],{"class":35,"line":374},[33,124582,124441],{"class":167},[33,124584,124585],{"class":54},"\"--float-format\"",[33,124587,365],{"class":167},[33,124589,6685],{"class":238},[33,124591,242],{"class":163},[33,124593,274],{"class":54},[33,124595,54896],{"class":50},[33,124597,274],{"class":54},[33,124599,365],{"class":167},[33,124601,124602],{"class":238},"dest",[33,124604,242],{"class":163},[33,124606,124607],{"class":54},"\"float_format\"",[33,124609,221],{"class":167},[33,124611,124612,124614,124617,124619,124621,124623,124625,124627,124629,124631,124633,124635,124638],{"class":35,"line":397},[33,124613,124441],{"class":167},[33,124615,124616],{"class":54},"\"--date-format\"",[33,124618,365],{"class":167},[33,124620,6685],{"class":238},[33,124622,242],{"class":163},[33,124624,1244],{"class":54},[33,124626,916],{"class":50},[33,124628,274],{"class":54},[33,124630,365],{"class":167},[33,124632,124602],{"class":238},[33,124634,242],{"class":163},[33,124636,124637],{"class":54},"\"date_format\"",[33,124639,221],{"class":167},[33,124641,124642,124644,124647,124649,124651,124653,124655,124657,124659,124661,124664],{"class":35,"line":653},[33,124643,124441],{"class":167},[33,124645,124646],{"class":54},"\"--no-index\"",[33,124648,365],{"class":167},[33,124650,124563],{"class":238},[33,124652,242],{"class":163},[33,124654,6740],{"clas
s":54},[33,124656,365],{"class":167},[33,124658,124602],{"class":238},[33,124660,242],{"class":163},[33,124662,124663],{"class":54},"\"no_index\"",[33,124665,247],{"class":167},[33,124667,124668,124671,124673,124676],{"class":35,"line":667},[33,124669,124670],{"class":238},"                   help",[33,124672,242],{"class":163},[33,124674,124675],{"class":54},"\"Exclude DataFrame index from output (recommended)\"",[33,124677,221],{"class":167},[33,124679,124680,124682],{"class":35,"line":675},[33,124681,1332],{"class":163},[33,124683,124684],{"class":167}," p\n",[33,124686,124687],{"class":35,"line":689},[33,124688,92],{"emptyLinePlaceholder":91},[33,124690,124691],{"class":35,"line":703},[33,124692,92],{"emptyLinePlaceholder":91},[33,124694,124695,124697,124699,124701,124703],{"class":35,"line":714},[33,124696,562],{"class":163},[33,124698,6636],{"class":46},[33,124700,568],{"class":167},[33,124702,571],{"class":50},[33,124704,574],{"class":167},[33,124706,124707,124709,124711],{"class":35,"line":723},[33,124708,6766],{"class":167},[33,124710,242],{"class":163},[33,124712,124713],{"class":167}," build_parser().parse_args()\n",[33,124715,124716],{"class":35,"line":754},[33,124717,92],{"emptyLinePlaceholder":91},[33,124719,124720,124722],{"class":35,"line":771},[33,124721,2424],{"class":163},[33,124723,574],{"class":167},[33,124725,124726,124728,124730,124733,124735,124737,124739,124741,124743,124745,124747],{"class":35,"line":777},[33,124727,7930],{"class":167},[33,124729,242],{"class":163},[33,124731,124732],{"class":167}," pd.read_csv(args.source, 
",[33,124734,102641],{"class":238},[33,124736,242],{"class":163},[33,124738,855],{"class":50},[33,124740,365],{"class":167},[33,124742,12274],{"class":238},[33,124744,242],{"class":163},[33,124746,855],{"class":50},[33,124748,221],{"class":167},[33,124750,124751,124753,124755],{"class":35,"line":788},[33,124752,2449],{"class":163},[33,124754,2945],{"class":50},[33,124756,574],{"class":167},[33,124758,124759,124761,124763,124765,124767,124770,124772,124774],{"class":35,"line":804},[33,124760,2995],{"class":167},[33,124762,4059],{"class":163},[33,124764,90279],{"class":54},[33,124766,1115],{"class":50},[33,124768,124769],{"class":167},"args.source",[33,124771,1121],{"class":50},[33,124773,274],{"class":54},[33,124775,221],{"class":167},[33,124777,124778,124780,124783,124785],{"class":35,"line":809},[33,124779,2449],{"class":163},[33,124781,124782],{"class":167}," pd.errors.ParserError ",[33,124784,495],{"class":163},[33,124786,7583],{"class":167},[33,124788,124789,124791,124793,124796,124798,124800,124802,124804],{"class":35,"line":819},[33,124790,2995],{"class":167},[33,124792,4059],{"class":163},[33,124794,124795],{"class":54},"\"Parse error: ",[33,124797,1115],{"class":50},[33,124799,7602],{"class":167},[33,124801,1121],{"class":50},[33,124803,274],{"class":54},[33,124805,221],{"class":167},[33,124807,124808],{"class":35,"line":829},[33,124809,92],{"emptyLinePlaceholder":91},[33,124811,124812,124815,124817,124819,124821,124823,124825,124827,124829],{"class":35,"line":834},[33,124813,124814],{"class":167},"    
args.out.parent.mkdir(",[33,124816,869],{"class":238},[33,124818,242],{"class":163},[33,124820,855],{"class":50},[33,124822,365],{"class":167},[33,124824,878],{"class":238},[33,124826,242],{"class":163},[33,124828,855],{"class":50},[33,124830,221],{"class":167},[33,124832,124833],{"class":35,"line":839},[33,124834,92],{"emptyLinePlaceholder":91},[33,124836,124837,124840,124842,124845,124847,124850,124852],{"class":35,"line":860},[33,124838,124839],{"class":167},"    compression ",[33,124841,242],{"class":163},[33,124843,124844],{"class":54}," \"gzip\"",[33,124846,9994],{"class":163},[33,124848,124849],{"class":167}," args.gzip ",[33,124851,7489],{"class":163},[33,124853,3852],{"class":50},[33,124855,124856,124858,124860,124863,124865,124868,124870,124872,124874,124876],{"class":35,"line":887},[33,124857,6388],{"class":167},[33,124859,242],{"class":163},[33,124861,124862],{"class":167}," args.out.with_suffix(args.out.suffix ",[33,124864,1811],{"class":163},[33,124866,124867],{"class":54}," \".gz\"",[33,124869,1649],{"class":167},[33,124871,2491],{"class":163},[33,124873,124849],{"class":167},[33,124875,7489],{"class":163},[33,124877,124878],{"class":167}," args.out\n",[33,124880,124881],{"class":35,"line":907},[33,124882,92],{"emptyLinePlaceholder":91},[33,124884,124885,124887],{"class":35,"line":1826},[33,124886,2424],{"class":163},[33,124888,574],{"class":167},[33,124890,124891],{"class":35,"line":1844},[33,124892,124893],{"class":167},"        df.to_csv(\n",[33,124895,124896],{"class":35,"line":1858},[33,124897,124898],{"class":167},"            out_path,\n",[33,124900,124901,124904,124907],{"class":35,"line":1871},[33,124902,124903],{"class":238},"            index",[33,124905,124906],{"class":163},"=not",[33,124908,124909],{"class":167}," 
args.no_index,\n",[33,124911,124912,124914,124916],{"class":35,"line":1877},[33,124913,113197],{"class":238},[33,124915,242],{"class":163},[33,124917,124918],{"class":167},"args.sep,\n",[33,124920,124921,124923,124925],{"class":35,"line":1883},[33,124922,113176],{"class":238},[33,124924,242],{"class":163},[33,124926,124927],{"class":167},"args.encoding,\n",[33,124929,124930,124933,124935],{"class":35,"line":1915},[33,124931,124932],{"class":238},"            float_format",[33,124934,242],{"class":163},[33,124936,124937],{"class":167},"args.float_format,\n",[33,124939,124940,124943,124945],{"class":35,"line":1926},[33,124941,124942],{"class":238},"            date_format",[33,124944,242],{"class":163},[33,124946,124947],{"class":167},"args.date_format,\n",[33,124949,124950,124953,124955,124957,124959,124961],{"class":35,"line":1932},[33,124951,124952],{"class":238},"            lineterminator",[33,124954,242],{"class":163},[33,124956,274],{"class":54},[33,124958,121174],{"class":50},[33,124960,274],{"class":54},[33,124962,247],{"class":167},[33,124964,124965,124968,124970,124972],{"class":35,"line":1938},[33,124966,124967],{"class":238},"            na_rep",[33,124969,242],{"class":163},[33,124971,3198],{"class":54},[33,124973,247],{"class":167},[33,124975,124976,124979,124981],{"class":35,"line":1950},[33,124977,124978],{"class":238},"            
compression",[33,124980,242],{"class":163},[33,124982,124983],{"class":167},"compression,\n",[33,124985,124986],{"class":35,"line":1958},[33,124987,5867],{"class":167},[33,124989,124990,124992,124994,124996],{"class":35,"line":4904},[33,124991,2449],{"class":163},[33,124993,107953],{"class":50},[33,124995,1852],{"class":163},[33,124997,7583],{"class":167},[33,124999,125000,125002,125004,125006,125008,125010,125012,125014],{"class":35,"line":4909},[33,125001,2995],{"class":167},[33,125003,4059],{"class":163},[33,125005,118410],{"class":54},[33,125007,1115],{"class":50},[33,125009,7602],{"class":167},[33,125011,1121],{"class":50},[33,125013,274],{"class":54},[33,125015,221],{"class":167},[33,125017,125018],{"class":35,"line":4915},[33,125019,92],{"emptyLinePlaceholder":91},[33,125021,125022],{"class":35,"line":4925},[33,125023,125024],{"class":39},"    # Validation round-trip\n",[33,125026,125027,125029],{"class":35,"line":4935},[33,125028,2424],{"class":163},[33,125030,574],{"class":167},[33,125032,125033,125036,125038,125041,125043,125045,125048,125050,125053],{"class":35,"line":4941},[33,125034,125035],{"class":167},"        check ",[33,125037,242],{"class":163},[33,125039,125040],{"class":167}," pd.read_csv(out_path, ",[33,125042,109446],{"class":238},[33,125044,242],{"class":163},[33,125046,125047],{"class":167},"compression ",[33,125049,7162],{"class":163},[33,125051,125052],{"class":54}," \"infer\"",[33,125054,221],{"class":167},[33,125056,125057,125060,125062,125064],{"class":35,"line":4950},[33,125058,125059],{"class":167},"        expected_rows ",[33,125061,242],{"class":163},[33,125063,4037],{"class":50},[33,125065,13477],{"class":167},[33,125067,125068,125070,125072,125075,125077],{"class":35,"line":4960},[33,125069,8221],{"class":163},[33,125071,4037],{"class":50},[33,125073,125074],{"class":167},"(check) ",[33,125076,17877],{"class":163},[33,125078,125079],{"class":167}," 
expected_rows:\n",[33,125081,125082,125084,125086,125089,125091,125094,125096,125098,125100,125102,125104,125106],{"class":35,"line":4965},[33,125083,41280],{"class":167},[33,125085,4059],{"class":163},[33,125087,125088],{"class":54},"\"Row count mismatch after export: ",[33,125090,4065],{"class":50},[33,125092,125093],{"class":167},"(check)",[33,125095,1121],{"class":50},[33,125097,71066],{"class":54},[33,125099,1115],{"class":50},[33,125101,21514],{"class":167},[33,125103,1121],{"class":50},[33,125105,274],{"class":54},[33,125107,221],{"class":167},[33,125109,125110,125112,125114,125116,125118,125120,125122,125124,125126,125128,125130,125133,125135,125137,125139,125141,125143,125145],{"class":35,"line":4971},[33,125111,9414],{"class":50},[33,125113,602],{"class":167},[33,125115,4059],{"class":163},[33,125117,44444],{"class":54},[33,125119,4065],{"class":50},[33,125121,4068],{"class":167},[33,125123,18801],{"class":163},[33,125125,1121],{"class":50},[33,125127,16022],{"class":54},[33,125129,4065],{"class":50},[33,125131,125132],{"class":167},"(df.columns)",[33,125134,1121],{"class":50},[33,125136,16035],{"class":54},[33,125138,1115],{"class":50},[33,125140,40722],{"class":167},[33,125142,1121],{"class":50},[33,125144,274],{"class":54},[33,125146,221],{"class":167},[33,125148,125149,125151,125153,125155],{"class":35,"line":4983},[33,125150,2449],{"class":163},[33,125152,783],{"class":50},[33,125154,1852],{"class":163},[33,125156,7583],{"class":167},[33,125158,125159,125161,125163,125165,125167,125169,125171,125173],{"class":35,"line":4988},[33,125160,2995],{"class":167},[33,125162,4059],{"class":163},[33,125164,124100],{"class":54},[33,125166,1115],{"class":50},[33,125168,7602],{"class":167},[33,125170,1121],{"class":50},[33,125172,274],{"class":54},[33,125174,221],{"class":167},[33,125176,125177],{"class":35,"line":4993},[33,125178,92],{"emptyLinePlaceholder":91},[33,125180,125181],{"class":35,"line":5003},[33,125182,92],{"emptyLinePlaceholder":91},[33,125184,12518
5,125187,125189,125191,125193],{"class":35,"line":5008},[33,125186,2491],{"class":163},[33,125188,2494],{"class":50},[33,125190,2497],{"class":163},[33,125192,2500],{"class":54},[33,125194,574],{"class":167},[33,125196,125197],{"class":35,"line":5014},[33,125198,6914],{"class":167},[2537,125200],{},[18,125202,88566],{"id":29183},[14,125204,125205,107278,125208,125210],{},[1974,125206,125207],{},"How do I export a CSV that opens correctly in Excel without garbled characters?",[30,125209,59124],{},". The three-byte BOM tells Excel to decode the file as UTF-8 instead of the system ANSI code page.",[14,125212,125213,125216,125217,125220,125221,125223],{},[1974,125214,125215],{},"What is the fastest way to export millions of rows?","\nIf the DataFrame fits in RAM: ",[30,125218,125219],{},"df.to_csv(path, compression=\"gzip\")"," — disk I\u002FO is the bottleneck and gzip cuts it by 70–80%. If it does not fit: stream chunks through ",[30,125222,120884],{}," as shown in Step 8.",[14,125225,125226,107296,125229,36661,125231,125233,125234,3035],{},[1974,125227,125228],{},"How do I prevent pandas from writing row numbers as the first column?",[30,125230,28142],{},[30,125232,46123],{},". The detailed diagnosis of what goes wrong without it is in ",[940,125235,28147],{"href":28146},[14,125237,125238,125241,125242,10065,125244,125247,125248,125250,125251,8877,125253,125255,125256,125259],{},[1974,125239,125240],{},"Can I append to an existing CSV without overwriting it?","\nYes. Open with ",[30,125243,123685],{},[30,125245,125246],{},"header=False"," in pandas, or open the file in ",[30,125249,118117],{}," mode and skip ",[30,125252,123689],{},[30,125254,120884],{},". 
Check ",[30,125257,125258],{},"OUT.stat().st_size == 0"," to decide whether the header is needed.",[14,125261,125262,125265,125268,125269,125272,125273,3035],{},[1974,125263,125264],{},"How do I export multiple DataFrames to one CSV?",[30,125266,125267],{},"pd.concat([df1, df2, df3], ignore_index=True).to_csv(path, index=False)",". If the DataFrames are too large to concat in RAM, write the first with ",[30,125270,125271],{},"header=True",", the rest with ",[30,125274,125275],{},"mode=\"a\", header=False",[2537,125277],{},[18,125279,6918],{"id":6917},[4211,125281,125282,125290,125295,125300],{},[4214,125283,125284,125286,125287,125289],{},[940,125285,28147],{"href":28146}," — full diagnosis of the ",[30,125288,117994],{}," problem",[4214,125291,125292,125294],{},[940,125293,9599],{"href":9598}," — clean data before you export it",[4214,125296,125297,125299],{},[940,125298,99577],{"href":99576}," — ingestion side of the same pipeline",[4214,125301,125302,125304,125305],{},[940,125303,6936],{"href":6935}," — when CSV is not enough and you need formatted ",[30,125306,26542],{},[14,125308,6947,125309,3035],{},[940,125310,26258],{"href":26257},[6953,125312,125313],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: 
var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":125315},[125316,125317,125318,125319,125321,125325,125329,125334,125335,125336,125337,125338,125339,125340,125341,125342],{"id":20,"depth":43,"text":21},{"id":120489,"depth":43,"text":120490},{"id":120728,"depth":43,"text":120729},{"id":120905,"depth":43,"text":125320},"Step 3 — Core to_csv Configuration",{"id":121325,"depth":43,"text":121326,"children":125322},[125323,125324],{"id":121329,"depth":61,"text":121330},{"id":121555,"depth":61,"text":121556},{"id":121834,"depth":43,"text":121835,"children":125326},[125327,125328],{"id":121838,"depth":61,"text":121839},{"id":122099,"depth":61,"text":122100},{"id":122323,"depth":43,"text":122324,"children":125330},[125331,125332,125333],{"id":122327,"depth":61,"text":122328},{"id":122545,"depth":61,"text":122546},{"id":122763,"depth":61,"text":122764},{"id":122958,"depth":43,"text":122959},{"id":123320,"depth":43,"text":123321},{"id":123702,"depth":43,"text":123703},{"id":52029,"depth":43,"text":52030},{"id":4208,"depth":43,"text":4209},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":29183,"depth":43,"text":88566},{"id":6917,"depth":43,"text":6918},"Exporting to CSV",{},"\u002Fpython-for-excel-csv-data-processing\u002Fexporting-data-to-csv-formats",{"title":108865,"description":125347},{"Complete guide to exporting DataFrames and dicts to CSV with pandas and the csv module":125348,"date":46387,"updatedAt":6978,"tags":125349},"delimiters, encoding, index handling, float precision, dates, chunking, gzip, and BI-tool conventions.",[107436,47,9630,125350],"data export","Exporting Data to CSV Formats with 
Python","python-for-excel-csv-data-processing\u002Fexporting-data-to-csv-formats\u002Findex","upm4hMYYA6ncQ1lntZ1LJYc-XY6QnOKj3M7Fjo4w9cE",{"id":125355,"title":26258,"body":125356,"breadcrumbTitle":6977,"canonical":6977,"date":46387,"description":128330,"draft":6980,"extension":6981,"image":6977,"meta":128331,"navigation":91,"path":128332,"robots":6977,"seo":128333,"seoTitle":128334,"stem":128335,"tags":128336,"updatedAt":6978,"__hash__":128337},"content\u002Fpython-for-excel-csv-data-processing\u002Findex.md",{"type":7,"value":125357,"toc":128313},[125358,125361,125367,125376,125380,125383,125491,125493,125496,125632,125644,125646,125667,125704,125710,125713,125769,125773,125776,125780,125799,126067,126079,126083,126101,126395,126411,126415,126421,126711,126722,126726,126749,127053,127062,127066,127069,127107,127403,127428,127432,127446,127694,127706,127708,127715,128062,128092,128094,128201,128203,128209,128223,128241,128256,128262,128264,128296,128306,128310],[10,125359,26258],{"id":125360},"python-for-excel-csv-data-processing",[14,125362,125363,125364,125366],{},"Manual spreadsheet work breaks down the moment volume, cadence, or auditability matters. A monthly report assembled by hand from a dozen CSV exports is a chain of undocumented copy-paste steps: a wrong sort order, a stray decimal, a ",[30,125365,95790],{}," against a stale tab, and the number that lands on a stakeholder's desk is wrong with no diff to trace it back. The failure mode scales badly in three independent directions at once. Volume: a spreadsheet that opens fine at 50,000 rows freezes or silently truncates past Excel's row limit. Cadence: a report that is tolerable to assemble by hand once a quarter becomes a daily firefight when the business asks for it weekly. Auditability: when finance asks why last month's total differs from the system of record, a manual workflow has no answer because no step was recorded. 
VBA macros move the problem rather than solve it — they live inside a single workbook, resist version control, fail silently across Office versions, and cannot run on a Linux server or in a CI runner.",[14,125368,125369,125370,1351,125373,125375],{},"Python replaces that fragility with a script: deterministic, testable, reviewable in a pull request, and runnable unattended on a schedule. The same script produces the same output from the same input every time, a diff shows exactly what changed when logic is updated, and a failing run leaves a stack trace instead of a quietly wrong number. This guide covers the full path from raw ",[30,125371,125372],{},".csv",[30,125374,26542],{}," ingestion through cleaning, type coercion, multi-file consolidation, and serialization into outputs your BI tools and databases can trust — with the recurring failure modes called out at each stage so you can defend against them before they reach production.",[18,125377,125379],{"id":125378},"the-data-flow-at-a-glance","The data-flow at a glance",[14,125381,125382],{},"Every Excel\u002FCSV automation, however large, is the same four stages: read heterogeneous inputs into a common in-memory representation, clean and coerce them to a stable schema, consolidate across files, and serialize the result for a downstream consumer. 
The sections below walk each stage in order; the diagram fixes the vocabulary.",[2540,125384,2547,125386,2547,125389,2547,125392,2547,125406,2547,125408,2547,125412,2547,125415,2547,125417,2547,125420,2547,125423,2547,125427,2547,125430,2547,125433,2547,125437,2547,125440,2547,125443,2547,125446,2547,125450,2547,125453,2547,125456,2547,125459,2547,125462,2547,125465,2547,125468,2547,125470,2547,125474,2547,125477,2547,125479,2547,125482,2547,125485,2547,125487],{"viewBox":11071,"role":2543,"ariaLabel":125385,"xmlns":2545,"style":2546},"Data flow from CSV and XLSX inputs through cleaning, coercion, merge, and export",[2549,125387,125388],{},"Excel and CSV processing pipeline",[2553,125390,125391],{},"Raw CSV and XLSX files are read into pandas, cleaned and type-coerced, merged into one frame, then exported to CSV, XLSX, or Parquet for BI and databases.",[2557,125393,2559,125394,2559,125401,2547],{},[2561,125395,2564,125397,2564,125399,2559],{"id":125396,"x1":748,"y1":748,"x2":734,"y2":748},"excel-pillar-grad",[2566,125398],{"offset":748,"style":2568},[2566,125400],{"offset":734,"style":2571},[2573,125402,2564,125404,2559],{"id":125403,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"excel-pillar-arrow",[2580,125405],{"d":2582,"fill":2583},[2585,125407],{"x":24213,"y":26323,"width":2635,"height":2680,"rx":2577,"fill":2592,"stroke":2593,"style":2594},[2000,125409,125411],{"x":125410,"y":89047,"fill":2599,"style":16983},"91","CSV exports",[2000,125413,125414],{"x":125410,"y":2597,"fill":2583,"style":2685},"ERP \u002F CRM dumps",[2585,125416],{"x":24213,"y":2679,"width":2635,"height":2680,"rx":2577,"fill":2592,"stroke":2593,"style":2594},[2000,125418,125419],{"x":125410,"y":26345,"fill":2599,"style":16983},"XLSX 
workbooks",[2000,125421,125422],{"x":125410,"y":2635,"fill":2583,"style":2685},"multi-sheet",[35,125424],{"x1":11198,"y1":2653,"x2":125425,"y2":26349,"stroke":2583,"markerEnd":125426,"style":2594},"214","url(#excel-pillar-arrow)",[35,125428],{"x1":11198,"y1":125429,"x2":125425,"y2":11099,"stroke":2583,"markerEnd":125426,"style":2594},"135",[2585,125431],{"x":71581,"y":16991,"width":2635,"height":2680,"rx":2577,"fill":125432,"stroke":2593,"style":2594},"url(#excel-pillar-grad)",[2000,125434,125436],{"x":110862,"y":125435,"fill":2599,"style":16983},"97","Read",[2000,125438,125439],{"x":110862,"y":2629,"fill":2583,"style":2685},"encoding \u002F dtype",[35,125441],{"x1":114632,"y1":2650,"x2":125442,"y2":2650,"stroke":2583,"markerEnd":125426,"style":2594},"414",[2585,125444],{"x":125445,"y":16991,"width":2635,"height":2680,"rx":2577,"fill":125432,"stroke":2593,"style":2594},"416",[2000,125447,125449],{"x":125448,"y":125435,"fill":2599,"style":16983},"491","Clean + coerce",[2000,125451,125452],{"x":125448,"y":2629,"fill":2583,"style":2685},"schema normalize",[35,125454],{"x1":125448,"y1":17025,"x2":125448,"y2":125455,"stroke":2583,"markerEnd":125426,"style":2594},"185",[2585,125457],{"x":125445,"y":125458,"width":2635,"height":2680,"rx":2577,"fill":125432,"stroke":2593,"style":2594},"188",[2000,125460,125461],{"x":125448,"y":58401,"fill":2599,"style":16983},"Merge",[2000,125463,125464],{"x":125448,"y":11126,"fill":2583,"style":2685},"concat \u002F join \u002F dedup",[35,125466],{"x1":125467,"y1":110835,"x2":102539,"y2":110835,"stroke":2583,"markerEnd":125426,"style":2594},"566",[2585,125469],{"x":17013,"y":2650,"width":26332,"height":2680,"rx":2577,"fill":2592,"stroke":2593,"style":2594},[2000,125471,125473],{"x":125472,"y":71551,"fill":2599,"style":16983},"677","CSV \u002F XLSX",[2000,125475,125476],{"x":125472,"y":2609,"fill":2583,"style":2685},"BI 
dashboards",[2585,125478],{"x":17013,"y":125458,"width":26332,"height":2680,"rx":2577,"fill":2592,"stroke":2593,"style":2594},[2000,125480,125481],{"x":125472,"y":58401,"fill":2599,"style":16983},"Parquet",[2000,125483,125484],{"x":125472,"y":11126,"fill":2583,"style":2685},"database load",[35,125486],{"x1":125467,"y1":110835,"x2":17013,"y2":2610,"stroke":2593,"style":2594},[2000,125488,125490],{"x":2626,"y":125489,"fill":2583,"style":2600},"285","Read → Clean & coerce → Merge → Serialize",[18,125492,26469],{"id":26468},[14,125494,125495],{},"Six libraries cover nearly every Excel\u002FCSV task. Reach for the lightest tool that does the job — pull in a DataFrame only when you need tabular operations, and drop to the standard library for plain streaming.",[4273,125497,125498,125510],{},[4276,125499,125500],{},[4279,125501,125502,125504,125506,125508],{},[4282,125503,26485],{},[4282,125505,64975],{},[4282,125507,26491],{},[4282,125509,26494],{},[4292,125511,125512,125530,125554,125574,125589,125615],{},[4279,125513,125514,125520,125523,125527],{},[4297,125515,125516],{},[940,125517,125518],{"href":9598},[1974,125519,9630],{},[4297,125521,125522],{},"Tabular cleaning, joins, aggregation, type coercion",[4297,125524,125525],{},[30,125526,26527],{},[4297,125528,125529],{},"Streaming a multi-GB CSV row-by-row where you never need the whole frame in memory",[4279,125531,125532,125536,125544,125548],{},[4297,125533,125534],{},[1974,125535,22009],{},[4297,125537,125538,125539,125541,125542,101882],{},"Reading\u002Fwriting ",[30,125540,26542],{}," with styles, formulas, charts; pandas' default ",[30,125543,26542],{},[4297,125545,125546],{},[30,125547,26548],{},[4297,125549,125550,125551,125553],{},"High-volume write-only output (slower than xlsxwriter); legacy ",[30,125552,112255],{}," files",[4279,125555,125556,125560,125566,125571],{},[4297,125557,125558],{},[1974,125559,17066],{},[4297,125561,125562,125563,125565],{},"Fast write-only ",[30,125564,26542],{}," with 
formatting, charts, conditional formats",[4297,125567,125568],{},[30,125569,125570],{},"pip install xlsxwriter",[4297,125572,125573],{},"Reading or editing an existing workbook — it is write-only",[4279,125575,125576,125580,125583,125586],{},[4297,125577,125578],{},[1974,125579,107536],{},[4297,125581,125582],{},"Constant-memory streaming of huge or malformed delimited files",[4297,125584,125585],{},"built in",[4297,125587,125588],{},"Anything needing joins, group-bys, or numeric coercion across columns",[4279,125590,125591,125596,125602,125607],{},[4297,125592,125593],{},[1974,125594,125595],{},"xlrd",[4297,125597,125598,125599,125601],{},"Reading legacy ",[30,125600,112255],{}," (BIFF) workbooks only",[4297,125603,125604],{},[30,125605,125606],{},"pip install xlrd",[4297,125608,125609,125611,125612,125614],{},[30,125610,26542],{}," files — xlrd dropped ",[30,125613,26542],{}," support in 2.0; use openpyxl instead",[4279,125616,125617,125621,125624,125629],{},[4297,125618,125619],{},[1974,125620,14295],{},[4297,125622,125623],{},"Columnar Parquet I\u002FO, fast typed serialization, large-data interchange",[4297,125625,125626],{},[30,125627,125628],{},"pip install pyarrow",[4297,125630,125631],{},"Sharing with non-technical stakeholders who expect a clickable spreadsheet",[14,125633,125634,125635,125640,125641,125643],{},"The library you pick for writing matters as much as the one for reading: ",[940,125636,125637,125639],{"href":102073},[1974,125638,22009],{}," is covered in depth for formulas and charts",", while xlsxwriter wins when you only ever produce new files. A useful rule of thumb: do all transformation work in pandas because its vectorized operations are an order of magnitude faster than cell-by-cell loops, and only drop to openpyxl or xlsxwriter for the final formatting pass. The two are not competitors at the same layer — pandas is the data layer, the Excel libraries are the presentation layer, and a typical pipeline uses both. 
The standard-library ",[30,125642,107436],{}," module sits below all of them: it never builds a DataFrame, so its memory footprint stays flat regardless of file size, which is exactly what you want for a 5 GB log you only need to filter and re-emit line by line.",[18,125645,26619],{"id":26618},[14,125647,125648,125649,125651,125652,125655,125656,42238,125658,125660,125661,125663,125664,125666],{},"Isolate every automation in a virtualenv so a ",[30,125650,9630],{}," upgrade for one job never silently changes another. This is not bureaucratic caution: pandas has a history of behavior changes between minor versions — default dtypes, the handling of ",[30,125653,125654],{},"mixed","-format dates, the deprecation of ",[30,125657,125595],{},[30,125659,26542],{}," — any of which can change a number without changing your code. Pin versions in a ",[30,125662,26625],{}," and commit it alongside the script. An unpinned ",[30,125665,26527],{}," on a fresh machine in six months will not reproduce today's behavior, and the gap between \"works on my laptop\" and \"works on the scheduler\" is almost always an unpinned dependency.",[23,125668,125670],{"className":25,"code":125669,"language":27,"meta":28,"style":28},"# Create and activate an isolated environment\npython -m venv .venv\nsource .venv\u002Fbin\u002Factivate          # Windows: .venv\\Scripts\\activate\npip install -r 
requirements.txt\n",[30,125671,125672,125676,125686,125694],{"__ignoreMap":28},[33,125673,125674],{"class":35,"line":36},[33,125675,26635],{"class":39},[33,125677,125678,125680,125682,125684],{"class":35,"line":43},[33,125679,47],{"class":46},[33,125681,51],{"class":50},[33,125683,55],{"class":54},[33,125685,58],{"class":54},[33,125687,125688,125690,125692],{"class":35,"line":61},[33,125689,64],{"class":50},[33,125691,67],{"class":54},[33,125693,70],{"class":39},[33,125695,125696,125698,125700,125702],{"class":35,"line":73},[33,125697,76],{"class":46},[33,125699,79],{"class":54},[33,125701,26709],{"class":50},[33,125703,26712],{"class":54},[23,125705,125708],{"className":125706,"code":125707,"language":2000,"meta":28},[1998],"# requirements.txt — pin everything that touches data correctness\npandas==2.2.2\nopenpyxl==3.1.5\nXlsxWriter==3.2.0\nxlrd==2.0.1\npyarrow==16.1.0\n",[30,125709,125707],{"__ignoreMap":28},[14,125711,125712],{},"With the environment active, confirm the engines resolve before writing pipeline logic:",[23,125714,125716],{"className":126,"code":125715,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\n\nprint(pd.__version__)\n# openpyxl is imported lazily by pandas; a clean import proves it is installed\nimport 
openpyxl\nprint(openpyxl.__version__)\n",[30,125717,125718,125722,125732,125736,125747,125752,125758],{"__ignoreMap":28},[33,125719,125720],{"class":35,"line":36},[33,125721,3952],{"class":39},[33,125723,125724,125726,125728,125730],{"class":35,"line":43},[33,125725,164],{"class":163},[33,125727,492],{"class":167},[33,125729,495],{"class":163},[33,125731,498],{"class":167},[33,125733,125734],{"class":35,"line":61},[33,125735,92],{"emptyLinePlaceholder":91},[33,125737,125738,125740,125743,125745],{"class":35,"line":73},[33,125739,13474],{"class":50},[33,125741,125742],{"class":167},"(pd.",[33,125744,37016],{"class":50},[33,125746,221],{"class":167},[33,125748,125749],{"class":35,"line":88},[33,125750,125751],{"class":39},"# openpyxl is imported lazily by pandas; a clean import proves it is installed\n",[33,125753,125754,125756],{"class":35,"line":95},[33,125755,164],{"class":163},[33,125757,95887],{"class":167},[33,125759,125760,125762,125765,125767],{"class":35,"line":101},[33,125761,13474],{"class":50},[33,125763,125764],{"class":167},"(openpyxl.",[33,125766,37016],{"class":50},[33,125768,221],{"class":167},[18,125770,125772],{"id":125771},"ingestion-reading-the-raw-formats","Ingestion: reading the raw formats",[14,125774,125775],{},"Ingestion is where most pipelines fail, because the file you receive is rarely the file you were promised. A CSV labeled UTF-8 turns out to be Windows-1252; a column of order numbers arrives with a stray currency symbol; an Excel export has two banner rows above the real header and three blank rows at the bottom. None of this is exotic — it is the normal state of data that left another team's system. Three controls prevent the majority of corruption at the door: choosing the right engine, declaring the encoding, and forcing dtypes so pandas never guesses. 
Each control is cheap to add and expensive to retrofit once bad inferences have propagated into downstream totals.",[424,125777,125779],{"id":125778},"reading-csv-with-encoding-and-dtype-control","Reading CSV with encoding and dtype control",[14,125781,125782,125784,125785,125788,125789,125792,125793,125795,125796,125798],{},[30,125783,123327],{}," will happily infer types, and that inference is the source of the classic data-loss bug: a column of zip codes or SKUs like ",[30,125786,125787],{},"00734"," is read as the integer ",[30,125790,125791],{},"734",". Declare ",[30,125794,23262],{}," for any column whose leading zeros, length, or formatting are load-bearing, and name the encoding explicitly so a Windows-1252 export does not raise ",[30,125797,53911],{}," mid-file.",[23,125800,125802],{"className":126,"code":125801,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nRAW_CSV = Path(\"data\u002Fraw\u002Fcustomers.csv\")\n\ntry:\n    df = pd.read_csv(\n        RAW_CSV,\n        encoding=\"utf-8-sig\",          # strips a BOM if Excel added one; falls back cleanly to plain utf-8\n        dtype={\"customer_id\": \"string\", \"zip\": \"string\"},  # preserve leading zeros\n        parse_dates=[\"signup_date\"],   # parse at read time, not after\n        na_values=[\"\", \"NULL\", \"N\u002FA\"], # normalize the many spellings of \"missing\"\n    )\n    print(f\"Loaded {len(df):,} rows, {df.shape[1]} columns\")\nexcept UnicodeDecodeError:\n    # Latin-encoded legacy exports: retry with the common Windows fallback\n    df = pd.read_csv(RAW_CSV, encoding=\"cp1252\", dtype=\"string\")\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"Input missing: 
{exc}\")\n",[30,125803,125804,125808,125818,125828,125832,125846,125850,125856,125864,125871,125884,125915,125931,125955,125959,125993,126001,126006,126034,126044],{"__ignoreMap":28},[33,125805,125806],{"class":35,"line":36},[33,125807,8895],{"class":39},[33,125809,125810,125812,125814,125816],{"class":35,"line":43},[33,125811,164],{"class":163},[33,125813,492],{"class":167},[33,125815,495],{"class":163},[33,125817,498],{"class":167},[33,125819,125820,125822,125824,125826],{"class":35,"line":61},[33,125821,190],{"class":163},[33,125823,193],{"class":167},[33,125825,164],{"class":163},[33,125827,198],{"class":167},[33,125829,125830],{"class":35,"line":73},[33,125831,92],{"emptyLinePlaceholder":91},[33,125833,125834,125837,125839,125841,125844],{"class":35,"line":88},[33,125835,125836],{"class":50},"RAW_CSV",[33,125838,212],{"class":163},[33,125840,215],{"class":167},[33,125842,125843],{"class":54},"\"data\u002Fraw\u002Fcustomers.csv\"",[33,125845,221],{"class":167},[33,125847,125848],{"class":35,"line":95},[33,125849,92],{"emptyLinePlaceholder":91},[33,125851,125852,125854],{"class":35,"line":101},[33,125853,35574],{"class":163},[33,125855,574],{"class":167},[33,125857,125858,125860,125862],{"class":35,"line":171},[33,125859,4025],{"class":167},[33,125861,242],{"class":163},[33,125863,108706],{"class":167},[33,125865,125866,125869],{"class":35,"line":179},[33,125867,125868],{"class":50},"        RAW_CSV",[33,125870,247],{"class":167},[33,125872,125873,125875,125877,125879,125881],{"class":35,"line":187},[33,125874,1190],{"class":238},[33,125876,242],{"class":163},[33,125878,108390],{"class":54},[33,125880,98374],{"class":167},[33,125882,125883],{"class":39},"# strips a BOM if Excel added one; falls back cleanly to plain utf-8\n",[33,125885,125886,125889,125891,125893,125896,125898,125900,125902,125905,125907,125909,125912],{"class":35,"line":201},[33,125887,125888],{"class":238},"        
dtype",[33,125890,242],{"class":163},[33,125892,1115],{"class":167},[33,125894,125895],{"class":54},"\"customer_id\"",[33,125897,2079],{"class":167},[33,125899,27358],{"class":54},[33,125901,365],{"class":167},[33,125903,125904],{"class":54},"\"zip\"",[33,125906,2079],{"class":167},[33,125908,27358],{"class":54},[33,125910,125911],{"class":167},"},  ",[33,125913,125914],{"class":39},"# preserve leading zeros\n",[33,125916,125917,125919,125921,125923,125926,125928],{"class":35,"line":206},[33,125918,108760],{"class":238},[33,125920,242],{"class":163},[33,125922,8309],{"class":167},[33,125924,125925],{"class":54},"\"signup_date\"",[33,125927,13424],{"class":167},[33,125929,125930],{"class":39},"# parse at read time, not after\n",[33,125932,125933,125936,125938,125940,125942,125944,125946,125948,125950,125952],{"class":35,"line":224},[33,125934,125935],{"class":238},"        na_values",[33,125937,242],{"class":163},[33,125939,8309],{"class":167},[33,125941,3198],{"class":54},[33,125943,365],{"class":167},[33,125945,109273],{"class":54},[33,125947,365],{"class":167},[33,125949,27824],{"class":54},[33,125951,8314],{"class":167},[33,125953,125954],{"class":39},"# normalize the many spellings of 
\"missing\"\n",[33,125956,125957],{"class":35,"line":229},[33,125958,1202],{"class":167},[33,125960,125961,125963,125965,125967,125969,125971,125973,125975,125977,125979,125981,125983,125985,125987,125989,125991],{"class":35,"line":235},[33,125962,7268],{"class":50},[33,125964,602],{"class":167},[33,125966,4059],{"class":163},[33,125968,96187],{"class":54},[33,125970,4065],{"class":50},[33,125972,4068],{"class":167},[33,125974,18801],{"class":163},[33,125976,1121],{"class":50},[33,125978,115486],{"class":54},[33,125980,1115],{"class":50},[33,125982,9541],{"class":167},[33,125984,734],{"class":50},[33,125986,9546],{"class":167},[33,125988,1121],{"class":50},[33,125990,115499],{"class":54},[33,125992,221],{"class":167},[33,125994,125995,125997,125999],{"class":35,"line":250},[33,125996,35726],{"class":163},[33,125998,112164],{"class":50},[33,126000,574],{"class":167},[33,126002,126003],{"class":35,"line":266},[33,126004,126005],{"class":39},"    # Latin-encoded legacy exports: retry with the common Windows 
fallback\n",[33,126007,126008,126010,126012,126014,126016,126018,126020,126022,126024,126026,126028,126030,126032],{"class":35,"line":290},[33,126009,4025],{"class":167},[33,126011,242],{"class":163},[33,126013,9481],{"class":167},[33,126015,125836],{"class":50},[33,126017,365],{"class":167},[33,126019,27249],{"class":238},[33,126021,242],{"class":163},[33,126023,110976],{"class":54},[33,126025,365],{"class":167},[33,126027,23262],{"class":238},[33,126029,242],{"class":163},[33,126031,27358],{"class":54},[33,126033,221],{"class":167},[33,126035,126036,126038,126040,126042],{"class":35,"line":295},[33,126037,35726],{"class":163},[33,126039,2945],{"class":50},[33,126041,1852],{"class":163},[33,126043,1855],{"class":167},[33,126045,126046,126048,126050,126052,126054,126057,126059,126061,126063,126065],{"class":35,"line":300},[33,126047,35742],{"class":163},[33,126049,16617],{"class":50},[33,126051,602],{"class":167},[33,126053,4059],{"class":163},[33,126055,126056],{"class":54},"\"Input missing: ",[33,126058,1115],{"class":50},[33,126060,6565],{"class":167},[33,126062,1121],{"class":50},[33,126064,274],{"class":54},[33,126066,221],{"class":167},[14,126068,126069,126070,126072,126073,126075,126076,126078],{},"Encoding is a deep enough trap to warrant its own walkthrough — see ",[940,126071,27254],{"href":27253}," when the fallback above is not enough, and ",[940,126074,107425],{"href":110423}," to decide between pandas and the stdlib ",[30,126077,107436],{}," module for a given file.",[424,126080,126082],{"id":126081},"reading-excel-with-the-right-engine","Reading Excel with the right engine",[14,126084,126085,126086,108293,126088,126090,126091,126094,126095,10065,126097,126100],{},"For ",[30,126087,26542],{},[30,126089,22395],{}," explicitly rather than relying on the default — it documents intent and fails loudly if openpyxl is missing. 
Multi-sheet workbooks need ",[30,126092,126093],{},"sheet_name=None"," to load every tab into a dict; reading a single named sheet avoids pulling tabs you do not need. Merged header cells, hidden rows, and a banner row above the real header are the recurring shapes of Excel chaos; ",[30,126096,44427],{},[30,126098,126099],{},"skiprows"," realign them.",[23,126102,126104],{"className":126,"code":126103,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nfrom pathlib import Path\n\nWORKBOOK = Path(\"data\u002Fraw\u002Fq3_sales.xlsx\")\n\ntry:\n    # Load one sheet, skipping a two-row title banner so row 3 becomes the header\n    df = pd.read_excel(\n        WORKBOOK,\n        sheet_name=\"Transactions\",\n        engine=\"openpyxl\",\n        header=2,                       # zero-indexed: real header is the 3rd row\n        dtype={\"order_id\": \"string\"},\n    )\n\n    # To process every tab, load them all and tag each with its source sheet:\n    all_sheets = pd.read_excel(WORKBOOK, sheet_name=None, engine=\"openpyxl\")\n    combined = pd.concat(\n        [frame.assign(_sheet=name) for name, frame in all_sheets.items()],\n        ignore_index=True,\n    )\n    print(f\"Single sheet: {len(df):,} rows; all sheets: {len(combined):,} rows\")\nexcept ValueError as exc:\n    # Raised when sheet_name does not exist in the workbook\n    raise SystemExit(f\"Sheet lookup failed: 
{exc}\")\n",[30,126105,126106,126110,126120,126130,126134,126148,126152,126158,126163,126172,126179,126191,126201,126216,126232,126236,126240,126245,126275,126284,126307,126318,126322,126357,126367,126372],{"__ignoreMap":28},[33,126107,126108],{"class":35,"line":36},[33,126109,3952],{"class":39},[33,126111,126112,126114,126116,126118],{"class":35,"line":43},[33,126113,164],{"class":163},[33,126115,492],{"class":167},[33,126117,495],{"class":163},[33,126119,498],{"class":167},[33,126121,126122,126124,126126,126128],{"class":35,"line":61},[33,126123,190],{"class":163},[33,126125,193],{"class":167},[33,126127,164],{"class":163},[33,126129,198],{"class":167},[33,126131,126132],{"class":35,"line":73},[33,126133,92],{"emptyLinePlaceholder":91},[33,126135,126136,126139,126141,126143,126146],{"class":35,"line":88},[33,126137,126138],{"class":50},"WORKBOOK",[33,126140,212],{"class":163},[33,126142,215],{"class":167},[33,126144,126145],{"class":54},"\"data\u002Fraw\u002Fq3_sales.xlsx\"",[33,126147,221],{"class":167},[33,126149,126150],{"class":35,"line":95},[33,126151,92],{"emptyLinePlaceholder":91},[33,126153,126154,126156],{"class":35,"line":101},[33,126155,35574],{"class":163},[33,126157,574],{"class":167},[33,126159,126160],{"class":35,"line":171},[33,126161,126162],{"class":39},"    # Load one sheet, skipping a two-row title banner so row 3 becomes the header\n",[33,126164,126165,126167,126169],{"class":35,"line":179},[33,126166,4025],{"class":167},[33,126168,242],{"class":163},[33,126170,126171],{"class":167}," pd.read_excel(\n",[33,126173,126174,126177],{"class":35,"line":187},[33,126175,126176],{"class":50},"        WORKBOOK",[33,126178,247],{"class":167},[33,126180,126181,126184,126186,126189],{"class":35,"line":201},[33,126182,126183],{"class":238},"        
sheet_name",[33,126185,242],{"class":163},[33,126187,126188],{"class":54},"\"Transactions\"",[33,126190,247],{"class":167},[33,126192,126193,126195,126197,126199],{"class":35,"line":206},[33,126194,111493],{"class":238},[33,126196,242],{"class":163},[33,126198,17356],{"class":54},[33,126200,247],{"class":167},[33,126202,126203,126206,126208,126210,126213],{"class":35,"line":224},[33,126204,126205],{"class":238},"        header",[33,126207,242],{"class":163},[33,126209,1533],{"class":50},[33,126211,126212],{"class":167},",                       ",[33,126214,126215],{"class":39},"# zero-indexed: real header is the 3rd row\n",[33,126217,126218,126220,126222,126224,126226,126228,126230],{"class":35,"line":229},[33,126219,125888],{"class":238},[33,126221,242],{"class":163},[33,126223,1115],{"class":167},[33,126225,108849],{"class":54},[33,126227,2079],{"class":167},[33,126229,27358],{"class":54},[33,126231,3509],{"class":167},[33,126233,126234],{"class":35,"line":235},[33,126235,1202],{"class":167},[33,126237,126238],{"class":35,"line":250},[33,126239,92],{"emptyLinePlaceholder":91},[33,126241,126242],{"class":35,"line":266},[33,126243,126244],{"class":39},"    # To process every tab, load them all and tag each with its source sheet:\n",[33,126246,126247,126250,126252,126255,126257,126259,126261,126263,126265,126267,126269,126271,126273],{"class":35,"line":290},[33,126248,126249],{"class":167},"    all_sheets ",[33,126251,242],{"class":163},[33,126253,126254],{"class":167}," pd.read_excel(",[33,126256,126138],{"class":50},[33,126258,365],{"class":167},[33,126260,17371],{"class":238},[33,126262,242],{"class":163},[33,126264,571],{"class":50},[33,126266,365],{"class":167},[33,126268,17351],{"class":238},[33,126270,242],{"class":163},[33,126272,17356],{"class":54},[33,126274,221],{"class":167},[33,126276,126277,126279,126281],{"class":35,"line":295},[33,126278,842],{"class":167},[33,126280,242],{"class":163},[33,126282,126283],{"class":167}," 
pd.concat(\n",[33,126285,126286,126289,126292,126294,126297,126299,126302,126304],{"class":35,"line":300},[33,126287,126288],{"class":167},"        [frame.assign(",[33,126290,126291],{"class":238},"_sheet",[33,126293,242],{"class":163},[33,126295,126296],{"class":167},"name) ",[33,126298,6124],{"class":163},[33,126300,126301],{"class":167}," name, frame ",[33,126303,662],{"class":163},[33,126305,126306],{"class":167}," all_sheets.items()],\n",[33,126308,126309,126312,126314,126316],{"class":35,"line":317},[33,126310,126311],{"class":238},"        ignore_index",[33,126313,242],{"class":163},[33,126315,855],{"class":50},[33,126317,247],{"class":167},[33,126319,126320],{"class":35,"line":332},[33,126321,1202],{"class":167},[33,126323,126324,126326,126328,126330,126333,126335,126337,126339,126341,126344,126346,126349,126351,126353,126355],{"class":35,"line":347},[33,126325,7268],{"class":50},[33,126327,602],{"class":167},[33,126329,4059],{"class":163},[33,126331,126332],{"class":54},"\"Single sheet: ",[33,126334,4065],{"class":50},[33,126336,4068],{"class":167},[33,126338,18801],{"class":163},[33,126340,1121],{"class":50},[33,126342,126343],{"class":54}," rows; all sheets: ",[33,126345,4065],{"class":50},[33,126347,126348],{"class":167},"(combined)",[33,126350,18801],{"class":163},[33,126352,1121],{"class":50},[33,126354,65937],{"class":54},[33,126356,221],{"class":167},[33,126358,126359,126361,126363,126365],{"class":35,"line":374},[33,126360,35726],{"class":163},[33,126362,4054],{"class":50},[33,126364,1852],{"class":163},[33,126366,1855],{"class":167},[33,126368,126369],{"class":35,"line":397},[33,126370,126371],{"class":39},"    # Raised when sheet_name does not exist in the workbook\n",[33,126373,126374,126376,126378,126380,126382,126385,126387,126389,126391,126393],{"class":35,"line":653},[33,126375,35742],{"class":163},[33,126377,16617],{"class":50},[33,126379,602],{"class":167},[33,126381,4059],{"class":163},[33,126383,126384],{"class":54},"\"Sheet lookup 
failed: ",[33,126386,1115],{"class":50},[33,126388,6565],{"class":167},[33,126390,1121],{"class":50},[33,126392,274],{"class":54},[33,126394,221],{"class":167},[14,126396,126397,126398,126400,126401,126403,126404,126406,126407,3035],{},"Engine selection, ",[30,126399,112255],{}," versus ",[30,126402,26542],{},", and metadata extraction are covered step by step in ",[940,126405,99577],{"href":99576},", and the legacy-format failure mode specifically in ",[940,126408,126410],{"href":126409},"\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Ffix-xlrd-error-reading-xlsx-files\u002F","Fix xlrd Error Reading .xlsx Files",[424,126412,126414],{"id":126413},"streaming-huge-files-with-the-csv-module","Streaming huge files with the csv module",[14,126416,126417,126418,126420],{},"When a file is too large to hold in memory, or when you only need to filter and re-emit rows rather than analyze them, skip pandas entirely. The standard-library ",[30,126419,107436],{}," module reads one row at a time, so memory stays flat whether the file is 50 MB or 50 GB. 
This is the right tool for splitting a giant export, dropping rows that match a predicate, or extracting a handful of columns before a downstream tool ever sees a DataFrame.",[23,126422,126424],{"className":126,"code":126423,"language":47,"meta":28,"style":28},"# stdlib only — no install needed\nimport csv\nfrom pathlib import Path\n\nSOURCE = Path(\"data\u002Fraw\u002Ftransactions_huge.csv\")\nTARGET = Path(\"output\u002Ftransactions_2026.csv\")\nTARGET.parent.mkdir(parents=True, exist_ok=True)\n\ntry:\n    with SOURCE.open(newline=\"\", encoding=\"utf-8\") as fin, \\\n         TARGET.open(\"w\", newline=\"\", encoding=\"utf-8\") as fout:\n        reader = csv.DictReader(fin)\n        writer = csv.DictWriter(fout, fieldnames=reader.fieldnames)\n        writer.writeheader()\n        kept = 0\n        for row in reader:                 # one row in memory at a time\n            if row[\"order_date\"].startswith(\"2026\"):\n                writer.writerow(row)\n                kept += 1\n        print(f\"Streamed {kept:,} matching rows without loading the full file\")\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"Source missing: {exc}\")\n",[30,126425,126426,126431,126437,126447,126451,126464,126478,126500,126504,126510,126540,126571,126580,126595,126600,126609,126623,126640,126645,126654,126679,126689],{"__ignoreMap":28},[33,126427,126428],{"class":35,"line":36},[33,126429,126430],{"class":39},"# stdlib only — no install 
needed\n",[33,126432,126433,126435],{"class":35,"line":43},[33,126434,164],{"class":163},[33,126436,107673],{"class":167},[33,126438,126439,126441,126443,126445],{"class":35,"line":61},[33,126440,190],{"class":163},[33,126442,193],{"class":167},[33,126444,164],{"class":163},[33,126446,198],{"class":167},[33,126448,126449],{"class":35,"line":73},[33,126450,92],{"emptyLinePlaceholder":91},[33,126452,126453,126455,126457,126459,126462],{"class":35,"line":88},[33,126454,86272],{"class":50},[33,126456,212],{"class":163},[33,126458,215],{"class":167},[33,126460,126461],{"class":54},"\"data\u002Fraw\u002Ftransactions_huge.csv\"",[33,126463,221],{"class":167},[33,126465,126466,126469,126471,126473,126476],{"class":35,"line":95},[33,126467,126468],{"class":50},"TARGET",[33,126470,212],{"class":163},[33,126472,215],{"class":167},[33,126474,126475],{"class":54},"\"output\u002Ftransactions_2026.csv\"",[33,126477,221],{"class":167},[33,126479,126480,126482,126484,126486,126488,126490,126492,126494,126496,126498],{"class":35,"line":101},[33,126481,126468],{"class":50},[33,126483,866],{"class":167},[33,126485,869],{"class":238},[33,126487,242],{"class":163},[33,126489,855],{"class":50},[33,126491,365],{"class":167},[33,126493,878],{"class":238},[33,126495,242],{"class":163},[33,126497,855],{"class":50},[33,126499,221],{"class":167},[33,126501,126502],{"class":35,"line":171},[33,126503,92],{"emptyLinePlaceholder":91},[33,126505,126506,126508],{"class":35,"line":179},[33,126507,35574],{"class":163},[33,126509,574],{"class":167},[33,126511,126512,126514,126517,126519,126521,126523,126525,126527,126529,126531,126533,126535,126537],{"class":35,"line":187},[33,126513,1635],{"class":163},[33,126515,126516],{"class":50}," 
SOURCE",[33,126518,107916],{"class":167},[33,126520,107919],{"class":238},[33,126522,242],{"class":163},[33,126524,3198],{"class":54},[33,126526,365],{"class":167},[33,126528,27249],{"class":238},[33,126530,242],{"class":163},[33,126532,1195],{"class":54},[33,126534,1649],{"class":167},[33,126536,495],{"class":163},[33,126538,126539],{"class":167}," fin, \\\n",[33,126541,126542,126545,126547,126549,126551,126553,126555,126557,126559,126561,126563,126565,126567,126569],{"class":35,"line":201},[33,126543,126544],{"class":50},"         TARGET",[33,126546,107916],{"class":167},[33,126548,123455],{"class":54},[33,126550,365],{"class":167},[33,126552,107919],{"class":238},[33,126554,242],{"class":163},[33,126556,3198],{"class":54},[33,126558,365],{"class":167},[33,126560,27249],{"class":238},[33,126562,242],{"class":163},[33,126564,1195],{"class":54},[33,126566,1649],{"class":167},[33,126568,495],{"class":163},[33,126570,123478],{"class":167},[33,126572,126573,126575,126577],{"class":35,"line":206},[33,126574,62484],{"class":167},[33,126576,242],{"class":163},[33,126578,126579],{"class":167}," csv.DictReader(fin)\n",[33,126581,126582,126584,126586,126588,126590,126592],{"class":35,"line":224},[33,126583,67149],{"class":167},[33,126585,242],{"class":163},[33,126587,123538],{"class":167},[33,126589,123541],{"class":238},[33,126591,242],{"class":163},[33,126593,126594],{"class":167},"reader.fieldnames)\n",[33,126596,126597],{"class":35,"line":229},[33,126598,126599],{"class":167},"        writer.writeheader()\n",[33,126601,126602,126605,126607],{"class":35,"line":235},[33,126603,126604],{"class":167},"        kept ",[33,126606,242],{"class":163},[33,126608,28914],{"class":50},[33,126610,126611,126613,126615,126617,126620],{"class":35,"line":250},[33,126612,5973],{"class":163},[33,126614,3844],{"class":167},[33,126616,662],{"class":163},[33,126618,126619],{"class":167}," reader:                 ",[33,126621,126622],{"class":39},"# one row in memory at a 
time\n",[33,126624,126625,126627,126630,126632,126635,126638],{"class":35,"line":266},[33,126626,5995],{"class":163},[33,126628,126629],{"class":167}," row[",[33,126631,108767],{"class":54},[33,126633,126634],{"class":167},"].startswith(",[33,126636,126637],{"class":54},"\"2026\"",[33,126639,1737],{"class":167},[33,126641,126642],{"class":35,"line":290},[33,126643,126644],{"class":167},"                writer.writerow(row)\n",[33,126646,126647,126650,126652],{"class":35,"line":295},[33,126648,126649],{"class":167},"                kept ",[33,126651,28976],{"class":163},[33,126653,17709],{"class":50},[33,126655,126656,126658,126660,126662,126665,126667,126670,126672,126674,126677],{"class":35,"line":300},[33,126657,9414],{"class":50},[33,126659,602],{"class":167},[33,126661,4059],{"class":163},[33,126663,126664],{"class":54},"\"Streamed ",[33,126666,1115],{"class":50},[33,126668,126669],{"class":167},"kept",[33,126671,18801],{"class":163},[33,126673,1121],{"class":50},[33,126675,126676],{"class":54}," matching rows without loading the full file\"",[33,126678,221],{"class":167},[33,126680,126681,126683,126685,126687],{"class":35,"line":317},[33,126682,35726],{"class":163},[33,126684,2945],{"class":50},[33,126686,1852],{"class":163},[33,126688,1855],{"class":167},[33,126690,126691,126693,126695,126697,126699,126701,126703,126705,126707,126709],{"class":35,"line":332},[33,126692,35742],{"class":163},[33,126694,16617],{"class":50},[33,126696,602],{"class":167},[33,126698,4059],{"class":163},[33,126700,27331],{"class":54},[33,126702,1115],{"class":50},[33,126704,6565],{"class":167},[33,126706,1121],{"class":50},[33,126708,274],{"class":54},[33,126710,221],{"class":167},[14,126712,126713,126714,126716,126717,126719,126720,3035],{},"The trade-off is that you lose vectorized operations: there are no joins, no ",[30,126715,21820],{},", no automatic type coercion. 
Reach for the ",[30,126718,107436],{}," module when the task is genuinely row-local, and otherwise stay in pandas. The decision between the two for a given file is the subject of ",[940,126721,107425],{"href":110423},[18,126723,126725],{"id":126724},"transformation-cleaning-coercion-schema-normalization","Transformation: cleaning, coercion, schema normalization",[14,126727,126728,126729,126731,126732,126734,126735,126738,126739,126741,126742,126744,126745,126748],{},"Once data is in a frame, the goal is a stable schema: predictable column names, correct dtypes, and no silent string-versus-number ambiguity. A stable schema is what lets the rest of the pipeline make assumptions safely — once ",[30,126730,18528],{}," is guaranteed numeric and ",[30,126733,95904],{}," is guaranteed an uppercase trimmed string, a downstream ",[30,126736,126737],{},"groupby(\"region\")[\"revenue\"].sum()"," cannot quietly split one region into three or refuse to add a column of strings. Do the cleaning defensively: ",[30,126740,27816],{}," turns unparseable values into ",[30,126743,8884],{}," you can inspect and report, rather than letting one bad cell abort the run or, worse, poison a ",[30,126746,126747],{},"sum"," by being treated as text. The pattern below also surfaces how many values failed coercion, because a transformation that silently discards 4,000 rows is more dangerous than one that crashes — the crash gets noticed, the silent loss does not.",[23,126750,126752],{"className":126,"code":126751,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\ndef normalize(df: pd.DataFrame) -> pd.DataFrame:\n    # 1. Canonical column names: lowercase, underscored, stripped\n    df.columns = (\n        df.columns.str.strip().str.lower().str.replace(r\"[^\\w]+\", \"_\", regex=True)\n    )\n\n    # 2. 
Strip currency\u002Fwhitespace, then coerce to numeric — bad cells become NaN, not crashes\n    df[\"revenue\"] = (\n        df[\"revenue\"].astype(\"string\").str.replace(r\"[^\\d.\\-]\", \"\", regex=True)\n    )\n    df[\"revenue\"] = pd.to_numeric(df[\"revenue\"], errors=\"coerce\")\n\n    # 3. Dates: mixed formats parse safely; unparseable -> NaT for later auditing\n    df[\"order_date\"] = pd.to_datetime(df[\"order_date\"], format=\"mixed\", errors=\"coerce\")\n\n    # 4. Categorical normalization avoids \"USA\"\u002F\"usa\"\u002F\" US \" counting as three regions\n    df[\"region\"] = df[\"region\"].astype(\"string\").str.strip().str.upper()\n\n    # 5. Surface coercion failures explicitly instead of shipping silent gaps\n    bad_revenue = int(df[\"revenue\"].isna().sum())\n    if bad_revenue:\n        print(f\"WARNING: {bad_revenue} revenue values failed numeric coercion\")\n    return df\n",[30,126753,126754,126758,126768,126772,126780,126785,126793,126826,126830,126834,126839,126851,126893,126897,126921,126925,126930,126962,126966,126971,126992,126996,127001,127018,127025,127047],{"__ignoreMap":28},[33,126755,126756],{"class":35,"line":36},[33,126757,8895],{"class":39},[33,126759,126760,126762,126764,126766],{"class":35,"line":43},[33,126761,164],{"class":163},[33,126763,492],{"class":167},[33,126765,495],{"class":163},[33,126767,498],{"class":167},[33,126769,126770],{"class":35,"line":61},[33,126771,92],{"emptyLinePlaceholder":91},[33,126773,126774,126776,126778],{"class":35,"line":73},[33,126775,562],{"class":163},[33,126777,66068],{"class":46},[33,126779,12127],{"class":167},[33,126781,126782],{"class":35,"line":88},[33,126783,126784],{"class":39},"    # 1. 
Canonical column names: lowercase, underscored, stripped\n",[33,126786,126787,126789,126791],{"class":35,"line":95},[33,126788,27546],{"class":167},[33,126790,242],{"class":163},[33,126792,1415],{"class":167},[33,126794,126795,126798,126800,126802,126804,126806,126808,126810,126812,126814,126816,126818,126820,126822,126824],{"class":35,"line":101},[33,126796,126797],{"class":167},"        df.columns.str.strip().str.lower().str.replace(",[33,126799,11977],{"class":163},[33,126801,274],{"class":54},[33,126803,8309],{"class":50},[33,126805,113780],{"class":163},[33,126807,113783],{"class":50},[33,126809,1811],{"class":163},[33,126811,274],{"class":54},[33,126813,365],{"class":167},[33,126815,7764],{"class":54},[33,126817,365],{"class":167},[33,126819,11993],{"class":238},[33,126821,242],{"class":163},[33,126823,855],{"class":50},[33,126825,221],{"class":167},[33,126827,126828],{"class":35,"line":171},[33,126829,1202],{"class":167},[33,126831,126832],{"class":35,"line":179},[33,126833,92],{"emptyLinePlaceholder":91},[33,126835,126836],{"class":35,"line":187},[33,126837,126838],{"class":39},"    # 2. 
Strip currency\u002Fwhitespace, then coerce to numeric — bad cells become NaN, not crashes\n",[33,126840,126841,126843,126845,126847,126849],{"class":35,"line":201},[33,126842,27581],{"class":167},[33,126844,16465],{"class":54},[33,126846,763],{"class":167},[33,126848,242],{"class":163},[33,126850,1415],{"class":167},[33,126852,126853,126855,126857,126859,126861,126863,126865,126867,126869,126871,126873,126875,126877,126879,126881,126883,126885,126887,126889,126891],{"class":35,"line":206},[33,126854,10902],{"class":167},[33,126856,16465],{"class":54},[33,126858,27598],{"class":167},[33,126860,27358],{"class":54},[33,126862,27603],{"class":167},[33,126864,11977],{"class":163},[33,126866,274],{"class":54},[33,126868,8309],{"class":50},[33,126870,113780],{"class":163},[33,126872,113957],{"class":50},[33,126874,113960],{"class":12018},[33,126876,9546],{"class":50},[33,126878,274],{"class":54},[33,126880,365],{"class":167},[33,126882,3198],{"class":54},[33,126884,365],{"class":167},[33,126886,11993],{"class":238},[33,126888,242],{"class":163},[33,126890,855],{"class":50},[33,126892,221],{"class":167},[33,126894,126895],{"class":35,"line":224},[33,126896,1202],{"class":167},[33,126898,126899,126901,126903,126905,126907,126909,126911,126913,126915,126917,126919],{"class":35,"line":229},[33,126900,27581],{"class":167},[33,126902,16465],{"class":54},[33,126904,763],{"class":167},[33,126906,242],{"class":163},[33,126908,27643],{"class":167},[33,126910,16465],{"class":54},[33,126912,8314],{"class":167},[33,126914,8317],{"class":238},[33,126916,242],{"class":163},[33,126918,12107],{"class":54},[33,126920,221],{"class":167},[33,126922,126923],{"class":35,"line":235},[33,126924,92],{"emptyLinePlaceholder":91},[33,126926,126927],{"class":35,"line":250},[33,126928,126929],{"class":39},"    # 3. 
Dates: mixed formats parse safely; unparseable -> NaT for later auditing\n",[33,126931,126932,126934,126936,126938,126940,126942,126944,126946,126948,126950,126952,126954,126956,126958,126960],{"class":35,"line":266},[33,126933,27581],{"class":167},[33,126935,108767],{"class":54},[33,126937,763],{"class":167},[33,126939,242],{"class":163},[33,126941,27668],{"class":167},[33,126943,108767],{"class":54},[33,126945,8314],{"class":167},[33,126947,61926],{"class":238},[33,126949,242],{"class":163},[33,126951,96267],{"class":54},[33,126953,365],{"class":167},[33,126955,8317],{"class":238},[33,126957,242],{"class":163},[33,126959,12107],{"class":54},[33,126961,221],{"class":167},[33,126963,126964],{"class":35,"line":290},[33,126965,92],{"emptyLinePlaceholder":91},[33,126967,126968],{"class":35,"line":295},[33,126969,126970],{"class":39},"    # 4. Categorical normalization avoids \"USA\"\u002F\"usa\"\u002F\" US \" counting as three regions\n",[33,126972,126973,126975,126977,126979,126981,126983,126985,126987,126989],{"class":35,"line":300},[33,126974,27581],{"class":167},[33,126976,16649],{"class":54},[33,126978,763],{"class":167},[33,126980,242],{"class":163},[33,126982,7935],{"class":167},[33,126984,16649],{"class":54},[33,126986,27598],{"class":167},[33,126988,27358],{"class":54},[33,126990,126991],{"class":167},").str.strip().str.upper()\n",[33,126993,126994],{"class":35,"line":317},[33,126995,92],{"emptyLinePlaceholder":91},[33,126997,126998],{"class":35,"line":332},[33,126999,127000],{"class":39},"    # 5. 
Surface coercion failures explicitly instead of shipping silent gaps\n",[33,127002,127003,127006,127008,127010,127013,127015],{"class":35,"line":347},[33,127004,127005],{"class":167},"    bad_revenue ",[33,127007,242],{"class":163},[33,127009,3149],{"class":50},[33,127011,127012],{"class":167},"(df[",[33,127014,16465],{"class":54},[33,127016,127017],{"class":167},"].isna().sum())\n",[33,127019,127020,127022],{"class":35,"line":374},[33,127021,617],{"class":163},[33,127023,127024],{"class":167}," bad_revenue:\n",[33,127026,127027,127029,127031,127033,127035,127037,127040,127042,127045],{"class":35,"line":397},[33,127028,9414],{"class":50},[33,127030,602],{"class":167},[33,127032,4059],{"class":163},[33,127034,112511],{"class":54},[33,127036,1115],{"class":50},[33,127038,127039],{"class":167},"bad_revenue",[33,127041,1121],{"class":50},[33,127043,127044],{"class":54}," revenue values failed numeric coercion\"",[33,127046,221],{"class":167},[33,127048,127049,127051],{"class":35,"line":653},[33,127050,1332],{"class":163},[33,127052,11719],{"class":167},[14,127054,127055,127056,127058,127059,127061],{},"Currency stripping, missing-value imputation, and regex column standardization are expanded in ",[940,127057,9599],{"href":9598},". The same coercion discipline applies when the source is a PDF table rather than a spreadsheet — see ",[940,127060,948],{"href":947}," in the pipelines guide.",[18,127063,127065],{"id":127064},"consolidation-concat-vs-merge-vs-join","Consolidation: concat vs merge vs join",[14,127067,127068],{},"Three operations cover all multi-file work, and choosing wrong produces either duplicated rows or dropped ones:",[4211,127070,127071,127081,127097],{},[4214,127072,127073,127077,127078,127080],{},[1974,127074,127075],{},[30,127076,8366],{}," stacks frames that share a schema — twelve monthly CSVs into one year. 
It is the right tool only when columns align; mismatched columns produce ",[30,127079,8884],{},"-filled gaps.",[4214,127082,127083,127088,127089,127092,127093,127096],{},[1974,127084,127085],{},[30,127086,127087],{},"pd.merge"," is a relational join on key columns — attach customer master data to a transaction frame on ",[30,127090,127091],{},"customer_id",". Always pass ",[30,127094,127095],{},"how="," explicitly and check row counts before and after, because a non-unique key on the right side multiplies rows (a fan-out).",[4214,127098,127099,4348,127104,127106],{},[1974,127100,127101],{},[30,127102,127103],{},"df.join",[30,127105,27844],{},"'s index-aligned shorthand; reach for it only when both frames are already indexed on the join key.",[23,127108,127110],{"className":126,"code":127109,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\nfrom pathlib import Path\n\nMONTHLY_DIR = Path(\"data\u002Fraw\u002Fmonthly\")\n\n# Vertical: stack identically-shaped monthly exports into one frame\nparts = [pd.read_csv(p, dtype=\"string\") for p in sorted(MONTHLY_DIR.glob(\"sales_*.csv\"))]\nyear = pd.concat(parts, ignore_index=True)\n\n# Horizontal: enrich with a customer dimension on a key, guarding against fan-out\ncustomers = pd.read_csv(Path(\"data\u002Fraw\u002Fcustomers.csv\"), dtype={\"customer_id\": \"string\"})\nassert customers[\"customer_id\"].is_unique, \"Right-side key must be unique to avoid row multiplication\"\n\nbefore = len(year)\nenriched = year.merge(customers, on=\"customer_id\", how=\"left\", validate=\"many_to_one\")\nassert len(enriched) == before, \"Left join changed row count — investigate duplicate keys\"\n\n# Dedup on the natural key, keeping the most recent record\nenriched = enriched.sort_values(\"order_date\").drop_duplicates(subset=\"order_id\", keep=\"last\")\nprint(f\"Consolidated to {len(enriched):,} unique 
orders\")\n",[30,127111,127112,127116,127126,127136,127140,127154,127158,127163,127201,127219,127223,127228,127255,127270,127274,127286,127322,127339,127343,127348,127378],{"__ignoreMap":28},[33,127113,127114],{"class":35,"line":36},[33,127115,8895],{"class":39},[33,127117,127118,127120,127122,127124],{"class":35,"line":43},[33,127119,164],{"class":163},[33,127121,492],{"class":167},[33,127123,495],{"class":163},[33,127125,498],{"class":167},[33,127127,127128,127130,127132,127134],{"class":35,"line":61},[33,127129,190],{"class":163},[33,127131,193],{"class":167},[33,127133,164],{"class":163},[33,127135,198],{"class":167},[33,127137,127138],{"class":35,"line":73},[33,127139,92],{"emptyLinePlaceholder":91},[33,127141,127142,127145,127147,127149,127152],{"class":35,"line":88},[33,127143,127144],{"class":50},"MONTHLY_DIR",[33,127146,212],{"class":163},[33,127148,215],{"class":167},[33,127150,127151],{"class":54},"\"data\u002Fraw\u002Fmonthly\"",[33,127153,221],{"class":167},[33,127155,127156],{"class":35,"line":95},[33,127157,92],{"emptyLinePlaceholder":91},[33,127159,127160],{"class":35,"line":101},[33,127161,127162],{"class":39},"# Vertical: stack identically-shaped monthly exports into one frame\n",[33,127164,127165,127168,127170,127173,127175,127177,127179,127181,127183,127185,127187,127189,127191,127193,127195,127198],{"class":35,"line":171},[33,127166,127167],{"class":167},"parts ",[33,127169,242],{"class":163},[33,127171,127172],{"class":167}," [pd.read_csv(p, 
",[33,127174,23262],{"class":238},[33,127176,242],{"class":163},[33,127178,27358],{"class":54},[33,127180,1649],{"class":167},[33,127182,6124],{"class":163},[33,127184,6127],{"class":167},[33,127186,662],{"class":163},[33,127188,28924],{"class":50},[33,127190,602],{"class":167},[33,127192,127144],{"class":50},[33,127194,607],{"class":167},[33,127196,127197],{"class":54},"\"sales_*.csv\"",[33,127199,127200],{"class":167},"))]\n",[33,127202,127203,127206,127208,127211,127213,127215,127217],{"class":35,"line":179},[33,127204,127205],{"class":167},"year ",[33,127207,242],{"class":163},[33,127209,127210],{"class":167}," pd.concat(parts, ",[33,127212,850],{"class":238},[33,127214,242],{"class":163},[33,127216,855],{"class":50},[33,127218,221],{"class":167},[33,127220,127221],{"class":35,"line":187},[33,127222,92],{"emptyLinePlaceholder":91},[33,127224,127225],{"class":35,"line":201},[33,127226,127227],{"class":39},"# Horizontal: enrich with a customer dimension on a key, guarding against fan-out\n",[33,127229,127230,127233,127235,127237,127239,127241,127243,127245,127247,127249,127251,127253],{"class":35,"line":206},[33,127231,127232],{"class":167},"customers ",[33,127234,242],{"class":163},[33,127236,46182],{"class":167},[33,127238,125843],{"class":54},[33,127240,18525],{"class":167},[33,127242,23262],{"class":238},[33,127244,242],{"class":163},[33,127246,1115],{"class":167},[33,127248,125895],{"class":54},[33,127250,2079],{"class":167},[33,127252,27358],{"class":54},[33,127254,103249],{"class":167},[33,127256,127257,127259,127262,127264,127267],{"class":35,"line":224},[33,127258,36397],{"class":163},[33,127260,127261],{"class":167}," customers[",[33,127263,125895],{"class":54},[33,127265,127266],{"class":167},"].is_unique, ",[33,127268,127269],{"class":54},"\"Right-side key must be unique to avoid row 
multiplication\"\n",[33,127271,127272],{"class":35,"line":229},[33,127273,92],{"emptyLinePlaceholder":91},[33,127275,127276,127279,127281,127283],{"class":35,"line":235},[33,127277,127278],{"class":167},"before ",[33,127280,242],{"class":163},[33,127282,4037],{"class":50},[33,127284,127285],{"class":167},"(year)\n",[33,127287,127288,127291,127293,127296,127298,127300,127302,127304,127306,127308,127310,127312,127315,127317,127320],{"class":35,"line":250},[33,127289,127290],{"class":167},"enriched ",[33,127292,242],{"class":163},[33,127294,127295],{"class":167}," year.merge(customers, ",[33,127297,2091],{"class":238},[33,127299,242],{"class":163},[33,127301,125895],{"class":54},[33,127303,365],{"class":167},[33,127305,28045],{"class":238},[33,127307,242],{"class":163},[33,127309,28050],{"class":54},[33,127311,365],{"class":167},[33,127313,127314],{"class":238},"validate",[33,127316,242],{"class":163},[33,127318,127319],{"class":54},"\"many_to_one\"",[33,127321,221],{"class":167},[33,127323,127324,127326,127328,127331,127333,127336],{"class":35,"line":266},[33,127325,36397],{"class":163},[33,127327,4037],{"class":50},[33,127329,127330],{"class":167},"(enriched) ",[33,127332,1865],{"class":163},[33,127334,127335],{"class":167}," before, ",[33,127337,127338],{"class":54},"\"Left join changed row count — investigate duplicate keys\"\n",[33,127340,127341],{"class":35,"line":290},[33,127342,92],{"emptyLinePlaceholder":91},[33,127344,127345],{"class":35,"line":295},[33,127346,127347],{"class":39},"# Dedup on the natural key, keeping the most recent record\n",[33,127349,127350,127352,127354,127357,127359,127362,127364,127366,127368,127370,127372,127374,127376],{"class":35,"line":300},[33,127351,127290],{"class":167},[33,127353,242],{"class":163},[33,127355,127356],{"class":167}," 
enriched.sort_values(",[33,127358,108767],{"class":54},[33,127360,127361],{"class":167},").drop_duplicates(",[33,127363,28066],{"class":238},[33,127365,242],{"class":163},[33,127367,108849],{"class":54},[33,127369,365],{"class":167},[33,127371,28077],{"class":238},[33,127373,242],{"class":163},[33,127375,114482],{"class":54},[33,127377,221],{"class":167},[33,127379,127380,127382,127384,127386,127389,127391,127394,127396,127398,127401],{"class":35,"line":317},[33,127381,13474],{"class":50},[33,127383,602],{"class":167},[33,127385,4059],{"class":163},[33,127387,127388],{"class":54},"\"Consolidated to ",[33,127390,4065],{"class":50},[33,127392,127393],{"class":167},"(enriched)",[33,127395,18801],{"class":163},[33,127397,1121],{"class":50},[33,127399,127400],{"class":54}," unique orders\"",[33,127402,221],{"class":167},[14,127404,39550,127405,127408,127409,127411,127412,127414,127415,127417,127418,127421,127422,127424,127425,3035],{},[30,127406,127407],{},"validate="," argument is the cheapest insurance in pandas: it raises immediately on an unexpected key cardinality instead of letting a silent fan-out inflate every downstream total. A fan-out is insidious precisely because the result still looks plausible — the column names are right, the frame is not empty, and only the row count and the totals are wrong. Asserting the row count before and after a left join catches it deterministically. For ",[30,127410,99426],{},", the equivalent discipline is checking that every part has the columns you expect before stacking, because ",[30,127413,99426],{}," will happily union mismatched schemas and fill the gaps with ",[30,127416,8884],{},", producing a frame that is taller than any input and wider than it should be. When in doubt, log the shape after every consolidation step; a one-line ",[30,127419,127420],{},"log.info(\"after merge: %s\", df.shape)"," has saved more reconciliation hours than any test suite. 
Join strategy, suffix collisions, and schema drift get full treatment in ",[940,127423,28119],{"href":28118},", including the common ",[940,127426,127427],{"href":28113},"overlapping-column suffix problem",[18,127429,127431],{"id":127430},"output-and-serialization","Output and serialization",[14,127433,127434,127435,127437,127438,127441,127442,127445],{},"Serialization decides whether a downstream system can trust your data. The frame is correct in memory; the question is whether it survives the round-trip into a CSV a BI tool ingests or a Parquet file a warehouse loads. Three defaults cause most import corruption: pandas writing its ",[30,127436,118019],{}," as a phantom first column that shifts every other column one position to the right, uncontrolled float precision letting ",[30,127439,127440],{},"19.99"," become ",[30,127443,127444],{},"19.989999999999998"," and breaking penny-exact reconciliation, and dates leaving as native objects a database cannot parse. Each is fixed by an explicit argument to the writer rather than by hoping the default is right.",[23,127447,127449],{"className":126,"code":127448,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl pyarrow\nimport pandas as pd\nfrom pathlib import Path\n\nOUT_DIR = Path(\"output\")\nOUT_DIR.mkdir(parents=True, exist_ok=True)\n\n# CSV for BI tools: no index column, ISO dates, fixed money precision, explicit utf-8\nenriched.to_csv(\n    OUT_DIR \u002F \"orders_clean.csv\",\n    index=False,                 # never ship the phantom index column\n    encoding=\"utf-8\",\n    date_format=\"%Y-%m-%d\",      # ISO 8601 — unambiguous across every locale\n    float_format=\"%.2f\",         # lock financial precision; no 19.999999999 drift\n)\n\n# XLSX for stakeholders who want a clickable file\nenriched.to_excel(OUT_DIR \u002F \"orders_clean.xlsx\", index=False, engine=\"openpyxl\")\n\n# Parquet for database loads and large interchange: typed, columnar, compact\nenriched.to_parquet(OUT_DIR \u002F 
\"orders_clean.parquet\", engine=\"pyarrow\", index=False)\nprint(\"Wrote CSV, XLSX, and Parquet outputs\")\n",[30,127450,127451,127456,127466,127476,127480,127492,127514,127518,127523,127528,127540,127554,127565,127583,127601,127605,127609,127614,127644,127648,127653,127683],{"__ignoreMap":28},[33,127452,127453],{"class":35,"line":36},[33,127454,127455],{"class":39},"# pip install pandas openpyxl pyarrow\n",[33,127457,127458,127460,127462,127464],{"class":35,"line":43},[33,127459,164],{"class":163},[33,127461,492],{"class":167},[33,127463,495],{"class":163},[33,127465,498],{"class":167},[33,127467,127468,127470,127472,127474],{"class":35,"line":61},[33,127469,190],{"class":163},[33,127471,193],{"class":167},[33,127473,164],{"class":163},[33,127475,198],{"class":167},[33,127477,127478],{"class":35,"line":73},[33,127479,92],{"emptyLinePlaceholder":91},[33,127481,127482,127484,127486,127488,127490],{"class":35,"line":88},[33,127483,40018],{"class":50},[33,127485,212],{"class":163},[33,127487,215],{"class":167},[33,127489,41169],{"class":54},[33,127491,221],{"class":167},[33,127493,127494,127496,127498,127500,127502,127504,127506,127508,127510,127512],{"class":35,"line":95},[33,127495,40018],{"class":50},[33,127497,1078],{"class":167},[33,127499,869],{"class":238},[33,127501,242],{"class":163},[33,127503,855],{"class":50},[33,127505,365],{"class":167},[33,127507,878],{"class":238},[33,127509,242],{"class":163},[33,127511,855],{"class":50},[33,127513,221],{"class":167},[33,127515,127516],{"class":35,"line":101},[33,127517,92],{"emptyLinePlaceholder":91},[33,127519,127520],{"class":35,"line":171},[33,127521,127522],{"class":39},"# CSV for BI tools: no index column, ISO dates, fixed money precision, explicit utf-8\n",[33,127524,127525],{"class":35,"line":179},[33,127526,127527],{"class":167},"enriched.to_csv(\n",[33,127529,127530,127533,127535,127538],{"class":35,"line":187},[33,127531,127532],{"class":50},"    
OUT_DIR",[33,127534,1107],{"class":163},[33,127536,127537],{"class":54}," \"orders_clean.csv\"",[33,127539,247],{"class":167},[33,127541,127542,127544,127546,127548,127551],{"class":35,"line":201},[33,127543,119194],{"class":238},[33,127545,242],{"class":163},[33,127547,902],{"class":50},[33,127549,127550],{"class":167},",                 ",[33,127552,127553],{"class":39},"# never ship the phantom index column\n",[33,127555,127556,127559,127561,127563],{"class":35,"line":206},[33,127557,127558],{"class":238},"    encoding",[33,127560,242],{"class":163},[33,127562,1195],{"class":54},[33,127564,247],{"class":167},[33,127566,127567,127570,127572,127574,127576,127578,127580],{"class":35,"line":224},[33,127568,127569],{"class":238},"    date_format",[33,127571,242],{"class":163},[33,127573,1244],{"class":54},[33,127575,916],{"class":50},[33,127577,274],{"class":54},[33,127579,121141],{"class":167},[33,127581,127582],{"class":39},"# ISO 8601 — unambiguous across every locale\n",[33,127584,127585,127588,127590,127592,127594,127596,127598],{"class":35,"line":229},[33,127586,127587],{"class":238},"    float_format",[33,127589,242],{"class":163},[33,127591,274],{"class":54},[33,127593,54896],{"class":50},[33,127595,274],{"class":54},[33,127597,35641],{"class":167},[33,127599,127600],{"class":39},"# lock financial precision; no 19.999999999 drift\n",[33,127602,127603],{"class":35,"line":235},[33,127604,221],{"class":167},[33,127606,127607],{"class":35,"line":250},[33,127608,92],{"emptyLinePlaceholder":91},[33,127610,127611],{"class":35,"line":266},[33,127612,127613],{"class":39},"# XLSX for stakeholders who want a clickable file\n",[33,127615,127616,127619,127621,127623,127626,127628,127630,127632,127634,127636,127638,127640,127642],{"class":35,"line":290},[33,127617,127618],{"class":167},"enriched.to_excel(",[33,127620,40018],{"class":50},[33,127622,1107],{"class":163},[33,127624,127625],{"class":54}," 
\"orders_clean.xlsx\"",[33,127627,365],{"class":167},[33,127629,897],{"class":238},[33,127631,242],{"class":163},[33,127633,902],{"class":50},[33,127635,365],{"class":167},[33,127637,17351],{"class":238},[33,127639,242],{"class":163},[33,127641,17356],{"class":54},[33,127643,221],{"class":167},[33,127645,127646],{"class":35,"line":295},[33,127647,92],{"emptyLinePlaceholder":91},[33,127649,127650],{"class":35,"line":300},[33,127651,127652],{"class":39},"# Parquet for database loads and large interchange: typed, columnar, compact\n",[33,127654,127655,127658,127660,127662,127665,127667,127669,127671,127673,127675,127677,127679,127681],{"class":35,"line":317},[33,127656,127657],{"class":167},"enriched.to_parquet(",[33,127659,40018],{"class":50},[33,127661,1107],{"class":163},[33,127663,127664],{"class":54}," \"orders_clean.parquet\"",[33,127666,365],{"class":167},[33,127668,17351],{"class":238},[33,127670,242],{"class":163},[33,127672,108269],{"class":54},[33,127674,365],{"class":167},[33,127676,897],{"class":238},[33,127678,242],{"class":163},[33,127680,902],{"class":50},[33,127682,221],{"class":167},[33,127684,127685,127687,127689,127692],{"class":35,"line":332},[33,127686,13474],{"class":50},[33,127688,602],{"class":167},[33,127690,127691],{"class":54},"\"Wrote CSV, XLSX, and Parquet outputs\"",[33,127693,221],{"class":167},[14,127695,127696,127697,127699,127700,127702,127703,127705],{},"The BI-ready conventions — ",[30,127698,28142],{},", ISO dates, controlled precision, UTF-8 — are non-negotiable for clean dashboard imports; ",[940,127701,108865],{"href":108864}," covers compression, quoting, and chunked writes, and ",[940,127704,28147],{"href":28146}," addresses the single most common export bug head-on.",[18,127707,28616],{"id":28615},[14,127709,127710,127711,127714],{},"A pipeline that runs once on your laptop is a prototype; production means it runs unattended at 6 a.m. 
on a server you are not watching, leaves a trace when it fails, and survives a flaky network share that drops a connection one read in fifty. Three properties separate the two. First, a single entry point — one ",[30,127712,127713],{},"main()"," that reads, normalizes, merges, and exports — so the whole job is one process to schedule and one exit code to check. Second, logging to a file as well as stdout, with timestamps, so when the 6 a.m. run produces a wrong number you can reconstruct what it read and how many rows survived each stage. Third, retries on transient I\u002FO, because a locked Excel file or a momentarily unreachable share is not a reason to fail the whole job — it is a reason to wait two seconds and try again.",[23,127716,127718],{"className":126,"code":127717,"language":47,"meta":28,"style":28},"# pip install pandas\nimport logging\nimport time\nfrom pathlib import Path\nimport pandas as pd\n\nlogging.basicConfig(\n    level=logging.INFO,\n    format=\"%(asctime)s %(levelname)s %(message)s\",\n    handlers=[logging.FileHandler(\"pipeline.log\"), logging.StreamHandler()],\n)\nlog = logging.getLogger(\"excel_pipeline\")\n\ndef read_with_retry(path: Path, attempts: int = 3, delay: float = 2.0) -> pd.DataFrame:\n    \"\"\"Retry transient failures (locked file, network share hiccup) with backoff.\"\"\"\n    for attempt in range(1, attempts + 1):\n        try:\n            return pd.read_csv(path, dtype=\"string\")\n        except (OSError, pd.errors.ParserError) as exc:\n            log.warning(\"Read attempt %d\u002F%d failed for %s: %s\", attempt, attempts, path, exc)\n            if attempt == attempts:\n                raise\n            time.sleep(delay * attempt)   # linear backoff\n\ndef main() -> None:\n    log.info(\"Pipeline start\")\n    df = read_with_retry(Path(\"data\u002Fraw\u002Forders.csv\"))\n    log.info(\"Read %d rows\", len(df))\n    # ... 
normalize \u002F merge \u002F export ...\n    log.info(\"Pipeline complete\")\n\nif __name__ == \"__main__\":\n    main()\n",[30,127719,127720,127724,127730,127736,127746,127756,127760,127764,127776,127794,127809,127813,127826,127830,127856,127861,127883,127889,127903,127917,127944,127954,127958,127971,127975,127987,127996,128010,128028,128033,128042,128046,128058],{"__ignoreMap":28},[33,127721,127722],{"class":35,"line":36},[33,127723,8895],{"class":39},[33,127725,127726,127728],{"class":35,"line":43},[33,127727,164],{"class":163},[33,127729,184],{"class":167},[33,127731,127732,127734],{"class":35,"line":61},[33,127733,164],{"class":163},[33,127735,1689],{"class":167},[33,127737,127738,127740,127742,127744],{"class":35,"line":73},[33,127739,190],{"class":163},[33,127741,193],{"class":167},[33,127743,164],{"class":163},[33,127745,198],{"class":167},[33,127747,127748,127750,127752,127754],{"class":35,"line":88},[33,127749,164],{"class":163},[33,127751,492],{"class":167},[33,127753,495],{"class":163},[33,127755,498],{"class":167},[33,127757,127758],{"class":35,"line":95},[33,127759,92],{"emptyLinePlaceholder":91},[33,127761,127762],{"class":35,"line":101},[33,127763,232],{"class":167},[33,127765,127766,127768,127770,127772,127774],{"class":35,"line":171},[33,127767,253],{"class":238},[33,127769,242],{"class":163},[33,127771,258],{"class":167},[33,127773,1067],{"class":50},[33,127775,247],{"class":167},[33,127777,127778,127780,127782,127784,127786,127788,127790,127792],{"class":35,"line":179},[33,127779,269],{"class":238},[33,127781,242],{"class":163},[33,127783,274],{"class":54},[33,127785,277],{"class":50},[33,127787,280],{"class":50},[33,127789,283],{"class":50},[33,127791,274],{"class":54},[33,127793,247],{"class":167},[33,127795,127796,127798,127800,127803,127806],{"class":35,"line":187},[33,127797,26852],{"class":238},[33,127799,242],{"class":163},[33,127801,127802],{"class":167},"[logging.FileHandler(",[33,127804,127805],{"class":54},"\"pipeline.log\"",[33,12780
7,127808],{"class":167},"), logging.StreamHandler()],\n",[33,127810,127811],{"class":35,"line":201},[33,127812,221],{"class":167},[33,127814,127815,127817,127819,127821,127824],{"class":35,"line":206},[33,127816,28695],{"class":167},[33,127818,242],{"class":163},[33,127820,544],{"class":167},[33,127822,127823],{"class":54},"\"excel_pipeline\"",[33,127825,221],{"class":167},[33,127827,127828],{"class":35,"line":224},[33,127829,92],{"emptyLinePlaceholder":91},[33,127831,127832,127834,127837,127840,127842,127844,127846,127848,127850,127852,127854],{"class":35,"line":229},[33,127833,562],{"class":163},[33,127835,127836],{"class":46}," read_with_retry",[33,127838,127839],{"class":167},"(path: Path, attempts: ",[33,127841,1059],{"class":50},[33,127843,212],{"class":163},[33,127845,1714],{"class":50},[33,127847,1717],{"class":167},[33,127849,1720],{"class":50},[33,127851,212],{"class":163},[33,127853,1725],{"class":50},[33,127855,7668],{"class":167},[33,127857,127858],{"class":35,"line":235},[33,127859,127860],{"class":54},"    \"\"\"Retry transient failures (locked file, network share hiccup) with 
backoff.\"\"\"\n",[33,127862,127863,127865,127867,127869,127871,127873,127875,127877,127879,127881],{"class":35,"line":250},[33,127864,656],{"class":163},[33,127866,1796],{"class":167},[33,127868,662],{"class":163},[33,127870,1801],{"class":50},[33,127872,602],{"class":167},[33,127874,734],{"class":50},[33,127876,1808],{"class":167},[33,127878,1811],{"class":163},[33,127880,1814],{"class":50},[33,127882,1737],{"class":167},[33,127884,127885,127887],{"class":35,"line":266},[33,127886,670],{"class":163},[33,127888,574],{"class":167},[33,127890,127891,127893,127895,127897,127899,127901],{"class":35,"line":290},[33,127892,28782],{"class":163},[33,127894,27411],{"class":167},[33,127896,23262],{"class":238},[33,127898,242],{"class":163},[33,127900,27358],{"class":54},[33,127902,221],{"class":167},[33,127904,127905,127907,127909,127911,127913,127915],{"class":35,"line":295},[33,127906,780],{"class":163},[33,127908,17583],{"class":167},[33,127910,43079],{"class":50},[33,127912,108042],{"class":167},[33,127914,495],{"class":163},[33,127916,1855],{"class":167},[33,127918,127919,127921,127924,127926,127928,127930,127933,127935,127937,127939,127941],{"class":35,"line":300},[33,127920,28847],{"class":167},[33,127922,127923],{"class":54},"\"Read attempt ",[33,127925,916],{"class":50},[33,127927,1351],{"class":54},[33,127929,916],{"class":50},[33,127931,127932],{"class":54}," failed for ",[33,127934,309],{"class":50},[33,127936,2079],{"class":54},[33,127938,309],{"class":50},[33,127940,274],{"class":54},[33,127942,127943],{"class":167},", attempt, attempts, path, exc)\n",[33,127945,127946,127948,127950,127952],{"class":35,"line":317},[33,127947,5995],{"class":163},[33,127949,1796],{"class":167},[33,127951,1865],{"class":163},[33,127953,1868],{"class":167},[33,127955,127956],{"class":35,"line":332},[33,127957,28814],{"class":163},[33,127959,127960,127963,127965,127968],{"class":35,"line":347},[33,127961,127962],{"class":167},"            time.sleep(delay 
",[33,127964,1769],{"class":163},[33,127966,127967],{"class":167}," attempt)   ",[33,127969,127970],{"class":39},"# linear backoff\n",[33,127972,127973],{"class":35,"line":374},[33,127974,92],{"emptyLinePlaceholder":91},[33,127976,127977,127979,127981,127983,127985],{"class":35,"line":397},[33,127978,562],{"class":163},[33,127980,6636],{"class":46},[33,127982,568],{"class":167},[33,127984,571],{"class":50},[33,127986,574],{"class":167},[33,127988,127989,127991,127994],{"class":35,"line":653},[33,127990,29002],{"class":167},[33,127992,127993],{"class":54},"\"Pipeline start\"",[33,127995,221],{"class":167},[33,127997,127998,128000,128002,128005,128008],{"class":35,"line":667},[33,127999,4025],{"class":167},[33,128001,242],{"class":163},[33,128003,128004],{"class":167}," read_with_retry(Path(",[33,128006,128007],{"class":54},"\"data\u002Fraw\u002Forders.csv\"",[33,128009,371],{"class":167},[33,128011,128012,128014,128017,128019,128021,128023,128025],{"class":35,"line":675},[33,128013,29002],{"class":167},[33,128015,128016],{"class":54},"\"Read ",[33,128018,916],{"class":50},[33,128020,65937],{"class":54},[33,128022,365],{"class":167},[33,128024,928],{"class":50},[33,128026,128027],{"class":167},"(df))\n",[33,128029,128030],{"class":35,"line":689},[33,128031,128032],{"class":39},"    # ... 
normalize \u002F merge \u002F export ...\n",[33,128034,128035,128037,128040],{"class":35,"line":703},[33,128036,29002],{"class":167},[33,128038,128039],{"class":54},"\"Pipeline complete\"",[33,128041,221],{"class":167},[33,128043,128044],{"class":35,"line":714},[33,128045,92],{"emptyLinePlaceholder":91},[33,128047,128048,128050,128052,128054,128056],{"class":35,"line":723},[33,128049,2491],{"class":163},[33,128051,2494],{"class":50},[33,128053,2497],{"class":163},[33,128055,2500],{"class":54},[33,128057,574],{"class":167},[33,128059,128060],{"class":35,"line":754},[33,128061,6914],{"class":167},[14,128063,128064,128065,128067,128068,128071,128072,128075,128076,128079,128080,128082,128083,128085,128086,128088,128089,3035],{},"Schedule the entry point with ",[30,128066,2108],{}," on Linux\u002FmacOS (",[30,128069,128070],{},"0 6 * * 1 \u002Fpath\u002F.venv\u002Fbin\u002Fpython \u002Fpath\u002Fpipeline.py","), Windows Task Scheduler, or a GitHub Actions workflow on a ",[30,128073,128074],{},"schedule:"," trigger for zero-infrastructure runs. Whichever scheduler you choose, always invoke the virtualenv's own interpreter by absolute path rather than relying on an activated shell — a cron job runs with a minimal environment that will not have your ",[30,128077,128078],{},"source .venv\u002Fbin\u002Factivate"," and will silently fall back to the system Python with none of your pinned dependencies. Capture the exit code: a non-zero status from ",[30,128081,127713],{}," should fail the scheduled job loudly so an alert fires, rather than logging an error nobody reads. 
End-to-end scheduling, structured logging, and alerting patterns live in ",[940,128084,5],{"href":26465},", and the ",[940,128087,6936],{"href":6935}," guide ties styling and pivots into a CI\u002FCD-scheduled report — including the recurring ",[940,128090,128091],{"href":97862},"openpyxl read-only mode error",[18,128093,29071],{"id":29070},[4273,128095,128096,128106],{},[4276,128097,128098],{},[4279,128099,128100,128102,128104],{},[4282,128101,29080],{},[4282,128103,4287],{},[4282,128105,4290],{},[4292,128107,128108,128128,128147,128168,128185],{},[4279,128109,128110,128113,128122],{},[4297,128111,128112],{},"Leading zeros vanish from IDs\u002Fzips",[4297,128114,128115,128117,128118,36661,128120],{},[30,128116,57237],{}," infers numeric dtype and drops ",[30,128119,125787],{},[30,128121,125791],{},[4297,128123,14408,128124,128127],{},[30,128125,128126],{},"dtype={\"col\": \"string\"}"," for any identifier column",[4279,128129,128130,128133,128141],{},[4297,128131,128132],{},"Phantom first column in exported CSV",[4297,128134,128135,128137,128138,128140],{},[30,128136,118265],{}," writes the ",[30,128139,118019],{}," by default",[4297,128142,96806,128143,128146],{},[30,128144,128145],{},"to_csv(..., index=False)"," unless the index is real data",[4279,128148,128149,128154,128162],{},[4297,128150,128151,128153],{},[30,128152,70953],{}," on a large Excel file",[4297,128155,128156,128159,128160],{},[30,128157,128158],{},"pd.read_excel"," loads the whole workbook into RAM and has no ",[30,128161,21944],{},[4297,128163,128164,128165,128167],{},"Convert to Parquet\u002FCSV first, then stream; or use the stdlib ",[30,128166,107436],{}," module for plain text",[4279,128169,128170,128173,128179],{},[4297,128171,128172],{},"Join silently multiplies rows",[4297,128174,128175,128176,128178],{},"Non-unique key on the right side of a ",[30,128177,27844],{}," causes fan-out",[4297,128180,4358,128181,128184],{},[30,128182,128183],{},"validate=\"many_to_one\""," and assert row 
counts before\u002Fafter",[4279,128186,128187,128192,128195],{},[4297,128188,128189,128191],{},[30,128190,53911],{}," mid-file",[4297,128193,128194],{},"Legacy export is Windows-1252, not UTF-8",[4297,128196,128197,128198,128200],{},"Retry with ",[30,128199,59121],{}," or detect encoding before reading",[18,128202,29184],{"id":29183},[14,128204,128205,128208],{},[1974,128206,128207],{},"Should I use pandas or openpyxl?","\nUse pandas for anything analytical — cleaning, joins, aggregation, type coercion. Switch to openpyxl (or xlsxwriter for write-only speed) only when you must control cell styling, formulas, conditional formats, or charts in the output workbook. Most pipelines read and transform in pandas, then hand the final frame to openpyxl purely for formatting on export.",[14,128210,128211,128214,128216,128217,8363,128220,128222],{},[1974,128212,128213],{},"How do I process an Excel file larger than available RAM?",[30,128215,128158],{}," cannot chunk — it loads the entire workbook. Convert the data to CSV or Parquet once, then stream the CSV with ",[30,128218,128219],{},"chunksize=",[30,128221,57237],{},", or read columns selectively from Parquet with pyarrow. For genuinely large workloads, evaluate Polars or Dask before forcing pandas through an out-of-core workaround.",[14,128224,128225,128228,128229,36661,128232,1351,128234,128236,128237,128240],{},[1974,128226,128227],{},"Why do my SKU or zip codes lose their leading zeros?","\npandas inferred an integer dtype and discarded them on read. The damage happens at ingestion, so the fix must too: pass ",[30,128230,128231],{},"dtype={\"sku\": \"string\", \"zip\": \"string\"}",[30,128233,57237],{},[30,128235,57240],{},". 
Re-padding with ",[30,128238,128239],{},"str.zfill"," afterward is fragile because you no longer know the original width.",[14,128242,128243,128246,128247,128249,128250,128252,128253,128255],{},[1974,128244,128245],{},"How do I keep dates and money consistent for a BI tool?","\nCoerce dates to real datetimes on read with ",[30,128248,102641],{},", export them as ISO 8601 via ",[30,128251,121305],{},", and lock currency precision with ",[30,128254,121288],{},". ISO dates and explicit precision remove every locale-dependent parsing ambiguity downstream tools introduce.",[14,128257,128258,128261],{},[1974,128259,128260],{},"Can Python fully replace VBA for spreadsheet automation?","\nYes, for nearly all batch and reporting work — Python gives you version control, testing, and unattended scheduling that VBA cannot. VBA remains relevant only for macros embedded inside a workbook's interactive UI or locked-down environments where no external runtime may be installed.",[18,128263,6918],{"id":6917},[4211,128265,128266,128271,128276,128281,128286,128291],{},[4214,128267,128268,128270],{},[940,128269,99577],{"href":99576}," — engine selection, multi-sheet parsing, and metadata extraction in depth.",[4214,128272,128273,128275],{},[940,128274,9599],{"href":9598}," — encoding, regex normalization, and type-coercion safeguards.",[4214,128277,128278,128280],{},[940,128279,28119],{"href":28118}," — concat vs merge vs join and schema alignment.",[4214,128282,128283,128285],{},[940,128284,6936],{"href":6935}," — styled, pivoted, CI\u002FCD-scheduled report output.",[4214,128287,128288,128290],{},[940,128289,102074],{"href":102073}," — formatting and charts on the write side.",[4214,128292,128293,128295],{},[940,128294,108865],{"href":108864}," — BI-ready serialization, compression, and precision control.",[14,128297,128298,128299,365,128301,128303,128304,3035],{},"Adjacent formats follow the same pipeline shape: ",[940,128300,6943],{"href":6942},[940,128302,26263],{"href":26262},", 
and end-to-end orchestration in ",[940,128305,6951],{"href":6950},[14,128307,6947,128308,3035],{},[940,128309,29264],{"href":1351},[6953,128311,128312],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .s691h, html code.shiki .s691h{--shiki-default:#22863A;--shiki-default-font-weight:bold}",{"title":28,"searchDepth":43,"depth":43,"links":128314},[128315,128316,128317,128318,128323,128324,128325,128326,128327,128328,128329],{"id":125378,"depth":43,"text":125379},{"id":26468,"depth":43,"text":26469},{"id":26618,"depth":43,"text":26619},{"id":125771,"depth":43,"text":125772,"children":128319},[128320,128321,128322],{"id":125778,"depth":61,"text":125779},{"id":126081,"depth":61,"text":126082},{"id":126413,"depth":61,"text":126414},{"id":126724,"depth":43,"text":126725},{"id":127064,"depth":43,"text":127065},{"id":127430,"depth":43,"text":127431},{"id":28615,"depth":43,"text":28616},{"id":29070,"depth":43,"text":29071},{"id":29183,"depth":43,"text":29184},{"id":6917,"depth":43,"text":6918},"Replace manual spreadsheet workflows 
with reliable Python automation. Covers pandas, openpyxl, xlsxwriter, the csv module, and BI-ready export pipelines.",{},"\u002Fpython-for-excel-csv-data-processing",{"title":26258,"description":128330},"Python Excel & CSV Data Processing","python-for-excel-csv-data-processing\u002Findex",[99614,107436,47,9630,22009],"Izauopku1OnfRaUqMxXmFG-Fi73qzDuNKbVoqOtEz4M",{"id":128339,"title":128340,"body":128341,"breadcrumbTitle":131217,"canonical":6977,"date":6977,"description":107412,"draft":6980,"extension":6981,"image":6977,"meta":131218,"navigation":91,"path":131219,"robots":6977,"seo":131220,"seoTitle":131224,"stem":131225,"tags":6977,"updatedAt":6977,"__hash__":131226},"content\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Fconverting-excel-to-json-with-python\u002Findex.md","Converting Excel to JSON with Python",{"type":7,"value":128342,"toc":131200},[128343,128346,128371,128373,128395,128463,128472,128475,128507,128509,128528,128531,128685,128689,128695,128848,128877,128881,128893,128998,129009,129056,129060,129064,129078,129322,129326,129332,129584,129598,129601,129800,129803,129807,129812,130035,130044,130050,130054,130063,130375,130379,130382,130682,130689,130691,130912,130915,130974,130986,130988,131011,131037,131050,131068,131070,131167,131169,131193,131197],[10,128344,128340],{"id":128345},"converting-excel-to-json-with-python",[14,128347,128348,128349,128351,128352,20859,128355,128358,128359,128362,128363,128365,128366,128368,128369,3035],{},"Passing ",[30,128350,128158],{}," output directly to Python's ",[30,128353,128354],{},"json.dumps",[30,128356,128357],{},"TypeError: Object of type Timestamp is not JSON serializable"," because the standard ",[30,128360,128361],{},"json"," module has no handler for NumPy or pandas types. 
This guide shows how to read an Excel workbook with ",[940,128364,9630],{"href":99576},", pick the right ",[30,128367,22169],{}," option, fix serialization errors, and handle nested structures and multi-sheet workbooks. For consolidating several workbooks before exporting, see ",[940,128370,28119],{"href":28118},[18,128372,7021],{"id":7020},[14,128374,128375,128377,128378,128380,128381,128384,128385,2012,128388,128391,128392,128394],{},[30,128376,9630],{}," auto-infers column types on ",[30,128379,57240],{},". Date columns become ",[30,128382,128383],{},"datetime64[ns]","; empty cells become ",[30,128386,128387],{},"numpy.nan",[30,128389,128390],{},"pandas.NaT",". Neither type is recognized by the RFC 8259 JSON spec, so ",[30,128393,128354],{}," raises immediately.",[23,128396,128398],{"className":126,"code":128397,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nimport json\n\ndf = pd.read_excel(\"report.xlsx\", engine=\"openpyxl\")\njson.dumps(df.to_dict(orient=\"records\"))\n# TypeError: Object of type Timestamp is not JSON 
serializable\n",[30,128399,128400,128404,128414,128420,128424,128445,128458],{"__ignoreMap":28},[33,128401,128402],{"class":35,"line":36},[33,128403,3952],{"class":39},[33,128405,128406,128408,128410,128412],{"class":35,"line":43},[33,128407,164],{"class":163},[33,128409,492],{"class":167},[33,128411,495],{"class":163},[33,128413,498],{"class":167},[33,128415,128416,128418],{"class":35,"line":61},[33,128417,164],{"class":163},[33,128419,3081],{"class":167},[33,128421,128422],{"class":35,"line":73},[33,128423,92],{"emptyLinePlaceholder":91},[33,128425,128426,128428,128430,128432,128435,128437,128439,128441,128443],{"class":35,"line":88},[33,128427,13459],{"class":167},[33,128429,242],{"class":163},[33,128431,126254],{"class":167},[33,128433,128434],{"class":54},"\"report.xlsx\"",[33,128436,365],{"class":167},[33,128438,17351],{"class":238},[33,128440,242],{"class":163},[33,128442,17356],{"class":54},[33,128444,221],{"class":167},[33,128446,128447,128450,128452,128454,128456],{"class":35,"line":95},[33,128448,128449],{"class":167},"json.dumps(df.to_dict(",[33,128451,22169],{"class":238},[33,128453,242],{"class":163},[33,128455,21222],{"class":54},[33,128457,371],{"class":167},[33,128459,128460],{"class":35,"line":101},[33,128461,128462],{"class":39},"# TypeError: Object of type Timestamp is not JSON serializable\n",[14,128464,9574,128465,10065,128468,128471],{},[30,128466,128467],{},"df.dtypes",[30,128469,128470],{},"df.isna().sum()"," first to locate offending columns before writing a conversion script.",[14,128473,128474],{},"The three types most likely to break serialization are:",[4211,128476,128477,128482,128493],{},[4214,128478,128479,128481],{},[30,128480,128383],{}," — pandas timestamp, no JSON equivalent without conversion",[4214,128483,128484,128486,128487,128489,128490,128492],{},[30,128485,102445],{}," containing ",[30,128488,128387],{}," — RFC 8259 does not allow ",[30,128491,8884],{}," as a JSON 
value",[4214,128494,128495,128497,128498,365,128501,128504,128505],{},[30,128496,11888],{}," columns with mixed types — could contain ",[30,128499,128500],{},"pd.Timestamp",[30,128502,128503],{},"numpy.int64",", or plain ",[30,128506,571],{},[18,128508,21],{"id":20},[23,128510,128512],{"className":25,"code":128511,"language":27,"meta":28,"style":28},"# pip install pandas openpyxl\npip install pandas openpyxl\n",[30,128513,128514,128518],{"__ignoreMap":28},[33,128515,128516],{"class":35,"line":36},[33,128517,3952],{"class":39},[33,128519,128520,128522,128524,128526],{"class":35,"line":43},[33,128521,76],{"class":46},[33,128523,79],{"class":54},[33,128525,16183],{"class":54},[33,128527,95887],{"class":54},[14,128529,128530],{},"Create a test workbook to follow along:",[23,128532,128534],{"className":126,"code":128533,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nfrom pathlib import Path\nfrom datetime import date\n\npd.DataFrame({\n    \"order_id\": [1001, 1002, 1003],\n    \"customer\":  [\"Acme\", \"Globex\", None],\n    \"order_date\": pd.to_datetime([\"2026-01-15\", \"2026-02-03\", \"2026-03-22\"]),\n    \"amount\":    [1250.50, 980.00, 2100.75],\n}).to_excel(Path(\"report.xlsx\"), index=False, 
engine=\"openpyxl\")\n",[30,128535,128536,128540,128550,128560,128571,128575,128580,128598,128618,128639,128660],{"__ignoreMap":28},[33,128537,128538],{"class":35,"line":36},[33,128539,3952],{"class":39},[33,128541,128542,128544,128546,128548],{"class":35,"line":43},[33,128543,164],{"class":163},[33,128545,492],{"class":167},[33,128547,495],{"class":163},[33,128549,498],{"class":167},[33,128551,128552,128554,128556,128558],{"class":35,"line":61},[33,128553,190],{"class":163},[33,128555,193],{"class":167},[33,128557,164],{"class":163},[33,128559,198],{"class":167},[33,128561,128562,128564,128566,128568],{"class":35,"line":73},[33,128563,190],{"class":163},[33,128565,3881],{"class":167},[33,128567,164],{"class":163},[33,128569,128570],{"class":167}," date\n",[33,128572,128573],{"class":35,"line":88},[33,128574,92],{"emptyLinePlaceholder":91},[33,128576,128577],{"class":35,"line":95},[33,128578,128579],{"class":167},"pd.DataFrame({\n",[33,128581,128582,128584,128586,128588,128590,128592,128594,128596],{"class":35,"line":101},[33,128583,120337],{"class":54},[33,128585,12426],{"class":167},[33,128587,120342],{"class":50},[33,128589,365],{"class":167},[33,128591,120347],{"class":50},[33,128593,365],{"class":167},[33,128595,120352],{"class":50},[33,128597,8935],{"class":167},[33,128599,128600,128602,128604,128607,128609,128612,128614,128616],{"class":35,"line":171},[33,128601,120359],{"class":54},[33,128603,11818],{"class":167},[33,128605,128606],{"class":54},"\"Acme\"",[33,128608,365],{"class":167},[33,128610,128611],{"class":54},"\"Globex\"",[33,128613,365],{"class":167},[33,128615,571],{"class":50},[33,128617,8935],{"class":167},[33,128619,128620,128623,128625,128627,128629,128632,128634,128637],{"class":35,"line":179},[33,128621,128622],{"class":54},"    
\"order_date\"",[33,128624,120413],{"class":167},[33,128626,12407],{"class":54},[33,128628,365],{"class":167},[33,128630,128631],{"class":54},"\"2026-02-03\"",[33,128633,365],{"class":167},[33,128635,128636],{"class":54},"\"2026-03-22\"",[33,128638,12871],{"class":167},[33,128640,128641,128643,128645,128648,128650,128653,128655,128658],{"class":35,"line":187},[33,128642,120381],{"class":54},[33,128644,11787],{"class":167},[33,128646,128647],{"class":50},"1250.50",[33,128649,365],{"class":167},[33,128651,128652],{"class":50},"980.00",[33,128654,365],{"class":167},[33,128656,128657],{"class":50},"2100.75",[33,128659,8935],{"class":167},[33,128661,128662,128665,128667,128669,128671,128673,128675,128677,128679,128681,128683],{"class":35,"line":201},[33,128663,128664],{"class":167},"}).to_excel(Path(",[33,128666,128434],{"class":54},[33,128668,18525],{"class":167},[33,128670,897],{"class":238},[33,128672,242],{"class":163},[33,128674,902],{"class":50},[33,128676,365],{"class":167},[33,128678,17351],{"class":238},[33,128680,242],{"class":163},[33,128682,17356],{"class":54},[33,128684,221],{"class":167},[18,128686,128688],{"id":128687},"step-1-read-and-inspect","Step 1 — Read and Inspect",[14,128690,128691,128692,128694],{},"Always inspect dtypes and null counts before attempting serialization. 
It takes two lines and prevents mysterious ",[30,128693,86188],{}," failures later.",[23,128696,128698],{"className":126,"code":128697,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nSOURCE = Path(\"report.xlsx\")\n\ntry:\n    df = pd.read_excel(SOURCE, engine=\"openpyxl\")\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\")\n\nprint(df.dtypes)\n# order_id               int64\n# customer              object\n# order_date    datetime64[ns]   ← will fail json.dumps\n# amount               float64\n\nprint(df.isna().sum())\n# customer    1  ← one null → NaN in a float column raises ValueError in json.dumps\n",[30,128699,128700,128704,128714,128724,128728,128740,128744,128750,128770,128780,128802,128806,128812,128817,128822,128827,128832,128836,128843],{"__ignoreMap":28},[33,128701,128702],{"class":35,"line":36},[33,128703,3952],{"class":39},[33,128705,128706,128708,128710,128712],{"class":35,"line":43},[33,128707,190],{"class":163},[33,128709,193],{"class":167},[33,128711,164],{"class":163},[33,128713,198],{"class":167},[33,128715,128716,128718,128720,128722],{"class":35,"line":61},[33,128717,164],{"class":163},[33,128719,492],{"class":167},[33,128721,495],{"class":163},[33,128723,498],{"class":167},[33,128725,128726],{"class":35,"line":73},[33,128727,92],{"emptyLinePlaceholder":91},[33,128729,128730,128732,128734,128736,128738],{"class":35,"line":88},[33,128731,86272],{"class":50},[33,128733,212],{"class":163},[33,128735,215],{"class":167},[33,128737,128434],{"class":54},[33,128739,221],{"class":167},[33,128741,128742],{"class":35,"line":95},[33,128743,92],{"emptyLinePlaceholder":91},[33,128745,128746,128748],{"class":35,"line":101},[33,128747,35574],{"class":163},[33,128749,574],{"class":167},[33,128751,128752,128754,128756,128758,128760,128762,128764,128766,128768],{"class":35,"line":171},[33,128753,4025],{"class":167},[33,128755,242],{"class":163},[33,128757,126254]
,{"class":167},[33,128759,86272],{"class":50},[33,128761,365],{"class":167},[33,128763,17351],{"class":238},[33,128765,242],{"class":163},[33,128767,17356],{"class":54},[33,128769,221],{"class":167},[33,128771,128772,128774,128776,128778],{"class":35,"line":179},[33,128773,35726],{"class":163},[33,128775,2945],{"class":50},[33,128777,1852],{"class":163},[33,128779,1855],{"class":167},[33,128781,128782,128784,128786,128788,128790,128792,128794,128796,128798,128800],{"class":35,"line":187},[33,128783,35742],{"class":163},[33,128785,16617],{"class":50},[33,128787,602],{"class":167},[33,128789,4059],{"class":163},[33,128791,15677],{"class":54},[33,128793,1115],{"class":50},[33,128795,6565],{"class":167},[33,128797,1121],{"class":50},[33,128799,274],{"class":54},[33,128801,221],{"class":167},[33,128803,128804],{"class":35,"line":201},[33,128805,92],{"emptyLinePlaceholder":91},[33,128807,128808,128810],{"class":35,"line":206},[33,128809,13474],{"class":50},[33,128811,108834],{"class":167},[33,128813,128814],{"class":35,"line":224},[33,128815,128816],{"class":39},"# order_id               int64\n",[33,128818,128819],{"class":35,"line":229},[33,128820,128821],{"class":39},"# customer              object\n",[33,128823,128824],{"class":35,"line":235},[33,128825,128826],{"class":39},"# order_date    datetime64[ns]   ← will fail json.dumps\n",[33,128828,128829],{"class":35,"line":250},[33,128830,128831],{"class":39},"# amount               float64\n",[33,128833,128834],{"class":35,"line":266},[33,128835,92],{"emptyLinePlaceholder":91},[33,128837,128838,128840],{"class":35,"line":290},[33,128839,13474],{"class":50},[33,128841,128842],{"class":167},"(df.isna().sum())\n",[33,128844,128845],{"class":35,"line":295},[33,128846,128847],{"class":39},"# customer    1  ← one null → NaN in a float column raises ValueError in 
json.dumps\n",[14,128849,39550,128850,128852,128853,128855,128856,128858,128859,20859,128861,128864,128865,128867,128868,128870,128871,128874,128875,3035],{},[30,128851,22395],{}," parameter is required for ",[30,128854,26542],{}," files. Using the old ",[30,128857,125595],{}," engine for ",[30,128860,26542],{},[30,128862,128863],{},"XLRDError"," — xlrd 2.x dropped ",[30,128866,26542],{}," support. If you work with ",[30,128869,112255],{}," (old binary format), use ",[30,128872,128873],{},"engine=\"xlrd\"",". For more on engine selection and sheet options, see ",[940,128876,99577],{"href":99576},[18,128878,128880],{"id":128879},"step-2-choose-an-orient","Step 2 — Choose an orient",[14,128882,128883,10065,128886,128889,128890,128892],{},[30,128884,128885],{},"DataFrame.to_json(orient=...)",[30,128887,128888],{},"DataFrame.to_dict(orient=...)"," both take an ",[30,128891,22169],{}," argument that controls the JSON shape.",[4273,128894,128895,128907],{},[4276,128896,128897],{},[4279,128898,128899,128902,128905],{},[4282,128900,22169],{"align":128901},"left",[4282,128903,128904],{"align":128901},"JSON shape",[4282,128906,64975],{"align":128901},[4292,128908,128909,128923,128941,128955,128970,128985],{},[4279,128910,128911,128915,128920],{},[4297,128912,128913],{"align":128901},[30,128914,21222],{},[4297,128916,128917],{"align":128901},[30,128918,128919],{},"[{col: val, ...}, ...]",[4297,128921,128922],{"align":128901},"REST APIs, most consumers",[4279,128924,128925,128930,128935],{},[4297,128926,128927],{"align":128901},[30,128928,128929],{},"\"split\"",[4297,128931,128932],{"align":128901},[30,128933,128934],{},"{columns: [...], data: [[...]]}",[4297,128936,128937,128938],{"align":128901},"Compact transfer, reconstruct with ",[30,128939,128940],{},"pd.read_json",[4279,128942,128943,128947,128952],{},[4297,128944,128945],{"align":128901},[30,128946,119616],{},[4297,128948,128949],{"align":128901},[30,128950,128951],{},"{row_index: {col: 
val}}",[4297,128953,128954],{"align":128901},"Keyed lookup by row index",[4279,128956,128957,128962,128967],{},[4297,128958,128959],{"align":128901},[30,128960,128961],{},"\"columns\"",[4297,128963,128964],{"align":128901},[30,128965,128966],{},"{col: {row_index: val}}",[4297,128968,128969],{"align":128901},"Column-oriented stores",[4279,128971,128972,128977,128982],{},[4297,128973,128974],{"align":128901},[30,128975,128976],{},"\"values\"",[4297,128978,128979],{"align":128901},[30,128980,128981],{},"[[val, ...], ...]",[4297,128983,128984],{"align":128901},"Minimal size, no header",[4279,128986,128987,128992,128995],{},[4297,128988,128989],{"align":128901},[30,128990,128991],{},"\"table\"",[4297,128993,128994],{"align":128901},"JSON Table Schema + data",[4297,128996,128997],{"align":128901},"Self-describing, pandas round-trip",[14,128999,129000,129002,129003,129005,129006,3035],{},[30,129001,21222],{}," is the right default for API output. Use ",[30,129004,128991],{}," if you need to reconstruct exact dtypes with ",[30,129007,129008],{},"pd.read_json(orient=\"table\")",[14,129010,39550,129011,129013,129014,10065,129016,129018,129019,10065,129022,129025,129026,102442,129028,129031,129032,129035,129036,129038,129039,129042,129043,129045,129046,129049,129050,92982,129053,3035],{},[30,129012,22169],{}," also affects how ",[30,129015,8884],{},[30,129017,571],{}," appear in the output. With ",[30,129020,129021],{},"orient=\"records\"",[30,129023,129024],{},"df.to_json()",", pandas renders ",[30,129027,8884],{},[30,129029,129030],{},"null"," automatically — no manual replacement needed. With ",[30,129033,129034],{},"json.dumps(df.to_dict())",", pandas ",[30,129037,8884],{}," values survive as Python ",[30,129040,129041],{},"float('nan')",", which the ",[30,129044,128361],{}," module by default emits as the bare token NaN (invalid under RFC 8259) and, when called with allow_nan=False, rejects with ",[30,129047,129048],{},"ValueError: Out of range float values are not JSON compliant",". 
The fix is ",[30,129051,129052],{},"df.where(df.notna(), other=None)",[30,129054,129055],{},"to_dict",[18,129057,129059],{"id":129058},"step-3-fix-serialization-errors","Step 3 — Fix Serialization Errors",[424,129061,129063],{"id":129062},"option-a-use-to_json-simplest","Option A — use to_json() (simplest)",[14,129065,129066,129069,129070,129073,129074,129077],{},[30,129067,129068],{},"DataFrame.to_json"," handles NumPy and pandas types automatically. The ",[30,129071,129072],{},"date_format=\"iso\""," argument converts ",[30,129075,129076],{},"Timestamp"," to ISO 8601 strings.",[23,129079,129081],{"className":126,"code":129080,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nSOURCE = Path(\"report.xlsx\")\nDEST   = Path(\"output.json\")\n\ntry:\n    df = pd.read_excel(SOURCE, engine=\"openpyxl\")\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\")\n\n# Replace NaN with None so JSON gets null, not NaN\ndf = df.where(df.notna(), other=None)\n\ndf.to_json(\n    DEST,\n    orient=\"records\",\n    date_format=\"iso\",   # Timestamps → \"2026-01-15T00:00:00\"\n    indent=2,\n    force_ascii=False,   # preserve non-ASCII characters\n)\nprint(f\"Wrote {len(df)} records to 
{DEST}\")\n",[30,129082,129083,129087,129097,129107,129111,129123,129137,129141,129147,129167,129177,129199,129203,129208,129226,129230,129235,129242,129253,129267,129278,129292,129296],{"__ignoreMap":28},[33,129084,129085],{"class":35,"line":36},[33,129086,3952],{"class":39},[33,129088,129089,129091,129093,129095],{"class":35,"line":43},[33,129090,190],{"class":163},[33,129092,193],{"class":167},[33,129094,164],{"class":163},[33,129096,198],{"class":167},[33,129098,129099,129101,129103,129105],{"class":35,"line":61},[33,129100,164],{"class":163},[33,129102,492],{"class":167},[33,129104,495],{"class":163},[33,129106,498],{"class":167},[33,129108,129109],{"class":35,"line":73},[33,129110,92],{"emptyLinePlaceholder":91},[33,129112,129113,129115,129117,129119,129121],{"class":35,"line":88},[33,129114,86272],{"class":50},[33,129116,212],{"class":163},[33,129118,215],{"class":167},[33,129120,128434],{"class":54},[33,129122,221],{"class":167},[33,129124,129125,129128,129130,129132,129135],{"class":35,"line":95},[33,129126,129127],{"class":50},"DEST",[33,129129,21012],{"class":163},[33,129131,215],{"class":167},[33,129133,129134],{"class":54},"\"output.json\"",[33,129136,221],{"class":167},[33,129138,129139],{"class":35,"line":101},[33,129140,92],{"emptyLinePlaceholder":91},[33,129142,129143,129145],{"class":35,"line":171},[33,129144,35574],{"class":163},[33,129146,574],{"class":167},[33,129148,129149,129151,129153,129155,129157,129159,129161,129163,129165],{"class":35,"line":179},[33,129150,4025],{"class":167},[33,129152,242],{"class":163},[33,129154,126254],{"class":167},[33,129156,86272],{"class":50},[33,129158,365],{"class":167},[33,129160,17351],{"class":238},[33,129162,242],{"class":163},[33,129164,17356],{"class":54},[33,129166,221],{"class":167},[33,129168,129169,129171,129173,129175],{"class":35,"line":187},[33,129170,35726],{"class":163},[33,129172,2945],{"class":50},[33,129174,1852],{"class":163},[33,129176,1855],{"class":167},[33,129178,129179,129181,129183,129
185,129187,129189,129191,129193,129195,129197],{"class":35,"line":201},[33,129180,35742],{"class":163},[33,129182,16617],{"class":50},[33,129184,602],{"class":167},[33,129186,4059],{"class":163},[33,129188,15677],{"class":54},[33,129190,1115],{"class":50},[33,129192,6565],{"class":167},[33,129194,1121],{"class":50},[33,129196,274],{"class":54},[33,129198,221],{"class":167},[33,129200,129201],{"class":35,"line":206},[33,129202,92],{"emptyLinePlaceholder":91},[33,129204,129205],{"class":35,"line":224},[33,129206,129207],{"class":39},"# Replace NaN with None so JSON gets null, not NaN\n",[33,129209,129210,129212,129214,129217,129220,129222,129224],{"class":35,"line":229},[33,129211,13459],{"class":167},[33,129213,242],{"class":163},[33,129215,129216],{"class":167}," df.where(df.notna(), ",[33,129218,129219],{"class":238},"other",[33,129221,242],{"class":163},[33,129223,571],{"class":50},[33,129225,221],{"class":167},[33,129227,129228],{"class":35,"line":235},[33,129229,92],{"emptyLinePlaceholder":91},[33,129231,129232],{"class":35,"line":250},[33,129233,129234],{"class":167},"df.to_json(\n",[33,129236,129237,129240],{"class":35,"line":266},[33,129238,129239],{"class":50},"    DEST",[33,129241,247],{"class":167},[33,129243,129244,129247,129249,129251],{"class":35,"line":290},[33,129245,129246],{"class":238},"    orient",[33,129248,242],{"class":163},[33,129250,21222],{"class":54},[33,129252,247],{"class":167},[33,129254,129255,129257,129259,129262,129264],{"class":35,"line":295},[33,129256,127569],{"class":238},[33,129258,242],{"class":163},[33,129260,129261],{"class":54},"\"iso\"",[33,129263,1166],{"class":167},[33,129265,129266],{"class":39},"# Timestamps → \"2026-01-15T00:00:00\"\n",[33,129268,129269,129272,129274,129276],{"class":35,"line":300},[33,129270,129271],{"class":238},"    
indent",[33,129273,242],{"class":163},[33,129275,1533],{"class":50},[33,129277,247],{"class":167},[33,129279,129280,129283,129285,129287,129289],{"class":35,"line":317},[33,129281,129282],{"class":238},"    force_ascii",[33,129284,242],{"class":163},[33,129286,902],{"class":50},[33,129288,1166],{"class":167},[33,129290,129291],{"class":39},"# preserve non-ASCII characters\n",[33,129293,129294],{"class":35,"line":332},[33,129295,221],{"class":167},[33,129297,129298,129300,129302,129304,129306,129308,129310,129312,129315,129318,129320],{"class":35,"line":347},[33,129299,13474],{"class":50},[33,129301,602],{"class":167},[33,129303,4059],{"class":163},[33,129305,913],{"class":54},[33,129307,4065],{"class":50},[33,129309,4068],{"class":167},[33,129311,1121],{"class":50},[33,129313,129314],{"class":54}," records to ",[33,129316,129317],{"class":50},"{DEST}",[33,129319,274],{"class":54},[33,129321,221],{"class":167},[424,129323,129325],{"id":129324},"option-b-custom-encoder-for-jsondumps","Option B — custom encoder for json.dumps()",[14,129327,129328,129329,129331],{},"When you need ",[30,129330,128354],{}," (e.g., to embed the JSON in a larger dict), write a small encoder:",[23,129333,129335],{"className":126,"code":129334,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport json\nimport pandas as pd\n\nSOURCE = Path(\"report.xlsx\")\n\ndf = pd.read_excel(SOURCE, engine=\"openpyxl\")\ndf = df.where(df.notna(), other=None)   # NaN → None → JSON null\n\nclass PandasEncoder(json.JSONEncoder):\n    def default(self, obj):\n        if isinstance(obj, (pd.Timestamp,)):\n            return obj.isoformat()       # \"2026-01-15T00:00:00\"\n        if hasattr(obj, \"item\"):         # numpy scalar (int64, float64, etc.)\n            return obj.item()\n        return super().default(obj)\n\nrecords = df.to_dict(orient=\"records\")\npayload = json.dumps({\"data\": records, \"count\": len(records)}, cls=PandasEncoder, 
indent=2)\nprint(payload[:200])\n",[30,129336,129337,129341,129351,129357,129367,129371,129383,129387,129407,129426,129430,129449,129459,129468,129478,129496,129503,129513,129517,129534,129573],{"__ignoreMap":28},[33,129338,129339],{"class":35,"line":36},[33,129340,3952],{"class":39},[33,129342,129343,129345,129347,129349],{"class":35,"line":43},[33,129344,190],{"class":163},[33,129346,193],{"class":167},[33,129348,164],{"class":163},[33,129350,198],{"class":167},[33,129352,129353,129355],{"class":35,"line":61},[33,129354,164],{"class":163},[33,129356,3081],{"class":167},[33,129358,129359,129361,129363,129365],{"class":35,"line":73},[33,129360,164],{"class":163},[33,129362,492],{"class":167},[33,129364,495],{"class":163},[33,129366,498],{"class":167},[33,129368,129369],{"class":35,"line":88},[33,129370,92],{"emptyLinePlaceholder":91},[33,129372,129373,129375,129377,129379,129381],{"class":35,"line":95},[33,129374,86272],{"class":50},[33,129376,212],{"class":163},[33,129378,215],{"class":167},[33,129380,128434],{"class":54},[33,129382,221],{"class":167},[33,129384,129385],{"class":35,"line":101},[33,129386,92],{"emptyLinePlaceholder":91},[33,129388,129389,129391,129393,129395,129397,129399,129401,129403,129405],{"class":35,"line":171},[33,129390,13459],{"class":167},[33,129392,242],{"class":163},[33,129394,126254],{"class":167},[33,129396,86272],{"class":50},[33,129398,365],{"class":167},[33,129400,17351],{"class":238},[33,129402,242],{"class":163},[33,129404,17356],{"class":54},[33,129406,221],{"class":167},[33,129408,129409,129411,129413,129415,129417,129419,129421,129423],{"class":35,"line":179},[33,129410,13459],{"class":167},[33,129412,242],{"class":163},[33,129414,129216],{"class":167},[33,129416,129219],{"class":238},[33,129418,242],{"class":163},[33,129420,571],{"class":50},[33,129422,12000],{"class":167},[33,129424,129425],{"class":39},"# NaN → None → JSON 
null\n",[33,129427,129428],{"class":35,"line":187},[33,129429,92],{"emptyLinePlaceholder":91},[33,129431,129432,129435,129438,129440,129442,129444,129447],{"class":35,"line":201},[33,129433,129434],{"class":163},"class",[33,129436,129437],{"class":46}," PandasEncoder",[33,129439,602],{"class":167},[33,129441,128361],{"class":46},[33,129443,3035],{"class":167},[33,129445,129446],{"class":46},"JSONEncoder",[33,129448,1737],{"class":167},[33,129450,129451,129453,129456],{"class":35,"line":206},[33,129452,1742],{"class":163},[33,129454,129455],{"class":46}," default",[33,129457,129458],{"class":167},"(self, obj):\n",[33,129460,129461,129463,129465],{"class":35,"line":224},[33,129462,8221],{"class":163},[33,129464,36538],{"class":50},[33,129466,129467],{"class":167},"(obj, (pd.Timestamp,)):\n",[33,129469,129470,129472,129475],{"class":35,"line":229},[33,129471,28782],{"class":163},[33,129473,129474],{"class":167}," obj.isoformat()       ",[33,129476,129477],{"class":39},"# \"2026-01-15T00:00:00\"\n",[33,129479,129480,129482,129484,129487,129490,129493],{"class":35,"line":235},[33,129481,8221],{"class":163},[33,129483,75961],{"class":50},[33,129485,129486],{"class":167},"(obj, ",[33,129488,129489],{"class":54},"\"item\"",[33,129491,129492],{"class":167},"):         ",[33,129494,129495],{"class":39},"# numpy scalar (int64, float64, etc.)\n",[33,129497,129498,129500],{"class":35,"line":250},[33,129499,28782],{"class":163},[33,129501,129502],{"class":167}," obj.item()\n",[33,129504,129505,129507,129510],{"class":35,"line":266},[33,129506,1659],{"class":163},[33,129508,129509],{"class":50}," super",[33,129511,129512],{"class":167},"().default(obj)\n",[33,129514,129515],{"class":35,"line":290},[33,129516,92],{"emptyLinePlaceholder":91},[33,129518,129519,129522,129524,129526,129528,129530,129532],{"class":35,"line":295},[33,129520,129521],{"class":167},"records 
",[33,129523,242],{"class":163},[33,129525,54131],{"class":167},[33,129527,22169],{"class":238},[33,129529,242],{"class":163},[33,129531,21222],{"class":54},[33,129533,221],{"class":167},[33,129535,129536,129539,129541,129543,129545,129548,129550,129552,129554,129557,129560,129562,129565,129567,129569,129571],{"class":35,"line":300},[33,129537,129538],{"class":167},"payload ",[33,129540,242],{"class":163},[33,129542,3456],{"class":167},[33,129544,95970],{"class":54},[33,129546,129547],{"class":167},": records, ",[33,129549,96601],{"class":54},[33,129551,2079],{"class":167},[33,129553,928],{"class":50},[33,129555,129556],{"class":167},"(records)}, ",[33,129558,129559],{"class":238},"cls",[33,129561,242],{"class":163},[33,129563,129564],{"class":167},"PandasEncoder, ",[33,129566,37382],{"class":238},[33,129568,242],{"class":163},[33,129570,1533],{"class":50},[33,129572,221],{"class":167},[33,129574,129575,129577,129580,129582],{"class":35,"line":317},[33,129576,13474],{"class":50},[33,129578,129579],{"class":167},"(payload[:",[33,129581,2611],{"class":50},[33,129583,751],{"class":167},[14,129585,129586,129589,129590,365,129592,129594,129595,129597],{},[30,129587,129588],{},"obj.item()"," converts any NumPy scalar (",[30,129591,102448],{},[30,129593,102445],{},") to its native Python equivalent, which the standard ",[30,129596,128361],{}," module can serialize.",[14,129599,129600],{},"When you control the read step, a simpler alternative is to force all columns to Python-native types at load time:",[23,129602,129604],{"className":126,"code":129603,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\nimport json\n\nSOURCE = Path(\"report.xlsx\")\n\ndf = pd.read_excel(SOURCE, engine=\"openpyxl\")\n\n# Convert datetime cols to ISO strings; convert NaN → None\nfor col in df.select_dtypes(include=[\"datetime64[ns]\", \"datetimetz\"]).columns:\n    df[col] = df[col].dt.strftime(\"%Y-%m-%dT%H:%M:%S\")   # ISO 
8601\n\ndf = df.where(df.notna(), other=None)\n\n# Now to_dict produces only native Python types: str, int, float, None\nrecords = df.to_dict(orient=\"records\")\nprint(json.dumps(records[:1], indent=2))   # no encoder needed\n",[30,129605,129606,129610,129620,129630,129636,129640,129652,129656,129676,129680,129685,129712,129733,129737,129753,129757,129762,129778],{"__ignoreMap":28},[33,129607,129608],{"class":35,"line":36},[33,129609,3952],{"class":39},[33,129611,129612,129614,129616,129618],{"class":35,"line":43},[33,129613,190],{"class":163},[33,129615,193],{"class":167},[33,129617,164],{"class":163},[33,129619,198],{"class":167},[33,129621,129622,129624,129626,129628],{"class":35,"line":61},[33,129623,164],{"class":163},[33,129625,492],{"class":167},[33,129627,495],{"class":163},[33,129629,498],{"class":167},[33,129631,129632,129634],{"class":35,"line":73},[33,129633,164],{"class":163},[33,129635,3081],{"class":167},[33,129637,129638],{"class":35,"line":88},[33,129639,92],{"emptyLinePlaceholder":91},[33,129641,129642,129644,129646,129648,129650],{"class":35,"line":95},[33,129643,86272],{"class":50},[33,129645,212],{"class":163},[33,129647,215],{"class":167},[33,129649,128434],{"class":54},[33,129651,221],{"class":167},[33,129653,129654],{"class":35,"line":101},[33,129655,92],{"emptyLinePlaceholder":91},[33,129657,129658,129660,129662,129664,129666,129668,129670,129672,129674],{"class":35,"line":171},[33,129659,13459],{"class":167},[33,129661,242],{"class":163},[33,129663,126254],{"class":167},[33,129665,86272],{"class":50},[33,129667,365],{"class":167},[33,129669,17351],{"class":238},[33,129671,242],{"class":163},[33,129673,17356],{"class":54},[33,129675,221],{"class":167},[33,129677,129678],{"class":35,"line":179},[33,129679,92],{"emptyLinePlaceholder":91},[33,129681,129682],{"class":35,"line":187},[33,129683,129684],{"class":39},"# Convert datetime cols to ISO strings; convert NaN → 
None\n",[33,129686,129687,129689,129691,129693,129695,129697,129699,129701,129704,129706,129709],{"class":35,"line":201},[33,129688,6124],{"class":163},[33,129690,7985],{"class":167},[33,129692,662],{"class":163},[33,129694,23604],{"class":167},[33,129696,23607],{"class":238},[33,129698,242],{"class":163},[33,129700,8309],{"class":167},[33,129702,129703],{"class":54},"\"datetime64[ns]\"",[33,129705,365],{"class":167},[33,129707,129708],{"class":54},"\"datetimetz\"",[33,129710,129711],{"class":167},"]).columns:\n",[33,129713,129714,129717,129719,129722,129724,129726,129728,129730],{"class":35,"line":206},[33,129715,129716],{"class":167},"    df[col] ",[33,129718,242],{"class":163},[33,129720,129721],{"class":167}," df[col].dt.strftime(",[33,129723,1244],{"class":54},[33,129725,916],{"class":50},[33,129727,1249],{"class":54},[33,129729,12000],{"class":167},[33,129731,129732],{"class":39},"# ISO 8601\n",[33,129734,129735],{"class":35,"line":224},[33,129736,92],{"emptyLinePlaceholder":91},[33,129738,129739,129741,129743,129745,129747,129749,129751],{"class":35,"line":229},[33,129740,13459],{"class":167},[33,129742,242],{"class":163},[33,129744,129216],{"class":167},[33,129746,129219],{"class":238},[33,129748,242],{"class":163},[33,129750,571],{"class":50},[33,129752,221],{"class":167},[33,129754,129755],{"class":35,"line":235},[33,129756,92],{"emptyLinePlaceholder":91},[33,129758,129759],{"class":35,"line":250},[33,129760,129761],{"class":39},"# Now to_dict produces only native Python types: str, int, float, 
None\n",[33,129763,129764,129766,129768,129770,129772,129774,129776],{"class":35,"line":266},[33,129765,129521],{"class":167},[33,129767,242],{"class":163},[33,129769,54131],{"class":167},[33,129771,22169],{"class":238},[33,129773,242],{"class":163},[33,129775,21222],{"class":54},[33,129777,221],{"class":167},[33,129779,129780,129782,129785,129787,129789,129791,129793,129795,129797],{"class":35,"line":290},[33,129781,13474],{"class":50},[33,129783,129784],{"class":167},"(json.dumps(records[:",[33,129786,734],{"class":50},[33,129788,8314],{"class":167},[33,129790,37382],{"class":238},[33,129792,242],{"class":163},[33,129794,1533],{"class":50},[33,129796,73462],{"class":167},[33,129798,129799],{"class":39},"# no encoder needed\n",[14,129801,129802],{},"This approach avoids the custom encoder entirely and is easier to debug.",[18,129804,129806],{"id":129805},"step-4-nested-structures","Step 4 — Nested Structures",[14,129808,129809,129810,20891],{},"Sometimes the flat records shape is wrong — the consumer expects grouped or hierarchical JSON. 
Build it from ",[30,129811,21820],{},[23,129813,129815],{"className":126,"code":129814,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport json\nimport pandas as pd\n\nSOURCE = Path(\"report.xlsx\")\n\ndf = pd.read_excel(SOURCE, engine=\"openpyxl\")\ndf[\"order_date\"] = df[\"order_date\"].dt.strftime(\"%Y-%m-%d\")  # date → string up front\ndf = df.where(df.notna(), other=None)\n\n# Group orders by customer → {customer: [order, ...]}\nnested = (\n    df.groupby(\"customer\", dropna=False)\n      .apply(lambda g: g.drop(columns=\"customer\").to_dict(orient=\"records\"), include_groups=False)\n      .to_dict()\n)\n\nprint(json.dumps(nested, indent=2)[:400])\n",[30,129816,129817,129821,129831,129837,129847,129851,129863,129867,129887,129915,129931,129935,129940,129949,129966,130002,130007,130011,130015],{"__ignoreMap":28},[33,129818,129819],{"class":35,"line":36},[33,129820,3952],{"class":39},[33,129822,129823,129825,129827,129829],{"class":35,"line":43},[33,129824,190],{"class":163},[33,129826,193],{"class":167},[33,129828,164],{"class":163},[33,129830,198],{"class":167},[33,129832,129833,129835],{"class":35,"line":61},[33,129834,164],{"class":163},[33,129836,3081],{"class":167},[33,129838,129839,129841,129843,129845],{"class":35,"line":73},[33,129840,164],{"class":163},[33,129842,492],{"class":167},[33,129844,495],{"class":163},[33,129846,498],{"class":167},[33,129848,129849],{"class":35,"line":88},[33,129850,92],{"emptyLinePlaceholder":91},[33,129852,129853,129855,129857,129859,129861],{"class":35,"line":95},[33,129854,86272],{"class":50},[33,129856,212],{"class":163},[33,129858,215],{"class":167},[33,129860,128434],{"class":54},[33,129862,221],{"class":167},[33,129864,129865],{"class":35,"line":101},[33,129866,92],{"emptyLinePlaceholder":91},[33,129868,129869,129871,129873,129875,129877,129879,129881,129883,129885],{"class":35,"line":171},[33,129870,13459],{"class":167},[33,129872,242],{"class":163},[33,129874,12625
4],{"class":167},[33,129876,86272],{"class":50},[33,129878,365],{"class":167},[33,129880,17351],{"class":238},[33,129882,242],{"class":163},[33,129884,17356],{"class":54},[33,129886,221],{"class":167},[33,129888,129889,129891,129893,129895,129897,129899,129901,129904,129906,129908,129910,129912],{"class":35,"line":179},[33,129890,11038],{"class":167},[33,129892,108767],{"class":54},[33,129894,763],{"class":167},[33,129896,242],{"class":163},[33,129898,7935],{"class":167},[33,129900,108767],{"class":54},[33,129902,129903],{"class":167},"].dt.strftime(",[33,129905,1244],{"class":54},[33,129907,916],{"class":50},[33,129909,274],{"class":54},[33,129911,10922],{"class":167},[33,129913,129914],{"class":39},"# date → string up front\n",[33,129916,129917,129919,129921,129923,129925,129927,129929],{"class":35,"line":187},[33,129918,13459],{"class":167},[33,129920,242],{"class":163},[33,129922,129216],{"class":167},[33,129924,129219],{"class":238},[33,129926,242],{"class":163},[33,129928,571],{"class":50},[33,129930,221],{"class":167},[33,129932,129933],{"class":35,"line":201},[33,129934,92],{"emptyLinePlaceholder":91},[33,129936,129937],{"class":35,"line":206},[33,129938,129939],{"class":39},"# Group orders by customer → {customer: [order, ...]}\n",[33,129941,129942,129945,129947],{"class":35,"line":224},[33,129943,129944],{"class":167},"nested ",[33,129946,242],{"class":163},[33,129948,1415],{"class":167},[33,129950,129951,129953,129955,129957,129960,129962,129964],{"class":35,"line":229},[33,129952,102776],{"class":167},[33,129954,59673],{"class":54},[33,129956,365],{"class":167},[33,129958,129959],{"class":238},"dropna",[33,129961,242],{"class":163},[33,129963,902],{"class":50},[33,129965,221],{"class":167},[33,129967,129968,129971,129973,129976,129978,129980,129982,129985,129987,129989,129991,129993,129996,129998,130000],{"class":35,"line":235},[33,129969,129970],{"class":167},"      .apply(",[33,129972,39839],{"class":163},[33,129974,129975],{"class":167}," g: 
g.drop(",[33,129977,740],{"class":238},[33,129979,242],{"class":163},[33,129981,59673],{"class":54},[33,129983,129984],{"class":167},").to_dict(",[33,129986,22169],{"class":238},[33,129988,242],{"class":163},[33,129990,21222],{"class":54},[33,129992,18525],{"class":167},[33,129994,129995],{"class":238},"include_groups",[33,129997,242],{"class":163},[33,129999,902],{"class":50},[33,130001,221],{"class":167},[33,130003,130004],{"class":35,"line":250},[33,130005,130006],{"class":167},"      .to_dict()\n",[33,130008,130009],{"class":35,"line":266},[33,130010,221],{"class":167},[33,130012,130013],{"class":35,"line":290},[33,130014,92],{"emptyLinePlaceholder":91},[33,130016,130017,130019,130022,130024,130026,130028,130031,130033],{"class":35,"line":295},[33,130018,13474],{"class":50},[33,130020,130021],{"class":167},"(json.dumps(nested, ",[33,130023,37382],{"class":238},[33,130025,242],{"class":163},[33,130027,1533],{"class":50},[33,130029,130030],{"class":167},")[:",[33,130032,47140],{"class":50},[33,130034,751],{"class":167},[14,130036,130037,130040,130041,130043],{},[30,130038,130039],{},"dt.strftime"," converts the date column to a plain string before the groupby, removing any remaining ",[30,130042,129076],{}," objects.",[14,130045,130046,130047,130049],{},"A common variant is to produce a two-level nested structure: one object per customer, with a nested array of orders. The same ",[30,130048,21820],{}," pattern applies — just choose the grouping key to match the consumer's expected shape.",[18,130051,130053],{"id":130052},"variant-convert-all-sheets","Variant — Convert All Sheets",[14,130055,130056,130059,130060,130062],{},[30,130057,130058],{},"pd.read_excel(sheet_name=None)"," returns a dict of DataFrames — one key per sheet. 
This pattern, also used in ",[940,130061,28119],{"href":28118},", maps directly to a multi-sheet JSON structure:",[23,130064,130066],{"className":126,"code":130065,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport json\nimport pandas as pd\n\nSOURCE = Path(\"annual_report.xlsx\")\nDEST   = Path(\"annual_report.json\")\n\ntry:\n    workbook: dict[str, pd.DataFrame] = pd.read_excel(\n        SOURCE, sheet_name=None, engine=\"openpyxl\"\n    )\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\")\n\nresult: dict[str, list] = {}\nfor sheet_name, df in workbook.items():\n    df[\"order_date\"] = pd.to_datetime(df.get(\"order_date\", pd.Series(dtype=\"object\")),\n                                       errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n    df = df.where(df.notna(), other=None)\n    result[sheet_name] = df.to_dict(orient=\"records\")\n\nDEST.write_text(json.dumps(result, indent=2, ensure_ascii=False))\nprint(f\"Exported {len(result)} sheets to 
{DEST}\")\n",[30,130067,130068,130072,130082,130088,130098,130102,130115,130128,130132,130138,130151,130173,130177,130187,130209,130213,130230,130242,130268,130288,130304,130321,130325,130349],{"__ignoreMap":28},[33,130069,130070],{"class":35,"line":36},[33,130071,3952],{"class":39},[33,130073,130074,130076,130078,130080],{"class":35,"line":43},[33,130075,190],{"class":163},[33,130077,193],{"class":167},[33,130079,164],{"class":163},[33,130081,198],{"class":167},[33,130083,130084,130086],{"class":35,"line":61},[33,130085,164],{"class":163},[33,130087,3081],{"class":167},[33,130089,130090,130092,130094,130096],{"class":35,"line":73},[33,130091,164],{"class":163},[33,130093,492],{"class":167},[33,130095,495],{"class":163},[33,130097,498],{"class":167},[33,130099,130100],{"class":35,"line":88},[33,130101,92],{"emptyLinePlaceholder":91},[33,130103,130104,130106,130108,130110,130113],{"class":35,"line":95},[33,130105,86272],{"class":50},[33,130107,212],{"class":163},[33,130109,215],{"class":167},[33,130111,130112],{"class":54},"\"annual_report.xlsx\"",[33,130114,221],{"class":167},[33,130116,130117,130119,130121,130123,130126],{"class":35,"line":101},[33,130118,129127],{"class":50},[33,130120,21012],{"class":163},[33,130122,215],{"class":167},[33,130124,130125],{"class":54},"\"annual_report.json\"",[33,130127,221],{"class":167},[33,130129,130130],{"class":35,"line":171},[33,130131,92],{"emptyLinePlaceholder":91},[33,130133,130134,130136],{"class":35,"line":179},[33,130135,35574],{"class":163},[33,130137,574],{"class":167},[33,130139,130140,130143,130145,130147,130149],{"class":35,"line":187},[33,130141,130142],{"class":167},"    workbook: dict[",[33,130144,1053],{"class":50},[33,130146,14088],{"class":167},[33,130148,242],{"class":163},[33,130150,126171],{"class":167},[33,130152,130153,130156,130158,130160,130162,130164,130166,130168,130170],{"class":35,"line":201},[33,130154,130155],{"class":50},"        
SOURCE",[33,130157,365],{"class":167},[33,130159,17371],{"class":238},[33,130161,242],{"class":163},[33,130163,571],{"class":50},[33,130165,365],{"class":167},[33,130167,17351],{"class":238},[33,130169,242],{"class":163},[33,130171,130172],{"class":54},"\"openpyxl\"\n",[33,130174,130175],{"class":35,"line":206},[33,130176,1202],{"class":167},[33,130178,130179,130181,130183,130185],{"class":35,"line":224},[33,130180,35726],{"class":163},[33,130182,2945],{"class":50},[33,130184,1852],{"class":163},[33,130186,1855],{"class":167},[33,130188,130189,130191,130193,130195,130197,130199,130201,130203,130205,130207],{"class":35,"line":229},[33,130190,35742],{"class":163},[33,130192,16617],{"class":50},[33,130194,602],{"class":167},[33,130196,4059],{"class":163},[33,130198,15677],{"class":54},[33,130200,1115],{"class":50},[33,130202,6565],{"class":167},[33,130204,1121],{"class":50},[33,130206,274],{"class":54},[33,130208,221],{"class":167},[33,130210,130211],{"class":35,"line":235},[33,130212,92],{"emptyLinePlaceholder":91},[33,130214,130215,130218,130220,130222,130224,130226,130228],{"class":35,"line":250},[33,130216,130217],{"class":167},"result: dict[",[33,130219,1053],{"class":50},[33,130221,365],{"class":167},[33,130223,25066],{"class":50},[33,130225,763],{"class":167},[33,130227,242],{"class":163},[33,130229,14093],{"class":167},[33,130231,130232,130234,130237,130239],{"class":35,"line":266},[33,130233,6124],{"class":163},[33,130235,130236],{"class":167}," sheet_name, df ",[33,130238,662],{"class":163},[33,130240,130241],{"class":167}," workbook.items():\n",[33,130243,130244,130246,130248,130250,130252,130255,130257,130260,130262,130264,130266],{"class":35,"line":290},[33,130245,27581],{"class":167},[33,130247,108767],{"class":54},[33,130249,763],{"class":167},[33,130251,242],{"class":163},[33,130253,130254],{"class":167}," pd.to_datetime(df.get(",[33,130256,108767],{"class":54},[33,130258,130259],{"class":167},", 
pd.Series(",[33,130261,23262],{"class":238},[33,130263,242],{"class":163},[33,130265,110111],{"class":54},[33,130267,1571],{"class":167},[33,130269,130270,130273,130275,130277,130280,130282,130284,130286],{"class":35,"line":295},[33,130271,130272],{"class":238},"                                       errors",[33,130274,242],{"class":163},[33,130276,12107],{"class":54},[33,130278,130279],{"class":167},").dt.strftime(",[33,130281,1244],{"class":54},[33,130283,916],{"class":50},[33,130285,274],{"class":54},[33,130287,221],{"class":167},[33,130289,130290,130292,130294,130296,130298,130300,130302],{"class":35,"line":300},[33,130291,4025],{"class":167},[33,130293,242],{"class":163},[33,130295,129216],{"class":167},[33,130297,129219],{"class":238},[33,130299,242],{"class":163},[33,130301,571],{"class":50},[33,130303,221],{"class":167},[33,130305,130306,130309,130311,130313,130315,130317,130319],{"class":35,"line":317},[33,130307,130308],{"class":167},"    result[sheet_name] ",[33,130310,242],{"class":163},[33,130312,54131],{"class":167},[33,130314,22169],{"class":238},[33,130316,242],{"class":163},[33,130318,21222],{"class":54},[33,130320,221],{"class":167},[33,130322,130323],{"class":35,"line":332},[33,130324,92],{"emptyLinePlaceholder":91},[33,130326,130327,130329,130332,130334,130336,130338,130340,130343,130345,130347],{"class":35,"line":347},[33,130328,129127],{"class":50},[33,130330,130331],{"class":167},".write_text(json.dumps(result, 
",[33,130333,37382],{"class":238},[33,130335,242],{"class":163},[33,130337,1533],{"class":50},[33,130339,365],{"class":167},[33,130341,130342],{"class":238},"ensure_ascii",[33,130344,242],{"class":163},[33,130346,902],{"class":50},[33,130348,371],{"class":167},[33,130350,130351,130353,130355,130357,130359,130361,130364,130366,130369,130371,130373],{"class":35,"line":374},[33,130352,13474],{"class":50},[33,130354,602],{"class":167},[33,130356,4059],{"class":163},[33,130358,44444],{"class":54},[33,130360,4065],{"class":50},[33,130362,130363],{"class":167},"(result)",[33,130365,1121],{"class":50},[33,130367,130368],{"class":54}," sheets to ",[33,130370,129317],{"class":50},[33,130372,274],{"class":54},[33,130374,221],{"class":167},[18,130376,130378],{"id":130377},"variant-stream-large-workbooks-via-csv","Variant — Stream Large Workbooks via CSV",[14,130380,130381],{},"For very large Excel files (tens of thousands of rows), writing to JSON can exhaust memory. Convert to CSV first, then stream-process the CSV into JSON lines format, which most log-ingestion and analytics platforms accept:",[23,130383,130385],{"className":126,"code":130384,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nSOURCE = Path(\"large_report.xlsx\")\nDEST   = Path(\"large_report.jsonl\")   # one JSON object per line\n\ntry:\n    # Read only needed columns to reduce memory footprint\n    df = pd.read_excel(SOURCE, engine=\"openpyxl\", usecols=[\"order_id\", \"customer\", \"order_date\", \"amount\"])\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\")\n\ndf[\"order_date\"] = df[\"order_date\"].dt.strftime(\"%Y-%m-%d\")\ndf = df.where(df.notna(), other=None)\n\nwith DEST.open(\"w\", encoding=\"utf-8\") as fh:\n    for record in df.to_dict(orient=\"records\"):\n        import json\n        fh.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n\nprint(f\"Wrote {len(df)} JSON lines to 
{DEST}\")\n",[30,130386,130387,130391,130401,130411,130415,130428,130444,130448,130454,130459,130501,130511,130533,130537,130561,130577,130581,130606,130624,130630,130653,130657],{"__ignoreMap":28},[33,130388,130389],{"class":35,"line":36},[33,130390,3952],{"class":39},[33,130392,130393,130395,130397,130399],{"class":35,"line":43},[33,130394,190],{"class":163},[33,130396,193],{"class":167},[33,130398,164],{"class":163},[33,130400,198],{"class":167},[33,130402,130403,130405,130407,130409],{"class":35,"line":61},[33,130404,164],{"class":163},[33,130406,492],{"class":167},[33,130408,495],{"class":163},[33,130410,498],{"class":167},[33,130412,130413],{"class":35,"line":73},[33,130414,92],{"emptyLinePlaceholder":91},[33,130416,130417,130419,130421,130423,130426],{"class":35,"line":88},[33,130418,86272],{"class":50},[33,130420,212],{"class":163},[33,130422,215],{"class":167},[33,130424,130425],{"class":54},"\"large_report.xlsx\"",[33,130427,221],{"class":167},[33,130429,130430,130432,130434,130436,130439,130441],{"class":35,"line":95},[33,130431,129127],{"class":50},[33,130433,21012],{"class":163},[33,130435,215],{"class":167},[33,130437,130438],{"class":54},"\"large_report.jsonl\"",[33,130440,12000],{"class":167},[33,130442,130443],{"class":39},"# one JSON object per line\n",[33,130445,130446],{"class":35,"line":101},[33,130447,92],{"emptyLinePlaceholder":91},[33,130449,130450,130452],{"class":35,"line":171},[33,130451,35574],{"class":163},[33,130453,574],{"class":167},[33,130455,130456],{"class":35,"line":179},[33,130457,130458],{"class":39},"    # Read only needed columns to reduce memory 
footprint\n",[33,130460,130461,130463,130465,130467,130469,130471,130473,130475,130477,130479,130481,130483,130485,130487,130489,130491,130493,130495,130497,130499],{"class":35,"line":187},[33,130462,4025],{"class":167},[33,130464,242],{"class":163},[33,130466,126254],{"class":167},[33,130468,86272],{"class":50},[33,130470,365],{"class":167},[33,130472,17351],{"class":238},[33,130474,242],{"class":163},[33,130476,17356],{"class":54},[33,130478,365],{"class":167},[33,130480,21904],{"class":238},[33,130482,242],{"class":163},[33,130484,8309],{"class":167},[33,130486,108849],{"class":54},[33,130488,365],{"class":167},[33,130490,59673],{"class":54},[33,130492,365],{"class":167},[33,130494,108767],{"class":54},[33,130496,365],{"class":167},[33,130498,4106],{"class":54},[33,130500,751],{"class":167},[33,130502,130503,130505,130507,130509],{"class":35,"line":201},[33,130504,35726],{"class":163},[33,130506,2945],{"class":50},[33,130508,1852],{"class":163},[33,130510,1855],{"class":167},[33,130512,130513,130515,130517,130519,130521,130523,130525,130527,130529,130531],{"class":35,"line":206},[33,130514,35742],{"class":163},[33,130516,16617],{"class":50},[33,130518,602],{"class":167},[33,130520,4059],{"class":163},[33,130522,15677],{"class":54},[33,130524,1115],{"class":50},[33,130526,6565],{"class":167},[33,130528,1121],{"class":50},[33,130530,274],{"class":54},[33,130532,221],{"class":167},[33,130534,130535],{"class":35,"line":224},[33,130536,92],{"emptyLinePlaceholder":91},[33,130538,130539,130541,130543,130545,130547,130549,130551,130553,130555,130557,130559],{"class":35,"line":229},[33,130540,11038],{"class":167},[33,130542,108767],{"class":54},[33,130544,763],{"class":167},[33,130546,242],{"class":163},[33,130548,7935],{"class":167},[33,130550,108767],{"class":54},[33,130552,129903],{"class":167},[33,130554,1244],{"class":54},[33,130556,916],{"class":50},[33,130558,274],{"class":54},[33,130560,221],{"class":167},[33,130562,130563,130565,130567,130569,130571,130573,130575
],{"class":35,"line":235},[33,130564,13459],{"class":167},[33,130566,242],{"class":163},[33,130568,129216],{"class":167},[33,130570,129219],{"class":238},[33,130572,242],{"class":163},[33,130574,571],{"class":50},[33,130576,221],{"class":167},[33,130578,130579],{"class":35,"line":250},[33,130580,92],{"emptyLinePlaceholder":91},[33,130582,130583,130585,130588,130590,130592,130594,130596,130598,130600,130602,130604],{"class":35,"line":266},[33,130584,22271],{"class":163},[33,130586,130587],{"class":50}," DEST",[33,130589,107916],{"class":167},[33,130591,123455],{"class":54},[33,130593,365],{"class":167},[33,130595,27249],{"class":238},[33,130597,242],{"class":163},[33,130599,1195],{"class":54},[33,130601,1649],{"class":167},[33,130603,495],{"class":163},[33,130605,67176],{"class":167},[33,130607,130608,130610,130612,130614,130616,130618,130620,130622],{"class":35,"line":290},[33,130609,656],{"class":163},[33,130611,108575],{"class":167},[33,130613,662],{"class":163},[33,130615,54131],{"class":167},[33,130617,22169],{"class":238},[33,130619,242],{"class":163},[33,130621,21222],{"class":54},[33,130623,1737],{"class":167},[33,130625,130626,130628],{"class":35,"line":295},[33,130627,3388],{"class":163},[33,130629,3081],{"class":167},[33,130631,130632,130635,130637,130639,130641,130643,130645,130647,130649,130651],{"class":35,"line":300},[33,130633,130634],{"class":167},"        fh.write(json.dumps(record, 
",[33,130636,130342],{"class":238},[33,130638,242],{"class":163},[33,130640,902],{"class":50},[33,130642,1649],{"class":167},[33,130644,1811],{"class":163},[33,130646,44625],{"class":54},[33,130648,25830],{"class":50},[33,130650,274],{"class":54},[33,130652,221],{"class":167},[33,130654,130655],{"class":35,"line":317},[33,130656,92],{"emptyLinePlaceholder":91},[33,130658,130659,130661,130663,130665,130667,130669,130671,130673,130676,130678,130680],{"class":35,"line":332},[33,130660,13474],{"class":50},[33,130662,602],{"class":167},[33,130664,4059],{"class":163},[33,130666,913],{"class":54},[33,130668,4065],{"class":50},[33,130670,4068],{"class":167},[33,130672,1121],{"class":50},[33,130674,130675],{"class":54}," JSON lines to ",[33,130677,129317],{"class":50},[33,130679,274],{"class":54},[33,130681,221],{"class":167},[14,130683,130684,130685,130688],{},"JSON Lines (",[30,130686,130687],{},".jsonl",") lets consumers read one record at a time without loading the entire file into memory.",[18,130690,9247],{"id":9246},[23,130692,130694],{"className":126,"code":130693,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport json\nfrom pathlib import Path\n\nraw = Path(\"output.json\").read_text()\n\ntry:\n    data = json.loads(raw)\nexcept json.JSONDecodeError as exc:\n    raise SystemExit(f\"Invalid JSON: {exc}\")\n\nassert isinstance(data, list),             \"Expected a JSON array at top level\"\nassert all(\"order_id\" in r for r in data), \"Missing order_id in at least one record\"\n\n# Confirm no NaN survived serialization\nraw_text = Path(\"output.json\").read_text()\nassert \"NaN\" not in raw_text, \"NaN found in JSON output — replace before serializing\"\n\nprint(f\"Valid JSON — {len(data)} records, keys: 
{list(data[0].keys())}\")\n",[30,130695,130696,130700,130706,130716,130720,130733,130737,130743,130752,130762,130785,130789,130806,130833,130837,130842,130855,130872,130876],{"__ignoreMap":28},[33,130697,130698],{"class":35,"line":36},[33,130699,3952],{"class":39},[33,130701,130702,130704],{"class":35,"line":43},[33,130703,164],{"class":163},[33,130705,3081],{"class":167},[33,130707,130708,130710,130712,130714],{"class":35,"line":61},[33,130709,190],{"class":163},[33,130711,193],{"class":167},[33,130713,164],{"class":163},[33,130715,198],{"class":167},[33,130717,130718],{"class":35,"line":73},[33,130719,92],{"emptyLinePlaceholder":91},[33,130721,130722,130724,130726,130728,130730],{"class":35,"line":88},[33,130723,96164],{"class":167},[33,130725,242],{"class":163},[33,130727,215],{"class":167},[33,130729,129134],{"class":54},[33,130731,130732],{"class":167},").read_text()\n",[33,130734,130735],{"class":35,"line":95},[33,130736,92],{"emptyLinePlaceholder":91},[33,130738,130739,130741],{"class":35,"line":101},[33,130740,35574],{"class":163},[33,130742,574],{"class":167},[33,130744,130745,130747,130749],{"class":35,"line":171},[33,130746,24507],{"class":167},[33,130748,242],{"class":163},[33,130750,130751],{"class":167}," json.loads(raw)\n",[33,130753,130754,130756,130758,130760],{"class":35,"line":179},[33,130755,35726],{"class":163},[33,130757,54396],{"class":167},[33,130759,495],{"class":163},[33,130761,1855],{"class":167},[33,130763,130764,130766,130768,130770,130772,130775,130777,130779,130781,130783],{"class":35,"line":187},[33,130765,35742],{"class":163},[33,130767,16617],{"class":50},[33,130769,602],{"class":167},[33,130771,4059],{"class":163},[33,130773,130774],{"class":54},"\"Invalid JSON: 
",[33,130776,1115],{"class":50},[33,130778,6565],{"class":167},[33,130780,1121],{"class":50},[33,130782,274],{"class":54},[33,130784,221],{"class":167},[33,130786,130787],{"class":35,"line":201},[33,130788,92],{"emptyLinePlaceholder":91},[33,130790,130791,130793,130795,130798,130800,130803],{"class":35,"line":206},[33,130792,36397],{"class":163},[33,130794,36538],{"class":50},[33,130796,130797],{"class":167},"(data, ",[33,130799,25066],{"class":50},[33,130801,130802],{"class":167},"),             ",[33,130804,130805],{"class":54},"\"Expected a JSON array at top level\"\n",[33,130807,130808,130810,130813,130815,130817,130819,130821,130823,130825,130827,130830],{"class":35,"line":224},[33,130809,36397],{"class":163},[33,130811,130812],{"class":50}," all",[33,130814,602],{"class":167},[33,130816,108849],{"class":54},[33,130818,8002],{"class":163},[33,130820,45721],{"class":167},[33,130822,6124],{"class":163},[33,130824,45721],{"class":167},[33,130826,662],{"class":163},[33,130828,130829],{"class":167}," data), ",[33,130831,130832],{"class":54},"\"Missing order_id in at least one record\"\n",[33,130834,130835],{"class":35,"line":229},[33,130836,92],{"emptyLinePlaceholder":91},[33,130838,130839],{"class":35,"line":235},[33,130840,130841],{"class":39},"# Confirm no NaN survived serialization\n",[33,130843,130844,130847,130849,130851,130853],{"class":35,"line":250},[33,130845,130846],{"class":167},"raw_text ",[33,130848,242],{"class":163},[33,130850,215],{"class":167},[33,130852,129134],{"class":54},[33,130854,130732],{"class":167},[33,130856,130857,130859,130862,130864,130866,130869],{"class":35,"line":266},[33,130858,36397],{"class":163},[33,130860,130861],{"class":54}," \"NaN\"",[33,130863,620],{"class":163},[33,130865,8002],{"class":163},[33,130867,130868],{"class":167}," raw_text, ",[33,130870,130871],{"class":54},"\"NaN found in JSON output — replace before 
serializing\"\n",[33,130873,130874],{"class":35,"line":290},[33,130875,92],{"emptyLinePlaceholder":91},[33,130877,130878,130880,130882,130884,130887,130889,130892,130894,130897,130899,130901,130903,130906,130908,130910],{"class":35,"line":295},[33,130879,13474],{"class":50},[33,130881,602],{"class":167},[33,130883,4059],{"class":163},[33,130885,130886],{"class":54},"\"Valid JSON — ",[33,130888,4065],{"class":50},[33,130890,130891],{"class":167},"(data)",[33,130893,1121],{"class":50},[33,130895,130896],{"class":54}," records, keys: ",[33,130898,16875],{"class":50},[33,130900,20361],{"class":167},[33,130902,748],{"class":50},[33,130904,130905],{"class":167},"].keys())",[33,130907,1121],{"class":50},[33,130909,274],{"class":54},[33,130911,221],{"class":167},[14,130913,130914],{},"Round-trip back to a DataFrame to confirm no data was lost:",[23,130916,130918],{"className":126,"code":130917,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\n\ndf_check = pd.read_json(\"output.json\", orient=\"records\")\nprint(df_check.dtypes)\nprint(df_check.head())\n",[30,130919,130920,130924,130934,130938,130960,130967],{"__ignoreMap":28},[33,130921,130922],{"class":35,"line":36},[33,130923,3952],{"class":39},[33,130925,130926,130928,130930,130932],{"class":35,"line":43},[33,130927,164],{"class":163},[33,130929,492],{"class":167},[33,130931,495],{"class":163},[33,130933,498],{"class":167},[33,130935,130936],{"class":35,"line":61},[33,130937,92],{"emptyLinePlaceholder":91},[33,130939,130940,130943,130945,130948,130950,130952,130954,130956,130958],{"class":35,"line":73},[33,130941,130942],{"class":167},"df_check ",[33,130944,242],{"class":163},[33,130946,130947],{"class":167}," 
pd.read_json(",[33,130949,129134],{"class":54},[33,130951,365],{"class":167},[33,130953,22169],{"class":238},[33,130955,242],{"class":163},[33,130957,21222],{"class":54},[33,130959,221],{"class":167},[33,130961,130962,130964],{"class":35,"line":88},[33,130963,13474],{"class":50},[33,130965,130966],{"class":167},"(df_check.dtypes)\n",[33,130968,130969,130971],{"class":35,"line":95},[33,130970,13474],{"class":50},[33,130972,130973],{"class":167},"(df_check.head())\n",[14,130975,130976,130977,36661,130980,130982,130983,3035],{},"If date columns come back as integers (millisecond epoch), pass ",[30,130978,130979],{},"convert_dates=[\"order_date\"]",[30,130981,128940],{}," to restore them as ",[30,130984,130985],{},"datetime64",[18,130987,36626],{"id":36625},[14,130989,130990,130995,130996,42238,130998,4348,131001,131004,131005,131007,131008,3035],{},[1974,130991,36631,130992,130994],{},[30,130993,129024],{}," produce epoch integers for dates instead of ISO strings?","\nThe default ",[30,130997,119308],{},[30,130999,131000],{},"to_json",[30,131002,131003],{},"\"epoch\""," (milliseconds since 1970-01-01). Pass ",[30,131006,129072],{}," to get ",[30,131009,131010],{},"\"2026-01-15T00:00:00\"",[14,131012,131013,131016,131017,36661,131019,131021,131022,131024,131025,131027,131028,2012,131030,131032,131033,131036],{},[1974,131014,131015],{},"How do I keep integer columns as integers instead of floats in the JSON output?","\npandas promotes integer columns that contain ",[30,131018,8884],{},[30,131020,102445],{}," (because integers cannot represent ",[30,131023,8884],{},"). Replace ",[30,131026,8884],{}," with a sentinel (e.g., ",[30,131029,748],{},[30,131031,83558],{},") before serializing, or use ",[30,131034,131035],{},"pd.Int64Dtype"," (nullable integer dtype) so the column stays integer-typed even with missing values.",[14,131038,131039,131042,131043,131046,131047,3035],{},[1974,131040,131041],{},"Can I convert a DataFrame to JSON without writing a file?","\nYes. 
",[30,131044,131045],{},"df.to_json(orient=\"records\")"," returns a string when no path is passed. You can embed it directly in an API response or pass it to ",[30,131048,131049],{},"json.loads",[14,131051,131052,131055,131056,131059,131060,131062,131063,8877,131065,131067],{},[1974,131053,131054],{},"What is the fastest way to convert a large Excel file to JSON?","\nRead only the columns you need with ",[30,131057,131058],{},"usecols=",", convert date columns up front with ",[30,131061,130039],{},", replace ",[30,131064,8884],{},[30,131066,571],{},", then write JSON Lines with a streaming loop. Avoid loading the entire output string into memory before writing.",[18,131069,48994],{"id":29070},[4273,131071,131072,131083],{},[4276,131073,131074],{},[4279,131075,131076,131078,131081],{},[4282,131077,79442],{"align":128901},[4282,131079,131080],{"align":128901},"Result",[4282,131082,4290],{"align":128901},[4292,131084,131085,131108,131123,131142],{},[4279,131086,131087,131095,131101],{},[4297,131088,131089,131091,131092],{"align":128901},[30,131090,129034],{}," without ",[30,131093,131094],{},"where",[4297,131096,131097,42238,131099],{"align":128901},[30,131098,129048],{},[30,131100,8884],{},[4297,131102,42543,131103,8877,131105,131107],{"align":128901},[30,131104,8884],{},[30,131106,571],{}," before serialization",[4279,131109,131110,131116,131119],{},[4297,131111,131112,131091,131114],{"align":128901},[30,131113,129024],{},[30,131115,129072],{},[4297,131117,131118],{"align":128901},"Dates serialize as millisecond epoch integers",[4297,131120,14408,131121],{"align":128901},[30,131122,129072],{},[4279,131124,131125,131131,131138],{},[4297,131126,131127,131130],{"align":128901},[30,131128,131129],{},"orient=\"columns\""," for API output",[4297,131132,131133,131134,131137],{"align":128901},"Nested ",[30,131135,131136],{},"{col: {idx: val}}"," structure breaks most REST 
consumers",[4297,131139,17059,131140],{"align":128901},[30,131141,129021],{},[4279,131143,131144,131151,131157],{},[4297,131145,131146,131147,8363,131149],{"align":128901},"Keeping ",[30,131148,118005],{},[30,131150,131000],{},[4297,131152,131153,131154,131156],{"align":128901},"Extra numeric ",[30,131155,897],{}," key in every record",[4297,131158,131159,131160,131162,131163,8877,131165],{"align":128901},"Default is ",[30,131161,118005],{},"; pass ",[30,131164,28142],{},[30,131166,129021],{},[18,131168,6918],{"id":6917},[4211,131170,131171,131176,131183,131188],{},[4214,131172,131173,131175],{},[940,131174,28119],{"href":28118}," — consolidate files before exporting to JSON",[4214,131177,131178,131180,131181],{},[940,131179,99577],{"href":99576}," — engine selection and sheet targeting with ",[30,131182,57240],{},[4214,131184,131185,131187],{},[940,131186,9599],{"href":9598}," — fix encoding and type issues before serialization",[4214,131189,131190,131192],{},[940,131191,28114],{"href":28113}," — clean up merged DataFrames before converting to JSON",[14,131194,6947,131195,3035],{},[940,131196,28119],{"href":28118},[6953,131198,131199],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: 
var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}",{"title":28,"searchDepth":43,"depth":43,"links":131201},[131202,131203,131204,131205,131206,131210,131211,131212,131213,131214,131215,131216],{"id":7020,"depth":43,"text":7021},{"id":20,"depth":43,"text":21},{"id":128687,"depth":43,"text":128688},{"id":128879,"depth":43,"text":128880},{"id":129058,"depth":43,"text":129059,"children":131207},[131208,131209],{"id":129062,"depth":61,"text":129063},{"id":129324,"depth":61,"text":129325},{"id":129805,"depth":43,"text":129806},{"id":130052,"depth":43,"text":130053},{"id":130377,"depth":43,"text":130378},{"id":9246,"depth":43,"text":9247},{"id":36625,"depth":43,"text":36626},{"id":29070,"depth":43,"text":48994},{"id":6917,"depth":43,"text":6918},"Excel to JSON",{},"\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Fconverting-excel-to-json-with-python",{"title":128340,"description":131221},{"Convert Excel workbooks to JSON with Python and pandas":131222,"date":46387,"updatedAt":6978,"tags":131223},"fix Timestamp serialization, choose the right orient, handle nested structures and multi-sheet workbooks.",[99614,128361,47,9630],"Convert Excel to JSON with Python & 
pandas","python-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Fconverting-excel-to-json-with-python\u002Findex","V9w9-7DvyNvKgmYAagBVkDvMkBCXHFjN_W8MZFuORl8",{"id":131228,"title":28114,"body":131229,"breadcrumbTitle":133925,"canonical":6977,"date":6978,"description":133926,"draft":6980,"extension":6981,"image":6977,"meta":133927,"navigation":91,"path":133928,"robots":6977,"seo":133929,"seoTitle":133930,"stem":133931,"tags":133932,"updatedAt":6978,"__hash__":133934},"content\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Ffix-pandas-merge-overlapping-columns\u002Findex.md",{"type":7,"value":131230,"toc":133913},[131231,131234,131255,131271,131273,131290,131443,131456,131478,131489,131491,131494,131644,131649,131656,131660,131671,131848,131856,131859,132048,132052,132055,132232,132235,132403,132410,132420,132424,132430,132657,132665,132671,132737,132746,132750,132758,132894,132897,133091,133099,133103,133112,133247,133254,133263,133407,133412,133414,133522,133524,133532,133659,133662,133868,133882,133884,133906,133910],[10,131232,28114],{"id":131233},"fix-pandas-merge-overlapping-column-suffixes",[14,131235,131236,131237,131239,131240,131243,131244,10065,131247,131250,131251,131254],{},"After running ",[30,131238,127087],{},", columns that exist in both DataFrames under the same name — but are not listed in ",[30,131241,131242],{},"on="," — appear as ",[30,131245,131246],{},"col_x",[30,131248,131249],{},"col_y"," in the output. If the suffix strings themselves collide you also get ",[30,131252,131253],{},"MergeError: columns overlap but no suffix specified",". 
Neither outcome is what you want.",[14,131256,131257,131258,131261,131262,1351,131265,131268,131269,3035],{},"This page covers the root cause, how to diagnose which columns will collide before merging, and four concrete fixes: setting meaningful suffixes, dropping before merging, coalescing with ",[30,131259,131260],{},"combine_first",", and using ",[30,131263,131264],{},"left_on",[30,131266,131267],{},"right_on"," for differently-named keys. For the broader merge workflow across many files, see ",[940,131270,28119],{"href":28118},[18,131272,7021],{"id":7020},[14,131274,131275,131276,10065,131278,131280,131281,36604,131283,1351,131286,131289],{},"pandas appends ",[30,131277,28106],{},[30,131279,28109],{}," to every non-key column that shares a name between the left and right DataFrames. Only the column(s) named in ",[30,131282,131242],{},[30,131284,131285],{},"left_on=",[30,131287,131288],{},"right_on=",") are treated as keys and deduplicated. Everything else gets suffixed to avoid silent data loss.",[23,131291,131293],{"className":126,"code":131292,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"open\",  \"closed\"], \"amount\": [100, 200]})\nright = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"paid\",  \"refund\" ], \"notes\":  [\"ok\", \"check\"]})\n\nresult = pd.merge(left, right, on=\"order_id\")\nprint(result.columns.tolist())\n# ['order_id', 'status_x', 'amount', 'status_y', 
'notes']\n",[30,131294,131295,131299,131309,131313,131360,131410,131414,131431,131438],{"__ignoreMap":28},[33,131296,131297],{"class":35,"line":36},[33,131298,8895],{"class":39},[33,131300,131301,131303,131305,131307],{"class":35,"line":43},[33,131302,164],{"class":163},[33,131304,492],{"class":167},[33,131306,495],{"class":163},[33,131308,498],{"class":167},[33,131310,131311],{"class":35,"line":61},[33,131312,92],{"emptyLinePlaceholder":91},[33,131314,131315,131318,131320,131322,131324,131326,131328,131330,131332,131334,131336,131338,131341,131343,131346,131348,131350,131352,131354,131356,131358],{"class":35,"line":73},[33,131316,131317],{"class":167},"left  ",[33,131319,242],{"class":163},[33,131321,101407],{"class":167},[33,131323,108849],{"class":54},[33,131325,12426],{"class":167},[33,131327,734],{"class":50},[33,131329,365],{"class":167},[33,131331,1533],{"class":50},[33,131333,8314],{"class":167},[33,131335,43379],{"class":54},[33,131337,12426],{"class":167},[33,131339,131340],{"class":54},"\"open\"",[33,131342,25480],{"class":167},[33,131344,131345],{"class":54},"\"closed\"",[33,131347,8314],{"class":167},[33,131349,4106],{"class":54},[33,131351,12426],{"class":167},[33,131353,2650],{"class":50},[33,131355,365],{"class":167},[33,131357,2611],{"class":50},[33,131359,45051],{"class":167},[33,131361,131362,131365,131367,131369,131371,131373,131375,131377,131379,131381,131383,131385,131388,131390,131393,131396,131399,131401,131403,131405,131408],{"class":35,"line":88},[33,131363,131364],{"class":167},"right 
",[33,131366,242],{"class":163},[33,131368,101407],{"class":167},[33,131370,108849],{"class":54},[33,131372,12426],{"class":167},[33,131374,734],{"class":50},[33,131376,365],{"class":167},[33,131378,1533],{"class":50},[33,131380,8314],{"class":167},[33,131382,43379],{"class":54},[33,131384,12426],{"class":167},[33,131386,131387],{"class":54},"\"paid\"",[33,131389,25480],{"class":167},[33,131391,131392],{"class":54},"\"refund\"",[33,131394,131395],{"class":167}," ], ",[33,131397,131398],{"class":54},"\"notes\"",[33,131400,11818],{"class":167},[33,131402,57024],{"class":54},[33,131404,365],{"class":167},[33,131406,131407],{"class":54},"\"check\"",[33,131409,45051],{"class":167},[33,131411,131412],{"class":35,"line":95},[33,131413,92],{"emptyLinePlaceholder":91},[33,131415,131416,131418,131420,131423,131425,131427,131429],{"class":35,"line":101},[33,131417,35055],{"class":167},[33,131419,242],{"class":163},[33,131421,131422],{"class":167}," pd.merge(left, right, ",[33,131424,2091],{"class":238},[33,131426,242],{"class":163},[33,131428,108849],{"class":54},[33,131430,221],{"class":167},[33,131432,131433,131435],{"class":35,"line":171},[33,131434,13474],{"class":50},[33,131436,131437],{"class":167},"(result.columns.tolist())\n",[33,131439,131440],{"class":35,"line":179},[33,131441,131442],{"class":39},"# ['order_id', 'status_x', 'amount', 'status_y', 'notes']\n",[14,131444,131445,131447,131448,131450,131451,36608,131453,131455],{},[30,131446,68351],{}," appears in both frames. Because it is not listed in ",[30,131449,131242],{},", pandas duplicates it with ",[30,131452,28106],{},[30,131454,28109],{}," instead of silently discarding one side.",[14,131457,131458,131459,131461,131462,1351,131464,131466,131467,131470,131471,10065,131474,131477],{},"The suffix logic is intentional: pandas has no way to know which version of ",[30,131460,68351],{}," is correct, so it keeps both and lets you decide. 
The problem is that ",[30,131463,28106],{},[30,131465,28109],{}," are meaningless names. If both frames also share a second non-key column (say ",[30,131468,131469],{},"updated_at","), you get ",[30,131472,131473],{},"updated_at_x",[30,131475,131476],{},"updated_at_y"," as well. The output quickly becomes unreadable.",[14,131479,131480,131481,131484,131485,131488],{},"The default suffixes are controlled by the ",[30,131482,131483],{},"suffixes"," parameter, which defaults to ",[30,131486,131487],{},"(\"_x\", \"_y\")",". Every fix below either changes those suffixes to something meaningful or eliminates the collision before the merge runs.",[18,131490,35017],{"id":35016},[14,131492,131493],{},"Run this before merging to see exactly which columns will collide:",[23,131495,131497],{"className":126,"code":131496,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\ndef find_overlapping_cols(left: pd.DataFrame, right: pd.DataFrame, key: str | list) -> list[str]:\n    \"\"\"Return non-key columns present in both frames.\"\"\"\n    keys = {key} if isinstance(key, str) else set(key)\n    left_cols  = set(left.columns)  - keys\n    right_cols = set(right.columns) - keys\n    return sorted(left_cols & right_cols)\n\nprint(find_overlapping_cols(left, right, key=\"order_id\"))\n# ['status']\n",[30,131498,131499,131503,131513,131517,131539,131544,131572,131589,131605,131620,131624,131639],{"__ignoreMap":28},[33,131500,131501],{"class":35,"line":36},[33,131502,8895],{"class":39},[33,131504,131505,131507,131509,131511],{"class":35,"line":43},[33,131506,164],{"class":163},[33,131508,492],{"class":167},[33,131510,495],{"class":163},[33,131512,498],{"class":167},[33,131514,131515],{"class":35,"line":61},[33,131516,92],{"emptyLinePlaceholder":91},[33,131518,131519,131521,131524,131527,131529,131531,131533,131535,131537],{"class":35,"line":73},[33,131520,562],{"class":163},[33,131522,131523],{"class":46}," 
find_overlapping_cols",[33,131525,131526],{"class":167},"(left: pd.DataFrame, right: pd.DataFrame, key: ",[33,131528,1053],{"class":50},[33,131530,2850],{"class":163},[33,131532,599],{"class":50},[33,131534,44775],{"class":167},[33,131536,1053],{"class":50},[33,131538,17477],{"class":167},[33,131540,131541],{"class":35,"line":88},[33,131542,131543],{"class":54},"    \"\"\"Return non-key columns present in both frames.\"\"\"\n",[33,131545,131546,131549,131551,131554,131556,131558,131561,131563,131565,131567,131569],{"class":35,"line":95},[33,131547,131548],{"class":167},"    keys ",[33,131550,242],{"class":163},[33,131552,131553],{"class":167}," {key} ",[33,131555,2491],{"class":163},[33,131557,36538],{"class":50},[33,131559,131560],{"class":167},"(key, ",[33,131562,1053],{"class":50},[33,131564,1649],{"class":167},[33,131566,7489],{"class":163},[33,131568,4129],{"class":50},[33,131570,131571],{"class":167},"(key)\n",[33,131573,131574,131577,131579,131581,131584,131586],{"class":35,"line":101},[33,131575,131576],{"class":167},"    left_cols  ",[33,131578,242],{"class":163},[33,131580,4129],{"class":50},[33,131582,131583],{"class":167},"(left.columns)  ",[33,131585,4126],{"class":163},[33,131587,131588],{"class":167}," keys\n",[33,131590,131591,131594,131596,131598,131601,131603],{"class":35,"line":171},[33,131592,131593],{"class":167},"    right_cols ",[33,131595,242],{"class":163},[33,131597,4129],{"class":50},[33,131599,131600],{"class":167},"(right.columns) ",[33,131602,4126],{"class":163},[33,131604,131588],{"class":167},[33,131606,131607,131609,131611,131614,131617],{"class":35,"line":179},[33,131608,1332],{"class":163},[33,131610,28924],{"class":50},[33,131612,131613],{"class":167},"(left_cols ",[33,131615,131616],{"class":163},"&",[33,131618,131619],{"class":167}," 
right_cols)\n",[33,131621,131622],{"class":35,"line":187},[33,131623,92],{"emptyLinePlaceholder":91},[33,131625,131626,131628,131631,131633,131635,131637],{"class":35,"line":201},[33,131627,13474],{"class":50},[33,131629,131630],{"class":167},"(find_overlapping_cols(left, right, ",[33,131632,44114],{"class":238},[33,131634,242],{"class":163},[33,131636,108849],{"class":54},[33,131638,371],{"class":167},[33,131640,131641],{"class":35,"line":206},[33,131642,131643],{"class":39},"# ['status']\n",[14,131645,131646,131647,3035],{},"If this list is non-empty, choose one of the fixes below before calling ",[30,131648,127087],{},[14,131650,131651,131652,131655],{},"Knowing the colliding column names up front also tells you whether the overlap is intentional (both frames have genuinely different values for the same concept, like order status vs payment status) or accidental (both frames have a ",[30,131653,131654],{},"created_at"," audit column that carries identical values and should just be deduplicated). 
The fix differs for each case.",[18,131657,131659],{"id":131658},"fix-1-set-meaningful-suffixes","Fix 1 — Set Meaningful Suffixes",[14,131661,14408,131662,131665,131666,1351,131668,131670],{},[30,131663,131664],{},"suffixes="," to replace the default ",[30,131667,28106],{},[30,131669,28109],{}," with labels that describe each source:",[23,131672,131674],{"className":126,"code":131673,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"open\",  \"closed\"], \"amount\": [100, 200]})\nright = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"paid\",  \"refund\" ], \"notes\":  [\"ok\", \"check\"]})\n\nresult = pd.merge(\n    left, right,\n    on=\"order_id\",\n    suffixes=(\"_order\", \"_payment\"),   # replaces _x \u002F _y\n)\nprint(result.columns.tolist())\n# ['order_id', 'status_order', 'amount', 'status_payment', 'notes']\n",[30,131675,131676,131680,131690,131694,131738,131782,131786,131795,131800,131811,131833,131837,131843],{"__ignoreMap":28},[33,131677,131678],{"class":35,"line":36},[33,131679,8895],{"class":39},[33,131681,131682,131684,131686,131688],{"class":35,"line":43},[33,131683,164],{"class":163},[33,131685,492],{"class":167},[33,131687,495],{"class":163},[33,131689,498],{"class":167},[33,131691,131692],{"class":35,"line":61},[33,131693,92],{"emptyLinePlaceholder":91},[33,131695,131696,131698,131700,131702,131704,131706,131708,131710,131712,131714,131716,131718,131720,131722,131724,131726,131728,131730,131732,131734,131736],{"class":35,"line":73},[33,131697,131317],{"class":167},[33,131699,242],{"class":163},[33,131701,101407],{"class":167},[33,131703,108849],{"class":54},[33,131705,12426],{"class":167},[33,131707,734],{"class":50},[33,131709,365],{"class":167},[33,131711,1533],{"class":50},[33,131713,8314],{"class":167},[33,131715,43379],{"class":54},[33,131717,12426],{"class":167},[33,131719,131340],{"class":54},[33,131721,25480],{"class":167},[33,131723,131345],{
"class":54},[33,131725,8314],{"class":167},[33,131727,4106],{"class":54},[33,131729,12426],{"class":167},[33,131731,2650],{"class":50},[33,131733,365],{"class":167},[33,131735,2611],{"class":50},[33,131737,45051],{"class":167},[33,131739,131740,131742,131744,131746,131748,131750,131752,131754,131756,131758,131760,131762,131764,131766,131768,131770,131772,131774,131776,131778,131780],{"class":35,"line":88},[33,131741,131364],{"class":167},[33,131743,242],{"class":163},[33,131745,101407],{"class":167},[33,131747,108849],{"class":54},[33,131749,12426],{"class":167},[33,131751,734],{"class":50},[33,131753,365],{"class":167},[33,131755,1533],{"class":50},[33,131757,8314],{"class":167},[33,131759,43379],{"class":54},[33,131761,12426],{"class":167},[33,131763,131387],{"class":54},[33,131765,25480],{"class":167},[33,131767,131392],{"class":54},[33,131769,131395],{"class":167},[33,131771,131398],{"class":54},[33,131773,11818],{"class":167},[33,131775,57024],{"class":54},[33,131777,365],{"class":167},[33,131779,131407],{"class":54},[33,131781,45051],{"class":167},[33,131783,131784],{"class":35,"line":95},[33,131785,92],{"emptyLinePlaceholder":91},[33,131787,131788,131790,131792],{"class":35,"line":101},[33,131789,35055],{"class":167},[33,131791,242],{"class":163},[33,131793,131794],{"class":167}," pd.merge(\n",[33,131796,131797],{"class":35,"line":171},[33,131798,131799],{"class":167},"    left, right,\n",[33,131801,131802,131805,131807,131809],{"class":35,"line":179},[33,131803,131804],{"class":238},"    on",[33,131806,242],{"class":163},[33,131808,108849],{"class":54},[33,131810,247],{"class":167},[33,131812,131813,131816,131818,131820,131823,131825,131828,131830],{"class":35,"line":187},[33,131814,131815],{"class":238},"    
suffixes",[33,131817,242],{"class":163},[33,131819,602],{"class":167},[33,131821,131822],{"class":54},"\"_order\"",[33,131824,365],{"class":167},[33,131826,131827],{"class":54},"\"_payment\"",[33,131829,122176],{"class":167},[33,131831,131832],{"class":39},"# replaces _x \u002F _y\n",[33,131834,131835],{"class":35,"line":201},[33,131836,221],{"class":167},[33,131838,131839,131841],{"class":35,"line":206},[33,131840,13474],{"class":50},[33,131842,131437],{"class":167},[33,131844,131845],{"class":35,"line":224},[33,131846,131847],{"class":39},"# ['order_id', 'status_order', 'amount', 'status_payment', 'notes']\n",[14,131849,131850,131851,10065,131853,131855],{},"Use this when both columns carry different, useful information (e.g., order status vs. payment status). The suffix applies to every overlapping column in one call — if ",[30,131852,68351],{},[30,131854,131469],{}," both collide, both get the suffix.",[14,131857,131858],{},"When you have many overlapping columns and want to see the full renamed list before running the merge:",[23,131860,131862],{"className":126,"code":131861,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"open\", \"closed\"], \"updated_at\": [\"2026-01-01\", \"2026-01-02\"]})\nright = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"paid\", \"refund\"], \"updated_at\": [\"2026-01-05\", \"2026-01-06\"], \"notes\": [\"ok\", \"check\"]})\n\n# Preview which columns would be renamed before committing\noverlap = find_overlapping_cols(left, right, key=\"order_id\")\nprint({col: (col + \"_order\", col + \"_payment\") for col in overlap})\n# {'status': ('status_order', 'status_payment'), 'updated_at': ('updated_at_order', 
'updated_at_payment')}\n",[30,131863,131864,131868,131878,131882,131927,131985,131989,131994,132012,132043],{"__ignoreMap":28},[33,131865,131866],{"class":35,"line":36},[33,131867,8895],{"class":39},[33,131869,131870,131872,131874,131876],{"class":35,"line":43},[33,131871,164],{"class":163},[33,131873,492],{"class":167},[33,131875,495],{"class":163},[33,131877,498],{"class":167},[33,131879,131880],{"class":35,"line":61},[33,131881,92],{"emptyLinePlaceholder":91},[33,131883,131884,131886,131888,131890,131892,131894,131896,131898,131900,131902,131904,131906,131908,131910,131912,131914,131916,131918,131920,131922,131925],{"class":35,"line":73},[33,131885,131317],{"class":167},[33,131887,242],{"class":163},[33,131889,101407],{"class":167},[33,131891,108849],{"class":54},[33,131893,12426],{"class":167},[33,131895,734],{"class":50},[33,131897,365],{"class":167},[33,131899,1533],{"class":50},[33,131901,8314],{"class":167},[33,131903,43379],{"class":54},[33,131905,12426],{"class":167},[33,131907,131340],{"class":54},[33,131909,365],{"class":167},[33,131911,131345],{"class":54},[33,131913,8314],{"class":167},[33,131915,114134],{"class":54},[33,131917,12426],{"class":167},[33,131919,51713],{"class":54},[33,131921,365],{"class":167},[33,131923,131924],{"class":54},"\"2026-01-02\"",[33,131926,45051],{"class":167},[33,131928,131929,131931,131933,131935,131937,131939,131941,131943,131945,131947,131949,131951,131953,131955,131957,131959,131961,131963,131966,131968,131971,131973,131975,131977,131979,131981,131983],{"class":35,"line":88},[33,131930,131364],{"class":167},[33,131932,242],{"class":163},[33,131934,101407],{"class":167},[33,131936,108849],{"class":54},[33,131938,12426],{"class":167},[33,131940,734],{"class":50},[33,131942,365],{"class":167},[33,131944,1533],{"class":50},[33,131946,8314],{"class":167},[33,131948,43379],{"class":54},[33,131950,12426],{"class":167},[33,131952,131387],{"class":54},[33,131954,365],{"class":167},[33,131956,131392],{"class":54},[33,131958,8314]
,{"class":167},[33,131960,114134],{"class":54},[33,131962,12426],{"class":167},[33,131964,131965],{"class":54},"\"2026-01-05\"",[33,131967,365],{"class":167},[33,131969,131970],{"class":54},"\"2026-01-06\"",[33,131972,8314],{"class":167},[33,131974,131398],{"class":54},[33,131976,12426],{"class":167},[33,131978,57024],{"class":54},[33,131980,365],{"class":167},[33,131982,131407],{"class":54},[33,131984,45051],{"class":167},[33,131986,131987],{"class":35,"line":95},[33,131988,92],{"emptyLinePlaceholder":91},[33,131990,131991],{"class":35,"line":101},[33,131992,131993],{"class":39},"# Preview which columns would be renamed before committing\n",[33,131995,131996,131999,132001,132004,132006,132008,132010],{"class":35,"line":171},[33,131997,131998],{"class":167},"overlap ",[33,132000,242],{"class":163},[33,132002,132003],{"class":167}," find_overlapping_cols(left, right, ",[33,132005,44114],{"class":238},[33,132007,242],{"class":163},[33,132009,108849],{"class":54},[33,132011,221],{"class":167},[33,132013,132014,132016,132019,132021,132024,132027,132029,132032,132034,132036,132038,132040],{"class":35,"line":179},[33,132015,13474],{"class":50},[33,132017,132018],{"class":167},"({col: (col ",[33,132020,1811],{"class":163},[33,132022,132023],{"class":54}," \"_order\"",[33,132025,132026],{"class":167},", col ",[33,132028,1811],{"class":163},[33,132030,132031],{"class":54}," \"_payment\"",[33,132033,1649],{"class":167},[33,132035,6124],{"class":163},[33,132037,7985],{"class":167},[33,132039,662],{"class":163},[33,132041,132042],{"class":167}," overlap})\n",[33,132044,132045],{"class":35,"line":187},[33,132046,132047],{"class":39},"# {'status': ('status_order', 'status_payment'), 'updated_at': ('updated_at_order', 'updated_at_payment')}\n",[18,132049,132051],{"id":132050},"fix-2-drop-or-rename-before-merging","Fix 2 — Drop or Rename Before Merging",[14,132053,132054],{},"When one side's version is authoritative, drop the other before the merge so no suffix is 
needed:",[23,132056,132058],{"className":126,"code":132057,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"open\",  \"closed\"], \"amount\": [100, 200]})\nright = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"paid\",  \"refund\" ], \"notes\":  [\"ok\", \"check\"]})\n\n# Keep only the left-hand status; drop the right-hand duplicate before merging\nresult = pd.merge(\n    left,\n    right.drop(columns=[\"status\"]),   # remove the duplicate column\n    on=\"order_id\",\n)\nprint(result.columns.tolist())\n# ['order_id', 'status', 'amount', 'notes']\n",[30,132059,132060,132064,132074,132078,132122,132166,132170,132175,132183,132188,132207,132217,132221,132227],{"__ignoreMap":28},[33,132061,132062],{"class":35,"line":36},[33,132063,8895],{"class":39},[33,132065,132066,132068,132070,132072],{"class":35,"line":43},[33,132067,164],{"class":163},[33,132069,492],{"class":167},[33,132071,495],{"class":163},[33,132073,498],{"class":167},[33,132075,132076],{"class":35,"line":61},[33,132077,92],{"emptyLinePlaceholder":91},[33,132079,132080,132082,132084,132086,132088,132090,132092,132094,132096,132098,132100,132102,132104,132106,132108,132110,132112,132114,132116,132118,132120],{"class":35,"line":73},[33,132081,131317],{"class":167},[33,132083,242],{"class":163},[33,132085,101407],{"class":167},[33,132087,108849],{"class":54},[33,132089,12426],{"class":167},[33,132091,734],{"class":50},[33,132093,365],{"class":167},[33,132095,1533],{"class":50},[33,132097,8314],{"class":167},[33,132099,43379],{"class":54},[33,132101,12426],{"class":167},[33,132103,131340],{"class":54},[33,132105,25480],{"class":167},[33,132107,131345],{"class":54},[33,132109,8314],{"class":167},[33,132111,4106],{"class":54},[33,132113,12426],{"class":167},[33,132115,2650],{"class":50},[33,132117,365],{"class":167},[33,132119,2611],{"class":50},[33,132121,45051],{"class":167},[33,132123,132124,132126,132128,132130,132
132,132134,132136,132138,132140,132142,132144,132146,132148,132150,132152,132154,132156,132158,132160,132162,132164],{"class":35,"line":88},[33,132125,131364],{"class":167},[33,132127,242],{"class":163},[33,132129,101407],{"class":167},[33,132131,108849],{"class":54},[33,132133,12426],{"class":167},[33,132135,734],{"class":50},[33,132137,365],{"class":167},[33,132139,1533],{"class":50},[33,132141,8314],{"class":167},[33,132143,43379],{"class":54},[33,132145,12426],{"class":167},[33,132147,131387],{"class":54},[33,132149,25480],{"class":167},[33,132151,131392],{"class":54},[33,132153,131395],{"class":167},[33,132155,131398],{"class":54},[33,132157,11818],{"class":167},[33,132159,57024],{"class":54},[33,132161,365],{"class":167},[33,132163,131407],{"class":54},[33,132165,45051],{"class":167},[33,132167,132168],{"class":35,"line":95},[33,132169,92],{"emptyLinePlaceholder":91},[33,132171,132172],{"class":35,"line":101},[33,132173,132174],{"class":39},"# Keep only the left-hand status; drop the right-hand duplicate before merging\n",[33,132176,132177,132179,132181],{"class":35,"line":171},[33,132178,35055],{"class":167},[33,132180,242],{"class":163},[33,132182,131794],{"class":167},[33,132184,132185],{"class":35,"line":179},[33,132186,132187],{"class":167},"    left,\n",[33,132189,132190,132193,132195,132197,132199,132201,132204],{"class":35,"line":187},[33,132191,132192],{"class":167},"    right.drop(",[33,132194,740],{"class":238},[33,132196,242],{"class":163},[33,132198,8309],{"class":167},[33,132200,43379],{"class":54},[33,132202,132203],{"class":167},"]),   ",[33,132205,132206],{"class":39},"# remove the duplicate 
column\n",[33,132208,132209,132211,132213,132215],{"class":35,"line":201},[33,132210,131804],{"class":238},[33,132212,242],{"class":163},[33,132214,108849],{"class":54},[33,132216,247],{"class":167},[33,132218,132219],{"class":35,"line":206},[33,132220,221],{"class":167},[33,132222,132223,132225],{"class":35,"line":224},[33,132224,13474],{"class":50},[33,132226,131437],{"class":167},[33,132228,132229],{"class":35,"line":229},[33,132230,132231],{"class":39},"# ['order_id', 'status', 'amount', 'notes']\n",[14,132233,132234],{},"Alternatively, rename the right-hand column to something meaningful before the merge:",[23,132236,132238],{"className":126,"code":132237,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"open\",  \"closed\"], \"amount\": [100, 200]})\nright = pd.DataFrame({\"order_id\": [1, 2], \"status\": [\"paid\",  \"refund\" ], \"notes\":  [\"ok\", \"check\"]})\n\nright_renamed = right.rename(columns={\"status\": \"payment_status\"})\nresult = pd.merge(left, right_renamed, on=\"order_id\")\nprint(result.columns.tolist())\n# ['order_id', 'status', 'amount', 'payment_status', 
'notes']\n",[30,132239,132240,132244,132254,132258,132302,132346,132350,132375,132392,132398],{"__ignoreMap":28},[33,132241,132242],{"class":35,"line":36},[33,132243,8895],{"class":39},[33,132245,132246,132248,132250,132252],{"class":35,"line":43},[33,132247,164],{"class":163},[33,132249,492],{"class":167},[33,132251,495],{"class":163},[33,132253,498],{"class":167},[33,132255,132256],{"class":35,"line":61},[33,132257,92],{"emptyLinePlaceholder":91},[33,132259,132260,132262,132264,132266,132268,132270,132272,132274,132276,132278,132280,132282,132284,132286,132288,132290,132292,132294,132296,132298,132300],{"class":35,"line":73},[33,132261,131317],{"class":167},[33,132263,242],{"class":163},[33,132265,101407],{"class":167},[33,132267,108849],{"class":54},[33,132269,12426],{"class":167},[33,132271,734],{"class":50},[33,132273,365],{"class":167},[33,132275,1533],{"class":50},[33,132277,8314],{"class":167},[33,132279,43379],{"class":54},[33,132281,12426],{"class":167},[33,132283,131340],{"class":54},[33,132285,25480],{"class":167},[33,132287,131345],{"class":54},[33,132289,8314],{"class":167},[33,132291,4106],{"class":54},[33,132293,12426],{"class":167},[33,132295,2650],{"class":50},[33,132297,365],{"class":167},[33,132299,2611],{"class":50},[33,132301,45051],{"class":167},[33,132303,132304,132306,132308,132310,132312,132314,132316,132318,132320,132322,132324,132326,132328,132330,132332,132334,132336,132338,132340,132342,132344],{"class":35,"line":88},[33,132305,131364],{"class":167},[33,132307,242],{"class":163},[33,132309,101407],{"class":167},[33,132311,108849],{"class":54},[33,132313,12426],{"class":167},[33,132315,734],{"class":50},[33,132317,365],{"class":167},[33,132319,1533],{"class":50},[33,132321,8314],{"class":167},[33,132323,43379],{"class":54},[33,132325,12426],{"class":167},[33,132327,131387],{"class":54},[33,132329,25480],{"class":167},[33,132331,131392],{"class":54},[33,132333,131395],{"class":167},[33,132335,131398],{"class":54},[33,132337,11818],{"class
":167},[33,132339,57024],{"class":54},[33,132341,365],{"class":167},[33,132343,131407],{"class":54},[33,132345,45051],{"class":167},[33,132347,132348],{"class":35,"line":95},[33,132349,92],{"emptyLinePlaceholder":91},[33,132351,132352,132355,132357,132360,132362,132364,132366,132368,132370,132373],{"class":35,"line":101},[33,132353,132354],{"class":167},"right_renamed ",[33,132356,242],{"class":163},[33,132358,132359],{"class":167}," right.rename(",[33,132361,740],{"class":238},[33,132363,242],{"class":163},[33,132365,1115],{"class":167},[33,132367,43379],{"class":54},[33,132369,2079],{"class":167},[33,132371,132372],{"class":54},"\"payment_status\"",[33,132374,103249],{"class":167},[33,132376,132377,132379,132381,132384,132386,132388,132390],{"class":35,"line":171},[33,132378,35055],{"class":167},[33,132380,242],{"class":163},[33,132382,132383],{"class":167}," pd.merge(left, right_renamed, ",[33,132385,2091],{"class":238},[33,132387,242],{"class":163},[33,132389,108849],{"class":54},[33,132391,221],{"class":167},[33,132393,132394,132396],{"class":35,"line":179},[33,132395,13474],{"class":50},[33,132397,131437],{"class":167},[33,132399,132400],{"class":35,"line":187},[33,132401,132402],{"class":39},"# ['order_id', 'status', 'amount', 'payment_status', 'notes']\n",[14,132404,132405,132406,132409],{},"Renaming is cleaner than dropping when you genuinely need both values downstream but want them to have unambiguous names from the start. 
If the right-hand frame comes from a third-party source you don't control, renaming at the point of load (inside your ",[30,132407,132408],{},"load_files"," function) keeps the merge call clean.",[14,132411,132412,132413,132416,132417,132419],{},"When normalizing column names for merging, the same ",[30,132414,132415],{},"re.sub"," lowercase pattern used in ",[940,132418,9599],{"href":9598}," works here too — run it before the merge, not after.",[18,132421,132423],{"id":132422},"fix-3-coalesce-with-combine_first","Fix 3 — Coalesce with combine_first",[14,132425,132426,132427,132429],{},"When both frames have the same column but with gaps (one has values where the other has ",[30,132428,8884],{},"), merge first and then coalesce:",[23,132431,132433],{"className":126,"code":132432,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"id\": [1, 2, 3], \"score\": [85.0, None,  90.0]})\nright = pd.DataFrame({\"id\": [1, 2, 3], \"score\": [None, 72.0, None]})\n\nresult = pd.merge(left, right, on=\"id\", suffixes=(\"_left\", \"_right\"))\n\n# Coalesce: take left value if available, fall back to right\nresult[\"score\"] = result[\"score_left\"].combine_first(result[\"score_right\"])\nresult = result.drop(columns=[\"score_left\", \"score_right\"])   # clean up\nprint(result)\n#    id  score\n# 0   1   85.0\n# 1   2   72.0\n# 2   3   
90.0\n",[30,132434,132435,132439,132449,132453,132495,132536,132540,132572,132576,132581,132605,132631,132637,132642,132647,132652],{"__ignoreMap":28},[33,132436,132437],{"class":35,"line":36},[33,132438,8895],{"class":39},[33,132440,132441,132443,132445,132447],{"class":35,"line":43},[33,132442,164],{"class":163},[33,132444,492],{"class":167},[33,132446,495],{"class":163},[33,132448,498],{"class":167},[33,132450,132451],{"class":35,"line":61},[33,132452,92],{"emptyLinePlaceholder":91},[33,132454,132455,132457,132459,132461,132463,132465,132467,132469,132471,132473,132475,132477,132479,132481,132484,132486,132488,132490,132493],{"class":35,"line":73},[33,132456,131317],{"class":167},[33,132458,242],{"class":163},[33,132460,101407],{"class":167},[33,132462,57101],{"class":54},[33,132464,12426],{"class":167},[33,132466,734],{"class":50},[33,132468,365],{"class":167},[33,132470,1533],{"class":50},[33,132472,365],{"class":167},[33,132474,10258],{"class":50},[33,132476,8314],{"class":167},[33,132478,122826],{"class":54},[33,132480,12426],{"class":167},[33,132482,132483],{"class":50},"85.0",[33,132485,365],{"class":167},[33,132487,571],{"class":50},[33,132489,25480],{"class":167},[33,132491,132492],{"class":50},"90.0",[33,132494,45051],{"class":167},[33,132496,132497,132499,132501,132503,132505,132507,132509,132511,132513,132515,132517,132519,132521,132523,132525,132527,132530,132532,132534],{"class":35,"line":88},[33,132498,131364],{"class":167},[33,132500,242],{"class":163},[33,132502,101407],{"class":167},[33,132504,57101],{"class":54},[33,132506,12426],{"class":167},[33,132508,734],{"class":50},[33,132510,365],{"class":167},[33,132512,1533],{"class":50},[33,132514,365],{"class":167},[33,132516,10258],{"class":50},[33,132518,8314],{"class":167},[33,132520,122826],{"class":54},[33,132522,12426],{"class":167},[33,132524,571],{"class":50},[33,132526,365],{"class":167},[33,132528,132529],{"class":50},"72.0",[33,132531,365],{"class":167},[33,132533,571],{"class":50},[33,132
535,45051],{"class":167},[33,132537,132538],{"class":35,"line":95},[33,132539,92],{"emptyLinePlaceholder":91},[33,132541,132542,132544,132546,132548,132550,132552,132554,132556,132558,132560,132562,132565,132567,132570],{"class":35,"line":101},[33,132543,35055],{"class":167},[33,132545,242],{"class":163},[33,132547,131422],{"class":167},[33,132549,2091],{"class":238},[33,132551,242],{"class":163},[33,132553,57101],{"class":54},[33,132555,365],{"class":167},[33,132557,131483],{"class":238},[33,132559,242],{"class":163},[33,132561,602],{"class":167},[33,132563,132564],{"class":54},"\"_left\"",[33,132566,365],{"class":167},[33,132568,132569],{"class":54},"\"_right\"",[33,132571,371],{"class":167},[33,132573,132574],{"class":35,"line":171},[33,132575,92],{"emptyLinePlaceholder":91},[33,132577,132578],{"class":35,"line":179},[33,132579,132580],{"class":39},"# Coalesce: take left value if available, fall back to right\n",[33,132582,132583,132586,132588,132590,132592,132594,132597,132600,132603],{"class":35,"line":187},[33,132584,132585],{"class":167},"result[",[33,132587,122826],{"class":54},[33,132589,763],{"class":167},[33,132591,242],{"class":163},[33,132593,49611],{"class":167},[33,132595,132596],{"class":54},"\"score_left\"",[33,132598,132599],{"class":167},"].combine_first(result[",[33,132601,132602],{"class":54},"\"score_right\"",[33,132604,751],{"class":167},[33,132606,132607,132609,132611,132614,132616,132618,132620,132622,132624,132626,132628],{"class":35,"line":201},[33,132608,35055],{"class":167},[33,132610,242],{"class":163},[33,132612,132613],{"class":167}," result.drop(",[33,132615,740],{"class":238},[33,132617,242],{"class":163},[33,132619,8309],{"class":167},[33,132621,132596],{"class":54},[33,132623,365],{"class":167},[33,132625,132602],{"class":54},[33,132627,7283],{"class":167},[33,132629,132630],{"class":39},"# clean 
up\n",[33,132632,132633,132635],{"class":35,"line":206},[33,132634,13474],{"class":50},[33,132636,8864],{"class":167},[33,132638,132639],{"class":35,"line":224},[33,132640,132641],{"class":39},"#    id  score\n",[33,132643,132644],{"class":35,"line":229},[33,132645,132646],{"class":39},"# 0   1   85.0\n",[33,132648,132649],{"class":35,"line":235},[33,132650,132651],{"class":39},"# 1   2   72.0\n",[33,132653,132654],{"class":35,"line":250},[33,132655,132656],{"class":39},"# 2   3   90.0\n",[14,132658,132659,132661,132662,132664],{},[30,132660,131260],{}," fills ",[30,132663,8884],{}," in the caller Series with values from the argument Series at matching index positions.",[14,132666,132667,132668,20891],{},"An equivalent using ",[30,132669,132670],{},"fillna",[23,132672,132674],{"className":126,"code":132673,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nresult[\"score\"] = result[\"score_left\"].fillna(result[\"score_right\"])\nresult = result.drop(columns=[\"score_left\", 
\"score_right\"])\n",[30,132675,132676,132680,132690,132694,132715],{"__ignoreMap":28},[33,132677,132678],{"class":35,"line":36},[33,132679,8895],{"class":39},[33,132681,132682,132684,132686,132688],{"class":35,"line":43},[33,132683,164],{"class":163},[33,132685,492],{"class":167},[33,132687,495],{"class":163},[33,132689,498],{"class":167},[33,132691,132692],{"class":35,"line":61},[33,132693,92],{"emptyLinePlaceholder":91},[33,132695,132696,132698,132700,132702,132704,132706,132708,132711,132713],{"class":35,"line":73},[33,132697,132585],{"class":167},[33,132699,122826],{"class":54},[33,132701,763],{"class":167},[33,132703,242],{"class":163},[33,132705,49611],{"class":167},[33,132707,132596],{"class":54},[33,132709,132710],{"class":167},"].fillna(result[",[33,132712,132602],{"class":54},[33,132714,751],{"class":167},[33,132716,132717,132719,132721,132723,132725,132727,132729,132731,132733,132735],{"class":35,"line":88},[33,132718,35055],{"class":167},[33,132720,242],{"class":163},[33,132722,132613],{"class":167},[33,132724,740],{"class":238},[33,132726,242],{"class":163},[33,132728,8309],{"class":167},[33,132730,132596],{"class":54},[33,132732,365],{"class":167},[33,132734,132602],{"class":54},[33,132736,751],{"class":167},[14,132738,132739,132740,132742,132743,132745],{},"Both produce the same output. ",[30,132741,131260],{}," is slightly more idiomatic for \"left-wins\" coalescing; ",[30,132744,132670],{}," is more explicit about the direction.",[18,132747,132749],{"id":132748},"variant-merging-on-differently-named-keys","Variant — Merging on Differently-Named Keys",[14,132751,132752,132753,36608,132755,132757],{},"When the join key has different names in each frame, use ",[30,132754,131285],{},[30,132756,131288],{},". 
pandas keeps both key columns in the output, which causes an apparent duplicate:",[23,132759,132761],{"className":126,"code":132760,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nsales   = pd.DataFrame({\"sale_region\": [\"North\", \"South\"], \"revenue\": [37000, 29400]})\ntargets = pd.DataFrame({\"region\":      [\"North\", \"South\"], \"target\":  [40000, 32000]})\n\nresult = pd.merge(sales, targets, left_on=\"sale_region\", right_on=\"region\")\nprint(result.columns.tolist())\n# ['sale_region', 'revenue', 'region', 'target']  ← two region-like columns\n",[30,132762,132763,132767,132777,132781,132817,132854,132858,132883,132889],{"__ignoreMap":28},[33,132764,132765],{"class":35,"line":36},[33,132766,8895],{"class":39},[33,132768,132769,132771,132773,132775],{"class":35,"line":43},[33,132770,164],{"class":163},[33,132772,492],{"class":167},[33,132774,495],{"class":163},[33,132776,498],{"class":167},[33,132778,132779],{"class":35,"line":61},[33,132780,92],{"emptyLinePlaceholder":91},[33,132782,132783,132786,132788,132790,132793,132795,132797,132799,132801,132803,132805,132807,132810,132812,132815],{"class":35,"line":73},[33,132784,132785],{"class":167},"sales   ",[33,132787,242],{"class":163},[33,132789,101407],{"class":167},[33,132791,132792],{"class":54},"\"sale_region\"",[33,132794,12426],{"class":167},[33,132796,11760],{"class":54},[33,132798,365],{"class":167},[33,132800,11773],{"class":54},[33,132802,8314],{"class":167},[33,132804,16465],{"class":54},[33,132806,12426],{"class":167},[33,132808,132809],{"class":50},"37000",[33,132811,365],{"class":167},[33,132813,132814],{"class":50},"29400",[33,132816,45051],{"class":167},[33,132818,132819,132822,132824,132826,132828,132831,132833,132835,132837,132839,132842,132844,132847,132849,132852],{"class":35,"line":88},[33,132820,132821],{"class":167},"targets 
",[33,132823,242],{"class":163},[33,132825,101407],{"class":167},[33,132827,16649],{"class":54},[33,132829,132830],{"class":167},":      [",[33,132832,11760],{"class":54},[33,132834,365],{"class":167},[33,132836,11773],{"class":54},[33,132838,8314],{"class":167},[33,132840,132841],{"class":54},"\"target\"",[33,132843,11818],{"class":167},[33,132845,132846],{"class":50},"40000",[33,132848,365],{"class":167},[33,132850,132851],{"class":50},"32000",[33,132853,45051],{"class":167},[33,132855,132856],{"class":35,"line":95},[33,132857,92],{"emptyLinePlaceholder":91},[33,132859,132860,132862,132864,132867,132869,132871,132873,132875,132877,132879,132881],{"class":35,"line":101},[33,132861,35055],{"class":167},[33,132863,242],{"class":163},[33,132865,132866],{"class":167}," pd.merge(sales, targets, ",[33,132868,131264],{"class":238},[33,132870,242],{"class":163},[33,132872,132792],{"class":54},[33,132874,365],{"class":167},[33,132876,131267],{"class":238},[33,132878,242],{"class":163},[33,132880,16649],{"class":54},[33,132882,221],{"class":167},[33,132884,132885,132887],{"class":35,"line":171},[33,132886,13474],{"class":50},[33,132888,131437],{"class":167},[33,132890,132891],{"class":35,"line":179},[33,132892,132893],{"class":39},"# ['sale_region', 'revenue', 'region', 'target']  ← two region-like columns\n",[14,132895,132896],{},"Drop the redundant right-hand key after the merge:",[23,132898,132900],{"className":126,"code":132899,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nsales   = pd.DataFrame({\"sale_region\": [\"North\", \"South\"], \"revenue\": [37000, 29400]})\ntargets = pd.DataFrame({\"region\":      [\"North\", \"South\"], \"target\":  [40000, 32000]})\n\nresult = pd.merge(\n    sales, targets,\n    left_on=\"sale_region\", right_on=\"region\",\n    how=\"left\",\n    validate=\"many_to_one\",   # raises MergeError if targets has duplicate region keys\n)\nresult = result.drop(columns=[\"region\"])   # remove the redundant 
right-hand key\nprint(result)\n#   sale_region  revenue  target\n# 0       North    37000   40000\n# 1       South    29400   32000\n",[30,132901,132902,132906,132916,132920,132952,132984,132988,132996,133001,133020,133031,133045,133049,133070,133076,133081,133086],{"__ignoreMap":28},[33,132903,132904],{"class":35,"line":36},[33,132905,8895],{"class":39},[33,132907,132908,132910,132912,132914],{"class":35,"line":43},[33,132909,164],{"class":163},[33,132911,492],{"class":167},[33,132913,495],{"class":163},[33,132915,498],{"class":167},[33,132917,132918],{"class":35,"line":61},[33,132919,92],{"emptyLinePlaceholder":91},[33,132921,132922,132924,132926,132928,132930,132932,132934,132936,132938,132940,132942,132944,132946,132948,132950],{"class":35,"line":73},[33,132923,132785],{"class":167},[33,132925,242],{"class":163},[33,132927,101407],{"class":167},[33,132929,132792],{"class":54},[33,132931,12426],{"class":167},[33,132933,11760],{"class":54},[33,132935,365],{"class":167},[33,132937,11773],{"class":54},[33,132939,8314],{"class":167},[33,132941,16465],{"class":54},[33,132943,12426],{"class":167},[33,132945,132809],{"class":50},[33,132947,365],{"class":167},[33,132949,132814],{"class":50},[33,132951,45051],{"class":167},[33,132953,132954,132956,132958,132960,132962,132964,132966,132968,132970,132972,132974,132976,132978,132980,132982],{"class":35,"line":88},[33,132955,132821],{"class":167},[33,132957,242],{"class":163},[33,132959,101407],{"class":167},[33,132961,16649],{"class":54},[33,132963,132830],{"class":167},[33,132965,11760],{"class":54},[33,132967,365],{"class":167},[33,132969,11773],{"class":54},[33,132971,8314],{"class":167},[33,132973,132841],{"class":54},[33,132975,11818],{"class":167},[33,132977,132846],{"class":50},[33,132979,365],{"class":167},[33,132981,132851],{"class":50},[33,132983,45051],{"class":167},[33,132985,132986],{"class":35,"line":95},[33,132987,92],{"emptyLinePlaceholder":91},[33,132989,132990,132992,132994],{"class":35,"line":101},[33,1329
91,35055],{"class":167},[33,132993,242],{"class":163},[33,132995,131794],{"class":167},[33,132997,132998],{"class":35,"line":171},[33,132999,133000],{"class":167},"    sales, targets,\n",[33,133002,133003,133006,133008,133010,133012,133014,133016,133018],{"class":35,"line":179},[33,133004,133005],{"class":238},"    left_on",[33,133007,242],{"class":163},[33,133009,132792],{"class":54},[33,133011,365],{"class":167},[33,133013,131267],{"class":238},[33,133015,242],{"class":163},[33,133017,16649],{"class":54},[33,133019,247],{"class":167},[33,133021,133022,133025,133027,133029],{"class":35,"line":187},[33,133023,133024],{"class":238},"    how",[33,133026,242],{"class":163},[33,133028,28050],{"class":54},[33,133030,247],{"class":167},[33,133032,133033,133036,133038,133040,133042],{"class":35,"line":201},[33,133034,133035],{"class":238},"    validate",[33,133037,242],{"class":163},[33,133039,127319],{"class":54},[33,133041,1166],{"class":167},[33,133043,133044],{"class":39},"# raises MergeError if targets has duplicate region keys\n",[33,133046,133047],{"class":35,"line":206},[33,133048,221],{"class":167},[33,133050,133051,133053,133055,133057,133059,133061,133063,133065,133067],{"class":35,"line":224},[33,133052,35055],{"class":167},[33,133054,242],{"class":163},[33,133056,132613],{"class":167},[33,133058,740],{"class":238},[33,133060,242],{"class":163},[33,133062,8309],{"class":167},[33,133064,16649],{"class":54},[33,133066,7283],{"class":167},[33,133068,133069],{"class":39},"# remove the redundant right-hand key\n",[33,133071,133072,133074],{"class":35,"line":229},[33,133073,13474],{"class":50},[33,133075,8864],{"class":167},[33,133077,133078],{"class":35,"line":235},[33,133079,133080],{"class":39},"#   sale_region  revenue  target\n",[33,133082,133083],{"class":35,"line":250},[33,133084,133085],{"class":39},"# 0       North    37000   40000\n",[33,133087,133088],{"class":35,"line":266},[33,133089,133090],{"class":39},"# 1       South    29400   
32000\n",[14,133092,133093,20859,133095,133098],{},[30,133094,128183],{},[30,133096,133097],{},"pd.errors.MergeError"," immediately if the right-hand key is not unique, catching cardinality bugs before they silently inflate row counts.",[18,133100,133102],{"id":133101},"variant-mergeerror-columns-overlap-but-no-suffix-specified","Variant — MergeError: columns overlap but no suffix specified",[14,133104,133105,133106,133108,133109,133111],{},"This error fires when the ",[30,133107,131483],{}," tuple contains an empty string ",[30,133110,3198],{}," for a column that would otherwise collide:",[23,133113,133115],{"className":126,"code":133114,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"id\": [1], \"val\": [10]})\nright = pd.DataFrame({\"id\": [1], \"val\": [20]})\n\ntry:\n    pd.merge(left, right, on=\"id\", suffixes=(\"\", \"\"))  # both empty → MergeError\nexcept pd.errors.MergeError as exc:\n    print(exc)\n    # columns overlap but no suffix specified: 
{'val'}\n",[30,133116,133117,133121,133131,133135,133160,133184,133188,133194,133224,133235,133242],{"__ignoreMap":28},[33,133118,133119],{"class":35,"line":36},[33,133120,8895],{"class":39},[33,133122,133123,133125,133127,133129],{"class":35,"line":43},[33,133124,164],{"class":163},[33,133126,492],{"class":167},[33,133128,495],{"class":163},[33,133130,498],{"class":167},[33,133132,133133],{"class":35,"line":61},[33,133134,92],{"emptyLinePlaceholder":91},[33,133136,133137,133139,133141,133143,133145,133147,133149,133151,133154,133156,133158],{"class":35,"line":73},[33,133138,131317],{"class":167},[33,133140,242],{"class":163},[33,133142,101407],{"class":167},[33,133144,57101],{"class":54},[33,133146,12426],{"class":167},[33,133148,734],{"class":50},[33,133150,8314],{"class":167},[33,133152,133153],{"class":54},"\"val\"",[33,133155,12426],{"class":167},[33,133157,3545],{"class":50},[33,133159,45051],{"class":167},[33,133161,133162,133164,133166,133168,133170,133172,133174,133176,133178,133180,133182],{"class":35,"line":88},[33,133163,131364],{"class":167},[33,133165,242],{"class":163},[33,133167,101407],{"class":167},[33,133169,57101],{"class":54},[33,133171,12426],{"class":167},[33,133173,734],{"class":50},[33,133175,8314],{"class":167},[33,133177,133153],{"class":54},[33,133179,12426],{"class":167},[33,133181,2587],{"class":50},[33,133183,45051],{"class":167},[33,133185,133186],{"class":35,"line":95},[33,133187,92],{"emptyLinePlaceholder":91},[33,133189,133190,133192],{"class":35,"line":101},[33,133191,35574],{"class":163},[33,133193,574],{"class":167},[33,133195,133196,133199,133201,133203,133205,133207,133209,133211,133213,133215,133217,133219,133221],{"class":35,"line":171},[33,133197,133198],{"class":167},"    pd.merge(left, right, 
",[33,133200,2091],{"class":238},[33,133202,242],{"class":163},[33,133204,57101],{"class":54},[33,133206,365],{"class":167},[33,133208,131483],{"class":238},[33,133210,242],{"class":163},[33,133212,602],{"class":167},[33,133214,3198],{"class":54},[33,133216,365],{"class":167},[33,133218,3198],{"class":54},[33,133220,58831],{"class":167},[33,133222,133223],{"class":39},"# both empty → MergeError\n",[33,133225,133226,133228,133231,133233],{"class":35,"line":179},[33,133227,35726],{"class":163},[33,133229,133230],{"class":167}," pd.errors.MergeError ",[33,133232,495],{"class":163},[33,133234,1855],{"class":167},[33,133236,133237,133239],{"class":35,"line":187},[33,133238,7268],{"class":50},[33,133240,133241],{"class":167},"(exc)\n",[33,133243,133244],{"class":35,"line":201},[33,133245,133246],{"class":39},"    # columns overlap but no suffix specified: {'val'}\n",[14,133248,133249,133250,133253],{},"Fix: never pass ",[30,133251,133252],{},"(\"\", \"\")"," for overlapping non-key columns. Either remove the duplicate column from one frame first, or provide distinct non-empty suffixes.",[14,133255,133256,133257,86173,133260,133262],{},"A related variant is passing ",[30,133258,133259],{},"suffixes=(None, \"_right\")",[30,133261,571],{}," side means \"keep the original name for the left frame's column\". 
This works when only the right frame's version is the duplicate:",[23,133264,133266],{"className":126,"code":133265,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"id\": [1, 2], \"val\": [10, 20], \"notes\": [\"a\", \"b\"]})\nright = pd.DataFrame({\"id\": [1, 2], \"val\": [11, 22]})\n\nresult = pd.merge(left, right, on=\"id\", suffixes=(None, \"_right\"))\nprint(result.columns.tolist())\n# ['id', 'val', 'notes', 'val_right']\n",[30,133267,133268,133272,133282,133286,133330,133362,133366,133396,133402],{"__ignoreMap":28},[33,133269,133270],{"class":35,"line":36},[33,133271,8895],{"class":39},[33,133273,133274,133276,133278,133280],{"class":35,"line":43},[33,133275,164],{"class":163},[33,133277,492],{"class":167},[33,133279,495],{"class":163},[33,133281,498],{"class":167},[33,133283,133284],{"class":35,"line":61},[33,133285,92],{"emptyLinePlaceholder":91},[33,133287,133288,133290,133292,133294,133296,133298,133300,133302,133304,133306,133308,133310,133312,133314,133316,133318,133320,133322,133324,133326,133328],{"class":35,"line":73},[33,133289,131317],{"class":167},[33,133291,242],{"class":163},[33,133293,101407],{"class":167},[33,133295,57101],{"class":54},[33,133297,12426],{"class":167},[33,133299,734],{"class":50},[33,133301,365],{"class":167},[33,133303,1533],{"class":50},[33,133305,8314],{"class":167},[33,133307,133153],{"class":54},[33,133309,12426],{"class":167},[33,133311,3545],{"class":50},[33,133313,365],{"class":167},[33,133315,2587],{"class":50},[33,133317,8314],{"class":167},[33,133319,131398],{"class":54},[33,133321,12426],{"class":167},[33,133323,118117],{"class":54},[33,133325,365],{"class":167},[33,133327,118134],{"class":54},[33,133329,45051],{"class":167},[33,133331,133332,133334,133336,133338,133340,133342,133344,133346,133348,133350,133352,133354,133356,133358,133360],{"class":35,"line":88},[33,133333,131364],{"class":167},[33,133335,242],{"class":163},[33,133337,101407],{"class":167},[33,
133339,57101],{"class":54},[33,133341,12426],{"class":167},[33,133343,734],{"class":50},[33,133345,365],{"class":167},[33,133347,1533],{"class":50},[33,133349,8314],{"class":167},[33,133351,133153],{"class":54},[33,133353,12426],{"class":167},[33,133355,17260],{"class":50},[33,133357,365],{"class":167},[33,133359,11103],{"class":50},[33,133361,45051],{"class":167},[33,133363,133364],{"class":35,"line":95},[33,133365,92],{"emptyLinePlaceholder":91},[33,133367,133368,133370,133372,133374,133376,133378,133380,133382,133384,133386,133388,133390,133392,133394],{"class":35,"line":101},[33,133369,35055],{"class":167},[33,133371,242],{"class":163},[33,133373,131422],{"class":167},[33,133375,2091],{"class":238},[33,133377,242],{"class":163},[33,133379,57101],{"class":54},[33,133381,365],{"class":167},[33,133383,131483],{"class":238},[33,133385,242],{"class":163},[33,133387,602],{"class":167},[33,133389,571],{"class":50},[33,133391,365],{"class":167},[33,133393,132569],{"class":54},[33,133395,371],{"class":167},[33,133397,133398,133400],{"class":35,"line":171},[33,133399,13474],{"class":50},[33,133401,131437],{"class":167},[33,133403,133404],{"class":35,"line":179},[33,133405,133406],{"class":39},"# ['id', 'val', 'notes', 'val_right']\n",[14,133408,17059,133409,133411],{},[30,133410,571],{}," for the side whose original column name should be preserved as-is. 
If both sides need renaming, provide two distinct strings.",[18,133413,42592],{"id":42591},[4273,133415,133416,133426],{},[4276,133417,133418],{},[4279,133419,133420,133422,133424],{},[4282,133421,4284],{"align":128901},[4282,133423,4287],{"align":128901},[4282,133425,4290],{"align":128901},[4292,133427,133428,133444,133459,133472,133497],{},[4279,133429,133430,133436,133439],{},[4297,133431,133432,36608,133434,124231],{"align":128901},[30,133433,131246],{},[30,133435,131249],{},[4297,133437,133438],{"align":128901},"Non-key column shared between both frames",[4297,133440,35815,133441,133443],{"align":128901},[30,133442,131664],{}," or drop\u002Frename before merging",[4279,133445,133446,133450,133456],{},[4297,133447,133448],{"align":128901},[30,133449,131253],{},[4297,133451,133452,133455],{"align":128901},[30,133453,133454],{},"suffixes=(\"\", \"\")"," with overlapping column",[4297,133457,133458],{"align":128901},"Provide distinct non-empty suffixes or remove the overlap first",[4279,133460,133461,133464,133467],{},[4297,133462,133463],{"align":128901},"Row count unexpectedly multiplied",[4297,133465,133466],{"align":128901},"Right frame has duplicate key values, creating a many-to-many join",[4297,133468,4358,133469,133471],{"align":128901},[30,133470,128183],{}," or deduplicate the right frame's key",[4279,133473,133474,133482,133490],{},[4297,133475,133476,133477,65087,133480,12027],{"align":128901},"Both key columns in output (",[30,133478,133479],{},"sale_region",[30,133481,95904],{},[4297,133483,133484,133485,1351,133487,133489],{"align":128901},"Used ",[30,133486,131264],{},[30,133488,131267],{}," with different column names",[4297,133491,133492,133493,133496],{"align":128901},"Drop the redundant right-hand key with ",[30,133494,133495],{},".drop(columns=[...])"," after the merge",[4279,133498,133499,133504,133512],{},[4297,133500,133501,133503],{"align":128901},[30,133502,8377],{}," on a column that looks 
present",[4297,133505,133506,133507,1351,133509,133511],{"align":128901},"Column name has hidden whitespace or different case after ",[30,133508,28106],{},[30,133510,28109],{}," rename",[4297,133513,133514,133515,133518,133519],{"align":128901},"Print ",[30,133516,133517],{},"df.columns.tolist()"," to see exact names; normalize with ",[30,133520,133521],{},"str.strip().lower()",[18,133523,9247],{"id":9246},[14,133525,133526,133527,2012,133529,133531],{},"After any of the fixes above, assert that no ",[30,133528,28106],{},[30,133530,28109],{}," columns remain:",[23,133533,133535],{"className":126,"code":133534,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\ndef assert_no_suffix_columns(df: pd.DataFrame) -> None:\n    \"\"\"Raise if any column still has the default pandas merge suffixes.\"\"\"\n    bad = [c for c in df.columns if c.endswith(\"_x\") or c.endswith(\"_y\")]\n    if bad:\n        raise AssertionError(f\"Unexpected suffix columns after merge: {bad}\")\n    print(\"OK — no _x\u002F_y columns\")\n\n# assert_no_suffix_columns(result)\n",[30,133536,133537,133541,133551,133555,133568,133573,133608,133615,133639,133650,133654],{"__ignoreMap":28},[33,133538,133539],{"class":35,"line":36},[33,133540,8895],{"class":39},[33,133542,133543,133545,133547,133549],{"class":35,"line":43},[33,133544,164],{"class":163},[33,133546,492],{"class":167},[33,133548,495],{"class":163},[33,133550,498],{"class":167},[33,133552,133553],{"class":35,"line":61},[33,133554,92],{"emptyLinePlaceholder":91},[33,133556,133557,133559,133562,133564,133566],{"class":35,"line":73},[33,133558,562],{"class":163},[33,133560,133561],{"class":46}," assert_no_suffix_columns",[33,133563,9287],{"class":167},[33,133565,571],{"class":50},[33,133567,574],{"class":167},[33,133569,133570],{"class":35,"line":88},[33,133571,133572],{"class":54},"    \"\"\"Raise if any column still has the default pandas merge 
suffixes.\"\"\"\n",[33,133574,133575,133577,133579,133581,133583,133585,133587,133589,133591,133594,133597,133599,133601,133603,133606],{"class":35,"line":95},[33,133576,27698],{"class":167},[33,133578,242],{"class":163},[33,133580,7740],{"class":167},[33,133582,6124],{"class":163},[33,133584,7486],{"class":167},[33,133586,662],{"class":163},[33,133588,7837],{"class":167},[33,133590,2491],{"class":163},[33,133592,133593],{"class":167}," c.endswith(",[33,133595,133596],{"class":54},"\"_x\"",[33,133598,1649],{"class":167},[33,133600,7162],{"class":163},[33,133602,133593],{"class":167},[33,133604,133605],{"class":54},"\"_y\"",[33,133607,7767],{"class":167},[33,133609,133610,133612],{"class":35,"line":101},[33,133611,617],{"class":163},[33,133613,133614],{"class":167}," bad:\n",[33,133616,133617,133619,133621,133623,133625,133628,133630,133633,133635,133637],{"class":35,"line":171},[33,133618,4051],{"class":163},[33,133620,9445],{"class":50},[33,133622,602],{"class":167},[33,133624,4059],{"class":163},[33,133626,133627],{"class":54},"\"Unexpected suffix columns after merge: ",[33,133629,1115],{"class":50},[33,133631,133632],{"class":167},"bad",[33,133634,1121],{"class":50},[33,133636,274],{"class":54},[33,133638,221],{"class":167},[33,133640,133641,133643,133645,133648],{"class":35,"line":179},[33,133642,7268],{"class":50},[33,133644,602],{"class":167},[33,133646,133647],{"class":54},"\"OK — no _x\u002F_y columns\"",[33,133649,221],{"class":167},[33,133651,133652],{"class":35,"line":187},[33,133653,92],{"emptyLinePlaceholder":91},[33,133655,133656],{"class":35,"line":201},[33,133657,133658],{"class":39},"# assert_no_suffix_columns(result)\n",[14,133660,133661],{},"Also verify row count against expectations:",[23,133663,133665],{"className":126,"code":133664,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\nleft  = pd.DataFrame({\"sale_region\": [\"North\", \"South\"], \"revenue\": [37000, 29400]})\ntargets = pd.DataFrame({\"region\": 
[\"North\", \"South\"], \"target\": [40000, 32000]})\n\nresult = pd.merge(left, targets, left_on=\"sale_region\", right_on=\"region\", how=\"left\")\nresult = result.drop(columns=[\"region\"])\n\nexpected_rows = len(left)           # for a left join, result should have same row count as left\nassert len(result) == expected_rows, (\n    f\"Row count changed: expected {expected_rows}, got {len(result)}. \"\n    \"Check for duplicate keys in the right frame.\"\n)\n",[30,133666,133667,133671,133681,133685,133717,133749,133753,133786,133804,133808,133823,133836,133859,133864],{"__ignoreMap":28},[33,133668,133669],{"class":35,"line":36},[33,133670,8895],{"class":39},[33,133672,133673,133675,133677,133679],{"class":35,"line":43},[33,133674,164],{"class":163},[33,133676,492],{"class":167},[33,133678,495],{"class":163},[33,133680,498],{"class":167},[33,133682,133683],{"class":35,"line":61},[33,133684,92],{"emptyLinePlaceholder":91},[33,133686,133687,133689,133691,133693,133695,133697,133699,133701,133703,133705,133707,133709,133711,133713,133715],{"class":35,"line":73},[33,133688,131317],{"class":167},[33,133690,242],{"class":163},[33,133692,101407],{"class":167},[33,133694,132792],{"class":54},[33,133696,12426],{"class":167},[33,133698,11760],{"class":54},[33,133700,365],{"class":167},[33,133702,11773],{"class":54},[33,133704,8314],{"class":167},[33,133706,16465],{"class":54},[33,133708,12426],{"class":167},[33,133710,132809],{"class":50},[33,133712,365],{"class":167},[33,133714,132814],{"class":50},[33,133716,45051],{"class":167},[33,133718,133719,133721,133723,133725,133727,133729,133731,133733,133735,133737,133739,133741,133743,133745,133747],{"class":35,"line":88},[33,133720,132821],{"class":167},[33,133722,242],{"class":163},[33,133724,101407],{"class":167},[33,133726,16649],{"class":54},[33,133728,12426],{"class":167},[33,133730,11760],{"class":54},[33,133732,365],{"class":167},[33,133734,11773],{"class":54},[33,133736,8314],{"class":167},[33,133738,132841],{"class":5
4},[33,133740,12426],{"class":167},[33,133742,132846],{"class":50},[33,133744,365],{"class":167},[33,133746,132851],{"class":50},[33,133748,45051],{"class":167},[33,133750,133751],{"class":35,"line":95},[33,133752,92],{"emptyLinePlaceholder":91},[33,133754,133755,133757,133759,133762,133764,133766,133768,133770,133772,133774,133776,133778,133780,133782,133784],{"class":35,"line":101},[33,133756,35055],{"class":167},[33,133758,242],{"class":163},[33,133760,133761],{"class":167}," pd.merge(left, targets, ",[33,133763,131264],{"class":238},[33,133765,242],{"class":163},[33,133767,132792],{"class":54},[33,133769,365],{"class":167},[33,133771,131267],{"class":238},[33,133773,242],{"class":163},[33,133775,16649],{"class":54},[33,133777,365],{"class":167},[33,133779,28045],{"class":238},[33,133781,242],{"class":163},[33,133783,28050],{"class":54},[33,133785,221],{"class":167},[33,133787,133788,133790,133792,133794,133796,133798,133800,133802],{"class":35,"line":171},[33,133789,35055],{"class":167},[33,133791,242],{"class":163},[33,133793,132613],{"class":167},[33,133795,740],{"class":238},[33,133797,242],{"class":163},[33,133799,8309],{"class":167},[33,133801,16649],{"class":54},[33,133803,751],{"class":167},[33,133805,133806],{"class":35,"line":179},[33,133807,92],{"emptyLinePlaceholder":91},[33,133809,133810,133813,133815,133817,133820],{"class":35,"line":187},[33,133811,133812],{"class":167},"expected_rows ",[33,133814,242],{"class":163},[33,133816,4037],{"class":50},[33,133818,133819],{"class":167},"(left)           ",[33,133821,133822],{"class":39},"# for a left join, result should have same row count as left\n",[33,133824,133825,133827,133829,133832,133834],{"class":35,"line":201},[33,133826,36397],{"class":163},[33,133828,4037],{"class":50},[33,133830,133831],{"class":167},"(result) 
",[33,133833,1865],{"class":163},[33,133835,21493],{"class":167},[33,133837,133838,133840,133843,133845,133847,133849,133851,133853,133855,133857],{"class":35,"line":206},[33,133839,112430],{"class":163},[33,133841,133842],{"class":54},"\"Row count changed: expected ",[33,133844,1115],{"class":50},[33,133846,21514],{"class":167},[33,133848,1121],{"class":50},[33,133850,21519],{"class":54},[33,133852,4065],{"class":50},[33,133854,130363],{"class":167},[33,133856,1121],{"class":50},[33,133858,52129],{"class":54},[33,133860,133861],{"class":35,"line":224},[33,133862,133863],{"class":54},"    \"Check for duplicate keys in the right frame.\"\n",[33,133865,133866],{"class":35,"line":229},[33,133867,221],{"class":167},[14,133869,133870,133871,133873,133874,1351,133876,133878,133879,3035],{},"For a full end-to-end pipeline that loads Excel files, merges, and exports, see ",[940,133872,28119],{"href":28118},". If you plan to convert the merged result to JSON, fix any ",[30,133875,28106],{},[30,133877,28109],{}," columns first — they produce ugly keys in the JSON output; see ",[940,133880,128340],{"href":133881},"\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Fconverting-excel-to-json-with-python\u002F",[18,133883,6918],{"id":6917},[4211,133885,133886,133891,133896,133901],{},[4214,133887,133888,133890],{},[940,133889,28119],{"href":28118}," — full workflow for concat, merge, and join across many files",[4214,133892,133893,133895],{},[940,133894,99577],{"href":99576}," — load individual workbooks before merging",[4214,133897,133898,133900],{},[940,133899,9599],{"href":9598}," — normalize column names before merging to prevent header-mismatch collisions",[4214,133902,133903,133905],{},[940,133904,128340],{"href":133881}," — export the clean merged table to JSON",[14,133907,6947,133908,3035],{},[940,133909,28119],{"href":28118},[6953,133911,133912],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki 
code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}",{"title":28,"searchDepth":43,"depth":43,"links":133914},[133915,133916,133917,133918,133919,133920,133921,133922,133923,133924],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":131658,"depth":43,"text":131659},{"id":132050,"depth":43,"text":132051},{"id":132422,"depth":43,"text":132423},{"id":132748,"depth":43,"text":132749},{"id":133101,"depth":43,"text":133102},{"id":42591,"depth":43,"text":42592},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fix merge _x\u002F_y columns","After pd.merge, columns appear as col_x and col_y. 
Root cause, how to set meaningful suffixes, drop redundant columns, coalesce, and validate cardinality.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Ffix-pandas-merge-overlapping-columns",{"title":28114,"description":133926},"Fix pandas merge _x _y Overlapping Column Suffixes","python-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Ffix-pandas-merge-overlapping-columns\u002Findex",[9630,47,99614,133933],"data-cleaning","irEpuU4g_-veFslFccGU4rTRTEjd4NcQv_uGdfm-pEA",{"id":133936,"title":28119,"body":133937,"breadcrumbTitle":138196,"canonical":6977,"date":6977,"description":107412,"draft":6980,"extension":6981,"image":6977,"meta":138197,"navigation":91,"path":138198,"robots":6977,"seo":138199,"seoTitle":138205,"stem":138206,"tags":6977,"updatedAt":6977,"__hash__":138207},"content\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Findex.md",{"type":7,"value":133938,"toc":138173},[133939,133942,133957,133965,133967,133985,133988,133994,133997,134291,134295,134308,134464,134475,134479,134701,134708,134712,134725,134964,134974,134978,135117,135121,135126,135335,135338,135356,135360,135372,135647,135664,135668,135674,135794,135802,135806,135819,136078,136081,136085,136088,136284,136287,136290,136472,136474,136478,136630,136634,136641,136754,136758,136769,136984,136995,136997,137066,137072,137074,137185,137187,138109,138111,138137,138139,138166,138170],[10,133940,28119],{"id":133941},"merging-multiple-spreadsheets",[14,133943,133944,133945,133948,133949,365,133951,26462,133953,133956],{},"Manual copy-paste consolidation breaks the moment a second person edits a file or a folder grows past a handful of workbooks. Columns drift, rows duplicate, and someone's monthly report quietly disappears. 
This guide replaces that process with a repeatable Python script that discovers files with ",[30,133946,133947],{},"glob",", aligns mismatched schemas, applies the right pandas operation (",[30,133950,99426],{},[30,133952,27844],{},[30,133954,133955],{},"join","), deduplicates, and handles multi-sheet workbooks — producing one clean table every time.",[14,133958,133959,133960,133962,133963,3035],{},"For reading individual files before consolidation, see ",[940,133961,99577],{"href":99576},". For cleaning up encoding and header noise before merging, see ",[940,133964,9599],{"href":9598},[18,133966,21],{"id":20},[23,133968,133969],{"className":25,"code":128511,"language":27,"meta":28,"style":28},[30,133970,133971,133975],{"__ignoreMap":28},[33,133972,133973],{"class":35,"line":36},[33,133974,3952],{"class":39},[33,133976,133977,133979,133981,133983],{"class":35,"line":43},[33,133978,76],{"class":46},[33,133980,79],{"class":54},[33,133982,16183],{"class":54},[33,133984,95887],{"class":54},[14,133986,133987],{},"Test data layout assumed throughout:",[23,133989,133992],{"className":133990,"code":133991,"language":2000},[1998],"reports\u002F\n  jan_sales.xlsx\n  feb_sales.xlsx\n  mar_sales.xlsx\n  targets.xlsx     # different schema — used for merge examples\n",[30,133993,133991],{"__ignoreMap":28},[14,133995,133996],{},"Create throwaway fixtures to follow along:",[23,133998,134000],{"className":126,"code":133999,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nPath(\"reports\").mkdir(exist_ok=True)\n\nfor month, data in [\n    (\"jan\", {\"region\": [\"North\", \"South\"], \"revenue\": [12000, 9500], \"units\": [120, 95]}),\n    (\"feb\", {\"region\": [\"North\", \"South\"], \"revenue\": [13200, 10100], \"units\": [130, 101]}),\n    (\"mar\", {\"region\": [\"North\", \"South\"], \"revenue\": [11800, 9800], \"units\": [115, 98]}),\n]:\n    pd.DataFrame(data).to_excel(f\"reports\u002F{month}_sales.xlsx\", 
index=False)\n\npd.DataFrame({\"region\": [\"North\", \"South\"], \"target\": [40000, 32000]}).to_excel(\n    \"reports\u002Ftargets.xlsx\", index=False\n)\n",[30,134001,134002,134006,134016,134026,134030,134047,134051,134062,134111,134159,134206,134210,134240,134244,134274,134287],{"__ignoreMap":28},[33,134003,134004],{"class":35,"line":36},[33,134005,3952],{"class":39},[33,134007,134008,134010,134012,134014],{"class":35,"line":43},[33,134009,190],{"class":163},[33,134011,193],{"class":167},[33,134013,164],{"class":163},[33,134015,198],{"class":167},[33,134017,134018,134020,134022,134024],{"class":35,"line":61},[33,134019,164],{"class":163},[33,134021,492],{"class":167},[33,134023,495],{"class":163},[33,134025,498],{"class":167},[33,134027,134028],{"class":35,"line":73},[33,134029,92],{"emptyLinePlaceholder":91},[33,134031,134032,134034,134036,134039,134041,134043,134045],{"class":35,"line":88},[33,134033,15641],{"class":167},[33,134035,96899],{"class":54},[33,134037,134038],{"class":167},").mkdir(",[33,134040,878],{"class":238},[33,134042,242],{"class":163},[33,134044,855],{"class":50},[33,134046,221],{"class":167},[33,134048,134049],{"class":35,"line":95},[33,134050,92],{"emptyLinePlaceholder":91},[33,134052,134053,134055,134058,134060],{"class":35,"line":101},[33,134054,6124],{"class":163},[33,134056,134057],{"class":167}," month, data ",[33,134059,662],{"class":163},[33,134061,7473],{"class":167},[33,134063,134064,134067,134070,134073,134075,134077,134079,134081,134083,134085,134087,134089,134091,134093,134096,134098,134100,134102,134104,134106,134108],{"class":35,"line":171},[33,134065,134066],{"class":167},"    (",[33,134068,134069],{"class":54},"\"jan\"",[33,134071,134072],{"class":167},", 
{",[33,134074,16649],{"class":54},[33,134076,12426],{"class":167},[33,134078,11760],{"class":54},[33,134080,365],{"class":167},[33,134082,11773],{"class":54},[33,134084,8314],{"class":167},[33,134086,16465],{"class":54},[33,134088,12426],{"class":167},[33,134090,101426],{"class":50},[33,134092,365],{"class":167},[33,134094,134095],{"class":50},"9500",[33,134097,8314],{"class":167},[33,134099,16659],{"class":54},[33,134101,12426],{"class":167},[33,134103,2589],{"class":50},[33,134105,365],{"class":167},[33,134107,16357],{"class":50},[33,134109,134110],{"class":167},"]}),\n",[33,134112,134113,134115,134118,134120,134122,134124,134126,134128,134130,134132,134134,134136,134139,134141,134144,134146,134148,134150,134152,134154,134157],{"class":35,"line":179},[33,134114,134066],{"class":167},[33,134116,134117],{"class":54},"\"feb\"",[33,134119,134072],{"class":167},[33,134121,16649],{"class":54},[33,134123,12426],{"class":167},[33,134125,11760],{"class":54},[33,134127,365],{"class":167},[33,134129,11773],{"class":54},[33,134131,8314],{"class":167},[33,134133,16465],{"class":54},[33,134135,12426],{"class":167},[33,134137,134138],{"class":50},"13200",[33,134140,365],{"class":167},[33,134142,134143],{"class":50},"10100",[33,134145,8314],{"class":167},[33,134147,16659],{"class":54},[33,134149,12426],{"class":167},[33,134151,2588],{"class":50},[33,134153,365],{"class":167},[33,134155,134156],{"class":50},"101",[33,134158,134110],{"class":167},[33,134160,134161,134163,134166,134168,134170,134172,134174,134176,134178,134180,134182,134184,134187,134189,134192,134194,134196,134198,134200,134202,134204],{"class":35,"line":187},[33,134162,134066],{"class":167},[33,134164,134165],{"class":54},"\"mar\"",[33,134167,134072],{"class":167},[33,134169,16649],{"class":54},[33,134171,12426],{"class":167},[33,134173,11760],{"class":54},[33,134175,365],{"class":167},[33,134177,11773],{"class":54},[33,134179,8314],{"class":167},[33,134181,16465],{"class":54},[33,134183,12426],{"class":167},[33,1
34185,134186],{"class":50},"11800",[33,134188,365],{"class":167},[33,134190,134191],{"class":50},"9800",[33,134193,8314],{"class":167},[33,134195,16659],{"class":54},[33,134197,12426],{"class":167},[33,134199,2629],{"class":50},[33,134201,365],{"class":167},[33,134203,82416],{"class":50},[33,134205,134110],{"class":167},[33,134207,134208],{"class":35,"line":201},[33,134209,17477],{"class":167},[33,134211,134212,134215,134217,134220,134222,134225,134227,134230,134232,134234,134236,134238],{"class":35,"line":206},[33,134213,134214],{"class":167},"    pd.DataFrame(data).to_excel(",[33,134216,4059],{"class":163},[33,134218,134219],{"class":54},"\"reports\u002F",[33,134221,1115],{"class":50},[33,134223,134224],{"class":167},"month",[33,134226,1121],{"class":50},[33,134228,134229],{"class":54},"_sales.xlsx\"",[33,134231,365],{"class":167},[33,134233,897],{"class":238},[33,134235,242],{"class":163},[33,134237,902],{"class":50},[33,134239,221],{"class":167},[33,134241,134242],{"class":35,"line":224},[33,134243,92],{"emptyLinePlaceholder":91},[33,134245,134246,134249,134251,134253,134255,134257,134259,134261,134263,134265,134267,134269,134271],{"class":35,"line":229},[33,134247,134248],{"class":167},"pd.DataFrame({",[33,134250,16649],{"class":54},[33,134252,12426],{"class":167},[33,134254,11760],{"class":54},[33,134256,365],{"class":167},[33,134258,11773],{"class":54},[33,134260,8314],{"class":167},[33,134262,132841],{"class":54},[33,134264,12426],{"class":167},[33,134266,132846],{"class":50},[33,134268,365],{"class":167},[33,134270,132851],{"class":50},[33,134272,134273],{"class":167},"]}).to_excel(\n",[33,134275,134276,134279,134281,134283,134285],{"class":35,"line":235},[33,134277,134278],{"class":54},"    
\"reports\u002Ftargets.xlsx\"",[33,134280,365],{"class":167},[33,134282,897],{"class":238},[33,134284,242],{"class":163},[33,134286,8339],{"class":50},[33,134288,134289],{"class":35,"line":250},[33,134290,221],{"class":167},[18,134292,134294],{"id":134293},"step-1-discover-files-with-glob","Step 1 — Discover Files with glob",[14,134296,134297,134298,36604,134301,134304,134305,3035],{},"Hard-coded file lists break when a new month is added. Use ",[30,134299,134300],{},"pathlib.Path.glob",[30,134302,134303],{},"rglob"," for nested folders) and filter out Office lock files that start with ",[30,134306,134307],{},"~$",[23,134309,134311],{"className":126,"code":134310,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\n\nREPORTS_DIR = Path(\"reports\")\n\ndef discover_files(directory: Path, pattern: str = \"*.xlsx\") -> list[Path]:\n    \"\"\"Return sorted list of matching files, skipping Office temp files.\"\"\"\n    return sorted(\n        p for p in directory.glob(pattern)\n        if not p.name.startswith((\"~$\", \".\"))\n    )\n\nsales_files = [f for f in discover_files(REPORTS_DIR) if \"sales\" in f.stem]\nprint(sales_files)\n# [PosixPath('reports\u002Ffeb_sales.xlsx'), PosixPath('reports\u002Fjan_sales.xlsx'), 
...]\n",[30,134312,134313,134317,134327,134331,134344,134348,134367,134372,134380,134393,134412,134416,134420,134452,134459],{"__ignoreMap":28},[33,134314,134315],{"class":35,"line":36},[33,134316,3952],{"class":39},[33,134318,134319,134321,134323,134325],{"class":35,"line":43},[33,134320,190],{"class":163},[33,134322,193],{"class":167},[33,134324,164],{"class":163},[33,134326,198],{"class":167},[33,134328,134329],{"class":35,"line":61},[33,134330,92],{"emptyLinePlaceholder":91},[33,134332,134333,134336,134338,134340,134342],{"class":35,"line":73},[33,134334,134335],{"class":50},"REPORTS_DIR",[33,134337,212],{"class":163},[33,134339,215],{"class":167},[33,134341,96899],{"class":54},[33,134343,221],{"class":167},[33,134345,134346],{"class":35,"line":88},[33,134347,92],{"emptyLinePlaceholder":91},[33,134349,134350,134352,134355,134358,134360,134362,134365],{"class":35,"line":95},[33,134351,562],{"class":163},[33,134353,134354],{"class":46}," discover_files",[33,134356,134357],{"class":167},"(directory: Path, pattern: ",[33,134359,1053],{"class":50},[33,134361,212],{"class":163},[33,134363,134364],{"class":54}," \"*.xlsx\"",[33,134366,72642],{"class":167},[33,134368,134369],{"class":35,"line":101},[33,134370,134371],{"class":54},"    \"\"\"Return sorted list of matching files, skipping Office temp files.\"\"\"\n",[33,134373,134374,134376,134378],{"class":35,"line":171},[33,134375,1332],{"class":163},[33,134377,28924],{"class":50},[33,134379,7637],{"class":167},[33,134381,134382,134384,134386,134388,134390],{"class":35,"line":179},[33,134383,25099],{"class":167},[33,134385,6124],{"class":163},[33,134387,6127],{"class":167},[33,134389,662],{"class":163},[33,134391,134392],{"class":167}," directory.glob(pattern)\n",[33,134394,134395,134397,134399,134402,134405,134407,134410],{"class":35,"line":187},[33,134396,8221],{"class":163},[33,134398,620],{"class":163},[33,134400,134401],{"class":167}," 
p.name.startswith((",[33,134403,134404],{"class":54},"\"~$\"",[33,134406,365],{"class":167},[33,134408,134409],{"class":54},"\".\"",[33,134411,371],{"class":167},[33,134413,134414],{"class":35,"line":201},[33,134415,1202],{"class":167},[33,134417,134418],{"class":35,"line":206},[33,134419,92],{"emptyLinePlaceholder":91},[33,134421,134422,134425,134427,134429,134431,134433,134435,134438,134440,134442,134444,134447,134449],{"class":35,"line":224},[33,134423,134424],{"class":167},"sales_files ",[33,134426,242],{"class":163},[33,134428,27906],{"class":167},[33,134430,6124],{"class":163},[33,134432,8832],{"class":167},[33,134434,662],{"class":163},[33,134436,134437],{"class":167}," discover_files(",[33,134439,134335],{"class":50},[33,134441,1649],{"class":167},[33,134443,2491],{"class":163},[33,134445,134446],{"class":54}," \"sales\"",[33,134448,8002],{"class":163},[33,134450,134451],{"class":167}," f.stem]\n",[33,134453,134454,134456],{"class":35,"line":229},[33,134455,13474],{"class":50},[33,134457,134458],{"class":167},"(sales_files)\n",[33,134460,134461],{"class":35,"line":235},[33,134462,134463],{"class":39},"# [PosixPath('reports\u002Ffeb_sales.xlsx'), PosixPath('reports\u002Fjan_sales.xlsx'), ...]\n",[14,134465,134466,134467,134470,134471,134474],{},"Pass multiple patterns by calling ",[30,134468,134469],{},"discover_files"," twice and combining the lists, or switch to ",[30,134472,134473],{},"rglob(\"**\u002F*.csv\")"," for recursive discovery.",[18,134476,134478],{"id":134477},"step-2-load-each-file-into-a-dataframe","Step 2 — Load Each File into a DataFrame",[23,134480,134482],{"className":126,"code":134481,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\nimport logging\n\nlogging.basicConfig(level=logging.INFO, format=\"%(levelname)s: %(message)s\")\n\ndef load_files(paths: list[Path]) -> list[pd.DataFrame]:\n    \"\"\"Load .xlsx and .csv files; tag each row with its source filename.\"\"\"\n    
frames: list[pd.DataFrame] = []\n    for p in paths:\n        try:\n            df = pd.read_excel(p, engine=\"openpyxl\") if p.suffix == \".xlsx\" else pd.read_csv(p)\n            df[\"source_file\"] = p.name          # traceability column\n            frames.append(df)\n            logging.info(\"Loaded %s  (%d rows)\", p.name, len(df))\n        except Exception as exc:\n            logging.warning(\"Skipping %s: %s\", p.name, exc)\n    return frames\n",[30,134483,134484,134488,134498,134508,134514,134518,134548,134552,134562,134567,134575,134585,134591,134622,134640,134645,134667,134677,134695],{"__ignoreMap":28},[33,134485,134486],{"class":35,"line":36},[33,134487,3952],{"class":39},[33,134489,134490,134492,134494,134496],{"class":35,"line":43},[33,134491,190],{"class":163},[33,134493,193],{"class":167},[33,134495,164],{"class":163},[33,134497,198],{"class":167},[33,134499,134500,134502,134504,134506],{"class":35,"line":61},[33,134501,164],{"class":163},[33,134503,492],{"class":167},[33,134505,495],{"class":163},[33,134507,498],{"class":167},[33,134509,134510,134512],{"class":35,"line":73},[33,134511,164],{"class":163},[33,134513,184],{"class":167},[33,134515,134516],{"class":35,"line":88},[33,134517,92],{"emptyLinePlaceholder":91},[33,134519,134520,134522,134524,134526,134528,134530,134532,134534,134536,134538,134540,134542,134544,134546],{"class":35,"line":95},[33,134521,71660],{"class":167},[33,134523,18267],{"class":238},[33,134525,242],{"class":163},[33,134527,258],{"class":167},[33,134529,1067],{"class":50},[33,134531,365],{"class":167},[33,134533,61926],{"class":238},[33,134535,242],{"class":163},[33,134537,274],{"class":54},[33,134539,26817],{"class":50},[33,134541,2079],{"class":54},[33,134543,26827],{"class":50},[33,134545,274],{"class":54},[33,134547,221],{"class":167},[33,134549,134550],{"class":35,"line":101},[33,134551,92],{"emptyLinePlaceholder":91},[33,134553,134554,134556,134559],{"class":35,"line":171},[33,134555,562],{"class":163},[33,134557,1
34558],{"class":46}," load_files",[33,134560,134561],{"class":167},"(paths: list[Path]) -> list[pd.DataFrame]:\n",[33,134563,134564],{"class":35,"line":179},[33,134565,134566],{"class":54},"    \"\"\"Load .xlsx and .csv files; tag each row with its source filename.\"\"\"\n",[33,134568,134569,134571,134573],{"class":35,"line":187},[33,134570,6183],{"class":167},[33,134572,242],{"class":163},[33,134574,589],{"class":167},[33,134576,134577,134579,134581,134583],{"class":35,"line":201},[33,134578,656],{"class":163},[33,134580,6127],{"class":167},[33,134582,662],{"class":163},[33,134584,73215],{"class":167},[33,134586,134587,134589],{"class":35,"line":206},[33,134588,670],{"class":163},[33,134590,574],{"class":167},[33,134592,134593,134595,134597,134599,134601,134603,134605,134607,134609,134612,134614,134617,134619],{"class":35,"line":224},[33,134594,51528],{"class":167},[33,134596,242],{"class":163},[33,134598,25128],{"class":167},[33,134600,17351],{"class":238},[33,134602,242],{"class":163},[33,134604,17356],{"class":54},[33,134606,1649],{"class":167},[33,134608,2491],{"class":163},[33,134610,134611],{"class":167}," p.suffix ",[33,134613,1865],{"class":163},[33,134615,134616],{"class":54}," \".xlsx\"",[33,134618,15715],{"class":163},[33,134620,134621],{"class":167}," pd.read_csv(p)\n",[33,134623,134624,134627,134630,134632,134634,134637],{"class":35,"line":229},[33,134625,134626],{"class":167},"            df[",[33,134628,134629],{"class":54},"\"source_file\"",[33,134631,763],{"class":167},[33,134633,242],{"class":163},[33,134635,134636],{"class":167}," p.name          ",[33,134638,134639],{"class":39},"# traceability column\n",[33,134641,134642],{"class":35,"line":235},[33,134643,134644],{"class":167},"            frames.append(df)\n",[33,134646,134647,134650,134652,134654,134656,134658,134660,134663,134665],{"class":35,"line":250},[33,134648,134649],{"class":167},"            
logging.info(",[33,134651,96187],{"class":54},[33,134653,309],{"class":50},[33,134655,18019],{"class":54},[33,134657,916],{"class":50},[33,134659,18029],{"class":54},[33,134661,134662],{"class":167},", p.name, ",[33,134664,928],{"class":50},[33,134666,128027],{"class":167},[33,134668,134669,134671,134673,134675],{"class":35,"line":266},[33,134670,780],{"class":163},[33,134672,783],{"class":50},[33,134674,1852],{"class":163},[33,134676,1855],{"class":167},[33,134678,134679,134682,134684,134686,134688,134690,134692],{"class":35,"line":290},[33,134680,134681],{"class":167},"            logging.warning(",[33,134683,6272],{"class":54},[33,134685,309],{"class":50},[33,134687,2079],{"class":54},[33,134689,309],{"class":50},[33,134691,274],{"class":54},[33,134693,134694],{"class":167},", p.name, exc)\n",[33,134696,134697,134699],{"class":35,"line":295},[33,134698,1332],{"class":163},[33,134700,6065],{"class":167},[14,134702,134703,134704,134707],{},"Always include ",[30,134705,134706],{},"source_file",". Without it, duplicate-row debugging is painful after consolidation.",[18,134709,134711],{"id":134710},"step-3-align-mismatched-columns","Step 3 — Align Mismatched Columns",[14,134713,134714,134715,365,134718,71132,134721,134724],{},"Before stacking, normalize column names so ",[30,134716,134717],{},"Client_ID",[30,134719,134720],{},"ClientID",[30,134722,134723],{},"client id"," all resolve to the same key. Then check which columns are present in every file vs. 
only some.",[23,134726,134728],{"className":126,"code":134727,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nimport re\n\ndef normalize_columns(df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"Lowercase, strip, replace spaces\u002Fhyphens with underscores.\"\"\"\n    df.columns = [re.sub(r\"[\\s\\-]+\", \"_\", c.strip().lower()) for c in df.columns]\n    return df\n\ndef align_schemas(frames: list[pd.DataFrame]) -> list[pd.DataFrame]:\n    \"\"\"Report column coverage and return normalized frames.\"\"\"\n    normalized = [normalize_columns(df.copy()) for df in frames]\n    all_cols: set[str] = set()\n    for df in normalized:\n        all_cols |= set(df.columns)\n    common = set.intersection(*(set(df.columns) for df in normalized))\n    extra  = all_cols - common\n    if extra:\n        logging.info(\"Columns present in some files only: %s\", extra)\n    return normalized\n",[30,134729,134730,134734,134744,134750,134754,134763,134768,134807,134813,134817,134827,134832,134850,134866,134877,134889,134920,134935,134942,134957],{"__ignoreMap":28},[33,134731,134732],{"class":35,"line":36},[33,134733,3952],{"class":39},[33,134735,134736,134738,134740,134742],{"class":35,"line":43},[33,134737,164],{"class":163},[33,134739,492],{"class":167},[33,134741,495],{"class":163},[33,134743,498],{"class":167},[33,134745,134746,134748],{"class":35,"line":61},[33,134747,164],{"class":163},[33,134749,11917],{"class":167},[33,134751,134752],{"class":35,"line":73},[33,134753,92],{"emptyLinePlaceholder":91},[33,134755,134756,134758,134761],{"class":35,"line":88},[33,134757,562],{"class":163},[33,134759,134760],{"class":46}," normalize_columns",[33,134762,12127],{"class":167},[33,134764,134765],{"class":35,"line":95},[33,134766,134767],{"class":54},"    \"\"\"Lowercase, strip, replace spaces\u002Fhyphens with 
underscores.\"\"\"\n",[33,134769,134770,134772,134774,134777,134779,134781,134784,134786,134788,134790,134792,134794,134796,134799,134801,134803,134805],{"class":35,"line":101},[33,134771,27546],{"class":167},[33,134773,242],{"class":163},[33,134775,134776],{"class":167}," [re.sub(",[33,134778,11977],{"class":163},[33,134780,274],{"class":54},[33,134782,134783],{"class":50},"[\\s",[33,134785,113960],{"class":12018},[33,134787,9546],{"class":50},[33,134789,1811],{"class":163},[33,134791,274],{"class":54},[33,134793,365],{"class":167},[33,134795,7764],{"class":54},[33,134797,134798],{"class":167},", c.strip().lower()) ",[33,134800,6124],{"class":163},[33,134802,7486],{"class":167},[33,134804,662],{"class":163},[33,134806,12624],{"class":167},[33,134808,134809,134811],{"class":35,"line":171},[33,134810,1332],{"class":163},[33,134812,11719],{"class":167},[33,134814,134815],{"class":35,"line":179},[33,134816,92],{"emptyLinePlaceholder":91},[33,134818,134819,134821,134824],{"class":35,"line":187},[33,134820,562],{"class":163},[33,134822,134823],{"class":46}," align_schemas",[33,134825,134826],{"class":167},"(frames: list[pd.DataFrame]) -> list[pd.DataFrame]:\n",[33,134828,134829],{"class":35,"line":201},[33,134830,134831],{"class":54},"    \"\"\"Report column coverage and return normalized frames.\"\"\"\n",[33,134833,134834,134837,134839,134842,134844,134846,134848],{"class":35,"line":206},[33,134835,134836],{"class":167},"    normalized ",[33,134838,242],{"class":163},[33,134840,134841],{"class":167}," [normalize_columns(df.copy()) ",[33,134843,6124],{"class":163},[33,134845,7810],{"class":167},[33,134847,662],{"class":163},[33,134849,8837],{"class":167},[33,134851,134852,134855,134857,134859,134861,134863],{"class":35,"line":224},[33,134853,134854],{"class":167},"    all_cols: 
set[",[33,134856,1053],{"class":50},[33,134858,763],{"class":167},[33,134860,242],{"class":163},[33,134862,4129],{"class":50},[33,134864,134865],{"class":167},"()\n",[33,134867,134868,134870,134872,134874],{"class":35,"line":229},[33,134869,656],{"class":163},[33,134871,7810],{"class":167},[33,134873,662],{"class":163},[33,134875,134876],{"class":167}," normalized:\n",[33,134878,134879,134882,134885,134887],{"class":35,"line":235},[33,134880,134881],{"class":167},"        all_cols ",[33,134883,134884],{"class":163},"|=",[33,134886,4129],{"class":50},[33,134888,4132],{"class":167},[33,134890,134891,134894,134896,134898,134901,134903,134905,134908,134911,134913,134915,134917],{"class":35,"line":250},[33,134892,134893],{"class":167},"    common ",[33,134895,242],{"class":163},[33,134897,4129],{"class":50},[33,134899,134900],{"class":167},".intersection(",[33,134902,1769],{"class":163},[33,134904,602],{"class":167},[33,134906,134907],{"class":50},"set",[33,134909,134910],{"class":167},"(df.columns) ",[33,134912,6124],{"class":163},[33,134914,7810],{"class":167},[33,134916,662],{"class":163},[33,134918,134919],{"class":167}," normalized))\n",[33,134921,134922,134925,134927,134930,134932],{"class":35,"line":266},[33,134923,134924],{"class":167},"    extra  ",[33,134926,242],{"class":163},[33,134928,134929],{"class":167}," all_cols ",[33,134931,4126],{"class":163},[33,134933,134934],{"class":167}," common\n",[33,134936,134937,134939],{"class":35,"line":290},[33,134938,617],{"class":163},[33,134940,134941],{"class":167}," extra:\n",[33,134943,134944,134947,134950,134952,134954],{"class":35,"line":295},[33,134945,134946],{"class":167},"        logging.info(",[33,134948,134949],{"class":54},"\"Columns present in some files only: ",[33,134951,309],{"class":50},[33,134953,274],{"class":54},[33,134955,134956],{"class":167},", extra)\n",[33,134958,134959,134961],{"class":35,"line":300},[33,134960,1332],{"class":163},[33,134962,134963],{"class":167}," 
normalized\n",[14,134965,134966,8877,134968,134970,134971,134973],{},[30,134967,8366],{},[30,134969,8880],{}," will create ",[30,134972,8884],{},"-filled columns for rows that lack an optional column — that is usually fine for reporting. If a column must be present in every file, validate explicitly here and raise before the concatenation.",[18,134975,134977],{"id":134976},"the-concat-vs-merge-vs-join-decision","The concat vs merge vs join Decision",[2540,134979,2547,134981,2547,134984,2547,134987,2547,2547,135001,2547,135003,2547,135006,2547,135009,2547,135011,2547,135014,2547,135016,2547,135018,2547,135021,2547,135023,2547,135025,2547,135028,2547,2547,135032,2547,135035,2547,135037,2547,135039,2547,2547,135041,2547,135044,2547,135047,2547,135050,2547,135052,2547,135055,2547,2547,135059,2547,135062,2547,135065,2547,135069,2547,135073,2547,2547,135076,2547,135078,2547,135081,2547,135083,2547,135086,2547,2547,135089,2547,135091,2547,135093,2547,135095,2547,135097,2547,2547,135100,2547,135103,2547,135106,2547,135108,2547,135110,2547,135113],{"viewBox":2542,"role":2543,"ariaLabel":134980,"xmlns":2545,"style":2546},"Decision flow: many files to one table via glob, align, then concat or merge",[2549,134982,134983],{},"Merging spreadsheets workflow",[2553,134985,134986],{},"Flow diagram showing: multiple source files go through glob discovery and column alignment, then branch to pd.concat for same-schema stacking or pd.merge for key-based joining, both producing one consolidated 
table.",[2557,134988,2559,134989,2559,134996,2547],{},[2561,134990,2564,134992,2564,134994,2559],{"id":134991,"x1":748,"y1":748,"x2":734,"y2":748},"merge-sheets-grad",[2566,134993],{"offset":748,"style":2568},[2566,134995],{"offset":734,"style":2571},[2573,134997,2564,134999,2559],{"id":134998,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"merge-sheets-arrow",[2580,135000],{"d":2582,"fill":2583},[2585,135002],{"x":2587,"y":2587,"width":2609,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,135004,135005],{"x":2630,"y":49816,"fill":2599,"style":2600},"jan_sales.xlsx",[2000,135007,135008],{"x":2630,"y":71517,"fill":2583,"style":2685},"region · revenue · units",[2585,135010],{"x":2587,"y":26350,"width":2609,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,135012,135013],{"x":2630,"y":82416,"fill":2599,"style":2600},"feb_sales.xlsx",[2000,135015,135008],{"x":2630,"y":102523,"fill":2583,"style":2685},[2585,135017],{"x":2587,"y":26345,"width":2609,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,135019,135020],{"x":2630,"y":114614,"fill":2599,"style":2600},"mar_sales.xlsx",[2000,135022,135008],{"x":2630,"y":58371,"fill":2583,"style":2685},[2585,135024],{"x":2587,"y":58401,"width":2609,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,135026,135027],{"x":2630,"y":26410,"fill":2599,"style":2600},"targets.xlsx",[2000,135029,135031],{"x":2630,"y":135030,"fill":2583,"style":2685},"247","region · 
target",[35,135033],{"x1":2610,"y1":49816,"x2":2701,"y2":2679,"stroke":2583,"markerEnd":135034,"style":2594},"url(#merge-sheets-arrow)",[35,135036],{"x1":2610,"y1":82416,"x2":49872,"y2":2629,"stroke":2583,"markerEnd":135034,"style":2594},[35,135038],{"x1":2610,"y1":114614,"x2":2701,"y2":71551,"stroke":2583,"markerEnd":135034,"style":2594},[35,135040],{"x1":2610,"y1":26410,"x2":2701,"y2":17008,"stroke":2583,"markerEnd":135034,"style":2594},[2585,135042],{"x":49872,"y":2630,"width":11108,"height":2680,"rx":2591,"fill":135043,"stroke":2593,"style":2594},"url(#merge-sheets-grad)",[2000,135045,135046],{"x":11231,"y":71536,"fill":2599,"style":2600},"glob + normalize",[2000,135048,135049],{"x":11231,"y":100328,"fill":2599,"style":2685},"column names",[2585,135051],{"x":49872,"y":59939,"width":11108,"height":2680,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,135053,135054],{"x":11231,"y":17008,"fill":2599,"style":2600},"different schema",[2000,135056,135058],{"x":11231,"y":135057,"fill":2583,"style":2685},"257","needs a key column",[49826,135060],{"points":135061,"fill":2592,"stroke":11166,"style":2594},"430,115 480,95 530,115 480,135",[2000,135063,135064],{"x":49852,"y":71536,"fill":2599,"style":2685},"same",[2000,135066,135068],{"x":49852,"y":135067,"fill":2599,"style":2685},"127","schema?",[35,135070],{"x1":135071,"y1":2629,"x2":135072,"y2":2629,"stroke":2583,"markerEnd":135034,"style":2594},"358","430",[35,135074],{"x1":135071,"y1":107607,"x2":135075,"y2":2611,"stroke":2583,"markerEnd":135034,"style":2594},"425",[35,135077],{"x1":110841,"y1":2629,"x2":49894,"y2":2597,"stroke":2583,"markerEnd":135034,"style":2594},[2000,135079,38631],{"x":135080,"y":2630,"fill":2583,"style":2685},"548",[2585,135082],{"x":49894,"y":2680,"width":2635,"height":2680,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,135084,8366],{"x":135085,"y":49823,"fill":2599,"style":16983},"655",[2000,135087,135088],{"x":135085,"y":102542,"fill":2583,"style":2685},"stack 
rows",[35,135090],{"x1":110841,"y1":2629,"x2":49894,"y2":2610,"stroke":2583,"markerEnd":135034,"style":2594},[2000,135092,38628],{"x":135080,"y":11112,"fill":2583,"style":2685},[2585,135094],{"x":49894,"y":2609,"width":2635,"height":2680,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,135096,127087],{"x":135085,"y":11173,"fill":2599,"style":16983},[2000,135098,135099],{"x":135085,"y":11176,"fill":2583,"style":2685},"join on key",[35,135101],{"x1":58169,"y1":16991,"x2":38713,"y2":59981,"stroke":2583,"style":135102},"stroke-width:1;stroke-dasharray:4 3",[35,135104],{"x1":58169,"y1":59956,"x2":38713,"y2":135105,"stroke":2583,"style":135102},"237",[35,135107],{"x1":38713,"y1":59981,"x2":38713,"y2":26446,"stroke":2583,"markerEnd":135034,"style":2594},[2585,135109],{"x":49894,"y":26446,"width":2635,"height":2680,"rx":2591,"fill":135043,"stroke":2593,"style":2594},[2000,135111,135112],{"x":135085,"y":89124,"fill":2599,"style":16983},"one table",[2000,135114,135116],{"x":135085,"y":135115,"fill":2599,"style":2685},"309","dedup · validate",[424,135118,135120],{"id":135119},"pdconcat-stacking-same-schema-files","pd.concat — stacking same-schema files",[14,135122,17059,135123,135125],{},[30,135124,8366],{}," when every file represents the same measurement at a different time or from a different source (monthly sales, per-branch exports). 
It aligns on column names, not position, so column order mismatches are handled automatically.",[23,135127,135129],{"className":126,"code":135128,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nREPORTS_DIR = Path(\"reports\")\n\nframes = []\nfor p in sorted(REPORTS_DIR.glob(\"*_sales.xlsx\")):\n    if p.name.startswith(\"~$\"):\n        continue\n    df = pd.read_excel(p, engine=\"openpyxl\")\n    df[\"source_file\"] = p.name\n    frames.append(df)\n\ntry:\n    combined = pd.concat(frames, ignore_index=True, sort=False)\nexcept ValueError as exc:\n    raise RuntimeError(\"No valid DataFrames to concatenate\") from exc\n\nprint(combined.shape)   # (6, 4) — 3 months × 2 regions\nprint(combined.head())\n",[30,135130,135131,135135,135145,135155,135159,135171,135175,135183,135204,135215,135220,135236,135249,135253,135257,135263,135288,135298,135315,135319,135329],{"__ignoreMap":28},[33,135132,135133],{"class":35,"line":36},[33,135134,3952],{"class":39},[33,135136,135137,135139,135141,135143],{"class":35,"line":43},[33,135138,190],{"class":163},[33,135140,193],{"class":167},[33,135142,164],{"class":163},[33,135144,198],{"class":167},[33,135146,135147,135149,135151,135153],{"class":35,"line":61},[33,135148,164],{"class":163},[33,135150,492],{"class":167},[33,135152,495],{"class":163},[33,135154,498],{"class":167},[33,135156,135157],{"class":35,"line":73},[33,135158,92],{"emptyLinePlaceholder":91},[33,135160,135161,135163,135165,135167,135169],{"class":35,"line":88},[33,135162,134335],{"class":50},[33,135164,212],{"class":163},[33,135166,215],{"class":167},[33,135168,96899],{"class":54},[33,135170,221],{"class":167},[33,135172,135173],{"class":35,"line":95},[33,135174,92],{"emptyLinePlaceholder":91},[33,135176,135177,135179,135181],{"class":35,"line":101},[33,135178,96062],{"class":167},[33,135180,242],{"class":163},[33,135182,589],{"class":167},[33,135184,135185,135187,135189,135191,135193,135195,135197,1
35199,135202],{"class":35,"line":171},[33,135186,6124],{"class":163},[33,135188,6127],{"class":167},[33,135190,662],{"class":163},[33,135192,28924],{"class":50},[33,135194,602],{"class":167},[33,135196,134335],{"class":50},[33,135198,607],{"class":167},[33,135200,135201],{"class":54},"\"*_sales.xlsx\"",[33,135203,8687],{"class":167},[33,135205,135206,135208,135211,135213],{"class":35,"line":179},[33,135207,617],{"class":163},[33,135209,135210],{"class":167}," p.name.startswith(",[33,135212,134404],{"class":54},[33,135214,1737],{"class":167},[33,135216,135217],{"class":35,"line":187},[33,135218,135219],{"class":163},"        continue\n",[33,135221,135222,135224,135226,135228,135230,135232,135234],{"class":35,"line":201},[33,135223,4025],{"class":167},[33,135225,242],{"class":163},[33,135227,25128],{"class":167},[33,135229,17351],{"class":238},[33,135231,242],{"class":163},[33,135233,17356],{"class":54},[33,135235,221],{"class":167},[33,135237,135238,135240,135242,135244,135246],{"class":35,"line":206},[33,135239,27581],{"class":167},[33,135241,134629],{"class":54},[33,135243,763],{"class":167},[33,135245,242],{"class":163},[33,135247,135248],{"class":167}," 
p.name\n",[33,135250,135251],{"class":35,"line":224},[33,135252,96155],{"class":167},[33,135254,135255],{"class":35,"line":229},[33,135256,92],{"emptyLinePlaceholder":91},[33,135258,135259,135261],{"class":35,"line":235},[33,135260,35574],{"class":163},[33,135262,574],{"class":167},[33,135264,135265,135267,135269,135271,135273,135275,135277,135279,135282,135284,135286],{"class":35,"line":250},[33,135266,842],{"class":167},[33,135268,242],{"class":163},[33,135270,847],{"class":167},[33,135272,850],{"class":238},[33,135274,242],{"class":163},[33,135276,855],{"class":50},[33,135278,365],{"class":167},[33,135280,135281],{"class":238},"sort",[33,135283,242],{"class":163},[33,135285,902],{"class":50},[33,135287,221],{"class":167},[33,135289,135290,135292,135294,135296],{"class":35,"line":266},[33,135291,35726],{"class":163},[33,135293,4054],{"class":50},[33,135295,1852],{"class":163},[33,135297,1855],{"class":167},[33,135299,135300,135302,135304,135306,135309,135311,135313],{"class":35,"line":290},[33,135301,35742],{"class":163},[33,135303,7590],{"class":50},[33,135305,602],{"class":167},[33,135307,135308],{"class":54},"\"No valid DataFrames to concatenate\"",[33,135310,1649],{"class":167},[33,135312,190],{"class":163},[33,135314,20843],{"class":167},[33,135316,135317],{"class":35,"line":295},[33,135318,92],{"emptyLinePlaceholder":91},[33,135320,135321,135323,135326],{"class":35,"line":300},[33,135322,13474],{"class":50},[33,135324,135325],{"class":167},"(combined.shape)   ",[33,135327,135328],{"class":39},"# (6, 4) — 3 months × 2 regions\n",[33,135330,135331,135333],{"class":35,"line":317},[33,135332,13474],{"class":50},[33,135334,11569],{"class":167},[14,135336,135337],{},"Key parameters:",[4211,135339,135340,135345,135350],{},[4214,135341,135342,135344],{},[30,135343,8362],{}," — re-numbers the index from 0 instead of repeating 0, 1 from each source.",[4214,135346,135347,135349],{},[30,135348,8880],{}," — preserves column order from the first frame; avoids surprising 
reordering.",[4214,135351,135352,135355],{},[30,135353,135354],{},"join=\"inner\""," — keep only columns common to all frames (useful when optional columns are genuinely unwanted).",[424,135357,135359],{"id":135358},"pdmerge-joining-on-a-shared-key","pd.merge — joining on a shared key",[14,135361,17059,135362,135364,135365,135367,135368,36608,135370,3035],{},[30,135363,127087],{}," when two DataFrames contain different information about the same entities. The ",[30,135366,131242],{}," parameter names the shared key column. If the key column has different names in each frame use ",[30,135369,131285],{},[30,135371,131288],{},[23,135373,135375],{"className":126,"code":135374,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nREPORTS_DIR = Path(\"reports\")\n\ntry:\n    actuals  = pd.concat(\n        [pd.read_excel(p, engine=\"openpyxl\") for p in sorted(REPORTS_DIR.glob(\"*_sales.xlsx\"))\n         if not p.name.startswith(\"~$\")],\n        ignore_index=True\n    ).groupby(\"region\", as_index=False)[[\"revenue\", \"units\"]].sum()\n\n    targets = pd.read_excel(REPORTS_DIR \u002F \"targets.xlsx\", engine=\"openpyxl\")\n\n    result = pd.merge(actuals, targets, on=\"region\", how=\"left\")\n    result[\"attainment_pct\"] = (result[\"revenue\"] \u002F result[\"target\"] * 100).round(1)\n    print(result)\nexcept FileNotFoundError as exc:\n    print(f\"Missing file: 
{exc}\")\n",[30,135376,135377,135381,135391,135401,135405,135417,135421,135427,135436,135467,135480,135489,135516,135520,135546,135550,135575,135611,135617,135627],{"__ignoreMap":28},[33,135378,135379],{"class":35,"line":36},[33,135380,3952],{"class":39},[33,135382,135383,135385,135387,135389],{"class":35,"line":43},[33,135384,190],{"class":163},[33,135386,193],{"class":167},[33,135388,164],{"class":163},[33,135390,198],{"class":167},[33,135392,135393,135395,135397,135399],{"class":35,"line":61},[33,135394,164],{"class":163},[33,135396,492],{"class":167},[33,135398,495],{"class":163},[33,135400,498],{"class":167},[33,135402,135403],{"class":35,"line":73},[33,135404,92],{"emptyLinePlaceholder":91},[33,135406,135407,135409,135411,135413,135415],{"class":35,"line":88},[33,135408,134335],{"class":50},[33,135410,212],{"class":163},[33,135412,215],{"class":167},[33,135414,96899],{"class":54},[33,135416,221],{"class":167},[33,135418,135419],{"class":35,"line":95},[33,135420,92],{"emptyLinePlaceholder":91},[33,135422,135423,135425],{"class":35,"line":101},[33,135424,35574],{"class":163},[33,135426,574],{"class":167},[33,135428,135429,135432,135434],{"class":35,"line":171},[33,135430,135431],{"class":167},"    actuals  ",[33,135433,242],{"class":163},[33,135435,126283],{"class":167},[33,135437,135438,135441,135443,135445,135447,135449,135451,135453,135455,135457,135459,135461,135463,135465],{"class":35,"line":179},[33,135439,135440],{"class":167},"        [pd.read_excel(p, ",[33,135442,17351],{"class":238},[33,135444,242],{"class":163},[33,135446,17356],{"class":54},[33,135448,1649],{"class":167},[33,135450,6124],{"class":163},[33,135452,6127],{"class":167},[33,135454,662],{"class":163},[33,135456,28924],{"class":50},[33,135458,602],{"class":167},[33,135460,134335],{"class":50},[33,135462,607],{"class":167},[33,135464,135201],{"class":54},[33,135466,371],{"class":167},[33,135468,135469,135472,135474,135476,135478],{"class":35,"line":187},[33,135470,135471],{"class":163},"   
      if",[33,135473,620],{"class":163},[33,135475,135210],{"class":167},[33,135477,134404],{"class":54},[33,135479,73103],{"class":167},[33,135481,135482,135484,135486],{"class":35,"line":201},[33,135483,126311],{"class":238},[33,135485,242],{"class":163},[33,135487,135488],{"class":50},"True\n",[33,135490,135491,135494,135496,135498,135500,135502,135504,135507,135509,135511,135513],{"class":35,"line":206},[33,135492,135493],{"class":167},"    ).groupby(",[33,135495,16649],{"class":54},[33,135497,365],{"class":167},[33,135499,96540],{"class":238},[33,135501,242],{"class":163},[33,135503,902],{"class":50},[33,135505,135506],{"class":167},")[[",[33,135508,16465],{"class":54},[33,135510,365],{"class":167},[33,135512,16659],{"class":54},[33,135514,135515],{"class":167},"]].sum()\n",[33,135517,135518],{"class":35,"line":224},[33,135519,92],{"emptyLinePlaceholder":91},[33,135521,135522,135525,135527,135529,135531,135533,135536,135538,135540,135542,135544],{"class":35,"line":229},[33,135523,135524],{"class":167},"    targets ",[33,135526,242],{"class":163},[33,135528,126254],{"class":167},[33,135530,134335],{"class":50},[33,135532,1107],{"class":163},[33,135534,135535],{"class":54}," \"targets.xlsx\"",[33,135537,365],{"class":167},[33,135539,17351],{"class":238},[33,135541,242],{"class":163},[33,135543,17356],{"class":54},[33,135545,221],{"class":167},[33,135547,135548],{"class":35,"line":235},[33,135549,92],{"emptyLinePlaceholder":91},[33,135551,135552,135554,135556,135559,135561,135563,135565,135567,135569,135571,135573],{"class":35,"line":250},[33,135553,8842],{"class":167},[33,135555,242],{"class":163},[33,135557,135558],{"class":167}," pd.merge(actuals, targets, 
",[33,135560,2091],{"class":238},[33,135562,242],{"class":163},[33,135564,16649],{"class":54},[33,135566,365],{"class":167},[33,135568,28045],{"class":238},[33,135570,242],{"class":163},[33,135572,28050],{"class":54},[33,135574,221],{"class":167},[33,135576,135577,135579,135582,135584,135586,135589,135591,135593,135595,135597,135599,135601,135603,135605,135607,135609],{"class":35,"line":266},[33,135578,49600],{"class":167},[33,135580,135581],{"class":54},"\"attainment_pct\"",[33,135583,763],{"class":167},[33,135585,242],{"class":163},[33,135587,135588],{"class":167}," (result[",[33,135590,16465],{"class":54},[33,135592,763],{"class":167},[33,135594,1351],{"class":163},[33,135596,49611],{"class":167},[33,135598,132841],{"class":54},[33,135600,763],{"class":167},[33,135602,1769],{"class":163},[33,135604,18366],{"class":50},[33,135606,59790],{"class":167},[33,135608,734],{"class":50},[33,135610,221],{"class":167},[33,135612,135613,135615],{"class":35,"line":290},[33,135614,7268],{"class":50},[33,135616,8864],{"class":167},[33,135618,135619,135621,135623,135625],{"class":35,"line":295},[33,135620,35726],{"class":163},[33,135622,2945],{"class":50},[33,135624,1852],{"class":163},[33,135626,1855],{"class":167},[33,135628,135629,135631,135633,135635,135637,135639,135641,135643,135645],{"class":35,"line":300},[33,135630,7268],{"class":50},[33,135632,602],{"class":167},[33,135634,4059],{"class":163},[33,135636,89805],{"class":54},[33,135638,1115],{"class":50},[33,135640,6565],{"class":167},[33,135642,1121],{"class":50},[33,135644,274],{"class":54},[33,135646,221],{"class":167},[14,135648,135649,135652,135653,135656,135657,135660,135661,135663],{},[30,135650,135651],{},"how=\"left\""," keeps every row from ",[30,135654,135655],{},"actuals"," even if ",[30,135658,135659],{},"targets"," has no matching region — missing targets become ",[30,135662,8884],{}," rather than silently dropping rows.",[424,135665,135667],{"id":135666},"join-merging-on-the-index","join — merging on the 
index",[14,135669,135670,135673],{},[30,135671,135672],{},"DataFrame.join"," is shorthand for a merge on the index. It is useful when both frames are already indexed by the same key (e.g., a date-indexed time series).",[23,135675,135677],{"className":126,"code":135676,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\n\ndf_a = pd.DataFrame({\"revenue\": [12000, 13200]}, index=[\"North\", \"South\"])\ndf_b = pd.DataFrame({\"cost\": [8000, 7500]},    index=[\"North\", \"South\"])\n\ncombined = df_a.join(df_b)   # index-aligned, no explicit key needed\nprint(combined)\n",[30,135678,135679,135683,135693,135697,135733,135771,135775,135788],{"__ignoreMap":28},[33,135680,135681],{"class":35,"line":36},[33,135682,3952],{"class":39},[33,135684,135685,135687,135689,135691],{"class":35,"line":43},[33,135686,164],{"class":163},[33,135688,492],{"class":167},[33,135690,495],{"class":163},[33,135692,498],{"class":167},[33,135694,135695],{"class":35,"line":61},[33,135696,92],{"emptyLinePlaceholder":91},[33,135698,135699,135702,135704,135706,135708,135710,135712,135714,135716,135719,135721,135723,135725,135727,135729,135731],{"class":35,"line":73},[33,135700,135701],{"class":167},"df_a ",[33,135703,242],{"class":163},[33,135705,101407],{"class":167},[33,135707,16465],{"class":54},[33,135709,12426],{"class":167},[33,135711,101426],{"class":50},[33,135713,365],{"class":167},[33,135715,134138],{"class":50},[33,135717,135718],{"class":167},"]}, ",[33,135720,897],{"class":238},[33,135722,242],{"class":163},[33,135724,8309],{"class":167},[33,135726,11760],{"class":54},[33,135728,365],{"class":167},[33,135730,11773],{"class":54},[33,135732,751],{"class":167},[33,135734,135735,135738,135740,135742,135744,135746,135749,135751,135754,135757,135759,135761,135763,135765,135767,135769],{"class":35,"line":88},[33,135736,135737],{"class":167},"df_b 
",[33,135739,242],{"class":163},[33,135741,101407],{"class":167},[33,135743,16474],{"class":54},[33,135745,12426],{"class":167},[33,135747,135748],{"class":50},"8000",[33,135750,365],{"class":167},[33,135752,135753],{"class":50},"7500",[33,135755,135756],{"class":167},"]},    ",[33,135758,897],{"class":238},[33,135760,242],{"class":163},[33,135762,8309],{"class":167},[33,135764,11760],{"class":54},[33,135766,365],{"class":167},[33,135768,11773],{"class":54},[33,135770,751],{"class":167},[33,135772,135773],{"class":35,"line":95},[33,135774,92],{"emptyLinePlaceholder":91},[33,135776,135777,135780,135782,135785],{"class":35,"line":101},[33,135778,135779],{"class":167},"combined ",[33,135781,242],{"class":163},[33,135783,135784],{"class":167}," df_a.join(df_b)   ",[33,135786,135787],{"class":39},"# index-aligned, no explicit key needed\n",[33,135789,135790,135792],{"class":35,"line":171},[33,135791,13474],{"class":50},[33,135793,66563],{"class":167},[14,135795,17059,135796,135798,135799,135801],{},[30,135797,133955],{}," for index-keyed frames, ",[30,135800,27844],{}," for everything else.",[18,135803,135805],{"id":135804},"step-4-handle-multi-sheet-workbooks","Step 4 — Handle Multi-Sheet Workbooks",[14,135807,135808,135809,135811,135812,135814,135815,135818],{},"When one ",[30,135810,26542],{}," file contains multiple sheets, ",[30,135813,130058],{}," returns an ",[30,135816,135817],{},"OrderedDict"," mapping sheet name to DataFrame.",[23,135820,135822],{"className":126,"code":135821,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\ndef load_all_sheets(path: Path) -> pd.DataFrame:\n    \"\"\"Concatenate all sheets in a workbook into one DataFrame.\"\"\"\n    try:\n        sheets: dict[str, pd.DataFrame] = pd.read_excel(\n            path, sheet_name=None, engine=\"openpyxl\"\n        )\n    except Exception as exc:\n        raise RuntimeError(f\"Cannot open {path}: {exc}\") from exc\n\n    frames = 
[]\n    for sheet_name, df in sheets.items():\n        df[\"sheet\"] = sheet_name      # track origin sheet\n        frames.append(df)\n\n    return pd.concat(frames, ignore_index=True, sort=False)\n\n# Usage\nwb_path = Path(\"reports\") \u002F \"annual_summary.xlsx\"\nif wb_path.exists():\n    all_data = load_all_sheets(wb_path)\n    print(all_data[\"sheet\"].value_counts())\n",[30,135823,135824,135828,135838,135848,135852,135861,135866,135872,135885,135904,135908,135918,135952,135956,135964,135975,135992,135996,136000,136022,136026,136031,136049,136056,136066],{"__ignoreMap":28},[33,135825,135826],{"class":35,"line":36},[33,135827,3952],{"class":39},[33,135829,135830,135832,135834,135836],{"class":35,"line":43},[33,135831,190],{"class":163},[33,135833,193],{"class":167},[33,135835,164],{"class":163},[33,135837,198],{"class":167},[33,135839,135840,135842,135844,135846],{"class":35,"line":61},[33,135841,164],{"class":163},[33,135843,492],{"class":167},[33,135845,495],{"class":163},[33,135847,498],{"class":167},[33,135849,135850],{"class":35,"line":73},[33,135851,92],{"emptyLinePlaceholder":91},[33,135853,135854,135856,135859],{"class":35,"line":88},[33,135855,562],{"class":163},[33,135857,135858],{"class":46}," load_all_sheets",[33,135860,7103],{"class":167},[33,135862,135863],{"class":35,"line":95},[33,135864,135865],{"class":54},"    \"\"\"Concatenate all sheets in a workbook into one DataFrame.\"\"\"\n",[33,135867,135868,135870],{"class":35,"line":101},[33,135869,2424],{"class":163},[33,135871,574],{"class":167},[33,135873,135874,135877,135879,135881,135883],{"class":35,"line":171},[33,135875,135876],{"class":167},"        sheets: dict[",[33,135878,1053],{"class":50},[33,135880,14088],{"class":167},[33,135882,242],{"class":163},[33,135884,126171],{"class":167},[33,135886,135887,135890,135892,135894,135896,135898,135900,135902],{"class":35,"line":179},[33,135888,135889],{"class":167},"            path, 
",[33,135891,17371],{"class":238},[33,135893,242],{"class":163},[33,135895,571],{"class":50},[33,135897,365],{"class":167},[33,135899,17351],{"class":238},[33,135901,242],{"class":163},[33,135903,130172],{"class":54},[33,135905,135906],{"class":35,"line":187},[33,135907,5867],{"class":167},[33,135909,135910,135912,135914,135916],{"class":35,"line":201},[33,135911,2449],{"class":163},[33,135913,783],{"class":50},[33,135915,1852],{"class":163},[33,135917,1855],{"class":167},[33,135919,135920,135922,135924,135926,135928,135930,135932,135934,135936,135938,135940,135942,135944,135946,135948,135950],{"class":35,"line":206},[33,135921,4051],{"class":163},[33,135923,7590],{"class":50},[33,135925,602],{"class":167},[33,135927,4059],{"class":163},[33,135929,9935],{"class":54},[33,135931,1115],{"class":50},[33,135933,2580],{"class":167},[33,135935,1121],{"class":50},[33,135937,2079],{"class":54},[33,135939,1115],{"class":50},[33,135941,6565],{"class":167},[33,135943,1121],{"class":50},[33,135945,274],{"class":54},[33,135947,1649],{"class":167},[33,135949,190],{"class":163},[33,135951,20843],{"class":167},[33,135953,135954],{"class":35,"line":224},[33,135955,92],{"emptyLinePlaceholder":91},[33,135957,135958,135960,135962],{"class":35,"line":229},[33,135959,584],{"class":167},[33,135961,242],{"class":163},[33,135963,589],{"class":167},[33,135965,135966,135968,135970,135972],{"class":35,"line":235},[33,135967,656],{"class":163},[33,135969,130236],{"class":167},[33,135971,662],{"class":163},[33,135973,135974],{"class":167}," sheets.items():\n",[33,135976,135977,135979,135982,135984,135986,135989],{"class":35,"line":250},[33,135978,10902],{"class":167},[33,135980,135981],{"class":54},"\"sheet\"",[33,135983,763],{"class":167},[33,135985,242],{"class":163},[33,135987,135988],{"class":167}," sheet_name      ",[33,135990,135991],{"class":39},"# track origin 
sheet\n",[33,135993,135994],{"class":35,"line":266},[33,135995,10929],{"class":167},[33,135997,135998],{"class":35,"line":290},[33,135999,92],{"emptyLinePlaceholder":91},[33,136001,136002,136004,136006,136008,136010,136012,136014,136016,136018,136020],{"class":35,"line":295},[33,136003,1332],{"class":163},[33,136005,847],{"class":167},[33,136007,850],{"class":238},[33,136009,242],{"class":163},[33,136011,855],{"class":50},[33,136013,365],{"class":167},[33,136015,135281],{"class":238},[33,136017,242],{"class":163},[33,136019,902],{"class":50},[33,136021,221],{"class":167},[33,136023,136024],{"class":35,"line":300},[33,136025,92],{"emptyLinePlaceholder":91},[33,136027,136028],{"class":35,"line":317},[33,136029,136030],{"class":39},"# Usage\n",[33,136032,136033,136036,136038,136040,136042,136044,136046],{"class":35,"line":332},[33,136034,136035],{"class":167},"wb_path ",[33,136037,242],{"class":163},[33,136039,215],{"class":167},[33,136041,96899],{"class":54},[33,136043,1649],{"class":167},[33,136045,1351],{"class":163},[33,136047,136048],{"class":54}," \"annual_summary.xlsx\"\n",[33,136050,136051,136053],{"class":35,"line":347},[33,136052,2491],{"class":163},[33,136054,136055],{"class":167}," wb_path.exists():\n",[33,136057,136058,136061,136063],{"class":35,"line":374},[33,136059,136060],{"class":167},"    all_data ",[33,136062,242],{"class":163},[33,136064,136065],{"class":167}," load_all_sheets(wb_path)\n",[33,136067,136068,136070,136073,136075],{"class":35,"line":397},[33,136069,7268],{"class":50},[33,136071,136072],{"class":167},"(all_data[",[33,136074,135981],{"class":54},[33,136076,136077],{"class":167},"].value_counts())\n",[14,136079,136080],{},"This pattern is identical to the multi-file loop: tag the source, then concatenate.",[18,136082,136084],{"id":136083},"step-5-deduplicate","Step 5 — Deduplicate",[14,136086,136087],{},"Concatenating data from overlapping sources (daily extracts that include the prior day, workbooks shared between teams) inevitably 
introduces duplicates.",[23,136089,136091],{"className":126,"code":136090,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\n\ndef deduplicate(df: pd.DataFrame, subset: list[str] | None = None, keep: str = \"last\") -> pd.DataFrame:\n    \"\"\"\n    Drop exact duplicate rows.\n    subset  — columns that define identity; None means all columns.\n    keep    — 'last' retains the most recently loaded copy (useful when later files are corrections).\n    \"\"\"\n    before = len(df)\n    df = df.drop_duplicates(subset=subset, keep=keep).reset_index(drop=True)\n    removed = before - len(df)\n    if removed:\n        print(f\"Removed {removed} duplicate rows ({before} → {len(df)})\")\n    return df\n\n# deduplicate(combined, subset=[\"region\", \"source_file\"])\n",[30,136092,136093,136097,136107,136111,136145,136149,136154,136159,136164,136168,136178,136208,136223,136230,136269,136275,136279],{"__ignoreMap":28},[33,136094,136095],{"class":35,"line":36},[33,136096,3952],{"class":39},[33,136098,136099,136101,136103,136105],{"class":35,"line":43},[33,136100,164],{"class":163},[33,136102,492],{"class":167},[33,136104,495],{"class":163},[33,136106,498],{"class":167},[33,136108,136109],{"class":35,"line":61},[33,136110,92],{"emptyLinePlaceholder":91},[33,136112,136113,136115,136118,136121,136123,136125,136127,136129,136131,136133,136136,136138,136140,136143],{"class":35,"line":73},[33,136114,562],{"class":163},[33,136116,136117],{"class":46}," deduplicate",[33,136119,136120],{"class":167},"(df: pd.DataFrame, subset: list[",[33,136122,1053],{"class":50},[33,136124,763],{"class":167},[33,136126,7654],{"class":163},[33,136128,7657],{"class":50},[33,136130,212],{"class":163},[33,136132,7657],{"class":50},[33,136134,136135],{"class":167},", keep: ",[33,136137,1053],{"class":50},[33,136139,212],{"class":163},[33,136141,136142],{"class":54}," 
\"last\"",[33,136144,7668],{"class":167},[33,136146,136147],{"class":35,"line":88},[33,136148,7673],{"class":54},[33,136150,136151],{"class":35,"line":95},[33,136152,136153],{"class":54},"    Drop exact duplicate rows.\n",[33,136155,136156],{"class":35,"line":101},[33,136157,136158],{"class":54},"    subset  — columns that define identity; None means all columns.\n",[33,136160,136161],{"class":35,"line":171},[33,136162,136163],{"class":54},"    keep    — 'last' retains the most recently loaded copy (useful when later files are corrections).\n",[33,136165,136166],{"class":35,"line":179},[33,136167,7673],{"class":54},[33,136169,136170,136172,136174,136176],{"class":35,"line":187},[33,136171,66556],{"class":167},[33,136173,242],{"class":163},[33,136175,4037],{"class":50},[33,136177,13477],{"class":167},[33,136179,136180,136182,136184,136186,136188,136190,136193,136195,136197,136200,136202,136204,136206],{"class":35,"line":201},[33,136181,4025],{"class":167},[33,136183,242],{"class":163},[33,136185,114508],{"class":167},[33,136187,28066],{"class":238},[33,136189,242],{"class":163},[33,136191,136192],{"class":167},"subset, ",[33,136194,28077],{"class":238},[33,136196,242],{"class":163},[33,136198,136199],{"class":167},"keep).reset_index(",[33,136201,10868],{"class":238},[33,136203,242],{"class":163},[33,136205,855],{"class":50},[33,136207,221],{"class":167},[33,136209,136210,136213,136215,136217,136219,136221],{"class":35,"line":206},[33,136211,136212],{"class":167},"    removed ",[33,136214,242],{"class":163},[33,136216,68044],{"class":167},[33,136218,4126],{"class":163},[33,136220,4037],{"class":50},[33,136222,13477],{"class":167},[33,136224,136225,136227],{"class":35,"line":224},[33,136226,617],{"class":163},[33,136228,136229],{"class":167}," 
removed:\n",[33,136231,136232,136234,136236,136238,136241,136243,136246,136248,136251,136253,136255,136257,136259,136261,136263,136265,136267],{"class":35,"line":229},[33,136233,9414],{"class":50},[33,136235,602],{"class":167},[33,136237,4059],{"class":163},[33,136239,136240],{"class":54},"\"Removed ",[33,136242,1115],{"class":50},[33,136244,136245],{"class":167},"removed",[33,136247,1121],{"class":50},[33,136249,136250],{"class":54}," duplicate rows (",[33,136252,1115],{"class":50},[33,136254,59039],{"class":167},[33,136256,1121],{"class":50},[33,136258,69863],{"class":54},[33,136260,4065],{"class":50},[33,136262,4068],{"class":167},[33,136264,1121],{"class":50},[33,136266,72406],{"class":54},[33,136268,221],{"class":167},[33,136270,136271,136273],{"class":35,"line":235},[33,136272,1332],{"class":163},[33,136274,11719],{"class":167},[33,136276,136277],{"class":35,"line":250},[33,136278,92],{"emptyLinePlaceholder":91},[33,136280,136281],{"class":35,"line":266},[33,136282,136283],{"class":39},"# deduplicate(combined, subset=[\"region\", \"source_file\"])\n",[14,136285,136286],{},"For near-duplicate detection (same entity, slightly different values) use a hash of normalized key columns rather than relying on exact equality.",[18,136288,136289],{"id":12502},"Step 6 — Validate Before Export",[23,136291,136293],{"className":126,"code":136292,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\n\ndef validate(df: pd.DataFrame, required_cols: list[str]) -> None:\n    missing = [c for c in required_cols if c not in df.columns]\n    if missing:\n        raise ValueError(f\"Missing required columns: {missing}\")\n\n    null_counts = df[required_cols].isnull().sum()\n    if null_counts.any():\n        print(\"Nulls in required columns:\\n\", null_counts[null_counts > 0])\n\n    assert df.index.is_unique, \"Index is not unique — call reset_index(drop=True)\"\n    print(f\"Validation passed. 
Shape: {df.shape}\")\n\n# validate(combined, required_cols=[\"region\", \"revenue\", \"units\"])\n",[30,136294,136295,136299,136309,136313,136330,136356,136362,136385,136389,136399,136406,136428,136432,136442,136463,136467],{"__ignoreMap":28},[33,136296,136297],{"class":35,"line":36},[33,136298,3952],{"class":39},[33,136300,136301,136303,136305,136307],{"class":35,"line":43},[33,136302,164],{"class":163},[33,136304,492],{"class":167},[33,136306,495],{"class":163},[33,136308,498],{"class":167},[33,136310,136311],{"class":35,"line":61},[33,136312,92],{"emptyLinePlaceholder":91},[33,136314,136315,136317,136319,136322,136324,136326,136328],{"class":35,"line":73},[33,136316,562],{"class":163},[33,136318,25052],{"class":46},[33,136320,136321],{"class":167},"(df: pd.DataFrame, required_cols: list[",[33,136323,1053],{"class":50},[33,136325,28895],{"class":167},[33,136327,571],{"class":50},[33,136329,574],{"class":167},[33,136331,136332,136334,136336,136338,136340,136342,136344,136346,136348,136350,136352,136354],{"class":35,"line":88},[33,136333,4118],{"class":167},[33,136335,242],{"class":163},[33,136337,7740],{"class":167},[33,136339,6124],{"class":163},[33,136341,7486],{"class":167},[33,136343,662],{"class":163},[33,136345,4123],{"class":167},[33,136347,2491],{"class":163},[33,136349,7486],{"class":167},[33,136351,7999],{"class":163},[33,136353,8002],{"class":163},[33,136355,12624],{"class":167},[33,136357,136358,136360],{"class":35,"line":95},[33,136359,617],{"class":163},[33,136361,4139],{"class":167},[33,136363,136364,136366,136368,136370,136372,136375,136377,136379,136381,136383],{"class":35,"line":101},[33,136365,4051],{"class":163},[33,136367,4054],{"class":50},[33,136369,602],{"class":167},[33,136371,4059],{"class":163},[33,136373,136374],{"class":54},"\"Missing required columns: 
",[33,136376,1115],{"class":50},[33,136378,4157],{"class":167},[33,136380,1121],{"class":50},[33,136382,274],{"class":54},[33,136384,221],{"class":167},[33,136386,136387],{"class":35,"line":171},[33,136388,92],{"emptyLinePlaceholder":91},[33,136390,136391,136394,136396],{"class":35,"line":179},[33,136392,136393],{"class":167},"    null_counts ",[33,136395,242],{"class":163},[33,136397,136398],{"class":167}," df[required_cols].isnull().sum()\n",[33,136400,136401,136403],{"class":35,"line":187},[33,136402,617],{"class":163},[33,136404,136405],{"class":167}," null_counts.any():\n",[33,136407,136408,136410,136412,136415,136417,136419,136422,136424,136426],{"class":35,"line":201},[33,136409,9414],{"class":50},[33,136411,602],{"class":167},[33,136413,136414],{"class":54},"\"Nulls in required columns:",[33,136416,25830],{"class":50},[33,136418,274],{"class":54},[33,136420,136421],{"class":167},", null_counts[null_counts ",[33,136423,6009],{"class":163},[33,136425,10791],{"class":50},[33,136427,751],{"class":167},[33,136429,136430],{"class":35,"line":206},[33,136431,92],{"emptyLinePlaceholder":91},[33,136433,136434,136436,136439],{"class":35,"line":224},[33,136435,9228],{"class":163},[33,136437,136438],{"class":167}," df.index.is_unique, ",[33,136440,136441],{"class":54},"\"Index is not unique — call reset_index(drop=True)\"\n",[33,136443,136444,136446,136448,136450,136453,136455,136457,136459,136461],{"class":35,"line":229},[33,136445,7268],{"class":50},[33,136447,602],{"class":167},[33,136449,4059],{"class":163},[33,136451,136452],{"class":54},"\"Validation passed. 
Shape: ",[33,136454,1115],{"class":50},[33,136456,9426],{"class":167},[33,136458,1121],{"class":50},[33,136460,274],{"class":54},[33,136462,221],{"class":167},[33,136464,136465],{"class":35,"line":235},[33,136466,92],{"emptyLinePlaceholder":91},[33,136468,136469],{"class":35,"line":250},[33,136470,136471],{"class":39},"# validate(combined, required_cols=[\"region\", \"revenue\", \"units\"])\n",[18,136473,12944],{"id":12943},[424,136475,136477],{"id":136476},"variant-a-mixed-xlsx-and-csv-in-the-same-folder","Variant A — Mixed .xlsx and .csv in the same folder",[23,136479,136481],{"className":126,"code":136480,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nDATA_DIR = Path(\"mixed_reports\")\n\nframes = []\nfor p in DATA_DIR.iterdir():\n    if p.suffix == \".xlsx\" and not p.name.startswith(\"~$\"):\n        frames.append(pd.read_excel(p, engine=\"openpyxl\"))\n    elif p.suffix == \".csv\":\n        frames.append(pd.read_csv(p))\n\ncombined = pd.concat(frames, ignore_index=True, 
sort=False)\n",[30,136482,136483,136487,136497,136507,136511,136524,136528,136536,136550,136570,136583,136597,136602,136606],{"__ignoreMap":28},[33,136484,136485],{"class":35,"line":36},[33,136486,3952],{"class":39},[33,136488,136489,136491,136493,136495],{"class":35,"line":43},[33,136490,190],{"class":163},[33,136492,193],{"class":167},[33,136494,164],{"class":163},[33,136496,198],{"class":167},[33,136498,136499,136501,136503,136505],{"class":35,"line":61},[33,136500,164],{"class":163},[33,136502,492],{"class":167},[33,136504,495],{"class":163},[33,136506,498],{"class":167},[33,136508,136509],{"class":35,"line":73},[33,136510,92],{"emptyLinePlaceholder":91},[33,136512,136513,136515,136517,136519,136522],{"class":35,"line":88},[33,136514,95963],{"class":50},[33,136516,212],{"class":163},[33,136518,215],{"class":167},[33,136520,136521],{"class":54},"\"mixed_reports\"",[33,136523,221],{"class":167},[33,136525,136526],{"class":35,"line":95},[33,136527,92],{"emptyLinePlaceholder":91},[33,136529,136530,136532,136534],{"class":35,"line":101},[33,136531,96062],{"class":167},[33,136533,242],{"class":163},[33,136535,589],{"class":167},[33,136537,136538,136540,136542,136544,136547],{"class":35,"line":171},[33,136539,6124],{"class":163},[33,136541,6127],{"class":167},[33,136543,662],{"class":163},[33,136545,136546],{"class":50}," DATA_DIR",[33,136548,136549],{"class":167},".iterdir():\n",[33,136551,136552,136554,136556,136558,136560,136562,136564,136566,136568],{"class":35,"line":179},[33,136553,617],{"class":163},[33,136555,134611],{"class":167},[33,136557,1865],{"class":163},[33,136559,134616],{"class":54},[33,136561,5615],{"class":163},[33,136563,620],{"class":163},[33,136565,135210],{"class":167},[33,136567,134404],{"class":54},[33,136569,1737],{"class":167},[33,136571,136572,136575,136577,136579,136581],{"class":35,"line":187},[33,136573,136574],{"class":167},"        frames.append(pd.read_excel(p, 
",[33,136576,17351],{"class":238},[33,136578,242],{"class":163},[33,136580,17356],{"class":54},[33,136582,371],{"class":167},[33,136584,136585,136588,136590,136592,136595],{"class":35,"line":201},[33,136586,136587],{"class":163},"    elif",[33,136589,134611],{"class":167},[33,136591,1865],{"class":163},[33,136593,136594],{"class":54}," \".csv\"",[33,136596,574],{"class":167},[33,136598,136599],{"class":35,"line":206},[33,136600,136601],{"class":167},"        frames.append(pd.read_csv(p))\n",[33,136603,136604],{"class":35,"line":224},[33,136605,92],{"emptyLinePlaceholder":91},[33,136607,136608,136610,136612,136614,136616,136618,136620,136622,136624,136626,136628],{"class":35,"line":229},[33,136609,135779],{"class":167},[33,136611,242],{"class":163},[33,136613,847],{"class":167},[33,136615,850],{"class":238},[33,136617,242],{"class":163},[33,136619,855],{"class":50},[33,136621,365],{"class":167},[33,136623,135281],{"class":238},[33,136625,242],{"class":163},[33,136627,902],{"class":50},[33,136629,221],{"class":167},[424,136631,136633],{"id":136632},"variant-b-read_excel-with-explicit-sheet-and-header-row","Variant B — read_excel with explicit sheet and header row",[14,136635,136636,136637,136640],{},"Some workbooks have metadata rows above the actual header. 
Use ",[30,136638,136639],{},"header="," to skip them.",[23,136642,136644],{"className":126,"code":136643,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nfrom pathlib import Path\n\ndf = pd.read_excel(\n    Path(\"reports\") \u002F \"budget_2026.xlsx\",\n    sheet_name=\"Q1\",\n    header=3,           # row index 3 (0-based) contains the real header\n    usecols=\"B:F\",      # only read columns B through F\n    engine=\"openpyxl\",\n)\n",[30,136645,136646,136650,136660,136670,136674,136682,136697,136709,136724,136739,136750],{"__ignoreMap":28},[33,136647,136648],{"class":35,"line":36},[33,136649,3952],{"class":39},[33,136651,136652,136654,136656,136658],{"class":35,"line":43},[33,136653,164],{"class":163},[33,136655,492],{"class":167},[33,136657,495],{"class":163},[33,136659,498],{"class":167},[33,136661,136662,136664,136666,136668],{"class":35,"line":61},[33,136663,190],{"class":163},[33,136665,193],{"class":167},[33,136667,164],{"class":163},[33,136669,198],{"class":167},[33,136671,136672],{"class":35,"line":73},[33,136673,92],{"emptyLinePlaceholder":91},[33,136675,136676,136678,136680],{"class":35,"line":88},[33,136677,13459],{"class":167},[33,136679,242],{"class":163},[33,136681,126171],{"class":167},[33,136683,136684,136686,136688,136690,136692,136695],{"class":35,"line":95},[33,136685,84167],{"class":167},[33,136687,96899],{"class":54},[33,136689,1649],{"class":167},[33,136691,1351],{"class":163},[33,136693,136694],{"class":54}," \"budget_2026.xlsx\"",[33,136696,247],{"class":167},[33,136698,136699,136702,136704,136707],{"class":35,"line":101},[33,136700,136701],{"class":238},"    sheet_name",[33,136703,242],{"class":163},[33,136705,136706],{"class":54},"\"Q1\"",[33,136708,247],{"class":167},[33,136710,136711,136714,136716,136718,136721],{"class":35,"line":171},[33,136712,136713],{"class":238},"    header",[33,136715,242],{"class":163},[33,136717,10258],{"class":50},[33,136719,136720],{"class":167},",           
",[33,136722,136723],{"class":39},"# row index 3 (0-based) contains the real header\n",[33,136725,136726,136729,136731,136734,136736],{"class":35,"line":179},[33,136727,136728],{"class":238},"    usecols",[33,136730,242],{"class":163},[33,136732,136733],{"class":54},"\"B:F\"",[33,136735,121141],{"class":167},[33,136737,136738],{"class":39},"# only read columns B through F\n",[33,136740,136741,136744,136746,136748],{"class":35,"line":187},[33,136742,136743],{"class":238},"    engine",[33,136745,242],{"class":163},[33,136747,17356],{"class":54},[33,136749,247],{"class":167},[33,136751,136752],{"class":35,"line":201},[33,136753,221],{"class":167},[424,136755,136757],{"id":136756},"variant-c-merging-on-differently-named-keys-with-validation","Variant C — merging on differently-named keys with validation",[14,136759,136760,136761,36608,136763,136765,136766,136768],{},"If the key column has a different name in each DataFrame, use ",[30,136762,131264],{},[30,136764,131267],{},". Add ",[30,136767,127407],{}," to catch unexpected many-to-many joins early.",[23,136770,136772],{"className":126,"code":136771,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\n\nsales   = pd.DataFrame({\"sale_region\": [\"North\", \"South\"], \"revenue\": [37000, 29400]})\ntargets = pd.DataFrame({\"region\":      [\"North\", \"South\"], \"target\":  [40000, 32000]})\n\ntry:\n    result = pd.merge(\n        sales, targets,\n        left_on=\"sale_region\", right_on=\"region\",\n        how=\"left\",\n        validate=\"many_to_one\",   # raises MergeError if targets has duplicate region keys\n    )\n    result = result.drop(columns=[\"region\"])   # remove the redundant right-hand key column\n    print(result)\nexcept pd.errors.MergeError as exc:\n    print(f\"Join cardinality error: 
{exc}\")\n",[30,136773,136774,136778,136788,136792,136824,136856,136860,136866,136874,136879,136898,136909,136922,136926,136947,136953,136963],{"__ignoreMap":28},[33,136775,136776],{"class":35,"line":36},[33,136777,3952],{"class":39},[33,136779,136780,136782,136784,136786],{"class":35,"line":43},[33,136781,164],{"class":163},[33,136783,492],{"class":167},[33,136785,495],{"class":163},[33,136787,498],{"class":167},[33,136789,136790],{"class":35,"line":61},[33,136791,92],{"emptyLinePlaceholder":91},[33,136793,136794,136796,136798,136800,136802,136804,136806,136808,136810,136812,136814,136816,136818,136820,136822],{"class":35,"line":73},[33,136795,132785],{"class":167},[33,136797,242],{"class":163},[33,136799,101407],{"class":167},[33,136801,132792],{"class":54},[33,136803,12426],{"class":167},[33,136805,11760],{"class":54},[33,136807,365],{"class":167},[33,136809,11773],{"class":54},[33,136811,8314],{"class":167},[33,136813,16465],{"class":54},[33,136815,12426],{"class":167},[33,136817,132809],{"class":50},[33,136819,365],{"class":167},[33,136821,132814],{"class":50},[33,136823,45051],{"class":167},[33,136825,136826,136828,136830,136832,136834,136836,136838,136840,136842,136844,136846,136848,136850,136852,136854],{"class":35,"line":88},[33,136827,132821],{"class":167},[33,136829,242],{"class":163},[33,136831,101407],{"class":167},[33,136833,16649],{"class":54},[33,136835,132830],{"class":167},[33,136837,11760],{"class":54},[33,136839,365],{"class":167},[33,136841,11773],{"class":54},[33,136843,8314],{"class":167},[33,136845,132841],{"class":54},[33,136847,11818],{"class":167},[33,136849,132846],{"class":50},[33,136851,365],{"class":167},[33,136853,132851],{"class":50},[33,136855,45051],{"class":167},[33,136857,136858],{"class":35,"line":95},[33,136859,92],{"emptyLinePlaceholder":91},[33,136861,136862,136864],{"class":35,"line":101},[33,136863,35574],{"class":163},[33,136865,574],{"class":167},[33,136867,136868,136870,136872],{"class":35,"line":171},[33,136869,8842],{"
class":167},[33,136871,242],{"class":163},[33,136873,131794],{"class":167},[33,136875,136876],{"class":35,"line":179},[33,136877,136878],{"class":167},"        sales, targets,\n",[33,136880,136881,136884,136886,136888,136890,136892,136894,136896],{"class":35,"line":187},[33,136882,136883],{"class":238},"        left_on",[33,136885,242],{"class":163},[33,136887,132792],{"class":54},[33,136889,365],{"class":167},[33,136891,131267],{"class":238},[33,136893,242],{"class":163},[33,136895,16649],{"class":54},[33,136897,247],{"class":167},[33,136899,136900,136903,136905,136907],{"class":35,"line":201},[33,136901,136902],{"class":238},"        how",[33,136904,242],{"class":163},[33,136906,28050],{"class":54},[33,136908,247],{"class":167},[33,136910,136911,136914,136916,136918,136920],{"class":35,"line":206},[33,136912,136913],{"class":238},"        validate",[33,136915,242],{"class":163},[33,136917,127319],{"class":54},[33,136919,1166],{"class":167},[33,136921,133044],{"class":39},[33,136923,136924],{"class":35,"line":224},[33,136925,1202],{"class":167},[33,136927,136928,136930,136932,136934,136936,136938,136940,136942,136944],{"class":35,"line":229},[33,136929,8842],{"class":167},[33,136931,242],{"class":163},[33,136933,132613],{"class":167},[33,136935,740],{"class":238},[33,136937,242],{"class":163},[33,136939,8309],{"class":167},[33,136941,16649],{"class":54},[33,136943,7283],{"class":167},[33,136945,136946],{"class":39},"# remove the redundant right-hand key 
column\n",[33,136948,136949,136951],{"class":35,"line":235},[33,136950,7268],{"class":50},[33,136952,8864],{"class":167},[33,136954,136955,136957,136959,136961],{"class":35,"line":250},[33,136956,35726],{"class":163},[33,136958,133230],{"class":167},[33,136960,495],{"class":163},[33,136962,1855],{"class":167},[33,136964,136965,136967,136969,136971,136974,136976,136978,136980,136982],{"class":35,"line":266},[33,136966,7268],{"class":50},[33,136968,602],{"class":167},[33,136970,4059],{"class":163},[33,136972,136973],{"class":54},"\"Join cardinality error: ",[33,136975,1115],{"class":50},[33,136977,6565],{"class":167},[33,136979,1121],{"class":50},[33,136981,274],{"class":54},[33,136983,221],{"class":167},[14,136985,136986,136987,1351,136989,136991,136992,136994],{},"When the merge produces unexpected ",[30,136988,28106],{},[30,136990,28109],{}," suffix columns, see ",[940,136993,28114],{"href":28113}," for root cause and fixes.",[18,136996,13845],{"id":13844},[4273,136998,136999,137012],{},[4276,137000,137001],{},[4279,137002,137003,137006,137009],{},[4282,137004,137005],{"align":128901},"Approach",[4282,137007,137008],{"align":128901},"When to use",[4282,137010,137011],{"align":128901},"Typical limit",[4292,137013,137014,137027,137040,137052],{},[4279,137015,137016,137021,137024],{},[4297,137017,137018,137020],{"align":128901},[30,137019,8366],{}," in memory",[4297,137022,137023],{"align":128901},"\u003C 500 MB total",[4297,137025,137026],{"align":128901},"Available RAM",[4279,137028,137029,137034,137037],{},[4297,137030,137031,137032,12027],{"align":128901},"CSV chunking (",[30,137033,21944],{},[4297,137035,137036],{"align":128901},"Large CSVs, stream-friendly",[4297,137038,137039],{"align":128901},"Unlimited",[4279,137041,137042,137046,137049],{},[4297,137043,137044],{"align":128901},[30,137045,116077],{},[4297,137047,137048],{"align":128901},"Parallel, out-of-core, multi-file",[4297,137050,137051],{"align":128901},"Disk 
size",[4279,137053,137054,137061,137064],{},[4297,137055,137056,137057,36608,137059],{"align":128901},"DuckDB ",[30,137058,57240],{},[30,137060,57237],{},[4297,137062,137063],{"align":128901},"SQL over flat files, joins at scale",[4297,137065,137051],{"align":128901},[14,137067,137068,137069,137071],{},"For workloads that fit in memory, ",[30,137070,8366],{}," is fastest. Beyond ~500 MB, chunked CSV reading or DuckDB avoids out-of-memory crashes without rewriting your logic.",[18,137073,4271],{"id":4270},[4273,137075,137076,137086],{},[4276,137077,137078],{},[4279,137079,137080,137082,137084],{},[4282,137081,14317],{"align":128901},[4282,137083,4287],{"align":128901},[4282,137085,4290],{"align":128901},[4292,137087,137088,137110,137132,137146,137167],{},[4279,137089,137090,137095,137101],{},[4297,137091,137092],{"align":128901},[30,137093,137094],{},"ValueError: No objects to concatenate",[4297,137096,137097,137100],{"align":128901},[30,137098,137099],{},"frames"," list is empty — no matching files found",[4297,137102,67848,137103,137105,137106,68044,137108],{"align":128901},[30,137104,133947],{}," pattern and directory path; print ",[30,137107,137099],{},[30,137109,99426],{},[4279,137111,137112,137118,137126],{},[4297,137113,137114,137117],{"align":128901},[30,137115,137116],{},"KeyError: 'region'"," after concat",[4297,137119,137120,137121,71066,137124,12027],{"align":128901},"Column name normalized differently in one file (",[30,137122,137123],{},"Region",[30,137125,95904],{},[4297,137127,11870,137128,137131],{"align":128901},[30,137129,137130],{},"normalize_columns"," before concatenating",[4279,137133,137134,137138,137141],{},[4297,137135,137136],{"align":128901},[30,137137,70953],{},[4297,137139,137140],{"align":128901},"Too many large files loaded at once",[4297,137142,137143,137144],{"align":128901},"Process in batches or use ",[30,137145,116077],{},[4279,137147,137148,137155,137160],{},[4297,137149,137150,137151,36608,137153],{"align":128901},"Columns 
appear as ",[30,137152,131246],{},[30,137154,131249],{},[4297,137156,137157,137158],{"align":128901},"Shared non-key column names in ",[30,137159,127087],{},[4297,137161,35815,137162,137164,137165],{"align":128901},[30,137163,131664],{}," or drop\u002Frename before merging — see ",[940,137166,28114],{"href":28113},[4279,137168,137169,137174,137179],{},[4297,137170,137171],{"align":128901},[30,137172,137173],{},"MergeError: Merge keys are not unique",[4297,137175,137176,137178],{"align":128901},[30,137177,127407],{}," caught a cardinality mismatch",[4297,137180,137181,137182],{"align":128901},"Inspect duplicate keys with ",[30,137183,137184],{},"df[df.duplicated(subset=[\"key\"])]",[18,137186,4402],{"id":4401},[23,137188,137190],{"className":126,"code":137189,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\n\"\"\"merge_spreadsheets.py — consolidate a folder of .xlsx and .csv files into one output.\"\"\"\nimport argparse\nimport logging\nimport re\nimport sys\nfrom pathlib import Path\n\nimport pandas as pd\n\nlogging.basicConfig(level=logging.INFO, format=\"%(levelname)s: %(message)s\")\n\n\ndef discover(directory: Path, extensions: tuple[str, ...] 
= (\".xlsx\", \".csv\")) -> list[Path]:\n    paths: list[Path] = []\n    for ext in extensions:\n        paths.extend(p for p in directory.rglob(f\"*{ext}\") if not p.name.startswith((\"~$\", \".\")))\n    return sorted(paths)\n\n\ndef load(paths: list[Path]) -> list[pd.DataFrame]:\n    frames: list[pd.DataFrame] = []\n    for p in paths:\n        try:\n            df = pd.read_excel(p, engine=\"openpyxl\") if p.suffix == \".xlsx\" else pd.read_csv(p)\n            df[\"source_file\"] = p.name\n            frames.append(df)\n            logging.info(\"Loaded %s (%d rows)\", p.name, len(df))\n        except Exception as exc:\n            logging.warning(\"Skipping %s: %s\", p.name, exc)\n    return frames\n\n\ndef normalize_cols(df: pd.DataFrame) -> pd.DataFrame:\n    df.columns = [re.sub(r\"[\\s\\-]+\", \"_\", c.strip().lower()) for c in df.columns]\n    return df\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Merge spreadsheets in a folder\")\n    parser.add_argument(\"input_dir\",  type=Path, help=\"Folder containing source files\")\n    parser.add_argument(\"output_file\", type=Path, help=\"Destination .csv or .xlsx\")\n    parser.add_argument(\"--dedup-cols\", nargs=\"*\", help=\"Columns that define a unique row\")\n    args = parser.parse_args()\n\n    if not args.input_dir.is_dir():\n        sys.exit(f\"Not a directory: {args.input_dir}\")\n\n    paths = discover(args.input_dir)\n    if not paths:\n        sys.exit(\"No .xlsx or .csv files found\")\n\n    frames = load(paths)\n    if not frames:\n        sys.exit(\"All files failed to load\")\n\n    frames = [normalize_cols(df) for df in frames]\n\n    try:\n        combined = pd.concat(frames, ignore_index=True, sort=False)\n    except ValueError as exc:\n        sys.exit(f\"Concatenation failed: {exc}\")\n\n    if args.dedup_cols:\n        before = len(combined)\n        combined = combined.drop_duplicates(subset=args.dedup_cols, keep=\"last\")\n        logging.info(\"Dedup 
removed %d rows\", before - len(combined))\n\n    combined = combined.reset_index(drop=True)\n\n    try:\n        if args.output_file.suffix == \".xlsx\":\n            combined.to_excel(args.output_file, index=False, engine=\"openpyxl\")\n        else:\n            combined.to_csv(args.output_file, index=False)\n        logging.info(\"Wrote %d rows to %s\", len(combined), args.output_file)\n    except Exception as exc:\n        sys.exit(f\"Export failed: {exc}\")\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,137191,137192,137196,137201,137207,137213,137219,137225,137235,137239,137249,137253,137283,137287,137291,137324,137333,137345,137388,137397,137401,137405,137414,137422,137432,137438,137466,137478,137482,137502,137512,137528,137534,137538,137542,137551,137587,137593,137597,137601,137613,137630,137654,137678,137704,137712,137716,137725,137744,137748,137757,137765,137774,137778,137787,137795,137804,137808,137825,137829,137835,137859,137869,137888,137892,137899,137910,137933,137953,137957,137973,137977,137983,137996,138017,138023,138036,138057,138067,138085,138089,138093,138105],{"__ignoreMap":28},[33,137193,137194],{"class":35,"line":36},[33,137195,3952],{"class":39},[33,137197,137198],{"class":35,"line":43},[33,137199,137200],{"class":54},"\"\"\"merge_spreadsheets.py — consolidate a folder of .xlsx and .csv files into one 
output.\"\"\"\n",[33,137202,137203,137205],{"class":35,"line":61},[33,137204,164],{"class":163},[33,137206,4461],{"class":167},[33,137208,137209,137211],{"class":35,"line":73},[33,137210,164],{"class":163},[33,137212,184],{"class":167},[33,137214,137215,137217],{"class":35,"line":88},[33,137216,164],{"class":163},[33,137218,11917],{"class":167},[33,137220,137221,137223],{"class":35,"line":95},[33,137222,164],{"class":163},[33,137224,168],{"class":167},[33,137226,137227,137229,137231,137233],{"class":35,"line":101},[33,137228,190],{"class":163},[33,137230,193],{"class":167},[33,137232,164],{"class":163},[33,137234,198],{"class":167},[33,137236,137237],{"class":35,"line":171},[33,137238,92],{"emptyLinePlaceholder":91},[33,137240,137241,137243,137245,137247],{"class":35,"line":179},[33,137242,164],{"class":163},[33,137244,492],{"class":167},[33,137246,495],{"class":163},[33,137248,498],{"class":167},[33,137250,137251],{"class":35,"line":187},[33,137252,92],{"emptyLinePlaceholder":91},[33,137254,137255,137257,137259,137261,137263,137265,137267,137269,137271,137273,137275,137277,137279,137281],{"class":35,"line":201},[33,137256,71660],{"class":167},[33,137258,18267],{"class":238},[33,137260,242],{"class":163},[33,137262,258],{"class":167},[33,137264,1067],{"class":50},[33,137266,365],{"class":167},[33,137268,61926],{"class":238},[33,137270,242],{"class":163},[33,137272,274],{"class":54},[33,137274,26817],{"class":50},[33,137276,2079],{"class":54},[33,137278,26827],{"class":50},[33,137280,274],{"class":54},[33,137282,221],{"class":167},[33,137284,137285],{"class":35,"line":206},[33,137286,92],{"emptyLinePlaceholder":91},[33,137288,137289],{"class":35,"line":224},[33,137290,92],{"emptyLinePlaceholder":91},[33,137292,137293,137295,137298,137301,137303,137305,137308,137310,137312,137314,137316,137318,137321],{"class":35,"line":229},[33,137294,562],{"class":163},[33,137296,137297],{"class":46}," discover",[33,137299,137300],{"class":167},"(directory: Path, extensions: 
tuple[",[33,137302,1053],{"class":50},[33,137304,365],{"class":167},[33,137306,137307],{"class":50},"...",[33,137309,763],{"class":167},[33,137311,242],{"class":163},[33,137313,17583],{"class":167},[33,137315,27374],{"class":54},[33,137317,365],{"class":167},[33,137319,137320],{"class":54},"\".csv\"",[33,137322,137323],{"class":167},")) -> list[Path]:\n",[33,137325,137326,137329,137331],{"class":35,"line":235},[33,137327,137328],{"class":167},"    paths: list[Path] ",[33,137330,242],{"class":163},[33,137332,589],{"class":167},[33,137334,137335,137337,137340,137342],{"class":35,"line":250},[33,137336,656],{"class":163},[33,137338,137339],{"class":167}," ext ",[33,137341,662],{"class":163},[33,137343,137344],{"class":167}," extensions:\n",[33,137346,137347,137350,137352,137354,137356,137359,137361,137363,137365,137368,137370,137372,137374,137376,137378,137380,137382,137384,137386],{"class":35,"line":266},[33,137348,137349],{"class":167},"        paths.extend(p ",[33,137351,6124],{"class":163},[33,137353,6127],{"class":167},[33,137355,662],{"class":163},[33,137357,137358],{"class":167}," 
directory.rglob(",[33,137360,4059],{"class":163},[33,137362,5781],{"class":54},[33,137364,1115],{"class":50},[33,137366,137367],{"class":167},"ext",[33,137369,1121],{"class":50},[33,137371,274],{"class":54},[33,137373,1649],{"class":167},[33,137375,2491],{"class":163},[33,137377,620],{"class":163},[33,137379,134401],{"class":167},[33,137381,134404],{"class":54},[33,137383,365],{"class":167},[33,137385,134409],{"class":54},[33,137387,23269],{"class":167},[33,137389,137390,137392,137394],{"class":35,"line":290},[33,137391,1332],{"class":163},[33,137393,28924],{"class":50},[33,137395,137396],{"class":167},"(paths)\n",[33,137398,137399],{"class":35,"line":295},[33,137400,92],{"emptyLinePlaceholder":91},[33,137402,137403],{"class":35,"line":300},[33,137404,92],{"emptyLinePlaceholder":91},[33,137406,137407,137409,137412],{"class":35,"line":317},[33,137408,562],{"class":163},[33,137410,137411],{"class":46}," load",[33,137413,134561],{"class":167},[33,137415,137416,137418,137420],{"class":35,"line":332},[33,137417,6183],{"class":167},[33,137419,242],{"class":163},[33,137421,589],{"class":167},[33,137423,137424,137426,137428,137430],{"class":35,"line":347},[33,137425,656],{"class":163},[33,137427,6127],{"class":167},[33,137429,662],{"class":163},[33,137431,73215],{"class":167},[33,137433,137434,137436],{"class":35,"line":374},[33,137435,670],{"class":163},[33,137437,574],{"class":167},[33,137439,137440,137442,137444,137446,137448,137450,137452,137454,137456,137458,137460,137462,137464],{"class":35,"line":397},[33,137441,51528],{"class":167},[33,137443,242],{"class":163},[33,137445,25128],{"class":167},[33,137447,17351],{"class":238},[33,137449,242],{"class":163},[33,137451,17356],{"class":54},[33,137453,1649],{"class":167},[33,137455,2491],{"class":163},[33,137457,134611],{"class":167},[33,137459,1865],{"class":163},[33,137461,134616],{"class":54},[33,137463,15715],{"class":163},[33,137465,134621],{"class":167},[33,137467,137468,137470,137472,137474,137476],{"class":35,"line
":653},[33,137469,134626],{"class":167},[33,137471,134629],{"class":54},[33,137473,763],{"class":167},[33,137475,242],{"class":163},[33,137477,135248],{"class":167},[33,137479,137480],{"class":35,"line":667},[33,137481,134644],{"class":167},[33,137483,137484,137486,137488,137490,137492,137494,137496,137498,137500],{"class":35,"line":675},[33,137485,134649],{"class":167},[33,137487,96187],{"class":54},[33,137489,309],{"class":50},[33,137491,17583],{"class":54},[33,137493,916],{"class":50},[33,137495,18029],{"class":54},[33,137497,134662],{"class":167},[33,137499,928],{"class":50},[33,137501,128027],{"class":167},[33,137503,137504,137506,137508,137510],{"class":35,"line":689},[33,137505,780],{"class":163},[33,137507,783],{"class":50},[33,137509,1852],{"class":163},[33,137511,1855],{"class":167},[33,137513,137514,137516,137518,137520,137522,137524,137526],{"class":35,"line":703},[33,137515,134681],{"class":167},[33,137517,6272],{"class":54},[33,137519,309],{"class":50},[33,137521,2079],{"class":54},[33,137523,309],{"class":50},[33,137525,274],{"class":54},[33,137527,134694],{"class":167},[33,137529,137530,137532],{"class":35,"line":714},[33,137531,1332],{"class":163},[33,137533,6065],{"class":167},[33,137535,137536],{"class":35,"line":723},[33,137537,92],{"emptyLinePlaceholder":91},[33,137539,137540],{"class":35,"line":754},[33,137541,92],{"emptyLinePlaceholder":91},[33,137543,137544,137546,137549],{"class":35,"line":771},[33,137545,562],{"class":163},[33,137547,137548],{"class":46}," 
normalize_cols",[33,137550,12127],{"class":167},[33,137552,137553,137555,137557,137559,137561,137563,137565,137567,137569,137571,137573,137575,137577,137579,137581,137583,137585],{"class":35,"line":777},[33,137554,27546],{"class":167},[33,137556,242],{"class":163},[33,137558,134776],{"class":167},[33,137560,11977],{"class":163},[33,137562,274],{"class":54},[33,137564,134783],{"class":50},[33,137566,113960],{"class":12018},[33,137568,9546],{"class":50},[33,137570,1811],{"class":163},[33,137572,274],{"class":54},[33,137574,365],{"class":167},[33,137576,7764],{"class":54},[33,137578,134798],{"class":167},[33,137580,6124],{"class":163},[33,137582,7486],{"class":167},[33,137584,662],{"class":163},[33,137586,12624],{"class":167},[33,137588,137589,137591],{"class":35,"line":788},[33,137590,1332],{"class":163},[33,137592,11719],{"class":167},[33,137594,137595],{"class":35,"line":804},[33,137596,92],{"emptyLinePlaceholder":91},[33,137598,137599],{"class":35,"line":809},[33,137600,92],{"emptyLinePlaceholder":91},[33,137602,137603,137605,137607,137609,137611],{"class":35,"line":819},[33,137604,562],{"class":163},[33,137606,6636],{"class":46},[33,137608,568],{"class":167},[33,137610,571],{"class":50},[33,137612,574],{"class":167},[33,137614,137615,137617,137619,137621,137623,137625,137628],{"class":35,"line":829},[33,137616,6648],{"class":167},[33,137618,242],{"class":163},[33,137620,6653],{"class":167},[33,137622,6656],{"class":238},[33,137624,242],{"class":163},[33,137626,137627],{"class":54},"\"Merge spreadsheets in a folder\"",[33,137629,221],{"class":167},[33,137631,137632,137634,137637,137639,137641,137643,137645,137647,137649,137652],{"class":35,"line":834},[33,137633,6669],{"class":167},[33,137635,137636],{"class":54},"\"input_dir\"",[33,137638,25480],{"class":167},[33,137640,6677],{"class":238},[33,137642,242],{"class":163},[33,137644,6682],{"class":167},[33,137646,25463],{"class":238},[33,137648,242],{"class":163},[33,137650,137651],{"class":54},"\"Folder containing 
source files\"",[33,137653,221],{"class":167},[33,137655,137656,137658,137661,137663,137665,137667,137669,137671,137673,137676],{"class":35,"line":839},[33,137657,6669],{"class":167},[33,137659,137660],{"class":54},"\"output_file\"",[33,137662,365],{"class":167},[33,137664,6677],{"class":238},[33,137666,242],{"class":163},[33,137668,6682],{"class":167},[33,137670,25463],{"class":238},[33,137672,242],{"class":163},[33,137674,137675],{"class":54},"\"Destination .csv or .xlsx\"",[33,137677,221],{"class":167},[33,137679,137680,137682,137685,137687,137689,137691,137693,137695,137697,137699,137702],{"class":35,"line":860},[33,137681,6669],{"class":167},[33,137683,137684],{"class":54},"\"--dedup-cols\"",[33,137686,365],{"class":167},[33,137688,25542],{"class":238},[33,137690,242],{"class":163},[33,137692,117617],{"class":54},[33,137694,365],{"class":167},[33,137696,25463],{"class":238},[33,137698,242],{"class":163},[33,137700,137701],{"class":54},"\"Columns that define a unique row\"",[33,137703,221],{"class":167},[33,137705,137706,137708,137710],{"class":35,"line":887},[33,137707,6766],{"class":167},[33,137709,242],{"class":163},[33,137711,6771],{"class":167},[33,137713,137714],{"class":35,"line":907},[33,137715,92],{"emptyLinePlaceholder":91},[33,137717,137718,137720,137722],{"class":35,"line":1826},[33,137719,617],{"class":163},[33,137721,620],{"class":163},[33,137723,137724],{"class":167}," 
args.input_dir.is_dir():\n",[33,137726,137727,137729,137731,137733,137735,137738,137740,137742],{"class":35,"line":1844},[33,137728,2995],{"class":167},[33,137730,4059],{"class":163},[33,137732,69491],{"class":54},[33,137734,1115],{"class":50},[33,137736,137737],{"class":167},"args.input_dir",[33,137739,1121],{"class":50},[33,137741,274],{"class":54},[33,137743,221],{"class":167},[33,137745,137746],{"class":35,"line":1858},[33,137747,92],{"emptyLinePlaceholder":91},[33,137749,137750,137752,137754],{"class":35,"line":1871},[33,137751,14067],{"class":167},[33,137753,242],{"class":163},[33,137755,137756],{"class":167}," discover(args.input_dir)\n",[33,137758,137759,137761,137763],{"class":35,"line":1877},[33,137760,617],{"class":163},[33,137762,620],{"class":163},[33,137764,73215],{"class":167},[33,137766,137767,137769,137772],{"class":35,"line":1883},[33,137768,2995],{"class":167},[33,137770,137771],{"class":54},"\"No .xlsx or .csv files found\"",[33,137773,221],{"class":167},[33,137775,137776],{"class":35,"line":1915},[33,137777,92],{"emptyLinePlaceholder":91},[33,137779,137780,137782,137784],{"class":35,"line":1926},[33,137781,584],{"class":167},[33,137783,242],{"class":163},[33,137785,137786],{"class":167}," load(paths)\n",[33,137788,137789,137791,137793],{"class":35,"line":1932},[33,137790,617],{"class":163},[33,137792,620],{"class":163},[33,137794,816],{"class":167},[33,137796,137797,137799,137802],{"class":35,"line":1938},[33,137798,2995],{"class":167},[33,137800,137801],{"class":54},"\"All files failed to load\"",[33,137803,221],{"class":167},[33,137805,137806],{"class":35,"line":1950},[33,137807,92],{"emptyLinePlaceholder":91},[33,137809,137810,137812,137814,137817,137819,137821,137823],{"class":35,"line":1958},[33,137811,584],{"class":167},[33,137813,242],{"class":163},[33,137815,137816],{"class":167}," [normalize_cols(df) 
",[33,137818,6124],{"class":163},[33,137820,7810],{"class":167},[33,137822,662],{"class":163},[33,137824,8837],{"class":167},[33,137826,137827],{"class":35,"line":4904},[33,137828,92],{"emptyLinePlaceholder":91},[33,137830,137831,137833],{"class":35,"line":4909},[33,137832,2424],{"class":163},[33,137834,574],{"class":167},[33,137836,137837,137839,137841,137843,137845,137847,137849,137851,137853,137855,137857],{"class":35,"line":4915},[33,137838,28029],{"class":167},[33,137840,242],{"class":163},[33,137842,847],{"class":167},[33,137844,850],{"class":238},[33,137846,242],{"class":163},[33,137848,855],{"class":50},[33,137850,365],{"class":167},[33,137852,135281],{"class":238},[33,137854,242],{"class":163},[33,137856,902],{"class":50},[33,137858,221],{"class":167},[33,137860,137861,137863,137865,137867],{"class":35,"line":4925},[33,137862,2449],{"class":163},[33,137864,4054],{"class":50},[33,137866,1852],{"class":163},[33,137868,1855],{"class":167},[33,137870,137871,137873,137875,137878,137880,137882,137884,137886],{"class":35,"line":4935},[33,137872,2995],{"class":167},[33,137874,4059],{"class":163},[33,137876,137877],{"class":54},"\"Concatenation failed: ",[33,137879,1115],{"class":50},[33,137881,6565],{"class":167},[33,137883,1121],{"class":50},[33,137885,274],{"class":54},[33,137887,221],{"class":167},[33,137889,137890],{"class":35,"line":4941},[33,137891,92],{"emptyLinePlaceholder":91},[33,137893,137894,137896],{"class":35,"line":4950},[33,137895,617],{"class":163},[33,137897,137898],{"class":167}," args.dedup_cols:\n",[33,137900,137901,137904,137906,137908],{"class":35,"line":4960},[33,137902,137903],{"class":167},"        before 
",[33,137905,242],{"class":163},[33,137907,4037],{"class":50},[33,137909,66563],{"class":167},[33,137911,137912,137914,137916,137918,137920,137922,137925,137927,137929,137931],{"class":35,"line":4965},[33,137913,28029],{"class":167},[33,137915,242],{"class":163},[33,137917,28063],{"class":167},[33,137919,28066],{"class":238},[33,137921,242],{"class":163},[33,137923,137924],{"class":167},"args.dedup_cols, ",[33,137926,28077],{"class":238},[33,137928,242],{"class":163},[33,137930,114482],{"class":54},[33,137932,221],{"class":167},[33,137934,137935,137937,137940,137942,137944,137947,137949,137951],{"class":35,"line":4971},[33,137936,134946],{"class":167},[33,137938,137939],{"class":54},"\"Dedup removed ",[33,137941,916],{"class":50},[33,137943,65937],{"class":54},[33,137945,137946],{"class":167},", before ",[33,137948,4126],{"class":163},[33,137950,4037],{"class":50},[33,137952,66664],{"class":167},[33,137954,137955],{"class":35,"line":4983},[33,137956,92],{"emptyLinePlaceholder":91},[33,137958,137959,137961,137963,137965,137967,137969,137971],{"class":35,"line":4988},[33,137960,842],{"class":167},[33,137962,242],{"class":163},[33,137964,66671],{"class":167},[33,137966,10868],{"class":238},[33,137968,242],{"class":163},[33,137970,855],{"class":50},[33,137972,221],{"class":167},[33,137974,137975],{"class":35,"line":4993},[33,137976,92],{"emptyLinePlaceholder":91},[33,137978,137979,137981],{"class":35,"line":5003},[33,137980,2424],{"class":163},[33,137982,574],{"class":167},[33,137984,137985,137987,137990,137992,137994],{"class":35,"line":5008},[33,137986,8221],{"class":163},[33,137988,137989],{"class":167}," args.output_file.suffix ",[33,137991,1865],{"class":163},[33,137993,134616],{"class":54},[33,137995,574],{"class":167},[33,137997,137998,138001,138003,138005,138007,138009,138011,138013,138015],{"class":35,"line":5014},[33,137999,138000],{"class":167},"            combined.to_excel(args.output_file, 
",[33,138002,897],{"class":238},[33,138004,242],{"class":163},[33,138006,902],{"class":50},[33,138008,365],{"class":167},[33,138010,17351],{"class":238},[33,138012,242],{"class":163},[33,138014,17356],{"class":54},[33,138016,221],{"class":167},[33,138018,138019,138021],{"class":35,"line":5019},[33,138020,41290],{"class":163},[33,138022,574],{"class":167},[33,138024,138025,138028,138030,138032,138034],{"class":35,"line":5032},[33,138026,138027],{"class":167},"            combined.to_csv(args.output_file, ",[33,138029,897],{"class":238},[33,138031,242],{"class":163},[33,138033,902],{"class":50},[33,138035,221],{"class":167},[33,138037,138038,138040,138042,138044,138046,138048,138050,138052,138054],{"class":35,"line":5039},[33,138039,134946],{"class":167},[33,138041,913],{"class":54},[33,138043,916],{"class":50},[33,138045,919],{"class":54},[33,138047,309],{"class":50},[33,138049,274],{"class":54},[33,138051,365],{"class":167},[33,138053,928],{"class":50},[33,138055,138056],{"class":167},"(combined), 
args.output_file)\n",[33,138058,138059,138061,138063,138065],{"class":35,"line":5068},[33,138060,2449],{"class":163},[33,138062,783],{"class":50},[33,138064,1852],{"class":163},[33,138066,1855],{"class":167},[33,138068,138069,138071,138073,138075,138077,138079,138081,138083],{"class":35,"line":5077},[33,138070,2995],{"class":167},[33,138072,4059],{"class":163},[33,138074,101705],{"class":54},[33,138076,1115],{"class":50},[33,138078,6565],{"class":167},[33,138080,1121],{"class":50},[33,138082,274],{"class":54},[33,138084,221],{"class":167},[33,138086,138087],{"class":35,"line":5082},[33,138088,92],{"emptyLinePlaceholder":91},[33,138090,138091],{"class":35,"line":5089},[33,138092,92],{"emptyLinePlaceholder":91},[33,138094,138095,138097,138099,138101,138103],{"class":35,"line":5098},[33,138096,2491],{"class":163},[33,138098,2494],{"class":50},[33,138100,2497],{"class":163},[33,138102,2500],{"class":54},[33,138104,574],{"class":167},[33,138106,138107],{"class":35,"line":5105},[33,138108,6914],{"class":167},[14,138110,41347],{},[23,138112,138114],{"className":25,"code":138113,"language":27,"meta":28,"style":28},"python merge_spreadsheets.py reports\u002F output\u002Fcombined.csv --dedup-cols region source_file\n",[30,138115,138116],{"__ignoreMap":28},[33,138117,138118,138120,138123,138126,138129,138132,138134],{"class":35,"line":36},[33,138119,47],{"class":46},[33,138121,138122],{"class":54}," merge_spreadsheets.py",[33,138124,138125],{"class":54}," reports\u002F",[33,138127,138128],{"class":54}," output\u002Fcombined.csv",[33,138130,138131],{"class":50}," --dedup-cols",[33,138133,26148],{"class":54},[33,138135,138136],{"class":54}," source_file\n",[18,138138,6918],{"id":6917},[4211,138140,138141,138146,138151,138156],{},[4214,138142,138143,138145],{},[940,138144,99577],{"href":99576}," — engine options, sheet selection, and header row handling before you merge",[4214,138147,138148,138150],{},[940,138149,9599],{"href":9598}," — normalize headers and fix encoding before 
concatenating",[4214,138152,138153,138155],{},[940,138154,128340],{"href":133881}," — serialize the merged table for API or web consumption",[4214,138157,138158,138160,138161,1351,138163,138165],{},[940,138159,28114],{"href":28113}," — resolve ",[30,138162,28106],{},[30,138164,28109],{}," columns after a merge",[14,138167,6947,138168,3035],{},[940,138169,26258],{"href":26257},[6953,138171,138172],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .s691h, html code.shiki 
.s691h{--shiki-default:#22863A;--shiki-default-font-weight:bold}",{"title":28,"searchDepth":43,"depth":43,"links":138174},[138175,138176,138177,138178,138179,138184,138185,138186,138187,138192,138193,138194,138195],{"id":20,"depth":43,"text":21},{"id":134293,"depth":43,"text":134294},{"id":134477,"depth":43,"text":134478},{"id":134710,"depth":43,"text":134711},{"id":134976,"depth":43,"text":134977,"children":138180},[138181,138182,138183],{"id":135119,"depth":61,"text":135120},{"id":135358,"depth":61,"text":135359},{"id":135666,"depth":61,"text":135667},{"id":135804,"depth":43,"text":135805},{"id":136083,"depth":43,"text":136084},{"id":12502,"depth":43,"text":136289},{"id":12943,"depth":43,"text":12944,"children":138188},[138189,138190,138191],{"id":136476,"depth":61,"text":136477},{"id":136632,"depth":61,"text":136633},{"id":136756,"depth":61,"text":136757},{"id":13844,"depth":43,"text":13845},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":6917,"depth":43,"text":6918},"Merging Spreadsheets",{},"\u002Fpython-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets",{"title":28119,"description":138200},{"Combine ":138201,"date":46387,"updatedAt":6978,"tags":138204},{"xlsx and ":138202},{"csv workbooks with pandas":138203},"glob file discovery, concat vs merge vs join, column alignment, deduplication, and multi-sheet workbooks.",[99614,107436,47,9630],"Merging Multiple Spreadsheets with Python & 
pandas","python-for-excel-csv-data-processing\u002Fmerging-multiple-spreadsheets\u002Findex","33F202Vhz5WwM5IW9dscLwnQBqHOmt-KIWxMU_xQj_Y",{"id":138209,"title":126410,"body":138210,"breadcrumbTitle":139789,"canonical":6977,"date":6977,"description":107412,"draft":6980,"extension":6981,"image":6977,"meta":139790,"navigation":91,"path":139791,"robots":6977,"seo":139792,"seoTitle":139797,"stem":139798,"tags":6977,"updatedAt":6977,"__hash__":139799},"content\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Ffix-xlrd-error-reading-xlsx-files\u002Findex.md",{"type":7,"value":138211,"toc":139775},[138212,138215,138222,138228,138231,138237,138252,138254,138273,138291,138294,138334,138341,138343,138346,138490,138493,138499,138502,138531,138549,138555,138751,138754,138771,138783,138790,138803,138932,138952,138967,139110,139121,139127,139130,139181,139199,139202,139221,139224,139228,139241,139388,139398,139402,139421,139493,139505,139507,139510,139731,139744,139746,139768,139772],[10,138213,126410],{"id":138214},"fix-xlrd-error-reading-xlsx-files",[14,138216,138217,138218,138221],{},"Running ",[30,138219,138220],{},"pd.read_excel(\"file.xlsx\")"," raises one of two errors depending on your environment:",[23,138223,138226],{"className":138224,"code":138225,"language":2000},[1998],"XLRDError: Excel xlsx file; not supported\n",[30,138227,138225],{"__ignoreMap":28},[14,138229,138230],{},"or:",[23,138232,138235],{"className":138233,"code":138234,"language":2000},[1998],"ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for xls support.\nUse pip or conda to install xlrd.\n",[30,138236,138234],{"__ignoreMap":28},[14,138238,138239,138240,95799,138242,71132,138244,138246,138247,138249,138250,3035],{},"Both mean the same thing: pandas attempted to open a ",[30,138241,26542],{},[30,138243,125595],{},[30,138245,125595],{}," cannot do it. 
The fix is to install ",[30,138248,22009],{}," and pass ",[30,138251,22395],{},[18,138253,7021],{"id":7020},[14,138255,138256,138258,138259,138261,138262,138264,138265,138267,138268,138270,138271,3035],{},[30,138257,125595],{}," dropped ",[30,138260,26542],{}," support in version 2.0 (released October 2020). Before 2.0, ",[30,138263,125595],{}," handled both ",[30,138266,112255],{}," (legacy binary) and ",[30,138269,26542],{}," (Open XML). After 2.0, it handles only ",[30,138272,112255],{},[14,138274,138275,138276,138278,138279,138281,138282,138284,138285,138287,138288,138290],{},"When you call ",[30,138277,138220],{}," without an explicit ",[30,138280,17351],{}," argument, pandas picks a backend. On systems where ",[30,138283,125595],{}," is installed and ",[30,138286,22009],{}," is not (or on older pandas versions), the inference picks ",[30,138289,125595],{},". The result is one of the two errors above.",[14,138292,138293],{},"The two error variants map to distinct situations:",[4273,138295,138296,138304],{},[4276,138297,138298],{},[4279,138299,138300,138302],{},[4282,138301,14317],{},[4282,138303,112550],{},[4292,138305,138306,138321],{},[4279,138307,138308,138313],{},[4297,138309,138310],{},[30,138311,138312],{},"XLRDError: Excel xlsx file; not supported",[4297,138314,138315,138317,138318,138320],{},[30,138316,125595],{}," ≥ 2.0 is installed; it opens the file but rejects the ",[30,138319,26542],{}," format immediately",[4279,138322,138323,138328],{},[4297,138324,138325],{},[30,138326,138327],{},"ImportError: Missing optional dependency 'xlrd'",[4297,138329,138330,138331,138333],{},"Pandas resolved to ",[30,138332,125595],{}," as the required backend but it is not installed at all",[14,138335,138336,138337,138249,138339,3035],{},"Both are fixed the same way: install ",[30,138338,22009],{},[30,138340,22395],{},[18,138342,35017],{"id":35016},[14,138344,138345],{},"Before applying any fix, confirm what is actually installed in your 
environment:",[23,138347,138349],{"className":126,"code":138348,"language":47,"meta":28,"style":28},"# no extra install needed\nimport importlib\n\nfor pkg in (\"xlrd\", \"openpyxl\", \"python_calamine\", \"pandas\"):\n    try:\n        mod = importlib.import_module(pkg.replace(\"-\", \"_\"))\n        print(f\"{pkg}: {getattr(mod, '__version__', 'installed')}\")\n    except ImportError:\n        print(f\"{pkg}: NOT INSTALLED\")\n",[30,138350,138351,138356,138363,138367,138396,138402,138420,138461,138469],{"__ignoreMap":28},[33,138352,138353],{"class":35,"line":36},[33,138354,138355],{"class":39},"# no extra install needed\n",[33,138357,138358,138360],{"class":35,"line":43},[33,138359,164],{"class":163},[33,138361,138362],{"class":167}," importlib\n",[33,138364,138365],{"class":35,"line":61},[33,138366,92],{"emptyLinePlaceholder":91},[33,138368,138369,138371,138374,138376,138378,138381,138383,138385,138387,138390,138392,138394],{"class":35,"line":73},[33,138370,6124],{"class":163},[33,138372,138373],{"class":167}," pkg ",[33,138375,662],{"class":163},[33,138377,17583],{"class":167},[33,138379,138380],{"class":54},"\"xlrd\"",[33,138382,365],{"class":167},[33,138384,17356],{"class":54},[33,138386,365],{"class":167},[33,138388,138389],{"class":54},"\"python_calamine\"",[33,138391,365],{"class":167},[33,138393,108249],{"class":54},[33,138395,1737],{"class":167},[33,138397,138398,138400],{"class":35,"line":88},[33,138399,2424],{"class":163},[33,138401,574],{"class":167},[33,138403,138404,138407,138409,138412,138414,138416,138418],{"class":35,"line":95},[33,138405,138406],{"class":167},"        mod ",[33,138408,242],{"class":163},[33,138410,138411],{"class":167}," 
importlib.import_module(pkg.replace(",[33,138413,75122],{"class":54},[33,138415,365],{"class":167},[33,138417,7764],{"class":54},[33,138419,371],{"class":167},[33,138421,138422,138424,138426,138428,138430,138432,138435,138437,138439,138442,138445,138448,138450,138453,138455,138457,138459],{"class":35,"line":101},[33,138423,9414],{"class":50},[33,138425,602],{"class":167},[33,138427,4059],{"class":163},[33,138429,274],{"class":54},[33,138431,1115],{"class":50},[33,138433,138434],{"class":167},"pkg",[33,138436,1121],{"class":50},[33,138438,2079],{"class":54},[33,138440,138441],{"class":50},"{getattr",[33,138443,138444],{"class":167},"(mod, ",[33,138446,138447],{"class":54},"'__version__'",[33,138449,365],{"class":167},[33,138451,138452],{"class":54},"'installed'",[33,138454,12027],{"class":167},[33,138456,1121],{"class":50},[33,138458,274],{"class":54},[33,138460,221],{"class":167},[33,138462,138463,138465,138467],{"class":35,"line":171},[33,138464,2449],{"class":163},[33,138466,40488],{"class":50},[33,138468,574],{"class":167},[33,138470,138471,138473,138475,138477,138479,138481,138483,138485,138488],{"class":35,"line":179},[33,138472,9414],{"class":50},[33,138474,602],{"class":167},[33,138476,4059],{"class":163},[33,138478,274],{"class":54},[33,138480,1115],{"class":50},[33,138482,138434],{"class":167},[33,138484,1121],{"class":50},[33,138486,138487],{"class":54},": NOT INSTALLED\"",[33,138489,221],{"class":167},[14,138491,138492],{},"Expected output after the fix:",[23,138494,138497],{"className":138495,"code":138496,"language":2000},[1998],"xlrd: 2.0.1           ← still present, but only used for .xls files\nopenpyxl: 3.1.2       ← handles .xlsx\npython_calamine: NOT INSTALLED   ← optional fast engine\npandas: 2.2.2\n",[30,138498,138496],{"__ignoreMap":28},[14,138500,138501],{},"Also confirm that you are running in the correct Python environment:",[23,138503,138505],{"className":25,"code":138504,"language":27,"meta":28,"style":28},"which python\npython -c 
\"import sys; print(sys.executable)\"\npip show openpyxl\n",[30,138506,138507,138514,138523],{"__ignoreMap":28},[33,138508,138509,138511],{"class":35,"line":36},[33,138510,35269],{"class":50},[33,138512,138513],{"class":54}," python\n",[33,138515,138516,138518,138520],{"class":35,"line":43},[33,138517,47],{"class":46},[33,138519,106],{"class":50},[33,138521,138522],{"class":54}," \"import sys; print(sys.executable)\"\n",[33,138524,138525,138527,138529],{"class":35,"line":61},[33,138526,76],{"class":46},[33,138528,41946],{"class":54},[33,138530,95887],{"class":54},[14,138532,41963,138533,138536,138537,138539,138540,138542,138543,138545,138546,3035],{},[30,138534,138535],{},"pip show openpyxl"," shows \"not installed\" but you ran ",[30,138538,26548],{}," earlier, the ",[30,138541,76],{}," command targeted a different Python interpreter. Re-run ",[30,138544,36846],{}," using the exact Python shown above: ",[30,138547,138548],{},"python -m pip install openpyxl",[18,138550,138552,138553],{"id":138551},"fix-install-openpyxl-and-pass-engineopenpyxl","Fix: Install openpyxl and Pass ",[30,138554,22395],{},[23,138556,138558],{"className":126,"code":138557,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"report.xlsx\")   # the .xlsx file that was failing\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        engine=\"openpyxl\",     # replaces the failing xlrd backend\n    )\n    print(df.shape)\n    print(df.head())\nexcept FileNotFoundError:\n    raise SystemExit(f\"File not found: {EXCEL_PATH}\")\nexcept ImportError as e:\n    raise SystemExit(f\"openpyxl not installed — run: pip install openpyxl\\n{e}\")\nexcept Exception as e:\n    raise SystemExit(f\"Read error: 
{e}\")\n",[30,138559,138560,138564,138574,138584,138588,138604,138608,138614,138622,138629,138642,138646,138652,138658,138666,138685,138695,138718,138728],{"__ignoreMap":28},[33,138561,138562],{"class":35,"line":36},[33,138563,98209],{"class":39},[33,138565,138566,138568,138570,138572],{"class":35,"line":43},[33,138567,190],{"class":163},[33,138569,193],{"class":167},[33,138571,164],{"class":163},[33,138573,198],{"class":167},[33,138575,138576,138578,138580,138582],{"class":35,"line":61},[33,138577,164],{"class":163},[33,138579,492],{"class":167},[33,138581,495],{"class":163},[33,138583,498],{"class":167},[33,138585,138586],{"class":35,"line":73},[33,138587,92],{"emptyLinePlaceholder":91},[33,138589,138590,138593,138595,138597,138599,138601],{"class":35,"line":88},[33,138591,138592],{"class":50},"EXCEL_PATH",[33,138594,212],{"class":163},[33,138596,215],{"class":167},[33,138598,128434],{"class":54},[33,138600,12000],{"class":167},[33,138602,138603],{"class":39},"# the .xlsx file that was failing\n",[33,138605,138606],{"class":35,"line":95},[33,138607,92],{"emptyLinePlaceholder":91},[33,138609,138610,138612],{"class":35,"line":101},[33,138611,35574],{"class":163},[33,138613,574],{"class":167},[33,138615,138616,138618,138620],{"class":35,"line":171},[33,138617,4025],{"class":167},[33,138619,242],{"class":163},[33,138621,126171],{"class":167},[33,138623,138624,138627],{"class":35,"line":179},[33,138625,138626],{"class":50},"        EXCEL_PATH",[33,138628,247],{"class":167},[33,138630,138631,138633,138635,138637,138639],{"class":35,"line":187},[33,138632,111493],{"class":238},[33,138634,242],{"class":163},[33,138636,17356],{"class":54},[33,138638,25539],{"class":167},[33,138640,138641],{"class":39},"# replaces the failing xlrd 
backend\n",[33,138643,138644],{"class":35,"line":201},[33,138645,1202],{"class":167},[33,138647,138648,138650],{"class":35,"line":206},[33,138649,7268],{"class":50},[33,138651,39529],{"class":167},[33,138653,138654,138656],{"class":35,"line":224},[33,138655,7268],{"class":50},[33,138657,13311],{"class":167},[33,138659,138660,138662,138664],{"class":35,"line":229},[33,138661,35726],{"class":163},[33,138663,2945],{"class":50},[33,138665,574],{"class":167},[33,138667,138668,138670,138672,138674,138676,138678,138681,138683],{"class":35,"line":235},[33,138669,35742],{"class":163},[33,138671,16617],{"class":50},[33,138673,602],{"class":167},[33,138675,4059],{"class":163},[33,138677,15677],{"class":54},[33,138679,138680],{"class":50},"{EXCEL_PATH}",[33,138682,274],{"class":54},[33,138684,221],{"class":167},[33,138686,138687,138689,138691,138693],{"class":35,"line":250},[33,138688,35726],{"class":163},[33,138690,40488],{"class":50},[33,138692,1852],{"class":163},[33,138694,7583],{"class":167},[33,138696,138697,138699,138701,138703,138705,138708,138710,138712,138714,138716],{"class":35,"line":266},[33,138698,35742],{"class":163},[33,138700,16617],{"class":50},[33,138702,602],{"class":167},[33,138704,4059],{"class":163},[33,138706,138707],{"class":54},"\"openpyxl not installed — run: pip install openpyxl",[33,138709,5793],{"class":50},[33,138711,7602],{"class":167},[33,138713,1121],{"class":50},[33,138715,274],{"class":54},[33,138717,221],{"class":167},[33,138719,138720,138722,138724,138726],{"class":35,"line":290},[33,138721,35726],{"class":163},[33,138723,783],{"class":50},[33,138725,1852],{"class":163},[33,138727,7583],{"class":167},[33,138729,138730,138732,138734,138736,138738,138741,138743,138745,138747,138749],{"class":35,"line":295},[33,138731,35742],{"class":163},[33,138733,16617],{"class":50},[33,138735,602],{"class":167},[33,138737,4059],{"class":163},[33,138739,138740],{"class":54},"\"Read error: 
",[33,138742,1115],{"class":50},[33,138744,7602],{"class":167},[33,138746,1121],{"class":50},[33,138748,274],{"class":54},[33,138750,221],{"class":167},[14,138752,138753],{},"Two changes from the failing call:",[35387,138755,138756,138763],{},[4214,138757,138758,138760,138761,3035],{},[30,138759,26548],{}," installs the correct backend for ",[30,138762,26542],{},[4214,138764,138765,138767,138768,138770],{},[30,138766,22395],{}," routes pandas explicitly to ",[30,138769,22009],{}," instead of letting it guess.",[14,138772,138773,138774,138776,138777,138779,138780,138782],{},"Always pass ",[30,138775,17351],{}," explicitly in production code. Even on modern pandas where ",[30,138778,22009],{}," is the documented default for ",[30,138781,26542],{},", explicit engine selection eliminates the dependency on inference behaviour across pandas versions and package combinations.",[18,138784,138786,138787,138789],{"id":138785},"variant-1-you-have-a-genuine-xls-file","Variant 1: You Have a Genuine ",[30,138788,112255],{}," File",[14,138791,138792,138793,138795,138796,138798,138799,10065,138801,20891],{},"If the file is actually a legacy ",[30,138794,112255],{}," binary workbook (Excel 97–2003 format), ",[30,138797,22009],{}," cannot read it. 
You need ",[30,138800,125595],{},[30,138802,128873],{},[23,138804,138806],{"className":126,"code":138805,"language":47,"meta":28,"style":28},"# pip install xlrd\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"legacy_data.xls\")   # confirmed .xls binary\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        engine=\"xlrd\",    # xlrd 2.x works fine for genuine .xls files\n    )\n    print(df.shape)\nexcept Exception as e:\n    raise SystemExit(f\"Read error: {e}\")\n",[30,138807,138808,138813,138823,138833,138837,138853,138857,138863,138871,138877,138890,138894,138900,138910],{"__ignoreMap":28},[33,138809,138810],{"class":35,"line":36},[33,138811,138812],{"class":39},"# pip install xlrd\n",[33,138814,138815,138817,138819,138821],{"class":35,"line":43},[33,138816,190],{"class":163},[33,138818,193],{"class":167},[33,138820,164],{"class":163},[33,138822,198],{"class":167},[33,138824,138825,138827,138829,138831],{"class":35,"line":61},[33,138826,164],{"class":163},[33,138828,492],{"class":167},[33,138830,495],{"class":163},[33,138832,498],{"class":167},[33,138834,138835],{"class":35,"line":73},[33,138836,92],{"emptyLinePlaceholder":91},[33,138838,138839,138841,138843,138845,138848,138850],{"class":35,"line":88},[33,138840,138592],{"class":50},[33,138842,212],{"class":163},[33,138844,215],{"class":167},[33,138846,138847],{"class":54},"\"legacy_data.xls\"",[33,138849,12000],{"class":167},[33,138851,138852],{"class":39},"# confirmed .xls 
binary\n",[33,138854,138855],{"class":35,"line":95},[33,138856,92],{"emptyLinePlaceholder":91},[33,138858,138859,138861],{"class":35,"line":101},[33,138860,35574],{"class":163},[33,138862,574],{"class":167},[33,138864,138865,138867,138869],{"class":35,"line":171},[33,138866,4025],{"class":167},[33,138868,242],{"class":163},[33,138870,126171],{"class":167},[33,138872,138873,138875],{"class":35,"line":179},[33,138874,138626],{"class":50},[33,138876,247],{"class":167},[33,138878,138879,138881,138883,138885,138887],{"class":35,"line":187},[33,138880,111493],{"class":238},[33,138882,242],{"class":163},[33,138884,138380],{"class":54},[33,138886,38342],{"class":167},[33,138888,138889],{"class":39},"# xlrd 2.x works fine for genuine .xls files\n",[33,138891,138892],{"class":35,"line":201},[33,138893,1202],{"class":167},[33,138895,138896,138898],{"class":35,"line":206},[33,138897,7268],{"class":50},[33,138899,39529],{"class":167},[33,138901,138902,138904,138906,138908],{"class":35,"line":224},[33,138903,35726],{"class":163},[33,138905,783],{"class":50},[33,138907,1852],{"class":163},[33,138909,7583],{"class":167},[33,138911,138912,138914,138916,138918,138920,138922,138924,138926,138928,138930],{"class":35,"line":229},[33,138913,35742],{"class":163},[33,138915,16617],{"class":50},[33,138917,602],{"class":167},[33,138919,4059],{"class":163},[33,138921,138740],{"class":54},[33,138923,1115],{"class":50},[33,138925,7602],{"class":167},[33,138927,1121],{"class":50},[33,138929,274],{"class":54},[33,138931,221],{"class":167},[14,138933,138934,138936,138937,138939,138940,138943,138944,138946,138947,138949,138950,3035],{},[30,138935,125595],{}," 2.x supports the ",[30,138938,112255],{}," format; there is no need to pin ",[30,138941,138942],{},"xlrd\u003C2.0"," for genuine ",[30,138945,112255],{}," files. 
The version restriction only matters if you have old code that used ",[30,138948,125595],{}," to open ",[30,138951,26542],{},[14,138953,138954,138955,138957,138958,22506,138961,138963,138964,20891],{},"To confirm the actual file format without opening Excel, inspect the first bytes. ",[30,138956,112255],{}," binary files start with the OLE2 magic number ",[30,138959,138960],{},"D0 CF 11 E0",[30,138962,26542],{}," files start with the ZIP header ",[30,138965,138966],{},"50 4B 03 04",[23,138968,138970],{"className":126,"code":138969,"language":47,"meta":28,"style":28},"# no extra install needed\nfrom pathlib import Path\n\ndef detect_format(path: Path) -> str:\n    magic = path.read_bytes()[:4]\n    if magic == b\"PK\\x03\\x04\":\n        return \"xlsx — ZIP-based Open XML\"\n    if magic[:4] == b\"\\xd0\\xcf\\x11\\xe0\":\n        return \"xls — OLE2 binary (legacy)\"\n    return f\"unknown format — first 4 bytes: {magic.hex()}\"\n\nprint(detect_format(Path(\"mystery_file.xls\")))\n",[30,138971,138972,138976,138986,138990,139003,139016,139038,139045,139069,139076,139094,139098],{"__ignoreMap":28},[33,138973,138974],{"class":35,"line":36},[33,138975,138355],{"class":39},[33,138977,138978,138980,138982,138984],{"class":35,"line":43},[33,138979,190],{"class":163},[33,138981,193],{"class":167},[33,138983,164],{"class":163},[33,138985,198],{"class":167},[33,138987,138988],{"class":35,"line":61},[33,138989,92],{"emptyLinePlaceholder":91},[33,138991,138992,138994,138997,138999,139001],{"class":35,"line":73},[33,138993,562],{"class":163},[33,138995,138996],{"class":46}," detect_format",[33,138998,3743],{"class":167},[33,139000,1053],{"class":50},[33,139002,574],{"class":167},[33,139004,139005,139008,139010,139012,139014],{"class":35,"line":88},[33,139006,139007],{"class":167},"    magic 
",[33,139009,242],{"class":163},[33,139011,116349],{"class":167},[33,139013,1503],{"class":50},[33,139015,9202],{"class":167},[33,139017,139018,139020,139023,139025,139028,139031,139034,139036],{"class":35,"line":95},[33,139019,617],{"class":163},[33,139021,139022],{"class":167}," magic ",[33,139024,1865],{"class":163},[33,139026,139027],{"class":163}," b",[33,139029,139030],{"class":54},"\"PK",[33,139032,139033],{"class":50},"\\x03\\x04",[33,139035,274],{"class":54},[33,139037,574],{"class":167},[33,139039,139040,139042],{"class":35,"line":101},[33,139041,1659],{"class":163},[33,139043,139044],{"class":54}," \"xlsx — ZIP-based Open XML\"\n",[33,139046,139047,139049,139052,139054,139056,139058,139060,139062,139065,139067],{"class":35,"line":171},[33,139048,617],{"class":163},[33,139050,139051],{"class":167}," magic[:",[33,139053,1503],{"class":50},[33,139055,763],{"class":167},[33,139057,1865],{"class":163},[33,139059,139027],{"class":163},[33,139061,274],{"class":54},[33,139063,139064],{"class":50},"\\xd0\\xcf\\x11\\xe0",[33,139066,274],{"class":54},[33,139068,574],{"class":167},[33,139070,139071,139073],{"class":35,"line":179},[33,139072,1659],{"class":163},[33,139074,139075],{"class":54}," \"xls — OLE2 binary (legacy)\"\n",[33,139077,139078,139080,139082,139085,139087,139090,139092],{"class":35,"line":187},[33,139079,1332],{"class":163},[33,139081,1110],{"class":163},[33,139083,139084],{"class":54},"\"unknown format — first 4 bytes: ",[33,139086,1115],{"class":50},[33,139088,139089],{"class":167},"magic.hex()",[33,139091,1121],{"class":50},[33,139093,7504],{"class":54},[33,139095,139096],{"class":35,"line":201},[33,139097,92],{"emptyLinePlaceholder":91},[33,139099,139100,139102,139105,139108],{"class":35,"line":206},[33,139101,13474],{"class":50},[33,139103,139104],{"class":167},"(detect_format(Path(",[33,139106,139107],{"class":54},"\"mystery_file.xls\"",[33,139109,23269],{"class":167},[14,139111,139112,139113,139115,139116,139118,139119,3035],{},"Run this on 
the file before deciding which engine to use. A file named ",[30,139114,112255],{}," that returns \"ZIP-based Open XML\" is actually an ",[30,139117,26542],{}," that was renamed — pass ",[30,139120,22395],{},[18,139122,139124,139125,80125],{"id":139123},"variant-2-you-explicitly-passed-enginexlrd-in-code","Variant 2: You Explicitly Passed ",[30,139126,128873],{},[14,139128,139129],{},"If the error appears in code that already specifies an engine, find and replace the argument:",[23,139131,139133],{"className":126,"code":139132,"language":47,"meta":28,"style":28},"# Before — fails for .xlsx files:\ndf = pd.read_excel(path, engine=\"xlrd\")\n\n# After — correct for .xlsx:\ndf = pd.read_excel(path, engine=\"openpyxl\")\n",[30,139134,139135,139140,139156,139160,139165],{"__ignoreMap":28},[33,139136,139137],{"class":35,"line":36},[33,139138,139139],{"class":39},"# Before — fails for .xlsx files:\n",[33,139141,139142,139144,139146,139148,139150,139152,139154],{"class":35,"line":43},[33,139143,13459],{"class":167},[33,139145,242],{"class":163},[33,139147,27389],{"class":167},[33,139149,17351],{"class":238},[33,139151,242],{"class":163},[33,139153,138380],{"class":54},[33,139155,221],{"class":167},[33,139157,139158],{"class":35,"line":61},[33,139159,92],{"emptyLinePlaceholder":91},[33,139161,139162],{"class":35,"line":73},[33,139163,139164],{"class":39},"# After — correct for .xlsx:\n",[33,139166,139167,139169,139171,139173,139175,139177,139179],{"class":35,"line":88},[33,139168,13459],{"class":167},[33,139170,242],{"class":163},[33,139172,27389],{"class":167},[33,139174,17351],{"class":238},[33,139176,242],{"class":163},[33,139178,17356],{"class":54},[33,139180,221],{"class":167},[14,139182,139183,139184,139186,139187,139189,139190,139192,139193,139195,139196,139198],{},"Search your codebase for ",[30,139185,128873],{}," and replace each occurrence with ",[30,139188,22395],{}," for any path that points to ",[30,139191,26542],{}," files. 
Keep ",[30,139194,128873],{}," only for paths that are confirmed ",[30,139197,112255],{}," binary workbooks.",[14,139200,139201],{},"A quick codebase search:",[23,139203,139205],{"className":25,"code":139204,"language":27,"meta":28,"style":28},"grep -rn 'engine=\"xlrd\"' .\n",[30,139206,139207],{"__ignoreMap":28},[33,139208,139209,139212,139215,139218],{"class":35,"line":36},[33,139210,139211],{"class":46},"grep",[33,139213,139214],{"class":50}," -rn",[33,139216,139217],{"class":54}," 'engine=\"xlrd\"'",[33,139219,139220],{"class":54}," .\n",[14,139222,139223],{},"Review each result and update as needed.",[18,139225,139227],{"id":139226},"variant-3-use-calamine-for-faster-reads","Variant 3: Use calamine for Faster Reads",[14,139229,139230,139231,139233,139234,139237,139238,139240],{},"If raw read performance matters — for example, loading a 50 MB ",[30,139232,26542],{}," on every CI run — the ",[30,139235,139236],{},"calamine"," engine is typically 2–5x faster than ",[30,139239,22009],{}," because it skips formula and style parsing:",[23,139242,139244],{"className":126,"code":139243,"language":47,"meta":28,"style":28},"# pip install python-calamine\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"large_report.xlsx\")\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        engine=\"calamine\",    # handles .xlsx, .xls, .xlsb, .ods\n    )\n    print(df.shape)\nexcept ImportError:\n    raise SystemExit(\"Run: pip install python-calamine\")\nexcept Exception as e:\n    raise SystemExit(f\"Read error: {e}\")\n",[30,139245,139246,139251,139261,139271,139275,139287,139291,139297,139305,139311,139325,139329,139335,139343,139356,139366],{"__ignoreMap":28},[33,139247,139248],{"class":35,"line":36},[33,139249,139250],{"class":39},"# pip install 
python-calamine\n",[33,139252,139253,139255,139257,139259],{"class":35,"line":43},[33,139254,190],{"class":163},[33,139256,193],{"class":167},[33,139258,164],{"class":163},[33,139260,198],{"class":167},[33,139262,139263,139265,139267,139269],{"class":35,"line":61},[33,139264,164],{"class":163},[33,139266,492],{"class":167},[33,139268,495],{"class":163},[33,139270,498],{"class":167},[33,139272,139273],{"class":35,"line":73},[33,139274,92],{"emptyLinePlaceholder":91},[33,139276,139277,139279,139281,139283,139285],{"class":35,"line":88},[33,139278,138592],{"class":50},[33,139280,212],{"class":163},[33,139282,215],{"class":167},[33,139284,130425],{"class":54},[33,139286,221],{"class":167},[33,139288,139289],{"class":35,"line":95},[33,139290,92],{"emptyLinePlaceholder":91},[33,139292,139293,139295],{"class":35,"line":101},[33,139294,35574],{"class":163},[33,139296,574],{"class":167},[33,139298,139299,139301,139303],{"class":35,"line":171},[33,139300,4025],{"class":167},[33,139302,242],{"class":163},[33,139304,126171],{"class":167},[33,139306,139307,139309],{"class":35,"line":179},[33,139308,138626],{"class":50},[33,139310,247],{"class":167},[33,139312,139313,139315,139317,139320,139322],{"class":35,"line":187},[33,139314,111493],{"class":238},[33,139316,242],{"class":163},[33,139318,139319],{"class":54},"\"calamine\"",[33,139321,38342],{"class":167},[33,139323,139324],{"class":39},"# handles .xlsx, .xls, .xlsb, .ods\n",[33,139326,139327],{"class":35,"line":201},[33,139328,1202],{"class":167},[33,139330,139331,139333],{"class":35,"line":206},[33,139332,7268],{"class":50},[33,139334,39529],{"class":167},[33,139336,139337,139339,139341],{"class":35,"line":224},[33,139338,35726],{"class":163},[33,139340,40488],{"class":50},[33,139342,574],{"class":167},[33,139344,139345,139347,139349,139351,139354],{"class":35,"line":229},[33,139346,35742],{"class":163},[33,139348,16617],{"class":50},[33,139350,602],{"class":167},[33,139352,139353],{"class":54},"\"Run: pip install 
python-calamine\"",[33,139355,221],{"class":167},[33,139357,139358,139360,139362,139364],{"class":35,"line":235},[33,139359,35726],{"class":163},[33,139361,783],{"class":50},[33,139363,1852],{"class":163},[33,139365,7583],{"class":167},[33,139367,139368,139370,139372,139374,139376,139378,139380,139382,139384,139386],{"class":35,"line":250},[33,139369,35742],{"class":163},[33,139371,16617],{"class":50},[33,139373,602],{"class":167},[33,139375,4059],{"class":163},[33,139377,138740],{"class":54},[33,139379,1115],{"class":50},[33,139381,7602],{"class":167},[33,139383,1121],{"class":50},[33,139385,274],{"class":54},[33,139387,221],{"class":167},[14,139389,139390,139392,139393,107354,139395,139397],{},[30,139391,139236],{}," is read-only and does not expose formula strings, cell styles, or comments. For anything beyond tabular data extraction, use ",[30,139394,22009],{},[940,139396,99577],{"href":99576}," for a full engine comparison.",[18,139399,139401],{"id":139400},"variant-4-the-file-is-not-a-zip-file-error","Variant 4: The \"File is not a zip file\" Error",[14,139403,139404,139405,139408,139409,139411,139412,139414,139415,139417,139418,139420],{},"If the error is ",[30,139406,139407],{},"ValueError: File is not a zip file"," rather than ",[30,139410,128863],{},", the file is either corrupted or its extension does not match its content. 
An ",[30,139413,26542],{}," file with corrupted bytes, or a genuine ",[30,139416,112255],{}," file that was renamed to ",[30,139419,26542],{},", produces this error.",[23,139422,{"className":126,"code":139423,"language":47,"meta":28,"style":28},"# no extra install needed\nfrom pathlib import Path\n\npath = Path(\"suspect_file.xlsx\")\nmagic = path.read_bytes()[:4]\nprint(magic.hex())\n# 504b0304 → valid .xlsx (ZIP)\n# d0cf11e0 → actually .xls (OLE2 binary) — use engine=\"xlrd\"\n# anything else → file may be corrupted\n",[30,139425,139426,139430,139440,139444,139458,139471,139478,139483,139488],{"__ignoreMap":28},[33,139427,139428],{"class":35,"line":36},[33,139429,138355],{"class":39},[33,139431,139432,139434,139436,139438],{"class":35,"line":43},[33,139433,190],{"class":163},[33,139435,193],{"class":167},[33,139437,164],{"class":163},[33,139439,198],{"class":167},[33,139441,139442],{"class":35,"line":61},[33,139443,92],{"emptyLinePlaceholder":91},[33,139445,139446,139449,139451,139453,139456],{"class":35,"line":73},[33,139447,139448],{"class":167},"path ",[33,139450,242],{"class":163},[33,139452,215],{"class":167},[33,139454,139455],{"class":54},"\"suspect_file.xlsx\"",[33,139457,221],{"class":167},[33,139459,139460,139463,139465,139467,139469],{"class":35,"line":88},[33,139461,139462],{"class":167},"magic ",[33,139464,242],{"class":163},[33,139466,116349],{"class":167},[33,139468,1503],{"class":50},[33,139470,9202],{"class":167},[33,139472,139473,139475],{"class":35,"line":95},[33,139474,13474],{"class":50},[33,139476,139477],{"class":167},"(magic.hex())\n",[33,139479,139480],{"class":35,"line":101},[33,139481,139482],{"class":39},"# 504b0304 → valid .xlsx (ZIP)\n",[33,139484,139485],{"class":35,"line":171},[33,139486,139487],{"class":39},"# d0cf11e0 → actually .xls (OLE2 binary) — use engine=\"xlrd\"\n",[33,139489,139490],{"class":35,"line":179},[33,139491,139492],{"class":39},"# anything else → file may be 
corrupted\n",[14,139494,139495,139496,139498,139499,139501,139502,139504],{},"If the file is actually ",[30,139497,112255],{}," masquerading as ",[30,139500,26542],{},", change the extension reference and use ",[30,139503,128873],{},". If the file is corrupted, it needs to be re-exported from the source system.",[18,139506,9247],{"id":9246},[14,139508,139509],{},"After applying the fix, confirm that the load succeeds and that column types look correct:",[23,139511,139513],{"className":126,"code":139512,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"report.xlsx\")\n\ntry:\n    df = pd.read_excel(EXCEL_PATH, engine=\"openpyxl\")\n    assert df.shape[0] > 0, \"DataFrame is empty — check sheet name and skiprows\"\n    assert df.shape[1] > 0, \"No columns loaded\"\n    print(f\"OK — {df.shape[0]} rows, {df.shape[1]} columns\")\n    print(df.dtypes)\nexcept AssertionError as e:\n    raise SystemExit(f\"Validation failed: {e}\")\nexcept Exception as e:\n    raise SystemExit(f\"Load error after fix: 
{e}\")\n",[30,139514,139515,139519,139529,139539,139543,139555,139559,139565,139585,139604,139623,139660,139666,139676,139698,139708],{"__ignoreMap":28},[33,139516,139517],{"class":35,"line":36},[33,139518,3952],{"class":39},[33,139520,139521,139523,139525,139527],{"class":35,"line":43},[33,139522,190],{"class":163},[33,139524,193],{"class":167},[33,139526,164],{"class":163},[33,139528,198],{"class":167},[33,139530,139531,139533,139535,139537],{"class":35,"line":61},[33,139532,164],{"class":163},[33,139534,492],{"class":167},[33,139536,495],{"class":163},[33,139538,498],{"class":167},[33,139540,139541],{"class":35,"line":73},[33,139542,92],{"emptyLinePlaceholder":91},[33,139544,139545,139547,139549,139551,139553],{"class":35,"line":88},[33,139546,138592],{"class":50},[33,139548,212],{"class":163},[33,139550,215],{"class":167},[33,139552,128434],{"class":54},[33,139554,221],{"class":167},[33,139556,139557],{"class":35,"line":95},[33,139558,92],{"emptyLinePlaceholder":91},[33,139560,139561,139563],{"class":35,"line":101},[33,139562,35574],{"class":163},[33,139564,574],{"class":167},[33,139566,139567,139569,139571,139573,139575,139577,139579,139581,139583],{"class":35,"line":171},[33,139568,4025],{"class":167},[33,139570,242],{"class":163},[33,139572,126254],{"class":167},[33,139574,138592],{"class":50},[33,139576,365],{"class":167},[33,139578,17351],{"class":238},[33,139580,242],{"class":163},[33,139582,17356],{"class":54},[33,139584,221],{"class":167},[33,139586,139587,139589,139591,139593,139595,139597,139599,139601],{"class":35,"line":179},[33,139588,9228],{"class":163},[33,139590,9516],{"class":167},[33,139592,748],{"class":50},[33,139594,763],{"class":167},[33,139596,6009],{"class":163},[33,139598,10791],{"class":50},[33,139600,365],{"class":167},[33,139602,139603],{"class":54},"\"DataFrame is empty — check sheet name and 
skiprows\"\n",[33,139605,139606,139608,139610,139612,139614,139616,139618,139620],{"class":35,"line":187},[33,139607,9228],{"class":163},[33,139609,9516],{"class":167},[33,139611,734],{"class":50},[33,139613,763],{"class":167},[33,139615,6009],{"class":163},[33,139617,10791],{"class":50},[33,139619,365],{"class":167},[33,139621,139622],{"class":54},"\"No columns loaded\"\n",[33,139624,139625,139627,139629,139631,139634,139636,139638,139640,139642,139644,139646,139648,139650,139652,139654,139656,139658],{"class":35,"line":201},[33,139626,7268],{"class":50},[33,139628,602],{"class":167},[33,139630,4059],{"class":163},[33,139632,139633],{"class":54},"\"OK — ",[33,139635,1115],{"class":50},[33,139637,9541],{"class":167},[33,139639,748],{"class":50},[33,139641,9546],{"class":167},[33,139643,1121],{"class":50},[33,139645,115486],{"class":54},[33,139647,1115],{"class":50},[33,139649,9541],{"class":167},[33,139651,734],{"class":50},[33,139653,9546],{"class":167},[33,139655,1121],{"class":50},[33,139657,115499],{"class":54},[33,139659,221],{"class":167},[33,139661,139662,139664],{"class":35,"line":206},[33,139663,7268],{"class":50},[33,139665,108834],{"class":167},[33,139667,139668,139670,139672,139674],{"class":35,"line":224},[33,139669,35726],{"class":163},[33,139671,9445],{"class":50},[33,139673,1852],{"class":163},[33,139675,7583],{"class":167},[33,139677,139678,139680,139682,139684,139686,139688,139690,139692,139694,139696],{"class":35,"line":229},[33,139679,35742],{"class":163},[33,139681,16617],{"class":50},[33,139683,602],{"class":167},[33,139685,4059],{"class":163},[33,139687,124100],{"class":54},[33,139689,1115],{"class":50},[33,139691,7602],{"class":167},[33,139693,1121],{"class":50},[33,139695,274],{"class":54},[33,139697,221],{"class":167},[33,139699,139700,139702,139704,139706],{"class":35,"line":235},[33,139701,35726],{"class":163},[33,139703,783],{"class":50},[33,139705,1852],{"class":163},[33,139707,7583],{"class":167},[33,139709,139710,139712,139714,139716,
139718,139721,139723,139725,139727,139729],{"class":35,"line":250},[33,139711,35742],{"class":163},[33,139713,16617],{"class":50},[33,139715,602],{"class":167},[33,139717,4059],{"class":163},[33,139719,139720],{"class":54},"\"Load error after fix: ",[33,139722,1115],{"class":50},[33,139724,7602],{"class":167},[33,139726,1121],{"class":50},[33,139728,274],{"class":54},[33,139730,221],{"class":167},[14,139732,139733,139734,139737,139738,139740,139741,139743],{},"A non-empty shape and a complete ",[30,139735,139736],{},"dtypes"," printout without a traceback confirms the engine error is resolved. If the error persists despite installing ",[30,139739,22009],{},", double-check that ",[30,139742,26548],{}," ran in the same Python environment that is executing the script.",[18,139745,6918],{"id":6917},[4211,139747,139748,139753,139763],{},[4214,139749,139750,139752],{},[940,139751,99577],{"href":99576}," — full engine comparison: openpyxl, xlrd, calamine, with decision tree SVG",[4214,139754,139755,139759,139760,139762],{},[940,139756,139758],{"href":139757},"\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Fhow-to-read-excel-with-pandas-step-by-step\u002F","How to Read Excel with Pandas, Step by Step"," — beginner walkthrough of all ",[30,139761,57240],{}," parameters",[4214,139764,139765,139767],{},[940,139766,6936],{"href":6935}," — once files load cleanly, automate report generation",[14,139769,6947,139770,3035],{},[940,139771,99577],{"href":99576},[6953,139773,139774],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html .default .shiki span {color: var(--shiki-default);background: 
var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":139776},[139777,139778,139779,139781,139783,139785,139786,139787,139788],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":138551,"depth":43,"text":139780},"Fix: Install openpyxl and Pass engine=\"openpyxl\"",{"id":138785,"depth":43,"text":139782},"Variant 1: You Have a Genuine .xls File",{"id":139123,"depth":43,"text":139784},"Variant 2: You Explicitly Passed engine=\"xlrd\" in Code",{"id":139226,"depth":43,"text":139227},{"id":139400,"depth":43,"text":139401},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fix xlrd .xlsx Error",{},"\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Ffix-xlrd-error-reading-xlsx-files",{"title":126410,"description":139793},{"Resolve XLRDError":139794},{"Excel xlsx file; not supported and ImportError":139795,"date":6978,"updatedAt":6978,"tags":139796},"Missing optional dependency 'xlrd' when reading .xlsx files with pandas. 
Switch to openpyxl.",[99614,9630,47,22009],"Fix XLRDError and ImportError Reading .xlsx Files with pandas","python-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Ffix-xlrd-error-reading-xlsx-files\u002Findex","JnVFTvJsK3mr3aOdVF7OmT5K3iLVpTL1ESzKUpY7nkw",{"id":139801,"title":139802,"body":139803,"breadcrumbTitle":142382,"canonical":6977,"date":46387,"description":142383,"draft":6980,"extension":6981,"image":6977,"meta":142384,"navigation":91,"path":142385,"robots":6977,"seo":142386,"seoTitle":142387,"stem":142388,"tags":142389,"updatedAt":6978,"__hash__":142390},"content\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Fhow-to-read-excel-with-pandas-step-by-step\u002Findex.md","How to Read Excel with Pandas Step by Step",{"type":7,"value":139804,"toc":142368},[139805,139808,139826,139837,139839,139857,139860,139914,139920,139923,140163,140167,140173,140324,140333,140337,140343,140552,140567,140571,140577,140746,140757,140770,140774,140780,141092,141100,141108,141127,141132,141347,141355,141428,141436,141440,141453,141745,141753,141757,141760,142096,142099,142103,142241,142243,142254,142267,142279,142293,142312,142334,142336,142362,142366],[10,139806,139802],{"id":139807},"how-to-read-excel-with-pandas-step-by-step",[14,139809,139810,139813,139814,139816,139817,139819,139820,139822,139823,139825],{},[30,139811,139812],{},"pd.read_excel()"," has over a dozen parameters that interact in non-obvious ways. Skip ",[30,139815,17351],{}," and you hit ",[30,139818,42237],{},"; ignore ",[30,139821,23262],{}," and numeric IDs silently become floats; miss ",[30,139824,126099],{}," on a report with a title block and your column headers land in the wrong row. 
This walkthrough moves through each decision point in sequence so the first call you write is correct.",[14,139827,139828,139829,139831,139832,139834,139835,3035],{},"For a full reference that covers ",[30,139830,22009],{}," direct access, ",[30,139833,139236],{},", merged cells, and BytesIO loading, see ",[940,139836,99577],{"href":99576},[18,139838,21],{"id":20},[23,139840,139841],{"className":25,"code":128511,"language":27,"meta":28,"style":28},[30,139842,139843,139847],{"__ignoreMap":28},[33,139844,139845],{"class":35,"line":36},[33,139846,3952],{"class":39},[33,139848,139849,139851,139853,139855],{"class":35,"line":43},[33,139850,76],{"class":46},[33,139852,79],{"class":54},[33,139854,16183],{"class":54},[33,139856,95887],{"class":54},[14,139858,139859],{},"Verify the install:",[23,139861,139863],{"className":126,"code":139862,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nimport openpyxl\nprint(\"pandas\", pd.__version__)\nprint(\"openpyxl\", openpyxl.__version__)\n",[30,139864,139865,139869,139879,139885,139899],{"__ignoreMap":28},[33,139866,139867],{"class":35,"line":36},[33,139868,3952],{"class":39},[33,139870,139871,139873,139875,139877],{"class":35,"line":43},[33,139872,164],{"class":163},[33,139874,492],{"class":167},[33,139876,495],{"class":163},[33,139878,498],{"class":167},[33,139880,139881,139883],{"class":35,"line":61},[33,139882,164],{"class":163},[33,139884,95887],{"class":167},[33,139886,139887,139889,139891,139893,139895,139897],{"class":35,"line":73},[33,139888,13474],{"class":50},[33,139890,602],{"class":167},[33,139892,108249],{"class":54},[33,139894,10884],{"class":167},[33,139896,37016],{"class":50},[33,139898,221],{"class":167},[33,139900,139901,139903,139905,139907,139910,139912],{"class":35,"line":88},[33,139902,13474],{"class":50},[33,139904,602],{"class":167},[33,139906,17356],{"class":54},[33,139908,139909],{"class":167},", 
openpyxl.",[33,139911,37016],{"class":50},[33,139913,221],{"class":167},[14,139915,37021,139916,139919],{},[30,139917,139918],{},"ModuleNotFoundError: No module named 'openpyxl'"," after the pip install, your terminal is pointing at a different Python than the one running your scripts. Activate the correct virtualenv and reinstall.",[14,139921,139922],{},"Create a test workbook to follow along with each step:",[23,139924,139926],{"className":126,"code":139925,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nTEST_PATH = Path(\"sample.xlsx\")\nwb = openpyxl.Workbook()\nws = wb.active\nws.title = \"Sales\"\nws.append([\"order_id\", \"customer\", \"amount\", \"order_date\"])\nws.append([1001, \"Alice\", 199.99, \"2026-01-15\"])\nws.append([1002, \"Bob\", 54.50, \"2026-01-16\"])\nws.append([1003, \"Carol\", 320.00, \"2026-01-17\"])\n\n# Second sheet with different data\nws2 = wb.create_sheet(\"Returns\")\nws2.append([\"order_id\", \"reason\", \"refund\"])\nws2.append([1001, \"Wrong size\", 199.99])\nwb.save(TEST_PATH)\nprint(\"Created:\", 
TEST_PATH.resolve())\n",[30,139927,139928,139932,139942,139948,139952,139966,139975,139983,139993,140013,140035,140058,140081,140085,140090,140104,140122,140139,140147],{"__ignoreMap":28},[33,139929,139930],{"class":35,"line":36},[33,139931,98209],{"class":39},[33,139933,139934,139936,139938,139940],{"class":35,"line":43},[33,139935,190],{"class":163},[33,139937,193],{"class":167},[33,139939,164],{"class":163},[33,139941,198],{"class":167},[33,139943,139944,139946],{"class":35,"line":61},[33,139945,164],{"class":163},[33,139947,95887],{"class":167},[33,139949,139950],{"class":35,"line":73},[33,139951,92],{"emptyLinePlaceholder":91},[33,139953,139954,139957,139959,139961,139964],{"class":35,"line":88},[33,139955,139956],{"class":50},"TEST_PATH",[33,139958,212],{"class":163},[33,139960,215],{"class":167},[33,139962,139963],{"class":54},"\"sample.xlsx\"",[33,139965,221],{"class":167},[33,139967,139968,139970,139972],{"class":35,"line":95},[33,139969,98274],{"class":167},[33,139971,242],{"class":163},[33,139973,139974],{"class":167}," openpyxl.Workbook()\n",[33,139976,139977,139979,139981],{"class":35,"line":101},[33,139978,98330],{"class":167},[33,139980,242],{"class":163},[33,139982,99877],{"class":167},[33,139984,139985,139988,139990],{"class":35,"line":171},[33,139986,139987],{"class":167},"ws.title ",[33,139989,242],{"class":163},[33,139991,139992],{"class":54}," 
\"Sales\"\n",[33,139994,139995,139997,139999,140001,140003,140005,140007,140009,140011],{"class":35,"line":179},[33,139996,100864],{"class":167},[33,139998,108849],{"class":54},[33,140000,365],{"class":167},[33,140002,59673],{"class":54},[33,140004,365],{"class":167},[33,140006,4106],{"class":54},[33,140008,365],{"class":167},[33,140010,108767],{"class":54},[33,140012,751],{"class":167},[33,140014,140015,140017,140019,140021,140024,140026,140029,140031,140033],{"class":35,"line":187},[33,140016,100864],{"class":167},[33,140018,120342],{"class":50},[33,140020,365],{"class":167},[33,140022,140023],{"class":54},"\"Alice\"",[33,140025,365],{"class":167},[33,140027,140028],{"class":50},"199.99",[33,140030,365],{"class":167},[33,140032,12407],{"class":54},[33,140034,751],{"class":167},[33,140036,140037,140039,140041,140043,140046,140048,140051,140053,140056],{"class":35,"line":201},[33,140038,100864],{"class":167},[33,140040,120347],{"class":50},[33,140042,365],{"class":167},[33,140044,140045],{"class":54},"\"Bob\"",[33,140047,365],{"class":167},[33,140049,140050],{"class":50},"54.50",[33,140052,365],{"class":167},[33,140054,140055],{"class":54},"\"2026-01-16\"",[33,140057,751],{"class":167},[33,140059,140060,140062,140064,140066,140069,140071,140074,140076,140079],{"class":35,"line":206},[33,140061,100864],{"class":167},[33,140063,120352],{"class":50},[33,140065,365],{"class":167},[33,140067,140068],{"class":54},"\"Carol\"",[33,140070,365],{"class":167},[33,140072,140073],{"class":50},"320.00",[33,140075,365],{"class":167},[33,140077,140078],{"class":54},"\"2026-01-17\"",[33,140080,751],{"class":167},[33,140082,140083],{"class":35,"line":224},[33,140084,92],{"emptyLinePlaceholder":91},[33,140086,140087],{"class":35,"line":229},[33,140088,140089],{"class":39},"# Second sheet with different data\n",[33,140091,140092,140095,140097,140099,140102],{"class":35,"line":235},[33,140093,140094],{"class":167},"ws2 
",[33,140096,242],{"class":163},[33,140098,100851],{"class":167},[33,140100,140101],{"class":54},"\"Returns\"",[33,140103,221],{"class":167},[33,140105,140106,140109,140111,140113,140116,140118,140120],{"class":35,"line":250},[33,140107,140108],{"class":167},"ws2.append([",[33,140110,108849],{"class":54},[33,140112,365],{"class":167},[33,140114,140115],{"class":54},"\"reason\"",[33,140117,365],{"class":167},[33,140119,131392],{"class":54},[33,140121,751],{"class":167},[33,140123,140124,140126,140128,140130,140133,140135,140137],{"class":35,"line":266},[33,140125,140108],{"class":167},[33,140127,120342],{"class":50},[33,140129,365],{"class":167},[33,140131,140132],{"class":54},"\"Wrong size\"",[33,140134,365],{"class":167},[33,140136,140028],{"class":50},[33,140138,751],{"class":167},[33,140140,140141,140143,140145],{"class":35,"line":290},[33,140142,100907],{"class":167},[33,140144,139956],{"class":50},[33,140146,221],{"class":167},[33,140148,140149,140151,140153,140156,140158,140160],{"class":35,"line":295},[33,140150,13474],{"class":50},[33,140152,602],{"class":167},[33,140154,140155],{"class":54},"\"Created:\"",[33,140157,365],{"class":167},[33,140159,139956],{"class":50},[33,140161,140162],{"class":167},".resolve())\n",[18,140164,140166],{"id":140165},"step-1-discover-sheet-names","Step 1: Discover Sheet Names",[14,140168,140169,140170,140172],{},"Before reading data, confirm what sheets exist. 
A ",[30,140171,8377],{}," caused by a sheet name that has an invisible trailing space is a surprisingly common failure in production:",[23,140174,140176],{"className":126,"code":140175,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"sample.xlsx\")\n\ntry:\n    xl = pd.ExcelFile(EXCEL_PATH, engine=\"openpyxl\")\n    print(\"Sheets:\", xl.sheet_names)     # ['Sales', 'Returns']\nexcept FileNotFoundError:\n    raise SystemExit(f\"File not found: {EXCEL_PATH}\")\nexcept Exception as e:\n    raise SystemExit(f\"Cannot open: {e}\")\n",[30,140177,140178,140182,140192,140202,140206,140218,140222,140228,140250,140265,140273,140291,140301],{"__ignoreMap":28},[33,140179,140180],{"class":35,"line":36},[33,140181,3952],{"class":39},[33,140183,140184,140186,140188,140190],{"class":35,"line":43},[33,140185,190],{"class":163},[33,140187,193],{"class":167},[33,140189,164],{"class":163},[33,140191,198],{"class":167},[33,140193,140194,140196,140198,140200],{"class":35,"line":61},[33,140195,164],{"class":163},[33,140197,492],{"class":167},[33,140199,495],{"class":163},[33,140201,498],{"class":167},[33,140203,140204],{"class":35,"line":73},[33,140205,92],{"emptyLinePlaceholder":91},[33,140207,140208,140210,140212,140214,140216],{"class":35,"line":88},[33,140209,138592],{"class":50},[33,140211,212],{"class":163},[33,140213,215],{"class":167},[33,140215,139963],{"class":54},[33,140217,221],{"class":167},[33,140219,140220],{"class":35,"line":95},[33,140221,92],{"emptyLinePlaceholder":91},[33,140223,140224,140226],{"class":35,"line":101},[33,140225,35574],{"class":163},[33,140227,574],{"class":167},[33,140229,140230,140233,140235,140238,140240,140242,140244,140246,140248],{"class":35,"line":171},[33,140231,140232],{"class":167},"    xl ",[33,140234,242],{"class":163},[33,140236,140237],{"class":167}," 
pd.ExcelFile(",[33,140239,138592],{"class":50},[33,140241,365],{"class":167},[33,140243,17351],{"class":238},[33,140245,242],{"class":163},[33,140247,17356],{"class":54},[33,140249,221],{"class":167},[33,140251,140252,140254,140256,140259,140262],{"class":35,"line":179},[33,140253,7268],{"class":50},[33,140255,602],{"class":167},[33,140257,140258],{"class":54},"\"Sheets:\"",[33,140260,140261],{"class":167},", xl.sheet_names)     ",[33,140263,140264],{"class":39},"# ['Sales', 'Returns']\n",[33,140266,140267,140269,140271],{"class":35,"line":187},[33,140268,35726],{"class":163},[33,140270,2945],{"class":50},[33,140272,574],{"class":167},[33,140274,140275,140277,140279,140281,140283,140285,140287,140289],{"class":35,"line":201},[33,140276,35742],{"class":163},[33,140278,16617],{"class":50},[33,140280,602],{"class":167},[33,140282,4059],{"class":163},[33,140284,15677],{"class":54},[33,140286,138680],{"class":50},[33,140288,274],{"class":54},[33,140290,221],{"class":167},[33,140292,140293,140295,140297,140299],{"class":35,"line":206},[33,140294,35726],{"class":163},[33,140296,783],{"class":50},[33,140298,1852],{"class":163},[33,140300,7583],{"class":167},[33,140302,140303,140305,140307,140309,140311,140314,140316,140318,140320,140322],{"class":35,"line":224},[33,140304,35742],{"class":163},[33,140306,16617],{"class":50},[33,140308,602],{"class":167},[33,140310,4059],{"class":163},[33,140312,140313],{"class":54},"\"Cannot open: ",[33,140315,1115],{"class":50},[33,140317,7602],{"class":167},[33,140319,1121],{"class":50},[33,140321,274],{"class":54},[33,140323,221],{"class":167},[14,140325,140326,140329,140330,140332],{},[30,140327,140328],{},"pd.ExcelFile"," opens the workbook container without parsing any sheet data. 
It is cheap even on large files and gives you the exact sheet names as Python strings, including any whitespace that would cause a ",[30,140331,8377],{}," if you typed the name manually.",[18,140334,140336],{"id":140335},"step-2-load-a-single-sheet","Step 2: Load a Single Sheet",[14,140338,140339,140340,140342],{},"The minimum correct call for a ",[30,140341,26542],{}," file:",[23,140344,140346],{"className":126,"code":140345,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"sample.xlsx\")\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        sheet_name=\"Sales\",    # target sheet by exact name\n        engine=\"openpyxl\",     # required for .xlsx — never omit\n    )\n    print(df.shape)            # (3, 4)\n    print(df.head())\nexcept FileNotFoundError:\n    raise SystemExit(f\"File not found: {EXCEL_PATH}\")\nexcept ImportError as e:\n    raise SystemExit(f\"Engine missing — pip install openpyxl: {e}\")\nexcept KeyError as e:\n    raise SystemExit(f\"Sheet not found: 
{e}\")\n",[30,140347,140348,140352,140362,140372,140376,140388,140392,140398,140406,140412,140426,140439,140443,140453,140459,140467,140485,140495,140518,140529],{"__ignoreMap":28},[33,140349,140350],{"class":35,"line":36},[33,140351,3952],{"class":39},[33,140353,140354,140356,140358,140360],{"class":35,"line":43},[33,140355,190],{"class":163},[33,140357,193],{"class":167},[33,140359,164],{"class":163},[33,140361,198],{"class":167},[33,140363,140364,140366,140368,140370],{"class":35,"line":61},[33,140365,164],{"class":163},[33,140367,492],{"class":167},[33,140369,495],{"class":163},[33,140371,498],{"class":167},[33,140373,140374],{"class":35,"line":73},[33,140375,92],{"emptyLinePlaceholder":91},[33,140377,140378,140380,140382,140384,140386],{"class":35,"line":88},[33,140379,138592],{"class":50},[33,140381,212],{"class":163},[33,140383,215],{"class":167},[33,140385,139963],{"class":54},[33,140387,221],{"class":167},[33,140389,140390],{"class":35,"line":95},[33,140391,92],{"emptyLinePlaceholder":91},[33,140393,140394,140396],{"class":35,"line":101},[33,140395,35574],{"class":163},[33,140397,574],{"class":167},[33,140399,140400,140402,140404],{"class":35,"line":171},[33,140401,4025],{"class":167},[33,140403,242],{"class":163},[33,140405,126171],{"class":167},[33,140407,140408,140410],{"class":35,"line":179},[33,140409,138626],{"class":50},[33,140411,247],{"class":167},[33,140413,140414,140416,140418,140421,140423],{"class":35,"line":187},[33,140415,126183],{"class":238},[33,140417,242],{"class":163},[33,140419,140420],{"class":54},"\"Sales\"",[33,140422,38342],{"class":167},[33,140424,140425],{"class":39},"# target sheet by exact name\n",[33,140427,140428,140430,140432,140434,140436],{"class":35,"line":201},[33,140429,111493],{"class":238},[33,140431,242],{"class":163},[33,140433,17356],{"class":54},[33,140435,25539],{"class":167},[33,140437,140438],{"class":39},"# required for .xlsx — never 
omit\n",[33,140440,140441],{"class":35,"line":206},[33,140442,1202],{"class":167},[33,140444,140445,140447,140450],{"class":35,"line":224},[33,140446,7268],{"class":50},[33,140448,140449],{"class":167},"(df.shape)            ",[33,140451,140452],{"class":39},"# (3, 4)\n",[33,140454,140455,140457],{"class":35,"line":229},[33,140456,7268],{"class":50},[33,140458,13311],{"class":167},[33,140460,140461,140463,140465],{"class":35,"line":235},[33,140462,35726],{"class":163},[33,140464,2945],{"class":50},[33,140466,574],{"class":167},[33,140468,140469,140471,140473,140475,140477,140479,140481,140483],{"class":35,"line":250},[33,140470,35742],{"class":163},[33,140472,16617],{"class":50},[33,140474,602],{"class":167},[33,140476,4059],{"class":163},[33,140478,15677],{"class":54},[33,140480,138680],{"class":50},[33,140482,274],{"class":54},[33,140484,221],{"class":167},[33,140486,140487,140489,140491,140493],{"class":35,"line":266},[33,140488,35726],{"class":163},[33,140490,40488],{"class":50},[33,140492,1852],{"class":163},[33,140494,7583],{"class":167},[33,140496,140497,140499,140501,140503,140505,140508,140510,140512,140514,140516],{"class":35,"line":290},[33,140498,35742],{"class":163},[33,140500,16617],{"class":50},[33,140502,602],{"class":167},[33,140504,4059],{"class":163},[33,140506,140507],{"class":54},"\"Engine missing — pip install openpyxl: ",[33,140509,1115],{"class":50},[33,140511,7602],{"class":167},[33,140513,1121],{"class":50},[33,140515,274],{"class":54},[33,140517,221],{"class":167},[33,140519,140520,140522,140525,140527],{"class":35,"line":295},[33,140521,35726],{"class":163},[33,140523,140524],{"class":50}," KeyError",[33,140526,1852],{"class":163},[33,140528,7583],{"class":167},[33,140530,140531,140533,140535,140537,140539,140542,140544,140546,140548,140550],{"class":35,"line":300},[33,140532,35742],{"class":163},[33,140534,16617],{"class":50},[33,140536,602],{"class":167},[33,140538,4059],{"class":163},[33,140540,140541],{"class":54},"\"Sheet not found: 
",[33,140543,1115],{"class":50},[33,140545,7602],{"class":167},[33,140547,1121],{"class":50},[33,140549,274],{"class":54},[33,140551,221],{"class":167},[14,140553,39550,140554,140556,140557,140559,140560,42238,140562,140564,140565,104942],{},[30,140555,22395],{}," argument is not optional in production code. Without it, pandas infers the backend from the file extension; on some system configurations this picks ",[30,140558,125595],{},", which raises ",[30,140561,128863],{},[30,140563,26542],{}," files. See ",[940,140566,126410],{"href":126409},[18,140568,140570],{"id":140569},"step-3-handle-reports-with-non-zero-header-rows","Step 3: Handle Reports with Non-Zero Header Rows",[14,140572,140573,140574,140576],{},"Reports from accounting software, ERP systems, and generated exports often have a company name, print date, filter summary, or logo block above the actual data table. Use ",[30,140575,126099],{}," to remove those rows before pandas identifies the header:",[23,140578,140580],{"className":126,"code":140579,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"monthly_report.xlsx\")\n\ntry:\n    # Rows 0–2 are a title block; row 3 is the real column header\n    df = pd.read_excel(\n        EXCEL_PATH,\n        skiprows=3,        # discard rows 0, 1, 2\n        header=0,          # the next row (original row 3) is now row 0 and becomes the header\n        skipfooter=2,      # ignore the last 2 rows (totals lines)\n        engine=\"openpyxl\",\n    )\n    print(df.columns.tolist())\nexcept Exception as e:\n    raise SystemExit(f\"Read error: 
{e}\")\n",[30,140581,140582,140586,140596,140606,140610,140623,140627,140633,140638,140646,140652,140666,140679,140693,140703,140707,140714,140724],{"__ignoreMap":28},[33,140583,140584],{"class":35,"line":36},[33,140585,3952],{"class":39},[33,140587,140588,140590,140592,140594],{"class":35,"line":43},[33,140589,190],{"class":163},[33,140591,193],{"class":167},[33,140593,164],{"class":163},[33,140595,198],{"class":167},[33,140597,140598,140600,140602,140604],{"class":35,"line":61},[33,140599,164],{"class":163},[33,140601,492],{"class":167},[33,140603,495],{"class":163},[33,140605,498],{"class":167},[33,140607,140608],{"class":35,"line":73},[33,140609,92],{"emptyLinePlaceholder":91},[33,140611,140612,140614,140616,140618,140621],{"class":35,"line":88},[33,140613,138592],{"class":50},[33,140615,212],{"class":163},[33,140617,215],{"class":167},[33,140619,140620],{"class":54},"\"monthly_report.xlsx\"",[33,140622,221],{"class":167},[33,140624,140625],{"class":35,"line":95},[33,140626,92],{"emptyLinePlaceholder":91},[33,140628,140629,140631],{"class":35,"line":101},[33,140630,35574],{"class":163},[33,140632,574],{"class":167},[33,140634,140635],{"class":35,"line":171},[33,140636,140637],{"class":39},"    # Rows 0–2 are a title block; row 3 is the real column header\n",[33,140639,140640,140642,140644],{"class":35,"line":179},[33,140641,4025],{"class":167},[33,140643,242],{"class":163},[33,140645,126171],{"class":167},[33,140647,140648,140650],{"class":35,"line":187},[33,140649,138626],{"class":50},[33,140651,247],{"class":167},[33,140653,140654,140657,140659,140661,140663],{"class":35,"line":201},[33,140655,140656],{"class":238},"        skiprows",[33,140658,242],{"class":163},[33,140660,10258],{"class":50},[33,140662,89262],{"class":167},[33,140664,140665],{"class":39},"# discard rows 0, 1, 
2\n",[33,140667,140668,140670,140672,140674,140676],{"class":35,"line":206},[33,140669,126205],{"class":238},[33,140671,242],{"class":163},[33,140673,748],{"class":50},[33,140675,98374],{"class":167},[33,140677,140678],{"class":39},"# the next row (original row 3) is now row 0 and becomes the header\n",[33,140680,140681,140684,140686,140688,140690],{"class":35,"line":224},[33,140682,140683],{"class":238},"        skipfooter",[33,140685,242],{"class":163},[33,140687,1533],{"class":50},[33,140689,121141],{"class":167},[33,140691,140692],{"class":39},"# ignore the last 2 rows (totals lines)\n",[33,140694,140695,140697,140699,140701],{"class":35,"line":229},[33,140696,111493],{"class":238},[33,140698,242],{"class":163},[33,140700,17356],{"class":54},[33,140702,247],{"class":167},[33,140704,140705],{"class":35,"line":235},[33,140706,1202],{"class":167},[33,140708,140709,140711],{"class":35,"line":250},[33,140710,7268],{"class":50},[33,140712,140713],{"class":167},"(df.columns.tolist())\n",[33,140715,140716,140718,140720,140722],{"class":35,"line":266},[33,140717,35726],{"class":163},[33,140719,783],{"class":50},[33,140721,1852],{"class":163},[33,140723,7583],{"class":167},[33,140725,140726,140728,140730,140732,140734,140736,140738,140740,140742,140744],{"class":35,"line":290},[33,140727,35742],{"class":163},[33,140729,16617],{"class":50},[33,140731,602],{"class":167},[33,140733,4059],{"class":163},[33,140735,138740],{"class":54},[33,140737,1115],{"class":50},[33,140739,7602],{"class":167},[33,140741,1121],{"class":50},[33,140743,274],{"class":54},[33,140745,221],{"class":167},[14,140747,140748,140749,140752,140753,140756],{},"To skip specific non-contiguous rows — e.g., row 0 contains a logo, rows 3 and 4 are blank sub-headers — pass a list: ",[30,140750,140751],{},"skiprows=[0, 3, 4]",". 
Use ",[30,140754,140755],{},"nrows=10"," in a preview call first to count the exact rows to skip.",[14,140758,12951,140759,140762,140763,365,140765,365,140767,140769],{},[30,140760,140761],{},"header=None"," is passed instead, pandas treats every row as data and assigns integer column indices (",[30,140764,748],{},[30,140766,734],{},[30,140768,1533],{},", …). This is useful when you want to handle the header row manually — for example, to forward-fill merged cell labels.",[18,140771,140773],{"id":140772},"step-4-select-only-the-columns-you-need","Step 4: Select Only the Columns You Need",[14,140775,140776,140777,140779],{},"Wide workbooks exported from ERP or BI systems often contain 40–80 columns. Loading all of them when you need 5 wastes memory and slows type inference. ",[30,140778,21904],{}," restricts what is read from disk:",[23,140781,140783],{"className":126,"code":140782,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"large_export.xlsx\")\n\ntry:\n    # Option A: column names — most readable and maintainable\n    df = pd.read_excel(\n        EXCEL_PATH,\n        usecols=[\"order_id\", \"customer\", \"amount\", \"order_date\"],\n        engine=\"openpyxl\",\n    )\n\n    # Option B: Excel letter range — useful when column names are unknown\n    df_range = pd.read_excel(EXCEL_PATH, usecols=\"A:D\", engine=\"openpyxl\")\n\n    # Option C: callable — keep columns matching a pattern\n    df_q = pd.read_excel(\n        EXCEL_PATH,\n        usecols=lambda c: str(c).startswith(\"Revenue\"),\n        engine=\"openpyxl\",\n    )\n\n    print(f\"Loaded {df.shape[1]} columns, {df.shape[0]} rows\")\n    print(\"Memory:\", df.memory_usage(deep=True).sum() \u002F\u002F 1024, \"KB\")\nexcept Exception as e:\n    raise SystemExit(f\"Read error: 
{e}\")\n",[30,140784,140785,140789,140799,140809,140813,140826,140830,140836,140841,140849,140855,140880,140890,140894,140898,140903,140933,140937,140942,140951,140957,140974,140984,140988,140992,141029,141060,141070],{"__ignoreMap":28},[33,140786,140787],{"class":35,"line":36},[33,140788,3952],{"class":39},[33,140790,140791,140793,140795,140797],{"class":35,"line":43},[33,140792,190],{"class":163},[33,140794,193],{"class":167},[33,140796,164],{"class":163},[33,140798,198],{"class":167},[33,140800,140801,140803,140805,140807],{"class":35,"line":61},[33,140802,164],{"class":163},[33,140804,492],{"class":167},[33,140806,495],{"class":163},[33,140808,498],{"class":167},[33,140810,140811],{"class":35,"line":73},[33,140812,92],{"emptyLinePlaceholder":91},[33,140814,140815,140817,140819,140821,140824],{"class":35,"line":88},[33,140816,138592],{"class":50},[33,140818,212],{"class":163},[33,140820,215],{"class":167},[33,140822,140823],{"class":54},"\"large_export.xlsx\"",[33,140825,221],{"class":167},[33,140827,140828],{"class":35,"line":95},[33,140829,92],{"emptyLinePlaceholder":91},[33,140831,140832,140834],{"class":35,"line":101},[33,140833,35574],{"class":163},[33,140835,574],{"class":167},[33,140837,140838],{"class":35,"line":171},[33,140839,140840],{"class":39},"    # Option A: column names — most readable and maintainable\n",[33,140842,140843,140845,140847],{"class":35,"line":179},[33,140844,4025],{"class":167},[33,140846,242],{"class":163},[33,140848,126171],{"class":167},[33,140850,140851,140853],{"class":35,"line":187},[33,140852,138626],{"class":50},[33,140854,247],{"class":167},[33,140856,140857,140860,140862,140864,140866,140868,140870,140872,140874,140876,140878],{"class":35,"line":201},[33,140858,140859],{"class":238},"        
usecols",[33,140861,242],{"class":163},[33,140863,8309],{"class":167},[33,140865,108849],{"class":54},[33,140867,365],{"class":167},[33,140869,59673],{"class":54},[33,140871,365],{"class":167},[33,140873,4106],{"class":54},[33,140875,365],{"class":167},[33,140877,108767],{"class":54},[33,140879,8935],{"class":167},[33,140881,140882,140884,140886,140888],{"class":35,"line":206},[33,140883,111493],{"class":238},[33,140885,242],{"class":163},[33,140887,17356],{"class":54},[33,140889,247],{"class":167},[33,140891,140892],{"class":35,"line":224},[33,140893,1202],{"class":167},[33,140895,140896],{"class":35,"line":229},[33,140897,92],{"emptyLinePlaceholder":91},[33,140899,140900],{"class":35,"line":235},[33,140901,140902],{"class":39},"    # Option B: Excel letter range — useful when column names are unknown\n",[33,140904,140905,140908,140910,140912,140914,140916,140918,140920,140923,140925,140927,140929,140931],{"class":35,"line":250},[33,140906,140907],{"class":167},"    df_range ",[33,140909,242],{"class":163},[33,140911,126254],{"class":167},[33,140913,138592],{"class":50},[33,140915,365],{"class":167},[33,140917,21904],{"class":238},[33,140919,242],{"class":163},[33,140921,140922],{"class":54},"\"A:D\"",[33,140924,365],{"class":167},[33,140926,17351],{"class":238},[33,140928,242],{"class":163},[33,140930,17356],{"class":54},[33,140932,221],{"class":167},[33,140934,140935],{"class":35,"line":266},[33,140936,92],{"emptyLinePlaceholder":91},[33,140938,140939],{"class":35,"line":290},[33,140940,140941],{"class":39},"    # Option C: callable — keep columns matching a pattern\n",[33,140943,140944,140947,140949],{"class":35,"line":295},[33,140945,140946],{"class":167},"    df_q 
",[33,140948,242],{"class":163},[33,140950,126171],{"class":167},[33,140952,140953,140955],{"class":35,"line":300},[33,140954,138626],{"class":50},[33,140956,247],{"class":167},[33,140958,140959,140961,140963,140966,140968,140970,140972],{"class":35,"line":317},[33,140960,140859],{"class":238},[33,140962,44117],{"class":163},[33,140964,140965],{"class":167}," c: ",[33,140967,1053],{"class":50},[33,140969,118939],{"class":167},[33,140971,12925],{"class":54},[33,140973,1506],{"class":167},[33,140975,140976,140978,140980,140982],{"class":35,"line":332},[33,140977,111493],{"class":238},[33,140979,242],{"class":163},[33,140981,17356],{"class":54},[33,140983,247],{"class":167},[33,140985,140986],{"class":35,"line":347},[33,140987,1202],{"class":167},[33,140989,140990],{"class":35,"line":374},[33,140991,92],{"emptyLinePlaceholder":91},[33,140993,140994,140996,140998,141000,141002,141004,141006,141008,141010,141012,141015,141017,141019,141021,141023,141025,141027],{"class":35,"line":397},[33,140995,7268],{"class":50},[33,140997,602],{"class":167},[33,140999,4059],{"class":163},[33,141001,96187],{"class":54},[33,141003,1115],{"class":50},[33,141005,9541],{"class":167},[33,141007,734],{"class":50},[33,141009,9546],{"class":167},[33,141011,1121],{"class":50},[33,141013,141014],{"class":54}," columns, ",[33,141016,1115],{"class":50},[33,141018,9541],{"class":167},[33,141020,748],{"class":50},[33,141022,9546],{"class":167},[33,141024,1121],{"class":50},[33,141026,65937],{"class":54},[33,141028,221],{"class":167},[33,141030,141031,141033,141035,141038,141041,141043,141045,141047,141049,141051,141053,141055,141058],{"class":35,"line":653},[33,141032,7268],{"class":50},[33,141034,602],{"class":167},[33,141036,141037],{"class":54},"\"Memory:\"",[33,141039,141040],{"class":167},", 
df.memory_usage(",[33,141042,115520],{"class":238},[33,141044,242],{"class":163},[33,141046,855],{"class":50},[33,141048,115527],{"class":167},[33,141050,74328],{"class":163},[33,141052,1159],{"class":50},[33,141054,365],{"class":167},[33,141056,141057],{"class":54},"\"KB\"",[33,141059,221],{"class":167},[33,141061,141062,141064,141066,141068],{"class":35,"line":667},[33,141063,35726],{"class":163},[33,141065,783],{"class":50},[33,141067,1852],{"class":163},[33,141069,7583],{"class":167},[33,141071,141072,141074,141076,141078,141080,141082,141084,141086,141088,141090],{"class":35,"line":675},[33,141073,35742],{"class":163},[33,141075,16617],{"class":50},[33,141077,602],{"class":167},[33,141079,4059],{"class":163},[33,141081,138740],{"class":54},[33,141083,1115],{"class":50},[33,141085,7602],{"class":167},[33,141087,1121],{"class":50},[33,141089,274],{"class":54},[33,141091,221],{"class":167},[14,141093,12951,141094,141096,141097,3035],{},[30,141095,21904],{}," is a list of names, those names must match the header row exactly — case-sensitive, including any whitespace in the original file. Strip column names after load if you suspect padding: ",[30,141098,141099],{},"df.columns = df.columns.str.strip()",[18,141101,141103,141104,10065,141106],{"id":141102},"step-5-fix-type-inference-with-dtype-and-parse_dates","Step 5: Fix Type Inference with ",[30,141105,23262],{},[30,141107,102641],{},[14,141109,141110,141111,141114,141115,141117,141118,141121,141122,141124,141125,3035],{},"Pandas type inference makes two predictable mistakes on Excel data. First, columns that look numeric to the sampler (but contain IDs like ",[30,141112,141113],{},"001234",") get cast to ",[30,141116,102445],{},", adding ",[30,141119,141120],{},".0"," and dropping leading zeros. 
Second, date columns stored as strings or formatted text cells stay as ",[30,141123,11888],{}," dtype rather than ",[30,141126,130985],{},[14,141128,141129,141130,57922],{},"Fix both issues at the ",[30,141131,57240],{},[23,141133,141135],{"className":126,"code":141134,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"sample.xlsx\")\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        sheet_name=\"Sales\",\n        dtype={\n            \"order_id\": str,       # keep as \"1001\", not 1001.0\n            \"customer\": str,\n            \"amount\": float,\n        },\n        parse_dates=[\"order_date\"],    # coerce to datetime64[ns]\n        engine=\"openpyxl\",\n    )\n    print(df.dtypes)\n    # order_id       object\n    # customer       object\n    # amount        float64\n    # order_date    datetime64[ns]\nexcept Exception as e:\n    raise SystemExit(f\"Read error: {e}\")\n",[30,141136,141137,141141,141151,141161,141165,141177,141181,141187,141195,141201,141211,141219,141233,141244,141255,141260,141275,141285,141289,141295,141300,141305,141310,141315,141325],{"__ignoreMap":28},[33,141138,141139],{"class":35,"line":36},[33,141140,3952],{"class":39},[33,141142,141143,141145,141147,141149],{"class":35,"line":43},[33,141144,190],{"class":163},[33,141146,193],{"class":167},[33,141148,164],{"class":163},[33,141150,198],{"class":167},[33,141152,141153,141155,141157,141159],{"class":35,"line":61},[33,141154,164],{"class":163},[33,141156,492],{"class":167},[33,141158,495],{"class":163},[33,141160,498],{"class":167},[33,141162,141163],{"class":35,"line":73},[33,141164,92],{"emptyLinePlaceholder":91},[33,141166,141167,141169,141171,141173,141175],{"class":35,"line":88},[33,141168,138592],{"class":50},[33,141170,212],{"class":163},[33,141172,215],{"class":167},[33,141174,139963],{"class":54},[33,141176,221],{"class":167},[33,141178,141179],{"class":35,"line":95},[33,141180,92]
,{"emptyLinePlaceholder":91},[33,141182,141183,141185],{"class":35,"line":101},[33,141184,35574],{"class":163},[33,141186,574],{"class":167},[33,141188,141189,141191,141193],{"class":35,"line":171},[33,141190,4025],{"class":167},[33,141192,242],{"class":163},[33,141194,126171],{"class":167},[33,141196,141197,141199],{"class":35,"line":179},[33,141198,138626],{"class":50},[33,141200,247],{"class":167},[33,141202,141203,141205,141207,141209],{"class":35,"line":187},[33,141204,126183],{"class":238},[33,141206,242],{"class":163},[33,141208,140420],{"class":54},[33,141210,247],{"class":167},[33,141212,141213,141215,141217],{"class":35,"line":201},[33,141214,125888],{"class":238},[33,141216,242],{"class":163},[33,141218,10225],{"class":167},[33,141220,141221,141224,141226,141228,141230],{"class":35,"line":206},[33,141222,141223],{"class":54},"            \"order_id\"",[33,141225,2079],{"class":167},[33,141227,1053],{"class":50},[33,141229,25445],{"class":167},[33,141231,141232],{"class":39},"# keep as \"1001\", not 1001.0\n",[33,141234,141235,141238,141240,141242],{"class":35,"line":224},[33,141236,141237],{"class":54},"            \"customer\"",[33,141239,2079],{"class":167},[33,141241,1053],{"class":50},[33,141243,247],{"class":167},[33,141245,141246,141249,141251,141253],{"class":35,"line":229},[33,141247,141248],{"class":54},"            \"amount\"",[33,141250,2079],{"class":167},[33,141252,1720],{"class":50},[33,141254,247],{"class":167},[33,141256,141257],{"class":35,"line":235},[33,141258,141259],{"class":167},"        },\n",[33,141261,141262,141264,141266,141268,141270,141272],{"class":35,"line":250},[33,141263,108760],{"class":238},[33,141265,242],{"class":163},[33,141267,8309],{"class":167},[33,141269,108767],{"class":54},[33,141271,86960],{"class":167},[33,141273,141274],{"class":39},"# coerce to 
datetime64[ns]\n",[33,141276,141277,141279,141281,141283],{"class":35,"line":266},[33,141278,111493],{"class":238},[33,141280,242],{"class":163},[33,141282,17356],{"class":54},[33,141284,247],{"class":167},[33,141286,141287],{"class":35,"line":290},[33,141288,1202],{"class":167},[33,141290,141291,141293],{"class":35,"line":295},[33,141292,7268],{"class":50},[33,141294,108834],{"class":167},[33,141296,141297],{"class":35,"line":300},[33,141298,141299],{"class":39},"    # order_id       object\n",[33,141301,141302],{"class":35,"line":317},[33,141303,141304],{"class":39},"    # customer       object\n",[33,141306,141307],{"class":35,"line":332},[33,141308,141309],{"class":39},"    # amount        float64\n",[33,141311,141312],{"class":35,"line":347},[33,141313,141314],{"class":39},"    # order_date    datetime64[ns]\n",[33,141316,141317,141319,141321,141323],{"class":35,"line":374},[33,141318,35726],{"class":163},[33,141320,783],{"class":50},[33,141322,1852],{"class":163},[33,141324,7583],{"class":167},[33,141326,141327,141329,141331,141333,141335,141337,141339,141341,141343,141345],{"class":35,"line":397},[33,141328,35742],{"class":163},[33,141330,16617],{"class":50},[33,141332,602],{"class":167},[33,141334,4059],{"class":163},[33,141336,138740],{"class":54},[33,141338,1115],{"class":50},[33,141340,7602],{"class":167},[33,141342,1121],{"class":50},[33,141344,274],{"class":54},[33,141346,221],{"class":167},[14,141348,41963,141349,141351,141352,141354],{},[30,141350,102641],{}," fails silently and the column stays ",[30,141353,11888],{},", the cells contain text strings rather than Excel date serials. 
Use this fallback:",[23,141356,141358],{"className":126,"code":141357,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n# After loading df above\ndf[\"order_date\"] = pd.to_datetime(df[\"order_date\"], errors=\"coerce\")\n# errors=\"coerce\" inserts NaT for values that cannot be parsed\nprint(df[\"order_date\"].isna().sum(), \"unparseable dates\")\n",[30,141359,141360,141364,141374,141378,141383,141407,141412],{"__ignoreMap":28},[33,141361,141362],{"class":35,"line":36},[33,141363,8895],{"class":39},[33,141365,141366,141368,141370,141372],{"class":35,"line":43},[33,141367,164],{"class":163},[33,141369,492],{"class":167},[33,141371,495],{"class":163},[33,141373,498],{"class":167},[33,141375,141376],{"class":35,"line":61},[33,141377,92],{"emptyLinePlaceholder":91},[33,141379,141380],{"class":35,"line":73},[33,141381,141382],{"class":39},"# After loading df above\n",[33,141384,141385,141387,141389,141391,141393,141395,141397,141399,141401,141403,141405],{"class":35,"line":88},[33,141386,11038],{"class":167},[33,141388,108767],{"class":54},[33,141390,763],{"class":167},[33,141392,242],{"class":163},[33,141394,27668],{"class":167},[33,141396,108767],{"class":54},[33,141398,8314],{"class":167},[33,141400,8317],{"class":238},[33,141402,242],{"class":163},[33,141404,12107],{"class":54},[33,141406,221],{"class":167},[33,141408,141409],{"class":35,"line":95},[33,141410,141411],{"class":39},"# errors=\"coerce\" inserts NaT for values that cannot be parsed\n",[33,141413,141414,141416,141418,141420,141423,141426],{"class":35,"line":101},[33,141415,13474],{"class":50},[33,141417,127012],{"class":167},[33,141419,108767],{"class":54},[33,141421,141422],{"class":167},"].isna().sum(), ",[33,141424,141425],{"class":54},"\"unparseable dates\"",[33,141427,221],{"class":167},[14,141429,39550,141430,141432,141433,141435],{},[30,141431,27816],{}," argument makes failures visible as ",[30,141434,116202],{}," rather than silently keeping the original 
strings.",[18,141437,141439],{"id":141438},"step-6-load-all-sheets-at-once","Step 6: Load All Sheets at Once",[14,141441,141442,141443,141445,141446,141448,141449,141452],{},"When all sheets share the same column structure and you want to concatenate them, pass ",[30,141444,126093],{},". This returns an ",[30,141447,135817],{}," of ",[30,141450,141451],{},"{sheet_name: DataFrame}"," without requiring you to enumerate names in advance:",[23,141454,141456],{"className":126,"code":141455,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"annual_report.xlsx\")\n\ntry:\n    all_sheets: dict[str, pd.DataFrame] = pd.read_excel(\n        EXCEL_PATH,\n        sheet_name=None,           # load every sheet\n        dtype={\"order_id\": str},\n        engine=\"openpyxl\",\n    )\n\n    frames = []\n    for name, df in all_sheets.items():\n        df = df.dropna(how=\"all\")           # drop fully blank rows\n        df[\"source_sheet\"] = name           # track which sheet each row came from\n        frames.append(df)\n\n    if not frames:\n        raise ValueError(\"No non-empty sheets found\")\n\n    combined = pd.concat(frames, ignore_index=True)\n    print(f\"Combined: {combined.shape[0]} rows across {len(frames)} sheets\")\nexcept Exception as e:\n    raise SystemExit(f\"Error: 
{e}\")\n",[30,141457,141458,141462,141472,141482,141486,141498,141502,141508,141521,141527,141540,141556,141566,141570,141574,141582,141593,141612,141629,141633,141637,141645,141658,141662,141678,141713,141723],{"__ignoreMap":28},[33,141459,141460],{"class":35,"line":36},[33,141461,3952],{"class":39},[33,141463,141464,141466,141468,141470],{"class":35,"line":43},[33,141465,190],{"class":163},[33,141467,193],{"class":167},[33,141469,164],{"class":163},[33,141471,198],{"class":167},[33,141473,141474,141476,141478,141480],{"class":35,"line":61},[33,141475,164],{"class":163},[33,141477,492],{"class":167},[33,141479,495],{"class":163},[33,141481,498],{"class":167},[33,141483,141484],{"class":35,"line":73},[33,141485,92],{"emptyLinePlaceholder":91},[33,141487,141488,141490,141492,141494,141496],{"class":35,"line":88},[33,141489,138592],{"class":50},[33,141491,212],{"class":163},[33,141493,215],{"class":167},[33,141495,130112],{"class":54},[33,141497,221],{"class":167},[33,141499,141500],{"class":35,"line":95},[33,141501,92],{"emptyLinePlaceholder":91},[33,141503,141504,141506],{"class":35,"line":101},[33,141505,35574],{"class":163},[33,141507,574],{"class":167},[33,141509,141510,141513,141515,141517,141519],{"class":35,"line":171},[33,141511,141512],{"class":167},"    all_sheets: dict[",[33,141514,1053],{"class":50},[33,141516,14088],{"class":167},[33,141518,242],{"class":163},[33,141520,126171],{"class":167},[33,141522,141523,141525],{"class":35,"line":179},[33,141524,138626],{"class":50},[33,141526,247],{"class":167},[33,141528,141529,141531,141533,141535,141537],{"class":35,"line":187},[33,141530,126183],{"class":238},[33,141532,242],{"class":163},[33,141534,571],{"class":50},[33,141536,136720],{"class":167},[33,141538,141539],{"class":39},"# load every 
sheet\n",[33,141541,141542,141544,141546,141548,141550,141552,141554],{"class":35,"line":201},[33,141543,125888],{"class":238},[33,141545,242],{"class":163},[33,141547,1115],{"class":167},[33,141549,108849],{"class":54},[33,141551,2079],{"class":167},[33,141553,1053],{"class":50},[33,141555,3509],{"class":167},[33,141557,141558,141560,141562,141564],{"class":35,"line":206},[33,141559,111493],{"class":238},[33,141561,242],{"class":163},[33,141563,17356],{"class":54},[33,141565,247],{"class":167},[33,141567,141568],{"class":35,"line":224},[33,141569,1202],{"class":167},[33,141571,141572],{"class":35,"line":229},[33,141573,92],{"emptyLinePlaceholder":91},[33,141575,141576,141578,141580],{"class":35,"line":235},[33,141577,584],{"class":167},[33,141579,242],{"class":163},[33,141581,589],{"class":167},[33,141583,141584,141586,141588,141590],{"class":35,"line":250},[33,141585,656],{"class":163},[33,141587,14253],{"class":167},[33,141589,662],{"class":163},[33,141591,141592],{"class":167}," all_sheets.items():\n",[33,141594,141595,141597,141599,141601,141603,141605,141607,141609],{"class":35,"line":266},[33,141596,7930],{"class":167},[33,141598,242],{"class":163},[33,141600,114425],{"class":167},[33,141602,28045],{"class":238},[33,141604,242],{"class":163},[33,141606,35616],{"class":54},[33,141608,28335],{"class":167},[33,141610,141611],{"class":39},"# drop fully blank rows\n",[33,141613,141614,141616,141619,141621,141623,141626],{"class":35,"line":290},[33,141615,10902],{"class":167},[33,141617,141618],{"class":54},"\"source_sheet\"",[33,141620,763],{"class":167},[33,141622,242],{"class":163},[33,141624,141625],{"class":167}," name           ",[33,141627,141628],{"class":39},"# track which sheet each row came 
from\n",[33,141630,141631],{"class":35,"line":295},[33,141632,10929],{"class":167},[33,141634,141635],{"class":35,"line":300},[33,141636,92],{"emptyLinePlaceholder":91},[33,141638,141639,141641,141643],{"class":35,"line":317},[33,141640,617],{"class":163},[33,141642,620],{"class":163},[33,141644,816],{"class":167},[33,141646,141647,141649,141651,141653,141656],{"class":35,"line":332},[33,141648,4051],{"class":163},[33,141650,4054],{"class":50},[33,141652,602],{"class":167},[33,141654,141655],{"class":54},"\"No non-empty sheets found\"",[33,141657,221],{"class":167},[33,141659,141660],{"class":35,"line":347},[33,141661,92],{"emptyLinePlaceholder":91},[33,141663,141664,141666,141668,141670,141672,141674,141676],{"class":35,"line":374},[33,141665,842],{"class":167},[33,141667,242],{"class":163},[33,141669,847],{"class":167},[33,141671,850],{"class":238},[33,141673,242],{"class":163},[33,141675,855],{"class":50},[33,141677,221],{"class":167},[33,141679,141680,141682,141684,141686,141689,141691,141693,141695,141697,141699,141702,141704,141706,141708,141711],{"class":35,"line":397},[33,141681,7268],{"class":50},[33,141683,602],{"class":167},[33,141685,4059],{"class":163},[33,141687,141688],{"class":54},"\"Combined: ",[33,141690,1115],{"class":50},[33,141692,16013],{"class":167},[33,141694,748],{"class":50},[33,141696,9546],{"class":167},[33,141698,1121],{"class":50},[33,141700,141701],{"class":54}," rows across ",[33,141703,4065],{"class":50},[33,141705,8147],{"class":167},[33,141707,1121],{"class":50},[33,141709,141710],{"class":54}," 
sheets\"",[33,141712,221],{"class":167},[33,141714,141715,141717,141719,141721],{"class":35,"line":653},[33,141716,35726],{"class":163},[33,141718,783],{"class":50},[33,141720,1852],{"class":163},[33,141722,7583],{"class":167},[33,141724,141725,141727,141729,141731,141733,141735,141737,141739,141741,141743],{"class":35,"line":667},[33,141726,35742],{"class":163},[33,141728,16617],{"class":50},[33,141730,602],{"class":167},[33,141732,4059],{"class":163},[33,141734,39108],{"class":54},[33,141736,1115],{"class":50},[33,141738,7602],{"class":167},[33,141740,1121],{"class":50},[33,141742,274],{"class":54},[33,141744,221],{"class":167},[14,141746,100932,141747,141749,141750,141752],{},[30,141748,126093],{}," pattern is the standard entry point for consolidation workflows. See ",[940,141751,28119],{"href":28118}," for how to extend this to workbooks from different sources.",[18,141754,141756],{"id":141755},"step-7-validate-the-result","Step 7: Validate the Result",[14,141758,141759],{},"Validate immediately after loading, before any downstream transformation or reporting. 
A malformed source file should raise an explicit error rather than produce a silent wrong result in a downstream report:",[23,141761,141763],{"className":126,"code":141762,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"sample.xlsx\")\nREQUIRED_COLS = {\"order_id\", \"customer\", \"amount\", \"order_date\"}\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        sheet_name=\"Sales\",\n        dtype={\"order_id\": str},\n        parse_dates=[\"order_date\"],\n        engine=\"openpyxl\",\n    )\n\n    missing_cols = REQUIRED_COLS - set(df.columns)\n    assert not missing_cols, f\"Missing columns: {missing_cols}\"\n    assert df.shape[0] > 0, \"DataFrame is empty\"\n    assert not df[\"order_id\"].isna().any(), \"Null order_ids found\"\n    assert pd.api.types.is_datetime64_any_dtype(df[\"order_date\"]), \"order_date not datetime\"\n\n    print(f\"Validation passed — {df.shape[0]} rows\")\n    print(df.dtypes)\nexcept AssertionError as e:\n    raise SystemExit(f\"Validation failed: {e}\")\nexcept Exception as e:\n    raise SystemExit(f\"Load error: 
{e}\")\n",[30,141764,141765,141769,141779,141789,141793,141805,141830,141834,141840,141848,141854,141864,141880,141892,141902,141906,141910,141926,141948,141967,141983,141997,142001,142025,142031,142041,142063,142073],{"__ignoreMap":28},[33,141766,141767],{"class":35,"line":36},[33,141768,3952],{"class":39},[33,141770,141771,141773,141775,141777],{"class":35,"line":43},[33,141772,190],{"class":163},[33,141774,193],{"class":167},[33,141776,164],{"class":163},[33,141778,198],{"class":167},[33,141780,141781,141783,141785,141787],{"class":35,"line":61},[33,141782,164],{"class":163},[33,141784,492],{"class":167},[33,141786,495],{"class":163},[33,141788,498],{"class":167},[33,141790,141791],{"class":35,"line":73},[33,141792,92],{"emptyLinePlaceholder":91},[33,141794,141795,141797,141799,141801,141803],{"class":35,"line":88},[33,141796,138592],{"class":50},[33,141798,212],{"class":163},[33,141800,215],{"class":167},[33,141802,139963],{"class":54},[33,141804,221],{"class":167},[33,141806,141807,141810,141812,141814,141816,141818,141820,141822,141824,141826,141828],{"class":35,"line":95},[33,141808,141809],{"class":50},"REQUIRED_COLS",[33,141811,212],{"class":163},[33,141813,4098],{"class":167},[33,141815,108849],{"class":54},[33,141817,365],{"class":167},[33,141819,59673],{"class":54},[33,141821,365],{"class":167},[33,141823,4106],{"class":54},[33,141825,365],{"class":167},[33,141827,108767],{"class":54},[33,141829,4113],{"class":167},[33,141831,141832],{"class":35,"line":101},[33,141833,92],{"emptyLinePlaceholder":91},[33,141835,141836,141838],{"class":35,"line":171},[33,141837,35574],{"class":163},[33,141839,574],{"class":167},[33,141841,141842,141844,141846],{"class":35,"line":179},[33,141843,4025],{"class":167},[33,141845,242],{"class":163},[33,141847,126171],{"class":167},[33,141849,141850,141852],{"class":35,"line":187},[33,141851,138626],{"class":50},[33,141853,247],{"class":167},[33,141855,141856,141858,141860,141862],{"class":35,"line":201},[33,141857,126183],{"cla
ss":238},[33,141859,242],{"class":163},[33,141861,140420],{"class":54},[33,141863,247],{"class":167},[33,141865,141866,141868,141870,141872,141874,141876,141878],{"class":35,"line":206},[33,141867,125888],{"class":238},[33,141869,242],{"class":163},[33,141871,1115],{"class":167},[33,141873,108849],{"class":54},[33,141875,2079],{"class":167},[33,141877,1053],{"class":50},[33,141879,3509],{"class":167},[33,141881,141882,141884,141886,141888,141890],{"class":35,"line":224},[33,141883,108760],{"class":238},[33,141885,242],{"class":163},[33,141887,8309],{"class":167},[33,141889,108767],{"class":54},[33,141891,8935],{"class":167},[33,141893,141894,141896,141898,141900],{"class":35,"line":229},[33,141895,111493],{"class":238},[33,141897,242],{"class":163},[33,141899,17356],{"class":54},[33,141901,247],{"class":167},[33,141903,141904],{"class":35,"line":235},[33,141905,1202],{"class":167},[33,141907,141908],{"class":35,"line":250},[33,141909,92],{"emptyLinePlaceholder":91},[33,141911,141912,141915,141917,141920,141922,141924],{"class":35,"line":266},[33,141913,141914],{"class":167},"    missing_cols ",[33,141916,242],{"class":163},[33,141918,141919],{"class":50}," REQUIRED_COLS",[33,141921,39025],{"class":163},[33,141923,4129],{"class":50},[33,141925,4132],{"class":167},[33,141927,141928,141930,141932,141935,141937,141939,141941,141944,141946],{"class":35,"line":290},[33,141929,9228],{"class":163},[33,141931,620],{"class":163},[33,141933,141934],{"class":167}," missing_cols, 
",[33,141936,4059],{"class":163},[33,141938,4152],{"class":54},[33,141940,1115],{"class":50},[33,141942,141943],{"class":167},"missing_cols",[33,141945,1121],{"class":50},[33,141947,7504],{"class":54},[33,141949,141950,141952,141954,141956,141958,141960,141962,141964],{"class":35,"line":295},[33,141951,9228],{"class":163},[33,141953,9516],{"class":167},[33,141955,748],{"class":50},[33,141957,763],{"class":167},[33,141959,6009],{"class":163},[33,141961,10791],{"class":50},[33,141963,365],{"class":167},[33,141965,141966],{"class":54},"\"DataFrame is empty\"\n",[33,141968,141969,141971,141973,141975,141977,141980],{"class":35,"line":300},[33,141970,9228],{"class":163},[33,141972,620],{"class":163},[33,141974,7935],{"class":167},[33,141976,108849],{"class":54},[33,141978,141979],{"class":167},"].isna().any(), ",[33,141981,141982],{"class":54},"\"Null order_ids found\"\n",[33,141984,141985,141987,141990,141992,141994],{"class":35,"line":317},[33,141986,9228],{"class":163},[33,141988,141989],{"class":167}," pd.api.types.is_datetime64_any_dtype(df[",[33,141991,108767],{"class":54},[33,141993,54320],{"class":167},[33,141995,141996],{"class":54},"\"order_date not 
datetime\"\n",[33,141998,141999],{"class":35,"line":332},[33,142000,92],{"emptyLinePlaceholder":91},[33,142002,142003,142005,142007,142009,142011,142013,142015,142017,142019,142021,142023],{"class":35,"line":347},[33,142004,7268],{"class":50},[33,142006,602],{"class":167},[33,142008,4059],{"class":163},[33,142010,124023],{"class":54},[33,142012,1115],{"class":50},[33,142014,9541],{"class":167},[33,142016,748],{"class":50},[33,142018,9546],{"class":167},[33,142020,1121],{"class":50},[33,142022,65937],{"class":54},[33,142024,221],{"class":167},[33,142026,142027,142029],{"class":35,"line":374},[33,142028,7268],{"class":50},[33,142030,108834],{"class":167},[33,142032,142033,142035,142037,142039],{"class":35,"line":397},[33,142034,35726],{"class":163},[33,142036,9445],{"class":50},[33,142038,1852],{"class":163},[33,142040,7583],{"class":167},[33,142042,142043,142045,142047,142049,142051,142053,142055,142057,142059,142061],{"class":35,"line":653},[33,142044,35742],{"class":163},[33,142046,16617],{"class":50},[33,142048,602],{"class":167},[33,142050,4059],{"class":163},[33,142052,124100],{"class":54},[33,142054,1115],{"class":50},[33,142056,7602],{"class":167},[33,142058,1121],{"class":50},[33,142060,274],{"class":54},[33,142062,221],{"class":167},[33,142064,142065,142067,142069,142071],{"class":35,"line":667},[33,142066,35726],{"class":163},[33,142068,783],{"class":50},[33,142070,1852],{"class":163},[33,142072,7583],{"class":167},[33,142074,142075,142077,142079,142081,142083,142086,142088,142090,142092,142094],{"class":35,"line":675},[33,142076,35742],{"class":163},[33,142078,16617],{"class":50},[33,142080,602],{"class":167},[33,142082,4059],{"class":163},[33,142084,142085],{"class":54},"\"Load error: ",[33,142087,1115],{"class":50},[33,142089,7602],{"class":167},[33,142091,1121],{"class":50},[33,142093,274],{"class":54},[33,142095,221],{"class":167},[14,142097,142098],{},"These four assertions catch the four most common silent failures: missing columns (schema drift), 
empty sheet (wrong tab or empty export), null IDs (join will fail), and unparsed dates (time-series operations will break).",[18,142100,142102],{"id":142101},"troubleshooting-common-errors","Troubleshooting Common Errors",[4273,142104,142105,142115],{},[4276,142106,142107],{},[4279,142108,142109,142111,142113],{},[4282,142110,14317],{},[4282,142112,4287],{},[4282,142114,4290],{},[4292,142116,142117,142133,142152,142168,142190,142208,142226],{},[4279,142118,142119,142124,142129],{},[4297,142120,142121],{},[30,142122,142123],{},"ImportError: Missing optional dependency 'openpyxl'",[4297,142125,142126,142128],{},[30,142127,22009],{}," not installed",[4297,142130,142131],{},[30,142132,26548],{},[4279,142134,142135,142139,142146],{},[4297,142136,142137],{},[30,142138,138312],{},[4297,142140,142141,142143,142144],{},[30,142142,125595],{}," ≥ 2.0 selected for ",[30,142145,26542],{},[4297,142147,4358,142148,10073,142150],{},[30,142149,22395],{},[940,142151,126410],{"href":126409},[4279,142153,142154,142159,142162],{},[4297,142155,142156],{},[30,142157,142158],{},"KeyError: 'SheetName'",[4297,142160,142161],{},"Sheet name has trailing space or wrong case",[4297,142163,133514,142164,142167],{},[30,142165,142166],{},"pd.ExcelFile(path).sheet_names"," to confirm exact string",[4279,142169,142170,142178,142181],{},[4297,142171,142172,142173,365,142175],{},"Columns named ",[30,142174,117994],{},[30,142176,142177],{},"Unnamed: 1",[4297,142179,142180],{},"Header row is not row 0",[4297,142182,4358,142183,142186,142187],{},[30,142184,142185],{},"skiprows=N"," after previewing with ",[30,142188,142189],{},"nrows=5",[4279,142191,142192,142198,142203],{},[4297,142193,142194,142195],{},"Numeric IDs shown as ",[30,142196,142197],{},"1001.0",[4297,142199,142200,142201],{},"Pandas inferred ",[30,142202,102445],{},[4297,142204,4358,142205],{},[30,142206,142207],{},"dtype={\"order_id\": 
str}",[4279,142209,142210,142215,142220],{},[4297,142211,142212],{},[30,142213,142214],{},"EmptyDataError",[4297,142216,142217,142219],{},[30,142218,126099],{}," value exceeded total row count",[4297,142221,142222,142223,142225],{},"Preview with ",[30,142224,140755],{}," first to count intro rows",[4279,142227,142228,142232,142235],{},[4297,142229,14400,142230,14362],{},[30,142231,11888],{},[4297,142233,142234],{},"Cells contain strings, not Excel date serials",[4297,142236,74566,142237,142240],{},[30,142238,142239],{},"pd.to_datetime(df[\"col\"], errors=\"coerce\")"," post-load",[18,142242,88566],{"id":29183},[14,142244,142245,107296,142248,36661,142251,142253],{},[1974,142246,142247],{},"How do I read only the first N rows?",[30,142249,142250],{},"nrows=100",[30,142252,139812],{},". Useful for previewing large files before committing to a full load.",[14,142255,142256,142259,142260,142263,142264,142266],{},[1974,142257,142258],{},"How do I read multiple specific sheets by name?","\nPass a list: ",[30,142261,142262],{},"sheet_name=[\"Q1\", \"Q2\", \"Q3\"]",". Pandas returns a ",[30,142265,37100],{}," of DataFrames keyed by sheet name.",[14,142268,142269,131042,142272,142275,142276,3035],{},[1974,142270,142271],{},"Can I skip the first few rows of a report header?",[30,142273,142274],{},"skiprows=3"," skips rows 0, 1, and 2; the next row becomes the header. For non-contiguous rows, pass a list: ",[30,142277,142278],{},"skiprows=[0, 2, 5]",[14,142280,142281,142289,142290,142292],{},[1974,142282,142283,142284,142286,142287,36637],{},"Why is my date column still ",[30,142285,11888],{}," dtype after ",[30,142288,102641],{},"\nThe cell values are text strings rather than Excel date serials (integers). 
Call ",[30,142291,142239],{}," after loading to force the conversion.",[14,142294,142295,142303,142305,142306,142308,142309,142311],{},[1974,142296,142297,142298,10065,142300,36637],{},"What is the difference between ",[30,142299,140761],{},[30,142301,142302],{},"header=0",[30,142304,142302],{}," (the default) uses row 0 as column names. ",[30,142307,140761],{}," treats every row as data and assigns integer column indices. Use ",[30,142310,140761],{}," when you need to manipulate the header row yourself before assigning it — for example, to forward-fill merged cell labels.",[14,142313,142314,142322,142323,138249,142326,2015,142328,142330,142331,142333],{},[1974,142315,142316,142317,142319,142320,36637],{},"How do I load a ",[30,142318,112255],{}," file instead of ",[30,142321,26542],{},"\nInstall ",[30,142324,142325],{},"xlrd \u003C 2.0",[30,142327,128873],{},[30,142329,22009],{}," engine cannot open legacy ",[30,142332,112255],{}," binary files.",[18,142335,6918],{"id":6917},[4211,142337,142338,142343,142352,142357],{},[4214,142339,142340,142342],{},[940,142341,99577],{"href":99576}," — full reference: engines, openpyxl direct access, calamine, merged cells, BytesIO",[4214,142344,142345,138160,142347,10065,142349,142351],{},[940,142346,126410],{"href":126409},[30,142348,128863],{},[30,142350,42237],{}," for xlsx files",[4214,142353,142354,142356],{},[940,142355,6936],{"href":6935}," — next step after loading: generate formatted output workbooks",[4214,142358,142359,142361],{},[940,142360,28119],{"href":28118}," — combine DataFrames from multiple 
workbooks",[14,142363,6947,142364,3035],{},[940,142365,99577],{"href":99576},[6953,142367,26204],{},{"title":28,"searchDepth":43,"depth":43,"links":142369},[142370,142371,142372,142373,142374,142375,142377,142378,142379,142380,142381],{"id":20,"depth":43,"text":21},{"id":140165,"depth":43,"text":140166},{"id":140335,"depth":43,"text":140336},{"id":140569,"depth":43,"text":140570},{"id":140772,"depth":43,"text":140773},{"id":141102,"depth":43,"text":142376},"Step 5: Fix Type Inference with dtype and parse_dates",{"id":141438,"depth":43,"text":141439},{"id":141755,"depth":43,"text":141756},{"id":142101,"depth":43,"text":142102},{"id":29183,"depth":43,"text":88566},{"id":6917,"depth":43,"text":6918},"Read Excel Step by Step","Step-by-step walkthrough for loading .xlsx and .xls files into pandas DataFrames. Covers read_excel(), engine selection, sheet targeting, dtype, and parse_dates.",{},"\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Fhow-to-read-excel-with-pandas-step-by-step",{"title":139802,"description":142383},"Read Excel with Pandas — Step-by-Step 
Guide","python-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Fhow-to-read-excel-with-pandas-step-by-step\u002Findex",[99614,9630,47,22009],"7Uv5GXBrGBwc8kAOW--ayqovg7RTVotxEOtrXAfNQVY",{"id":142392,"title":99577,"body":142393,"breadcrumbTitle":147036,"canonical":6977,"date":46387,"description":147037,"draft":6980,"extension":6981,"image":6977,"meta":147038,"navigation":91,"path":147039,"robots":6977,"seo":147040,"seoTitle":99577,"stem":147041,"tags":147042,"updatedAt":6978,"__hash__":147043},"content\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Findex.md",{"type":7,"value":142394,"toc":147007},[142395,142398,142410,142419,142428,142430,142433,142480,142483,142652,142656,142662,142761,142849,142861,142865,142868,143060,143076,143083,143088,143247,143261,143268,143589,143597,143605,143608,143769,143778,143785,143788,144053,144061,144069,144076,144348,144363,144367,144370,144626,144630,144641,144875,144887,144889,144893,144903,145038,145042,145048,145239,145243,145249,145424,145428,145431,145623,145638,145640,145643,145989,145991,146055,146063,146073,146075,146201,146203,146834,146837,146858,146860,146876,146898,146914,146930,146948,146966,146968,147001,147005],[10,142396,99577],{"id":142397},"reading-excel-files-with-python",[14,142399,142400,142401,142403,142404,142406,142407,142409],{},"Excel workbooks arrive in two distinct binary formats — ",[30,142402,26542],{}," (Office Open XML, used since Excel 2007) and ",[30,142405,112255],{}," (legacy BIFF binary format from Excel 97–2003). Each format requires a different Python parsing engine. 
Code that skips engine selection either picks the wrong backend and raises ",[30,142408,128863],{},", or uses the correct backend but misses key parameters and corrupts types or column alignment.",[14,142411,142412,142413,142415,142416,142418],{},"This guide walks through every decision point in order: which engine to install, how to inspect a workbook before loading it, how to target specific sheets and columns, how to coerce types at load time, how to use ",[30,142414,22009],{}," directly when ",[30,142417,9630],{}," is not enough, and how to handle the real-world edge cases that appear once your scripts move from test files to production data.",[14,142420,142421,142422,142424,142425,142427],{},"For downstream work, see ",[940,142423,6936],{"href":6935}," once data is loaded, or ",[940,142426,28119],{"href":28118}," when combining data from several workbooks.",[18,142429,21],{"id":20},[14,142431,142432],{},"Install the required libraries before running any snippet in this guide:",[23,142434,142436],{"className":25,"code":142435,"language":27,"meta":28,"style":28},"# pip install pandas openpyxl\npip install pandas openpyxl\n# for legacy .xls files only:\npip install \"xlrd>=1.2,\u003C2.0\"\n# for the fast calamine engine (optional):\npip install python-calamine\n",[30,142437,142438,142442,142452,142457,142466,142471],{"__ignoreMap":28},[33,142439,142440],{"class":35,"line":36},[33,142441,3952],{"class":39},[33,142443,142444,142446,142448,142450],{"class":35,"line":43},[33,142445,76],{"class":46},[33,142447,79],{"class":54},[33,142449,16183],{"class":54},[33,142451,95887],{"class":54},[33,142453,142454],{"class":35,"line":61},[33,142455,142456],{"class":39},"# for legacy .xls files only:\n",[33,142458,142459,142461,142463],{"class":35,"line":73},[33,142460,76],{"class":46},[33,142462,79],{"class":54},[33,142464,142465],{"class":54}," \"xlrd>=1.2,\u003C2.0\"\n",[33,142467,142468],{"class":35,"line":88},[33,142469,142470],{"class":39},"# for the fast calamine engine 
(optional):\n",[33,142472,142473,142475,142477],{"class":35,"line":95},[33,142474,76],{"class":46},[33,142476,79],{"class":54},[33,142478,142479],{"class":54}," python-calamine\n",[14,142481,142482],{},"Create a minimal test workbook to validate your setup before pointing scripts at real data:",[23,142484,142486],{"className":126,"code":142485,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nTEST_PATH = Path(\"test_workbook.xlsx\")\nwb = openpyxl.Workbook()\nws = wb.active\nws.title = \"Sales\"\nws.append([\"order_id\", \"customer\", \"amount\", \"order_date\"])\nws.append([1001, \"Alice\", 199.99, \"2026-01-15\"])\nws.append([1002, \"Bob\", 54.50, \"2026-01-16\"])\nws.append([1003, \"Carol\", 320.00, \"2026-01-17\"])\nwb.save(TEST_PATH)\nprint(\"Test file written:\", TEST_PATH.resolve())\n",[30,142487,142488,142492,142502,142508,142512,142525,142533,142541,142549,142569,142589,142609,142629,142637],{"__ignoreMap":28},[33,142489,142490],{"class":35,"line":36},[33,142491,98209],{"class":39},[33,142493,142494,142496,142498,142500],{"class":35,"line":43},[33,142495,190],{"class":163},[33,142497,193],{"class":167},[33,142499,164],{"class":163},[33,142501,198],{"class":167},[33,142503,142504,142506],{"class":35,"line":61},[33,142505,164],{"class":163},[33,142507,95887],{"class":167},[33,142509,142510],{"class":35,"line":73},[33,142511,92],{"emptyLinePlaceholder":91},[33,142513,142514,142516,142518,142520,142523],{"class":35,"line":88},[33,142515,139956],{"class":50},[33,142517,212],{"class":163},[33,142519,215],{"class":167},[33,142521,142522],{"class":54},"\"test_workbook.xlsx\"",[33,142524,221],{"class":167},[33,142526,142527,142529,142531],{"class":35,"line":95},[33,142528,98274],{"class":167},[33,142530,242],{"class":163},[33,142532,139974],{"class":167},[33,142534,142535,142537,142539],{"class":35,"line":101},[33,142536,98330],{"class":167},[33,142538,242],{"class":163},[33,142540,99877],{"class":167},[33,1425
42,142543,142545,142547],{"class":35,"line":171},[33,142544,139987],{"class":167},[33,142546,242],{"class":163},[33,142548,139992],{"class":54},[33,142550,142551,142553,142555,142557,142559,142561,142563,142565,142567],{"class":35,"line":179},[33,142552,100864],{"class":167},[33,142554,108849],{"class":54},[33,142556,365],{"class":167},[33,142558,59673],{"class":54},[33,142560,365],{"class":167},[33,142562,4106],{"class":54},[33,142564,365],{"class":167},[33,142566,108767],{"class":54},[33,142568,751],{"class":167},[33,142570,142571,142573,142575,142577,142579,142581,142583,142585,142587],{"class":35,"line":187},[33,142572,100864],{"class":167},[33,142574,120342],{"class":50},[33,142576,365],{"class":167},[33,142578,140023],{"class":54},[33,142580,365],{"class":167},[33,142582,140028],{"class":50},[33,142584,365],{"class":167},[33,142586,12407],{"class":54},[33,142588,751],{"class":167},[33,142590,142591,142593,142595,142597,142599,142601,142603,142605,142607],{"class":35,"line":201},[33,142592,100864],{"class":167},[33,142594,120347],{"class":50},[33,142596,365],{"class":167},[33,142598,140045],{"class":54},[33,142600,365],{"class":167},[33,142602,140050],{"class":50},[33,142604,365],{"class":167},[33,142606,140055],{"class":54},[33,142608,751],{"class":167},[33,142610,142611,142613,142615,142617,142619,142621,142623,142625,142627],{"class":35,"line":206},[33,142612,100864],{"class":167},[33,142614,120352],{"class":50},[33,142616,365],{"class":167},[33,142618,140068],{"class":54},[33,142620,365],{"class":167},[33,142622,140073],{"class":50},[33,142624,365],{"class":167},[33,142626,140078],{"class":54},[33,142628,751],{"class":167},[33,142630,142631,142633,142635],{"class":35,"line":224},[33,142632,100907],{"class":167},[33,142634,139956],{"class":50},[33,142636,221],{"class":167},[33,142638,142639,142641,142643,142646,142648,142650],{"class":35,"line":229},[33,142640,13474],{"class":50},[33,142642,602],{"class":167},[33,142644,142645],{"class":54},"\"Test file 
written:\"",[33,142647,365],{"class":167},[33,142649,139956],{"class":50},[33,142651,140162],{"class":167},[18,142653,142655],{"id":142654},"engine-selection","Engine Selection",[14,142657,142658,142661],{},[30,142659,142660],{},"pandas.read_excel"," delegates file I\u002FO to a backend engine. The choice is determined by file format, not personal preference. Passing the wrong engine raises an immediate error; omitting the engine parameter lets pandas guess, which can pick incorrectly when multiple engines are installed.",[2540,142663,2547,142665,2547,142668,2547,142671,2547,2547,142685,2547,142688,2547,142692,2547,2547,142695,2547,142698,2547,142700,2547,142702,2547,142705,2547,142709,2547,2547,142712,2547,142714,2547,142717,2547,2547,142720,2547,142722,2547,142725,2547,2547,142728,2547,142732,2547,2547,142734,2547,142736,2547,142738,2547,2547,142741,2547,142743,2547,142745,2547,2547,142748,2547,142751,2547,142753,2547,2547,142755,2547,142757],{"viewBox":26298,"role":2543,"ariaLabel":142664,"xmlns":2545,"style":2546},"Decision tree mapping xlsx and xls file formats to openpyxl, calamine, or xlrd engines, all producing a pandas DataFrame",[2549,142666,142667],{},"Excel engine selection decision tree",[2553,142669,142670],{},"Shows how .xlsx and .xls file formats map to openpyxl, calamine, or xlrd engines, all producing a pandas DataFrame 
output.",[2557,142672,2559,142673,2559,142680,2547],{},[2561,142674,2564,142676,2564,142678,2559],{"id":142675,"x1":748,"y1":748,"x2":734,"y2":748},"read-excel-grad",[2566,142677],{"offset":748,"style":2568},[2566,142679],{"offset":734,"style":2571},[2573,142681,2564,142683,2559],{"id":142682,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"read-excel-arrow",[2580,142684],{"d":2582,"fill":2583},[2585,142686],{"x":2678,"y":19300,"width":49869,"height":49813,"rx":3545,"fill":142687,"stroke":2593,"style":2594},"url(#read-excel-grad)",[2000,142689,142691],{"x":2626,"y":26323,"fill":2599,"style":142690},"text-anchor:middle;font-size:15px;font-weight:bold","Excel File",[2000,142693,142694],{"x":2626,"y":58360,"fill":2599,"style":2600},".xlsx \u002F .xls \u002F .xlsb?",[35,142696],{"x1":2698,"y1":38748,"x2":2635,"y2":2648,"stroke":2583,"markerEnd":142697,"style":2594},"url(#read-excel-arrow)",[2000,142699,26542],{"x":16997,"y":2589,"fill":2583,"style":2685},[35,142701],{"x1":2626,"y1":38748,"x2":2626,"y2":2648,"stroke":2583,"markerEnd":142697,"style":2594},[2000,142703,142704],{"x":120798,"y":2589,"fill":2583,"style":2685},".xlsb",[35,142706],{"x1":142707,"y1":38748,"x2":142708,"y2":2648,"stroke":2583,"markerEnd":142697,"style":2594},"450","615",[2000,142710,112255],{"x":142711,"y":2589,"fill":2583,"style":2685},"565",[2585,142713],{"x":1543,"y":2648,"width":26354,"height":49813,"rx":2591,"fill":2592,"stroke":11166,"style":2594},[2000,142715,22395],{"x":2648,"y":142716,"fill":2599,"style":38718},"167",[2000,142718,142719],{"x":2648,"y":2643,"fill":2583,"style":2605},"full features, formatting",[2585,142721],{"x":1543,"y":2665,"width":26354,"height":49813,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,142723,142724],{"x":2648,"y":135030,"fill":2599,"style":2600},"engine=\"calamine\"",[2000,142726,142727],{"x":2648,"y":38762,"fill":2583,"style":2605},"faster, 
read-only",[35,142729],{"x1":2648,"y1":142730,"x2":2648,"y2":2665,"stroke":142731,"style":2694},"197","#cbd5e1",[2000,142733,7162],{"x":58337,"y":100318,"fill":38773,"style":2605},[2585,142735],{"x":71592,"y":2648,"width":26354,"height":49813,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,142737,142724],{"x":2626,"y":142716,"fill":2599,"style":2600},[2000,142739,142740],{"x":2626,"y":2643,"fill":2583,"style":2605},"only supported engine",[2585,142742],{"x":13437,"y":2648,"width":26354,"height":49813,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,142744,128873],{"x":142708,"y":142716,"fill":2599,"style":38718},[2000,142746,142747],{"x":142708,"y":2643,"fill":2583,"style":2605},"xlrd \u003C 2.0 required",[35,142749],{"x1":2648,"y1":142750,"x2":2698,"y2":2625,"stroke":2583,"markerEnd":142697,"style":2594},"277",[35,142752],{"x1":2626,"y1":142730,"x2":2626,"y2":2625,"stroke":2583,"markerEnd":142697,"style":2594},[35,142754],{"x1":142708,"y1":142730,"x2":64900,"y2":2625,"stroke":2583,"markerEnd":142697,"style":2594},[2585,142756],{"x":2678,"y":2625,"width":49869,"height":38717,"rx":2591,"fill":11165,"stroke":11166,"style":2594},[2000,142758,142760],{"x":2626,"y":142759,"fill":2599,"style":16979},"342","pandas DataFrame",[4273,142762,142763,142777],{},[4276,142764,142765],{},[4279,142766,142767,142770,142773,142775],{},[4282,142768,142769],{},"Engine",[4282,142771,142772],{},"Formats",[4282,142774,26491],{},[4282,142776,26494],{},[4292,142778,142779,142799,142824],{},[4279,142780,142781,142785,142792,142796],{},[4297,142782,142783],{},[30,142784,22009],{},[4297,142786,142787,365,142789],{},[30,142788,26542],{},[30,142790,142791],{},".xlsm",[4297,142793,142794],{},[30,142795,26548],{},[4297,142797,142798],{},"Very large files where read speed is 
critical",[4279,142800,142801,142805,142816,142821],{},[4297,142802,142803],{},[30,142804,139236],{},[4297,142806,142807,365,142809,365,142811,365,142813],{},[30,142808,26542],{},[30,142810,112255],{},[30,142812,142704],{},[30,142814,142815],{},".ods",[4297,142817,142818],{},[30,142819,142820],{},"pip install python-calamine",[4297,142822,142823],{},"When you need formula strings, cell styles, or comments",[4279,142825,142826,142830,142835,142840],{},[4297,142827,142828],{},[30,142829,125595],{},[4297,142831,142832,142834],{},[30,142833,112255],{}," only",[4297,142836,142837],{},[30,142838,142839],{},"pip install \"xlrd\u003C2.0\"",[4297,142841,41801,142842,2012,142844,142846,142847],{},[30,142843,26542],{},[30,142845,142704],{}," file — raises ",[30,142848,128863],{},[142850,142851,142852],"blockquote",{},[14,142853,142854,142855,142857,142858,142860],{},"Hitting ",[30,142856,138312],{},"? See ",[940,142859,126410],{"href":126409}," for the exact fix.",[18,142862,142864],{"id":142863},"step-1-inspect-before-loading","Step 1: Inspect Before Loading",[14,142866,142867],{},"Check sheet names and preview a few rows before committing to a full workbook load. 
On a 100 MB file this saves significant time:",[23,142869,142871],{"className":126,"code":142870,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Fsales_q3.xlsx\")\n\ntry:\n    xl = pd.ExcelFile(EXCEL_PATH, engine=\"openpyxl\")\n    print(\"Sheets:\", xl.sheet_names)\n    # Preview first 5 rows without loading the whole file\n    preview = xl.parse(xl.sheet_names[0], nrows=5)\n    print(preview)\n    print(\"Columns:\", preview.columns.tolist())\nexcept FileNotFoundError:\n    raise SystemExit(f\"File not found: {EXCEL_PATH}\")\nexcept Exception as e:\n    raise SystemExit(f\"Cannot open workbook: {e}\")\n",[30,142872,142873,142877,142887,142897,142901,142914,142918,142924,142944,142955,142960,142983,142990,143001,143009,143027,143037],{"__ignoreMap":28},[33,142874,142875],{"class":35,"line":36},[33,142876,3952],{"class":39},[33,142878,142879,142881,142883,142885],{"class":35,"line":43},[33,142880,190],{"class":163},[33,142882,193],{"class":167},[33,142884,164],{"class":163},[33,142886,198],{"class":167},[33,142888,142889,142891,142893,142895],{"class":35,"line":61},[33,142890,164],{"class":163},[33,142892,492],{"class":167},[33,142894,495],{"class":163},[33,142896,498],{"class":167},[33,142898,142899],{"class":35,"line":73},[33,142900,92],{"emptyLinePlaceholder":91},[33,142902,142903,142905,142907,142909,142912],{"class":35,"line":88},[33,142904,138592],{"class":50},[33,142906,212],{"class":163},[33,142908,215],{"class":167},[33,142910,142911],{"class":54},"\"data\u002Fsales_q3.xlsx\"",[33,142913,221],{"class":167},[33,142915,142916],{"class":35,"line":95},[33,142917,92],{"emptyLinePlaceholder":91},[33,142919,142920,142922],{"class":35,"line":101},[33,142921,35574],{"class":163},[33,142923,574],{"class":167},[33,142925,142926,142928,142930,142932,142934,142936,142938,142940,142942],{"class":35,"line":171},[33,142927,140232],{"class":167},[33,142929,242],{"class":163},[3
3,142931,140237],{"class":167},[33,142933,138592],{"class":50},[33,142935,365],{"class":167},[33,142937,17351],{"class":238},[33,142939,242],{"class":163},[33,142941,17356],{"class":54},[33,142943,221],{"class":167},[33,142945,142946,142948,142950,142952],{"class":35,"line":179},[33,142947,7268],{"class":50},[33,142949,602],{"class":167},[33,142951,140258],{"class":54},[33,142953,142954],{"class":167},", xl.sheet_names)\n",[33,142956,142957],{"class":35,"line":187},[33,142958,142959],{"class":39},"    # Preview first 5 rows without loading the whole file\n",[33,142961,142962,142965,142967,142970,142972,142974,142977,142979,142981],{"class":35,"line":201},[33,142963,142964],{"class":167},"    preview ",[33,142966,242],{"class":163},[33,142968,142969],{"class":167}," xl.parse(xl.sheet_names[",[33,142971,748],{"class":50},[33,142973,8314],{"class":167},[33,142975,142976],{"class":238},"nrows",[33,142978,242],{"class":163},[33,142980,1153],{"class":50},[33,142982,221],{"class":167},[33,142984,142985,142987],{"class":35,"line":206},[33,142986,7268],{"class":50},[33,142988,142989],{"class":167},"(preview)\n",[33,142991,142992,142994,142996,142998],{"class":35,"line":224},[33,142993,7268],{"class":50},[33,142995,602],{"class":167},[33,142997,119773],{"class":54},[33,142999,143000],{"class":167},", 
preview.columns.tolist())\n",[33,143002,143003,143005,143007],{"class":35,"line":229},[33,143004,35726],{"class":163},[33,143006,2945],{"class":50},[33,143008,574],{"class":167},[33,143010,143011,143013,143015,143017,143019,143021,143023,143025],{"class":35,"line":235},[33,143012,35742],{"class":163},[33,143014,16617],{"class":50},[33,143016,602],{"class":167},[33,143018,4059],{"class":163},[33,143020,15677],{"class":54},[33,143022,138680],{"class":50},[33,143024,274],{"class":54},[33,143026,221],{"class":167},[33,143028,143029,143031,143033,143035],{"class":35,"line":250},[33,143030,35726],{"class":163},[33,143032,783],{"class":50},[33,143034,1852],{"class":163},[33,143036,7583],{"class":167},[33,143038,143039,143041,143043,143045,143047,143050,143052,143054,143056,143058],{"class":35,"line":266},[33,143040,35742],{"class":163},[33,143042,16617],{"class":50},[33,143044,602],{"class":167},[33,143046,4059],{"class":163},[33,143048,143049],{"class":54},"\"Cannot open workbook: ",[33,143051,1115],{"class":50},[33,143053,7602],{"class":167},[33,143055,1121],{"class":50},[33,143057,274],{"class":54},[33,143059,221],{"class":167},[14,143061,143062,143064,143065,143068,143069,365,143071,71132,143073,143075],{},[30,143063,140328],{}," opens the workbook container without parsing any sheet data. ",[30,143066,143067],{},"xl.parse()"," reads only the rows you request. 
Use this diagnostic step to determine correct values for ",[30,143070,126099],{},[30,143072,44427],{},[30,143074,21904],{}," before writing the final load call.",[18,143077,143079,143080,143082],{"id":143078},"step-2-basic-read_excel-call","Step 2: Basic ",[30,143081,57240],{}," Call",[14,143084,143085,143086,140342],{},"The minimum correct call for a modern ",[30,143087,26542],{},[23,143089,143091],{"className":126,"code":143090,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Freport.xlsx\")\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        engine=\"openpyxl\",    # always pass explicitly\n    )\n    print(df.shape)\n    print(df.dtypes)\nexcept FileNotFoundError:\n    raise SystemExit(f\"File missing: {EXCEL_PATH}\")\nexcept ImportError as e:\n    raise SystemExit(f\"Engine not installed — pip install openpyxl: {e}\")\n",[30,143092,143093,143097,143107,143117,143121,143134,143138,143144,143152,143158,143171,143175,143181,143187,143195,143214,143224],{"__ignoreMap":28},[33,143094,143095],{"class":35,"line":36},[33,143096,3952],{"class":39},[33,143098,143099,143101,143103,143105],{"class":35,"line":43},[33,143100,190],{"class":163},[33,143102,193],{"class":167},[33,143104,164],{"class":163},[33,143106,198],{"class":167},[33,143108,143109,143111,143113,143115],{"class":35,"line":61},[33,143110,164],{"class":163},[33,143112,492],{"class":167},[33,143114,495],{"class":163},[33,143116,498],{"class":167},[33,143118,143119],{"class":35,"line":73},[33,143120,92],{"emptyLinePlaceholder":91},[33,143122,143123,143125,143127,143129,143132],{"class":35,"line":88},[33,143124,138592],{"class":50},[33,143126,212],{"class":163},[33,143128,215],{"class":167},[33,143130,143131],{"class":54},"\"data\u002Freport.xlsx\"",[33,143133,221],{"class":167},[33,143135,143136],{"class":35,"line":95},[33,143137,92],{"emptyLinePlaceholder":91},[33,143139,143140,143142],{"class":35,"line"
:101},[33,143141,35574],{"class":163},[33,143143,574],{"class":167},[33,143145,143146,143148,143150],{"class":35,"line":171},[33,143147,4025],{"class":167},[33,143149,242],{"class":163},[33,143151,126171],{"class":167},[33,143153,143154,143156],{"class":35,"line":179},[33,143155,138626],{"class":50},[33,143157,247],{"class":167},[33,143159,143160,143162,143164,143166,143168],{"class":35,"line":187},[33,143161,111493],{"class":238},[33,143163,242],{"class":163},[33,143165,17356],{"class":54},[33,143167,38342],{"class":167},[33,143169,143170],{"class":39},"# always pass explicitly\n",[33,143172,143173],{"class":35,"line":201},[33,143174,1202],{"class":167},[33,143176,143177,143179],{"class":35,"line":206},[33,143178,7268],{"class":50},[33,143180,39529],{"class":167},[33,143182,143183,143185],{"class":35,"line":224},[33,143184,7268],{"class":50},[33,143186,108834],{"class":167},[33,143188,143189,143191,143193],{"class":35,"line":229},[33,143190,35726],{"class":163},[33,143192,2945],{"class":50},[33,143194,574],{"class":167},[33,143196,143197,143199,143201,143203,143205,143208,143210,143212],{"class":35,"line":235},[33,143198,35742],{"class":163},[33,143200,16617],{"class":50},[33,143202,602],{"class":167},[33,143204,4059],{"class":163},[33,143206,143207],{"class":54},"\"File missing: ",[33,143209,138680],{"class":50},[33,143211,274],{"class":54},[33,143213,221],{"class":167},[33,143215,143216,143218,143220,143222],{"class":35,"line":250},[33,143217,35726],{"class":163},[33,143219,40488],{"class":50},[33,143221,1852],{"class":163},[33,143223,7583],{"class":167},[33,143225,143226,143228,143230,143232,143234,143237,143239,143241,143243,143245],{"class":35,"line":266},[33,143227,35742],{"class":163},[33,143229,16617],{"class":50},[33,143231,602],{"class":167},[33,143233,4059],{"class":163},[33,143235,143236],{"class":54},"\"Engine not installed — pip install openpyxl: 
",[33,143238,1115],{"class":50},[33,143240,7602],{"class":167},[33,143242,1121],{"class":50},[33,143244,274],{"class":54},[33,143246,221],{"class":167},[14,143248,39550,143249,143251,143252,143254,143255,143257,143258,143260],{},[30,143250,17351],{}," argument is always required in production code. Without it, pandas infers the backend from the file extension, and on systems where ",[30,143253,125595],{}," is installed alongside ",[30,143256,22009],{},", the inference can pick ",[30,143259,125595],{}," and fail.",[18,143262,143264,143265,143267],{"id":143263},"step-3-sheet_name-targeting-sheets","Step 3: ",[30,143266,17371],{}," — Targeting Sheets",[23,143269,143271],{"className":126,"code":143270,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Fworkbook.xlsx\")\n\ntry:\n    # By name\n    df_q3 = pd.read_excel(EXCEL_PATH, sheet_name=\"Q3\", engine=\"openpyxl\")\n\n    # By zero-based index (first sheet)\n    df_first = pd.read_excel(EXCEL_PATH, sheet_name=0, engine=\"openpyxl\")\n\n    # All sheets → OrderedDict of {name: DataFrame}\n    all_sheets = pd.read_excel(EXCEL_PATH, sheet_name=None, engine=\"openpyxl\")\n    for name, df in all_sheets.items():\n        print(f\"{name}: {df.shape}\")\n\n    # Specific subset of sheets\n    subset = pd.read_excel(\n        EXCEL_PATH, sheet_name=[\"Q1\", \"Q2\", \"Q3\"], engine=\"openpyxl\"\n    )\nexcept KeyError as e:\n    raise SystemExit(f\"Sheet not found: {e}\")\nexcept Exception as e:\n    raise SystemExit(f\"Load error: 
{e}\")\n",[30,143272,143273,143277,143287,143297,143301,143314,143318,143324,143329,143359,143363,143368,143397,143401,143406,143434,143444,143472,143476,143481,143490,143521,143525,143535,143557,143567],{"__ignoreMap":28},[33,143274,143275],{"class":35,"line":36},[33,143276,3952],{"class":39},[33,143278,143279,143281,143283,143285],{"class":35,"line":43},[33,143280,190],{"class":163},[33,143282,193],{"class":167},[33,143284,164],{"class":163},[33,143286,198],{"class":167},[33,143288,143289,143291,143293,143295],{"class":35,"line":61},[33,143290,164],{"class":163},[33,143292,492],{"class":167},[33,143294,495],{"class":163},[33,143296,498],{"class":167},[33,143298,143299],{"class":35,"line":73},[33,143300,92],{"emptyLinePlaceholder":91},[33,143302,143303,143305,143307,143309,143312],{"class":35,"line":88},[33,143304,138592],{"class":50},[33,143306,212],{"class":163},[33,143308,215],{"class":167},[33,143310,143311],{"class":54},"\"data\u002Fworkbook.xlsx\"",[33,143313,221],{"class":167},[33,143315,143316],{"class":35,"line":95},[33,143317,92],{"emptyLinePlaceholder":91},[33,143319,143320,143322],{"class":35,"line":101},[33,143321,35574],{"class":163},[33,143323,574],{"class":167},[33,143325,143326],{"class":35,"line":171},[33,143327,143328],{"class":39},"    # By name\n",[33,143330,143331,143334,143336,143338,143340,143342,143344,143346,143349,143351,143353,143355,143357],{"class":35,"line":179},[33,143332,143333],{"class":167},"    df_q3 
",[33,143335,242],{"class":163},[33,143337,126254],{"class":167},[33,143339,138592],{"class":50},[33,143341,365],{"class":167},[33,143343,17371],{"class":238},[33,143345,242],{"class":163},[33,143347,143348],{"class":54},"\"Q3\"",[33,143350,365],{"class":167},[33,143352,17351],{"class":238},[33,143354,242],{"class":163},[33,143356,17356],{"class":54},[33,143358,221],{"class":167},[33,143360,143361],{"class":35,"line":187},[33,143362,92],{"emptyLinePlaceholder":91},[33,143364,143365],{"class":35,"line":201},[33,143366,143367],{"class":39},"    # By zero-based index (first sheet)\n",[33,143369,143370,143373,143375,143377,143379,143381,143383,143385,143387,143389,143391,143393,143395],{"class":35,"line":206},[33,143371,143372],{"class":167},"    df_first ",[33,143374,242],{"class":163},[33,143376,126254],{"class":167},[33,143378,138592],{"class":50},[33,143380,365],{"class":167},[33,143382,17371],{"class":238},[33,143384,242],{"class":163},[33,143386,748],{"class":50},[33,143388,365],{"class":167},[33,143390,17351],{"class":238},[33,143392,242],{"class":163},[33,143394,17356],{"class":54},[33,143396,221],{"class":167},[33,143398,143399],{"class":35,"line":224},[33,143400,92],{"emptyLinePlaceholder":91},[33,143402,143403],{"class":35,"line":229},[33,143404,143405],{"class":39},"    # All sheets → OrderedDict of {name: 
DataFrame}\n",[33,143407,143408,143410,143412,143414,143416,143418,143420,143422,143424,143426,143428,143430,143432],{"class":35,"line":235},[33,143409,126249],{"class":167},[33,143411,242],{"class":163},[33,143413,126254],{"class":167},[33,143415,138592],{"class":50},[33,143417,365],{"class":167},[33,143419,17371],{"class":238},[33,143421,242],{"class":163},[33,143423,571],{"class":50},[33,143425,365],{"class":167},[33,143427,17351],{"class":238},[33,143429,242],{"class":163},[33,143431,17356],{"class":54},[33,143433,221],{"class":167},[33,143435,143436,143438,143440,143442],{"class":35,"line":250},[33,143437,656],{"class":163},[33,143439,14253],{"class":167},[33,143441,662],{"class":163},[33,143443,141592],{"class":167},[33,143445,143446,143448,143450,143452,143454,143456,143458,143460,143462,143464,143466,143468,143470],{"class":35,"line":266},[33,143447,9414],{"class":50},[33,143449,602],{"class":167},[33,143451,4059],{"class":163},[33,143453,274],{"class":54},[33,143455,1115],{"class":50},[33,143457,1118],{"class":167},[33,143459,1121],{"class":50},[33,143461,2079],{"class":54},[33,143463,1115],{"class":50},[33,143465,9426],{"class":167},[33,143467,1121],{"class":50},[33,143469,274],{"class":54},[33,143471,221],{"class":167},[33,143473,143474],{"class":35,"line":290},[33,143475,92],{"emptyLinePlaceholder":91},[33,143477,143478],{"class":35,"line":295},[33,143479,143480],{"class":39},"    # Specific subset of sheets\n",[33,143482,143483,143486,143488],{"class":35,"line":300},[33,143484,143485],{"class":167},"    subset 
",[33,143487,242],{"class":163},[33,143489,126171],{"class":167},[33,143491,143492,143494,143496,143498,143500,143502,143504,143506,143509,143511,143513,143515,143517,143519],{"class":35,"line":317},[33,143493,138626],{"class":50},[33,143495,365],{"class":167},[33,143497,17371],{"class":238},[33,143499,242],{"class":163},[33,143501,8309],{"class":167},[33,143503,136706],{"class":54},[33,143505,365],{"class":167},[33,143507,143508],{"class":54},"\"Q2\"",[33,143510,365],{"class":167},[33,143512,143348],{"class":54},[33,143514,8314],{"class":167},[33,143516,17351],{"class":238},[33,143518,242],{"class":163},[33,143520,130172],{"class":54},[33,143522,143523],{"class":35,"line":332},[33,143524,1202],{"class":167},[33,143526,143527,143529,143531,143533],{"class":35,"line":347},[33,143528,35726],{"class":163},[33,143530,140524],{"class":50},[33,143532,1852],{"class":163},[33,143534,7583],{"class":167},[33,143536,143537,143539,143541,143543,143545,143547,143549,143551,143553,143555],{"class":35,"line":374},[33,143538,35742],{"class":163},[33,143540,16617],{"class":50},[33,143542,602],{"class":167},[33,143544,4059],{"class":163},[33,143546,140541],{"class":54},[33,143548,1115],{"class":50},[33,143550,7602],{"class":167},[33,143552,1121],{"class":50},[33,143554,274],{"class":54},[33,143556,221],{"class":167},[33,143558,143559,143561,143563,143565],{"class":35,"line":397},[33,143560,35726],{"class":163},[33,143562,783],{"class":50},[33,143564,1852],{"class":163},[33,143566,7583],{"class":167},[33,143568,143569,143571,143573,143575,143577,143579,143581,143583,143585,143587],{"class":35,"line":653},[33,143570,35742],{"class":163},[33,143572,16617],{"class":50},[33,143574,602],{"class":167},[33,143576,4059],{"class":163},[33,143578,142085],{"class":54},[33,143580,1115],{"class":50},[33,143582,7602],{"class":167},[33,143584,1121],{"class":50},[33,143586,274],{"class":54},[33,143588,221],{"class":167},[14,143590,39550,143591,143593,143594,143596],{},[30,143592,126093],{}," pattern 
is the correct entry point for the consolidation workflow in ",[940,143595,28119],{"href":28118}," — it returns all sheets without requiring you to enumerate names in advance.",[18,143598,143600,143601,10065,143603],{"id":143599},"step-4-header-and-skiprows","Step 4: ",[30,143602,44427],{},[30,143604,126099],{},[14,143606,143607],{},"Business reports rarely have data starting at row 1. A title block, company name, print date, and blank separator row are common before the real header:",[23,143609,143611],{"className":126,"code":143610,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Fmonthly_report.xlsx\")\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        skiprows=3,        # skip rows 0, 1, 2 (title block)\n        header=0,          # row 3 (now row 0 after skip) becomes the header\n        skipfooter=2,      # ignore last 2 rows (grand-total lines)\n        engine=\"openpyxl\",\n    )\n    print(df.columns.tolist())\nexcept Exception as e:\n    raise SystemExit(f\"Read error: 
{e}\")\n",[30,143612,143613,143617,143627,143637,143641,143654,143658,143664,143672,143678,143691,143704,143717,143727,143731,143737,143747],{"__ignoreMap":28},[33,143614,143615],{"class":35,"line":36},[33,143616,3952],{"class":39},[33,143618,143619,143621,143623,143625],{"class":35,"line":43},[33,143620,190],{"class":163},[33,143622,193],{"class":167},[33,143624,164],{"class":163},[33,143626,198],{"class":167},[33,143628,143629,143631,143633,143635],{"class":35,"line":61},[33,143630,164],{"class":163},[33,143632,492],{"class":167},[33,143634,495],{"class":163},[33,143636,498],{"class":167},[33,143638,143639],{"class":35,"line":73},[33,143640,92],{"emptyLinePlaceholder":91},[33,143642,143643,143645,143647,143649,143652],{"class":35,"line":88},[33,143644,138592],{"class":50},[33,143646,212],{"class":163},[33,143648,215],{"class":167},[33,143650,143651],{"class":54},"\"data\u002Fmonthly_report.xlsx\"",[33,143653,221],{"class":167},[33,143655,143656],{"class":35,"line":95},[33,143657,92],{"emptyLinePlaceholder":91},[33,143659,143660,143662],{"class":35,"line":101},[33,143661,35574],{"class":163},[33,143663,574],{"class":167},[33,143665,143666,143668,143670],{"class":35,"line":171},[33,143667,4025],{"class":167},[33,143669,242],{"class":163},[33,143671,126171],{"class":167},[33,143673,143674,143676],{"class":35,"line":179},[33,143675,138626],{"class":50},[33,143677,247],{"class":167},[33,143679,143680,143682,143684,143686,143688],{"class":35,"line":187},[33,143681,140656],{"class":238},[33,143683,242],{"class":163},[33,143685,10258],{"class":50},[33,143687,89262],{"class":167},[33,143689,143690],{"class":39},"# skip rows 0, 1, 2 (title block)\n",[33,143692,143693,143695,143697,143699,143701],{"class":35,"line":201},[33,143694,126205],{"class":238},[33,143696,242],{"class":163},[33,143698,748],{"class":50},[33,143700,98374],{"class":167},[33,143702,143703],{"class":39},"# row 3 (now row 0 after skip) becomes the 
header\n",[33,143705,143706,143708,143710,143712,143714],{"class":35,"line":206},[33,143707,140683],{"class":238},[33,143709,242],{"class":163},[33,143711,1533],{"class":50},[33,143713,121141],{"class":167},[33,143715,143716],{"class":39},"# ignore last 2 rows (grand-total lines)\n",[33,143718,143719,143721,143723,143725],{"class":35,"line":224},[33,143720,111493],{"class":238},[33,143722,242],{"class":163},[33,143724,17356],{"class":54},[33,143726,247],{"class":167},[33,143728,143729],{"class":35,"line":229},[33,143730,1202],{"class":167},[33,143732,143733,143735],{"class":35,"line":235},[33,143734,7268],{"class":50},[33,143736,140713],{"class":167},[33,143738,143739,143741,143743,143745],{"class":35,"line":250},[33,143740,35726],{"class":163},[33,143742,783],{"class":50},[33,143744,1852],{"class":163},[33,143746,7583],{"class":167},[33,143748,143749,143751,143753,143755,143757,143759,143761,143763,143765,143767],{"class":35,"line":266},[33,143750,35742],{"class":163},[33,143752,16617],{"class":50},[33,143754,602],{"class":167},[33,143756,4059],{"class":163},[33,143758,138740],{"class":54},[33,143760,1115],{"class":50},[33,143762,7602],{"class":167},[33,143764,1121],{"class":50},[33,143766,274],{"class":54},[33,143768,221],{"class":167},[14,143770,143771,143772,143775,143776,3035],{},"To skip specific rows by index (the indices need not be contiguous) — e.g., row 0 is a logo, rows 1–2 are blank, row 3 is a subtitle — pass a list: ",[30,143773,143774],{},"skiprows=[0, 1, 2, 3]",". 
The row immediately after the last skipped row becomes the header if ",[30,143777,142302],{},[18,143779,143781,143782,143784],{"id":143780},"step-5-usecols-limiting-column-scope","Step 5: ",[30,143783,21904],{}," — Limiting Column Scope",[14,143786,143787],{},"Loading only the columns you need can reduce memory use by 70–90% on wide exports:",[23,143789,143791],{"className":126,"code":143790,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Flarge_export.xlsx\")\n\ntry:\n    # By name — most explicit and readable\n    df = pd.read_excel(\n        EXCEL_PATH,\n        usecols=[\"order_id\", \"customer\", \"amount\", \"order_date\"],\n        engine=\"openpyxl\",\n    )\n\n    # By Excel column letter range — useful when column names are unknown\n    df_range = pd.read_excel(EXCEL_PATH, usecols=\"A:D,F\", engine=\"openpyxl\")\n\n    # By callable — keep columns whose name starts with \"Revenue\"\n    df_rev = pd.read_excel(\n        EXCEL_PATH,\n        usecols=lambda c: str(c).startswith(\"Revenue\"),\n        engine=\"openpyxl\",\n    )\n    print(\"Memory:\", df.memory_usage(deep=True).sum() \u002F\u002F 1024, \"KB\")\nexcept Exception as e:\n    raise SystemExit(f\"Read error: 
{e}\")\n",[30,143792,143793,143797,143807,143817,143821,143834,143838,143844,143849,143857,143863,143887,143897,143901,143905,143910,143939,143943,143948,143957,143963,143979,143989,143993,144021,144031],{"__ignoreMap":28},[33,143794,143795],{"class":35,"line":36},[33,143796,3952],{"class":39},[33,143798,143799,143801,143803,143805],{"class":35,"line":43},[33,143800,190],{"class":163},[33,143802,193],{"class":167},[33,143804,164],{"class":163},[33,143806,198],{"class":167},[33,143808,143809,143811,143813,143815],{"class":35,"line":61},[33,143810,164],{"class":163},[33,143812,492],{"class":167},[33,143814,495],{"class":163},[33,143816,498],{"class":167},[33,143818,143819],{"class":35,"line":73},[33,143820,92],{"emptyLinePlaceholder":91},[33,143822,143823,143825,143827,143829,143832],{"class":35,"line":88},[33,143824,138592],{"class":50},[33,143826,212],{"class":163},[33,143828,215],{"class":167},[33,143830,143831],{"class":54},"\"data\u002Flarge_export.xlsx\"",[33,143833,221],{"class":167},[33,143835,143836],{"class":35,"line":95},[33,143837,92],{"emptyLinePlaceholder":91},[33,143839,143840,143842],{"class":35,"line":101},[33,143841,35574],{"class":163},[33,143843,574],{"class":167},[33,143845,143846],{"class":35,"line":171},[33,143847,143848],{"class":39},"    # By name — most explicit and 
readable\n",[33,143850,143851,143853,143855],{"class":35,"line":179},[33,143852,4025],{"class":167},[33,143854,242],{"class":163},[33,143856,126171],{"class":167},[33,143858,143859,143861],{"class":35,"line":187},[33,143860,138626],{"class":50},[33,143862,247],{"class":167},[33,143864,143865,143867,143869,143871,143873,143875,143877,143879,143881,143883,143885],{"class":35,"line":201},[33,143866,140859],{"class":238},[33,143868,242],{"class":163},[33,143870,8309],{"class":167},[33,143872,108849],{"class":54},[33,143874,365],{"class":167},[33,143876,59673],{"class":54},[33,143878,365],{"class":167},[33,143880,4106],{"class":54},[33,143882,365],{"class":167},[33,143884,108767],{"class":54},[33,143886,8935],{"class":167},[33,143888,143889,143891,143893,143895],{"class":35,"line":206},[33,143890,111493],{"class":238},[33,143892,242],{"class":163},[33,143894,17356],{"class":54},[33,143896,247],{"class":167},[33,143898,143899],{"class":35,"line":224},[33,143900,1202],{"class":167},[33,143902,143903],{"class":35,"line":229},[33,143904,92],{"emptyLinePlaceholder":91},[33,143906,143907],{"class":35,"line":235},[33,143908,143909],{"class":39},"    # By Excel column letter range — useful when column names are unknown\n",[33,143911,143912,143914,143916,143918,143920,143922,143924,143926,143929,143931,143933,143935,143937],{"class":35,"line":250},[33,143913,140907],{"class":167},[33,143915,242],{"class":163},[33,143917,126254],{"class":167},[33,143919,138592],{"class":50},[33,143921,365],{"class":167},[33,143923,21904],{"class":238},[33,143925,242],{"class":163},[33,143927,143928],{"class":54},"\"A:D,F\"",[33,143930,365],{"class":167},[33,143932,17351],{"class":238},[33,143934,242],{"class":163},[33,143936,17356],{"class":54},[33,143938,221],{"class":167},[33,143940,143941],{"class":35,"line":266},[33,143942,92],{"emptyLinePlaceholder":91},[33,143944,143945],{"class":35,"line":290},[33,143946,143947],{"class":39},"    # By callable — keep columns whose name starts with 
\"Revenue\"\n",[33,143949,143950,143953,143955],{"class":35,"line":295},[33,143951,143952],{"class":167},"    df_rev ",[33,143954,242],{"class":163},[33,143956,126171],{"class":167},[33,143958,143959,143961],{"class":35,"line":300},[33,143960,138626],{"class":50},[33,143962,247],{"class":167},[33,143964,143965,143967,143969,143971,143973,143975,143977],{"class":35,"line":317},[33,143966,140859],{"class":238},[33,143968,44117],{"class":163},[33,143970,140965],{"class":167},[33,143972,1053],{"class":50},[33,143974,118939],{"class":167},[33,143976,12925],{"class":54},[33,143978,1506],{"class":167},[33,143980,143981,143983,143985,143987],{"class":35,"line":332},[33,143982,111493],{"class":238},[33,143984,242],{"class":163},[33,143986,17356],{"class":54},[33,143988,247],{"class":167},[33,143990,143991],{"class":35,"line":347},[33,143992,1202],{"class":167},[33,143994,143995,143997,143999,144001,144003,144005,144007,144009,144011,144013,144015,144017,144019],{"class":35,"line":374},[33,143996,7268],{"class":50},[33,143998,602],{"class":167},[33,144000,141037],{"class":54},[33,144002,141040],{"class":167},[33,144004,115520],{"class":238},[33,144006,242],{"class":163},[33,144008,855],{"class":50},[33,144010,115527],{"class":167},[33,144012,74328],{"class":163},[33,144014,1159],{"class":50},[33,144016,365],{"class":167},[33,144018,141057],{"class":54},[33,144020,221],{"class":167},[33,144022,144023,144025,144027,144029],{"class":35,"line":397},[33,144024,35726],{"class":163},[33,144026,783],{"class":50},[33,144028,1852],{"class":163},[33,144030,7583],{"class":167},[33,144032,144033,144035,144037,144039,144041,144043,144045,144047,144049,144051],{"class":35,"line":653},[33,144034,35742],{"class":163},[33,144036,16617],{"class":50},[33,144038,602],{"class":167},[33,144040,4059],{"class":163},[33,144042,138740],{"class":54},[33,144044,1115],{"class":50},[33,144046,7602],{"class":167},[33,144048,1121],{"class":50},[33,144050,274],{"class":54},[33,144052,221],{"class":167},[14,14
4054,12951,144055,144057,144058,144060],{},[30,144056,21904],{}," is a list of names, those names must match the header row exactly — including case and leading\u002Ftrailing whitespace. Strip column names after load with ",[30,144059,141099],{}," if the source file has invisible padding.",[18,144062,144064,144065,10065,144067],{"id":144063},"step-6-dtype-and-parse_dates","Step 6: ",[30,144066,23262],{},[30,144068,102641],{},[14,144070,144071,144072,144075],{},"Type inference is lossy for two categories of data: identifiers (IDs become floats) and dates (stored as integers or strings, not converted to ",[30,144073,144074],{},"datetime","). Fix both at load time:",[23,144077,144079],{"className":126,"code":144078,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Forders.xlsx\")\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        dtype={\n            \"order_id\": str,        # keeps \"1001\" not 1001.0\n            \"customer_id\": str,     # preserves leading zeros on codes like \"00123\"\n            \"amount\": float,\n            \"qty\": int,\n        },\n        parse_dates=[\"order_date\", \"ship_date\"],\n        engine=\"openpyxl\",\n    )\n    print(df.dtypes)\n    assert pd.api.types.is_datetime64_any_dtype(df[\"order_date\"]), \"Date parse failed\"\nexcept AssertionError as e:\n    print(f\"Type check: {e}\")\n    # Fallback: coerce after load\n    df[\"order_date\"] = pd.to_datetime(df[\"order_date\"], errors=\"coerce\")\nexcept Exception as e:\n    raise SystemExit(f\"Read error: 
{e}\")\n",[30,144080,144081,144085,144095,144105,144109,144122,144126,144132,144140,144146,144154,144167,144181,144191,144202,144206,144223,144233,144237,144243,144256,144266,144287,144292,144316,144326],{"__ignoreMap":28},[33,144082,144083],{"class":35,"line":36},[33,144084,3952],{"class":39},[33,144086,144087,144089,144091,144093],{"class":35,"line":43},[33,144088,190],{"class":163},[33,144090,193],{"class":167},[33,144092,164],{"class":163},[33,144094,198],{"class":167},[33,144096,144097,144099,144101,144103],{"class":35,"line":61},[33,144098,164],{"class":163},[33,144100,492],{"class":167},[33,144102,495],{"class":163},[33,144104,498],{"class":167},[33,144106,144107],{"class":35,"line":73},[33,144108,92],{"emptyLinePlaceholder":91},[33,144110,144111,144113,144115,144117,144120],{"class":35,"line":88},[33,144112,138592],{"class":50},[33,144114,212],{"class":163},[33,144116,215],{"class":167},[33,144118,144119],{"class":54},"\"data\u002Forders.xlsx\"",[33,144121,221],{"class":167},[33,144123,144124],{"class":35,"line":95},[33,144125,92],{"emptyLinePlaceholder":91},[33,144127,144128,144130],{"class":35,"line":101},[33,144129,35574],{"class":163},[33,144131,574],{"class":167},[33,144133,144134,144136,144138],{"class":35,"line":171},[33,144135,4025],{"class":167},[33,144137,242],{"class":163},[33,144139,126171],{"class":167},[33,144141,144142,144144],{"class":35,"line":179},[33,144143,138626],{"class":50},[33,144145,247],{"class":167},[33,144147,144148,144150,144152],{"class":35,"line":187},[33,144149,125888],{"class":238},[33,144151,242],{"class":163},[33,144153,10225],{"class":167},[33,144155,144156,144158,144160,144162,144164],{"class":35,"line":201},[33,144157,141223],{"class":54},[33,144159,2079],{"class":167},[33,144161,1053],{"class":50},[33,144163,89262],{"class":167},[33,144165,144166],{"class":39},"# keeps \"1001\" not 1001.0\n",[33,144168,144169,144172,144174,144176,144178],{"class":35,"line":206},[33,144170,144171],{"class":54},"            
\"customer_id\"",[33,144173,2079],{"class":167},[33,144175,1053],{"class":50},[33,144177,25539],{"class":167},[33,144179,144180],{"class":39},"# preserves leading zeros on codes like \"00123\"\n",[33,144182,144183,144185,144187,144189],{"class":35,"line":224},[33,144184,141248],{"class":54},[33,144186,2079],{"class":167},[33,144188,1720],{"class":50},[33,144190,247],{"class":167},[33,144192,144193,144196,144198,144200],{"class":35,"line":229},[33,144194,144195],{"class":54},"            \"qty\"",[33,144197,2079],{"class":167},[33,144199,1059],{"class":50},[33,144201,247],{"class":167},[33,144203,144204],{"class":35,"line":235},[33,144205,141259],{"class":167},[33,144207,144208,144210,144212,144214,144216,144218,144221],{"class":35,"line":250},[33,144209,108760],{"class":238},[33,144211,242],{"class":163},[33,144213,8309],{"class":167},[33,144215,108767],{"class":54},[33,144217,365],{"class":167},[33,144219,144220],{"class":54},"\"ship_date\"",[33,144222,8935],{"class":167},[33,144224,144225,144227,144229,144231],{"class":35,"line":266},[33,144226,111493],{"class":238},[33,144228,242],{"class":163},[33,144230,17356],{"class":54},[33,144232,247],{"class":167},[33,144234,144235],{"class":35,"line":290},[33,144236,1202],{"class":167},[33,144238,144239,144241],{"class":35,"line":295},[33,144240,7268],{"class":50},[33,144242,108834],{"class":167},[33,144244,144245,144247,144249,144251,144253],{"class":35,"line":300},[33,144246,9228],{"class":163},[33,144248,141989],{"class":167},[33,144250,108767],{"class":54},[33,144252,54320],{"class":167},[33,144254,144255],{"class":54},"\"Date parse 
failed\"\n",[33,144257,144258,144260,144262,144264],{"class":35,"line":317},[33,144259,35726],{"class":163},[33,144261,9445],{"class":50},[33,144263,1852],{"class":163},[33,144265,7583],{"class":167},[33,144267,144268,144270,144272,144274,144277,144279,144281,144283,144285],{"class":35,"line":332},[33,144269,7268],{"class":50},[33,144271,602],{"class":167},[33,144273,4059],{"class":163},[33,144275,144276],{"class":54},"\"Type check: ",[33,144278,1115],{"class":50},[33,144280,7602],{"class":167},[33,144282,1121],{"class":50},[33,144284,274],{"class":54},[33,144286,221],{"class":167},[33,144288,144289],{"class":35,"line":347},[33,144290,144291],{"class":39},"    # Fallback: coerce after load\n",[33,144293,144294,144296,144298,144300,144302,144304,144306,144308,144310,144312,144314],{"class":35,"line":374},[33,144295,27581],{"class":167},[33,144297,108767],{"class":54},[33,144299,763],{"class":167},[33,144301,242],{"class":163},[33,144303,27668],{"class":167},[33,144305,108767],{"class":54},[33,144307,8314],{"class":167},[33,144309,8317],{"class":238},[33,144311,242],{"class":163},[33,144313,12107],{"class":54},[33,144315,221],{"class":167},[33,144317,144318,144320,144322,144324],{"class":35,"line":397},[33,144319,35726],{"class":163},[33,144321,783],{"class":50},[33,144323,1852],{"class":163},[33,144325,7583],{"class":167},[33,144327,144328,144330,144332,144334,144336,144338,144340,144342,144344,144346],{"class":35,"line":653},[33,144329,35742],{"class":163},[33,144331,16617],{"class":50},[33,144333,602],{"class":167},[33,144335,4059],{"class":163},[33,144337,138740],{"class":54},[33,144339,1115],{"class":50},[33,144341,7602],{"class":167},[33,144343,1121],{"class":50},[33,144345,274],{"class":54},[33,144347,221],{"class":167},[14,144349,41963,144350,144352,144353,144355,144356,144359,144360,144362],{},[30,144351,102641],{}," silently fails (the column stays ",[30,144354,11888],{},"), the Excel cells contain text strings rather than date serials. 
The fallback ",[30,144357,144358],{},"pd.to_datetime(..., errors=\"coerce\")"," converts what it can and inserts ",[30,144361,116202],{}," for values that cannot be parsed, making failures visible rather than silent.",[18,144364,144366],{"id":144365},"step-7-loading-multiple-sheets-into-one-dataframe","Step 7: Loading Multiple Sheets into One DataFrame",[14,144368,144369],{},"Stack identically structured sheets into a single DataFrame before analysis or reporting:",[23,144371,144373],{"className":126,"code":144372,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Fannual_sales.xlsx\")\n\ntry:\n    all_sheets = pd.read_excel(EXCEL_PATH, sheet_name=None, engine=\"openpyxl\")\n\n    frames = []\n    for name, df in all_sheets.items():\n        df = df.dropna(how=\"all\")       # drop rows that are entirely blank\n        df[\"quarter\"] = name            # tag source sheet\n        frames.append(df)\n\n    if not frames:\n        raise ValueError(\"No sheets with data found\")\n\n    combined = pd.concat(frames, ignore_index=True)\n    print(f\"Combined: {combined.shape[0]} rows across {len(frames)} sheets\")\nexcept Exception as e:\n    raise SystemExit(f\"Concat error: 
{e}\")\n",[30,144374,144375,144379,144389,144399,144403,144416,144420,144426,144454,144458,144466,144476,144495,144512,144516,144520,144528,144541,144545,144561,144593,144603],{"__ignoreMap":28},[33,144376,144377],{"class":35,"line":36},[33,144378,3952],{"class":39},[33,144380,144381,144383,144385,144387],{"class":35,"line":43},[33,144382,190],{"class":163},[33,144384,193],{"class":167},[33,144386,164],{"class":163},[33,144388,198],{"class":167},[33,144390,144391,144393,144395,144397],{"class":35,"line":61},[33,144392,164],{"class":163},[33,144394,492],{"class":167},[33,144396,495],{"class":163},[33,144398,498],{"class":167},[33,144400,144401],{"class":35,"line":73},[33,144402,92],{"emptyLinePlaceholder":91},[33,144404,144405,144407,144409,144411,144414],{"class":35,"line":88},[33,144406,138592],{"class":50},[33,144408,212],{"class":163},[33,144410,215],{"class":167},[33,144412,144413],{"class":54},"\"data\u002Fannual_sales.xlsx\"",[33,144415,221],{"class":167},[33,144417,144418],{"class":35,"line":95},[33,144419,92],{"emptyLinePlaceholder":91},[33,144421,144422,144424],{"class":35,"line":101},[33,144423,35574],{"class":163},[33,144425,574],{"class":167},[33,144427,144428,144430,144432,144434,144436,144438,144440,144442,144444,144446,144448,144450,144452],{"class":35,"line":171},[33,144429,126249],{"class":167},[33,144431,242],{"class":163},[33,144433,126254],{"class":167},[33,144435,138592],{"class":50},[33,144437,365],{"class":167},[33,144439,17371],{"class":238},[33,144441,242],{"class":163},[33,144443,571],{"class":50},[33,144445,365],{"class":167},[33,144447,17351],{"class":238},[33,144449,242],{"class":163},[33,144451,17356],{"class":54},[33,144453,221],{"class":167},[33,144455,144456],{"class":35,"line":179},[33,144457,92],{"emptyLinePlaceholder":91},[33,144459,144460,144462,144464],{"class":35,"line":187},[33,144461,584],{"class":167},[33,144463,242],{"class":163},[33,144465,589],{"class":167},[33,144467,144468,144470,144472,144474],{"class":35,"line":201},[
33,144469,656],{"class":163},[33,144471,14253],{"class":167},[33,144473,662],{"class":163},[33,144475,141592],{"class":167},[33,144477,144478,144480,144482,144484,144486,144488,144490,144492],{"class":35,"line":206},[33,144479,7930],{"class":167},[33,144481,242],{"class":163},[33,144483,114425],{"class":167},[33,144485,28045],{"class":238},[33,144487,242],{"class":163},[33,144489,35616],{"class":54},[33,144491,8815],{"class":167},[33,144493,144494],{"class":39},"# drop rows that are entirely blank\n",[33,144496,144497,144499,144502,144504,144506,144509],{"class":35,"line":224},[33,144498,10902],{"class":167},[33,144500,144501],{"class":54},"\"quarter\"",[33,144503,763],{"class":167},[33,144505,242],{"class":163},[33,144507,144508],{"class":167}," name            ",[33,144510,144511],{"class":39},"# tag source sheet\n",[33,144513,144514],{"class":35,"line":229},[33,144515,10929],{"class":167},[33,144517,144518],{"class":35,"line":235},[33,144519,92],{"emptyLinePlaceholder":91},[33,144521,144522,144524,144526],{"class":35,"line":250},[33,144523,617],{"class":163},[33,144525,620],{"class":163},[33,144527,816],{"class":167},[33,144529,144530,144532,144534,144536,144539],{"class":35,"line":266},[33,144531,4051],{"class":163},[33,144533,4054],{"class":50},[33,144535,602],{"class":167},[33,144537,144538],{"class":54},"\"No sheets with data 
found\"",[33,144540,221],{"class":167},[33,144542,144543],{"class":35,"line":290},[33,144544,92],{"emptyLinePlaceholder":91},[33,144546,144547,144549,144551,144553,144555,144557,144559],{"class":35,"line":295},[33,144548,842],{"class":167},[33,144550,242],{"class":163},[33,144552,847],{"class":167},[33,144554,850],{"class":238},[33,144556,242],{"class":163},[33,144558,855],{"class":50},[33,144560,221],{"class":167},[33,144562,144563,144565,144567,144569,144571,144573,144575,144577,144579,144581,144583,144585,144587,144589,144591],{"class":35,"line":300},[33,144564,7268],{"class":50},[33,144566,602],{"class":167},[33,144568,4059],{"class":163},[33,144570,141688],{"class":54},[33,144572,1115],{"class":50},[33,144574,16013],{"class":167},[33,144576,748],{"class":50},[33,144578,9546],{"class":167},[33,144580,1121],{"class":50},[33,144582,141701],{"class":54},[33,144584,4065],{"class":50},[33,144586,8147],{"class":167},[33,144588,1121],{"class":50},[33,144590,141710],{"class":54},[33,144592,221],{"class":167},[33,144594,144595,144597,144599,144601],{"class":35,"line":317},[33,144596,35726],{"class":163},[33,144598,783],{"class":50},[33,144600,1852],{"class":163},[33,144602,7583],{"class":167},[33,144604,144605,144607,144609,144611,144613,144616,144618,144620,144622,144624],{"class":35,"line":332},[33,144606,35742],{"class":163},[33,144608,16617],{"class":50},[33,144610,602],{"class":167},[33,144612,4059],{"class":163},[33,144614,144615],{"class":54},"\"Concat error: ",[33,144617,1115],{"class":50},[33,144619,7602],{"class":167},[33,144621,1121],{"class":50},[33,144623,274],{"class":54},[33,144625,221],{"class":167},[18,144627,144629],{"id":144628},"step-8-reading-with-openpyxl-directly","Step 8: Reading with openpyxl Directly",[14,144631,17059,144632,144634,144635,144637,144638,144640],{},[30,144633,22009],{}," directly — instead of via ",[30,144636,9630],{}," — when you need data that ",[30,144639,9630],{}," discards: cell background colors, font styles, comment text, 
formula strings (not results), or merged cell geometry.",[23,144642,144644],{"className":126,"code":144643,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nEXCEL_PATH = Path(\"data\u002Fstyled_report.xlsx\")\n\ntry:\n    # data_only=True returns cached formula results, not formula strings\n    # read_only=True streams rows without loading full workbook into RAM\n    wb = openpyxl.load_workbook(EXCEL_PATH, read_only=True, data_only=True)\n    ws = wb[\"Summary\"]\n\n    rows = []\n    for row in ws.iter_rows(min_row=2, values_only=True):\n        rows.append(row)\n\n    wb.close()\n    print(f\"Read {len(rows)} data rows via openpyxl\")\nexcept FileNotFoundError:\n    raise SystemExit(f\"File not found: {EXCEL_PATH}\")\nexcept KeyError as e:\n    raise SystemExit(f\"Sheet not found: {e}\")\n",[30,144645,144646,144650,144660,144666,144670,144683,144687,144693,144698,144703,144733,144745,144749,144757,144783,144788,144792,144796,144817,144825,144843,144853],{"__ignoreMap":28},[33,144647,144648],{"class":35,"line":36},[33,144649,98209],{"class":39},[33,144651,144652,144654,144656,144658],{"class":35,"line":43},[33,144653,190],{"class":163},[33,144655,193],{"class":167},[33,144657,164],{"class":163},[33,144659,198],{"class":167},[33,144661,144662,144664],{"class":35,"line":61},[33,144663,164],{"class":163},[33,144665,95887],{"class":167},[33,144667,144668],{"class":35,"line":73},[33,144669,92],{"emptyLinePlaceholder":91},[33,144671,144672,144674,144676,144678,144681],{"class":35,"line":88},[33,144673,138592],{"class":50},[33,144675,212],{"class":163},[33,144677,215],{"class":167},[33,144679,144680],{"class":54},"\"data\u002Fstyled_report.xlsx\"",[33,144682,221],{"class":167},[33,144684,144685],{"class":35,"line":95},[33,144686,92],{"emptyLinePlaceholder":91},[33,144688,144689,144691],{"class":35,"line":101},[33,144690,35574],{"class":163},[33,144692,574],{"class":167},[33,144694,144695],{"class":35,"line":171},[33,
144696,144697],{"class":39},"    # data_only=True returns cached formula results, not formula strings\n",[33,144699,144700],{"class":35,"line":179},[33,144701,144702],{"class":39},"    # read_only=True streams rows without loading full workbook into RAM\n",[33,144704,144705,144707,144709,144712,144714,144716,144718,144720,144722,144724,144727,144729,144731],{"class":35,"line":187},[33,144706,17432],{"class":167},[33,144708,242],{"class":163},[33,144710,144711],{"class":167}," openpyxl.load_workbook(",[33,144713,138592],{"class":50},[33,144715,365],{"class":167},[33,144717,98285],{"class":238},[33,144719,242],{"class":163},[33,144721,855],{"class":50},[33,144723,365],{"class":167},[33,144725,144726],{"class":238},"data_only",[33,144728,242],{"class":163},[33,144730,855],{"class":50},[33,144732,221],{"class":167},[33,144734,144735,144737,144739,144741,144743],{"class":35,"line":201},[33,144736,17442],{"class":167},[33,144738,242],{"class":163},[33,144740,17447],{"class":167},[33,144742,103086],{"class":54},[33,144744,9202],{"class":167},[33,144746,144747],{"class":35,"line":206},[33,144748,92],{"emptyLinePlaceholder":91},[33,144750,144751,144753,144755],{"class":35,"line":224},[33,144752,44390],{"class":167},[33,144754,242],{"class":163},[33,144756,589],{"class":167},[33,144758,144759,144761,144763,144765,144767,144769,144771,144773,144775,144777,144779,144781],{"class":35,"line":229},[33,144760,656],{"class":163},[33,144762,3844],{"class":167},[33,144764,662],{"class":163},[33,144766,17639],{"class":167},[33,144768,17642],{"class":238},[33,144770,242],{"class":163},[33,144772,1533],{"class":50},[33,144774,365],{"class":167},[33,144776,98352],{"class":238},[33,144778,242],{"class":163},[33,144780,855],{"class":50},[33,144782,1737],{"class":167},[33,144784,144785],{"class":35,"line":235},[33,144786,144787],{"class":167},"        
rows.append(row)\n",[33,144789,144790],{"class":35,"line":250},[33,144791,92],{"emptyLinePlaceholder":91},[33,144793,144794],{"class":35,"line":266},[33,144795,99958],{"class":167},[33,144797,144798,144800,144802,144804,144806,144808,144810,144812,144815],{"class":35,"line":290},[33,144799,7268],{"class":50},[33,144801,602],{"class":167},[33,144803,4059],{"class":163},[33,144805,128016],{"class":54},[33,144807,4065],{"class":50},[33,144809,44449],{"class":167},[33,144811,1121],{"class":50},[33,144813,144814],{"class":54}," data rows via openpyxl\"",[33,144816,221],{"class":167},[33,144818,144819,144821,144823],{"class":35,"line":295},[33,144820,35726],{"class":163},[33,144822,2945],{"class":50},[33,144824,574],{"class":167},[33,144826,144827,144829,144831,144833,144835,144837,144839,144841],{"class":35,"line":300},[33,144828,35742],{"class":163},[33,144830,16617],{"class":50},[33,144832,602],{"class":167},[33,144834,4059],{"class":163},[33,144836,15677],{"class":54},[33,144838,138680],{"class":50},[33,144840,274],{"class":54},[33,144842,221],{"class":167},[33,144844,144845,144847,144849,144851],{"class":35,"line":317},[33,144846,35726],{"class":163},[33,144848,140524],{"class":50},[33,144850,1852],{"class":163},[33,144852,7583],{"class":167},[33,144854,144855,144857,144859,144861,144863,144865,144867,144869,144871,144873],{"class":35,"line":332},[33,144856,35742],{"class":163},[33,144858,16617],{"class":50},[33,144860,602],{"class":167},[33,144862,4059],{"class":163},[33,144864,140541],{"class":54},[33,144866,1115],{"class":50},[33,144868,7602],{"class":167},[33,144870,1121],{"class":50},[33,144872,274],{"class":54},[33,144874,221],{"class":167},[14,144876,144877,144879,144880,10065,144882,144884,144885,3035],{},[30,144878,97854],{}," streams rows without loading the full workbook into RAM — essential for files over 50 MB. 
Note that ",[30,144881,97854],{},[30,144883,105730],{}," cannot both access cell styles; for styling metadata, open without ",[30,144886,98285],{},[18,144888,2709],{"id":2708},[424,144890,144892],{"id":144891},"xlsb-binary-format","xlsb Binary Format",[14,144894,144895,144897,144898,144900,144901,20891],{},[30,144896,142704],{}," files are not supported by ",[30,144899,22009],{},". The only engine that handles them is ",[30,144902,139236],{},[23,144904,144906],{"className":126,"code":144905,"language":47,"meta":28,"style":28},"# pip install pandas python-calamine\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Flarge_model.xlsb\")\n\ntry:\n    df = pd.read_excel(EXCEL_PATH, engine=\"calamine\")\n    print(df.shape)\nexcept ImportError:\n    raise SystemExit(\"Run: pip install python-calamine\")\nexcept Exception as e:\n    raise SystemExit(f\"Read error: {e}\")\n",[30,144907,144908,144913,144923,144933,144937,144950,144954,144960,144980,144986,144994,145006,145016],{"__ignoreMap":28},[33,144909,144910],{"class":35,"line":36},[33,144911,144912],{"class":39},"# pip install pandas 
python-calamine\n",[33,144914,144915,144917,144919,144921],{"class":35,"line":43},[33,144916,190],{"class":163},[33,144918,193],{"class":167},[33,144920,164],{"class":163},[33,144922,198],{"class":167},[33,144924,144925,144927,144929,144931],{"class":35,"line":61},[33,144926,164],{"class":163},[33,144928,492],{"class":167},[33,144930,495],{"class":163},[33,144932,498],{"class":167},[33,144934,144935],{"class":35,"line":73},[33,144936,92],{"emptyLinePlaceholder":91},[33,144938,144939,144941,144943,144945,144948],{"class":35,"line":88},[33,144940,138592],{"class":50},[33,144942,212],{"class":163},[33,144944,215],{"class":167},[33,144946,144947],{"class":54},"\"data\u002Flarge_model.xlsb\"",[33,144949,221],{"class":167},[33,144951,144952],{"class":35,"line":95},[33,144953,92],{"emptyLinePlaceholder":91},[33,144955,144956,144958],{"class":35,"line":101},[33,144957,35574],{"class":163},[33,144959,574],{"class":167},[33,144961,144962,144964,144966,144968,144970,144972,144974,144976,144978],{"class":35,"line":171},[33,144963,4025],{"class":167},[33,144965,242],{"class":163},[33,144967,126254],{"class":167},[33,144969,138592],{"class":50},[33,144971,365],{"class":167},[33,144973,17351],{"class":238},[33,144975,242],{"class":163},[33,144977,139319],{"class":54},[33,144979,221],{"class":167},[33,144981,144982,144984],{"class":35,"line":179},[33,144983,7268],{"class":50},[33,144985,39529],{"class":167},[33,144987,144988,144990,144992],{"class":35,"line":187},[33,144989,35726],{"class":163},[33,144991,40488],{"class":50},[33,144993,574],{"class":167},[33,144995,144996,144998,145000,145002,145004],{"class":35,"line":201},[33,144997,35742],{"class":163},[33,144999,16617],{"class":50},[33,145001,602],{"class":167},[33,145003,139353],{"class":54},[33,145005,221],{"class":167},[33,145007,145008,145010,145012,145014],{"class":35,"line":206},[33,145009,35726],{"class":163},[33,145011,783],{"class":50},[33,145013,1852],{"class":163},[33,145015,7583],{"class":167},[33,145017,145018,1450
20,145022,145024,145026,145028,145030,145032,145034,145036],{"class":35,"line":224},[33,145019,35742],{"class":163},[33,145021,16617],{"class":50},[33,145023,602],{"class":167},[33,145025,4059],{"class":163},[33,145027,138740],{"class":54},[33,145029,1115],{"class":50},[33,145031,7602],{"class":167},[33,145033,1121],{"class":50},[33,145035,274],{"class":54},[33,145037,221],{"class":167},[424,145039,145041],{"id":145040},"multi-level-column-headers","Multi-level Column Headers",[14,145043,145044,145045,20891],{},"Reports exported from pivot tables often have two header rows forming a ",[30,145046,145047],{},"MultiIndex",[23,145049,145051],{"className":126,"code":145050,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Fpivot_export.xlsx\")\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        header=[0, 1],      # rows 0 and 1 both contribute to the column index\n        engine=\"openpyxl\",\n    )\n    print(df.columns)       # MultiIndex tuples like ('Revenue', 'Q1')\n    # Flatten to single-level\n    df.columns = [\"_\".join(str(c) for c in col).strip() for col in df.columns]\n    print(df.columns.tolist())\nexcept Exception as e:\n    raise SystemExit(f\"Read error: 
{e}\")\n",[30,145052,145053,145057,145067,145077,145081,145094,145098,145104,145112,145118,145138,145148,145152,145162,145167,145201,145207,145217],{"__ignoreMap":28},[33,145054,145055],{"class":35,"line":36},[33,145056,3952],{"class":39},[33,145058,145059,145061,145063,145065],{"class":35,"line":43},[33,145060,190],{"class":163},[33,145062,193],{"class":167},[33,145064,164],{"class":163},[33,145066,198],{"class":167},[33,145068,145069,145071,145073,145075],{"class":35,"line":61},[33,145070,164],{"class":163},[33,145072,492],{"class":167},[33,145074,495],{"class":163},[33,145076,498],{"class":167},[33,145078,145079],{"class":35,"line":73},[33,145080,92],{"emptyLinePlaceholder":91},[33,145082,145083,145085,145087,145089,145092],{"class":35,"line":88},[33,145084,138592],{"class":50},[33,145086,212],{"class":163},[33,145088,215],{"class":167},[33,145090,145091],{"class":54},"\"data\u002Fpivot_export.xlsx\"",[33,145093,221],{"class":167},[33,145095,145096],{"class":35,"line":95},[33,145097,92],{"emptyLinePlaceholder":91},[33,145099,145100,145102],{"class":35,"line":101},[33,145101,35574],{"class":163},[33,145103,574],{"class":167},[33,145105,145106,145108,145110],{"class":35,"line":171},[33,145107,4025],{"class":167},[33,145109,242],{"class":163},[33,145111,126171],{"class":167},[33,145113,145114,145116],{"class":35,"line":179},[33,145115,138626],{"class":50},[33,145117,247],{"class":167},[33,145119,145120,145122,145124,145126,145128,145130,145132,145135],{"class":35,"line":187},[33,145121,126205],{"class":238},[33,145123,242],{"class":163},[33,145125,8309],{"class":167},[33,145127,748],{"class":50},[33,145129,365],{"class":167},[33,145131,734],{"class":50},[33,145133,145134],{"class":167},"],      ",[33,145136,145137],{"class":39},"# rows 0 and 1 both contribute to the column 
index\n",[33,145139,145140,145142,145144,145146],{"class":35,"line":201},[33,145141,111493],{"class":238},[33,145143,242],{"class":163},[33,145145,17356],{"class":54},[33,145147,247],{"class":167},[33,145149,145150],{"class":35,"line":206},[33,145151,1202],{"class":167},[33,145153,145154,145156,145159],{"class":35,"line":224},[33,145155,7268],{"class":50},[33,145157,145158],{"class":167},"(df.columns)       ",[33,145160,145161],{"class":39},"# MultiIndex tuples like ('Revenue', 'Q1')\n",[33,145163,145164],{"class":35,"line":229},[33,145165,145166],{"class":39},"    # Flatten to single-level\n",[33,145168,145169,145171,145173,145175,145177,145180,145182,145184,145186,145188,145190,145193,145195,145197,145199],{"class":35,"line":235},[33,145170,27546],{"class":167},[33,145172,242],{"class":163},[33,145174,9178],{"class":167},[33,145176,7764],{"class":54},[33,145178,145179],{"class":167},".join(",[33,145181,1053],{"class":50},[33,145183,68507],{"class":167},[33,145185,6124],{"class":163},[33,145187,7486],{"class":167},[33,145189,662],{"class":163},[33,145191,145192],{"class":167}," col).strip() 
",[33,145194,6124],{"class":163},[33,145196,7985],{"class":167},[33,145198,662],{"class":163},[33,145200,12624],{"class":167},[33,145202,145203,145205],{"class":35,"line":250},[33,145204,7268],{"class":50},[33,145206,140713],{"class":167},[33,145208,145209,145211,145213,145215],{"class":35,"line":266},[33,145210,35726],{"class":163},[33,145212,783],{"class":50},[33,145214,1852],{"class":163},[33,145216,7583],{"class":167},[33,145218,145219,145221,145223,145225,145227,145229,145231,145233,145235,145237],{"class":35,"line":290},[33,145220,35742],{"class":163},[33,145222,16617],{"class":50},[33,145224,602],{"class":167},[33,145226,4059],{"class":163},[33,145228,138740],{"class":54},[33,145230,1115],{"class":50},[33,145232,7602],{"class":167},[33,145234,1121],{"class":50},[33,145236,274],{"class":54},[33,145238,221],{"class":167},[424,145240,145242],{"id":145241},"merged-cell-headers","Merged Cell Headers",[14,145244,145245,145246,145248],{},"Merged cells export a value only in the top-left cell; adjacent cells in the merge read as ",[30,145247,8884],{},". 
Forward-fill restores the intended labels:",[23,145250,145252],{"className":126,"code":145251,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Fmerged_headers.xlsx\")\n\ntry:\n    df = pd.read_excel(EXCEL_PATH, header=None, engine=\"openpyxl\")\n    df.iloc[0] = df.iloc[0].ffill()    # propagate merged labels rightward\n    df.columns = df.iloc[0]\n    df = df.iloc[1:].reset_index(drop=True)\n    print(df.columns.tolist())\nexcept Exception as e:\n    raise SystemExit(f\"Read error: {e}\")\n",[30,145253,145254,145258,145268,145278,145282,145295,145299,145305,145333,145354,145366,145386,145392,145402],{"__ignoreMap":28},[33,145255,145256],{"class":35,"line":36},[33,145257,3952],{"class":39},[33,145259,145260,145262,145264,145266],{"class":35,"line":43},[33,145261,190],{"class":163},[33,145263,193],{"class":167},[33,145265,164],{"class":163},[33,145267,198],{"class":167},[33,145269,145270,145272,145274,145276],{"class":35,"line":61},[33,145271,164],{"class":163},[33,145273,492],{"class":167},[33,145275,495],{"class":163},[33,145277,498],{"class":167},[33,145279,145280],{"class":35,"line":73},[33,145281,92],{"emptyLinePlaceholder":91},[33,145283,145284,145286,145288,145290,145293],{"class":35,"line":88},[33,145285,138592],{"class":50},[33,145287,212],{"class":163},[33,145289,215],{"class":167},[33,145291,145292],{"class":54},"\"data\u002Fmerged_headers.xlsx\"",[33,145294,221],{"class":167},[33,145296,145297],{"class":35,"line":95},[33,145298,92],{"emptyLinePlaceholder":91},[33,145300,145301,145303],{"class":35,"line":101},[33,145302,35574],{"class":163},[33,145304,574],{"class":167},[33,145306,145307,145309,145311,145313,145315,145317,145319,145321,145323,145325,145327,145329,145331],{"class":35,"line":171},[33,145308,4025],{"class":167},[33,145310,242],{"class":163},[33,145312,126254],{"class":167},[33,145314,138592],{"class":50},[33,145316,365],{"class":167},[33,145318,44
427],{"class":238},[33,145320,242],{"class":163},[33,145322,571],{"class":50},[33,145324,365],{"class":167},[33,145326,17351],{"class":238},[33,145328,242],{"class":163},[33,145330,17356],{"class":54},[33,145332,221],{"class":167},[33,145334,145335,145338,145340,145342,145344,145346,145348,145351],{"class":35,"line":179},[33,145336,145337],{"class":167},"    df.iloc[",[33,145339,748],{"class":50},[33,145341,763],{"class":167},[33,145343,242],{"class":163},[33,145345,10847],{"class":167},[33,145347,748],{"class":50},[33,145349,145350],{"class":167},"].ffill()    ",[33,145352,145353],{"class":39},"# propagate merged labels rightward\n",[33,145355,145356,145358,145360,145362,145364],{"class":35,"line":187},[33,145357,27546],{"class":167},[33,145359,242],{"class":163},[33,145361,10847],{"class":167},[33,145363,748],{"class":50},[33,145365,9202],{"class":167},[33,145367,145368,145370,145372,145374,145376,145378,145380,145382,145384],{"class":35,"line":201},[33,145369,4025],{"class":167},[33,145371,242],{"class":163},[33,145373,10847],{"class":167},[33,145375,734],{"class":50},[33,145377,10865],{"class":167},[33,145379,10868],{"class":238},[33,145381,242],{"class":163},[33,145383,855],{"class":50},[33,145385,221],{"class":167},[33,145387,145388,145390],{"class":35,"line":206},[33,145389,7268],{"class":50},[33,145391,140713],{"class":167},[33,145393,145394,145396,145398,145400],{"class":35,"line":224},[33,145395,35726],{"class":163},[33,145397,783],{"class":50},[33,145399,1852],{"class":163},[33,145401,7583],{"class":167},[33,145403,145404,145406,145408,145410,145412,145414,145416,145418,145420,145422],{"class":35,"line":229},[33,145405,35742],{"class":163},[33,145407,16617],{"class":50},[33,145409,602],{"class":167},[33,145411,4059],{"class":163},[33,145413,138740],{"class":54},[33,145415,1115],{"class":50},[33,145417,7602],{"class":167},[33,145419,1121],{"class":50},[33,145421,274],{"class":54},[33,145423,221],{"class":167},[424,145425,145427],{"id":145426},"reading-from
-a-bytesio-buffer","Reading from a BytesIO Buffer",[14,145429,145430],{},"When Excel files arrive over HTTP, from an email attachment, or from S3, avoid writing to disk first:",[23,145432,145434],{"className":126,"code":145433,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl requests\nimport io\nfrom pathlib import Path\nimport pandas as pd\nimport requests\n\nURL = \"https:\u002F\u002Fexample.com\u002Fdata\u002Freport.xlsx\"\n\ntry:\n    response = requests.get(URL, timeout=30)\n    response.raise_for_status()\n    buf = io.BytesIO(response.content)\n    df = pd.read_excel(buf, engine=\"openpyxl\")\n    print(df.shape)\nexcept requests.RequestException as e:\n    raise SystemExit(f\"Download failed: {e}\")\nexcept Exception as e:\n    raise SystemExit(f\"Parse error: {e}\")\n",[30,145435,145436,145441,145447,145457,145467,145474,145478,145488,145492,145498,145520,145525,145534,145551,145557,145568,145591,145601],{"__ignoreMap":28},[33,145437,145438],{"class":35,"line":36},[33,145439,145440],{"class":39},"# pip install pandas openpyxl requests\n",[33,145442,145443,145445],{"class":35,"line":43},[33,145444,164],{"class":163},[33,145446,60058],{"class":167},[33,145448,145449,145451,145453,145455],{"class":35,"line":61},[33,145450,190],{"class":163},[33,145452,193],{"class":167},[33,145454,164],{"class":163},[33,145456,198],{"class":167},[33,145458,145459,145461,145463,145465],{"class":35,"line":73},[33,145460,164],{"class":163},[33,145462,492],{"class":167},[33,145464,495],{"class":163},[33,145466,498],{"class":167},[33,145468,145469,145471],{"class":35,"line":88},[33,145470,164],{"class":163},[33,145472,145473],{"class":167}," requests\n",[33,145475,145476],{"class":35,"line":95},[33,145477,92],{"emptyLinePlaceholder":91},[33,145479,145480,145483,145485],{"class":35,"line":101},[33,145481,145482],{"class":50},"URL",[33,145484,212],{"class":163},[33,145486,145487],{"class":54}," 
\"https:\u002F\u002Fexample.com\u002Fdata\u002Freport.xlsx\"\n",[33,145489,145490],{"class":35,"line":171},[33,145491,92],{"emptyLinePlaceholder":91},[33,145493,145494,145496],{"class":35,"line":179},[33,145495,35574],{"class":163},[33,145497,574],{"class":167},[33,145499,145500,145503,145505,145508,145510,145512,145514,145516,145518],{"class":35,"line":187},[33,145501,145502],{"class":167},"    response ",[33,145504,242],{"class":163},[33,145506,145507],{"class":167}," requests.get(",[33,145509,145482],{"class":50},[33,145511,365],{"class":167},[33,145513,1641],{"class":238},[33,145515,242],{"class":163},[33,145517,1543],{"class":50},[33,145519,221],{"class":167},[33,145521,145522],{"class":35,"line":201},[33,145523,145524],{"class":167},"    response.raise_for_status()\n",[33,145526,145527,145529,145531],{"class":35,"line":206},[33,145528,61913],{"class":167},[33,145530,242],{"class":163},[33,145532,145533],{"class":167}," io.BytesIO(response.content)\n",[33,145535,145536,145538,145540,145543,145545,145547,145549],{"class":35,"line":224},[33,145537,4025],{"class":167},[33,145539,242],{"class":163},[33,145541,145542],{"class":167}," pd.read_excel(buf, ",[33,145544,17351],{"class":238},[33,145546,242],{"class":163},[33,145548,17356],{"class":54},[33,145550,221],{"class":167},[33,145552,145553,145555],{"class":35,"line":229},[33,145554,7268],{"class":50},[33,145556,39529],{"class":167},[33,145558,145559,145561,145564,145566],{"class":35,"line":235},[33,145560,35726],{"class":163},[33,145562,145563],{"class":167}," requests.RequestException ",[33,145565,495],{"class":163},[33,145567,7583],{"class":167},[33,145569,145570,145572,145574,145576,145578,145581,145583,145585,145587,145589],{"class":35,"line":250},[33,145571,35742],{"class":163},[33,145573,16617],{"class":50},[33,145575,602],{"class":167},[33,145577,4059],{"class":163},[33,145579,145580],{"class":54},"\"Download failed: 
",[33,145582,1115],{"class":50},[33,145584,7602],{"class":167},[33,145586,1121],{"class":50},[33,145588,274],{"class":54},[33,145590,221],{"class":167},[33,145592,145593,145595,145597,145599],{"class":35,"line":266},[33,145594,35726],{"class":163},[33,145596,783],{"class":50},[33,145598,1852],{"class":163},[33,145600,7583],{"class":167},[33,145602,145603,145605,145607,145609,145611,145613,145615,145617,145619,145621],{"class":35,"line":290},[33,145604,35742],{"class":163},[33,145606,16617],{"class":50},[33,145608,602],{"class":167},[33,145610,4059],{"class":163},[33,145612,124795],{"class":54},[33,145614,1115],{"class":50},[33,145616,7602],{"class":167},[33,145618,1121],{"class":50},[33,145620,274],{"class":54},[33,145622,221],{"class":167},[14,145624,145625,145626,145629,145630,145632,145633,145635,145636,3035],{},"The same pattern works with ",[30,145627,145628],{},"boto3"," for S3: fetch the object body as ",[30,145631,1620],{},", wrap in ",[30,145634,61504],{},", pass to ",[30,145637,57240],{},[18,145639,52030],{"id":52029},[14,145641,145642],{},"Validate immediately after load, before any downstream processing:",[23,145644,145646],{"className":126,"code":145645,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport pandas as pd\n\nEXCEL_PATH = Path(\"data\u002Forders.xlsx\")\nREQUIRED_COLS = {\"order_id\", \"customer_id\", \"amount\", \"order_date\"}\n\ntry:\n    df = pd.read_excel(\n        EXCEL_PATH,\n        dtype={\"order_id\": str, \"customer_id\": str},\n        parse_dates=[\"order_date\"],\n        engine=\"openpyxl\",\n    )\n\n    missing = REQUIRED_COLS - set(df.columns)\n    assert not missing, f\"Missing columns: {missing}\"\n    assert df.shape[0] > 0, \"DataFrame is empty\"\n    assert df[\"amount\"].dtype in (float, \"float64\"), \"amount not numeric\"\n    assert pd.api.types.is_datetime64_any_dtype(df[\"order_date\"]), \"order_date not parsed\"\n    assert not df[\"order_id\"].isna().any(), 
\"Null order_ids present\"\n\n    print(f\"Validation passed — {df.shape[0]} rows\")\nexcept AssertionError as e:\n    raise SystemExit(f\"Validation failed: {e}\")\nexcept Exception as e:\n    raise SystemExit(f\"Load error: {e}\")\n",[30,145647,145648,145652,145662,145672,145676,145688,145712,145716,145722,145730,145736,145760,145772,145782,145786,145790,145804,145825,145843,145869,145882,145897,145901,145925,145935,145957,145967],{"__ignoreMap":28},[33,145649,145650],{"class":35,"line":36},[33,145651,3952],{"class":39},[33,145653,145654,145656,145658,145660],{"class":35,"line":43},[33,145655,190],{"class":163},[33,145657,193],{"class":167},[33,145659,164],{"class":163},[33,145661,198],{"class":167},[33,145663,145664,145666,145668,145670],{"class":35,"line":61},[33,145665,164],{"class":163},[33,145667,492],{"class":167},[33,145669,495],{"class":163},[33,145671,498],{"class":167},[33,145673,145674],{"class":35,"line":73},[33,145675,92],{"emptyLinePlaceholder":91},[33,145677,145678,145680,145682,145684,145686],{"class":35,"line":88},[33,145679,138592],{"class":50},[33,145681,212],{"class":163},[33,145683,215],{"class":167},[33,145685,144119],{"class":54},[33,145687,221],{"class":167},[33,145689,145690,145692,145694,145696,145698,145700,145702,145704,145706,145708,145710],{"class":35,"line":95},[33,145691,141809],{"class":50},[33,145693,212],{"class":163},[33,145695,4098],{"class":167},[33,145697,108849],{"class":54},[33,145699,365],{"class":167},[33,145701,125895],{"class":54},[33,145703,365],{"class":167},[33,145705,4106],{"class":54},[33,145707,365],{"class":167},[33,145709,108767],{"class":54},[33,145711,4113],{"class":167},[33,145713,145714],{"class":35,"line":101},[33,145715,92],{"emptyLinePlaceholder":91},[33,145717,145718,145720],{"class":35,"line":171},[33,145719,35574],{"class":163},[33,145721,574],{"class":167},[33,145723,145724,145726,145728],{"class":35,"line":179},[33,145725,4025],{"class":167},[33,145727,242],{"class":163},[33,145729,126171],{"class":1
67},[33,145731,145732,145734],{"class":35,"line":187},[33,145733,138626],{"class":50},[33,145735,247],{"class":167},[33,145737,145738,145740,145742,145744,145746,145748,145750,145752,145754,145756,145758],{"class":35,"line":201},[33,145739,125888],{"class":238},[33,145741,242],{"class":163},[33,145743,1115],{"class":167},[33,145745,108849],{"class":54},[33,145747,2079],{"class":167},[33,145749,1053],{"class":50},[33,145751,365],{"class":167},[33,145753,125895],{"class":54},[33,145755,2079],{"class":167},[33,145757,1053],{"class":50},[33,145759,3509],{"class":167},[33,145761,145762,145764,145766,145768,145770],{"class":35,"line":206},[33,145763,108760],{"class":238},[33,145765,242],{"class":163},[33,145767,8309],{"class":167},[33,145769,108767],{"class":54},[33,145771,8935],{"class":167},[33,145773,145774,145776,145778,145780],{"class":35,"line":224},[33,145775,111493],{"class":238},[33,145777,242],{"class":163},[33,145779,17356],{"class":54},[33,145781,247],{"class":167},[33,145783,145784],{"class":35,"line":229},[33,145785,1202],{"class":167},[33,145787,145788],{"class":35,"line":235},[33,145789,92],{"emptyLinePlaceholder":91},[33,145791,145792,145794,145796,145798,145800,145802],{"class":35,"line":250},[33,145793,4118],{"class":167},[33,145795,242],{"class":163},[33,145797,141919],{"class":50},[33,145799,39025],{"class":163},[33,145801,4129],{"class":50},[33,145803,4132],{"class":167},[33,145805,145806,145808,145810,145813,145815,145817,145819,145821,145823],{"class":35,"line":266},[33,145807,9228],{"class":163},[33,145809,620],{"class":163},[33,145811,145812],{"class":167}," missing, 
",[33,145814,4059],{"class":163},[33,145816,4152],{"class":54},[33,145818,1115],{"class":50},[33,145820,4157],{"class":167},[33,145822,1121],{"class":50},[33,145824,7504],{"class":54},[33,145826,145827,145829,145831,145833,145835,145837,145839,145841],{"class":35,"line":290},[33,145828,9228],{"class":163},[33,145830,9516],{"class":167},[33,145832,748],{"class":50},[33,145834,763],{"class":167},[33,145836,6009],{"class":163},[33,145838,10791],{"class":50},[33,145840,365],{"class":167},[33,145842,141966],{"class":54},[33,145844,145845,145847,145849,145851,145854,145856,145858,145860,145862,145864,145866],{"class":35,"line":295},[33,145846,9228],{"class":163},[33,145848,7935],{"class":167},[33,145850,4106],{"class":54},[33,145852,145853],{"class":167},"].dtype ",[33,145855,662],{"class":163},[33,145857,17583],{"class":167},[33,145859,1720],{"class":50},[33,145861,365],{"class":167},[33,145863,114044],{"class":54},[33,145865,18525],{"class":167},[33,145867,145868],{"class":54},"\"amount not numeric\"\n",[33,145870,145871,145873,145875,145877,145879],{"class":35,"line":300},[33,145872,9228],{"class":163},[33,145874,141989],{"class":167},[33,145876,108767],{"class":54},[33,145878,54320],{"class":167},[33,145880,145881],{"class":54},"\"order_date not parsed\"\n",[33,145883,145884,145886,145888,145890,145892,145894],{"class":35,"line":317},[33,145885,9228],{"class":163},[33,145887,620],{"class":163},[33,145889,7935],{"class":167},[33,145891,108849],{"class":54},[33,145893,141979],{"class":167},[33,145895,145896],{"class":54},"\"Null order_ids 
present\"\n",[33,145898,145899],{"class":35,"line":332},[33,145900,92],{"emptyLinePlaceholder":91},[33,145902,145903,145905,145907,145909,145911,145913,145915,145917,145919,145921,145923],{"class":35,"line":347},[33,145904,7268],{"class":50},[33,145906,602],{"class":167},[33,145908,4059],{"class":163},[33,145910,124023],{"class":54},[33,145912,1115],{"class":50},[33,145914,9541],{"class":167},[33,145916,748],{"class":50},[33,145918,9546],{"class":167},[33,145920,1121],{"class":50},[33,145922,65937],{"class":54},[33,145924,221],{"class":167},[33,145926,145927,145929,145931,145933],{"class":35,"line":374},[33,145928,35726],{"class":163},[33,145930,9445],{"class":50},[33,145932,1852],{"class":163},[33,145934,7583],{"class":167},[33,145936,145937,145939,145941,145943,145945,145947,145949,145951,145953,145955],{"class":35,"line":397},[33,145938,35742],{"class":163},[33,145940,16617],{"class":50},[33,145942,602],{"class":167},[33,145944,4059],{"class":163},[33,145946,124100],{"class":54},[33,145948,1115],{"class":50},[33,145950,7602],{"class":167},[33,145952,1121],{"class":50},[33,145954,274],{"class":54},[33,145956,221],{"class":167},[33,145958,145959,145961,145963,145965],{"class":35,"line":653},[33,145960,35726],{"class":163},[33,145962,783],{"class":50},[33,145964,1852],{"class":163},[33,145966,7583],{"class":167},[33,145968,145969,145971,145973,145975,145977,145979,145981,145983,145985,145987],{"class":35,"line":667},[33,145970,35742],{"class":163},[33,145972,16617],{"class":50},[33,145974,602],{"class":167},[33,145976,4059],{"class":163},[33,145978,142085],{"class":54},[33,145980,1115],{"class":50},[33,145982,7602],{"class":167},[33,145984,1121],{"class":50},[33,145986,274],{"class":54},[33,145988,221],{"class":167},[18,145990,21810],{"id":21809},[4273,145992,145993,146002],{},[4276,145994,145995],{},[4279,145996,145997,146000],{},[4282,145998,145999],{},"File 
size",[4282,146001,120841],{},[4292,146003,146004,146014,146027,146041],{},[4279,146005,146006,146009],{},[4297,146007,146008],{},"\u003C 10 MB",[4297,146010,146011],{},[30,146012,146013],{},"pd.read_excel(..., engine=\"openpyxl\")",[4279,146015,146016,146019],{},[4297,146017,146018],{},"10–100 MB",[4297,146020,4358,146021,146023,146024,146026],{},[30,146022,21904],{}," and explicit ",[30,146025,23262],{}," to cut allocations",[4279,146028,146029,146032],{},[4297,146030,146031],{},"> 100 MB",[4297,146033,146034,8877,146036,146038,146039],{},[30,146035,22009],{},[30,146037,97854],{}," streaming, or ",[30,146040,142724],{},[4279,146042,146043,146047],{},[4297,146044,41801,146045],{},[30,146046,142704],{},[4297,146048,146049,146051,146052,146054],{},[30,146050,142724],{}," only — ",[30,146053,22009],{}," cannot open this format",[14,146056,146057,146059,146060,146062],{},[30,146058,57240],{}," does not support chunked reading. For very large workbooks, split by sheet (",[30,146061,126093],{},") and process each sheet separately, or convert to CSV\u002FParquet first if you control the source.",[14,146064,146065,8877,146067,146069,146070,146072],{},[30,146066,22009],{},[30,146068,97854],{}," is the memory-efficient option for ",[30,146071,26542],{}," files above 100 MB. 
It streams rows as an iterator and does not hold the full parsed workbook in RAM.",[18,146074,4271],{"id":4270},[4273,146076,146077,146087],{},[4276,146078,146079],{},[4279,146080,146081,146083,146085],{},[4282,146082,14317],{},[4282,146084,4287],{},[4282,146086,4290],{},[4292,146088,146089,146108,146121,146135,146149,146167,146183],{},[4279,146090,146091,146095,146102],{},[4297,146092,146093],{},[30,146094,138312],{},[4297,146096,146097,146099,146100],{},[30,146098,125595],{}," ≥ 2.0 used for ",[30,146101,26542],{},[4297,146103,14408,146104,10073,146106],{},[30,146105,22395],{},[940,146107,126410],{"href":126409},[4279,146109,146110,146114,146117],{},[4297,146111,146112],{},[30,146113,142123],{},[4297,146115,146116],{},"Engine not installed",[4297,146118,146119],{},[30,146120,26548],{},[4279,146122,146123,146127,146130],{},[4297,146124,146125],{},[30,146126,142158],{},[4297,146128,146129],{},"Sheet name has spaces or wrong case",[4297,146131,133514,146132,146134],{},[30,146133,142166],{}," to confirm exact name",[4279,146136,146137,146141,146146],{},[4297,146138,146139],{},[30,146140,139407],{},[4297,146142,146143,146144],{},"File is corrupted or mislabeled as ",[30,146145,26542],{},[4297,146147,146148],{},"Check first 4 bytes to confirm format; pass correct engine",[4279,146150,146151,146157,146159],{},[4297,146152,142172,146153,365,146155],{},[30,146154,117994],{},[30,146156,142177],{},[4297,146158,142180],{},[4297,146160,4358,146161,146163,146164,146166],{},[30,146162,142185],{}," to skip intro rows; preview with ",[30,146165,142189],{}," first",[4279,146168,146169,146174,146179],{},[4297,146170,146171,146172],{},"Numeric IDs display as ",[30,146173,142197],{},[4297,146175,142200,146176,146178],{},[30,146177,102445],{}," from int-looking cells",[4297,146180,4358,146181],{},[30,146182,142207],{},[4279,146184,146185,146189,146194],{},[4297,146186,146187],{},[30,146188,142214],{},[4297,146190,146191,146193],{},[30,146192,126099],{}," removed all rows including the 
header",[4297,146195,17059,146196,146198,146199],{},[30,146197,140755],{}," to diagnose row layout before setting ",[30,146200,126099],{},[18,146202,4402],{"id":4401},[23,146204,146206],{"className":126,"code":146205,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\n\"\"\"read_excel_all_sheets.py — load, validate, and export every sheet to CSV.\"\"\"\nimport argparse\nfrom pathlib import Path\nimport pandas as pd\n\n\ndef load_workbook(path: Path, engine: str = \"openpyxl\") -> dict[str, pd.DataFrame]:\n    try:\n        return pd.read_excel(path, sheet_name=None, engine=engine)\n    except FileNotFoundError:\n        raise SystemExit(f\"File not found: {path}\")\n    except ImportError as e:\n        raise SystemExit(f\"Engine not installed — pip install {engine}: {e}\")\n    except Exception as e:\n        raise SystemExit(f\"Cannot open workbook: {e}\")\n\n\ndef clean_sheet(df: pd.DataFrame, name: str) -> pd.DataFrame:\n    df = df.dropna(how=\"all\").reset_index(drop=True)\n    df.columns = df.columns.astype(str).str.strip()\n    if df.empty:\n        print(f\"  WARN: sheet '{name}' has no data after cleaning\")\n    return df\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Export all Excel sheets to CSV\")\n    parser.add_argument(\"file\", type=Path, help=\"Path to .xlsx file\")\n    parser.add_argument(\n        \"--engine\",\n        default=\"openpyxl\",\n        choices=[\"openpyxl\", \"calamine\", \"xlrd\"],\n        help=\"Parsing engine\",\n    )\n    parser.add_argument(\n        \"--out-dir\",\n        type=Path,\n        default=Path(\".\"),\n        help=\"Output directory for CSV files\",\n    )\n    args = parser.parse_args()\n\n    args.out_dir.mkdir(parents=True, exist_ok=True)\n    sheets = load_workbook(args.file, engine=args.engine)\n\n    for name, df in sheets.items():\n        df = clean_sheet(df, name)\n        out = args.out_dir \u002F f\"{name}.csv\"\n        df.to_csv(out, index=False)\n  
      print(f\"  {name}: {df.shape[0]} rows → {out}\")\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,146207,146208,146212,146217,146223,146233,146243,146247,146251,146274,146280,146301,146309,146331,146341,146372,146382,146404,146408,146412,146426,146450,146463,146469,146491,146497,146501,146505,146517,146534,146558,146562,146569,146579,146600,146611,146615,146619,146626,146636,146648,146658,146662,146670,146674,146695,146712,146716,146726,146735,146758,146770,146810,146814,146818,146830],{"__ignoreMap":28},[33,146209,146210],{"class":35,"line":36},[33,146211,3952],{"class":39},[33,146213,146214],{"class":35,"line":43},[33,146215,146216],{"class":54},"\"\"\"read_excel_all_sheets.py — load, validate, and export every sheet to CSV.\"\"\"\n",[33,146218,146219,146221],{"class":35,"line":61},[33,146220,164],{"class":163},[33,146222,4461],{"class":167},[33,146224,146225,146227,146229,146231],{"class":35,"line":73},[33,146226,190],{"class":163},[33,146228,193],{"class":167},[33,146230,164],{"class":163},[33,146232,198],{"class":167},[33,146234,146235,146237,146239,146241],{"class":35,"line":88},[33,146236,164],{"class":163},[33,146238,492],{"class":167},[33,146240,495],{"class":163},[33,146242,498],{"class":167},[33,146244,146245],{"class":35,"line":95},[33,146246,92],{"emptyLinePlaceholder":91},[33,146248,146249],{"class":35,"line":101},[33,146250,92],{"emptyLinePlaceholder":91},[33,146252,146253,146255,146258,146261,146263,146265,146268,146270,146272],{"class":35,"line":171},[33,146254,562],{"class":163},[33,146256,146257],{"class":46}," load_workbook",[33,146259,146260],{"class":167},"(path: Path, engine: ",[33,146262,1053],{"class":50},[33,146264,212],{"class":163},[33,146266,146267],{"class":54}," 
\"openpyxl\"",[33,146269,84928],{"class":167},[33,146271,1053],{"class":50},[33,146273,14062],{"class":167},[33,146275,146276,146278],{"class":35,"line":179},[33,146277,2424],{"class":163},[33,146279,574],{"class":167},[33,146281,146282,146284,146286,146288,146290,146292,146294,146296,146298],{"class":35,"line":187},[33,146283,1659],{"class":163},[33,146285,27389],{"class":167},[33,146287,17371],{"class":238},[33,146289,242],{"class":163},[33,146291,571],{"class":50},[33,146293,365],{"class":167},[33,146295,17351],{"class":238},[33,146297,242],{"class":163},[33,146299,146300],{"class":167},"engine)\n",[33,146302,146303,146305,146307],{"class":35,"line":201},[33,146304,2449],{"class":163},[33,146306,2945],{"class":50},[33,146308,574],{"class":167},[33,146310,146311,146313,146315,146317,146319,146321,146323,146325,146327,146329],{"class":35,"line":206},[33,146312,4051],{"class":163},[33,146314,16617],{"class":50},[33,146316,602],{"class":167},[33,146318,4059],{"class":163},[33,146320,15677],{"class":54},[33,146322,1115],{"class":50},[33,146324,2580],{"class":167},[33,146326,1121],{"class":50},[33,146328,274],{"class":54},[33,146330,221],{"class":167},[33,146332,146333,146335,146337,146339],{"class":35,"line":224},[33,146334,2449],{"class":163},[33,146336,40488],{"class":50},[33,146338,1852],{"class":163},[33,146340,7583],{"class":167},[33,146342,146343,146345,146347,146349,146351,146354,146356,146358,146360,146362,146364,146366,146368,146370],{"class":35,"line":229},[33,146344,4051],{"class":163},[33,146346,16617],{"class":50},[33,146348,602],{"class":167},[33,146350,4059],{"class":163},[33,146352,146353],{"class":54},"\"Engine not installed — pip install 
",[33,146355,1115],{"class":50},[33,146357,17351],{"class":167},[33,146359,1121],{"class":50},[33,146361,2079],{"class":54},[33,146363,1115],{"class":50},[33,146365,7602],{"class":167},[33,146367,1121],{"class":50},[33,146369,274],{"class":54},[33,146371,221],{"class":167},[33,146373,146374,146376,146378,146380],{"class":35,"line":235},[33,146375,2449],{"class":163},[33,146377,783],{"class":50},[33,146379,1852],{"class":163},[33,146381,7583],{"class":167},[33,146383,146384,146386,146388,146390,146392,146394,146396,146398,146400,146402],{"class":35,"line":250},[33,146385,4051],{"class":163},[33,146387,16617],{"class":50},[33,146389,602],{"class":167},[33,146391,4059],{"class":163},[33,146393,143049],{"class":54},[33,146395,1115],{"class":50},[33,146397,7602],{"class":167},[33,146399,1121],{"class":50},[33,146401,274],{"class":54},[33,146403,221],{"class":167},[33,146405,146406],{"class":35,"line":266},[33,146407,92],{"emptyLinePlaceholder":91},[33,146409,146410],{"class":35,"line":290},[33,146411,92],{"emptyLinePlaceholder":91},[33,146413,146414,146416,146419,146422,146424],{"class":35,"line":295},[33,146415,562],{"class":163},[33,146417,146418],{"class":46}," clean_sheet",[33,146420,146421],{"class":167},"(df: pd.DataFrame, name: ",[33,146423,1053],{"class":50},[33,146425,7668],{"class":167},[33,146427,146428,146430,146432,146434,146436,146438,146440,146442,146444,146446,146448],{"class":35,"line":300},[33,146429,4025],{"class":167},[33,146431,242],{"class":163},[33,146433,114425],{"class":167},[33,146435,28045],{"class":238},[33,146437,242],{"class":163},[33,146439,35616],{"class":54},[33,146441,28085],{"class":167},[33,146443,10868],{"class":238},[33,146445,242],{"class":163},[33,146447,855],{"class":50},[33,146449,221],{"class":167},[33,146451,146452,146454,146456,146459,146461],{"class":35,"line":317},[33,146453,27546],{"class":167},[33,146455,242],{"class":163},[33,146457,146458],{"class":167}," 
df.columns.astype(",[33,146460,1053],{"class":50},[33,146462,11965],{"class":167},[33,146464,146465,146467],{"class":35,"line":332},[33,146466,617],{"class":163},[33,146468,27514],{"class":167},[33,146470,146471,146473,146475,146477,146480,146482,146484,146486,146489],{"class":35,"line":347},[33,146472,9414],{"class":50},[33,146474,602],{"class":167},[33,146476,4059],{"class":163},[33,146478,146479],{"class":54},"\"  WARN: sheet '",[33,146481,1115],{"class":50},[33,146483,1118],{"class":167},[33,146485,1121],{"class":50},[33,146487,146488],{"class":54},"' has no data after cleaning\"",[33,146490,221],{"class":167},[33,146492,146493,146495],{"class":35,"line":374},[33,146494,1332],{"class":163},[33,146496,11719],{"class":167},[33,146498,146499],{"class":35,"line":397},[33,146500,92],{"emptyLinePlaceholder":91},[33,146502,146503],{"class":35,"line":653},[33,146504,92],{"emptyLinePlaceholder":91},[33,146506,146507,146509,146511,146513,146515],{"class":35,"line":667},[33,146508,562],{"class":163},[33,146510,6636],{"class":46},[33,146512,568],{"class":167},[33,146514,571],{"class":50},[33,146516,574],{"class":167},[33,146518,146519,146521,146523,146525,146527,146529,146532],{"class":35,"line":675},[33,146520,6648],{"class":167},[33,146522,242],{"class":163},[33,146524,6653],{"class":167},[33,146526,6656],{"class":238},[33,146528,242],{"class":163},[33,146530,146531],{"class":54},"\"Export all Excel sheets to CSV\"",[33,146533,221],{"class":167},[33,146535,146536,146538,146541,146543,146545,146547,146549,146551,146553,146556],{"class":35,"line":689},[33,146537,6669],{"class":167},[33,146539,146540],{"class":54},"\"file\"",[33,146542,365],{"class":167},[33,146544,6677],{"class":238},[33,146546,242],{"class":163},[33,146548,6682],{"class":167},[33,146550,25463],{"class":238},[33,146552,242],{"class":163},[33,146554,146555],{"class":54},"\"Path to .xlsx 
file\"",[33,146557,221],{"class":167},[33,146559,146560],{"class":35,"line":703},[33,146561,6721],{"class":167},[33,146563,146564,146567],{"class":35,"line":714},[33,146565,146566],{"class":54},"        \"--engine\"",[33,146568,247],{"class":167},[33,146570,146571,146573,146575,146577],{"class":35,"line":723},[33,146572,53342],{"class":238},[33,146574,242],{"class":163},[33,146576,17356],{"class":54},[33,146578,247],{"class":167},[33,146580,146581,146584,146586,146588,146590,146592,146594,146596,146598],{"class":35,"line":754},[33,146582,146583],{"class":238},"        choices",[33,146585,242],{"class":163},[33,146587,8309],{"class":167},[33,146589,17356],{"class":54},[33,146591,365],{"class":167},[33,146593,139319],{"class":54},[33,146595,365],{"class":167},[33,146597,138380],{"class":54},[33,146599,8935],{"class":167},[33,146601,146602,146604,146606,146609],{"class":35,"line":771},[33,146603,6748],{"class":238},[33,146605,242],{"class":163},[33,146607,146608],{"class":54},"\"Parsing engine\"",[33,146610,247],{"class":167},[33,146612,146613],{"class":35,"line":777},[33,146614,1202],{"class":167},[33,146616,146617],{"class":35,"line":788},[33,146618,6721],{"class":167},[33,146620,146621,146624],{"class":35,"line":804},[33,146622,146623],{"class":54},"        \"--out-dir\"",[33,146625,247],{"class":167},[33,146627,146628,146631,146633],{"class":35,"line":809},[33,146629,146630],{"class":238},"        
type",[33,146632,242],{"class":163},[33,146634,146635],{"class":167},"Path,\n",[33,146637,146638,146640,146642,146644,146646],{"class":35,"line":819},[33,146639,53342],{"class":238},[33,146641,242],{"class":163},[33,146643,15641],{"class":167},[33,146645,134409],{"class":54},[33,146647,1506],{"class":167},[33,146649,146650,146652,146654,146656],{"class":35,"line":829},[33,146651,6748],{"class":238},[33,146653,242],{"class":163},[33,146655,53393],{"class":54},[33,146657,247],{"class":167},[33,146659,146660],{"class":35,"line":834},[33,146661,1202],{"class":167},[33,146663,146664,146666,146668],{"class":35,"line":839},[33,146665,6766],{"class":167},[33,146667,242],{"class":163},[33,146669,6771],{"class":167},[33,146671,146672],{"class":35,"line":860},[33,146673,92],{"emptyLinePlaceholder":91},[33,146675,146676,146679,146681,146683,146685,146687,146689,146691,146693],{"class":35,"line":887},[33,146677,146678],{"class":167},"    args.out_dir.mkdir(",[33,146680,869],{"class":238},[33,146682,242],{"class":163},[33,146684,855],{"class":50},[33,146686,365],{"class":167},[33,146688,878],{"class":238},[33,146690,242],{"class":163},[33,146692,855],{"class":50},[33,146694,221],{"class":167},[33,146696,146697,146700,146702,146705,146707,146709],{"class":35,"line":907},[33,146698,146699],{"class":167},"    sheets ",[33,146701,242],{"class":163},[33,146703,146704],{"class":167}," load_workbook(args.file, ",[33,146706,17351],{"class":238},[33,146708,242],{"class":163},[33,146710,146711],{"class":167},"args.engine)\n",[33,146713,146714],{"class":35,"line":1826},[33,146715,92],{"emptyLinePlaceholder":91},[33,146717,146718,146720,146722,146724],{"class":35,"line":1844},[33,146719,656],{"class":163},[33,146721,14253],{"class":167},[33,146723,662],{"class":163},[33,146725,135974],{"class":167},[33,146727,146728,146730,146732],{"class":35,"line":1858},[33,146729,7930],{"class":167},[33,146731,242],{"class":163},[33,146733,146734],{"class":167}," clean_sheet(df, 
name)\n",[33,146736,146737,146739,146741,146744,146746,146748,146750,146752,146754,146756],{"class":35,"line":1871},[33,146738,50344],{"class":167},[33,146740,242],{"class":163},[33,146742,146743],{"class":167}," args.out_dir ",[33,146745,1351],{"class":163},[33,146747,1110],{"class":163},[33,146749,274],{"class":54},[33,146751,1115],{"class":50},[33,146753,1118],{"class":167},[33,146755,1121],{"class":50},[33,146757,40176],{"class":54},[33,146759,146760,146762,146764,146766,146768],{"class":35,"line":1877},[33,146761,50371],{"class":167},[33,146763,897],{"class":238},[33,146765,242],{"class":163},[33,146767,902],{"class":50},[33,146769,221],{"class":167},[33,146771,146772,146774,146776,146778,146780,146782,146784,146786,146788,146790,146792,146794,146796,146798,146800,146802,146804,146806,146808],{"class":35,"line":1883},[33,146773,9414],{"class":50},[33,146775,602],{"class":167},[33,146777,4059],{"class":163},[33,146779,48959],{"class":54},[33,146781,1115],{"class":50},[33,146783,1118],{"class":167},[33,146785,1121],{"class":50},[33,146787,2079],{"class":54},[33,146789,1115],{"class":50},[33,146791,9541],{"class":167},[33,146793,748],{"class":50},[33,146795,9546],{"class":167},[33,146797,1121],{"class":50},[33,146799,121221],{"class":54},[33,146801,1115],{"class":50},[33,146803,18014],{"class":167},[33,146805,1121],{"class":50},[33,146807,274],{"class":54},[33,146809,221],{"class":167},[33,146811,146812],{"class":35,"line":1915},[33,146813,92],{"emptyLinePlaceholder":91},[33,146815,146816],{"class":35,"line":1926},[33,146817,92],{"emptyLinePlaceholder":91},[33,146819,146820,146822,146824,146826,146828],{"class":35,"line":1932},[33,146821,2491],{"class":163},[33,146823,2494],{"class":50},[33,146825,2497],{"class":163},[33,146827,2500],{"class":54},[33,146829,574],{"class":167},[33,146831,146832],{"class":35,"line":1938},[33,146833,6914],{"class":167},[14,146835,146836],{},"Run it 
as:",[23,146838,146840],{"className":25,"code":146839,"language":27,"meta":28,"style":28},"python read_excel_all_sheets.py data\u002Fannual_report.xlsx --out-dir output\u002F\n",[30,146841,146842],{"__ignoreMap":28},[33,146843,146844,146846,146849,146852,146855],{"class":35,"line":36},[33,146845,47],{"class":46},[33,146847,146848],{"class":54}," read_excel_all_sheets.py",[33,146850,146851],{"class":54}," data\u002Fannual_report.xlsx",[33,146853,146854],{"class":50}," --out-dir",[33,146856,146857],{"class":54}," output\u002F\n",[18,146859,36626],{"id":36625},[14,146861,146862,146865,146866,146869,146870,146872,146873,146875],{},[1974,146863,146864],{},"Can Python read password-protected Excel files?","\nNot directly. Use ",[30,146867,146868],{},"msoffcrypto-tool"," to decrypt the file into a ",[30,146871,61504],{}," buffer, then pass the buffer to ",[30,146874,128158],{},". The decryption step requires the password as a string.",[14,146877,146878,146886,146887,146889,146890,146893,146894,146897],{},[1974,146879,36631,146880,146882,146883,146885],{},[30,146881,57240],{}," return ",[30,146884,8884],{}," for empty cells instead of blank strings?","\nPandas uses ",[30,146888,8884],{}," as its universal missing value indicator. Pass ",[30,146891,146892],{},"keep_default_na=False"," to suppress this, or call ",[30,146895,146896],{},"df.fillna(\"\")"," after loading if downstream code needs empty strings.",[14,146899,146900,146908,146910,146911,146913],{},[1974,146901,146902,146903,2012,146905,146907],{},"Is ",[30,146904,22009],{},[30,146906,139236],{}," faster for large read-only workbooks?",[30,146909,139236],{}," is typically 2–5x faster for pure data reads because it skips formula parsing and cell style processing. 
Use ",[30,146912,22009],{}," when you need formatting metadata, formula strings, or cell comments.",[14,146915,146916,146919,146920,146922,146923,146925,146926,146929],{},[1974,146917,146918],{},"How do I read an Excel file from a URL or S3?","\nDownload the file bytes into a ",[30,146921,61504],{}," buffer and pass the buffer to ",[30,146924,128158],{},". See the \"Reading from a BytesIO Buffer\" section above for an HTTP example. For S3, use ",[30,146927,146928],{},"boto3.client(\"s3\").get_object(...)[\"Body\"].read()"," to get the bytes.",[14,146931,146932,146938,146940,146941,146943,146944,146947],{},[1974,146933,142297,146934,10065,146936,36637],{},[30,146935,126099],{},[30,146937,44427],{},[30,146939,126099],{}," removes rows from the file before pandas processes it. ",[30,146942,44427],{}," tells pandas which remaining row (by index) contains column names. They work together: ",[30,146945,146946],{},"skiprows=3, header=0"," means \"discard the first 3 rows, then treat the very next row as the header.\"",[14,146949,146950,146953,146955,146956,8877,146958,1351,146960,8363,146962,146965],{},[1974,146951,146952],{},"How do I read only specific rows from a large workbook?",[30,146954,128158],{}," does not support row-range reads like CSV chunking. 
To read a window of rows, use ",[30,146957,22009],{},[30,146959,17642],{},[30,146961,97398],{},[30,146963,146964],{},"ws.iter_rows()",", or load the full sheet and slice the DataFrame afterward.",[18,146967,6918],{"id":6917},[4211,146969,146970,146980,146987,146992,146996],{},[4214,146971,146972,146974,146975,10065,146977,146979],{},[940,146973,126410],{"href":126409}," — exact fix for ",[30,146976,128863],{},[30,146978,42237],{}," on xlsx",[4214,146981,146982,146984,146985,88513],{},[940,146983,139758],{"href":139757}," — beginner walkthrough of each ",[30,146986,57240],{},[4214,146988,146989,146991],{},[940,146990,6936],{"href":6935}," — build on loaded data to produce formatted output workbooks",[4214,146993,146994,142361],{},[940,146995,28119],{"href":28118},[4214,146997,146998,147000],{},[940,146999,9599],{"href":9598}," — cleaning patterns that apply equally to Excel-sourced DataFrames",[14,147002,6947,147003,3035],{},[940,147004,26258],{"href":26257},[6953,147006,26204],{},{"title":28,"searchDepth":43,"depth":43,"links":147008},[147009,147010,147011,147012,147014,147016,147018,147020,147022,147023,147024,147030,147031,147032,147033,147034,147035],{"id":20,"depth":43,"text":21},{"id":142654,"depth":43,"text":142655},{"id":142863,"depth":43,"text":142864},{"id":143078,"depth":43,"text":147013},"Step 2: Basic read_excel Call",{"id":143263,"depth":43,"text":147015},"Step 3: sheet_name — Targeting Sheets",{"id":143599,"depth":43,"text":147017},"Step 4: header and skiprows",{"id":143780,"depth":43,"text":147019},"Step 5: usecols — Limiting Column Scope",{"id":144063,"depth":43,"text":147021},"Step 6: dtype and 
parse_dates",{"id":144365,"depth":43,"text":144366},{"id":144628,"depth":43,"text":144629},{"id":2708,"depth":43,"text":2709,"children":147025},[147026,147027,147028,147029],{"id":144891,"depth":61,"text":144892},{"id":145040,"depth":61,"text":145041},{"id":145241,"depth":61,"text":145242},{"id":145426,"depth":61,"text":145427},{"id":52029,"depth":43,"text":52030},{"id":21809,"depth":43,"text":21810},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":36625,"depth":43,"text":36626},{"id":6917,"depth":43,"text":6918},"Reading Excel Files","Load .xlsx and .xls workbooks into pandas DataFrames. Covers engine selection, sheet parsing, usecols, dtype, parse_dates, openpyxl direct access, and error handling.",{},"\u002Fpython-for-excel-csv-data-processing\u002Freading-excel-files-with-python",{"title":99577,"description":147037},"python-for-excel-csv-data-processing\u002Freading-excel-files-with-python\u002Findex",[99614,47,22009,9630],"7vEy2z8lp6asS4sHtat6tuy2WQws-PDcLO4-7jmhgKc",{"id":147045,"title":147046,"body":147047,"breadcrumbTitle":149532,"canonical":6977,"date":6978,"description":149533,"draft":6980,"extension":6981,"image":6977,"meta":149534,"navigation":91,"path":149535,"robots":6977,"seo":149536,"seoTitle":149537,"stem":149538,"tags":149539,"updatedAt":6978,"__hash__":149541},"content\u002Fpython-for-excel-csv-data-processing\u002Fwriting-excel-formulas-and-charts-with-openpyxl\u002Ffix-openpyxl-formulas-not-calculating\u002Findex.md","Fix openpyxl Formulas Showing as 
Blank",{"type":7,"value":147048,"toc":149515},[147049,147052,147068,147070,147072,147090,147109,147124,147128,147138,147165,147168,147191,147203,147207,147256,147258,147261,147264,147500,147510,147512,147516,147522,147742,147745,147747,147751,147754,148266,148272,148274,148278,148284,148643,148652,148654,148661,148666,148833,148836,148838,148845,148851,149027,149032,149034,149038,149044,149049,149065,149073,149223,149241,149247,149249,149251,149254,149476,149487,149489,149491,149508,149512],[10,147050,147046],{"id":147051},"fix-openpyxl-formulas-showing-as-blank",[14,147053,147054,147055,147058,147059,4348,147062,147064,147065,147067],{},"You write ",[30,147056,147057],{},"ws[\"B8\"].value = \"=SUM(B2:B7)\""," with openpyxl, save the file, then read it back — and ",[30,147060,147061],{},"ws[\"B8\"].value",[30,147063,571],{},". Or the cell appears blank when you open the ",[30,147066,26542],{}," in another tool that does not recalculate on open. This is the most common openpyxl confusion: openpyxl writes formulas as strings and has no calculation engine.",[2537,147069],{},[18,147071,4287],{"id":7020},[14,147073,147074,147075,147078,147079,147081,147082,147085,147086,147089],{},"openpyxl is a read\u002Fwrite library, not a spreadsheet engine. When you assign ",[30,147076,147077],{},"cell.value = \"=SUM(B2:B7)\"",", openpyxl stores the formula string in the XML. It does ",[1974,147080,7999],{}," evaluate ",[30,147083,147084],{},"=SUM(B2:B7)"," and does not store a cached result (",[30,147087,147088],{},"\u003Cv>"," element in the XML).",[14,147091,147092,147093,147095,147096,147099,147100,147102,147103,147105,147106,147108],{},"Excel's ",[30,147094,26542],{}," format has two separate fields per cell: the formula (",[30,147097,147098],{},"\u003Cf>",") and the last-computed value (",[30,147101,147088],{},"). When a file is opened in Excel, Excel fills ",[30,147104,147088],{}," and saves it. 
Until that happens, ",[30,147107,147088],{}," is absent or stale.",[14,147110,147111,147112,147114,147115,147117,147118,147120,147121,147123],{},"openpyxl's ",[30,147113,105730],{}," mode reads ",[30,147116,147088],{}," only — so it returns ",[30,147119,571],{}," for any cell whose ",[30,147122,147088],{}," has never been written.",[424,147125,147127],{"id":147126},"what-the-xml-looks-like","What the XML looks like",[14,147129,147130,147131,147134,147135,147137],{},"Inside ",[30,147132,147133],{},"xl\u002Fworksheets\u002Fsheet1.xml"," (you can inspect it by unzipping the ",[30,147136,26542],{},"), a formula cell written by openpyxl looks like:",[23,147139,147143],{"className":147140,"code":147141,"language":147142,"meta":28,"style":28},"language-xml shiki shiki-themes github-light","\u003Cc r=\"B8\" t=\"str\">\n  \u003Cf>SUM(B2:B7)\u003C\u002Ff>\n  \u003C!-- no \u003Cv> element — openpyxl never writes it -->\n\u003C\u002Fc>\n","xml",[30,147144,147145,147150,147155,147160],{"__ignoreMap":28},[33,147146,147147],{"class":35,"line":36},[33,147148,147149],{},"\u003Cc r=\"B8\" t=\"str\">\n",[33,147151,147152],{"class":35,"line":43},[33,147153,147154],{},"  \u003Cf>SUM(B2:B7)\u003C\u002Ff>\n",[33,147156,147157],{"class":35,"line":61},[33,147158,147159],{},"  \u003C!-- no \u003Cv> element — openpyxl never writes it -->\n",[33,147161,147162],{"class":35,"line":73},[33,147163,147164],{},"\u003C\u002Fc>\n",[14,147166,147167],{},"After Excel opens and saves the file:",[23,147169,147171],{"className":147140,"code":147170,"language":147142,"meta":28,"style":28},"\u003Cc r=\"B8\">\n  \u003Cf>SUM(B2:B7)\u003C\u002Ff>\n  \u003Cv>99000\u003C\u002Fv>   \u003C!-- Excel computed and cached this -->\n\u003C\u002Fc>\n",[30,147172,147173,147178,147182,147187],{"__ignoreMap":28},[33,147174,147175],{"class":35,"line":36},[33,147176,147177],{},"\u003Cc 
r=\"B8\">\n",[33,147179,147180],{"class":35,"line":43},[33,147181,147154],{},[33,147183,147184],{"class":35,"line":61},[33,147185,147186],{},"  \u003Cv>99000\u003C\u002Fv>   \u003C!-- Excel computed and cached this -->\n",[33,147188,147189],{"class":35,"line":73},[33,147190,147164],{},[14,147192,147193,147195,147196,34992,147198,147200,147201,3035],{},[30,147194,105730],{}," maps to reading ",[30,147197,147088],{},[30,147199,147088],{}," is absent, openpyxl returns ",[30,147202,571],{},[424,147204,147206],{"id":147205},"choosing-the-right-fix","Choosing the right fix",[4273,147208,147209,147218],{},[4276,147210,147211],{},[4279,147212,147213,147215],{},[4282,147214,120838],{},[4282,147216,147217],{},"Best fix",[4292,147219,147220,147228,147236,147246],{},[4279,147221,147222,147225],{},[4297,147223,147224],{},"CI pipeline, no Excel\u002FLibreOffice available",[4297,147226,147227],{},"Fix 2: compute in Python, write literals",[4279,147229,147230,147233],{},[4297,147231,147232],{},"Need live formulas in the file for end-users",[4297,147234,147235],{},"Fix 3: LibreOffice headless on Linux, or Fix 4: xlwings on Windows\u002FmacOS",[4279,147237,147238,147241],{},[4297,147239,147240],{},"Just auditing or rewriting formula text",[4297,147242,147243,147244],{},"Fix 1: ",[30,147245,107326],{},[4279,147247,147248,147251],{},[4297,147249,147250],{},"Human workflow — file opened manually in Excel",[4297,147252,81248,147253,147255],{},[30,147254,105730],{}," after Excel save",[2537,147257],{},[18,147259,147260],{"id":35016},"Minimal diagnostic",[14,147262,147263],{},"Confirm the root cause before applying a fix:",[23,147265,147267],{"className":126,"code":147266,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"report.xlsx\")\n\ntry:\n    # Read formula string (data_only=False is the default)\n    wb_formula = openpyxl.load_workbook(WORKBOOK, data_only=False)\n    formula_val = 
wb_formula[\"Sheet1\"][\"B8\"].value\n    print(f\"data_only=False: {formula_val!r}\")   # e.g. '=SUM(B2:B7)'\n\n    # Read cached value\n    wb_data = openpyxl.load_workbook(WORKBOOK, data_only=True)\n    cached_val = wb_data[\"Sheet1\"][\"B8\"].value\n    print(f\"data_only=True:  {cached_val!r}\")    # None if Excel never opened the file\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n",[30,147268,147269,147273,147283,147289,147293,147305,147309,147315,147320,147341,147362,147389,147393,147398,147419,147437,147464,147474],{"__ignoreMap":28},[33,147270,147271],{"class":35,"line":36},[33,147272,98209],{"class":39},[33,147274,147275,147277,147279,147281],{"class":35,"line":43},[33,147276,190],{"class":163},[33,147278,193],{"class":167},[33,147280,164],{"class":163},[33,147282,198],{"class":167},[33,147284,147285,147287],{"class":35,"line":61},[33,147286,164],{"class":163},[33,147288,95887],{"class":167},[33,147290,147291],{"class":35,"line":73},[33,147292,92],{"emptyLinePlaceholder":91},[33,147294,147295,147297,147299,147301,147303],{"class":35,"line":88},[33,147296,126138],{"class":50},[33,147298,212],{"class":163},[33,147300,215],{"class":167},[33,147302,128434],{"class":54},[33,147304,221],{"class":167},[33,147306,147307],{"class":35,"line":95},[33,147308,92],{"emptyLinePlaceholder":91},[33,147310,147311,147313],{"class":35,"line":101},[33,147312,35574],{"class":163},[33,147314,574],{"class":167},[33,147316,147317],{"class":35,"line":171},[33,147318,147319],{"class":39},"    # Read formula string (data_only=False is the default)\n",[33,147321,147322,147325,147327,147329,147331,147333,147335,147337,147339],{"class":35,"line":179},[33,147323,147324],{"class":167},"    wb_formula 
",[33,147326,242],{"class":163},[33,147328,144711],{"class":167},[33,147330,126138],{"class":50},[33,147332,365],{"class":167},[33,147334,144726],{"class":238},[33,147336,242],{"class":163},[33,147338,902],{"class":50},[33,147340,221],{"class":167},[33,147342,147343,147346,147348,147351,147354,147356,147359],{"class":35,"line":187},[33,147344,147345],{"class":167},"    formula_val ",[33,147347,242],{"class":163},[33,147349,147350],{"class":167}," wb_formula[",[33,147352,147353],{"class":54},"\"Sheet1\"",[33,147355,44179],{"class":167},[33,147357,147358],{"class":54},"\"B8\"",[33,147360,147361],{"class":167},"].value\n",[33,147363,147364,147366,147368,147370,147373,147375,147378,147380,147382,147384,147386],{"class":35,"line":201},[33,147365,7268],{"class":50},[33,147367,602],{"class":167},[33,147369,4059],{"class":163},[33,147371,147372],{"class":54},"\"data_only=False: ",[33,147374,1115],{"class":50},[33,147376,147377],{"class":167},"formula_val",[33,147379,76954],{"class":163},[33,147381,1121],{"class":50},[33,147383,274],{"class":54},[33,147385,12000],{"class":167},[33,147387,147388],{"class":39},"# e.g. 
'=SUM(B2:B7)'\n",[33,147390,147391],{"class":35,"line":206},[33,147392,92],{"emptyLinePlaceholder":91},[33,147394,147395],{"class":35,"line":224},[33,147396,147397],{"class":39},"    # Read cached value\n",[33,147399,147400,147403,147405,147407,147409,147411,147413,147415,147417],{"class":35,"line":229},[33,147401,147402],{"class":167},"    wb_data ",[33,147404,242],{"class":163},[33,147406,144711],{"class":167},[33,147408,126138],{"class":50},[33,147410,365],{"class":167},[33,147412,144726],{"class":238},[33,147414,242],{"class":163},[33,147416,855],{"class":50},[33,147418,221],{"class":167},[33,147420,147421,147424,147426,147429,147431,147433,147435],{"class":35,"line":235},[33,147422,147423],{"class":167},"    cached_val ",[33,147425,242],{"class":163},[33,147427,147428],{"class":167}," wb_data[",[33,147430,147353],{"class":54},[33,147432,44179],{"class":167},[33,147434,147358],{"class":54},[33,147436,147361],{"class":167},[33,147438,147439,147441,147443,147445,147448,147450,147453,147455,147457,147459,147461],{"class":35,"line":250},[33,147440,7268],{"class":50},[33,147442,602],{"class":167},[33,147444,4059],{"class":163},[33,147446,147447],{"class":54},"\"data_only=True:  ",[33,147449,1115],{"class":50},[33,147451,147452],{"class":167},"cached_val",[33,147454,76954],{"class":163},[33,147456,1121],{"class":50},[33,147458,274],{"class":54},[33,147460,101057],{"class":167},[33,147462,147463],{"class":39},"# None if Excel never opened the 
file\n",[33,147465,147466,147468,147470,147472],{"class":35,"line":266},[33,147467,35726],{"class":163},[33,147469,2945],{"class":50},[33,147471,1852],{"class":163},[33,147473,1855],{"class":167},[33,147475,147476,147478,147480,147482,147484,147486,147488,147490,147492,147494,147496,147498],{"class":35,"line":290},[33,147477,35742],{"class":163},[33,147479,16617],{"class":50},[33,147481,602],{"class":167},[33,147483,4059],{"class":163},[33,147485,15677],{"class":54},[33,147487,1115],{"class":50},[33,147489,6565],{"class":167},[33,147491,1121],{"class":50},[33,147493,274],{"class":54},[33,147495,1649],{"class":167},[33,147497,190],{"class":163},[33,147499,20843],{"class":167},[14,147501,41963,147502,147504,147505,95600,147507,147509],{},[30,147503,107326],{}," returns the formula string and ",[30,147506,105730],{},[30,147508,571],{},", the file was never opened in Excel (or was created entirely by openpyxl without Excel touching it). That confirms the root cause.",[2537,147511],{},[18,147513,147515],{"id":147514},"fix-1-read-the-formula-not-the-cached-value","Fix 1 — Read the formula, not the cached value",[14,147517,147518,147519,147521],{},"If you only need to inspect or transform the formula text, use ",[30,147520,107326],{}," (the default). 
This is not a workaround — it is the correct mode for formula-aware reads.",[23,147523,147525],{"className":126,"code":147524,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"report.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK, data_only=False)  # default; explicit for clarity\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sheet1\"]\n\nfor row in ws.iter_rows(min_row=2, max_row=ws.max_row):\n    for cell in row:\n        if isinstance(cell.value, str) and cell.value.startswith(\"=\"):\n            print(f\"{cell.coordinate}: formula = {cell.value!r}\")\n",[30,147526,147527,147531,147541,147547,147551,147563,147567,147573,147596,147606,147632,147636,147648,147652,147676,147686,147709],{"__ignoreMap":28},[33,147528,147529],{"class":35,"line":36},[33,147530,98209],{"class":39},[33,147532,147533,147535,147537,147539],{"class":35,"line":43},[33,147534,190],{"class":163},[33,147536,193],{"class":167},[33,147538,164],{"class":163},[33,147540,198],{"class":167},[33,147542,147543,147545],{"class":35,"line":61},[33,147544,164],{"class":163},[33,147546,95887],{"class":167},[33,147548,147549],{"class":35,"line":73},[33,147550,92],{"emptyLinePlaceholder":91},[33,147552,147553,147555,147557,147559,147561],{"class":35,"line":88},[33,147554,126138],{"class":50},[33,147556,212],{"class":163},[33,147558,215],{"class":167},[33,147560,128434],{"class":54},[33,147562,221],{"class":167},[33,147564,147565],{"class":35,"line":95},[33,147566,92],{"emptyLinePlaceholder":91},[33,147568,147569,147571],{"class":35,"line":101},[33,147570,35574],{"class":163},[33,147572,574],{"class":167},[33,147574,147575,147577,147579,147581,147583,147585,147587,147589,147591,147593],{"class":35,"line":171},[33,147576,17432],{"class":167},[33,147578,242],{"class":163},[33,147580,144711],{"class":167},[33,147582,126138],{"class":50},[33,147584,365],{"class":167},[33,1
47586,144726],{"class":238},[33,147588,242],{"class":163},[33,147590,902],{"class":50},[33,147592,10922],{"class":167},[33,147594,147595],{"class":39},"# default; explicit for clarity\n",[33,147597,147598,147600,147602,147604],{"class":35,"line":179},[33,147599,35726],{"class":163},[33,147601,2945],{"class":50},[33,147603,1852],{"class":163},[33,147605,1855],{"class":167},[33,147607,147608,147610,147612,147614,147616,147618,147620,147622,147624,147626,147628,147630],{"class":35,"line":187},[33,147609,35742],{"class":163},[33,147611,16617],{"class":50},[33,147613,602],{"class":167},[33,147615,4059],{"class":163},[33,147617,15677],{"class":54},[33,147619,1115],{"class":50},[33,147621,6565],{"class":167},[33,147623,1121],{"class":50},[33,147625,274],{"class":54},[33,147627,1649],{"class":167},[33,147629,190],{"class":163},[33,147631,20843],{"class":167},[33,147633,147634],{"class":35,"line":201},[33,147635,92],{"emptyLinePlaceholder":91},[33,147637,147638,147640,147642,147644,147646],{"class":35,"line":206},[33,147639,98330],{"class":167},[33,147641,242],{"class":163},[33,147643,17447],{"class":167},[33,147645,147353],{"class":54},[33,147647,9202],{"class":167},[33,147649,147650],{"class":35,"line":224},[33,147651,92],{"emptyLinePlaceholder":91},[33,147653,147654,147656,147658,147660,147662,147664,147666,147668,147670,147672,147674],{"class":35,"line":229},[33,147655,6124],{"class":163},[33,147657,3844],{"class":167},[33,147659,662],{"class":163},[33,147661,17639],{"class":167},[33,147663,17642],{"class":238},[33,147665,242],{"class":163},[33,147667,1533],{"class":50},[33,147669,365],{"class":167},[33,147671,97398],{"class":238},[33,147673,242],{"class":163},[33,147675,98009],{"class":167},[33,147677,147678,147680,147682,147684],{"class":35,"line":235},[33,147679,656],{"class":163},[33,147681,17467],{"class":167},[33,147683,662],{"class":163},[33,147685,17675],{"class":167},[33,147687,147688,147690,147692,147695,147697,147699,147701,147704,147707],{"class":35,"line":25
0},[33,147689,8221],{"class":163},[33,147691,36538],{"class":50},[33,147693,147694],{"class":167},"(cell.value, ",[33,147696,1053],{"class":50},[33,147698,1649],{"class":167},[33,147700,6001],{"class":163},[33,147702,147703],{"class":167}," cell.value.startswith(",[33,147705,147706],{"class":54},"\"=\"",[33,147708,1737],{"class":167},[33,147710,147711,147713,147715,147717,147719,147721,147724,147726,147729,147731,147734,147736,147738,147740],{"class":35,"line":266},[33,147712,9364],{"class":50},[33,147714,602],{"class":167},[33,147716,4059],{"class":163},[33,147718,274],{"class":54},[33,147720,1115],{"class":50},[33,147722,147723],{"class":167},"cell.coordinate",[33,147725,1121],{"class":50},[33,147727,147728],{"class":54},": formula = ",[33,147730,1115],{"class":50},[33,147732,147733],{"class":167},"cell.value",[33,147735,76954],{"class":163},[33,147737,1121],{"class":50},[33,147739,274],{"class":54},[33,147741,221],{"class":167},[14,147743,147744],{},"Use this when you are auditing formulas, rewriting cell references, or building a report template — anywhere the formula text matters more than the numeric result.",[2537,147746],{},[18,147748,147750],{"id":147749},"fix-2-compute-in-python-and-write-literal-values","Fix 2 — Compute in Python and write literal values",[14,147752,147753],{},"The most portable fix: skip formulas entirely. Compute the result in Python (using pandas or plain arithmetic) and write the numeric value directly. 
No Excel round-trip required.",[23,147755,147757],{"className":126,"code":147756,"language":47,"meta":28,"style":28},"# pip install openpyxl pandas\nfrom pathlib import Path\nimport openpyxl\nimport pandas as pd\n\nSOURCE   = Path(\"sales_data.xlsx\")\nDEST     = Path(\"report_computed.xlsx\")\n\ntry:\n    df = pd.read_excel(SOURCE, sheet_name=\"Sales\", engine=\"openpyxl\")\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\n# Compute values in pandas — no formulas needed\ndf[\"Margin\"] = (df[\"Revenue\"] - df[\"Cost\"]) \u002F df[\"Revenue\"]\ntotal_revenue = df[\"Revenue\"].sum()\ntotal_cost    = df[\"Cost\"].sum()\navg_margin    = df[\"Margin\"].mean()\n\n# Write literal values — openpyxl stores them as \u003Cv>, readable with data_only=True\ntry:\n    wb = openpyxl.load_workbook(SOURCE)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"Cannot open source: {exc}\") from exc\n\nws = wb[\"Sales\"]\nsummary_row = ws.max_row + 2\nws[f\"A{summary_row}\"] = \"Total \u002F Avg\"\nws[f\"B{summary_row}\"] = total_revenue   # literal float, not a formula string\nws[f\"C{summary_row}\"] = total_cost\nws[f\"D{summary_row}\"] = avg_margin\nws[f\"B{summary_row}\"].number_format = '\"$\"#,##0'\nws[f\"C{summary_row}\"].number_format = '\"$\"#,##0'\nws[f\"D{summary_row}\"].number_format = \"0.0%\"\n\nwb.save(DEST)\nprint(f\"Written with literal values: 
{DEST}\")\n",[30,147758,147759,147763,147773,147779,147789,147793,147806,147819,147823,147829,147857,147867,147893,147897,147902,147934,147947,147960,147974,147978,147983,147989,148001,148011,148038,148042,148054,148067,148093,148120,148144,148168,148192,148214,148237,148241,148249],{"__ignoreMap":28},[33,147760,147761],{"class":35,"line":36},[33,147762,97874],{"class":39},[33,147764,147765,147767,147769,147771],{"class":35,"line":43},[33,147766,190],{"class":163},[33,147768,193],{"class":167},[33,147770,164],{"class":163},[33,147772,198],{"class":167},[33,147774,147775,147777],{"class":35,"line":61},[33,147776,164],{"class":163},[33,147778,95887],{"class":167},[33,147780,147781,147783,147785,147787],{"class":35,"line":73},[33,147782,164],{"class":163},[33,147784,492],{"class":167},[33,147786,495],{"class":163},[33,147788,498],{"class":167},[33,147790,147791],{"class":35,"line":88},[33,147792,92],{"emptyLinePlaceholder":91},[33,147794,147795,147797,147799,147801,147804],{"class":35,"line":95},[33,147796,86272],{"class":50},[33,147798,21012],{"class":163},[33,147800,215],{"class":167},[33,147802,147803],{"class":54},"\"sales_data.xlsx\"",[33,147805,221],{"class":167},[33,147807,147808,147810,147812,147814,147817],{"class":35,"line":101},[33,147809,129127],{"class":50},[33,147811,96938],{"class":163},[33,147813,215],{"class":167},[33,147815,147816],{"class":54},"\"report_computed.xlsx\"",[33,147818,221],{"class":167},[33,147820,147821],{"class":35,"line":171},[33,147822,92],{"emptyLinePlaceholder":91},[33,147824,147825,147827],{"class":35,"line":179},[33,147826,35574],{"class":163},[33,147828,574],{"class":167},[33,147830,147831,147833,147835,147837,147839,147841,147843,147845,147847,147849,147851,147853,147855],{"class":35,"line":187},[33,147832,4025],{"class":167},[33,147834,242],{"class":163},[33,147836,126254],{"class":167},[33,147838,86272],{"class":50},[33,147840,365],{"class":167},[33,147842,17371],{"class":238},[33,147844,242],{"class":163},[33,147846,140420],
{"class":54},[33,147848,365],{"class":167},[33,147850,17351],{"class":238},[33,147852,242],{"class":163},[33,147854,17356],{"class":54},[33,147856,221],{"class":167},[33,147858,147859,147861,147863,147865],{"class":35,"line":201},[33,147860,35726],{"class":163},[33,147862,2945],{"class":50},[33,147864,1852],{"class":163},[33,147866,1855],{"class":167},[33,147868,147869,147871,147873,147875,147877,147879,147881,147883,147885,147887,147889,147891],{"class":35,"line":206},[33,147870,35742],{"class":163},[33,147872,16617],{"class":50},[33,147874,602],{"class":167},[33,147876,4059],{"class":163},[33,147878,15677],{"class":54},[33,147880,1115],{"class":50},[33,147882,6565],{"class":167},[33,147884,1121],{"class":50},[33,147886,274],{"class":54},[33,147888,1649],{"class":167},[33,147890,190],{"class":163},[33,147892,20843],{"class":167},[33,147894,147895],{"class":35,"line":224},[33,147896,92],{"emptyLinePlaceholder":91},[33,147898,147899],{"class":35,"line":229},[33,147900,147901],{"class":39},"# Compute values in pandas — no formulas needed\n",[33,147903,147904,147906,147908,147910,147912,147914,147916,147918,147920,147922,147924,147926,147928,147930,147932],{"class":35,"line":235},[33,147905,11038],{"class":167},[33,147907,19598],{"class":54},[33,147909,763],{"class":167},[33,147911,242],{"class":163},[33,147913,59771],{"class":167},[33,147915,12925],{"class":54},[33,147917,763],{"class":167},[33,147919,4126],{"class":163},[33,147921,7935],{"class":167},[33,147923,19593],{"class":54},[33,147925,8675],{"class":167},[33,147927,1351],{"class":163},[33,147929,7935],{"class":167},[33,147931,12925],{"class":54},[33,147933,9202],{"class":167},[33,147935,147936,147939,147941,147943,147945],{"class":35,"line":250},[33,147937,147938],{"class":167},"total_revenue 
",[33,147940,242],{"class":163},[33,147942,7935],{"class":167},[33,147944,12925],{"class":54},[33,147946,18333],{"class":167},[33,147948,147949,147952,147954,147956,147958],{"class":35,"line":266},[33,147950,147951],{"class":167},"total_cost    ",[33,147953,242],{"class":163},[33,147955,7935],{"class":167},[33,147957,19593],{"class":54},[33,147959,18333],{"class":167},[33,147961,147962,147965,147967,147969,147971],{"class":35,"line":290},[33,147963,147964],{"class":167},"avg_margin    ",[33,147966,242],{"class":163},[33,147968,7935],{"class":167},[33,147970,19598],{"class":54},[33,147972,147973],{"class":167},"].mean()\n",[33,147975,147976],{"class":35,"line":295},[33,147977,92],{"emptyLinePlaceholder":91},[33,147979,147980],{"class":35,"line":300},[33,147981,147982],{"class":39},"# Write literal values — openpyxl stores them as \u003Cv>, readable with data_only=True\n",[33,147984,147985,147987],{"class":35,"line":317},[33,147986,35574],{"class":163},[33,147988,574],{"class":167},[33,147990,147991,147993,147995,147997,147999],{"class":35,"line":332},[33,147992,17432],{"class":167},[33,147994,242],{"class":163},[33,147996,144711],{"class":167},[33,147998,86272],{"class":50},[33,148000,221],{"class":167},[33,148002,148003,148005,148007,148009],{"class":35,"line":347},[33,148004,35726],{"class":163},[33,148006,2945],{"class":50},[33,148008,1852],{"class":163},[33,148010,1855],{"class":167},[33,148012,148013,148015,148017,148019,148021,148024,148026,148028,148030,148032,148034,148036],{"class":35,"line":374},[33,148014,35742],{"class":163},[33,148016,16617],{"class":50},[33,148018,602],{"class":167},[33,148020,4059],{"class":163},[33,148022,148023],{"class":54},"\"Cannot open source: 
",[33,148025,1115],{"class":50},[33,148027,6565],{"class":167},[33,148029,1121],{"class":50},[33,148031,274],{"class":54},[33,148033,1649],{"class":167},[33,148035,190],{"class":163},[33,148037,20843],{"class":167},[33,148039,148040],{"class":35,"line":397},[33,148041,92],{"emptyLinePlaceholder":91},[33,148043,148044,148046,148048,148050,148052],{"class":35,"line":653},[33,148045,98330],{"class":167},[33,148047,242],{"class":163},[33,148049,17447],{"class":167},[33,148051,140420],{"class":54},[33,148053,9202],{"class":167},[33,148055,148056,148059,148061,148063,148065],{"class":35,"line":667},[33,148057,148058],{"class":167},"summary_row ",[33,148060,242],{"class":163},[33,148062,17704],{"class":167},[33,148064,1811],{"class":163},[33,148066,97531],{"class":50},[33,148068,148069,148072,148074,148077,148079,148082,148084,148086,148088,148090],{"class":35,"line":675},[33,148070,148071],{"class":167},"ws[",[33,148073,4059],{"class":163},[33,148075,148076],{"class":54},"\"A",[33,148078,1115],{"class":50},[33,148080,148081],{"class":167},"summary_row",[33,148083,1121],{"class":50},[33,148085,274],{"class":54},[33,148087,763],{"class":167},[33,148089,242],{"class":163},[33,148091,148092],{"class":54}," \"Total \u002F Avg\"\n",[33,148094,148095,148097,148099,148102,148104,148106,148108,148110,148112,148114,148117],{"class":35,"line":689},[33,148096,148071],{"class":167},[33,148098,4059],{"class":163},[33,148100,148101],{"class":54},"\"B",[33,148103,1115],{"class":50},[33,148105,148081],{"class":167},[33,148107,1121],{"class":50},[33,148109,274],{"class":54},[33,148111,763],{"class":167},[33,148113,242],{"class":163},[33,148115,148116],{"class":167}," total_revenue   ",[33,148118,148119],{"class":39},"# literal float, not a formula 
string\n",[33,148121,148122,148124,148126,148129,148131,148133,148135,148137,148139,148141],{"class":35,"line":703},[33,148123,148071],{"class":167},[33,148125,4059],{"class":163},[33,148127,148128],{"class":54},"\"C",[33,148130,1115],{"class":50},[33,148132,148081],{"class":167},[33,148134,1121],{"class":50},[33,148136,274],{"class":54},[33,148138,763],{"class":167},[33,148140,242],{"class":163},[33,148142,148143],{"class":167}," total_cost\n",[33,148145,148146,148148,148150,148153,148155,148157,148159,148161,148163,148165],{"class":35,"line":714},[33,148147,148071],{"class":167},[33,148149,4059],{"class":163},[33,148151,148152],{"class":54},"\"D",[33,148154,1115],{"class":50},[33,148156,148081],{"class":167},[33,148158,1121],{"class":50},[33,148160,274],{"class":54},[33,148162,763],{"class":167},[33,148164,242],{"class":163},[33,148166,148167],{"class":167}," avg_margin\n",[33,148169,148170,148172,148174,148176,148178,148180,148182,148184,148187,148189],{"class":35,"line":723},[33,148171,148071],{"class":167},[33,148173,4059],{"class":163},[33,148175,148101],{"class":54},[33,148177,1115],{"class":50},[33,148179,148081],{"class":167},[33,148181,1121],{"class":50},[33,148183,274],{"class":54},[33,148185,148186],{"class":167},"].number_format ",[33,148188,242],{"class":163},[33,148190,148191],{"class":54}," 
'\"$\"#,##0'\n",[33,148193,148194,148196,148198,148200,148202,148204,148206,148208,148210,148212],{"class":35,"line":754},[33,148195,148071],{"class":167},[33,148197,4059],{"class":163},[33,148199,148128],{"class":54},[33,148201,1115],{"class":50},[33,148203,148081],{"class":167},[33,148205,1121],{"class":50},[33,148207,274],{"class":54},[33,148209,148186],{"class":167},[33,148211,242],{"class":163},[33,148213,148191],{"class":54},[33,148215,148216,148218,148220,148222,148224,148226,148228,148230,148232,148234],{"class":35,"line":771},[33,148217,148071],{"class":167},[33,148219,4059],{"class":163},[33,148221,148152],{"class":54},[33,148223,1115],{"class":50},[33,148225,148081],{"class":167},[33,148227,1121],{"class":50},[33,148229,274],{"class":54},[33,148231,148186],{"class":167},[33,148233,242],{"class":163},[33,148235,148236],{"class":54}," \"0.0%\"\n",[33,148238,148239],{"class":35,"line":777},[33,148240,92],{"emptyLinePlaceholder":91},[33,148242,148243,148245,148247],{"class":35,"line":788},[33,148244,100907],{"class":167},[33,148246,129127],{"class":50},[33,148248,221],{"class":167},[33,148250,148251,148253,148255,148257,148260,148262,148264],{"class":35,"line":804},[33,148252,13474],{"class":50},[33,148254,602],{"class":167},[33,148256,4059],{"class":163},[33,148258,148259],{"class":54},"\"Written with literal values: ",[33,148261,129317],{"class":50},[33,148263,274],{"class":54},[33,148265,221],{"class":167},[14,148267,148268,148269,148271],{},"This approach integrates naturally with the ",[940,148270,6936],{"href":6935}," workflow and works in any CI\u002FCD environment where Excel is not installed.",[2537,148273],{},[18,148275,148277],{"id":148276},"fix-3-force-recalculation-with-libreoffice-headless","Fix 3 — Force recalculation with LibreOffice headless",[14,148279,148280,148281,148283],{},"If you must have Excel-compatible cached values (for downstream tools that read ",[30,148282,147088],{},") without opening Excel manually, use LibreOffice headless 
to open and re-save the file.",[23,148285,148287],{"className":126,"code":148286,"language":47,"meta":28,"style":28},"# Requires: LibreOffice installed (apt install libreoffice or brew install --cask libreoffice)\n# pip install openpyxl (for verification step)\nimport subprocess\nfrom pathlib import Path\n\nWORKBOOK = Path(\"report.xlsx\").resolve()\nOUTPUT_DIR = WORKBOOK.parent\n\ntry:\n    result = subprocess.run(\n        [\n            \"libreoffice\", \"--headless\",\n            \"--calc\",\n            \"--convert-to\", \"xlsx\",\n            \"--outdir\", str(OUTPUT_DIR),\n            str(WORKBOOK),\n        ],\n        capture_output=True,\n        text=True,\n        timeout=60,\n        check=True,   # raises CalledProcessError on non-zero exit\n    )\n    print(result.stdout)\nexcept FileNotFoundError:\n    raise SystemExit(\"LibreOffice not found — install it or use Fix 2 instead.\")\nexcept subprocess.CalledProcessError as exc:\n    raise SystemExit(f\"LibreOffice failed: {exc.stderr}\") from exc\nexcept subprocess.TimeoutExpired:\n    raise SystemExit(\"LibreOffice timed out after 60 s.\")\n\n# Verify the cached values are now present\nimport openpyxl\nwb = openpyxl.load_workbook(WORKBOOK, data_only=True)\ncached = wb[\"Sheet1\"][\"B8\"].value\nprint(f\"Cached value after LibreOffice recalc: {cached!r}\")   # should be a number now\n",[30,148288,148289,148294,148299,148305,148315,148319,148332,148344,148348,148354,148362,148366,148378,148385,148397,148412,148422,148426,148437,148448,148459,148473,148477,148484,148492,148505,148516,148544,148551,148564,148568,148573,148579,148599,148616],{"__ignoreMap":28},[33,148290,148291],{"class":35,"line":36},[33,148292,148293],{"class":39},"# Requires: LibreOffice installed (apt install libreoffice or brew install --cask libreoffice)\n",[33,148295,148296],{"class":35,"line":43},[33,148297,148298],{"class":39},"# pip install openpyxl (for verification 
step)\n",[33,148300,148301,148303],{"class":35,"line":61},[33,148302,164],{"class":163},[33,148304,35040],{"class":167},[33,148306,148307,148309,148311,148313],{"class":35,"line":73},[33,148308,190],{"class":163},[33,148310,193],{"class":167},[33,148312,164],{"class":163},[33,148314,198],{"class":167},[33,148316,148317],{"class":35,"line":88},[33,148318,92],{"emptyLinePlaceholder":91},[33,148320,148321,148323,148325,148327,148329],{"class":35,"line":95},[33,148322,126138],{"class":50},[33,148324,212],{"class":163},[33,148326,215],{"class":167},[33,148328,128434],{"class":54},[33,148330,148331],{"class":167},").resolve()\n",[33,148333,148334,148336,148338,148341],{"class":35,"line":101},[33,148335,4615],{"class":50},[33,148337,212],{"class":163},[33,148339,148340],{"class":50}," WORKBOOK",[33,148342,148343],{"class":167},".parent\n",[33,148345,148346],{"class":35,"line":171},[33,148347,92],{"emptyLinePlaceholder":91},[33,148349,148350,148352],{"class":35,"line":179},[33,148351,35574],{"class":163},[33,148353,574],{"class":167},[33,148355,148356,148358,148360],{"class":35,"line":187},[33,148357,8842],{"class":167},[33,148359,242],{"class":163},[33,148361,35060],{"class":167},[33,148363,148364],{"class":35,"line":201},[33,148365,19619],{"class":167},[33,148367,148368,148371,148373,148376],{"class":35,"line":206},[33,148369,148370],{"class":54},"            \"libreoffice\"",[33,148372,365],{"class":167},[33,148374,148375],{"class":54},"\"--headless\"",[33,148377,247],{"class":167},[33,148379,148380,148383],{"class":35,"line":224},[33,148381,148382],{"class":54},"            \"--calc\"",[33,148384,247],{"class":167},[33,148386,148387,148390,148392,148395],{"class":35,"line":229},[33,148388,148389],{"class":54},"            \"--convert-to\"",[33,148391,365],{"class":167},[33,148393,148394],{"class":54},"\"xlsx\"",[33,148396,247],{"class":167},[33,148398,148399,148402,148404,148406,148408,148410],{"class":35,"line":235},[33,148400,148401],{"class":54},"            
\"--outdir\"",[33,148403,365],{"class":167},[33,148405,1053],{"class":50},[33,148407,602],{"class":167},[33,148409,4615],{"class":50},[33,148411,1506],{"class":167},[33,148413,148414,148416,148418,148420],{"class":35,"line":250},[33,148415,10673],{"class":50},[33,148417,602],{"class":167},[33,148419,126138],{"class":50},[33,148421,1506],{"class":167},[33,148423,148424],{"class":35,"line":266},[33,148425,20776],{"class":167},[33,148427,148428,148431,148433,148435],{"class":35,"line":290},[33,148429,148430],{"class":238},"        capture_output",[33,148432,242],{"class":163},[33,148434,855],{"class":50},[33,148436,247],{"class":167},[33,148438,148439,148442,148444,148446],{"class":35,"line":295},[33,148440,148441],{"class":238},"        text",[33,148443,242],{"class":163},[33,148445,855],{"class":50},[33,148447,247],{"class":167},[33,148449,148450,148453,148455,148457],{"class":35,"line":300},[33,148451,148452],{"class":238},"        timeout",[33,148454,242],{"class":163},[33,148456,2590],{"class":50},[33,148458,247],{"class":167},[33,148460,148461,148464,148466,148468,148470],{"class":35,"line":317},[33,148462,148463],{"class":238},"        check",[33,148465,242],{"class":163},[33,148467,855],{"class":50},[33,148469,1166],{"class":167},[33,148471,148472],{"class":39},"# raises CalledProcessError on non-zero exit\n",[33,148474,148475],{"class":35,"line":332},[33,148476,1202],{"class":167},[33,148478,148479,148481],{"class":35,"line":347},[33,148480,7268],{"class":50},[33,148482,148483],{"class":167},"(result.stdout)\n",[33,148485,148486,148488,148490],{"class":35,"line":374},[33,148487,35726],{"class":163},[33,148489,2945],{"class":50},[33,148491,574],{"class":167},[33,148493,148494,148496,148498,148500,148503],{"class":35,"line":397},[33,148495,35742],{"class":163},[33,148497,16617],{"class":50},[33,148499,602],{"class":167},[33,148501,148502],{"class":54},"\"LibreOffice not found — install it or use Fix 2 
instead.\"",[33,148504,221],{"class":167},[33,148506,148507,148509,148512,148514],{"class":35,"line":653},[33,148508,35726],{"class":163},[33,148510,148511],{"class":167}," subprocess.CalledProcessError ",[33,148513,495],{"class":163},[33,148515,1855],{"class":167},[33,148517,148518,148520,148522,148524,148526,148529,148531,148534,148536,148538,148540,148542],{"class":35,"line":667},[33,148519,35742],{"class":163},[33,148521,16617],{"class":50},[33,148523,602],{"class":167},[33,148525,4059],{"class":163},[33,148527,148528],{"class":54},"\"LibreOffice failed: ",[33,148530,1115],{"class":50},[33,148532,148533],{"class":167},"exc.stderr",[33,148535,1121],{"class":50},[33,148537,274],{"class":54},[33,148539,1649],{"class":167},[33,148541,190],{"class":163},[33,148543,20843],{"class":167},[33,148545,148546,148548],{"class":35,"line":675},[33,148547,35726],{"class":163},[33,148549,148550],{"class":167}," subprocess.TimeoutExpired:\n",[33,148552,148553,148555,148557,148559,148562],{"class":35,"line":689},[33,148554,35742],{"class":163},[33,148556,16617],{"class":50},[33,148558,602],{"class":167},[33,148560,148561],{"class":54},"\"LibreOffice timed out after 60 s.\"",[33,148563,221],{"class":167},[33,148565,148566],{"class":35,"line":703},[33,148567,92],{"emptyLinePlaceholder":91},[33,148569,148570],{"class":35,"line":714},[33,148571,148572],{"class":39},"# Verify the cached values are now 
present\n",[33,148574,148575,148577],{"class":35,"line":723},[33,148576,164],{"class":163},[33,148578,95887],{"class":167},[33,148580,148581,148583,148585,148587,148589,148591,148593,148595,148597],{"class":35,"line":754},[33,148582,98274],{"class":167},[33,148584,242],{"class":163},[33,148586,144711],{"class":167},[33,148588,126138],{"class":50},[33,148590,365],{"class":167},[33,148592,144726],{"class":238},[33,148594,242],{"class":163},[33,148596,855],{"class":50},[33,148598,221],{"class":167},[33,148600,148601,148604,148606,148608,148610,148612,148614],{"class":35,"line":771},[33,148602,148603],{"class":167},"cached ",[33,148605,242],{"class":163},[33,148607,17447],{"class":167},[33,148609,147353],{"class":54},[33,148611,44179],{"class":167},[33,148613,147358],{"class":54},[33,148615,147361],{"class":167},[33,148617,148618,148620,148622,148624,148627,148629,148632,148634,148636,148638,148640],{"class":35,"line":777},[33,148619,13474],{"class":50},[33,148621,602],{"class":167},[33,148623,4059],{"class":163},[33,148625,148626],{"class":54},"\"Cached value after LibreOffice recalc: ",[33,148628,1115],{"class":50},[33,148630,148631],{"class":167},"cached",[33,148633,76954],{"class":163},[33,148635,1121],{"class":50},[33,148637,274],{"class":54},[33,148639,12000],{"class":167},[33,148641,148642],{"class":39},"# should be a number now\n",[14,148644,148645,148646,148648,148649,148651],{},"LibreOffice evaluates formulas on open and writes ",[30,148647,147088],{}," before converting, so ",[30,148650,105730],{}," returns numeric results. The output file replaces the input (LibreOffice uses the same filename).",[2537,148653],{},[18,148655,148657,148658,148660],{"id":148656},"variant-fix-data_onlytrue-after-excel-has-opened-the-file","Variant fix — ",[30,148659,105730],{}," after Excel has opened the file",[14,148662,148663,148664,3035],{},"If a human has already opened the file in Excel and saved it, the cached values exist. 
You can then read them reliably with ",[30,148665,105730],{},[23,148667,148669],{"className":126,"code":148668,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\n# This only works if the file was previously saved by Excel (cached values present)\nWORKBOOK = Path(\"report_excel_saved.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK, data_only=True)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sheet1\"]\ncached = ws[\"B8\"].value\nprint(f\"Cached value: {cached!r}\")   # numeric result, e.g. 99000\n",[30,148670,148671,148675,148685,148691,148695,148700,148713,148717,148723,148743,148753,148779,148783,148795,148807],{"__ignoreMap":28},[33,148672,148673],{"class":35,"line":36},[33,148674,98209],{"class":39},[33,148676,148677,148679,148681,148683],{"class":35,"line":43},[33,148678,190],{"class":163},[33,148680,193],{"class":167},[33,148682,164],{"class":163},[33,148684,198],{"class":167},[33,148686,148687,148689],{"class":35,"line":61},[33,148688,164],{"class":163},[33,148690,95887],{"class":167},[33,148692,148693],{"class":35,"line":73},[33,148694,92],{"emptyLinePlaceholder":91},[33,148696,148697],{"class":35,"line":88},[33,148698,148699],{"class":39},"# This only works if the file was previously saved by Excel (cached values 
present)\n",[33,148701,148702,148704,148706,148708,148711],{"class":35,"line":95},[33,148703,126138],{"class":50},[33,148705,212],{"class":163},[33,148707,215],{"class":167},[33,148709,148710],{"class":54},"\"report_excel_saved.xlsx\"",[33,148712,221],{"class":167},[33,148714,148715],{"class":35,"line":101},[33,148716,92],{"emptyLinePlaceholder":91},[33,148718,148719,148721],{"class":35,"line":171},[33,148720,35574],{"class":163},[33,148722,574],{"class":167},[33,148724,148725,148727,148729,148731,148733,148735,148737,148739,148741],{"class":35,"line":179},[33,148726,17432],{"class":167},[33,148728,242],{"class":163},[33,148730,144711],{"class":167},[33,148732,126138],{"class":50},[33,148734,365],{"class":167},[33,148736,144726],{"class":238},[33,148738,242],{"class":163},[33,148740,855],{"class":50},[33,148742,221],{"class":167},[33,148744,148745,148747,148749,148751],{"class":35,"line":187},[33,148746,35726],{"class":163},[33,148748,2945],{"class":50},[33,148750,1852],{"class":163},[33,148752,1855],{"class":167},[33,148754,148755,148757,148759,148761,148763,148765,148767,148769,148771,148773,148775,148777],{"class":35,"line":201},[33,148756,35742],{"class":163},[33,148758,16617],{"class":50},[33,148760,602],{"class":167},[33,148762,4059],{"class":163},[33,148764,15677],{"class":54},[33,148766,1115],{"class":50},[33,148768,6565],{"class":167},[33,148770,1121],{"class":50},[33,148772,274],{"class":54},[33,148774,1649],{"class":167},[33,148776,190],{"class":163},[33,148778,20843],{"class":167},[33,148780,148781],{"class":35,"line":206},[33,148782,92],{"emptyLinePlaceholder":91},[33,148784,148785,148787,148789,148791,148793],{"class":35,"line":224},[33,148786,98330],{"class":167},[33,148788,242],{"class":163},[33,148790,17447],{"class":167},[33,148792,147353],{"class":54},[33,148794,9202],{"class":167},[33,148796,148797,148799,148801,148803,148805],{"class":35,"line":229},[33,148798,148603],{"class":167},[33,148800,242],{"class":163},[33,148802,17472],{"class":167},[3
3,148804,147358],{"class":54},[33,148806,147361],{"class":167},[33,148808,148809,148811,148813,148815,148818,148820,148822,148824,148826,148828,148830],{"class":35,"line":235},[33,148810,13474],{"class":50},[33,148812,602],{"class":167},[33,148814,4059],{"class":163},[33,148816,148817],{"class":54},"\"Cached value: ",[33,148819,1115],{"class":50},[33,148821,148631],{"class":167},[33,148823,76954],{"class":163},[33,148825,1121],{"class":50},[33,148827,274],{"class":54},[33,148829,12000],{"class":167},[33,148831,148832],{"class":39},"# numeric result, e.g. 99000\n",[14,148834,148835],{},"This works for interactive workflows (a human maintains the file) but is not reliable in fully automated pipelines — you cannot guarantee Excel has been run.",[2537,148837],{},[18,148839,148657,148841,148844],{"id":148840},"variant-fix-xlwings-for-live-recalculation-windowsmacos-only",[30,148842,148843],{},"xlwings"," for live recalculation (Windows\u002FmacOS only)",[14,148846,148847,148848,148850],{},"On Windows or macOS, ",[30,148849,148843],{}," drives Excel via COM\u002FAppleScript, forcing a true recalculation.",[23,148852,148854],{"className":126,"code":148853,"language":47,"meta":28,"style":28},"# pip install xlwings\n# Requires Excel installed on Windows or macOS\nfrom pathlib import Path\nimport xlwings as xw\n\nWORKBOOK = Path(\"report.xlsx\").resolve()\n\ntry:\n    app = xw.App(visible=False)\n    wb  = app.books.open(str(WORKBOOK))\n    wb.app.calculate()         # force recalculation\n    wb.save()\n    wb.close()\n    app.quit()\n    print(f\"Recalculated and saved: {WORKBOOK}\")\nexcept Exception as exc:\n    raise SystemExit(f\"xlwings error: {exc}\") from exc\n",[30,148855,148856,148861,148866,148876,148888,148892,148904,148908,148914,148932,148950,148958,148963,148967,148972,148990,149000],{"__ignoreMap":28},[33,148857,148858],{"class":35,"line":36},[33,148859,148860],{"class":39},"# pip install 
xlwings\n",[33,148862,148863],{"class":35,"line":43},[33,148864,148865],{"class":39},"# Requires Excel installed on Windows or macOS\n",[33,148867,148868,148870,148872,148874],{"class":35,"line":61},[33,148869,190],{"class":163},[33,148871,193],{"class":167},[33,148873,164],{"class":163},[33,148875,198],{"class":167},[33,148877,148878,148880,148883,148885],{"class":35,"line":73},[33,148879,164],{"class":163},[33,148881,148882],{"class":167}," xlwings ",[33,148884,495],{"class":163},[33,148886,148887],{"class":167}," xw\n",[33,148889,148890],{"class":35,"line":88},[33,148891,92],{"emptyLinePlaceholder":91},[33,148893,148894,148896,148898,148900,148902],{"class":35,"line":95},[33,148895,126138],{"class":50},[33,148897,212],{"class":163},[33,148899,215],{"class":167},[33,148901,128434],{"class":54},[33,148903,148331],{"class":167},[33,148905,148906],{"class":35,"line":101},[33,148907,92],{"emptyLinePlaceholder":91},[33,148909,148910,148912],{"class":35,"line":171},[33,148911,35574],{"class":163},[33,148913,574],{"class":167},[33,148915,148916,148919,148921,148924,148926,148928,148930],{"class":35,"line":179},[33,148917,148918],{"class":167},"    app ",[33,148920,242],{"class":163},[33,148922,148923],{"class":167}," xw.App(",[33,148925,28642],{"class":238},[33,148927,242],{"class":163},[33,148929,902],{"class":50},[33,148931,221],{"class":167},[33,148933,148934,148937,148939,148942,148944,148946,148948],{"class":35,"line":187},[33,148935,148936],{"class":167},"    wb  ",[33,148938,242],{"class":163},[33,148940,148941],{"class":167}," app.books.open(",[33,148943,1053],{"class":50},[33,148945,602],{"class":167},[33,148947,126138],{"class":50},[33,148949,371],{"class":167},[33,148951,148952,148955],{"class":35,"line":201},[33,148953,148954],{"class":167},"    wb.app.calculate()         ",[33,148956,148957],{"class":39},"# force recalculation\n",[33,148959,148960],{"class":35,"line":206},[33,148961,148962],{"class":167},"    
wb.save()\n",[33,148964,148965],{"class":35,"line":224},[33,148966,99958],{"class":167},[33,148968,148969],{"class":35,"line":229},[33,148970,148971],{"class":167},"    app.quit()\n",[33,148973,148974,148976,148978,148980,148983,148986,148988],{"class":35,"line":235},[33,148975,7268],{"class":50},[33,148977,602],{"class":167},[33,148979,4059],{"class":163},[33,148981,148982],{"class":54},"\"Recalculated and saved: ",[33,148984,148985],{"class":50},"{WORKBOOK}",[33,148987,274],{"class":54},[33,148989,221],{"class":167},[33,148991,148992,148994,148996,148998],{"class":35,"line":250},[33,148993,35726],{"class":163},[33,148995,783],{"class":50},[33,148997,1852],{"class":163},[33,148999,1855],{"class":167},[33,149001,149002,149004,149006,149008,149010,149013,149015,149017,149019,149021,149023,149025],{"class":35,"line":266},[33,149003,35742],{"class":163},[33,149005,16617],{"class":50},[33,149007,602],{"class":167},[33,149009,4059],{"class":163},[33,149011,149012],{"class":54},"\"xlwings error: ",[33,149014,1115],{"class":50},[33,149016,6565],{"class":167},[33,149018,1121],{"class":50},[33,149020,274],{"class":54},[33,149022,1649],{"class":167},[33,149024,190],{"class":163},[33,149026,20843],{"class":167},[14,149028,149029,149031],{},[30,149030,148843],{}," is the right tool when you need Excel-precise results and are running on a desktop OS. It does not work in Linux CI environments without Excel.",[2537,149033],{},[18,149035,149037],{"id":149036},"additional-troubleshooting-formulas-that-appear-blank-in-excel-itself","Additional troubleshooting: formulas that appear blank in Excel itself",[14,149039,149040,149041,149043],{},"Sometimes the formula is stored correctly but displays blank in Excel without triggering a recalculation. 
This is a separate problem from the ",[30,149042,105730],{}," \u002F Python read-back issue.",[14,149045,149046,149048],{},[1974,149047,4284],{},": The cell shows no value in Excel, even after opening.",[14,149050,149051,149058,149059,36661,149061,149064],{},[1974,149052,149053,149054,149057],{},"Cause 1 — ",[30,149055,149056],{},"calcPr"," recalculation is set to manual."," openpyxl sets ",[30,149060,149056],{},[30,149062,149063],{},"fullCalcOnLoad=\"1\""," by default, but if the workbook was created with a template that sets recalculation to manual, Excel will not auto-calculate on open.",[14,149066,149067,149069,149070,20891],{},[1974,149068,4290],{},": In the Python script, explicitly set ",[30,149071,149072],{},"fullCalcOnLoad",[23,149074,149076],{"className":126,"code":149075,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"report.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\n# Force Excel to recalculate all formulas on open\nwb.calculation.calcMode = \"auto\"\nwb.calculation.fullCalcOnLoad = True\n\nwb.save(WORKBOOK)\nprint(\"calcPr set to 
fullCalcOnLoad.\")\n",[30,149077,149078,149082,149092,149098,149102,149114,149118,149124,149136,149146,149172,149176,149181,149191,149200,149204,149212],{"__ignoreMap":28},[33,149079,149080],{"class":35,"line":36},[33,149081,98209],{"class":39},[33,149083,149084,149086,149088,149090],{"class":35,"line":43},[33,149085,190],{"class":163},[33,149087,193],{"class":167},[33,149089,164],{"class":163},[33,149091,198],{"class":167},[33,149093,149094,149096],{"class":35,"line":61},[33,149095,164],{"class":163},[33,149097,95887],{"class":167},[33,149099,149100],{"class":35,"line":73},[33,149101,92],{"emptyLinePlaceholder":91},[33,149103,149104,149106,149108,149110,149112],{"class":35,"line":88},[33,149105,126138],{"class":50},[33,149107,212],{"class":163},[33,149109,215],{"class":167},[33,149111,128434],{"class":54},[33,149113,221],{"class":167},[33,149115,149116],{"class":35,"line":95},[33,149117,92],{"emptyLinePlaceholder":91},[33,149119,149120,149122],{"class":35,"line":101},[33,149121,35574],{"class":163},[33,149123,574],{"class":167},[33,149125,149126,149128,149130,149132,149134],{"class":35,"line":171},[33,149127,17432],{"class":167},[33,149129,242],{"class":163},[33,149131,144711],{"class":167},[33,149133,126138],{"class":50},[33,149135,221],{"class":167},[33,149137,149138,149140,149142,149144],{"class":35,"line":179},[33,149139,35726],{"class":163},[33,149141,2945],{"class":50},[33,149143,1852],{"class":163},[33,149145,1855],{"class":167},[33,149147,149148,149150,149152,149154,149156,149158,149160,149162,149164,149166,149168,149170],{"class":35,"line":187},[33,149149,35742],{"class":163},[33,149151,16617],{"class":50},[33,149153,602],{"class":167},[33,149155,4059],{"class":163},[33,149157,15677],{"class":54},[33,149159,1115],{"class":50},[33,149161,6565],{"class":167},[33,149163,1121],{"class":50},[33,149165,274],{"class":54},[33,149167,1649],{"class":167},[33,149169,190],{"class":163},[33,149171,20843],{"class":167},[33,149173,149174],{"class":35,"line":201},[33,1491
75,92],{"emptyLinePlaceholder":91},[33,149177,149178],{"class":35,"line":206},[33,149179,149180],{"class":39},"# Force Excel to recalculate all formulas on open\n",[33,149182,149183,149186,149188],{"class":35,"line":224},[33,149184,149185],{"class":167},"wb.calculation.calcMode ",[33,149187,242],{"class":163},[33,149189,149190],{"class":54}," \"auto\"\n",[33,149192,149193,149196,149198],{"class":35,"line":229},[33,149194,149195],{"class":167},"wb.calculation.fullCalcOnLoad ",[33,149197,242],{"class":163},[33,149199,2887],{"class":50},[33,149201,149202],{"class":35,"line":235},[33,149203,92],{"emptyLinePlaceholder":91},[33,149205,149206,149208,149210],{"class":35,"line":250},[33,149207,100907],{"class":167},[33,149209,126138],{"class":50},[33,149211,221],{"class":167},[33,149213,149214,149216,149218,149221],{"class":35,"line":266},[33,149215,13474],{"class":50},[33,149217,602],{"class":167},[33,149219,149220],{"class":54},"\"calcPr set to fullCalcOnLoad.\"",[33,149222,221],{"class":167},[14,149224,149225,149228,149229,149232,149233,149236,149237,149240],{},[1974,149226,149227],{},"Cause 2 — Array formula not marked as array."," Formulas like ",[30,149230,149231],{},"=SUMPRODUCT(...)"," work as regular formulas, but ",[30,149234,149235],{},"=SUM(IF(...))"," requires array entry (",[30,149238,149239],{},"Ctrl+Shift+Enter"," in Excel). openpyxl does not support writing array formulas natively. In this case, rewrite the formula to avoid array syntax, or use a helper column.",[14,149242,149243,149246],{},[1974,149244,149245],{},"Cause 3 — Circular reference."," If the formula references its own cell (directly or via a chain), Excel may display zero or blank. 
Inspect the formula logic — openpyxl does not validate formula semantics.",[2537,149248],{},[18,149250,9247],{"id":9246},[14,149252,149253],{},"After applying any fix, confirm the cached value is now readable:",[23,149255,149257],{"className":126,"code":149256,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"report.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK, data_only=True)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sheet1\"]\nvalue = ws[\"B8\"].value\n\nassert value is not None, (\n    \"B8 is still None — file was not recalculated by Excel\u002FLibreOffice, \"\n    \"or cached value was not written. Use Fix 2 (literal values) instead.\"\n)\nassert isinstance(value, (int, float)), f\"Expected numeric, got {type(value)}\"\nprint(f\"B8 cached value: {value}   ✓\")\n",[30,149258,149259,149263,149273,149279,149283,149295,149299,149305,149325,149335,149361,149365,149377,149390,149394,149409,149414,149419,149423,149454],{"__ignoreMap":28},[33,149260,149261],{"class":35,"line":36},[33,149262,98209],{"class":39},[33,149264,149265,149267,149269,149271],{"class":35,"line":43},[33,149266,190],{"class":163},[33,149268,193],{"class":167},[33,149270,164],{"class":163},[33,149272,198],{"class":167},[33,149274,149275,149277],{"class":35,"line":61},[33,149276,164],{"class":163},[33,149278,95887],{"class":167},[33,149280,149281],{"class":35,"line":73},[33,149282,92],{"emptyLinePlaceholder":91},[33,149284,149285,149287,149289,149291,149293],{"class":35,"line":88},[33,149286,126138],{"class":50},[33,149288,212],{"class":163},[33,149290,215],{"class":167},[33,149292,128434],{"class":54},[33,149294,221],{"class":167},[33,149296,149297],{"class":35,"line":95},[33,149298,92],{"emptyLinePlaceholder":91},[33,149300,149301,149303],{"class":35,"line":101},[33,149302,35574],{"class":163},[33,149304,574],{"class":167},[33,149306,149307,149309,1
49311,149313,149315,149317,149319,149321,149323],{"class":35,"line":171},[33,149308,17432],{"class":167},[33,149310,242],{"class":163},[33,149312,144711],{"class":167},[33,149314,126138],{"class":50},[33,149316,365],{"class":167},[33,149318,144726],{"class":238},[33,149320,242],{"class":163},[33,149322,855],{"class":50},[33,149324,221],{"class":167},[33,149326,149327,149329,149331,149333],{"class":35,"line":179},[33,149328,35726],{"class":163},[33,149330,2945],{"class":50},[33,149332,1852],{"class":163},[33,149334,1855],{"class":167},[33,149336,149337,149339,149341,149343,149345,149347,149349,149351,149353,149355,149357,149359],{"class":35,"line":187},[33,149338,35742],{"class":163},[33,149340,16617],{"class":50},[33,149342,602],{"class":167},[33,149344,4059],{"class":163},[33,149346,15677],{"class":54},[33,149348,1115],{"class":50},[33,149350,6565],{"class":167},[33,149352,1121],{"class":50},[33,149354,274],{"class":54},[33,149356,1649],{"class":167},[33,149358,190],{"class":163},[33,149360,20843],{"class":167},[33,149362,149363],{"class":35,"line":201},[33,149364,92],{"emptyLinePlaceholder":91},[33,149366,149367,149369,149371,149373,149375],{"class":35,"line":206},[33,149368,98330],{"class":167},[33,149370,242],{"class":163},[33,149372,17447],{"class":167},[33,149374,147353],{"class":54},[33,149376,9202],{"class":167},[33,149378,149379,149382,149384,149386,149388],{"class":35,"line":224},[33,149380,149381],{"class":167},"value ",[33,149383,242],{"class":163},[33,149385,17472],{"class":167},[33,149387,147358],{"class":54},[33,149389,147361],{"class":167},[33,149391,149392],{"class":35,"line":229},[33,149393,92],{"emptyLinePlaceholder":91},[33,149395,149396,149398,149401,149403,149405,149407],{"class":35,"line":235},[33,149397,36397],{"class":163},[33,149399,149400],{"class":167}," value 
",[33,149402,3847],{"class":163},[33,149404,620],{"class":163},[33,149406,7657],{"class":50},[33,149408,9528],{"class":167},[33,149410,149411],{"class":35,"line":250},[33,149412,149413],{"class":54},"    \"B8 is still None — file was not recalculated by Excel\u002FLibreOffice, \"\n",[33,149415,149416],{"class":35,"line":266},[33,149417,149418],{"class":54},"    \"or cached value was not written. Use Fix 2 (literal values) instead.\"\n",[33,149420,149421],{"class":35,"line":290},[33,149422,221],{"class":167},[33,149424,149425,149427,149429,149432,149434,149436,149438,149440,149442,149445,149447,149450,149452],{"class":35,"line":295},[33,149426,36397],{"class":163},[33,149428,36538],{"class":50},[33,149430,149431],{"class":167},"(value, (",[33,149433,1059],{"class":50},[33,149435,365],{"class":167},[33,149437,1720],{"class":50},[33,149439,77348],{"class":167},[33,149441,4059],{"class":163},[33,149443,149444],{"class":54},"\"Expected numeric, got ",[33,149446,86374],{"class":50},[33,149448,149449],{"class":167},"(value)",[33,149451,1121],{"class":50},[33,149453,7504],{"class":54},[33,149455,149456,149458,149460,149462,149465,149467,149469,149471,149474],{"class":35,"line":300},[33,149457,13474],{"class":50},[33,149459,602],{"class":167},[33,149461,4059],{"class":163},[33,149463,149464],{"class":54},"\"B8 cached value: ",[33,149466,1115],{"class":50},[33,149468,67110],{"class":167},[33,149470,1121],{"class":50},[33,149472,149473],{"class":54},"   ✓\"",[33,149475,221],{"class":167},[14,149477,149478,149479,149482,149483,149486],{},"If the assertion fails after Fix 3 (LibreOffice), check that LibreOffice did not error silently — inspect ",[30,149480,149481],{},"result.stderr"," from the ",[30,149484,149485],{},"subprocess.run"," call.",[2537,149488],{},[18,149490,6918],{"id":6917},[4211,149492,149493,149498,149503],{},[4214,149494,149495,149497],{},[940,149496,102074],{"href":102073}," — full guide to writing formulas, named ranges, and 
charts",[4214,149499,149500,149502],{},[940,149501,99577],{"href":99576}," — loading .xlsx data correctly with pandas and openpyxl",[4214,149504,149505,149507],{},[940,149506,6936],{"href":6935}," — end-to-end automated report workflows",[14,149509,6947,149510,3035],{},[940,149511,102074],{"href":102073},[6953,149513,149514],{},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":149516},[149517,149521,149522,149523,149524,149525,149527,149529,149530,149531],{"id":7020,"depth":43,"text":4287,"children":149518},[149519,149520],{"id":147126,"depth":61,"text":147127},{"id":147205,"depth":61,"text":147206},{"id":35016,"depth":43,"text":147260},{"id":147514,"depth":43,"text":147515},{"id":147749,"depth":43,"text":147750},{"id":148276,"depth":43,"text":148277},{"id":148656,"depth":43,"text":149526},"Variant fix — data_only=True after Excel has opened the file",{"id":148840,"depth":43,"text":149528},"Variant fix — xlwings for live recalculation (Windows\u002FmacOS 
only)",{"id":149036,"depth":43,"text":149037},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Formulas Showing as Blank","openpyxl writes formula strings but never computes them — reading back with data_only=True returns None until Excel caches values. Here are four concrete fixes.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fwriting-excel-formulas-and-charts-with-openpyxl\u002Ffix-openpyxl-formulas-not-calculating",{"title":147046,"description":149533},"Fix openpyxl Formulas Not Calculating (None on Read-Back)","python-for-excel-csv-data-processing\u002Fwriting-excel-formulas-and-charts-with-openpyxl\u002Ffix-openpyxl-formulas-not-calculating\u002Findex",[47,22009,99614,149540,81739],"formulas","Wdi5l19ZDd5bHilwCPFZbFLucXjwgCCcVI6D1gOnsd0",{"id":149543,"title":102074,"body":149544,"breadcrumbTitle":156141,"canonical":6977,"date":6978,"description":156142,"draft":6980,"extension":6981,"image":6977,"meta":156143,"navigation":91,"path":156144,"robots":6977,"seo":156145,"seoTitle":102074,"stem":156146,"tags":156147,"updatedAt":6978,"__hash__":156149},"content\u002Fpython-for-excel-csv-data-processing\u002Fwriting-excel-formulas-and-charts-with-openpyxl\u002Findex.md",{"type":7,"value":149545,"toc":156116},[149546,149549,149562,149564,149567,149661,149663,149667,149670,149842,149852,149854,149858,149870,150298,150301,150320,150322,150326,150338,150709,150725,150727,150731,150738,151111,151114,151187,151189,151193,151209,151598,151611,151613,151617,151620,151962,151968,151970,151974,151977,152362,152372,152374,152378,152381,152478,152480,152484,152488,152962,152966,152982,153132,153138,153140,153142,153146,153149,153533,153537,153543,153854,153858,153869,154054,154056,154058,154061,154365,154367,154369,154407,154409,154411,154517,154519,154521,156083,156085,156087,156109,156113],[10,149547,102074],{"id":149548},"writing-excel-formulas-and-charts-with-openpyxl",[14,149550,149551,149552,149554,149555,1351,149558,149561],{},"openpyxl lets you 
write ",[30,149553,26542],{}," files entirely from Python — including live formulas, named ranges, custom number formats, and embedded charts. The tricky parts are understanding what openpyxl actually stores (strings and references, not computed values) and knowing how the ",[30,149556,149557],{},"Reference",[30,149559,149560],{},"Series"," API maps onto the chart object model. This guide covers both in full.",[18,149563,21],{"id":20},[14,149565,149566],{},"You need Python 3.9+, openpyxl, and a sample data file to experiment with. The snippet below creates one.",[23,149568,149570],{"className":25,"code":149569,"language":27,"meta":28,"style":28},"# pip install openpyxl\npython - \u003C\u003C'EOF'\nfrom pathlib import Path\nimport openpyxl\n\nwb = openpyxl.Workbook()\nws = wb.active\nws.title = \"Sales\"\nws.append([\"Month\", \"Revenue\", \"Cost\"])\nrows = [\n    (\"Jan\", 12000, 8000), (\"Feb\", 15000, 9500), (\"Mar\", 13500, 8800),\n    (\"Apr\", 17000, 10200), (\"May\", 19500, 11000), (\"Jun\", 22000, 12500),\n]\nfor r in rows:\n    ws.append(r)\nwb.save(Path(\"sample_sales.xlsx\"))\nprint(\"sample_sales.xlsx written\")\nEOF\n",[30,149571,149572,149576,149586,149590,149595,149599,149604,149609,149614,149619,149623,149628,149633,149637,149642,149647,149652,149657],{"__ignoreMap":28},[33,149573,149574],{"class":35,"line":36},[33,149575,98209],{"class":39},[33,149577,149578,149580,149582,149584],{"class":35,"line":43},[33,149579,47],{"class":46},[33,149581,39025],{"class":54},[33,149583,53957],{"class":163},[33,149585,53960],{"class":54},[33,149587,149588],{"class":35,"line":61},[33,149589,112890],{"class":54},[33,149591,149592],{"class":35,"line":73},[33,149593,149594],{"class":54},"import openpyxl\n",[33,149596,149597],{"class":35,"line":88},[33,149598,92],{"emptyLinePlaceholder":91},[33,149600,149601],{"class":35,"line":95},[33,149602,149603],{"class":54},"wb = openpyxl.Workbook()\n",[33,149605,149606],{"class":35,"line":101},[33,149607,149608],{"class":54},"ws 
= wb.active\n",[33,149610,149611],{"class":35,"line":171},[33,149612,149613],{"class":54},"ws.title = \"Sales\"\n",[33,149615,149616],{"class":35,"line":179},[33,149617,149618],{"class":54},"ws.append([\"Month\", \"Revenue\", \"Cost\"])\n",[33,149620,149621],{"class":35,"line":187},[33,149622,53970],{"class":54},[33,149624,149625],{"class":35,"line":201},[33,149626,149627],{"class":54},"    (\"Jan\", 12000, 8000), (\"Feb\", 15000, 9500), (\"Mar\", 13500, 8800),\n",[33,149629,149630],{"class":35,"line":206},[33,149631,149632],{"class":54},"    (\"Apr\", 17000, 10200), (\"May\", 19500, 11000), (\"Jun\", 22000, 12500),\n",[33,149634,149635],{"class":35,"line":224},[33,149636,9202],{"class":54},[33,149638,149639],{"class":35,"line":229},[33,149640,149641],{"class":54},"for r in rows:\n",[33,149643,149644],{"class":35,"line":235},[33,149645,149646],{"class":54},"    ws.append(r)\n",[33,149648,149649],{"class":35,"line":250},[33,149650,149651],{"class":54},"wb.save(Path(\"sample_sales.xlsx\"))\n",[33,149653,149654],{"class":35,"line":266},[33,149655,149656],{"class":54},"print(\"sample_sales.xlsx written\")\n",[33,149658,149659],{"class":35,"line":290},[33,149660,54019],{"class":54},[2537,149662],{},[18,149664,149666],{"id":149665},"step-1-load-the-workbook-and-inspect-the-sheet","Step 1 — Load the workbook and inspect the sheet",[14,149668,149669],{},"Before writing formulas, confirm column layout and data extent.",[23,149671,149673],{"className":126,"code":149672,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\nprint(f\"Dimensions: {ws.dimensions}\")   # e.g. 
A1:C7\nprint(f\"Max row: {ws.max_row}, Max col: {ws.max_column}\")\n",[30,149674,149675,149679,149689,149695,149699,149712,149716,149722,149734,149744,149770,149774,149786,149811],{"__ignoreMap":28},[33,149676,149677],{"class":35,"line":36},[33,149678,98209],{"class":39},[33,149680,149681,149683,149685,149687],{"class":35,"line":43},[33,149682,190],{"class":163},[33,149684,193],{"class":167},[33,149686,164],{"class":163},[33,149688,198],{"class":167},[33,149690,149691,149693],{"class":35,"line":61},[33,149692,164],{"class":163},[33,149694,95887],{"class":167},[33,149696,149697],{"class":35,"line":73},[33,149698,92],{"emptyLinePlaceholder":91},[33,149700,149701,149703,149705,149707,149710],{"class":35,"line":88},[33,149702,126138],{"class":50},[33,149704,212],{"class":163},[33,149706,215],{"class":167},[33,149708,149709],{"class":54},"\"sample_sales.xlsx\"",[33,149711,221],{"class":167},[33,149713,149714],{"class":35,"line":95},[33,149715,92],{"emptyLinePlaceholder":91},[33,149717,149718,149720],{"class":35,"line":101},[33,149719,35574],{"class":163},[33,149721,574],{"class":167},[33,149723,149724,149726,149728,149730,149732],{"class":35,"line":171},[33,149725,17432],{"class":167},[33,149727,242],{"class":163},[33,149729,144711],{"class":167},[33,149731,126138],{"class":50},[33,149733,221],{"class":167},[33,149735,149736,149738,149740,149742],{"class":35,"line":179},[33,149737,35726],{"class":163},[33,149739,2945],{"class":50},[33,149741,1852],{"class":163},[33,149743,1855],{"class":167},[33,149745,149746,149748,149750,149752,149754,149756,149758,149760,149762,149764,149766,149768],{"class":35,"line":187},[33,149747,35742],{"class":163},[33,149749,16617],{"class":50},[33,149751,602],{"class":167},[33,149753,4059],{"class":163},[33,149755,15677],{"class":54},[33,149757,1115],{"class":50},[33,149759,6565],{"class":167},[33,149761,1121],{"class":50},[33,149763,274],{"class":54},[33,149765,1649],{"class":167},[33,149767,190],{"class":163},[33,149769,20843],{"class":167},
[33,149771,149772],{"class":35,"line":201},[33,149773,92],{"emptyLinePlaceholder":91},[33,149775,149776,149778,149780,149782,149784],{"class":35,"line":206},[33,149777,98330],{"class":167},[33,149779,242],{"class":163},[33,149781,17447],{"class":167},[33,149783,140420],{"class":54},[33,149785,9202],{"class":167},[33,149787,149788,149790,149792,149794,149797,149799,149802,149804,149806,149808],{"class":35,"line":224},[33,149789,13474],{"class":50},[33,149791,602],{"class":167},[33,149793,4059],{"class":163},[33,149795,149796],{"class":54},"\"Dimensions: ",[33,149798,1115],{"class":50},[33,149800,149801],{"class":167},"ws.dimensions",[33,149803,1121],{"class":50},[33,149805,274],{"class":54},[33,149807,12000],{"class":167},[33,149809,149810],{"class":39},"# e.g. A1:C7\n",[33,149812,149813,149815,149817,149819,149822,149824,149826,149828,149831,149833,149836,149838,149840],{"class":35,"line":229},[33,149814,13474],{"class":50},[33,149816,602],{"class":167},[33,149818,4059],{"class":163},[33,149820,149821],{"class":54},"\"Max row: ",[33,149823,1115],{"class":50},[33,149825,22493],{"class":167},[33,149827,1121],{"class":50},[33,149829,149830],{"class":54},", Max col: ",[33,149832,1115],{"class":50},[33,149834,149835],{"class":167},"ws.max_column",[33,149837,1121],{"class":50},[33,149839,274],{"class":54},[33,149841,221],{"class":167},[14,149843,149844,149846,149847,10065,149849,149851],{},[30,149845,149801],{}," returns the bounding box of used cells. Use ",[30,149848,22493],{},[30,149850,149835],{}," to compute formula ranges programmatically instead of hard-coding them.",[2537,149853],{},[18,149855,149857],{"id":149856},"step-2-write-live-formulas","Step 2 — Write live formulas",[14,149859,149860,149861,149863,149864,149867,149868,3035],{},"Assign a formula string to ",[30,149862,147733],{},". openpyxl stores the string verbatim; Excel evaluates it when the file is opened. 
See ",[940,149865,147046],{"href":149866},"\u002Fpython-for-excel-csv-data-processing\u002Fwriting-excel-formulas-and-charts-with-openpyxl\u002Ffix-openpyxl-formulas-not-calculating\u002F"," for what happens when you read the file back with ",[30,149869,105730],{},[23,149871,149873],{"className":126,"code":149872,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\n\nlast_data_row = ws.max_row          # 7 (header on row 1, data rows 2-7)\nsummary_row   = last_data_row + 2   # leave a blank row\n\n# SUM formulas\nws[f\"B{summary_row}\"] = f\"=SUM(B2:B{last_data_row})\"\nws[f\"C{summary_row}\"] = f\"=SUM(C2:C{last_data_row})\"\n\n# Profit margin column (new column D)\nws[\"D1\"] = \"Margin\"\nfor row in range(2, last_data_row + 1):\n    ws[f\"D{row}\"] = f\"=(B{row}-C{row})\u002FB{row}\"\n\n# AVERAGE in summary row\nws[f\"D{summary_row}\"] = f\"=AVERAGE(D2:D{last_data_row})\"\n\n# Labels\nws[f\"A{summary_row}\"] = \"Total \u002F Avg\"\n\nwb.save(WORKBOOK)\nprint(f\"Formulas written to row {summary_row} and column 
D.\")\n",[30,149874,149875,149879,149889,149895,149899,149911,149915,149921,149933,149943,149969,149973,149985,149989,150002,150019,150023,150028,150061,150094,150098,150103,150117,150140,150191,150195,150200,150233,150237,150242,150264,150268,150276],{"__ignoreMap":28},[33,149876,149877],{"class":35,"line":36},[33,149878,98209],{"class":39},[33,149880,149881,149883,149885,149887],{"class":35,"line":43},[33,149882,190],{"class":163},[33,149884,193],{"class":167},[33,149886,164],{"class":163},[33,149888,198],{"class":167},[33,149890,149891,149893],{"class":35,"line":61},[33,149892,164],{"class":163},[33,149894,95887],{"class":167},[33,149896,149897],{"class":35,"line":73},[33,149898,92],{"emptyLinePlaceholder":91},[33,149900,149901,149903,149905,149907,149909],{"class":35,"line":88},[33,149902,126138],{"class":50},[33,149904,212],{"class":163},[33,149906,215],{"class":167},[33,149908,149709],{"class":54},[33,149910,221],{"class":167},[33,149912,149913],{"class":35,"line":95},[33,149914,92],{"emptyLinePlaceholder":91},[33,149916,149917,149919],{"class":35,"line":101},[33,149918,35574],{"class":163},[33,149920,574],{"class":167},[33,149922,149923,149925,149927,149929,149931],{"class":35,"line":171},[33,149924,17432],{"class":167},[33,149926,242],{"class":163},[33,149928,144711],{"class":167},[33,149930,126138],{"class":50},[33,149932,221],{"class":167},[33,149934,149935,149937,149939,149941],{"class":35,"line":179},[33,149936,35726],{"class":163},[33,149938,2945],{"class":50},[33,149940,1852],{"class":163},[33,149942,1855],{"class":167},[33,149944,149945,149947,149949,149951,149953,149955,149957,149959,149961,149963,149965,149967],{"class":35,"line":187},[33,149946,35742],{"class":163},[33,149948,16617],{"class":50},[33,149950,602],{"class":167},[33,149952,4059],{"class":163},[33,149954,15677],{"class":54},[33,149956,1115],{"class":50},[33,149958,6565],{"class":167},[33,149960,1121],{"class":50},[33,149962,274],{"class":54},[33,149964,1649],{"class":167},[33,149966,190
],{"class":163},[33,149968,20843],{"class":167},[33,149970,149971],{"class":35,"line":201},[33,149972,92],{"emptyLinePlaceholder":91},[33,149974,149975,149977,149979,149981,149983],{"class":35,"line":206},[33,149976,98330],{"class":167},[33,149978,242],{"class":163},[33,149980,17447],{"class":167},[33,149982,140420],{"class":54},[33,149984,9202],{"class":167},[33,149986,149987],{"class":35,"line":224},[33,149988,92],{"emptyLinePlaceholder":91},[33,149990,149991,149994,149996,149999],{"class":35,"line":229},[33,149992,149993],{"class":167},"last_data_row ",[33,149995,242],{"class":163},[33,149997,149998],{"class":167}," ws.max_row          ",[33,150000,150001],{"class":39},"# 7 (header on row 1, data rows 2-7)\n",[33,150003,150004,150007,150009,150012,150014,150016],{"class":35,"line":235},[33,150005,150006],{"class":167},"summary_row   ",[33,150008,242],{"class":163},[33,150010,150011],{"class":167}," last_data_row ",[33,150013,1811],{"class":163},[33,150015,7451],{"class":50},[33,150017,150018],{"class":39},"   # leave a blank row\n",[33,150020,150021],{"class":35,"line":250},[33,150022,92],{"emptyLinePlaceholder":91},[33,150024,150025],{"class":35,"line":266},[33,150026,150027],{"class":39},"# SUM 
formulas\n",[33,150029,150030,150032,150034,150036,150038,150040,150042,150044,150046,150048,150050,150052,150054,150057,150059],{"class":35,"line":290},[33,150031,148071],{"class":167},[33,150033,4059],{"class":163},[33,150035,148101],{"class":54},[33,150037,1115],{"class":50},[33,150039,148081],{"class":167},[33,150041,1121],{"class":50},[33,150043,274],{"class":54},[33,150045,763],{"class":167},[33,150047,242],{"class":163},[33,150049,1110],{"class":163},[33,150051,106716],{"class":54},[33,150053,1115],{"class":50},[33,150055,150056],{"class":167},"last_data_row",[33,150058,1121],{"class":50},[33,150060,17841],{"class":54},[33,150062,150063,150065,150067,150069,150071,150073,150075,150077,150079,150081,150083,150086,150088,150090,150092],{"class":35,"line":295},[33,150064,148071],{"class":167},[33,150066,4059],{"class":163},[33,150068,148128],{"class":54},[33,150070,1115],{"class":50},[33,150072,148081],{"class":167},[33,150074,1121],{"class":50},[33,150076,274],{"class":54},[33,150078,763],{"class":167},[33,150080,242],{"class":163},[33,150082,1110],{"class":163},[33,150084,150085],{"class":54},"\"=SUM(C2:C",[33,150087,1115],{"class":50},[33,150089,150056],{"class":167},[33,150091,1121],{"class":50},[33,150093,17841],{"class":54},[33,150095,150096],{"class":35,"line":300},[33,150097,92],{"emptyLinePlaceholder":91},[33,150099,150100],{"class":35,"line":317},[33,150101,150102],{"class":39},"# Profit margin column (new column D)\n",[33,150104,150105,150107,150110,150112,150114],{"class":35,"line":332},[33,150106,148071],{"class":167},[33,150108,150109],{"class":54},"\"D1\"",[33,150111,763],{"class":167},[33,150113,242],{"class":163},[33,150115,150116],{"class":54}," 
\"Margin\"\n",[33,150118,150119,150121,150123,150125,150127,150129,150131,150134,150136,150138],{"class":35,"line":347},[33,150120,6124],{"class":163},[33,150122,3844],{"class":167},[33,150124,662],{"class":163},[33,150126,1801],{"class":50},[33,150128,602],{"class":167},[33,150130,1533],{"class":50},[33,150132,150133],{"class":167},", last_data_row ",[33,150135,1811],{"class":163},[33,150137,1814],{"class":50},[33,150139,1737],{"class":167},[33,150141,150142,150144,150146,150148,150150,150152,150154,150156,150158,150160,150162,150165,150167,150169,150171,150174,150176,150178,150180,150183,150185,150187,150189],{"class":35,"line":374},[33,150143,99896],{"class":167},[33,150145,4059],{"class":163},[33,150147,148152],{"class":54},[33,150149,1115],{"class":50},[33,150151,98107],{"class":167},[33,150153,1121],{"class":50},[33,150155,274],{"class":54},[33,150157,763],{"class":167},[33,150159,242],{"class":163},[33,150161,1110],{"class":163},[33,150163,150164],{"class":54},"\"=(B",[33,150166,1115],{"class":50},[33,150168,98107],{"class":167},[33,150170,1121],{"class":50},[33,150172,150173],{"class":54},"-C",[33,150175,1115],{"class":50},[33,150177,98107],{"class":167},[33,150179,1121],{"class":50},[33,150181,150182],{"class":54},")\u002FB",[33,150184,1115],{"class":50},[33,150186,98107],{"class":167},[33,150188,1121],{"class":50},[33,150190,7504],{"class":54},[33,150192,150193],{"class":35,"line":397},[33,150194,92],{"emptyLinePlaceholder":91},[33,150196,150197],{"class":35,"line":653},[33,150198,150199],{"class":39},"# AVERAGE in summary 
row\n",[33,150201,150202,150204,150206,150208,150210,150212,150214,150216,150218,150220,150222,150225,150227,150229,150231],{"class":35,"line":667},[33,150203,148071],{"class":167},[33,150205,4059],{"class":163},[33,150207,148152],{"class":54},[33,150209,1115],{"class":50},[33,150211,148081],{"class":167},[33,150213,1121],{"class":50},[33,150215,274],{"class":54},[33,150217,763],{"class":167},[33,150219,242],{"class":163},[33,150221,1110],{"class":163},[33,150223,150224],{"class":54},"\"=AVERAGE(D2:D",[33,150226,1115],{"class":50},[33,150228,150056],{"class":167},[33,150230,1121],{"class":50},[33,150232,17841],{"class":54},[33,150234,150235],{"class":35,"line":675},[33,150236,92],{"emptyLinePlaceholder":91},[33,150238,150239],{"class":35,"line":689},[33,150240,150241],{"class":39},"# Labels\n",[33,150243,150244,150246,150248,150250,150252,150254,150256,150258,150260,150262],{"class":35,"line":703},[33,150245,148071],{"class":167},[33,150247,4059],{"class":163},[33,150249,148076],{"class":54},[33,150251,1115],{"class":50},[33,150253,148081],{"class":167},[33,150255,1121],{"class":50},[33,150257,274],{"class":54},[33,150259,763],{"class":167},[33,150261,242],{"class":163},[33,150263,148092],{"class":54},[33,150265,150266],{"class":35,"line":714},[33,150267,92],{"emptyLinePlaceholder":91},[33,150269,150270,150272,150274],{"class":35,"line":723},[33,150271,100907],{"class":167},[33,150273,126138],{"class":50},[33,150275,221],{"class":167},[33,150277,150278,150280,150282,150284,150287,150289,150291,150293,150296],{"class":35,"line":754},[33,150279,13474],{"class":50},[33,150281,602],{"class":167},[33,150283,4059],{"class":163},[33,150285,150286],{"class":54},"\"Formulas written to row ",[33,150288,1115],{"class":50},[33,150290,148081],{"class":167},[33,150292,1121],{"class":50},[33,150294,150295],{"class":54}," and column D.\"",[33,150297,221],{"class":167},[14,150299,150300],{},"Key 
points:",[4211,150302,150303,150308,150311],{},[4214,150304,150305,150306,3035],{},"Formulas use standard Excel syntax — start every formula with ",[30,150307,242],{},[4214,150309,150310],{},"Use f-strings to keep row references dynamic rather than hard-coding row numbers.",[4214,150312,150313,150314,150316,150317,150319],{},"openpyxl does ",[1974,150315,7999],{}," calculate formula results; read the file back with ",[30,150318,107326],{}," to retrieve the formula string, or open it in Excel\u002FLibreOffice once to force evaluation.",[2537,150321],{},[18,150323,150325],{"id":150324},"step-3-named-ranges","Step 3 — Named ranges",[14,150327,150328,150329,49047,150331,150334,150335,3035],{},"Named ranges let formulas use ",[30,150330,95834],{},[30,150332,150333],{},"Sales!$B$2:$B$7",". Define them via ",[30,150336,150337],{},"wb.defined_names",[23,150339,150341],{"className":126,"code":150340,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\nfrom openpyxl.workbook.defined_name import DefinedName\nfrom openpyxl.utils import quote_sheetname, absolute_coordinate\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\nlast_data_row = 7   # adjust if data length changes\n\n# Build an absolute reference string: 'Sales'!$B$2:$B$7\nsheet_ref = quote_sheetname(ws.title)\nrev_range  = absolute_coordinate(f\"B2:B{last_data_row}\")\ncost_range = absolute_coordinate(f\"C2:C{last_data_row}\")\n\nwb.defined_names[\"Revenue\"] = DefinedName(\"Revenue\", attr_text=f\"{sheet_ref}!{rev_range}\")\nwb.defined_names[\"Cost\"]    = DefinedName(\"Cost\",    attr_text=f\"{sheet_ref}!{cost_range}\")\n\n# Now use named ranges in formulas\nws[\"B10\"] = \"=SUM(Revenue)\"\nws[\"C10\"] = \"=SUM(Cost)\"\n\nwb.save(WORKBOOK)\nprint(\"Named ranges 'Revenue' and 'Cost' 
defined.\")\n",[30,150342,150343,150347,150357,150363,150375,150386,150390,150402,150406,150412,150424,150434,150460,150464,150476,150488,150492,150497,150507,150531,150554,150558,150606,150649,150653,150658,150672,150686,150690,150698],{"__ignoreMap":28},[33,150344,150345],{"class":35,"line":36},[33,150346,98209],{"class":39},[33,150348,150349,150351,150353,150355],{"class":35,"line":43},[33,150350,190],{"class":163},[33,150352,193],{"class":167},[33,150354,164],{"class":163},[33,150356,198],{"class":167},[33,150358,150359,150361],{"class":35,"line":61},[33,150360,164],{"class":163},[33,150362,95887],{"class":167},[33,150364,150365,150367,150370,150372],{"class":35,"line":73},[33,150366,190],{"class":163},[33,150368,150369],{"class":167}," openpyxl.workbook.defined_name ",[33,150371,164],{"class":163},[33,150373,150374],{"class":167}," DefinedName\n",[33,150376,150377,150379,150381,150383],{"class":35,"line":88},[33,150378,190],{"class":163},[33,150380,17127],{"class":167},[33,150382,164],{"class":163},[33,150384,150385],{"class":167}," quote_sheetname, 
absolute_coordinate\n",[33,150387,150388],{"class":35,"line":95},[33,150389,92],{"emptyLinePlaceholder":91},[33,150391,150392,150394,150396,150398,150400],{"class":35,"line":101},[33,150393,126138],{"class":50},[33,150395,212],{"class":163},[33,150397,215],{"class":167},[33,150399,149709],{"class":54},[33,150401,221],{"class":167},[33,150403,150404],{"class":35,"line":171},[33,150405,92],{"emptyLinePlaceholder":91},[33,150407,150408,150410],{"class":35,"line":179},[33,150409,35574],{"class":163},[33,150411,574],{"class":167},[33,150413,150414,150416,150418,150420,150422],{"class":35,"line":187},[33,150415,17432],{"class":167},[33,150417,242],{"class":163},[33,150419,144711],{"class":167},[33,150421,126138],{"class":50},[33,150423,221],{"class":167},[33,150425,150426,150428,150430,150432],{"class":35,"line":201},[33,150427,35726],{"class":163},[33,150429,2945],{"class":50},[33,150431,1852],{"class":163},[33,150433,1855],{"class":167},[33,150435,150436,150438,150440,150442,150444,150446,150448,150450,150452,150454,150456,150458],{"class":35,"line":206},[33,150437,35742],{"class":163},[33,150439,16617],{"class":50},[33,150441,602],{"class":167},[33,150443,4059],{"class":163},[33,150445,15677],{"class":54},[33,150447,1115],{"class":50},[33,150449,6565],{"class":167},[33,150451,1121],{"class":50},[33,150453,274],{"class":54},[33,150455,1649],{"class":167},[33,150457,190],{"class":163},[33,150459,20843],{"class":167},[33,150461,150462],{"class":35,"line":224},[33,150463,92],{"emptyLinePlaceholder":91},[33,150465,150466,150468,150470,150472,150474],{"class":35,"line":229},[33,150467,98330],{"class":167},[33,150469,242],{"class":163},[33,150471,17447],{"class":167},[33,150473,140420],{"class":54},[33,150475,9202],{"class":167},[33,150477,150478,150480,150482,150485],{"class":35,"line":235},[33,150479,149993],{"class":167},[33,150481,242],{"class":163},[33,150483,150484],{"class":50}," 7",[33,150486,150487],{"class":39},"   # adjust if data length 
changes\n",[33,150489,150490],{"class":35,"line":250},[33,150491,92],{"emptyLinePlaceholder":91},[33,150493,150494],{"class":35,"line":266},[33,150495,150496],{"class":39},"# Build an absolute reference string: 'Sales'!$B$2:$B$7\n",[33,150498,150499,150502,150504],{"class":35,"line":290},[33,150500,150501],{"class":167},"sheet_ref ",[33,150503,242],{"class":163},[33,150505,150506],{"class":167}," quote_sheetname(ws.title)\n",[33,150508,150509,150512,150514,150517,150519,150521,150523,150525,150527,150529],{"class":35,"line":295},[33,150510,150511],{"class":167},"rev_range  ",[33,150513,242],{"class":163},[33,150515,150516],{"class":167}," absolute_coordinate(",[33,150518,4059],{"class":163},[33,150520,98615],{"class":54},[33,150522,1115],{"class":50},[33,150524,150056],{"class":167},[33,150526,1121],{"class":50},[33,150528,274],{"class":54},[33,150530,221],{"class":167},[33,150532,150533,150536,150538,150540,150542,150544,150546,150548,150550,150552],{"class":35,"line":300},[33,150534,150535],{"class":167},"cost_range ",[33,150537,242],{"class":163},[33,150539,150516],{"class":167},[33,150541,4059],{"class":163},[33,150543,104381],{"class":54},[33,150545,1115],{"class":50},[33,150547,150056],{"class":167},[33,150549,1121],{"class":50},[33,150551,274],{"class":54},[33,150553,221],{"class":167},[33,150555,150556],{"class":35,"line":317},[33,150557,92],{"emptyLinePlaceholder":91},[33,150559,150560,150563,150565,150567,150569,150572,150574,150576,150579,150581,150583,150585,150587,150590,150592,150595,150597,150600,150602,150604],{"class":35,"line":332},[33,150561,150562],{"class":167},"wb.defined_names[",[33,150564,12925],{"class":54},[33,150566,763],{"class":167},[33,150568,242],{"class":163},[33,150570,150571],{"class":167}," 
DefinedName(",[33,150573,12925],{"class":54},[33,150575,365],{"class":167},[33,150577,150578],{"class":238},"attr_text",[33,150580,242],{"class":163},[33,150582,4059],{"class":163},[33,150584,274],{"class":54},[33,150586,1115],{"class":50},[33,150588,150589],{"class":167},"sheet_ref",[33,150591,1121],{"class":50},[33,150593,150594],{"class":54},"!",[33,150596,1115],{"class":50},[33,150598,150599],{"class":167},"rev_range",[33,150601,1121],{"class":50},[33,150603,274],{"class":54},[33,150605,221],{"class":167},[33,150607,150608,150610,150612,150614,150616,150618,150620,150622,150624,150626,150628,150630,150632,150634,150636,150638,150640,150643,150645,150647],{"class":35,"line":347},[33,150609,150562],{"class":167},[33,150611,19593],{"class":54},[33,150613,96251],{"class":167},[33,150615,242],{"class":163},[33,150617,150571],{"class":167},[33,150619,19593],{"class":54},[33,150621,38342],{"class":167},[33,150623,150578],{"class":238},[33,150625,242],{"class":163},[33,150627,4059],{"class":163},[33,150629,274],{"class":54},[33,150631,1115],{"class":50},[33,150633,150589],{"class":167},[33,150635,1121],{"class":50},[33,150637,150594],{"class":54},[33,150639,1115],{"class":50},[33,150641,150642],{"class":167},"cost_range",[33,150644,1121],{"class":50},[33,150646,274],{"class":54},[33,150648,221],{"class":167},[33,150650,150651],{"class":35,"line":374},[33,150652,92],{"emptyLinePlaceholder":91},[33,150654,150655],{"class":35,"line":397},[33,150656,150657],{"class":39},"# Now use named ranges in formulas\n",[33,150659,150660,150662,150665,150667,150669],{"class":35,"line":653},[33,150661,148071],{"class":167},[33,150663,150664],{"class":54},"\"B10\"",[33,150666,763],{"class":167},[33,150668,242],{"class":163},[33,150670,150671],{"class":54}," 
\"=SUM(Revenue)\"\n",[33,150673,150674,150676,150679,150681,150683],{"class":35,"line":667},[33,150675,148071],{"class":167},[33,150677,150678],{"class":54},"\"C10\"",[33,150680,763],{"class":167},[33,150682,242],{"class":163},[33,150684,150685],{"class":54}," \"=SUM(Cost)\"\n",[33,150687,150688],{"class":35,"line":675},[33,150689,92],{"emptyLinePlaceholder":91},[33,150691,150692,150694,150696],{"class":35,"line":689},[33,150693,100907],{"class":167},[33,150695,126138],{"class":50},[33,150697,221],{"class":167},[33,150699,150700,150702,150704,150707],{"class":35,"line":703},[33,150701,13474],{"class":50},[33,150703,602],{"class":167},[33,150705,150706],{"class":54},"\"Named ranges 'Revenue' and 'Cost' defined.\"",[33,150708,221],{"class":167},[14,150710,150711,150714,150715,150718,150719,36661,150722,3035],{},[30,150712,150713],{},"quote_sheetname"," wraps the sheet name in single quotes if it contains spaces. ",[30,150716,150717],{},"absolute_coordinate"," converts ",[30,150720,150721],{},"B2:B7",[30,150723,150724],{},"$B$2:$B$7",[2537,150726],{},[18,150728,150730],{"id":150729},"step-4-number-formats","Step 4 — Number formats",[14,150732,150733,150734,150737],{},"Number formats are stored as strings on the cell's ",[30,150735,150736],{},"number_format"," attribute. 
They follow Excel's custom format syntax.",[23,150739,150741],{"className":126,"code":150740,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\n\n# Currency for revenue and cost columns\ncurrency_fmt = '\"$\"#,##0.00'\nfor row in range(2, ws.max_row + 1):\n    if ws[f\"B{row}\"].value is not None:\n        ws[f\"B{row}\"].number_format = currency_fmt\n    if ws[f\"C{row}\"].value is not None:\n        ws[f\"C{row}\"].number_format = currency_fmt\n\n# Percentage for margin column\npct_fmt = \"0.0%\"\nfor row in range(2, ws.max_row + 1):\n    if ws[f\"D{row}\"].value is not None:\n        ws[f\"D{row}\"].number_format = pct_fmt\n\nwb.save(WORKBOOK)\nprint(\"Number formats applied.\")\n",[30,150742,150743,150747,150757,150763,150767,150779,150783,150789,150801,150811,150837,150841,150853,150857,150862,150871,150894,150923,150947,150975,150997,151001,151006,151015,151037,151065,151088,151092,151100],{"__ignoreMap":28},[33,150744,150745],{"class":35,"line":36},[33,150746,98209],{"class":39},[33,150748,150749,150751,150753,150755],{"class":35,"line":43},[33,150750,190],{"class":163},[33,150752,193],{"class":167},[33,150754,164],{"class":163},[33,150756,198],{"class":167},[33,150758,150759,150761],{"class":35,"line":61},[33,150760,164],{"class":163},[33,150762,95887],{"class":167},[33,150764,150765],{"class":35,"line":73},[33,150766,92],{"emptyLinePlaceholder":91},[33,150768,150769,150771,150773,150775,150777],{"class":35,"line":88},[33,150770,126138],{"class":50},[33,150772,212],{"class":163},[33,150774,215],{"class":167},[33,150776,149709],{"class":54},[33,150778,221],{"class":167},[33,150780,150781],{"class":35,"line":95},[33,150782,92],{"emptyLinePlaceholder":91},[33,150784,150785,150787],{"class":35,"line":101},
[33,150786,35574],{"class":163},[33,150788,574],{"class":167},[33,150790,150791,150793,150795,150797,150799],{"class":35,"line":171},[33,150792,17432],{"class":167},[33,150794,242],{"class":163},[33,150796,144711],{"class":167},[33,150798,126138],{"class":50},[33,150800,221],{"class":167},[33,150802,150803,150805,150807,150809],{"class":35,"line":179},[33,150804,35726],{"class":163},[33,150806,2945],{"class":50},[33,150808,1852],{"class":163},[33,150810,1855],{"class":167},[33,150812,150813,150815,150817,150819,150821,150823,150825,150827,150829,150831,150833,150835],{"class":35,"line":187},[33,150814,35742],{"class":163},[33,150816,16617],{"class":50},[33,150818,602],{"class":167},[33,150820,4059],{"class":163},[33,150822,15677],{"class":54},[33,150824,1115],{"class":50},[33,150826,6565],{"class":167},[33,150828,1121],{"class":50},[33,150830,274],{"class":54},[33,150832,1649],{"class":167},[33,150834,190],{"class":163},[33,150836,20843],{"class":167},[33,150838,150839],{"class":35,"line":201},[33,150840,92],{"emptyLinePlaceholder":91},[33,150842,150843,150845,150847,150849,150851],{"class":35,"line":206},[33,150844,98330],{"class":167},[33,150846,242],{"class":163},[33,150848,17447],{"class":167},[33,150850,140420],{"class":54},[33,150852,9202],{"class":167},[33,150854,150855],{"class":35,"line":224},[33,150856,92],{"emptyLinePlaceholder":91},[33,150858,150859],{"class":35,"line":229},[33,150860,150861],{"class":39},"# Currency for revenue and cost columns\n",[33,150863,150864,150867,150869],{"class":35,"line":235},[33,150865,150866],{"class":167},"currency_fmt ",[33,150868,242],{"class":163},[33,150870,17685],{"class":54},[33,150872,150873,150875,150877,150879,150881,150883,150885,150888,150890,150892],{"class":35,"line":250},[33,150874,6124],{"class":163},[33,150876,3844],{"class":167},[33,150878,662],{"class":163},[33,150880,1801],{"class":50},[33,150882,602],{"class":167},[33,150884,1533],{"class":50},[33,150886,150887],{"class":167},", ws.max_row 
",[33,150889,1811],{"class":163},[33,150891,1814],{"class":50},[33,150893,1737],{"class":167},[33,150895,150896,150898,150900,150902,150904,150906,150908,150910,150912,150915,150917,150919,150921],{"class":35,"line":266},[33,150897,617],{"class":163},[33,150899,17472],{"class":167},[33,150901,4059],{"class":163},[33,150903,148101],{"class":54},[33,150905,1115],{"class":50},[33,150907,98107],{"class":167},[33,150909,1121],{"class":50},[33,150911,274],{"class":54},[33,150913,150914],{"class":167},"].value ",[33,150916,3847],{"class":163},[33,150918,620],{"class":163},[33,150920,7657],{"class":50},[33,150922,574],{"class":167},[33,150924,150925,150928,150930,150932,150934,150936,150938,150940,150942,150944],{"class":35,"line":290},[33,150926,150927],{"class":167},"        ws[",[33,150929,4059],{"class":163},[33,150931,148101],{"class":54},[33,150933,1115],{"class":50},[33,150935,98107],{"class":167},[33,150937,1121],{"class":50},[33,150939,274],{"class":54},[33,150941,148186],{"class":167},[33,150943,242],{"class":163},[33,150945,150946],{"class":167}," 
currency_fmt\n",[33,150948,150949,150951,150953,150955,150957,150959,150961,150963,150965,150967,150969,150971,150973],{"class":35,"line":295},[33,150950,617],{"class":163},[33,150952,17472],{"class":167},[33,150954,4059],{"class":163},[33,150956,148128],{"class":54},[33,150958,1115],{"class":50},[33,150960,98107],{"class":167},[33,150962,1121],{"class":50},[33,150964,274],{"class":54},[33,150966,150914],{"class":167},[33,150968,3847],{"class":163},[33,150970,620],{"class":163},[33,150972,7657],{"class":50},[33,150974,574],{"class":167},[33,150976,150977,150979,150981,150983,150985,150987,150989,150991,150993,150995],{"class":35,"line":300},[33,150978,150927],{"class":167},[33,150980,4059],{"class":163},[33,150982,148128],{"class":54},[33,150984,1115],{"class":50},[33,150986,98107],{"class":167},[33,150988,1121],{"class":50},[33,150990,274],{"class":54},[33,150992,148186],{"class":167},[33,150994,242],{"class":163},[33,150996,150946],{"class":167},[33,150998,150999],{"class":35,"line":317},[33,151000,92],{"emptyLinePlaceholder":91},[33,151002,151003],{"class":35,"line":332},[33,151004,151005],{"class":39},"# Percentage for margin column\n",[33,151007,151008,151011,151013],{"class":35,"line":347},[33,151009,151010],{"class":167},"pct_fmt 
",[33,151012,242],{"class":163},[33,151014,148236],{"class":54},[33,151016,151017,151019,151021,151023,151025,151027,151029,151031,151033,151035],{"class":35,"line":374},[33,151018,6124],{"class":163},[33,151020,3844],{"class":167},[33,151022,662],{"class":163},[33,151024,1801],{"class":50},[33,151026,602],{"class":167},[33,151028,1533],{"class":50},[33,151030,150887],{"class":167},[33,151032,1811],{"class":163},[33,151034,1814],{"class":50},[33,151036,1737],{"class":167},[33,151038,151039,151041,151043,151045,151047,151049,151051,151053,151055,151057,151059,151061,151063],{"class":35,"line":397},[33,151040,617],{"class":163},[33,151042,17472],{"class":167},[33,151044,4059],{"class":163},[33,151046,148152],{"class":54},[33,151048,1115],{"class":50},[33,151050,98107],{"class":167},[33,151052,1121],{"class":50},[33,151054,274],{"class":54},[33,151056,150914],{"class":167},[33,151058,3847],{"class":163},[33,151060,620],{"class":163},[33,151062,7657],{"class":50},[33,151064,574],{"class":167},[33,151066,151067,151069,151071,151073,151075,151077,151079,151081,151083,151085],{"class":35,"line":653},[33,151068,150927],{"class":167},[33,151070,4059],{"class":163},[33,151072,148152],{"class":54},[33,151074,1115],{"class":50},[33,151076,98107],{"class":167},[33,151078,1121],{"class":50},[33,151080,274],{"class":54},[33,151082,148186],{"class":167},[33,151084,242],{"class":163},[33,151086,151087],{"class":167}," pct_fmt\n",[33,151089,151090],{"class":35,"line":667},[33,151091,92],{"emptyLinePlaceholder":91},[33,151093,151094,151096,151098],{"class":35,"line":675},[33,151095,100907],{"class":167},[33,151097,126138],{"class":50},[33,151099,221],{"class":167},[33,151101,151102,151104,151106,151109],{"class":35,"line":689},[33,151103,13474],{"class":50},[33,151105,602],{"class":167},[33,151107,151108],{"class":54},"\"Number formats applied.\"",[33,151110,221],{"class":167},[14,151112,151113],{},"Common format 
strings:",[4273,151115,151116,151126],{},[4276,151117,151118],{},[4279,151119,151120,151123],{},[4282,151121,151122],{},"Purpose",[4282,151124,151125],{},"Format string",[4292,151127,151128,151138,151148,151158,151167,151177],{},[4279,151129,151130,151133],{},[4297,151131,151132],{},"Currency USD",[4297,151134,151135],{},[30,151136,151137],{},"\"$\"#,##0.00",[4279,151139,151140,151143],{},[4297,151141,151142],{},"Percentage",[4297,151144,151145],{},[30,151146,151147],{},"0.0%",[4279,151149,151150,151153],{},[4297,151151,151152],{},"Integer with thousands",[4297,151154,151155],{},[30,151156,151157],{},"#,##0",[4279,151159,151160,151163],{},[4297,151161,151162],{},"ISO date",[4297,151164,151165],{},[30,151166,97759],{},[4279,151168,151169,151172],{},[4297,151170,151171],{},"Date + time",[4297,151173,151174],{},[30,151175,151176],{},"YYYY-MM-DD HH:MM:SS",[4279,151178,151179,151182],{},[4297,151180,151181],{},"Scientific",[4297,151183,151184],{},[30,151185,151186],{},"0.00E+00",[2537,151188],{},[18,151190,151192],{"id":151191},"step-5-barchart","Step 5 — BarChart",[14,151194,151195,151196,151198,151199,10065,151202,151205,151206,3035],{},"The chart API always follows the same pattern: create a chart object, create ",[30,151197,149557],{}," objects for the data and categories, call ",[30,151200,151201],{},"add_data",[30,151203,151204],{},"set_categories",", then call ",[30,151207,151208],{},"ws.add_chart",[23,151210,151212],{"className":126,"code":151211,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\nfrom openpyxl.chart import BarChart, Reference\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\n\nchart = BarChart()\nchart.type    = \"col\"          # vertical bars; \"bar\" = horizontal\nchart.grouping = \"clustered\"   # or \"stacked\", 
\"percentStacked\"\nchart.title   = \"Monthly Revenue vs Cost\"\nchart.y_axis.title = \"Amount (USD)\"\nchart.x_axis.title = \"Month\"\nchart.style   = 10             # built-in Excel style 1-48\n\n# Data reference: columns B and C, rows 1-7 (row 1 = header, used as series title)\ndata_ref = Reference(ws, min_col=2, max_col=3, min_row=1, max_row=ws.max_row)\nchart.add_data(data_ref, titles_from_data=True)\n\n# Category labels: column A, rows 2-7 (skip header)\ncats_ref = Reference(ws, min_col=1, min_row=2, max_row=ws.max_row)\nchart.set_categories(cats_ref)\n\n# Size is in centimetres (openpyxl default: 15 x 7.5 cm)\nchart.width  = 20   # cm\nchart.height = 12   # cm\n\nws.add_chart(chart, \"F2\")   # anchor top-left corner at F2\n\nwb.save(WORKBOOK)\nprint(\"BarChart written.\")\n",[30,151213,151214,151218,151228,151234,151246,151250,151262,151266,151272,151284,151294,151320,151324,151336,151340,151350,151363,151376,151386,151396,151406,151418,151422,151427,151468,151482,151486,151491,151522,151527,151531,151536,151548,151559,151563,151575,151579,151587],{"__ignoreMap":28},[33,151215,151216],{"class":35,"line":36},[33,151217,98209],{"class":39},[33,151219,151220,151222,151224,151226],{"class":35,"line":43},[33,151221,190],{"class":163},[33,151223,193],{"class":167},[33,151225,164],{"class":163},[33,151227,198],{"class":167},[33,151229,151230,151232],{"class":35,"line":61},[33,151231,164],{"class":163},[33,151233,95887],{"class":167},[33,151235,151236,151238,151241,151243],{"class":35,"line":73},[33,151237,190],{"class":163},[33,151239,151240],{"class":167}," openpyxl.chart ",[33,151242,164],{"class":163},[33,151244,151245],{"class":167}," BarChart, 
Reference\n",[33,151247,151248],{"class":35,"line":88},[33,151249,92],{"emptyLinePlaceholder":91},[33,151251,151252,151254,151256,151258,151260],{"class":35,"line":95},[33,151253,126138],{"class":50},[33,151255,212],{"class":163},[33,151257,215],{"class":167},[33,151259,149709],{"class":54},[33,151261,221],{"class":167},[33,151263,151264],{"class":35,"line":101},[33,151265,92],{"emptyLinePlaceholder":91},[33,151267,151268,151270],{"class":35,"line":171},[33,151269,35574],{"class":163},[33,151271,574],{"class":167},[33,151273,151274,151276,151278,151280,151282],{"class":35,"line":179},[33,151275,17432],{"class":167},[33,151277,242],{"class":163},[33,151279,144711],{"class":167},[33,151281,126138],{"class":50},[33,151283,221],{"class":167},[33,151285,151286,151288,151290,151292],{"class":35,"line":187},[33,151287,35726],{"class":163},[33,151289,2945],{"class":50},[33,151291,1852],{"class":163},[33,151293,1855],{"class":167},[33,151295,151296,151298,151300,151302,151304,151306,151308,151310,151312,151314,151316,151318],{"class":35,"line":201},[33,151297,35742],{"class":163},[33,151299,16617],{"class":50},[33,151301,602],{"class":167},[33,151303,4059],{"class":163},[33,151305,15677],{"class":54},[33,151307,1115],{"class":50},[33,151309,6565],{"class":167},[33,151311,1121],{"class":50},[33,151313,274],{"class":54},[33,151315,1649],{"class":167},[33,151317,190],{"class":163},[33,151319,20843],{"class":167},[33,151321,151322],{"class":35,"line":206},[33,151323,92],{"emptyLinePlaceholder":91},[33,151325,151326,151328,151330,151332,151334],{"class":35,"line":224},[33,151327,98330],{"class":167},[33,151329,242],{"class":163},[33,151331,17447],{"class":167},[33,151333,140420],{"class":54},[33,151335,9202],{"class":167},[33,151337,151338],{"class":35,"line":229},[33,151339,92],{"emptyLinePlaceholder":91},[33,151341,151342,151345,151347],{"class":35,"line":235},[33,151343,151344],{"class":167},"chart ",[33,151346,242],{"class":163},[33,151348,151349],{"class":167}," 
BarChart()\n",[33,151351,151352,151355,151357,151360],{"class":35,"line":250},[33,151353,151354],{"class":167},"chart.type    ",[33,151356,242],{"class":163},[33,151358,151359],{"class":54}," \"col\"",[33,151361,151362],{"class":39},"          # vertical bars; \"bar\" = horizontal\n",[33,151364,151365,151368,151370,151373],{"class":35,"line":266},[33,151366,151367],{"class":167},"chart.grouping ",[33,151369,242],{"class":163},[33,151371,151372],{"class":54}," \"clustered\"",[33,151374,151375],{"class":39},"   # or \"stacked\", \"percentStacked\"\n",[33,151377,151378,151381,151383],{"class":35,"line":290},[33,151379,151380],{"class":167},"chart.title   ",[33,151382,242],{"class":163},[33,151384,151385],{"class":54}," \"Monthly Revenue vs Cost\"\n",[33,151387,151388,151391,151393],{"class":35,"line":295},[33,151389,151390],{"class":167},"chart.y_axis.title ",[33,151392,242],{"class":163},[33,151394,151395],{"class":54}," \"Amount (USD)\"\n",[33,151397,151398,151401,151403],{"class":35,"line":300},[33,151399,151400],{"class":167},"chart.x_axis.title ",[33,151402,242],{"class":163},[33,151404,151405],{"class":54}," \"Month\"\n",[33,151407,151408,151411,151413,151415],{"class":35,"line":317},[33,151409,151410],{"class":167},"chart.style   ",[33,151412,242],{"class":163},[33,151414,37265],{"class":50},[33,151416,151417],{"class":39},"             # built-in Excel style 1-48\n",[33,151419,151420],{"class":35,"line":332},[33,151421,92],{"emptyLinePlaceholder":91},[33,151423,151424],{"class":35,"line":347},[33,151425,151426],{"class":39},"# Data reference: columns B and C, rows 1-7 (row 1 = header, used as series title)\n",[33,151428,151429,151432,151434,151437,151439,151441,151443,151445,151447,151449,151451,151453,151455,151457,151459,151461,151463,151465],{"class":35,"line":374},[33,151430,151431],{"class":167},"data_ref ",[33,151433,242],{"class":163},[33,151435,151436],{"class":167}," Reference(ws, 
",[33,151438,17651],{"class":238},[33,151440,242],{"class":163},[33,151442,1533],{"class":50},[33,151444,365],{"class":167},[33,151446,17659],{"class":238},[33,151448,242],{"class":163},[33,151450,10258],{"class":50},[33,151452,365],{"class":167},[33,151454,17642],{"class":238},[33,151456,242],{"class":163},[33,151458,734],{"class":50},[33,151460,365],{"class":167},[33,151462,97398],{"class":238},[33,151464,242],{"class":163},[33,151466,151467],{"class":167},"ws.max_row)\n",[33,151469,151470,151473,151476,151478,151480],{"class":35,"line":397},[33,151471,151472],{"class":167},"chart.add_data(data_ref, ",[33,151474,151475],{"class":238},"titles_from_data",[33,151477,242],{"class":163},[33,151479,855],{"class":50},[33,151481,221],{"class":167},[33,151483,151484],{"class":35,"line":653},[33,151485,92],{"emptyLinePlaceholder":91},[33,151487,151488],{"class":35,"line":667},[33,151489,151490],{"class":39},"# Category labels: column A, rows 2-7 (skip header)\n",[33,151492,151493,151496,151498,151500,151502,151504,151506,151508,151510,151512,151514,151516,151518,151520],{"class":35,"line":675},[33,151494,151495],{"class":167},"cats_ref ",[33,151497,242],{"class":163},[33,151499,151436],{"class":167},[33,151501,17651],{"class":238},[33,151503,242],{"class":163},[33,151505,734],{"class":50},[33,151507,365],{"class":167},[33,151509,17642],{"class":238},[33,151511,242],{"class":163},[33,151513,1533],{"class":50},[33,151515,365],{"class":167},[33,151517,97398],{"class":238},[33,151519,242],{"class":163},[33,151521,151467],{"class":167},[33,151523,151524],{"class":35,"line":689},[33,151525,151526],{"class":167},"chart.set_categories(cats_ref)\n",[33,151528,151529],{"class":35,"line":703},[33,151530,92],{"emptyLinePlaceholder":91},[33,151532,151533],{"class":35,"line":714},[33,151534,151535],{"class":39},"# Size is in centimetres (openpyxl default: 15 x 7.5 cm)\n",[33,151537,151538,151541,151543,151545],{"class":35,"line":723},[33,151539,151540],{"class":167},"chart.width  
",[33,151542,242],{"class":163},[33,151544,43599],{"class":50},[33,151546,151547],{"class":39},"   # cm\n",[33,151549,151550,151553,151555,151557],{"class":35,"line":754},[33,151551,151552],{"class":167},"chart.height ",[33,151554,242],{"class":163},[33,151556,60774],{"class":50},[33,151558,151547],{"class":39},[33,151560,151561],{"class":35,"line":771},[33,151562,92],{"emptyLinePlaceholder":91},[33,151564,151565,151568,151570,151572],{"class":35,"line":777},[33,151566,151567],{"class":167},"ws.add_chart(chart, ",[33,151569,104865],{"class":54},[33,151571,12000],{"class":167},[33,151573,151574],{"class":39},"# anchor top-left corner at F2\n",[33,151576,151577],{"class":35,"line":788},[33,151578,92],{"emptyLinePlaceholder":91},[33,151580,151581,151583,151585],{"class":35,"line":804},[33,151582,100907],{"class":167},[33,151584,126138],{"class":50},[33,151586,221],{"class":167},[33,151588,151589,151591,151593,151596],{"class":35,"line":809},[33,151590,13474],{"class":50},[33,151592,602],{"class":167},[33,151594,151595],{"class":54},"\"BarChart written.\"",[33,151597,221],{"class":167},[14,151599,151600,151603,151604,20867,151607,151610],{},[30,151601,151602],{},"titles_from_data=True"," uses the first row of the reference as series names. 
If your data has no header row, pass ",[30,151605,151606],{},"titles_from_data=False",[30,151608,151609],{},"chart.series[i].title"," manually.",[2537,151612],{},[18,151614,151616],{"id":151615},"step-6-linechart","Step 6 — LineChart",[14,151618,151619],{},"LineChart follows identical steps; swap the chart class and adjust style options.",[23,151621,151623],{"className":126,"code":151622,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\nfrom openpyxl.chart import LineChart, Reference\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\n\nline = LineChart()\nline.title         = \"Profit Margin Over Time\"\nline.y_axis.title  = \"Margin\"\nline.x_axis.title  = \"Month\"\nline.y_axis.numFmt = \"0%\"\nline.smooth        = True   # curved lines\n\n# Margin column D (no header row offset needed — row 1 IS the header)\ndata_ref = Reference(ws, min_col=4, min_row=1, max_row=ws.max_row)\nline.add_data(data_ref, titles_from_data=True)\n\ncats_ref = Reference(ws, min_col=1, min_row=2, max_row=ws.max_row)\nline.set_categories(cats_ref)\n\nline.width  = 18\nline.height = 10\n\nws.add_chart(line, \"F20\")\n\nwb.save(WORKBOOK)\nprint(\"LineChart 
written.\")\n",[30,151624,151625,151629,151639,151645,151656,151660,151672,151676,151682,151694,151704,151730,151734,151746,151750,151760,151770,151779,151788,151798,151810,151814,151819,151849,151862,151866,151896,151901,151905,151915,151925,151929,151939,151943,151951],{"__ignoreMap":28},[33,151626,151627],{"class":35,"line":36},[33,151628,98209],{"class":39},[33,151630,151631,151633,151635,151637],{"class":35,"line":43},[33,151632,190],{"class":163},[33,151634,193],{"class":167},[33,151636,164],{"class":163},[33,151638,198],{"class":167},[33,151640,151641,151643],{"class":35,"line":61},[33,151642,164],{"class":163},[33,151644,95887],{"class":167},[33,151646,151647,151649,151651,151653],{"class":35,"line":73},[33,151648,190],{"class":163},[33,151650,151240],{"class":167},[33,151652,164],{"class":163},[33,151654,151655],{"class":167}," LineChart, Reference\n",[33,151657,151658],{"class":35,"line":88},[33,151659,92],{"emptyLinePlaceholder":91},[33,151661,151662,151664,151666,151668,151670],{"class":35,"line":95},[33,151663,126138],{"class":50},[33,151665,212],{"class":163},[33,151667,215],{"class":167},[33,151669,149709],{"class":54},[33,151671,221],{"class":167},[33,151673,151674],{"class":35,"line":101},[33,151675,92],{"emptyLinePlaceholder":91},[33,151677,151678,151680],{"class":35,"line":171},[33,151679,35574],{"class":163},[33,151681,574],{"class":167},[33,151683,151684,151686,151688,151690,151692],{"class":35,"line":179},[33,151685,17432],{"class":167},[33,151687,242],{"class":163},[33,151689,144711],{"class":167},[33,151691,126138],{"class":50},[33,151693,221],{"class":167},[33,151695,151696,151698,151700,151702],{"class":35,"line":187},[33,151697,35726],{"class":163},[33,151699,2945],{"class":50},[33,151701,1852],{"class":163},[33,151703,1855],{"class":167},[33,151705,151706,151708,151710,151712,151714,151716,151718,151720,151722,151724,151726,151728],{"class":35,"line":201},[33,151707,35742],{"class":163},[33,151709,16617],{"class":50},[33,151711,602],{"cla
ss":167},[33,151713,4059],{"class":163},[33,151715,15677],{"class":54},[33,151717,1115],{"class":50},[33,151719,6565],{"class":167},[33,151721,1121],{"class":50},[33,151723,274],{"class":54},[33,151725,1649],{"class":167},[33,151727,190],{"class":163},[33,151729,20843],{"class":167},[33,151731,151732],{"class":35,"line":206},[33,151733,92],{"emptyLinePlaceholder":91},[33,151735,151736,151738,151740,151742,151744],{"class":35,"line":224},[33,151737,98330],{"class":167},[33,151739,242],{"class":163},[33,151741,17447],{"class":167},[33,151743,140420],{"class":54},[33,151745,9202],{"class":167},[33,151747,151748],{"class":35,"line":229},[33,151749,92],{"emptyLinePlaceholder":91},[33,151751,151752,151755,151757],{"class":35,"line":235},[33,151753,151754],{"class":167},"line ",[33,151756,242],{"class":163},[33,151758,151759],{"class":167}," LineChart()\n",[33,151761,151762,151765,151767],{"class":35,"line":250},[33,151763,151764],{"class":167},"line.title         ",[33,151766,242],{"class":163},[33,151768,151769],{"class":54}," \"Profit Margin Over Time\"\n",[33,151771,151772,151775,151777],{"class":35,"line":266},[33,151773,151774],{"class":167},"line.y_axis.title  ",[33,151776,242],{"class":163},[33,151778,150116],{"class":54},[33,151780,151781,151784,151786],{"class":35,"line":290},[33,151782,151783],{"class":167},"line.x_axis.title  ",[33,151785,242],{"class":163},[33,151787,151405],{"class":54},[33,151789,151790,151793,151795],{"class":35,"line":295},[33,151791,151792],{"class":167},"line.y_axis.numFmt ",[33,151794,242],{"class":163},[33,151796,151797],{"class":54}," \"0%\"\n",[33,151799,151800,151803,151805,151807],{"class":35,"line":300},[33,151801,151802],{"class":167},"line.smooth        ",[33,151804,242],{"class":163},[33,151806,2519],{"class":50},[33,151808,151809],{"class":39},"   # curved lines\n",[33,151811,151812],{"class":35,"line":317},[33,151813,92],{"emptyLinePlaceholder":91},[33,151815,151816],{"class":35,"line":332},[33,151817,151818],{"class":39},"# 
Margin column D (no header row offset needed — row 1 IS the header)\n",[33,151820,151821,151823,151825,151827,151829,151831,151833,151835,151837,151839,151841,151843,151845,151847],{"class":35,"line":347},[33,151822,151431],{"class":167},[33,151824,242],{"class":163},[33,151826,151436],{"class":167},[33,151828,17651],{"class":238},[33,151830,242],{"class":163},[33,151832,1503],{"class":50},[33,151834,365],{"class":167},[33,151836,17642],{"class":238},[33,151838,242],{"class":163},[33,151840,734],{"class":50},[33,151842,365],{"class":167},[33,151844,97398],{"class":238},[33,151846,242],{"class":163},[33,151848,151467],{"class":167},[33,151850,151851,151854,151856,151858,151860],{"class":35,"line":374},[33,151852,151853],{"class":167},"line.add_data(data_ref, ",[33,151855,151475],{"class":238},[33,151857,242],{"class":163},[33,151859,855],{"class":50},[33,151861,221],{"class":167},[33,151863,151864],{"class":35,"line":397},[33,151865,92],{"emptyLinePlaceholder":91},[33,151867,151868,151870,151872,151874,151876,151878,151880,151882,151884,151886,151888,151890,151892,151894],{"class":35,"line":653},[33,151869,151495],{"class":167},[33,151871,242],{"class":163},[33,151873,151436],{"class":167},[33,151875,17651],{"class":238},[33,151877,242],{"class":163},[33,151879,734],{"class":50},[33,151881,365],{"class":167},[33,151883,17642],{"class":238},[33,151885,242],{"class":163},[33,151887,1533],{"class":50},[33,151889,365],{"class":167},[33,151891,97398],{"class":238},[33,151893,242],{"class":163},[33,151895,151467],{"class":167},[33,151897,151898],{"class":35,"line":667},[33,151899,151900],{"class":167},"line.set_categories(cats_ref)\n",[33,151902,151903],{"class":35,"line":675},[33,151904,92],{"emptyLinePlaceholder":91},[33,151906,151907,151910,151912],{"class":35,"line":689},[33,151908,151909],{"class":167},"line.width  ",[33,151911,242],{"class":163},[33,151913,151914],{"class":50}," 
18\n",[33,151916,151917,151920,151922],{"class":35,"line":703},[33,151918,151919],{"class":167},"line.height ",[33,151921,242],{"class":163},[33,151923,151924],{"class":50}," 10\n",[33,151926,151927],{"class":35,"line":714},[33,151928,92],{"emptyLinePlaceholder":91},[33,151930,151931,151934,151937],{"class":35,"line":723},[33,151932,151933],{"class":167},"ws.add_chart(line, ",[33,151935,151936],{"class":54},"\"F20\"",[33,151938,221],{"class":167},[33,151940,151941],{"class":35,"line":754},[33,151942,92],{"emptyLinePlaceholder":91},[33,151944,151945,151947,151949],{"class":35,"line":771},[33,151946,100907],{"class":167},[33,151948,126138],{"class":50},[33,151950,221],{"class":167},[33,151952,151953,151955,151957,151960],{"class":35,"line":777},[33,151954,13474],{"class":50},[33,151956,602],{"class":167},[33,151958,151959],{"class":54},"\"LineChart written.\"",[33,151961,221],{"class":167},[14,151963,36018,151964,151967],{},[30,151965,151966],{},"y_axis.numFmt = \"0%\""," formats axis tick labels as percentages regardless of the cell format.",[2537,151969],{},[18,151971,151973],{"id":151972},"step-7-piechart","Step 7 — PieChart",[14,151975,151976],{},"PieChart takes a single data series. 
There is no category axis — categories become slice labels.",[23,151978,151980],{"className":126,"code":151979,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\nfrom openpyxl.chart import PieChart, Reference\nfrom openpyxl.chart.series import DataPoint\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\n\npie = PieChart()\npie.title = \"Revenue by Month\"\n\n# Single series: revenue column B, rows 2-7 (no header)\ndata_ref = Reference(ws, min_col=2, min_row=2, max_row=ws.max_row)\npie.add_data(data_ref)\n\n# Labels from column A\ncats_ref = Reference(ws, min_col=1, min_row=2, max_row=ws.max_row)\npie.set_categories(cats_ref)\n\n# Explode the first slice to highlight it\nslice0 = DataPoint(idx=0, explosion=10)\npie.series[0].data_points = [slice0]\n\npie.dataLabels = openpyxl.chart.label.DataLabelList()\npie.dataLabels.showPercent = True\n\npie.width  = 15\npie.height = 12\n\nws.add_chart(pie, \"F35\")\n\nwb.save(WORKBOOK)\nprint(\"PieChart 
written.\")\n",[30,151981,151982,151986,151996,152002,152013,152025,152029,152041,152045,152051,152063,152073,152099,152103,152115,152119,152129,152139,152143,152148,152178,152183,152187,152192,152222,152227,152231,152236,152263,152278,152282,152292,152301,152305,152315,152325,152329,152339,152343,152351],{"__ignoreMap":28},[33,151983,151984],{"class":35,"line":36},[33,151985,98209],{"class":39},[33,151987,151988,151990,151992,151994],{"class":35,"line":43},[33,151989,190],{"class":163},[33,151991,193],{"class":167},[33,151993,164],{"class":163},[33,151995,198],{"class":167},[33,151997,151998,152000],{"class":35,"line":61},[33,151999,164],{"class":163},[33,152001,95887],{"class":167},[33,152003,152004,152006,152008,152010],{"class":35,"line":73},[33,152005,190],{"class":163},[33,152007,151240],{"class":167},[33,152009,164],{"class":163},[33,152011,152012],{"class":167}," PieChart, Reference\n",[33,152014,152015,152017,152020,152022],{"class":35,"line":88},[33,152016,190],{"class":163},[33,152018,152019],{"class":167}," openpyxl.chart.series ",[33,152021,164],{"class":163},[33,152023,152024],{"class":167}," 
DataPoint\n",[33,152026,152027],{"class":35,"line":95},[33,152028,92],{"emptyLinePlaceholder":91},[33,152030,152031,152033,152035,152037,152039],{"class":35,"line":101},[33,152032,126138],{"class":50},[33,152034,212],{"class":163},[33,152036,215],{"class":167},[33,152038,149709],{"class":54},[33,152040,221],{"class":167},[33,152042,152043],{"class":35,"line":171},[33,152044,92],{"emptyLinePlaceholder":91},[33,152046,152047,152049],{"class":35,"line":179},[33,152048,35574],{"class":163},[33,152050,574],{"class":167},[33,152052,152053,152055,152057,152059,152061],{"class":35,"line":187},[33,152054,17432],{"class":167},[33,152056,242],{"class":163},[33,152058,144711],{"class":167},[33,152060,126138],{"class":50},[33,152062,221],{"class":167},[33,152064,152065,152067,152069,152071],{"class":35,"line":201},[33,152066,35726],{"class":163},[33,152068,2945],{"class":50},[33,152070,1852],{"class":163},[33,152072,1855],{"class":167},[33,152074,152075,152077,152079,152081,152083,152085,152087,152089,152091,152093,152095,152097],{"class":35,"line":206},[33,152076,35742],{"class":163},[33,152078,16617],{"class":50},[33,152080,602],{"class":167},[33,152082,4059],{"class":163},[33,152084,15677],{"class":54},[33,152086,1115],{"class":50},[33,152088,6565],{"class":167},[33,152090,1121],{"class":50},[33,152092,274],{"class":54},[33,152094,1649],{"class":167},[33,152096,190],{"class":163},[33,152098,20843],{"class":167},[33,152100,152101],{"class":35,"line":224},[33,152102,92],{"emptyLinePlaceholder":91},[33,152104,152105,152107,152109,152111,152113],{"class":35,"line":229},[33,152106,98330],{"class":167},[33,152108,242],{"class":163},[33,152110,17447],{"class":167},[33,152112,140420],{"class":54},[33,152114,9202],{"class":167},[33,152116,152117],{"class":35,"line":235},[33,152118,92],{"emptyLinePlaceholder":91},[33,152120,152121,152124,152126],{"class":35,"line":250},[33,152122,152123],{"class":167},"pie ",[33,152125,242],{"class":163},[33,152127,152128],{"class":167}," 
PieChart()\n",[33,152130,152131,152134,152136],{"class":35,"line":266},[33,152132,152133],{"class":167},"pie.title ",[33,152135,242],{"class":163},[33,152137,152138],{"class":54}," \"Revenue by Month\"\n",[33,152140,152141],{"class":35,"line":290},[33,152142,92],{"emptyLinePlaceholder":91},[33,152144,152145],{"class":35,"line":295},[33,152146,152147],{"class":39},"# Single series: revenue column B, rows 2-7 (no header)\n",[33,152149,152150,152152,152154,152156,152158,152160,152162,152164,152166,152168,152170,152172,152174,152176],{"class":35,"line":300},[33,152151,151431],{"class":167},[33,152153,242],{"class":163},[33,152155,151436],{"class":167},[33,152157,17651],{"class":238},[33,152159,242],{"class":163},[33,152161,1533],{"class":50},[33,152163,365],{"class":167},[33,152165,17642],{"class":238},[33,152167,242],{"class":163},[33,152169,1533],{"class":50},[33,152171,365],{"class":167},[33,152173,97398],{"class":238},[33,152175,242],{"class":163},[33,152177,151467],{"class":167},[33,152179,152180],{"class":35,"line":317},[33,152181,152182],{"class":167},"pie.add_data(data_ref)\n",[33,152184,152185],{"class":35,"line":332},[33,152186,92],{"emptyLinePlaceholder":91},[33,152188,152189],{"class":35,"line":347},[33,152190,152191],{"class":39},"# Labels from column 
A\n",[33,152193,152194,152196,152198,152200,152202,152204,152206,152208,152210,152212,152214,152216,152218,152220],{"class":35,"line":374},[33,152195,151495],{"class":167},[33,152197,242],{"class":163},[33,152199,151436],{"class":167},[33,152201,17651],{"class":238},[33,152203,242],{"class":163},[33,152205,734],{"class":50},[33,152207,365],{"class":167},[33,152209,17642],{"class":238},[33,152211,242],{"class":163},[33,152213,1533],{"class":50},[33,152215,365],{"class":167},[33,152217,97398],{"class":238},[33,152219,242],{"class":163},[33,152221,151467],{"class":167},[33,152223,152224],{"class":35,"line":397},[33,152225,152226],{"class":167},"pie.set_categories(cats_ref)\n",[33,152228,152229],{"class":35,"line":653},[33,152230,92],{"emptyLinePlaceholder":91},[33,152232,152233],{"class":35,"line":667},[33,152234,152235],{"class":39},"# Explode the first slice to highlight it\n",[33,152237,152238,152241,152243,152246,152248,152250,152252,152254,152257,152259,152261],{"class":35,"line":675},[33,152239,152240],{"class":167},"slice0 ",[33,152242,242],{"class":163},[33,152244,152245],{"class":167}," DataPoint(",[33,152247,72912],{"class":238},[33,152249,242],{"class":163},[33,152251,748],{"class":50},[33,152253,365],{"class":167},[33,152255,152256],{"class":238},"explosion",[33,152258,242],{"class":163},[33,152260,3545],{"class":50},[33,152262,221],{"class":167},[33,152264,152265,152268,152270,152273,152275],{"class":35,"line":689},[33,152266,152267],{"class":167},"pie.series[",[33,152269,748],{"class":50},[33,152271,152272],{"class":167},"].data_points ",[33,152274,242],{"class":163},[33,152276,152277],{"class":167}," [slice0]\n",[33,152279,152280],{"class":35,"line":703},[33,152281,92],{"emptyLinePlaceholder":91},[33,152283,152284,152287,152289],{"class":35,"line":714},[33,152285,152286],{"class":167},"pie.dataLabels ",[33,152288,242],{"class":163},[33,152290,152291],{"class":167}," 
openpyxl.chart.label.DataLabelList()\n",[33,152293,152294,152297,152299],{"class":35,"line":723},[33,152295,152296],{"class":167},"pie.dataLabels.showPercent ",[33,152298,242],{"class":163},[33,152300,2887],{"class":50},[33,152302,152303],{"class":35,"line":754},[33,152304,92],{"emptyLinePlaceholder":91},[33,152306,152307,152310,152312],{"class":35,"line":771},[33,152308,152309],{"class":167},"pie.width  ",[33,152311,242],{"class":163},[33,152313,152314],{"class":50}," 15\n",[33,152316,152317,152320,152322],{"class":35,"line":777},[33,152318,152319],{"class":167},"pie.height ",[33,152321,242],{"class":163},[33,152323,152324],{"class":50}," 12\n",[33,152326,152327],{"class":35,"line":788},[33,152328,92],{"emptyLinePlaceholder":91},[33,152330,152331,152334,152337],{"class":35,"line":804},[33,152332,152333],{"class":167},"ws.add_chart(pie, ",[33,152335,152336],{"class":54},"\"F35\"",[33,152338,221],{"class":167},[33,152340,152341],{"class":35,"line":809},[33,152342,92],{"emptyLinePlaceholder":91},[33,152344,152345,152347,152349],{"class":35,"line":819},[33,152346,100907],{"class":167},[33,152348,126138],{"class":50},[33,152350,221],{"class":167},[33,152352,152353,152355,152357,152360],{"class":35,"line":829},[33,152354,13474],{"class":50},[33,152356,602],{"class":167},[33,152358,152359],{"class":54},"\"PieChart written.\"",[33,152361,221],{"class":167},[14,152363,152364,152367,152368,152371],{},[30,152365,152366],{},"DataPoint(idx=0, explosion=10)"," moves slice 0 outward by 10% for emphasis. 
",[30,152369,152370],{},"showPercent=True"," adds percentage labels to each slice.",[2537,152373],{},[18,152375,152377],{"id":152376},"how-the-pieces-fit-together","How the pieces fit together",[14,152379,152380],{},"The diagram below shows the data flow from worksheet cells through the openpyxl object model to the embedded chart in the saved workbook.",[2540,152382,2547,152384,2547,152387,2547,152390,2547,2547,152404,2547,152406,2547,152408,2547,152411,2547,2547,152414,2547,2547,152417,2547,152420,2547,152423,2547,152426,2547,2547,152429,2547,2547,152432,2547,152434,2547,152438,2547,152441,2547,152444,2547,152447,2547,2547,152451,2547,2547,152454,2547,152456,2547,152458,2547,152460,2547,152462,152465,2547,152468,152471,2547,152473,152476],{"viewBox":2542,"role":2543,"ariaLabel":152383,"xmlns":2545,"style":2546},"Data flow from worksheet cells through Reference and chart objects to embedded chart in workbook",[2549,152385,152386],{},"openpyxl chart pipeline",[2553,152388,152389],{},"Four-stage flow: worksheet cells feed a Reference object, which feeds add_data\u002Fset_categories on a chart object, which is embedded in the workbook via add_chart.",[2557,152391,2559,152392,2559,152399,2547],{},[2561,152393,2564,152395,2564,152397,2559],{"id":152394,"x1":748,"y1":748,"x2":734,"y2":748},"excel-formulas-grad",[2566,152396],{"offset":748,"style":2568},[2566,152398],{"offset":734,"style":2571},[2573,152400,2564,152402,2559],{"id":152401,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"excel-formulas-arrow",[2580,152403],{"d":2582,"fill":2583},[2585,152405],{"x":2587,"y":2597,"width":2635,"height":2597,"rx":3545,"fill":2592,"stroke":2593,"style":2594},[2000,152407,99747],{"x":16357,"y":71536,"fill":2599,"style":2600},[2000,152409,152410],{"x":16357,"y":2588,"fill":2583,"style":2685},"ws[\"B2:B7\"]",[2000,152412,152413],{"x":16357,"y":11194,"fill":2583,"style":2605},"cell.value = 
\"=SUM(...)\"",[35,152415],{"x1":11115,"y1":2589,"x2":100318,"y2":2589,"stroke":2583,"markerEnd":152416,"style":2594},"url(#excel-formulas-arrow)",[2585,152418],{"x":100318,"y":2597,"width":2598,"height":2597,"rx":3545,"fill":152419,"stroke":2593,"style":2594},"url(#excel-formulas-grad)",[2000,152421,149557],{"x":152422,"y":2679,"fill":2599,"style":2600},"293",[2000,152424,152425],{"x":152422,"y":11214,"fill":2599,"style":2685},"min_col, max_col",[2000,152427,152428],{"x":152422,"y":11218,"fill":2599,"style":2685},"min_row, max_row",[35,152430],{"x1":152431,"y1":2589,"x2":120798,"y2":2589,"stroke":2583,"markerEnd":152416,"style":2594},"370",[2585,152433],{"x":120798,"y":2590,"width":2610,"height":2589,"rx":3545,"fill":2592,"stroke":11166,"style":2594},[2000,152435,152437],{"x":152436,"y":2630,"fill":2599,"style":2600},"495","Chart Object",[2000,152439,152440],{"x":152436,"y":2679,"fill":2583,"style":2685},"BarChart \u002F LineChart",[2000,152442,152443],{"x":152436,"y":11214,"fill":2583,"style":2685},"add_data(ref)",[2000,152445,152446],{"x":152436,"y":11218,"fill":2583,"style":2685},"set_categories(ref)",[2000,152448,152450],{"x":152436,"y":152449,"fill":2583,"style":2605},"164","width \u002F height \u002F style",[35,152452],{"x1":152453,"y1":2589,"x2":49863,"y2":2589,"stroke":2583,"markerEnd":152416,"style":2594},"575",[2585,152455],{"x":49863,"y":2597,"width":2589,"height":2597,"rx":3545,"fill":11165,"stroke":11166,"style":2594},[2000,152457,102536],{"x":59986,"y":71536,"fill":2599,"style":2600},[2000,152459,151208],{"x":59986,"y":2588,"fill":2583,"style":2685},[2000,152461,100938],{"x":59986,"y":11194,"fill":2583,"style":2685},[2000,152463,152464],{"x":16357,"y":2611,"fill":11166,"style":2685},"\nformulas + number_format\n",[35,152466],{"x1":16357,"y1":2610,"x2":16357,"y2":64936,"stroke":11166,"style":152467},"stroke-width:1;stroke-dasharray:4,3",[2000,152469,152470],{"x":152422,"y":2611,"fill":11166,"style":2685},"\nDefinedName (named 
range)\n",[35,152472],{"x1":152422,"y1":2610,"x2":152422,"y2":64936,"stroke":11166,"style":152467},[2000,152474,152475],{"x":152436,"y":2701,"fill":2583,"style":2685},"\nfreeze_panes \u002F styling\n",[35,152477],{"x1":152436,"y1":58337,"x2":152436,"y2":110852,"stroke":2583,"style":152467},[2537,152479],{},[18,152481,152483],{"id":152482},"step-8-styling-cells-and-freezing-panes","Step 8 — Styling cells and freezing panes",[424,152485,152487],{"id":152486},"header-styling","Header styling",[23,152489,152491],{"className":126,"code":152490,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\nfrom openpyxl.styles import Font, PatternFill, Alignment, Border, Side\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\n\nheader_font  = Font(bold=True, color=\"FFFFFF\", size=12)\nheader_fill  = PatternFill(\"solid\", fgColor=\"2563EB\")\ncenter_align = Alignment(horizontal=\"center\", vertical=\"center\")\nthin_border  = Border(\n    bottom=Side(style=\"thin\", color=\"E2E8F0\"),\n    right=Side(style=\"thin\",  color=\"E2E8F0\"),\n)\n\nfor col_idx in range(1, ws.max_column + 1):\n    cell = ws.cell(row=1, column=col_idx)\n    cell.font      = header_font\n    cell.fill      = header_fill\n    cell.alignment = center_align\n    cell.border    = thin_border\n\n# Auto-fit column widths (approximate — openpyxl has no native auto-fit)\nfor col in ws.columns:\n    max_len = max((len(str(c.value)) for c in col if c.value), default=8)\n    ws.column_dimensions[col[0].column_letter].width = min(max_len + 4, 40)\n\nwb.save(WORKBOOK)\nprint(\"Headers 
styled.\")\n",[30,152492,152493,152497,152507,152513,152524,152528,152540,152544,152550,152562,152572,152598,152602,152614,152618,152651,152672,152698,152708,152736,152761,152765,152769,152791,152815,152825,152834,152844,152854,152858,152863,152873,152914,152939,152943,152951],{"__ignoreMap":28},[33,152494,152495],{"class":35,"line":36},[33,152496,98209],{"class":39},[33,152498,152499,152501,152503,152505],{"class":35,"line":43},[33,152500,190],{"class":163},[33,152502,193],{"class":167},[33,152504,164],{"class":163},[33,152506,198],{"class":167},[33,152508,152509,152511],{"class":35,"line":61},[33,152510,164],{"class":163},[33,152512,95887],{"class":167},[33,152514,152515,152517,152519,152521],{"class":35,"line":73},[33,152516,190],{"class":163},[33,152518,17115],{"class":167},[33,152520,164],{"class":163},[33,152522,152523],{"class":167}," Font, PatternFill, Alignment, Border, Side\n",[33,152525,152526],{"class":35,"line":88},[33,152527,92],{"emptyLinePlaceholder":91},[33,152529,152530,152532,152534,152536,152538],{"class":35,"line":95},[33,152531,126138],{"class":50},[33,152533,212],{"class":163},[33,152535,215],{"class":167},[33,152537,149709],{"class":54},[33,152539,221],{"class":167},[33,152541,152542],{"class":35,"line":101},[33,152543,92],{"emptyLinePlaceholder":91},[33,152545,152546,152548],{"class":35,"line":171},[33,152547,35574],{"class":163},[33,152549,574],{"class":167},[33,152551,152552,152554,152556,152558,152560],{"class":35,"line":179},[33,152553,17432],{"class":167},[33,152555,242],{"class":163},[33,152557,144711],{"class":167},[33,152559,126138],{"class":50},[33,152561,221],{"class":167},[33,152563,152564,152566,152568,152570],{"class":35,"line":187},[33,152565,35726],{"class":163},[33,152567,2945],{"class":50},[33,152569,1852],{"class":163},[33,152571,1855],{"class":167},[33,152573,152574,152576,152578,152580,152582,152584,152586,152588,152590,152592,152594,152596],{"class":35,"line":201},[33,152575,35742],{"class":163},[33,152577,16617],{"class
":50},[33,152579,602],{"class":167},[33,152581,4059],{"class":163},[33,152583,15677],{"class":54},[33,152585,1115],{"class":50},[33,152587,6565],{"class":167},[33,152589,1121],{"class":50},[33,152591,274],{"class":54},[33,152593,1649],{"class":167},[33,152595,190],{"class":163},[33,152597,20843],{"class":167},[33,152599,152600],{"class":35,"line":206},[33,152601,92],{"emptyLinePlaceholder":91},[33,152603,152604,152606,152608,152610,152612],{"class":35,"line":224},[33,152605,98330],{"class":167},[33,152607,242],{"class":163},[33,152609,17447],{"class":167},[33,152611,140420],{"class":54},[33,152613,9202],{"class":167},[33,152615,152616],{"class":35,"line":229},[33,152617,92],{"emptyLinePlaceholder":91},[33,152619,152620,152623,152625,152627,152629,152631,152633,152635,152637,152639,152641,152643,152645,152647,152649],{"class":35,"line":235},[33,152621,152622],{"class":167},"header_font  ",[33,152624,242],{"class":163},[33,152626,17233],{"class":167},[33,152628,17236],{"class":238},[33,152630,242],{"class":163},[33,152632,855],{"class":50},[33,152634,365],{"class":167},[33,152636,17245],{"class":238},[33,152638,242],{"class":163},[33,152640,17250],{"class":54},[33,152642,365],{"class":167},[33,152644,17255],{"class":238},[33,152646,242],{"class":163},[33,152648,55650],{"class":50},[33,152650,221],{"class":167},[33,152652,152653,152656,152658,152660,152662,152664,152666,152668,152670],{"class":35,"line":250},[33,152654,152655],{"class":167},"header_fill  ",[33,152657,242],{"class":163},[33,152659,17185],{"class":167},[33,152661,17188],{"class":54},[33,152663,365],{"class":167},[33,152665,17193],{"class":238},[33,152667,242],{"class":163},[33,152669,17198],{"class":54},[33,152671,221],{"class":167},[33,152673,152674,152677,152679,152681,152683,152685,152687,152689,152692,152694,152696],{"class":35,"line":266},[33,152675,152676],{"class":167},"center_align 
",[33,152678,242],{"class":163},[33,152680,17507],{"class":167},[33,152682,17510],{"class":238},[33,152684,242],{"class":163},[33,152686,17515],{"class":54},[33,152688,365],{"class":167},[33,152690,152691],{"class":238},"vertical",[33,152693,242],{"class":163},[33,152695,17515],{"class":54},[33,152697,221],{"class":167},[33,152699,152700,152703,152705],{"class":35,"line":290},[33,152701,152702],{"class":167},"thin_border  ",[33,152704,242],{"class":163},[33,152706,152707],{"class":167}," Border(\n",[33,152709,152710,152713,152715,152718,152720,152722,152725,152727,152729,152731,152734],{"class":35,"line":295},[33,152711,152712],{"class":238},"    bottom",[33,152714,242],{"class":163},[33,152716,152717],{"class":167},"Side(",[33,152719,6953],{"class":238},[33,152721,242],{"class":163},[33,152723,152724],{"class":54},"\"thin\"",[33,152726,365],{"class":167},[33,152728,17245],{"class":238},[33,152730,242],{"class":163},[33,152732,152733],{"class":54},"\"E2E8F0\"",[33,152735,1506],{"class":167},[33,152737,152738,152741,152743,152745,152747,152749,152751,152753,152755,152757,152759],{"class":35,"line":300},[33,152739,152740],{"class":238},"    
right",[33,152742,242],{"class":163},[33,152744,152717],{"class":167},[33,152746,6953],{"class":238},[33,152748,242],{"class":163},[33,152750,152724],{"class":54},[33,152752,25480],{"class":167},[33,152754,17245],{"class":238},[33,152756,242],{"class":163},[33,152758,152733],{"class":54},[33,152760,1506],{"class":167},[33,152762,152763],{"class":35,"line":317},[33,152764,221],{"class":167},[33,152766,152767],{"class":35,"line":332},[33,152768,92],{"emptyLinePlaceholder":91},[33,152770,152771,152773,152775,152777,152779,152781,152783,152785,152787,152789],{"class":35,"line":347},[33,152772,6124],{"class":163},[33,152774,17741],{"class":167},[33,152776,662],{"class":163},[33,152778,1801],{"class":50},[33,152780,602],{"class":167},[33,152782,734],{"class":50},[33,152784,17559],{"class":167},[33,152786,1811],{"class":163},[33,152788,1814],{"class":50},[33,152790,1737],{"class":167},[33,152792,152793,152796,152798,152800,152802,152804,152806,152808,152810,152812],{"class":35,"line":374},[33,152794,152795],{"class":167},"    cell ",[33,152797,242],{"class":163},[33,152799,17573],{"class":167},[33,152801,98107],{"class":238},[33,152803,242],{"class":163},[33,152805,734],{"class":50},[33,152807,365],{"class":167},[33,152809,98115],{"class":238},[33,152811,242],{"class":163},[33,152813,152814],{"class":167},"col_idx)\n",[33,152816,152817,152820,152822],{"class":35,"line":397},[33,152818,152819],{"class":167},"    cell.font      ",[33,152821,242],{"class":163},[33,152823,152824],{"class":167}," header_font\n",[33,152826,152827,152830,152832],{"class":35,"line":653},[33,152828,152829],{"class":167},"    cell.fill      ",[33,152831,242],{"class":163},[33,152833,23181],{"class":167},[33,152835,152836,152839,152841],{"class":35,"line":667},[33,152837,152838],{"class":167},"    cell.alignment ",[33,152840,242],{"class":163},[33,152842,152843],{"class":167}," center_align\n",[33,152845,152846,152849,152851],{"class":35,"line":675},[33,152847,152848],{"class":167},"    cell.border  
  ",[33,152850,242],{"class":163},[33,152852,152853],{"class":167}," thin_border\n",[33,152855,152856],{"class":35,"line":689},[33,152857,92],{"emptyLinePlaceholder":91},[33,152859,152860],{"class":35,"line":703},[33,152861,152862],{"class":39},"# Auto-fit column widths (approximate — openpyxl has no native auto-fit)\n",[33,152864,152865,152867,152869,152871],{"class":35,"line":714},[33,152866,6124],{"class":163},[33,152868,7985],{"class":167},[33,152870,662],{"class":163},[33,152872,97263],{"class":167},[33,152874,152875,152878,152880,152882,152884,152886,152888,152890,152893,152895,152897,152899,152901,152903,152906,152908,152910,152912],{"class":35,"line":723},[33,152876,152877],{"class":167},"    max_len ",[33,152879,242],{"class":163},[33,152881,45817],{"class":50},[33,152883,48215],{"class":167},[33,152885,928],{"class":50},[33,152887,602],{"class":167},[33,152889,1053],{"class":50},[33,152891,152892],{"class":167},"(c.value)) ",[33,152894,6124],{"class":163},[33,152896,7486],{"class":167},[33,152898,662],{"class":163},[33,152900,7985],{"class":167},[33,152902,2491],{"class":163},[33,152904,152905],{"class":167}," c.value), ",[33,152907,6685],{"class":238},[33,152909,242],{"class":163},[33,152911,2591],{"class":50},[33,152913,221],{"class":167},[33,152915,152916,152919,152921,152923,152925,152927,152929,152931,152933,152935,152937],{"class":35,"line":754},[33,152917,152918],{"class":167},"    
ws.column_dimensions[col[",[33,152920,748],{"class":50},[33,152922,97326],{"class":167},[33,152924,242],{"class":163},[33,152926,73775],{"class":50},[33,152928,97333],{"class":167},[33,152930,1811],{"class":163},[33,152932,82708],{"class":50},[33,152934,365],{"class":167},[33,152936,26323],{"class":50},[33,152938,221],{"class":167},[33,152940,152941],{"class":35,"line":771},[33,152942,92],{"emptyLinePlaceholder":91},[33,152944,152945,152947,152949],{"class":35,"line":777},[33,152946,100907],{"class":167},[33,152948,126138],{"class":50},[33,152950,221],{"class":167},[33,152952,152953,152955,152957,152960],{"class":35,"line":788},[33,152954,13474],{"class":50},[33,152956,602],{"class":167},[33,152958,152959],{"class":54},"\"Headers styled.\"",[33,152961,221],{"class":167},[424,152963,152965],{"id":152964},"freezing-panes","Freezing panes",[14,152967,152968,152970,152971,152974,152975,152978,152979,152981],{},[30,152969,99760],{}," takes the address of the first ",[26245,152972,152973],{},"unfrozen"," cell. 
Setting it to ",[30,152976,152977],{},"\"A2\""," freezes row 1 only; ",[30,152980,119565],{}," freezes both the first row and the first column.",[23,152983,152985],{"className":126,"code":152984,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\nws.freeze_panes = \"A2\"   # row 1 (header) stays visible when scrolling\n\nwb.save(WORKBOOK)\nprint(\"Row 1 frozen.\")\n",[30,152986,152987,152991,153001,153007,153011,153023,153027,153033,153045,153055,153081,153085,153097,153109,153113,153121],{"__ignoreMap":28},[33,152988,152989],{"class":35,"line":36},[33,152990,98209],{"class":39},[33,152992,152993,152995,152997,152999],{"class":35,"line":43},[33,152994,190],{"class":163},[33,152996,193],{"class":167},[33,152998,164],{"class":163},[33,153000,198],{"class":167},[33,153002,153003,153005],{"class":35,"line":61},[33,153004,164],{"class":163},[33,153006,95887],{"class":167},[33,153008,153009],{"class":35,"line":73},[33,153010,92],{"emptyLinePlaceholder":91},[33,153012,153013,153015,153017,153019,153021],{"class":35,"line":88},[33,153014,126138],{"class":50},[33,153016,212],{"class":163},[33,153018,215],{"class":167},[33,153020,149709],{"class":54},[33,153022,221],{"class":167},[33,153024,153025],{"class":35,"line":95},[33,153026,92],{"emptyLinePlaceholder":91},[33,153028,153029,153031],{"class":35,"line":101},[33,153030,35574],{"class":163},[33,153032,574],{"class":167},[33,153034,153035,153037,153039,153041,153043],{"class":35,"line":171},[33,153036,17432],{"class":167},[33,153038,242],{"class":163},[33,153040,144711],{"class":167},[33,153042,126138],{"class":50},[33,153044,221],{"class":167},[33,153046,153047,153049,153051,153053],{"class":35,"line":179},[33,153048,35726],{"class":163},[33,153050,2945],{"class":50},
[33,153052,1852],{"class":163},[33,153054,1855],{"class":167},[33,153056,153057,153059,153061,153063,153065,153067,153069,153071,153073,153075,153077,153079],{"class":35,"line":187},[33,153058,35742],{"class":163},[33,153060,16617],{"class":50},[33,153062,602],{"class":167},[33,153064,4059],{"class":163},[33,153066,15677],{"class":54},[33,153068,1115],{"class":50},[33,153070,6565],{"class":167},[33,153072,1121],{"class":50},[33,153074,274],{"class":54},[33,153076,1649],{"class":167},[33,153078,190],{"class":163},[33,153080,20843],{"class":167},[33,153082,153083],{"class":35,"line":201},[33,153084,92],{"emptyLinePlaceholder":91},[33,153086,153087,153089,153091,153093,153095],{"class":35,"line":206},[33,153088,98330],{"class":167},[33,153090,242],{"class":163},[33,153092,17447],{"class":167},[33,153094,140420],{"class":54},[33,153096,9202],{"class":167},[33,153098,153099,153102,153104,153106],{"class":35,"line":224},[33,153100,153101],{"class":167},"ws.freeze_panes ",[33,153103,242],{"class":163},[33,153105,100145],{"class":54},[33,153107,153108],{"class":39},"   # row 1 (header) stays visible when scrolling\n",[33,153110,153111],{"class":35,"line":229},[33,153112,92],{"emptyLinePlaceholder":91},[33,153114,153115,153117,153119],{"class":35,"line":235},[33,153116,100907],{"class":167},[33,153118,126138],{"class":50},[33,153120,221],{"class":167},[33,153122,153123,153125,153127,153130],{"class":35,"line":250},[33,153124,13474],{"class":50},[33,153126,602],{"class":167},[33,153128,153129],{"class":54},"\"Row 1 frozen.\"",[33,153131,221],{"class":167},[14,153133,153134,153135,3035],{},"To unfreeze: ",[30,153136,153137],{},"ws.freeze_panes = None",[2537,153139],{},[18,153141,61980],{"id":2708},[424,153143,153145],{"id":153144},"multiple-chart-types-on-one-sheet","Multiple chart types on one sheet",[14,153147,153148],{},"Place each chart at a different anchor cell. 
Cell addresses use Excel notation — if charts overlap, the last one added renders on top.",[23,153150,153152],{"className":126,"code":153151,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\nfrom openpyxl.chart import BarChart, LineChart, Reference\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\nlast_row = ws.max_row\n\nbar = BarChart()\nbar.title = \"Revenue vs Cost\"\nbar.add_data(Reference(ws, min_col=2, max_col=3, min_row=1, max_row=last_row), titles_from_data=True)\nbar.set_categories(Reference(ws, min_col=1, min_row=2, max_row=last_row))\nbar.width, bar.height = 18, 10\nws.add_chart(bar, \"F2\")\n\nline = LineChart()\nline.title = \"Margin Trend\"\nline.add_data(Reference(ws, min_col=4, min_row=1, max_row=last_row), titles_from_data=True)\nline.set_categories(Reference(ws, min_col=1, min_row=2, max_row=last_row))\nline.width, line.height = 18, 10\nws.add_chart(line, \"F22\")\n\nwb.save(WORKBOOK)\nprint(\"Two charts written.\")\n",[30,153153,153154,153158,153168,153174,153185,153189,153201,153205,153211,153223,153233,153259,153263,153275,153285,153289,153298,153308,153352,153380,153395,153404,153408,153416,153426,153461,153488,153501,153510,153514,153522],{"__ignoreMap":28},[33,153155,153156],{"class":35,"line":36},[33,153157,98209],{"class":39},[33,153159,153160,153162,153164,153166],{"class":35,"line":43},[33,153161,190],{"class":163},[33,153163,193],{"class":167},[33,153165,164],{"class":163},[33,153167,198],{"class":167},[33,153169,153170,153172],{"class":35,"line":61},[33,153171,164],{"class":163},[33,153173,95887],{"class":167},[33,153175,153176,153178,153180,153182],{"class":35,"line":73},[33,153177,190],{"class":163},[33,153179,151240],{"class":167},[33,153181,164],{"class":163},[33,153183,153184],{"class":167}," BarChart, LineChart, 
Reference\n",[33,153186,153187],{"class":35,"line":88},[33,153188,92],{"emptyLinePlaceholder":91},[33,153190,153191,153193,153195,153197,153199],{"class":35,"line":95},[33,153192,126138],{"class":50},[33,153194,212],{"class":163},[33,153196,215],{"class":167},[33,153198,149709],{"class":54},[33,153200,221],{"class":167},[33,153202,153203],{"class":35,"line":101},[33,153204,92],{"emptyLinePlaceholder":91},[33,153206,153207,153209],{"class":35,"line":171},[33,153208,35574],{"class":163},[33,153210,574],{"class":167},[33,153212,153213,153215,153217,153219,153221],{"class":35,"line":179},[33,153214,17432],{"class":167},[33,153216,242],{"class":163},[33,153218,144711],{"class":167},[33,153220,126138],{"class":50},[33,153222,221],{"class":167},[33,153224,153225,153227,153229,153231],{"class":35,"line":187},[33,153226,35726],{"class":163},[33,153228,2945],{"class":50},[33,153230,1852],{"class":163},[33,153232,1855],{"class":167},[33,153234,153235,153237,153239,153241,153243,153245,153247,153249,153251,153253,153255,153257],{"class":35,"line":201},[33,153236,35742],{"class":163},[33,153238,16617],{"class":50},[33,153240,602],{"class":167},[33,153242,4059],{"class":163},[33,153244,15677],{"class":54},[33,153246,1115],{"class":50},[33,153248,6565],{"class":167},[33,153250,1121],{"class":50},[33,153252,274],{"class":54},[33,153254,1649],{"class":167},[33,153256,190],{"class":163},[33,153258,20843],{"class":167},[33,153260,153261],{"class":35,"line":206},[33,153262,92],{"emptyLinePlaceholder":91},[33,153264,153265,153267,153269,153271,153273],{"class":35,"line":224},[33,153266,98330],{"class":167},[33,153268,242],{"class":163},[33,153270,17447],{"class":167},[33,153272,140420],{"class":54},[33,153274,9202],{"class":167},[33,153276,153277,153280,153282],{"class":35,"line":229},[33,153278,153279],{"class":167},"last_row ",[33,153281,242],{"class":163},[33,153283,153284],{"class":167}," 
ws.max_row\n",[33,153286,153287],{"class":35,"line":235},[33,153288,92],{"emptyLinePlaceholder":91},[33,153290,153291,153294,153296],{"class":35,"line":250},[33,153292,153293],{"class":167},"bar ",[33,153295,242],{"class":163},[33,153297,151349],{"class":167},[33,153299,153300,153303,153305],{"class":35,"line":266},[33,153301,153302],{"class":167},"bar.title ",[33,153304,242],{"class":163},[33,153306,153307],{"class":54}," \"Revenue vs Cost\"\n",[33,153309,153310,153313,153315,153317,153319,153321,153323,153325,153327,153329,153331,153333,153335,153337,153339,153341,153344,153346,153348,153350],{"class":35,"line":290},[33,153311,153312],{"class":167},"bar.add_data(Reference(ws, ",[33,153314,17651],{"class":238},[33,153316,242],{"class":163},[33,153318,1533],{"class":50},[33,153320,365],{"class":167},[33,153322,17659],{"class":238},[33,153324,242],{"class":163},[33,153326,10258],{"class":50},[33,153328,365],{"class":167},[33,153330,17642],{"class":238},[33,153332,242],{"class":163},[33,153334,734],{"class":50},[33,153336,365],{"class":167},[33,153338,97398],{"class":238},[33,153340,242],{"class":163},[33,153342,153343],{"class":167},"last_row), ",[33,153345,151475],{"class":238},[33,153347,242],{"class":163},[33,153349,855],{"class":50},[33,153351,221],{"class":167},[33,153353,153354,153357,153359,153361,153363,153365,153367,153369,153371,153373,153375,153377],{"class":35,"line":295},[33,153355,153356],{"class":167},"bar.set_categories(Reference(ws, ",[33,153358,17651],{"class":238},[33,153360,242],{"class":163},[33,153362,734],{"class":50},[33,153364,365],{"class":167},[33,153366,17642],{"class":238},[33,153368,242],{"class":163},[33,153370,1533],{"class":50},[33,153372,365],{"class":167},[33,153374,97398],{"class":238},[33,153376,242],{"class":163},[33,153378,153379],{"class":167},"last_row))\n",[33,153381,153382,153385,153387,153390,153392],{"class":35,"line":300},[33,153383,153384],{"class":167},"bar.width, bar.height 
",[33,153386,242],{"class":163},[33,153388,153389],{"class":50}," 18",[33,153391,365],{"class":167},[33,153393,153394],{"class":50},"10\n",[33,153396,153397,153400,153402],{"class":35,"line":317},[33,153398,153399],{"class":167},"ws.add_chart(bar, ",[33,153401,104865],{"class":54},[33,153403,221],{"class":167},[33,153405,153406],{"class":35,"line":332},[33,153407,92],{"emptyLinePlaceholder":91},[33,153409,153410,153412,153414],{"class":35,"line":347},[33,153411,151754],{"class":167},[33,153413,242],{"class":163},[33,153415,151759],{"class":167},[33,153417,153418,153421,153423],{"class":35,"line":374},[33,153419,153420],{"class":167},"line.title ",[33,153422,242],{"class":163},[33,153424,153425],{"class":54}," \"Margin Trend\"\n",[33,153427,153428,153431,153433,153435,153437,153439,153441,153443,153445,153447,153449,153451,153453,153455,153457,153459],{"class":35,"line":397},[33,153429,153430],{"class":167},"line.add_data(Reference(ws, ",[33,153432,17651],{"class":238},[33,153434,242],{"class":163},[33,153436,1503],{"class":50},[33,153438,365],{"class":167},[33,153440,17642],{"class":238},[33,153442,242],{"class":163},[33,153444,734],{"class":50},[33,153446,365],{"class":167},[33,153448,97398],{"class":238},[33,153450,242],{"class":163},[33,153452,153343],{"class":167},[33,153454,151475],{"class":238},[33,153456,242],{"class":163},[33,153458,855],{"class":50},[33,153460,221],{"class":167},[33,153462,153463,153466,153468,153470,153472,153474,153476,153478,153480,153482,153484,153486],{"class":35,"line":653},[33,153464,153465],{"class":167},"line.set_categories(Reference(ws, 
",[33,153467,17651],{"class":238},[33,153469,242],{"class":163},[33,153471,734],{"class":50},[33,153473,365],{"class":167},[33,153475,17642],{"class":238},[33,153477,242],{"class":163},[33,153479,1533],{"class":50},[33,153481,365],{"class":167},[33,153483,97398],{"class":238},[33,153485,242],{"class":163},[33,153487,153379],{"class":167},[33,153489,153490,153493,153495,153497,153499],{"class":35,"line":667},[33,153491,153492],{"class":167},"line.width, line.height ",[33,153494,242],{"class":163},[33,153496,153389],{"class":50},[33,153498,365],{"class":167},[33,153500,153394],{"class":50},[33,153502,153503,153505,153508],{"class":35,"line":675},[33,153504,151933],{"class":167},[33,153506,153507],{"class":54},"\"F22\"",[33,153509,221],{"class":167},[33,153511,153512],{"class":35,"line":689},[33,153513,92],{"emptyLinePlaceholder":91},[33,153515,153516,153518,153520],{"class":35,"line":703},[33,153517,100907],{"class":167},[33,153519,126138],{"class":50},[33,153521,221],{"class":167},[33,153523,153524,153526,153528,153531],{"class":35,"line":714},[33,153525,13474],{"class":50},[33,153527,602],{"class":167},[33,153529,153530],{"class":54},"\"Two charts written.\"",[33,153532,221],{"class":167},[424,153534,153536],{"id":153535},"writing-formulas-into-a-new-sheet","Writing formulas into a new sheet",[14,153538,153539,153540,153542],{},"When generating a report (as covered in ",[940,153541,6936],{"href":6935},"), create a summary sheet that references data sheets with cross-sheet formulas.",[23,153544,153546],{"className":126,"code":153545,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = openpyxl.load_workbook(WORKBOOK)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\n# Create a summary sheet\nif \"Summary\" not in wb.sheetnames:\n    wb.create_sheet(\"Summary\")\n\nsummary = wb[\"Summary\"]\nsummary[\"A1\"] = 
\"KPI\"\nsummary[\"B1\"] = \"Value\"\nsummary[\"A2\"] = \"Total Revenue\"\nsummary[\"B2\"] = \"=SUM(Sales!B2:B7)\"      # cross-sheet reference\nsummary[\"A3\"] = \"Total Cost\"\nsummary[\"B3\"] = \"=SUM(Sales!C2:C7)\"\nsummary[\"A4\"] = \"Net Profit\"\nsummary[\"B4\"] = \"=B2-B3\"\nsummary[\"A5\"] = \"Avg Margin\"\nsummary[\"B5\"] = \"=AVERAGE(Sales!D2:D7)\"\n\nwb.save(WORKBOOK)\nprint(\"Summary sheet with cross-sheet formulas written.\")\n",[30,153547,153548,153552,153562,153568,153572,153584,153588,153594,153606,153616,153642,153646,153651,153664,153673,153677,153690,153704,153718,153731,153747,153761,153775,153789,153803,153817,153831,153835,153843],{"__ignoreMap":28},[33,153549,153550],{"class":35,"line":36},[33,153551,98209],{"class":39},[33,153553,153554,153556,153558,153560],{"class":35,"line":43},[33,153555,190],{"class":163},[33,153557,193],{"class":167},[33,153559,164],{"class":163},[33,153561,198],{"class":167},[33,153563,153564,153566],{"class":35,"line":61},[33,153565,164],{"class":163},[33,153567,95887],{"class":167},[33,153569,153570],{"class":35,"line":73},[33,153571,92],{"emptyLinePlaceholder":91},[33,153573,153574,153576,153578,153580,153582],{"class":35,"line":88},[33,153575,126138],{"class":50},[33,153577,212],{"class":163},[33,153579,215],{"class":167},[33,153581,149709],{"class":54},[33,153583,221],{"class":167},[33,153585,153586],{"class":35,"line":95},[33,153587,92],{"emptyLinePlaceholder":91},[33,153589,153590,153592],{"class":35,"line":101},[33,153591,35574],{"class":163},[33,153593,574],{"class":167},[33,153595,153596,153598,153600,153602,153604],{"class":35,"line":171},[33,153597,17432],{"class":167},[33,153599,242],{"class":163},[33,153601,144711],{"class":167},[33,153603,126138],{"class":50},[33,153605,221],{"class":167},[33,153607,153608,153610,153612,153614],{"class":35,"line":179},[33,153609,35726],{"class":163},[33,153611,2945],{"class":50},[33,153613,1852],{"class":163},[33,153615,1855],{"class":167},[33,153617,153618,153620,15362
2,153624,153626,153628,153630,153632,153634,153636,153638,153640],{"class":35,"line":187},[33,153619,35742],{"class":163},[33,153621,16617],{"class":50},[33,153623,602],{"class":167},[33,153625,4059],{"class":163},[33,153627,15677],{"class":54},[33,153629,1115],{"class":50},[33,153631,6565],{"class":167},[33,153633,1121],{"class":50},[33,153635,274],{"class":54},[33,153637,1649],{"class":167},[33,153639,190],{"class":163},[33,153641,20843],{"class":167},[33,153643,153644],{"class":35,"line":201},[33,153645,92],{"emptyLinePlaceholder":91},[33,153647,153648],{"class":35,"line":206},[33,153649,153650],{"class":39},"# Create a summary sheet\n",[33,153652,153653,153655,153657,153659,153661],{"class":35,"line":224},[33,153654,2491],{"class":163},[33,153656,105593],{"class":54},[33,153658,620],{"class":163},[33,153660,8002],{"class":163},[33,153662,153663],{"class":167}," wb.sheetnames:\n",[33,153665,153666,153669,153671],{"class":35,"line":229},[33,153667,153668],{"class":167},"    wb.create_sheet(",[33,153670,103086],{"class":54},[33,153672,221],{"class":167},[33,153674,153675],{"class":35,"line":235},[33,153676,92],{"emptyLinePlaceholder":91},[33,153678,153679,153682,153684,153686,153688],{"class":35,"line":250},[33,153680,153681],{"class":167},"summary ",[33,153683,242],{"class":163},[33,153685,17447],{"class":167},[33,153687,103086],{"class":54},[33,153689,9202],{"class":167},[33,153691,153692,153695,153697,153699,153701],{"class":35,"line":266},[33,153693,153694],{"class":167},"summary[",[33,153696,99899],{"class":54},[33,153698,763],{"class":167},[33,153700,242],{"class":163},[33,153702,153703],{"class":54}," \"KPI\"\n",[33,153705,153706,153708,153711,153713,153715],{"class":35,"line":290},[33,153707,153694],{"class":167},[33,153709,153710],{"class":54},"\"B1\"",[33,153712,763],{"class":167},[33,153714,242],{"class":163},[33,153716,153717],{"class":54}," 
\"Value\"\n",[33,153719,153720,153722,153724,153726,153728],{"class":35,"line":295},[33,153721,153694],{"class":167},[33,153723,152977],{"class":54},[33,153725,763],{"class":167},[33,153727,242],{"class":163},[33,153729,153730],{"class":54}," \"Total Revenue\"\n",[33,153732,153733,153735,153737,153739,153741,153744],{"class":35,"line":300},[33,153734,153694],{"class":167},[33,153736,119565],{"class":54},[33,153738,763],{"class":167},[33,153740,242],{"class":163},[33,153742,153743],{"class":54}," \"=SUM(Sales!B2:B7)\"",[33,153745,153746],{"class":39},"      # cross-sheet reference\n",[33,153748,153749,153751,153754,153756,153758],{"class":35,"line":317},[33,153750,153694],{"class":167},[33,153752,153753],{"class":54},"\"A3\"",[33,153755,763],{"class":167},[33,153757,242],{"class":163},[33,153759,153760],{"class":54}," \"Total Cost\"\n",[33,153762,153763,153765,153768,153770,153772],{"class":35,"line":332},[33,153764,153694],{"class":167},[33,153766,153767],{"class":54},"\"B3\"",[33,153769,763],{"class":167},[33,153771,242],{"class":163},[33,153773,153774],{"class":54}," \"=SUM(Sales!C2:C7)\"\n",[33,153776,153777,153779,153782,153784,153786],{"class":35,"line":347},[33,153778,153694],{"class":167},[33,153780,153781],{"class":54},"\"A4\"",[33,153783,763],{"class":167},[33,153785,242],{"class":163},[33,153787,153788],{"class":54}," \"Net Profit\"\n",[33,153790,153791,153793,153796,153798,153800],{"class":35,"line":374},[33,153792,153694],{"class":167},[33,153794,153795],{"class":54},"\"B4\"",[33,153797,763],{"class":167},[33,153799,242],{"class":163},[33,153801,153802],{"class":54}," \"=B2-B3\"\n",[33,153804,153805,153807,153810,153812,153814],{"class":35,"line":397},[33,153806,153694],{"class":167},[33,153808,153809],{"class":54},"\"A5\"",[33,153811,763],{"class":167},[33,153813,242],{"class":163},[33,153815,153816],{"class":54}," \"Avg 
Margin\"\n",[33,153818,153819,153821,153824,153826,153828],{"class":35,"line":653},[33,153820,153694],{"class":167},[33,153822,153823],{"class":54},"\"B5\"",[33,153825,763],{"class":167},[33,153827,242],{"class":163},[33,153829,153830],{"class":54}," \"=AVERAGE(Sales!D2:D7)\"\n",[33,153832,153833],{"class":35,"line":667},[33,153834,92],{"emptyLinePlaceholder":91},[33,153836,153837,153839,153841],{"class":35,"line":675},[33,153838,100907],{"class":167},[33,153840,126138],{"class":50},[33,153842,221],{"class":167},[33,153844,153845,153847,153849,153852],{"class":35,"line":689},[33,153846,13474],{"class":50},[33,153848,602],{"class":167},[33,153850,153851],{"class":54},"\"Summary sheet with cross-sheet formulas written.\"",[33,153853,221],{"class":167},[424,153855,153857],{"id":153856},"reading-formula-results-back-into-python","Reading formula results back into Python",[14,153859,153860,153861,153865,153866,153868],{},"If you need formula results in Python rather than displaying them in Excel, compute the values in Python directly — either with ",[940,153862,9630],{"href":153863,"rel":153864},"https:\u002F\u002Fpandas.pydata.org\u002F",[1367]," before writing, or by reading with ",[940,153867,99577],{"href":99576}," after Excel has saved cached values.",[23,153870,153872],{"className":126,"code":153871,"language":47,"meta":28,"style":28},"# pip install openpyxl pandas\nfrom pathlib import Path\nimport pandas as pd\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    df = pd.read_excel(WORKBOOK, sheet_name=\"Sales\", engine=\"openpyxl\")\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\n# Compute margin in Python rather than relying on formula caching\ndf[\"Margin\"] = (df[\"Revenue\"] - df[\"Cost\"]) \u002F df[\"Revenue\"]\nprint(df[[\"Month\", 
\"Margin\"]].to_string(index=False))\n",[30,153873,153874,153878,153888,153898,153902,153914,153918,153924,153952,153962,153988,153992,153997,154029],{"__ignoreMap":28},[33,153875,153876],{"class":35,"line":36},[33,153877,97874],{"class":39},[33,153879,153880,153882,153884,153886],{"class":35,"line":43},[33,153881,190],{"class":163},[33,153883,193],{"class":167},[33,153885,164],{"class":163},[33,153887,198],{"class":167},[33,153889,153890,153892,153894,153896],{"class":35,"line":61},[33,153891,164],{"class":163},[33,153893,492],{"class":167},[33,153895,495],{"class":163},[33,153897,498],{"class":167},[33,153899,153900],{"class":35,"line":73},[33,153901,92],{"emptyLinePlaceholder":91},[33,153903,153904,153906,153908,153910,153912],{"class":35,"line":88},[33,153905,126138],{"class":50},[33,153907,212],{"class":163},[33,153909,215],{"class":167},[33,153911,149709],{"class":54},[33,153913,221],{"class":167},[33,153915,153916],{"class":35,"line":95},[33,153917,92],{"emptyLinePlaceholder":91},[33,153919,153920,153922],{"class":35,"line":101},[33,153921,35574],{"class":163},[33,153923,574],{"class":167},[33,153925,153926,153928,153930,153932,153934,153936,153938,153940,153942,153944,153946,153948,153950],{"class":35,"line":171},[33,153927,4025],{"class":167},[33,153929,242],{"class":163},[33,153931,126254],{"class":167},[33,153933,126138],{"class":50},[33,153935,365],{"class":167},[33,153937,17371],{"class":238},[33,153939,242],{"class":163},[33,153941,140420],{"class":54},[33,153943,365],{"class":167},[33,153945,17351],{"class":238},[33,153947,242],{"class":163},[33,153949,17356],{"class":54},[33,153951,221],{"class":167},[33,153953,153954,153956,153958,153960],{"class":35,"line":179},[33,153955,35726],{"class":163},[33,153957,2945],{"class":50},[33,153959,1852],{"class":163},[33,153961,1855],{"class":167},[33,153963,153964,153966,153968,153970,153972,153974,153976,153978,153980,153982,153984,153986],{"class":35,"line":187},[33,153965,35742],{"class":163},[33,153967,16617
],{"class":50},[33,153969,602],{"class":167},[33,153971,4059],{"class":163},[33,153973,15677],{"class":54},[33,153975,1115],{"class":50},[33,153977,6565],{"class":167},[33,153979,1121],{"class":50},[33,153981,274],{"class":54},[33,153983,1649],{"class":167},[33,153985,190],{"class":163},[33,153987,20843],{"class":167},[33,153989,153990],{"class":35,"line":201},[33,153991,92],{"emptyLinePlaceholder":91},[33,153993,153994],{"class":35,"line":206},[33,153995,153996],{"class":39},"# Compute margin in Python rather than relying on formula caching\n",[33,153998,153999,154001,154003,154005,154007,154009,154011,154013,154015,154017,154019,154021,154023,154025,154027],{"class":35,"line":224},[33,154000,11038],{"class":167},[33,154002,19598],{"class":54},[33,154004,763],{"class":167},[33,154006,242],{"class":163},[33,154008,59771],{"class":167},[33,154010,12925],{"class":54},[33,154012,763],{"class":167},[33,154014,4126],{"class":163},[33,154016,7935],{"class":167},[33,154018,19593],{"class":54},[33,154020,8675],{"class":167},[33,154022,1351],{"class":163},[33,154024,7935],{"class":167},[33,154026,12925],{"class":54},[33,154028,9202],{"class":167},[33,154030,154031,154033,154036,154039,154041,154043,154046,154048,154050,154052],{"class":35,"line":229},[33,154032,13474],{"class":50},[33,154034,154035],{"class":167},"(df[[",[33,154037,154038],{"class":54},"\"Month\"",[33,154040,365],{"class":167},[33,154042,19598],{"class":54},[33,154044,154045],{"class":167},"]].to_string(",[33,154047,897],{"class":238},[33,154049,242],{"class":163},[33,154051,902],{"class":50},[33,154053,371],{"class":167},[2537,154055],{},[18,154057,52030],{"id":52029},[14,154059,154060],{},"After writing the file, assert its integrity before treating it as production output.",[23,154062,154064],{"className":126,"code":154063,"language":47,"meta":28,"style":28},"# pip install openpyxl\nfrom pathlib import Path\nimport openpyxl\n\nWORKBOOK = Path(\"sample_sales.xlsx\")\n\ntry:\n    wb = 
openpyxl.load_workbook(WORKBOOK, data_only=False)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"File not found: {exc}\") from exc\n\nws = wb[\"Sales\"]\n\n# 1. Check formula strings were stored (not overwritten with None)\nmargin_formula = ws[\"D2\"].value\nassert isinstance(margin_formula, str) and margin_formula.startswith(\"=\"), \\\n    f\"Expected formula in D2, got: {margin_formula!r}\"\n\n# 2. Check named ranges exist\nassert \"Revenue\" in wb.defined_names, \"Named range 'Revenue' missing\"\nassert \"Cost\"    in wb.defined_names, \"Named range 'Cost' missing\"\n\n# 3. Check at least one chart is embedded\nassert len(ws._charts) >= 1, \"No charts found in Sales sheet\"\n\n# 4. Check freeze pane\nassert ws.freeze_panes == \"A2\", f\"Unexpected freeze_panes: {ws.freeze_panes!r}\"\n\nprint(\"All checks passed.\")\n",[30,154065,154066,154070,154080,154086,154090,154102,154106,154112,154132,154142,154168,154172,154184,154188,154193,154207,154229,154247,154251,154256,154271,154285,154289,154294,154312,154316,154321,154350,154354],{"__ignoreMap":28},[33,154067,154068],{"class":35,"line":36},[33,154069,98209],{"class":39},[33,154071,154072,154074,154076,154078],{"class":35,"line":43},[33,154073,190],{"class":163},[33,154075,193],{"class":167},[33,154077,164],{"class":163},[33,154079,198],{"class":167},[33,154081,154082,154084],{"class":35,"line":61},[33,154083,164],{"class":163},[33,154085,95887],{"class":167},[33,154087,154088],{"class":35,"line":73},[33,154089,92],{"emptyLinePlaceholder":91},[33,154091,154092,154094,154096,154098,154100],{"class":35,"line":88},[33,154093,126138],{"class":50},[33,154095,212],{"class":163},[33,154097,215],{"class":167},[33,154099,149709],{"class":54},[33,154101,221],{"class":167},[33,154103,154104],{"class":35,"line":95},[33,154105,92],{"emptyLinePlaceholder":91},[33,154107,154108,154110],{"class":35,"line":101},[33,154109,35574],{"class":163},[33,154111,574],{"class":167},[33,154113,154114,154116,154118,154120,154122,1
54124,154126,154128,154130],{"class":35,"line":171},[33,154115,17432],{"class":167},[33,154117,242],{"class":163},[33,154119,144711],{"class":167},[33,154121,126138],{"class":50},[33,154123,365],{"class":167},[33,154125,144726],{"class":238},[33,154127,242],{"class":163},[33,154129,902],{"class":50},[33,154131,221],{"class":167},[33,154133,154134,154136,154138,154140],{"class":35,"line":179},[33,154135,35726],{"class":163},[33,154137,2945],{"class":50},[33,154139,1852],{"class":163},[33,154141,1855],{"class":167},[33,154143,154144,154146,154148,154150,154152,154154,154156,154158,154160,154162,154164,154166],{"class":35,"line":187},[33,154145,35742],{"class":163},[33,154147,16617],{"class":50},[33,154149,602],{"class":167},[33,154151,4059],{"class":163},[33,154153,15677],{"class":54},[33,154155,1115],{"class":50},[33,154157,6565],{"class":167},[33,154159,1121],{"class":50},[33,154161,274],{"class":54},[33,154163,1649],{"class":167},[33,154165,190],{"class":163},[33,154167,20843],{"class":167},[33,154169,154170],{"class":35,"line":201},[33,154171,92],{"emptyLinePlaceholder":91},[33,154173,154174,154176,154178,154180,154182],{"class":35,"line":206},[33,154175,98330],{"class":167},[33,154177,242],{"class":163},[33,154179,17447],{"class":167},[33,154181,140420],{"class":54},[33,154183,9202],{"class":167},[33,154185,154186],{"class":35,"line":224},[33,154187,92],{"emptyLinePlaceholder":91},[33,154189,154190],{"class":35,"line":229},[33,154191,154192],{"class":39},"# 1. 
Check formula strings were stored (not overwritten with None)\n",[33,154194,154195,154198,154200,154202,154205],{"class":35,"line":235},[33,154196,154197],{"class":167},"margin_formula ",[33,154199,242],{"class":163},[33,154201,17472],{"class":167},[33,154203,154204],{"class":54},"\"D2\"",[33,154206,147361],{"class":167},[33,154208,154209,154211,154213,154216,154218,154220,154222,154225,154227],{"class":35,"line":250},[33,154210,36397],{"class":163},[33,154212,36538],{"class":50},[33,154214,154215],{"class":167},"(margin_formula, ",[33,154217,1053],{"class":50},[33,154219,1649],{"class":167},[33,154221,6001],{"class":163},[33,154223,154224],{"class":167}," margin_formula.startswith(",[33,154226,147706],{"class":54},[33,154228,119985],{"class":167},[33,154230,154231,154233,154236,154238,154241,154243,154245],{"class":35,"line":266},[33,154232,112430],{"class":163},[33,154234,154235],{"class":54},"\"Expected formula in D2, got: ",[33,154237,1115],{"class":50},[33,154239,154240],{"class":167},"margin_formula",[33,154242,76954],{"class":163},[33,154244,1121],{"class":50},[33,154246,7504],{"class":54},[33,154248,154249],{"class":35,"line":290},[33,154250,92],{"emptyLinePlaceholder":91},[33,154252,154253],{"class":35,"line":295},[33,154254,154255],{"class":39},"# 2. 
Check named ranges exist\n",[33,154257,154258,154260,154263,154265,154268],{"class":35,"line":300},[33,154259,36397],{"class":163},[33,154261,154262],{"class":54}," \"Revenue\"",[33,154264,8002],{"class":163},[33,154266,154267],{"class":167}," wb.defined_names, ",[33,154269,154270],{"class":54},"\"Named range 'Revenue' missing\"\n",[33,154272,154273,154275,154278,154280,154282],{"class":35,"line":317},[33,154274,36397],{"class":163},[33,154276,154277],{"class":54}," \"Cost\"",[33,154279,98316],{"class":163},[33,154281,154267],{"class":167},[33,154283,154284],{"class":54},"\"Named range 'Cost' missing\"\n",[33,154286,154287],{"class":35,"line":332},[33,154288,92],{"emptyLinePlaceholder":91},[33,154290,154291],{"class":35,"line":347},[33,154292,154293],{"class":39},"# 3. Check at least one chart is embedded\n",[33,154295,154296,154298,154300,154303,154305,154307,154309],{"class":35,"line":374},[33,154297,36397],{"class":163},[33,154299,4037],{"class":50},[33,154301,154302],{"class":167},"(ws._charts) ",[33,154304,43000],{"class":163},[33,154306,1814],{"class":50},[33,154308,365],{"class":167},[33,154310,154311],{"class":54},"\"No charts found in Sales sheet\"\n",[33,154313,154314],{"class":35,"line":397},[33,154315,92],{"emptyLinePlaceholder":91},[33,154317,154318],{"class":35,"line":653},[33,154319,154320],{"class":39},"# 4. 
Check freeze pane\n",[33,154322,154323,154325,154328,154330,154332,154334,154336,154339,154341,154344,154346,154348],{"class":35,"line":667},[33,154324,36397],{"class":163},[33,154326,154327],{"class":167}," ws.freeze_panes ",[33,154329,1865],{"class":163},[33,154331,100145],{"class":54},[33,154333,365],{"class":167},[33,154335,4059],{"class":163},[33,154337,154338],{"class":54},"\"Unexpected freeze_panes: ",[33,154340,1115],{"class":50},[33,154342,154343],{"class":167},"ws.freeze_panes",[33,154345,76954],{"class":163},[33,154347,1121],{"class":50},[33,154349,7504],{"class":54},[33,154351,154352],{"class":35,"line":675},[33,154353,92],{"emptyLinePlaceholder":91},[33,154355,154356,154358,154360,154363],{"class":35,"line":689},[33,154357,13474],{"class":50},[33,154359,602],{"class":167},[33,154361,154362],{"class":54},"\"All checks passed.\"",[33,154364,221],{"class":167},[2537,154366],{},[18,154368,62712],{"id":4208},[4211,154370,154371,154384,154393,154402],{},[4214,154372,154373,154376,154377,8877,154380,154383],{},[1974,154374,154375],{},"Large data sets",": for sheets with 50 000+ rows, use ",[30,154378,154379],{},"openpyxl.Workbook(write_only=True)",[30,154381,154382],{},"ws.append()"," — it streams rows without loading the whole workbook into memory.",[4214,154385,154386,154389,154390,154392],{},[1974,154387,154388],{},"Write-only mode"," cannot embed charts or apply styles after rows are written; write data first, then open with a regular ",[30,154391,22404],{}," pass to add formatting and charts.",[4214,154394,154395,154398,154399,154401],{},[1974,154396,154397],{},"Formula recalculation",": openpyxl sets ",[30,154400,149056],{}," to request full recalculation on next open. If your workflow requires pre-calculated values (e.g., for CI pipelines), compute in pandas and write literal values — no Excel required.",[4214,154403,154404,154406],{},[1974,154405,145999],{},": embedded charts add ~10–30 KB per chart. 
Hundreds of charts in one workbook can push file sizes above 10 MB; consider splitting into multiple workbooks.",[2537,154408],{},[18,154410,4271],{"id":4270},[4273,154412,154413,154423],{},[4276,154414,154415],{},[4279,154416,154417,154419,154421],{},[4282,154418,4284],{},[4282,154420,4287],{},[4282,154422,4290],{},[4292,154424,154425,154444,154461,154480,154494,154505],{},[4279,154426,154427,154432,154438],{},[4297,154428,154429,154430],{},"Formula cell reads back as ",[30,154431,571],{},[4297,154433,154434,154435,154437],{},"Opened with ",[30,154436,105730],{},"; no cached value yet",[4297,154439,154440,154441,154443],{},"Read with ",[30,154442,107326],{}," to get formula string, or open once in Excel to cache values",[4279,154445,154446,154451,154454],{},[4297,154447,154448],{},[30,154449,154450],{},"AttributeError: 'NoneType' object has no attribute 'value'",[4297,154452,154453],{},"Cell reference outside used range",[4297,154455,67848,154456,1351,154458,154460],{},[30,154457,22493],{},[30,154459,149835],{}," before iterating",[4279,154462,154463,154466,154474],{},[4297,154464,154465],{},"Chart renders but shows no data",[4297,154467,154468,154470,154471,154473],{},[30,154469,149557],{}," row\u002Fcolumn bounds wrong or ",[30,154472,151475],{}," mismatch",[4297,154475,133514,154476,154479],{},[30,154477,154478],{},"ws.cell(min_row, min_col).value"," to verify bounds",[4279,154481,154482,154488,154491],{},[4297,154483,154484,154487],{},[30,154485,154486],{},"InvalidFileException"," on load",[4297,154489,154490],{},"File opened\u002Flocked by Excel",[4297,154492,154493],{},"Close the file in Excel, then re-run",[4279,154495,154496,154499,154502],{},[4297,154497,154498],{},"Named range not visible in Excel Name Box",[4297,154500,154501],{},"Scope set to workbook but name conflicts with sheet-level name",[4297,154503,154504],{},"Delete conflicting sheet-level name in Excel Name 
Manager",[4279,154506,154507,154511,154514],{},[4297,154508,154509,105821],{},[30,154510,68035],{},[4297,154512,154513],{},"File is open in Excel",[4297,154515,154516],{},"Close Excel first, or save to a temp path then replace",[2537,154518],{},[18,154520,62848],{"id":4401},[23,154522,154524],{"className":126,"code":154523,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n\"\"\"\nWrite formulas, named ranges, number formats, and charts into a .xlsx file.\nUsage: python excel_formulas_charts.py [--input INPUT] [--output OUTPUT]\n\"\"\"\n# pip install openpyxl\n\nimport argparse\nfrom pathlib import Path\n\nimport openpyxl\nfrom openpyxl.chart import BarChart, LineChart, Reference\nfrom openpyxl.styles import Font, PatternFill, Alignment\nfrom openpyxl.workbook.defined_name import DefinedName\nfrom openpyxl.utils import quote_sheetname, absolute_coordinate\n\n\ndef build_sample_workbook(path: Path) -> None:\n    wb = openpyxl.Workbook()\n    ws = wb.active\n    ws.title = \"Sales\"\n    ws.append([\"Month\", \"Revenue\", \"Cost\"])\n    for month, rev, cost in [\n        (\"Jan\", 12000, 8000), (\"Feb\", 15000, 9500), (\"Mar\", 13500, 8800),\n        (\"Apr\", 17000, 10200), (\"May\", 19500, 11000), (\"Jun\", 22000, 12500),\n    ]:\n        ws.append([month, rev, cost])\n    wb.save(path)\n\n\ndef apply_formulas(ws) -> None:\n    last = ws.max_row\n    ws[\"D1\"] = \"Margin\"\n    for row in range(2, last + 1):\n        ws[f\"D{row}\"] = f\"=(B{row}-C{row})\u002FB{row}\"\n        ws[f\"D{row}\"].number_format = \"0.0%\"\n        ws[f\"B{row}\"].number_format = '\"$\"#,##0'\n        ws[f\"C{row}\"].number_format = '\"$\"#,##0'\n    summary = last + 2\n    ws[f\"A{summary}\"] = \"Total \u002F Avg\"\n    ws[f\"B{summary}\"] = f\"=SUM(B2:B{last})\"\n    ws[f\"C{summary}\"] = f\"=SUM(C2:C{last})\"\n    ws[f\"D{summary}\"] = f\"=AVERAGE(D2:D{last})\"\n    ws[f\"B{summary}\"].number_format = '\"$\"#,##0'\n    ws[f\"C{summary}\"].number_format = 
'\"$\"#,##0'\n    ws[f\"D{summary}\"].number_format = \"0.0%\"\n\n\ndef apply_named_ranges(wb, ws) -> None:\n    last = ws.max_row\n    sheet_ref = quote_sheetname(ws.title)\n    wb.defined_names[\"Revenue\"] = DefinedName(\n        \"Revenue\", attr_text=f\"{sheet_ref}!{absolute_coordinate(f'B2:B{last}')}\"\n    )\n    wb.defined_names[\"Cost\"] = DefinedName(\n        \"Cost\", attr_text=f\"{sheet_ref}!{absolute_coordinate(f'C2:C{last}')}\"\n    )\n\n\ndef style_headers(ws) -> None:\n    for col in range(1, ws.max_column + 1):\n        cell = ws.cell(row=1, column=col)\n        cell.font = Font(bold=True, color=\"FFFFFF\", size=12)\n        cell.fill = PatternFill(\"solid\", fgColor=\"2563EB\")\n        cell.alignment = Alignment(horizontal=\"center\")\n    ws.freeze_panes = \"A2\"\n\n\ndef add_charts(ws) -> None:\n    last = ws.max_row\n    bar = BarChart()\n    bar.title = \"Revenue vs Cost\"\n    bar.type = \"col\"\n    bar.add_data(Reference(ws, min_col=2, max_col=3, min_row=1, max_row=last),\n                 titles_from_data=True)\n    bar.set_categories(Reference(ws, min_col=1, min_row=2, max_row=last))\n    bar.width, bar.height = 18, 10\n    ws.add_chart(bar, \"F2\")\n\n    line = LineChart()\n    line.title = \"Margin Trend\"\n    line.smooth = True\n    line.add_data(Reference(ws, min_col=4, min_row=1, max_row=last),\n                  titles_from_data=True)\n    line.set_categories(Reference(ws, min_col=1, min_row=2, max_row=last))\n    line.width, line.height = 18, 10\n    ws.add_chart(line, \"F22\")\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Write formulas and charts to Excel.\")\n    parser.add_argument(\"--input\",  default=\"sample_sales.xlsx\",\n                        help=\"Source .xlsx file (created if absent)\")\n    parser.add_argument(\"--output\", default=None,\n                        help=\"Destination path (default: overwrite input)\")\n    args = parser.parse_args()\n\n    src  = Path(args.input)\n    
dest = Path(args.output) if args.output else src\n\n    if not src.exists():\n        print(f\"Creating sample workbook at {src}\")\n        build_sample_workbook(src)\n\n    try:\n        wb = openpyxl.load_workbook(src)\n    except Exception as exc:\n        raise SystemExit(f\"Cannot open {src}: {exc}\") from exc\n\n    ws = wb[\"Sales\"]\n    apply_formulas(ws)\n    apply_named_ranges(wb, ws)\n    style_headers(ws)\n    add_charts(ws)\n\n    wb.save(dest)\n    print(f\"Written: {dest}\")\n\n\nif __name__ == \"__main__\":\n    main()\n",[30,154525,154526,154530,154534,154539,154544,154548,154552,154556,154562,154572,154576,154582,154592,154602,154612,154622,154626,154630,154643,154651,154659,154668,154685,154696,154738,154787,154792,154797,154801,154805,154809,154823,154832,154844,154867,154915,154937,154959,154981,154994,155017,155050,155082,155114,155136,155158,155180,155184,155188,155202,155210,155219,155233,155280,155284,155296,155342,155346,155350,155354,155367,155389,155412,155444,155464,155480,155488,155492,155496,155509,155517,155526,155535,155545,155581,155592,155620,155633,155642,155646,155655,155664,155673,155700,155711,155738,155751,155760,155764,155768,155780,155797,155813,155824,155840,155851,155859,155863,155872,155891,155895,155904,155926,155931,155935,155941,155950,155960,155994,155998,156010,156015,156020,156025,156030,156034,156039,156059,156063,156067,156079],{"__ignoreMap":28},[33,154527,154528],{"class":35,"line":36},[33,154529,14447],{"class":39},[33,154531,154532],{"class":35,"line":43},[33,154533,139],{"class":54},[33,154535,154536],{"class":35,"line":61},[33,154537,154538],{"class":54},"Write formulas, named ranges, number formats, and charts into a .xlsx file.\n",[33,154540,154541],{"class":35,"line":73},[33,154542,154543],{"class":54},"Usage: python excel_formulas_charts.py [--input INPUT] [--output 
OUTPUT]\n",[33,154545,154546],{"class":35,"line":88},[33,154547,139],{"class":54},[33,154549,154550],{"class":35,"line":95},[33,154551,98209],{"class":39},[33,154553,154554],{"class":35,"line":101},[33,154555,92],{"emptyLinePlaceholder":91},[33,154557,154558,154560],{"class":35,"line":171},[33,154559,164],{"class":163},[33,154561,4461],{"class":167},[33,154563,154564,154566,154568,154570],{"class":35,"line":179},[33,154565,190],{"class":163},[33,154567,193],{"class":167},[33,154569,164],{"class":163},[33,154571,198],{"class":167},[33,154573,154574],{"class":35,"line":187},[33,154575,92],{"emptyLinePlaceholder":91},[33,154577,154578,154580],{"class":35,"line":201},[33,154579,164],{"class":163},[33,154581,95887],{"class":167},[33,154583,154584,154586,154588,154590],{"class":35,"line":206},[33,154585,190],{"class":163},[33,154587,151240],{"class":167},[33,154589,164],{"class":163},[33,154591,153184],{"class":167},[33,154593,154594,154596,154598,154600],{"class":35,"line":224},[33,154595,190],{"class":163},[33,154597,17115],{"class":167},[33,154599,164],{"class":163},[33,154601,22631],{"class":167},[33,154603,154604,154606,154608,154610],{"class":35,"line":229},[33,154605,190],{"class":163},[33,154607,150369],{"class":167},[33,154609,164],{"class":163},[33,154611,150374],{"class":167},[33,154613,154614,154616,154618,154620],{"class":35,"line":235},[33,154615,190],{"class":163},[33,154617,17127],{"class":167},[33,154619,164],{"class":163},[33,154621,150385],{"class":167},[33,154623,154624],{"class":35,"line":250},[33,154625,92],{"emptyLinePlaceholder":91},[33,154627,154628],{"class":35,"line":266},[33,154629,92],{"emptyLinePlaceholder":91},[33,154631,154632,154634,154637,154639,154641],{"class":35,"line":290},[33,154633,562],{"class":163},[33,154635,154636],{"class":46}," 
build_sample_workbook",[33,154638,3743],{"class":167},[33,154640,571],{"class":50},[33,154642,574],{"class":167},[33,154644,154645,154647,154649],{"class":35,"line":295},[33,154646,17432],{"class":167},[33,154648,242],{"class":163},[33,154650,139974],{"class":167},[33,154652,154653,154655,154657],{"class":35,"line":300},[33,154654,17442],{"class":167},[33,154656,242],{"class":163},[33,154658,99877],{"class":167},[33,154660,154661,154664,154666],{"class":35,"line":317},[33,154662,154663],{"class":167},"    ws.title ",[33,154665,242],{"class":163},[33,154667,139992],{"class":54},[33,154669,154670,154673,154675,154677,154679,154681,154683],{"class":35,"line":332},[33,154671,154672],{"class":167},"    ws.append([",[33,154674,154038],{"class":54},[33,154676,365],{"class":167},[33,154678,12925],{"class":54},[33,154680,365],{"class":167},[33,154682,19593],{"class":54},[33,154684,751],{"class":167},[33,154686,154687,154689,154692,154694],{"class":35,"line":347},[33,154688,656],{"class":163},[33,154690,154691],{"class":167}," month, rev, cost 
",[33,154693,662],{"class":163},[33,154695,7473],{"class":167},[33,154697,154698,154700,154702,154704,154706,154708,154710,154712,154714,154716,154718,154720,154722,154724,154726,154728,154731,154733,154736],{"class":35,"line":374},[33,154699,19819],{"class":167},[33,154701,11790],{"class":54},[33,154703,365],{"class":167},[33,154705,101426],{"class":50},[33,154707,365],{"class":167},[33,154709,135748],{"class":50},[33,154711,19834],{"class":167},[33,154713,11795],{"class":54},[33,154715,365],{"class":167},[33,154717,100887],{"class":50},[33,154719,365],{"class":167},[33,154721,134095],{"class":50},[33,154723,19834],{"class":167},[33,154725,11800],{"class":54},[33,154727,365],{"class":167},[33,154729,154730],{"class":50},"13500",[33,154732,365],{"class":167},[33,154734,154735],{"class":50},"8800",[33,154737,1506],{"class":167},[33,154739,154740,154742,154745,154747,154750,154752,154755,154757,154760,154762,154765,154767,154770,154772,154775,154777,154780,154782,154785],{"class":35,"line":397},[33,154741,19819],{"class":167},[33,154743,154744],{"class":54},"\"Apr\"",[33,154746,365],{"class":167},[33,154748,154749],{"class":50},"17000",[33,154751,365],{"class":167},[33,154753,154754],{"class":50},"10200",[33,154756,19834],{"class":167},[33,154758,154759],{"class":54},"\"May\"",[33,154761,365],{"class":167},[33,154763,154764],{"class":50},"19500",[33,154766,365],{"class":167},[33,154768,154769],{"class":50},"11000",[33,154771,19834],{"class":167},[33,154773,154774],{"class":54},"\"Jun\"",[33,154776,365],{"class":167},[33,154778,154779],{"class":50},"22000",[33,154781,365],{"class":167},[33,154783,154784],{"class":50},"12500",[33,154786,1506],{"class":167},[33,154788,154789],{"class":35,"line":653},[33,154790,154791],{"class":167},"    ]:\n",[33,154793,154794],{"class":35,"line":667},[33,154795,154796],{"class":167},"        ws.append([month, rev, 
cost])\n",[33,154798,154799],{"class":35,"line":675},[33,154800,17945],{"class":167},[33,154802,154803],{"class":35,"line":689},[33,154804,92],{"emptyLinePlaceholder":91},[33,154806,154807],{"class":35,"line":703},[33,154808,92],{"emptyLinePlaceholder":91},[33,154810,154811,154813,154816,154819,154821],{"class":35,"line":714},[33,154812,562],{"class":163},[33,154814,154815],{"class":46}," apply_formulas",[33,154817,154818],{"class":167},"(ws) -> ",[33,154820,571],{"class":50},[33,154822,574],{"class":167},[33,154824,154825,154828,154830],{"class":35,"line":723},[33,154826,154827],{"class":167},"    last ",[33,154829,242],{"class":163},[33,154831,153284],{"class":167},[33,154833,154834,154836,154838,154840,154842],{"class":35,"line":754},[33,154835,99896],{"class":167},[33,154837,150109],{"class":54},[33,154839,763],{"class":167},[33,154841,242],{"class":163},[33,154843,150116],{"class":54},[33,154845,154846,154848,154850,154852,154854,154856,154858,154861,154863,154865],{"class":35,"line":771},[33,154847,656],{"class":163},[33,154849,3844],{"class":167},[33,154851,662],{"class":163},[33,154853,1801],{"class":50},[33,154855,602],{"class":167},[33,154857,1533],{"class":50},[33,154859,154860],{"class":167},", last 
",[33,154862,1811],{"class":163},[33,154864,1814],{"class":50},[33,154866,1737],{"class":167},[33,154868,154869,154871,154873,154875,154877,154879,154881,154883,154885,154887,154889,154891,154893,154895,154897,154899,154901,154903,154905,154907,154909,154911,154913],{"class":35,"line":777},[33,154870,150927],{"class":167},[33,154872,4059],{"class":163},[33,154874,148152],{"class":54},[33,154876,1115],{"class":50},[33,154878,98107],{"class":167},[33,154880,1121],{"class":50},[33,154882,274],{"class":54},[33,154884,763],{"class":167},[33,154886,242],{"class":163},[33,154888,1110],{"class":163},[33,154890,150164],{"class":54},[33,154892,1115],{"class":50},[33,154894,98107],{"class":167},[33,154896,1121],{"class":50},[33,154898,150173],{"class":54},[33,154900,1115],{"class":50},[33,154902,98107],{"class":167},[33,154904,1121],{"class":50},[33,154906,150182],{"class":54},[33,154908,1115],{"class":50},[33,154910,98107],{"class":167},[33,154912,1121],{"class":50},[33,154914,7504],{"class":54},[33,154916,154917,154919,154921,154923,154925,154927,154929,154931,154933,154935],{"class":35,"line":788},[33,154918,150927],{"class":167},[33,154920,4059],{"class":163},[33,154922,148152],{"class":54},[33,154924,1115],{"class":50},[33,154926,98107],{"class":167},[33,154928,1121],{"class":50},[33,154930,274],{"class":54},[33,154932,148186],{"class":167},[33,154934,242],{"class":163},[33,154936,148236],{"class":54},[33,154938,154939,154941,154943,154945,154947,154949,154951,154953,154955,154957],{"class":35,"line":804},[33,154940,150927],{"class":167},[33,154942,4059],{"class":163},[33,154944,148101],{"class":54},[33,154946,1115],{"class":50},[33,154948,98107],{"class":167},[33,154950,1121],{"class":50},[33,154952,274],{"class":54},[33,154954,148186],{"class":167},[33,154956,242],{"class":163},[33,154958,148191],{"class":54},[33,154960,154961,154963,154965,154967,154969,154971,154973,154975,154977,154979],{"class":35,"line":809},[33,154962,150927],{"class":167},[33,154964,4059],{"class
":163},[33,154966,148128],{"class":54},[33,154968,1115],{"class":50},[33,154970,98107],{"class":167},[33,154972,1121],{"class":50},[33,154974,274],{"class":54},[33,154976,148186],{"class":167},[33,154978,242],{"class":163},[33,154980,148191],{"class":54},[33,154982,154983,154985,154987,154990,154992],{"class":35,"line":819},[33,154984,18382],{"class":167},[33,154986,242],{"class":163},[33,154988,154989],{"class":167}," last ",[33,154991,1811],{"class":163},[33,154993,97531],{"class":50},[33,154995,154996,154998,155000,155002,155004,155007,155009,155011,155013,155015],{"class":35,"line":829},[33,154997,99896],{"class":167},[33,154999,4059],{"class":163},[33,155001,148076],{"class":54},[33,155003,1115],{"class":50},[33,155005,155006],{"class":167},"summary",[33,155008,1121],{"class":50},[33,155010,274],{"class":54},[33,155012,763],{"class":167},[33,155014,242],{"class":163},[33,155016,148092],{"class":54},[33,155018,155019,155021,155023,155025,155027,155029,155031,155033,155035,155037,155039,155041,155043,155046,155048],{"class":35,"line":834},[33,155020,99896],{"class":167},[33,155022,4059],{"class":163},[33,155024,148101],{"class":54},[33,155026,1115],{"class":50},[33,155028,155006],{"class":167},[33,155030,1121],{"class":50},[33,155032,274],{"class":54},[33,155034,763],{"class":167},[33,155036,242],{"class":163},[33,155038,1110],{"class":163},[33,155040,106716],{"class":54},[33,155042,1115],{"class":50},[33,155044,155045],{"class":167},"last",[33,155047,1121],{"class":50},[33,155049,17841],{"class":54},[33,155051,155052,155054,155056,155058,155060,155062,155064,155066,155068,155070,155072,155074,155076,155078,155080],{"class":35,"line":839},[33,155053,99896],{"class":167},[33,155055,4059],{"class":163},[33,155057,148128],{"class":54},[33,155059,1115],{"class":50},[33,155061,155006],{"class":167},[33,155063,1121],{"class":50},[33,155065,274],{"class":54},[33,155067,763],{"class":167},[33,155069,242],{"class":163},[33,155071,1110],{"class":163},[33,155073,150085],{"c
lass":54},[33,155075,1115],{"class":50},[33,155077,155045],{"class":167},[33,155079,1121],{"class":50},[33,155081,17841],{"class":54},[33,155083,155084,155086,155088,155090,155092,155094,155096,155098,155100,155102,155104,155106,155108,155110,155112],{"class":35,"line":860},[33,155085,99896],{"class":167},[33,155087,4059],{"class":163},[33,155089,148152],{"class":54},[33,155091,1115],{"class":50},[33,155093,155006],{"class":167},[33,155095,1121],{"class":50},[33,155097,274],{"class":54},[33,155099,763],{"class":167},[33,155101,242],{"class":163},[33,155103,1110],{"class":163},[33,155105,150224],{"class":54},[33,155107,1115],{"class":50},[33,155109,155045],{"class":167},[33,155111,1121],{"class":50},[33,155113,17841],{"class":54},[33,155115,155116,155118,155120,155122,155124,155126,155128,155130,155132,155134],{"class":35,"line":887},[33,155117,99896],{"class":167},[33,155119,4059],{"class":163},[33,155121,148101],{"class":54},[33,155123,1115],{"class":50},[33,155125,155006],{"class":167},[33,155127,1121],{"class":50},[33,155129,274],{"class":54},[33,155131,148186],{"class":167},[33,155133,242],{"class":163},[33,155135,148191],{"class":54},[33,155137,155138,155140,155142,155144,155146,155148,155150,155152,155154,155156],{"class":35,"line":907},[33,155139,99896],{"class":167},[33,155141,4059],{"class":163},[33,155143,148128],{"class":54},[33,155145,1115],{"class":50},[33,155147,155006],{"class":167},[33,155149,1121],{"class":50},[33,155151,274],{"class":54},[33,155153,148186],{"class":167},[33,155155,242],{"class":163},[33,155157,148191],{"class":54},[33,155159,155160,155162,155164,155166,155168,155170,155172,155174,155176,155178],{"class":35,"line":1826},[33,155161,99896],{"class":167},[33,155163,4059],{"class":163},[33,155165,148152],{"class":54},[33,155167,1115],{"class":50},[33,155169,155006],{"class":167},[33,155171,1121],{"class":50},[33,155173,274],{"class":54},[33,155175,148186],{"class":167},[33,155177,242],{"class":163},[33,155179,148236],{"class":54},[33,15
5181,155182],{"class":35,"line":1844},[33,155183,92],{"emptyLinePlaceholder":91},[33,155185,155186],{"class":35,"line":1858},[33,155187,92],{"emptyLinePlaceholder":91},[33,155189,155190,155192,155195,155198,155200],{"class":35,"line":1871},[33,155191,562],{"class":163},[33,155193,155194],{"class":46}," apply_named_ranges",[33,155196,155197],{"class":167},"(wb, ws) -> ",[33,155199,571],{"class":50},[33,155201,574],{"class":167},[33,155203,155204,155206,155208],{"class":35,"line":1877},[33,155205,154827],{"class":167},[33,155207,242],{"class":163},[33,155209,153284],{"class":167},[33,155211,155212,155215,155217],{"class":35,"line":1883},[33,155213,155214],{"class":167},"    sheet_ref ",[33,155216,242],{"class":163},[33,155218,150506],{"class":167},[33,155220,155221,155224,155226,155228,155230],{"class":35,"line":1915},[33,155222,155223],{"class":167},"    wb.defined_names[",[33,155225,12925],{"class":54},[33,155227,763],{"class":167},[33,155229,242],{"class":163},[33,155231,155232],{"class":167}," 
DefinedName(\n",[33,155234,155235,155237,155239,155241,155243,155245,155247,155249,155251,155253,155255,155257,155260,155262,155265,155267,155269,155271,155274,155276,155278],{"class":35,"line":1926},[33,155236,11815],{"class":54},[33,155238,365],{"class":167},[33,155240,150578],{"class":238},[33,155242,242],{"class":163},[33,155244,4059],{"class":163},[33,155246,274],{"class":54},[33,155248,1115],{"class":50},[33,155250,150589],{"class":167},[33,155252,1121],{"class":50},[33,155254,150594],{"class":54},[33,155256,1115],{"class":50},[33,155258,155259],{"class":167},"absolute_coordinate(",[33,155261,4059],{"class":163},[33,155263,155264],{"class":54},"'B2:B",[33,155266,1115],{"class":50},[33,155268,155045],{"class":167},[33,155270,1121],{"class":50},[33,155272,155273],{"class":54},"'",[33,155275,12027],{"class":167},[33,155277,1121],{"class":50},[33,155279,7504],{"class":54},[33,155281,155282],{"class":35,"line":1932},[33,155283,1202],{"class":167},[33,155285,155286,155288,155290,155292,155294],{"class":35,"line":1938},[33,155287,155223],{"class":167},[33,155289,19593],{"class":54},[33,155291,763],{"class":167},[33,155293,242],{"class":163},[33,155295,155232],{"class":167},[33,155297,155298,155301,155303,155305,155307,155309,155311,155313,155315,155317,155319,155321,155323,155325,155328,155330,155332,155334,155336,155338,155340],{"class":35,"line":1950},[33,155299,155300],{"class":54},"        
\"Cost\"",[33,155302,365],{"class":167},[33,155304,150578],{"class":238},[33,155306,242],{"class":163},[33,155308,4059],{"class":163},[33,155310,274],{"class":54},[33,155312,1115],{"class":50},[33,155314,150589],{"class":167},[33,155316,1121],{"class":50},[33,155318,150594],{"class":54},[33,155320,1115],{"class":50},[33,155322,155259],{"class":167},[33,155324,4059],{"class":163},[33,155326,155327],{"class":54},"'C2:C",[33,155329,1115],{"class":50},[33,155331,155045],{"class":167},[33,155333,1121],{"class":50},[33,155335,155273],{"class":54},[33,155337,12027],{"class":167},[33,155339,1121],{"class":50},[33,155341,7504],{"class":54},[33,155343,155344],{"class":35,"line":1958},[33,155345,1202],{"class":167},[33,155347,155348],{"class":35,"line":4904},[33,155349,92],{"emptyLinePlaceholder":91},[33,155351,155352],{"class":35,"line":4909},[33,155353,92],{"emptyLinePlaceholder":91},[33,155355,155356,155358,155361,155363,155365],{"class":35,"line":4915},[33,155357,562],{"class":163},[33,155359,155360],{"class":46}," 
style_headers",[33,155362,154818],{"class":167},[33,155364,571],{"class":50},[33,155366,574],{"class":167},[33,155368,155369,155371,155373,155375,155377,155379,155381,155383,155385,155387],{"class":35,"line":4925},[33,155370,656],{"class":163},[33,155372,7985],{"class":167},[33,155374,662],{"class":163},[33,155376,1801],{"class":50},[33,155378,602],{"class":167},[33,155380,734],{"class":50},[33,155382,17559],{"class":167},[33,155384,1811],{"class":163},[33,155386,1814],{"class":50},[33,155388,1737],{"class":167},[33,155390,155391,155393,155395,155397,155399,155401,155403,155405,155407,155409],{"class":35,"line":4935},[33,155392,17776],{"class":167},[33,155394,242],{"class":163},[33,155396,17573],{"class":167},[33,155398,98107],{"class":238},[33,155400,242],{"class":163},[33,155402,734],{"class":50},[33,155404,365],{"class":167},[33,155406,98115],{"class":238},[33,155408,242],{"class":163},[33,155410,155411],{"class":167},"col)\n",[33,155413,155414,155416,155418,155420,155422,155424,155426,155428,155430,155432,155434,155436,155438,155440,155442],{"class":35,"line":4941},[33,155415,17482],{"class":167},[33,155417,242],{"class":163},[33,155419,17233],{"class":167},[33,155421,17236],{"class":238},[33,155423,242],{"class":163},[33,155425,855],{"class":50},[33,155427,365],{"class":167},[33,155429,17245],{"class":238},[33,155431,242],{"class":163},[33,155433,17250],{"class":54},[33,155435,365],{"class":167},[33,155437,17255],{"class":238},[33,155439,242],{"class":163},[33,155441,55650],{"class":50},[33,155443,221],{"class":167},[33,155445,155446,155448,155450,155452,155454,155456,155458,155460,155462],{"class":35,"line":4950},[33,155447,17492],{"class":167},[33,155449,242],{"class":163},[33,155451,17185],{"class":167},[33,155453,17188],{"class":54},[33,155455,365],{"class":167},[33,155457,17193],{"class":238},[33,155459,242],{"class":163},[33,155461,17198],{"class":54},[33,155463,221],{"class":167},[33,155465,155466,155468,155470,155472,155474,155476,155478],{"class":35,"l
ine":4960},[33,155467,17502],{"class":167},[33,155469,242],{"class":163},[33,155471,17507],{"class":167},[33,155473,17510],{"class":238},[33,155475,242],{"class":163},[33,155477,17515],{"class":54},[33,155479,221],{"class":167},[33,155481,155482,155484,155486],{"class":35,"line":4965},[33,155483,100140],{"class":167},[33,155485,242],{"class":163},[33,155487,97243],{"class":54},[33,155489,155490],{"class":35,"line":4971},[33,155491,92],{"emptyLinePlaceholder":91},[33,155493,155494],{"class":35,"line":4983},[33,155495,92],{"emptyLinePlaceholder":91},[33,155497,155498,155500,155503,155505,155507],{"class":35,"line":4988},[33,155499,562],{"class":163},[33,155501,155502],{"class":46}," add_charts",[33,155504,154818],{"class":167},[33,155506,571],{"class":50},[33,155508,574],{"class":167},[33,155510,155511,155513,155515],{"class":35,"line":4993},[33,155512,154827],{"class":167},[33,155514,242],{"class":163},[33,155516,153284],{"class":167},[33,155518,155519,155522,155524],{"class":35,"line":5003},[33,155520,155521],{"class":167},"    bar ",[33,155523,242],{"class":163},[33,155525,151349],{"class":167},[33,155527,155528,155531,155533],{"class":35,"line":5008},[33,155529,155530],{"class":167},"    bar.title ",[33,155532,242],{"class":163},[33,155534,153307],{"class":54},[33,155536,155537,155540,155542],{"class":35,"line":5014},[33,155538,155539],{"class":167},"    bar.type ",[33,155541,242],{"class":163},[33,155543,155544],{"class":54}," \"col\"\n",[33,155546,155547,155550,155552,155554,155556,155558,155560,155562,155564,155566,155568,155570,155572,155574,155576,155578],{"class":35,"line":5019},[33,155548,155549],{"class":167},"    bar.add_data(Reference(ws, 
",[33,155551,17651],{"class":238},[33,155553,242],{"class":163},[33,155555,1533],{"class":50},[33,155557,365],{"class":167},[33,155559,17659],{"class":238},[33,155561,242],{"class":163},[33,155563,10258],{"class":50},[33,155565,365],{"class":167},[33,155567,17642],{"class":238},[33,155569,242],{"class":163},[33,155571,734],{"class":50},[33,155573,365],{"class":167},[33,155575,97398],{"class":238},[33,155577,242],{"class":163},[33,155579,155580],{"class":167},"last),\n",[33,155582,155583,155586,155588,155590],{"class":35,"line":5032},[33,155584,155585],{"class":238},"                 titles_from_data",[33,155587,242],{"class":163},[33,155589,855],{"class":50},[33,155591,221],{"class":167},[33,155593,155594,155597,155599,155601,155603,155605,155607,155609,155611,155613,155615,155617],{"class":35,"line":5039},[33,155595,155596],{"class":167},"    bar.set_categories(Reference(ws, ",[33,155598,17651],{"class":238},[33,155600,242],{"class":163},[33,155602,734],{"class":50},[33,155604,365],{"class":167},[33,155606,17642],{"class":238},[33,155608,242],{"class":163},[33,155610,1533],{"class":50},[33,155612,365],{"class":167},[33,155614,97398],{"class":238},[33,155616,242],{"class":163},[33,155618,155619],{"class":167},"last))\n",[33,155621,155622,155625,155627,155629,155631],{"class":35,"line":5068},[33,155623,155624],{"class":167},"    bar.width, bar.height ",[33,155626,242],{"class":163},[33,155628,153389],{"class":50},[33,155630,365],{"class":167},[33,155632,153394],{"class":50},[33,155634,155635,155638,155640],{"class":35,"line":5077},[33,155636,155637],{"class":167},"    ws.add_chart(bar, ",[33,155639,104865],{"class":54},[33,155641,221],{"class":167},[33,155643,155644],{"class":35,"line":5082},[33,155645,92],{"emptyLinePlaceholder":91},[33,155647,155648,155651,155653],{"class":35,"line":5089},[33,155649,155650],{"class":167},"    line 
",[33,155652,242],{"class":163},[33,155654,151759],{"class":167},[33,155656,155657,155660,155662],{"class":35,"line":5098},[33,155658,155659],{"class":167},"    line.title ",[33,155661,242],{"class":163},[33,155663,153425],{"class":54},[33,155665,155666,155669,155671],{"class":35,"line":5105},[33,155667,155668],{"class":167},"    line.smooth ",[33,155670,242],{"class":163},[33,155672,2887],{"class":50},[33,155674,155675,155678,155680,155682,155684,155686,155688,155690,155692,155694,155696,155698],{"class":35,"line":5110},[33,155676,155677],{"class":167},"    line.add_data(Reference(ws, ",[33,155679,17651],{"class":238},[33,155681,242],{"class":163},[33,155683,1503],{"class":50},[33,155685,365],{"class":167},[33,155687,17642],{"class":238},[33,155689,242],{"class":163},[33,155691,734],{"class":50},[33,155693,365],{"class":167},[33,155695,97398],{"class":238},[33,155697,242],{"class":163},[33,155699,155580],{"class":167},[33,155701,155702,155705,155707,155709],{"class":35,"line":5115},[33,155703,155704],{"class":238},"                  titles_from_data",[33,155706,242],{"class":163},[33,155708,855],{"class":50},[33,155710,221],{"class":167},[33,155712,155713,155716,155718,155720,155722,155724,155726,155728,155730,155732,155734,155736],{"class":35,"line":5128},[33,155714,155715],{"class":167},"    line.set_categories(Reference(ws, ",[33,155717,17651],{"class":238},[33,155719,242],{"class":163},[33,155721,734],{"class":50},[33,155723,365],{"class":167},[33,155725,17642],{"class":238},[33,155727,242],{"class":163},[33,155729,1533],{"class":50},[33,155731,365],{"class":167},[33,155733,97398],{"class":238},[33,155735,242],{"class":163},[33,155737,155619],{"class":167},[33,155739,155740,155743,155745,155747,155749],{"class":35,"line":5135},[33,155741,155742],{"class":167},"    line.width, line.height 
",[33,155744,242],{"class":163},[33,155746,153389],{"class":50},[33,155748,365],{"class":167},[33,155750,153394],{"class":50},[33,155752,155753,155756,155758],{"class":35,"line":5142},[33,155754,155755],{"class":167},"    ws.add_chart(line, ",[33,155757,153507],{"class":54},[33,155759,221],{"class":167},[33,155761,155762],{"class":35,"line":5151},[33,155763,92],{"emptyLinePlaceholder":91},[33,155765,155766],{"class":35,"line":5156},[33,155767,92],{"emptyLinePlaceholder":91},[33,155769,155770,155772,155774,155776,155778],{"class":35,"line":5161},[33,155771,562],{"class":163},[33,155773,6636],{"class":46},[33,155775,568],{"class":167},[33,155777,571],{"class":50},[33,155779,574],{"class":167},[33,155781,155782,155784,155786,155788,155790,155792,155795],{"class":35,"line":5167},[33,155783,6648],{"class":167},[33,155785,242],{"class":163},[33,155787,6653],{"class":167},[33,155789,6656],{"class":238},[33,155791,242],{"class":163},[33,155793,155794],{"class":54},"\"Write formulas and charts to Excel.\"",[33,155796,221],{"class":167},[33,155798,155799,155801,155803,155805,155807,155809,155811],{"class":35,"line":5172},[33,155800,6669],{"class":167},[33,155802,6672],{"class":54},[33,155804,25480],{"class":167},[33,155806,6685],{"class":238},[33,155808,242],{"class":163},[33,155810,149709],{"class":54},[33,155812,247],{"class":167},[33,155814,155815,155817,155819,155822],{"class":35,"line":5182},[33,155816,53388],{"class":238},[33,155818,242],{"class":163},[33,155820,155821],{"class":54},"\"Source .xlsx file (created if 
absent)\"",[33,155823,221],{"class":167},[33,155825,155826,155828,155830,155832,155834,155836,155838],{"class":35,"line":5195},[33,155827,6669],{"class":167},[33,155829,6699],{"class":54},[33,155831,365],{"class":167},[33,155833,6685],{"class":238},[33,155835,242],{"class":163},[33,155837,571],{"class":50},[33,155839,247],{"class":167},[33,155841,155842,155844,155846,155849],{"class":35,"line":5200},[33,155843,53388],{"class":238},[33,155845,242],{"class":163},[33,155847,155848],{"class":54},"\"Destination path (default: overwrite input)\"",[33,155850,221],{"class":167},[33,155852,155853,155855,155857],{"class":35,"line":5205},[33,155854,6766],{"class":167},[33,155856,242],{"class":163},[33,155858,6771],{"class":167},[33,155860,155861],{"class":35,"line":5210},[33,155862,92],{"emptyLinePlaceholder":91},[33,155864,155865,155868,155870],{"class":35,"line":5215},[33,155866,155867],{"class":167},"    src  ",[33,155869,242],{"class":163},[33,155871,69442],{"class":167},[33,155873,155874,155877,155879,155882,155884,155886,155888],{"class":35,"line":5220},[33,155875,155876],{"class":167},"    dest ",[33,155878,242],{"class":163},[33,155880,155881],{"class":167}," Path(args.output) ",[33,155883,2491],{"class":163},[33,155885,53645],{"class":167},[33,155887,7489],{"class":163},[33,155889,155890],{"class":167}," src\n",[33,155892,155893],{"class":35,"line":5227},[33,155894,92],{"emptyLinePlaceholder":91},[33,155896,155897,155899,155901],{"class":35,"line":5232},[33,155898,617],{"class":163},[33,155900,620],{"class":163},[33,155902,155903],{"class":167}," src.exists():\n",[33,155905,155906,155908,155910,155912,155915,155917,155920,155922,155924],{"class":35,"line":5237},[33,155907,9414],{"class":50},[33,155909,602],{"class":167},[33,155911,4059],{"class":163},[33,155913,155914],{"class":54},"\"Creating sample workbook at 
",[33,155916,1115],{"class":50},[33,155918,155919],{"class":167},"src",[33,155921,1121],{"class":50},[33,155923,274],{"class":54},[33,155925,221],{"class":167},[33,155927,155928],{"class":35,"line":5251},[33,155929,155930],{"class":167},"        build_sample_workbook(src)\n",[33,155932,155933],{"class":35,"line":5259},[33,155934,92],{"emptyLinePlaceholder":91},[33,155936,155937,155939],{"class":35,"line":5264},[33,155938,2424],{"class":163},[33,155940,574],{"class":167},[33,155942,155943,155945,155947],{"class":35,"line":5269},[33,155944,97065],{"class":167},[33,155946,242],{"class":163},[33,155948,155949],{"class":167}," openpyxl.load_workbook(src)\n",[33,155951,155952,155954,155956,155958],{"class":35,"line":5283},[33,155953,2449],{"class":163},[33,155955,783],{"class":50},[33,155957,1852],{"class":163},[33,155959,1855],{"class":167},[33,155961,155962,155964,155966,155968,155970,155972,155974,155976,155978,155980,155982,155984,155986,155988,155990,155992],{"class":35,"line":5293},[33,155963,4051],{"class":163},[33,155965,16617],{"class":50},[33,155967,602],{"class":167},[33,155969,4059],{"class":163},[33,155971,9935],{"class":54},[33,155973,1115],{"class":50},[33,155975,155919],{"class":167},[33,155977,1121],{"class":50},[33,155979,2079],{"class":54},[33,155981,1115],{"class":50},[33,155983,6565],{"class":167},[33,155985,1121],{"class":50},[33,155987,274],{"class":54},[33,155989,1649],{"class":167},[33,155991,190],{"class":163},[33,155993,20843],{"class":167},[33,155995,155996],{"class":35,"line":5303},[33,155997,92],{"emptyLinePlaceholder":91},[33,155999,156000,156002,156004,156006,156008],{"class":35,"line":5313},[33,156001,17442],{"class":167},[33,156003,242],{"class":163},[33,156005,17447],{"class":167},[33,156007,140420],{"class":54},[33,156009,9202],{"class":167},[33,156011,156012],{"class":35,"line":5320},[33,156013,156014],{"class":167},"    apply_formulas(ws)\n",[33,156016,156017],{"class":35,"line":5325},[33,156018,156019],{"class":167},"    
apply_named_ranges(wb, ws)\n",[33,156021,156022],{"class":35,"line":5330},[33,156023,156024],{"class":167},"    style_headers(ws)\n",[33,156026,156027],{"class":35,"line":5344},[33,156028,156029],{"class":167},"    add_charts(ws)\n",[33,156031,156032],{"class":35,"line":5349},[33,156033,92],{"emptyLinePlaceholder":91},[33,156035,156036],{"class":35,"line":5354},[33,156037,156038],{"class":167},"    wb.save(dest)\n",[33,156040,156041,156043,156045,156047,156049,156051,156053,156055,156057],{"class":35,"line":5368},[33,156042,7268],{"class":50},[33,156044,602],{"class":167},[33,156046,4059],{"class":163},[33,156048,58214],{"class":54},[33,156050,1115],{"class":50},[33,156052,124602],{"class":167},[33,156054,1121],{"class":50},[33,156056,274],{"class":54},[33,156058,221],{"class":167},[33,156060,156061],{"class":35,"line":5377},[33,156062,92],{"emptyLinePlaceholder":91},[33,156064,156065],{"class":35,"line":5382},[33,156066,92],{"emptyLinePlaceholder":91},[33,156068,156069,156071,156073,156075,156077],{"class":35,"line":5389},[33,156070,2491],{"class":163},[33,156072,2494],{"class":50},[33,156074,2497],{"class":163},[33,156076,2500],{"class":54},[33,156078,574],{"class":167},[33,156080,156081],{"class":35,"line":5399},[33,156082,6914],{"class":167},[2537,156084],{},[18,156086,6918],{"id":6917},[4211,156088,156089,156094,156099,156104],{},[4214,156090,156091,156093],{},[940,156092,147046],{"href":149866}," — when formulas return None on read-back",[4214,156095,156096,156098],{},[940,156097,6936],{"href":6935}," — scheduling and batch report workflows",[4214,156100,156101,156103],{},[940,156102,99577],{"href":99576}," — loading .xlsx data back into pandas",[4214,156105,156106,156108],{},[940,156107,95780],{"href":21804}," — end-to-end sales report automation",[14,156110,6947,156111,3035],{},[940,156112,26258],{"href":26257},[6953,156114,156115],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki 
.s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":156117},[156118,156119,156120,156121,156122,156123,156124,156125,156126,156127,156131,156136,156137,156138,156139,156140],{"id":20,"depth":43,"text":21},{"id":149665,"depth":43,"text":149666},{"id":149856,"depth":43,"text":149857},{"id":150324,"depth":43,"text":150325},{"id":150729,"depth":43,"text":150730},{"id":151191,"depth":43,"text":151192},{"id":151615,"depth":43,"text":151616},{"id":151972,"depth":43,"text":151973},{"id":152376,"depth":43,"text":152377},{"id":152482,"depth":43,"text":152483,"children":156128},[156129,156130],{"id":152486,"depth":61,"text":152487},{"id":152964,"depth":61,"text":152965},{"id":2708,"depth":43,"text":61980,"children":156132},[156133,156134,156135],{"id":153144,"depth":61,"text":153145},{"id":153535,"depth":61,"text":153536},{"id":153856,"depth":61,"text":153857},{"id":52029,"depth":43,"text":52030},{"id":4208,"depth":43,"text":62712},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":62848},{"id":6917,"depth":43,"text":6918},"Formulas & Charts","Write live Excel formulas, named ranges, number formats, and 
BarChart\u002FLineChart\u002FPieChart objects into .xlsx files using openpyxl — step-by-step with runnable code.",{},"\u002Fpython-for-excel-csv-data-processing\u002Fwriting-excel-formulas-and-charts-with-openpyxl",{"title":102074,"description":156142},"python-for-excel-csv-data-processing\u002Fwriting-excel-formulas-and-charts-with-openpyxl\u002Findex",[47,22009,99614,156148,6989],"charts","wV-dhJCtWuyrVif0Yb2524iP5EzEbQFhY40NSGqiLCQ",{"id":156151,"title":156152,"body":156153,"breadcrumbTitle":161311,"canonical":6977,"date":6977,"description":107412,"draft":6980,"extension":6981,"image":6977,"meta":161312,"navigation":91,"path":161313,"robots":6977,"seo":161314,"seoTitle":161320,"stem":161321,"tags":6977,"updatedAt":6977,"__hash__":161322},"content\u002Fword-document-templating-batch-processing\u002Fautomating-word-document-creation\u002Findex.md","Automating Word Document Creation",{"type":7,"value":156154,"toc":161286},[156155,156158,156169,156179,156181,156201,156204,156235,156238,156252,156263,156265,156269,156275,156439,156449,156451,156455,156459,156467,156690,156710,156714,156799,156813,156817,156824,157010,157022,157026,157029,157430,157433,157487,157491,157502,157653,157674,157678,157928,157932,158023,158025,158029,158038,158157,158181,158183,158187,158191,158201,158491,158495,158502,158759,158763,158766,159142,159144,159148,159151,159590,159593,159747,159749,159753,159764,159775,159788,159977,159983,160056,160058,160062,160185,160187,160191,161133,161135,161137,161146,161177,161194,161214,161236,161252,161254,161256,161280,161284],[10,156156,156152],{"id":156157},"automating-word-document-creation",[14,156159,156160,156161,156165,156166,156168],{},"Generating Word documents by hand — or by recording macros — breaks the moment the data changes or the volume grows past a handful of files. Macro-recorded VBA is brittle across Office versions, COM automation is Windows-only, and copying content into templates by hand does not scale. 
",[940,156162,18041],{"href":156163,"rel":156164},"https:\u002F\u002Fpython-docx.readthedocs.io\u002F",[1367]," lets you construct ",[30,156167,18051],{}," files entirely from Python: no Word installation required, no COM automation, no platform restriction. The library writes the underlying OOXML XML directly, so the output is indistinguishable from a file Word would produce.",[14,156170,156171,156172,156174,156175,3035],{},"This guide covers every core building block in depth — paragraphs, headings, runs, tables, page breaks, and sections — then assembles them into a tested, production-ready script. For template-driven generation (Jinja2 placeholders, conditional blocks, row loops over datasets), see ",[940,156173,26185],{"href":18040},". For embedding photographs, logos, or programmatically generated charts, see ",[940,156176,156178],{"href":156177},"\u002Fword-document-templating-batch-processing\u002Finserting-images-into-word-documents\u002F","Inserting Images into Word Documents",[18,156180,21],{"id":20},[23,156182,156184],{"className":25,"code":156183,"language":27,"meta":28,"style":28},"# system deps: none beyond Python 3.9+\npip install python-docx pandas\n",[30,156185,156186,156191],{"__ignoreMap":28},[33,156187,156188],{"class":35,"line":36},[33,156189,156190],{"class":39},"# system deps: none beyond Python 3.9+\n",[33,156192,156193,156195,156197,156199],{"class":35,"line":43},[33,156194,76],{"class":46},[33,156196,79],{"class":54},[33,156198,16192],{"class":54},[33,156200,9707],{"class":54},[14,156202,156203],{},"Confirm the library version before starting:",[23,156205,156207],{"className":126,"code":156206,"language":47,"meta":28,"style":28},"# pip install python-docx\nimport docx\nprint(docx.__version__)  # e.g. 
1.1.2\n",[30,156208,156209,156214,156221],{"__ignoreMap":28},[33,156210,156211],{"class":35,"line":36},[33,156212,156213],{"class":39},"# pip install python-docx\n",[33,156215,156216,156218],{"class":35,"line":43},[33,156217,164],{"class":163},[33,156219,156220],{"class":167}," docx\n",[33,156222,156223,156225,156228,156230,156232],{"class":35,"line":61},[33,156224,13474],{"class":50},[33,156226,156227],{"class":167},"(docx.",[33,156229,37016],{"class":50},[33,156231,10922],{"class":167},[33,156233,156234],{"class":39},"# e.g. 1.1.2\n",[14,156236,156237],{},"Create an output directory for generated files:",[23,156239,156241],{"className":25,"code":156240,"language":27,"meta":28,"style":28},"mkdir -p output\u002Fword\n",[30,156242,156243],{"__ignoreMap":28},[33,156244,156245,156247,156249],{"class":35,"line":36},[33,156246,59501],{"class":46},[33,156248,59504],{"class":50},[33,156250,156251],{"class":54}," output\u002Fword\n",[14,156253,156254,156255,156258,156259,156262],{},"python-docx ships with a built-in default template (",[30,156256,156257],{},"default.docx",") bundled inside the package. Calling ",[30,156260,156261],{},"Document()"," with no arguments starts from that template, which provides sensible margin defaults and the standard Word style set (Normal, Heading 1–9, Title, Body Text, Table Grid, etc.).",[2537,156264],{},[18,156266,156268],{"id":156267},"_1-inspect-before-you-build","1. Inspect Before You Build",[14,156270,156271,156272,156274],{},"Before writing generation code against an existing corporate template, enumerate the styles it defines. 
Using an unknown style name silently falls back to ",[30,156273,99685],{},", which causes unexpected formatting that is hard to debug after the fact.",[23,156276,156278],{"className":126,"code":156277,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\n\nREFERENCE = Path(\"reference.docx\")\n\ntry:\n    doc = Document(REFERENCE)\nexcept FileNotFoundError:\n    # No reference? Start from the built-in default template.\n    doc = Document()\n\n# Print every built-in paragraph and character style name\nfor style in doc.styles:\n    if style.type.name in (\"PARAGRAPH\", \"CHARACTER\"):\n        print(f\"{style.type.name:12s}  {style.name}\")\n",[30,156279,156280,156284,156294,156304,156308,156322,156326,156332,156345,156353,156358,156366,156370,156375,156387,156408],{"__ignoreMap":28},[33,156281,156282],{"class":35,"line":36},[33,156283,156213],{"class":39},[33,156285,156286,156288,156290,156292],{"class":35,"line":43},[33,156287,190],{"class":163},[33,156289,193],{"class":167},[33,156291,164],{"class":163},[33,156293,198],{"class":167},[33,156295,156296,156298,156300,156302],{"class":35,"line":61},[33,156297,190],{"class":163},[33,156299,18092],{"class":167},[33,156301,164],{"class":163},[33,156303,18097],{"class":167},[33,156305,156306],{"class":35,"line":73},[33,156307,92],{"emptyLinePlaceholder":91},[33,156309,156310,156313,156315,156317,156320],{"class":35,"line":88},[33,156311,156312],{"class":50},"REFERENCE",[33,156314,212],{"class":163},[33,156316,215],{"class":167},[33,156318,156319],{"class":54},"\"reference.docx\"",[33,156321,221],{"class":167},[33,156323,156324],{"class":35,"line":95},[33,156325,92],{"emptyLinePlaceholder":91},[33,156327,156328,156330],{"class":35,"line":101},[33,156329,35574],{"class":163},[33,156331,574],{"class":167},[33,156333,156334,156336,156338,156341,156343],{"class":35,"line":171},[33,156335,18224],{"class":167},[33,156337,242],{"class":163},[33,156339,156340],{"class"
:167}," Document(",[33,156342,156312],{"class":50},[33,156344,221],{"class":167},[33,156346,156347,156349,156351],{"class":35,"line":179},[33,156348,35726],{"class":163},[33,156350,2945],{"class":50},[33,156352,574],{"class":167},[33,156354,156355],{"class":35,"line":187},[33,156356,156357],{"class":39},"    # No reference? Start from the built-in default template.\n",[33,156359,156360,156362,156364],{"class":35,"line":201},[33,156361,18224],{"class":167},[33,156363,242],{"class":163},[33,156365,18229],{"class":167},[33,156367,156368],{"class":35,"line":206},[33,156369,92],{"emptyLinePlaceholder":91},[33,156371,156372],{"class":35,"line":224},[33,156373,156374],{"class":39},"# Print every built-in paragraph and character style name\n",[33,156376,156377,156379,156382,156384],{"class":35,"line":229},[33,156378,6124],{"class":163},[33,156380,156381],{"class":167}," style ",[33,156383,662],{"class":163},[33,156385,156386],{"class":167}," doc.styles:\n",[33,156388,156389,156391,156394,156396,156398,156401,156403,156406],{"class":35,"line":235},[33,156390,617],{"class":163},[33,156392,156393],{"class":167}," style.type.name ",[33,156395,662],{"class":163},[33,156397,17583],{"class":167},[33,156399,156400],{"class":54},"\"PARAGRAPH\"",[33,156402,365],{"class":167},[33,156404,156405],{"class":54},"\"CHARACTER\"",[33,156407,1737],{"class":167},[33,156409,156410,156412,156414,156416,156418,156420,156423,156426,156428,156430,156433,156435,156437],{"class":35,"line":250},[33,156411,9414],{"class":50},[33,156413,602],{"class":167},[33,156415,4059],{"class":163},[33,156417,274],{"class":54},[33,156419,1115],{"class":50},[33,156421,156422],{"class":167},"style.type.name",[33,156424,156425],{"class":163},":12s",[33,156427,1121],{"class":50},[33,156429,54867],{"class":50},[33,156431,156432],{"class":167},"style.name",[33,156434,1121],{"class":50},[33,156436,274],{"class":54},[33,156438,221],{"class":167},[14,156440,156441,156442,2012,156445,156448],{},"Run this once and capture the 
output. The style names you see — exactly as printed, with correct capitalisation and spacing — are the strings you pass to ",[30,156443,156444],{},"doc.add_paragraph(style=...)",[30,156446,156447],{},"run.style = doc.styles[name]"," later.",[2537,156450],{},[18,156452,156454],{"id":156453},"_2-core-building-blocks","2. Core Building Blocks",[424,156456,156458],{"id":156457},"step-1-create-a-document-and-configure-page-layout","Step 1 — Create a Document and Configure Page Layout",[14,156460,156461,156463,156464,156466],{},[30,156462,156261],{}," opens or creates a ",[30,156465,18051],{}," and returns the root object. Every subsequent element-add call appends to its content in document order. Set page geometry via the default section before adding content:",[23,156468,156470],{"className":126,"code":156469,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches, Pt\nfrom docx.enum.section import WD_ORIENT\n\nOUTPUT = Path(\"output\u002Fword\u002Freport.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\ndoc = Document()\n\n# Default section — first (and only) section in a new document\nsection = doc.sections[0]\nsection.page_width    = Inches(8.5)\nsection.page_height   = Inches(11)\nsection.orientation   = WD_ORIENT.PORTRAIT\nsection.top_margin    = Inches(1)\nsection.bottom_margin = Inches(1)\nsection.left_margin   = Inches(1.25)\nsection.right_margin  = 
Inches(1.25)\n",[30,156471,156472,156476,156486,156496,156507,156519,156523,156536,156558,156562,156571,156575,156580,156594,156609,156622,156637,156650,156663,156677],{"__ignoreMap":28},[33,156473,156474],{"class":35,"line":36},[33,156475,156213],{"class":39},[33,156477,156478,156480,156482,156484],{"class":35,"line":43},[33,156479,190],{"class":163},[33,156481,193],{"class":167},[33,156483,164],{"class":163},[33,156485,198],{"class":167},[33,156487,156488,156490,156492,156494],{"class":35,"line":61},[33,156489,190],{"class":163},[33,156491,18092],{"class":167},[33,156493,164],{"class":163},[33,156495,18097],{"class":167},[33,156497,156498,156500,156502,156504],{"class":35,"line":73},[33,156499,190],{"class":163},[33,156501,18104],{"class":167},[33,156503,164],{"class":163},[33,156505,156506],{"class":167}," Inches, Pt\n",[33,156508,156509,156511,156514,156516],{"class":35,"line":88},[33,156510,190],{"class":163},[33,156512,156513],{"class":167}," docx.enum.section ",[33,156515,164],{"class":163},[33,156517,156518],{"class":50}," WD_ORIENT\n",[33,156520,156521],{"class":35,"line":95},[33,156522,92],{"emptyLinePlaceholder":91},[33,156524,156525,156527,156529,156531,156534],{"class":35,"line":101},[33,156526,96935],{"class":50},[33,156528,212],{"class":163},[33,156530,215],{"class":167},[33,156532,156533],{"class":54},"\"output\u002Fword\u002Freport.docx\"",[33,156535,221],{"class":167},[33,156537,156538,156540,156542,156544,156546,156548,156550,156552,156554,156556],{"class":35,"line":171},[33,156539,96935],{"class":50},[33,156541,866],{"class":167},[33,156543,869],{"class":238},[33,156545,242],{"class":163},[33,156547,855],{"class":50},[33,156549,365],{"class":167},[33,156551,878],{"class":238},[33,156553,242],{"class":163},[33,156555,855],{"class":50},[33,156557,221],{"class":167},[33,156559,156560],{"class":35,"line":179},[33,156561,92],{"emptyLinePlaceholder":91},[33,156563,156564,156567,156569],{"class":35,"line":187},[33,156565,156566],{"class":167},"doc 
",[33,156568,242],{"class":163},[33,156570,18229],{"class":167},[33,156572,156573],{"class":35,"line":201},[33,156574,92],{"emptyLinePlaceholder":91},[33,156576,156577],{"class":35,"line":206},[33,156578,156579],{"class":39},"# Default section — first (and only) section in a new document\n",[33,156581,156582,156585,156587,156590,156592],{"class":35,"line":224},[33,156583,156584],{"class":167},"section ",[33,156586,242],{"class":163},[33,156588,156589],{"class":167}," doc.sections[",[33,156591,748],{"class":50},[33,156593,9202],{"class":167},[33,156595,156596,156599,156601,156604,156607],{"class":35,"line":229},[33,156597,156598],{"class":167},"section.page_width    ",[33,156600,242],{"class":163},[33,156602,156603],{"class":167}," Inches(",[33,156605,156606],{"class":50},"8.5",[33,156608,221],{"class":167},[33,156610,156611,156614,156616,156618,156620],{"class":35,"line":235},[33,156612,156613],{"class":167},"section.page_height   ",[33,156615,242],{"class":163},[33,156617,156603],{"class":167},[33,156619,17260],{"class":50},[33,156621,221],{"class":167},[33,156623,156624,156627,156629,156632,156634],{"class":35,"line":250},[33,156625,156626],{"class":167},"section.orientation   ",[33,156628,242],{"class":163},[33,156630,156631],{"class":50}," WD_ORIENT",[33,156633,3035],{"class":167},[33,156635,156636],{"class":50},"PORTRAIT\n",[33,156638,156639,156642,156644,156646,156648],{"class":35,"line":266},[33,156640,156641],{"class":167},"section.top_margin    ",[33,156643,242],{"class":163},[33,156645,156603],{"class":167},[33,156647,734],{"class":50},[33,156649,221],{"class":167},[33,156651,156652,156655,156657,156659,156661],{"class":35,"line":290},[33,156653,156654],{"class":167},"section.bottom_margin ",[33,156656,242],{"class":163},[33,156658,156603],{"class":167},[33,156660,734],{"class":50},[33,156662,221],{"class":167},[33,156664,156665,156668,156670,156672,156675],{"class":35,"line":295},[33,156666,156667],{"class":167},"section.left_margin   
",[33,156669,242],{"class":163},[33,156671,156603],{"class":167},[33,156673,156674],{"class":50},"1.25",[33,156676,221],{"class":167},[33,156678,156679,156682,156684,156686,156688],{"class":35,"line":300},[33,156680,156681],{"class":167},"section.right_margin  ",[33,156683,242],{"class":163},[33,156685,156603],{"class":167},[33,156687,156674],{"class":50},[33,156689,221],{"class":167},[14,156691,156692,156693,156696,156697,365,156700,71132,156703,91899,156706,156709],{},"Margins are set in EMUs (English Metric Units) internally, but ",[30,156694,156695],{},"Inches()"," converts for you. ",[30,156698,156699],{},"Pt()",[30,156701,156702],{},"Cm()",[30,156704,156705],{},"Mm()",[30,156707,156708],{},"docx.shared"," work the same way.",[424,156711,156713],{"id":156712},"step-2-headings","Step 2 — Headings",[23,156715,156717],{"className":126,"code":156716,"language":47,"meta":28,"style":28},"doc.add_heading(\"Quarterly Sales Report\", level=0)   # Title style\ndoc.add_heading(\"Executive Summary\",      level=1)   # Heading 1\ndoc.add_heading(\"Revenue by Region\",      level=2)   # Heading 2\ndoc.add_heading(\"APAC Detail\",            level=3)   # Heading 3\n",[30,156718,156719,156740,156760,156779],{"__ignoreMap":28},[33,156720,156721,156724,156727,156729,156731,156733,156735,156737],{"class":35,"line":36},[33,156722,156723],{"class":167},"doc.add_heading(",[33,156725,156726],{"class":54},"\"Quarterly Sales Report\"",[33,156728,365],{"class":167},[33,156730,18267],{"class":238},[33,156732,242],{"class":163},[33,156734,748],{"class":50},[33,156736,12000],{"class":167},[33,156738,156739],{"class":39},"# Title style\n",[33,156741,156742,156744,156747,156749,156751,156753,156755,156757],{"class":35,"line":43},[33,156743,156723],{"class":167},[33,156745,156746],{"class":54},"\"Executive 
Summary\"",[33,156748,121141],{"class":167},[33,156750,18267],{"class":238},[33,156752,242],{"class":163},[33,156754,734],{"class":50},[33,156756,12000],{"class":167},[33,156758,156759],{"class":39},"# Heading 1\n",[33,156761,156762,156764,156766,156768,156770,156772,156774,156776],{"class":35,"line":61},[33,156763,156723],{"class":167},[33,156765,104780],{"class":54},[33,156767,121141],{"class":167},[33,156769,18267],{"class":238},[33,156771,242],{"class":163},[33,156773,1533],{"class":50},[33,156775,12000],{"class":167},[33,156777,156778],{"class":39},"# Heading 2\n",[33,156780,156781,156783,156786,156788,156790,156792,156794,156796],{"class":35,"line":73},[33,156782,156723],{"class":167},[33,156784,156785],{"class":54},"\"APAC Detail\"",[33,156787,48549],{"class":167},[33,156789,18267],{"class":238},[33,156791,242],{"class":163},[33,156793,10258],{"class":50},[33,156795,12000],{"class":167},[33,156797,156798],{"class":39},"# Heading 3\n",[14,156800,156801,156804,156805,156808,156809,156812],{},[30,156802,156803],{},"level=0"," maps to Word's \"Title\" style. ",[30,156806,156807],{},"level=1"," through ",[30,156810,156811],{},"level=9"," map to Heading 1–9. Headings appear in the document's navigation pane and are picked up by automated table-of-contents generation.",[424,156814,156816],{"id":156815},"step-3-paragraphs-and-runs","Step 3 — Paragraphs and Runs",[14,156818,156819,156820,156823],{},"A paragraph holds one or more ",[1974,156821,156822],{},"runs","; a run is a contiguous span of text that shares identical character-level formatting. 
Understanding this distinction is essential — it is the same model Word uses internally.",[23,156825,156827],{"className":126,"code":156826,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom docx.shared import Pt, RGBColor\n\n# Simple paragraph: one call adds text and returns a Paragraph\nplain = doc.add_paragraph(\"This paragraph uses the Normal style baseline.\")\n\n# Mixed-format paragraph: build it run by run\npara  = doc.add_paragraph()                        # empty paragraph\nrun1  = para.add_run(\"Revenue grew \")\nrun2  = para.add_run(\"14 %\")\nrun2.bold            = True\nrun2.font.color.rgb  = RGBColor(0x16, 0x65, 0x34)  # dark green\n\nrun3  = para.add_run(\" year-over-year, driven by APAC expansion.\")\n\n# Paragraph-level spacing\npara.paragraph_format.space_after  = Pt(6)\npara.paragraph_format.space_before = Pt(0)\n",[30,156828,156829,156833,156843,156847,156852,156867,156871,156876,156889,156904,156918,156927,156957,156961,156975,156979,156984,156997],{"__ignoreMap":28},[33,156830,156831],{"class":35,"line":36},[33,156832,156213],{"class":39},[33,156834,156835,156837,156839,156841],{"class":35,"line":43},[33,156836,190],{"class":163},[33,156838,18104],{"class":167},[33,156840,164],{"class":163},[33,156842,22662],{"class":167},[33,156844,156845],{"class":35,"line":61},[33,156846,92],{"emptyLinePlaceholder":91},[33,156848,156849],{"class":35,"line":73},[33,156850,156851],{"class":39},"# Simple paragraph: one call adds text and returns a Paragraph\n",[33,156853,156854,156857,156859,156862,156865],{"class":35,"line":88},[33,156855,156856],{"class":167},"plain ",[33,156858,242],{"class":163},[33,156860,156861],{"class":167}," doc.add_paragraph(",[33,156863,156864],{"class":54},"\"This paragraph uses the Normal style baseline.\"",[33,156866,221],{"class":167},[33,156868,156869],{"class":35,"line":95},[33,156870,92],{"emptyLinePlaceholder":91},[33,156872,156873],{"class":35,"line":101},[33,156874,156875],{"class":39},"# Mixed-format 
paragraph: build it run by run\n",[33,156877,156878,156881,156883,156886],{"class":35,"line":171},[33,156879,156880],{"class":167},"para  ",[33,156882,242],{"class":163},[33,156884,156885],{"class":167}," doc.add_paragraph()                        ",[33,156887,156888],{"class":39},"# empty paragraph\n",[33,156890,156891,156894,156896,156899,156902],{"class":35,"line":179},[33,156892,156893],{"class":167},"run1  ",[33,156895,242],{"class":163},[33,156897,156898],{"class":167}," para.add_run(",[33,156900,156901],{"class":54},"\"Revenue grew \"",[33,156903,221],{"class":167},[33,156905,156906,156909,156911,156913,156916],{"class":35,"line":187},[33,156907,156908],{"class":167},"run2  ",[33,156910,242],{"class":163},[33,156912,156898],{"class":167},[33,156914,156915],{"class":54},"\"14 %\"",[33,156917,221],{"class":167},[33,156919,156920,156923,156925],{"class":35,"line":201},[33,156921,156922],{"class":167},"run2.bold            ",[33,156924,242],{"class":163},[33,156926,2887],{"class":50},[33,156928,156929,156932,156934,156936,156938,156940,156942,156944,156946,156948,156950,156952,156954],{"class":35,"line":206},[33,156930,156931],{"class":167},"run2.font.color.rgb  ",[33,156933,242],{"class":163},[33,156935,18288],{"class":167},[33,156937,18291],{"class":163},[33,156939,24213],{"class":50},[33,156941,365],{"class":167},[33,156943,18291],{"class":163},[33,156945,2653],{"class":50},[33,156947,365],{"class":167},[33,156949,18291],{"class":163},[33,156951,38717],{"class":50},[33,156953,10922],{"class":167},[33,156955,156956],{"class":39},"# dark green\n",[33,156958,156959],{"class":35,"line":224},[33,156960,92],{"emptyLinePlaceholder":91},[33,156962,156963,156966,156968,156970,156973],{"class":35,"line":229},[33,156964,156965],{"class":167},"run3  ",[33,156967,242],{"class":163},[33,156969,156898],{"class":167},[33,156971,156972],{"class":54},"\" year-over-year, driven by APAC 
expansion.\"",[33,156974,221],{"class":167},[33,156976,156977],{"class":35,"line":235},[33,156978,92],{"emptyLinePlaceholder":91},[33,156980,156981],{"class":35,"line":250},[33,156982,156983],{"class":39},"# Paragraph-level spacing\n",[33,156985,156986,156989,156991,156993,156995],{"class":35,"line":266},[33,156987,156988],{"class":167},"para.paragraph_format.space_after  ",[33,156990,242],{"class":163},[33,156992,18472],{"class":167},[33,156994,2681],{"class":50},[33,156996,221],{"class":167},[33,156998,156999,157002,157004,157006,157008],{"class":35,"line":290},[33,157000,157001],{"class":167},"para.paragraph_format.space_before ",[33,157003,242],{"class":163},[33,157005,18472],{"class":167},[33,157007,748],{"class":50},[33,157009,221],{"class":167},[14,157011,157012,157013,157017,157018,157021],{},"For detailed control of font family, size, and East-Asian character rendering, see ",[940,157014,157016],{"href":157015},"\u002Fword-document-templating-batch-processing\u002Fautomating-word-document-creation\u002Fset-fonts-and-styles-with-python-docx\u002F","Set Fonts and Styles with python-docx"," — that page covers the ",[30,157019,157020],{},"w:eastAsia"," oxml workaround and how to define reusable named character styles.",[424,157023,157025],{"id":157024},"step-4-tables","Step 4 — Tables",[14,157027,157028],{},"Tables are the most complex element. 
python-docx provides a clean API for creation and row-by-row population, but some formatting tasks (column widths, cell shading, repeating header rows) require dropping to the underlying XML.",[23,157030,157032],{"className":126,"code":157031,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom docx.shared import Inches\nfrom docx.oxml.ns import qn\nfrom docx.oxml   import OxmlElement\n\nheaders = [\"Region\",    \"Q3 Revenue\", \"Q4 Revenue\", \"YoY %\"]\nrows    = [\n    [\"APAC\",            \"$4.2 M\",     \"$5.1 M\",     \"+21 %\"],\n    [\"EMEA\",            \"$3.8 M\",     \"$4.0 M\",     \"+5 %\"],\n    [\"Americas\",        \"$6.1 M\",     \"$6.9 M\",     \"+13 %\"],\n    [\"Middle East\",     \"$1.1 M\",     \"$1.4 M\",     \"+27 %\"],\n]\n\ntable = doc.add_table(rows=1, cols=len(headers))\ntable.style = \"Table Grid\"\n\n# Set explicit column widths\nfor i, width in enumerate([Inches(1.6), Inches(1.4), Inches(1.4), Inches(1.0)]):\n    for cell in table.columns[i].cells:\n        cell.width = width\n\n# Header row — bold text\nhdr_cells = table.rows[0].cells\nfor i, text in enumerate(headers):\n    hdr_cells[i].text = text\n    hdr_cells[i].paragraphs[0].runs[0].bold = True\n\n# Data rows\nfor row_data in rows:\n    cells = table.add_row().cells\n    for i, text in enumerate(row_data):\n        cells[i].text = text\n",[30,157033,157034,157038,157049,157061,157073,157077,157104,157113,157136,157160,157184,157208,157212,157216,157242,157251,157255,157260,157294,157305,157315,157319,157324,157337,157351,157361,157379,157383,157388,157399,157408,157421],{"__ignoreMap":28},[33,157035,157036],{"class":35,"line":36},[33,157037,156213],{"class":39},[33,157039,157040,157042,157044,157046],{"class":35,"line":43},[33,157041,190],{"class":163},[33,157043,18104],{"class":167},[33,157045,164],{"class":163},[33,157047,157048],{"class":167}," 
Inches\n",[33,157050,157051,157053,157056,157058],{"class":35,"line":61},[33,157052,190],{"class":163},[33,157054,157055],{"class":167}," docx.oxml.ns ",[33,157057,164],{"class":163},[33,157059,157060],{"class":167}," qn\n",[33,157062,157063,157065,157068,157070],{"class":35,"line":73},[33,157064,190],{"class":163},[33,157066,157067],{"class":167}," docx.oxml   ",[33,157069,164],{"class":163},[33,157071,157072],{"class":167}," OxmlElement\n",[33,157074,157075],{"class":35,"line":88},[33,157076,92],{"emptyLinePlaceholder":91},[33,157078,157079,157081,157083,157085,157087,157089,157092,157094,157097,157099,157102],{"class":35,"line":95},[33,157080,100706],{"class":167},[33,157082,242],{"class":163},[33,157084,9178],{"class":167},[33,157086,11865],{"class":54},[33,157088,38342],{"class":167},[33,157090,157091],{"class":54},"\"Q3 Revenue\"",[33,157093,365],{"class":167},[33,157095,157096],{"class":54},"\"Q4 Revenue\"",[33,157098,365],{"class":167},[33,157100,157101],{"class":54},"\"YoY %\"",[33,157103,9202],{"class":167},[33,157105,157106,157109,157111],{"class":35,"line":101},[33,157107,157108],{"class":167},"rows    ",[33,157110,242],{"class":163},[33,157112,7473],{"class":167},[33,157114,157115,157117,157119,157121,157124,157126,157129,157131,157134],{"class":35,"line":171},[33,157116,35065],{"class":167},[33,157118,120447],{"class":54},[33,157120,48549],{"class":167},[33,157122,157123],{"class":54},"\"$4.2 M\"",[33,157125,25539],{"class":167},[33,157127,157128],{"class":54},"\"$5.1 M\"",[33,157130,25539],{"class":167},[33,157132,157133],{"class":54},"\"+21 %\"",[33,157135,8935],{"class":167},[33,157137,157138,157140,157143,157145,157148,157150,157153,157155,157158],{"class":35,"line":179},[33,157139,35065],{"class":167},[33,157141,157142],{"class":54},"\"EMEA\"",[33,157144,48549],{"class":167},[33,157146,157147],{"class":54},"\"$3.8 M\"",[33,157149,25539],{"class":167},[33,157151,157152],{"class":54},"\"$4.0 
M\"",[33,157154,25539],{"class":167},[33,157156,157157],{"class":54},"\"+5 %\"",[33,157159,8935],{"class":167},[33,157161,157162,157164,157167,157169,157172,157174,157177,157179,157182],{"class":35,"line":187},[33,157163,35065],{"class":167},[33,157165,157166],{"class":54},"\"Americas\"",[33,157168,89262],{"class":167},[33,157170,157171],{"class":54},"\"$6.1 M\"",[33,157173,25539],{"class":167},[33,157175,157176],{"class":54},"\"$6.9 M\"",[33,157178,25539],{"class":167},[33,157180,157181],{"class":54},"\"+13 %\"",[33,157183,8935],{"class":167},[33,157185,157186,157188,157191,157193,157196,157198,157201,157203,157206],{"class":35,"line":201},[33,157187,35065],{"class":167},[33,157189,157190],{"class":54},"\"Middle East\"",[33,157192,25539],{"class":167},[33,157194,157195],{"class":54},"\"$1.1 M\"",[33,157197,25539],{"class":167},[33,157199,157200],{"class":54},"\"$1.4 M\"",[33,157202,25539],{"class":167},[33,157204,157205],{"class":54},"\"+27 %\"",[33,157207,8935],{"class":167},[33,157209,157210],{"class":35,"line":206},[33,157211,9202],{"class":167},[33,157213,157214],{"class":35,"line":224},[33,157215,92],{"emptyLinePlaceholder":91},[33,157217,157218,157221,157223,157225,157227,157229,157231,157233,157235,157237,157239],{"class":35,"line":229},[33,157219,157220],{"class":167},"table ",[33,157222,242],{"class":163},[33,157224,18626],{"class":167},[33,157226,18629],{"class":238},[33,157228,242],{"class":163},[33,157230,734],{"class":50},[33,157232,365],{"class":167},[33,157234,18638],{"class":238},[33,157236,242],{"class":163},[33,157238,928],{"class":50},[33,157240,157241],{"class":167},"(headers))\n",[33,157243,157244,157247,157249],{"class":35,"line":235},[33,157245,157246],{"class":167},"table.style ",[33,157248,242],{"class":163},[33,157250,18655],{"class":54},[33,157252,157253],{"class":35,"line":250},[33,157254,92],{"emptyLinePlaceholder":91},[33,157256,157257],{"class":35,"line":266},[33,157258,157259],{"class":39},"# Set explicit column 
widths\n",[33,157261,157262,157264,157267,157269,157271,157274,157277,157280,157283,157285,157287,157289,157291],{"class":35,"line":290},[33,157263,6124],{"class":163},[33,157265,157266],{"class":167}," i, width ",[33,157268,662],{"class":163},[33,157270,7403],{"class":50},[33,157272,157273],{"class":167},"([Inches(",[33,157275,157276],{"class":50},"1.6",[33,157278,157279],{"class":167},"), Inches(",[33,157281,157282],{"class":50},"1.4",[33,157284,157279],{"class":167},[33,157286,157282],{"class":50},[33,157288,157279],{"class":167},[33,157290,82813],{"class":50},[33,157292,157293],{"class":167},")]):\n",[33,157295,157296,157298,157300,157302],{"class":35,"line":295},[33,157297,656],{"class":163},[33,157299,17467],{"class":167},[33,157301,662],{"class":163},[33,157303,157304],{"class":167}," table.columns[i].cells:\n",[33,157306,157307,157310,157312],{"class":35,"line":300},[33,157308,157309],{"class":167},"        cell.width ",[33,157311,242],{"class":163},[33,157313,157314],{"class":167}," width\n",[33,157316,157317],{"class":35,"line":317},[33,157318,92],{"emptyLinePlaceholder":91},[33,157320,157321],{"class":35,"line":332},[33,157322,157323],{"class":39},"# Header row — bold text\n",[33,157325,157326,157329,157331,157333,157335],{"class":35,"line":347},[33,157327,157328],{"class":167},"hdr_cells ",[33,157330,242],{"class":163},[33,157332,18674],{"class":167},[33,157334,748],{"class":50},[33,157336,18679],{"class":167},[33,157338,157339,157341,157344,157346,157348],{"class":35,"line":374},[33,157340,6124],{"class":163},[33,157342,157343],{"class":167}," i, text ",[33,157345,662],{"class":163},[33,157347,7403],{"class":50},[33,157349,157350],{"class":167},"(headers):\n",[33,157352,157353,157356,157358],{"class":35,"line":397},[33,157354,157355],{"class":167},"    hdr_cells[i].text ",[33,157357,242],{"class":163},[33,157359,157360],{"class":167}," 
text\n",[33,157362,157363,157366,157368,157370,157372,157375,157377],{"class":35,"line":653},[33,157364,157365],{"class":167},"    hdr_cells[i].paragraphs[",[33,157367,748],{"class":50},[33,157369,18713],{"class":167},[33,157371,748],{"class":50},[33,157373,157374],{"class":167},"].bold ",[33,157376,242],{"class":163},[33,157378,2887],{"class":50},[33,157380,157381],{"class":35,"line":667},[33,157382,92],{"emptyLinePlaceholder":91},[33,157384,157385],{"class":35,"line":675},[33,157386,157387],{"class":39},"# Data rows\n",[33,157389,157390,157392,157395,157397],{"class":35,"line":689},[33,157391,6124],{"class":163},[33,157393,157394],{"class":167}," row_data ",[33,157396,662],{"class":163},[33,157398,8723],{"class":167},[33,157400,157401,157404,157406],{"class":35,"line":703},[33,157402,157403],{"class":167},"    cells ",[33,157405,242],{"class":163},[33,157407,18752],{"class":167},[33,157409,157410,157412,157414,157416,157418],{"class":35,"line":714},[33,157411,656],{"class":163},[33,157413,157343],{"class":167},[33,157415,662],{"class":163},[33,157417,7403],{"class":50},[33,157419,157420],{"class":167},"(row_data):\n",[33,157422,157423,157426,157428],{"class":35,"line":723},[33,157424,157425],{"class":167},"        cells[i].text ",[33,157427,242],{"class":163},[33,157429,157360],{"class":167},[14,157431,157432],{},"To repeat the header row on each printed page (important for long tables):",[23,157434,157436],{"className":126,"code":157435,"language":47,"meta":28,"style":28},"# Mark the header row as a repeating header via OOXML\ntr    = table.rows[0]._tr\ntrPr  = tr.get_or_add_trPr()\ntblHeader = OxmlElement(\"w:tblHeader\")\ntrPr.append(tblHeader)\n",[30,157437,157438,157443,157457,157467,157482],{"__ignoreMap":28},[33,157439,157440],{"class":35,"line":36},[33,157441,157442],{"class":39},"# Mark the header row as a repeating header via OOXML\n",[33,157444,157445,157448,157450,157452,157454],{"class":35,"line":43},[33,157446,157447],{"class":167},"tr    
",[33,157449,242],{"class":163},[33,157451,18674],{"class":167},[33,157453,748],{"class":50},[33,157455,157456],{"class":167},"]._tr\n",[33,157458,157459,157462,157464],{"class":35,"line":61},[33,157460,157461],{"class":167},"trPr  ",[33,157463,242],{"class":163},[33,157465,157466],{"class":167}," tr.get_or_add_trPr()\n",[33,157468,157469,157472,157474,157477,157480],{"class":35,"line":73},[33,157470,157471],{"class":167},"tblHeader ",[33,157473,242],{"class":163},[33,157475,157476],{"class":167}," OxmlElement(",[33,157478,157479],{"class":54},"\"w:tblHeader\"",[33,157481,221],{"class":167},[33,157483,157484],{"class":35,"line":88},[33,157485,157486],{"class":167},"trPr.append(tblHeader)\n",[424,157488,157490],{"id":157489},"step-5-page-breaks-and-section-breaks","Step 5 — Page Breaks and Section Breaks",[14,157492,157493,157494,157497,157498,157501],{},"A soft page break inserts ",[30,157495,157496],{},"\\x0c"," inside the current section. A section break creates a new ",[30,157499,157500],{},"\u003Cw:sectPr>"," block with independent margin, header, footer, and orientation settings.",[23,157503,157505],{"className":126,"code":157504,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom docx.enum.section import WD_SECTION\n\n# Soft page break — stays in the current section\ndoc.add_page_break()\n\n# New section starting on the next page (landscape orientation)\nlandscape_section = doc.add_section(WD_SECTION.NEW_PAGE)\nlandscape_section.orientation  = WD_ORIENT.LANDSCAPE\nlandscape_section.page_width   = Inches(11)\nlandscape_section.page_height  = Inches(8.5)\nlandscape_section.left_margin  = Inches(0.75)\nlandscape_section.right_margin = Inches(0.75)\n\ndoc.add_heading(\"Wide Data Table\", 
level=1)\n",[30,157506,157507,157511,157522,157526,157531,157536,157540,157545,157565,157579,157592,157605,157619,157632,157636],{"__ignoreMap":28},[33,157508,157509],{"class":35,"line":36},[33,157510,156213],{"class":39},[33,157512,157513,157515,157517,157519],{"class":35,"line":43},[33,157514,190],{"class":163},[33,157516,156513],{"class":167},[33,157518,164],{"class":163},[33,157520,157521],{"class":50}," WD_SECTION\n",[33,157523,157524],{"class":35,"line":61},[33,157525,92],{"emptyLinePlaceholder":91},[33,157527,157528],{"class":35,"line":73},[33,157529,157530],{"class":39},"# Soft page break — stays in the current section\n",[33,157532,157533],{"class":35,"line":88},[33,157534,157535],{"class":167},"doc.add_page_break()\n",[33,157537,157538],{"class":35,"line":95},[33,157539,92],{"emptyLinePlaceholder":91},[33,157541,157542],{"class":35,"line":101},[33,157543,157544],{"class":39},"# New section starting on the next page (landscape orientation)\n",[33,157546,157547,157550,157552,157555,157558,157560,157563],{"class":35,"line":171},[33,157548,157549],{"class":167},"landscape_section ",[33,157551,242],{"class":163},[33,157553,157554],{"class":167}," doc.add_section(",[33,157556,157557],{"class":50},"WD_SECTION",[33,157559,3035],{"class":167},[33,157561,157562],{"class":50},"NEW_PAGE",[33,157564,221],{"class":167},[33,157566,157567,157570,157572,157574,157576],{"class":35,"line":179},[33,157568,157569],{"class":167},"landscape_section.orientation  ",[33,157571,242],{"class":163},[33,157573,156631],{"class":50},[33,157575,3035],{"class":167},[33,157577,157578],{"class":50},"LANDSCAPE\n",[33,157580,157581,157584,157586,157588,157590],{"class":35,"line":187},[33,157582,157583],{"class":167},"landscape_section.page_width   
",[33,157585,242],{"class":163},[33,157587,156603],{"class":167},[33,157589,17260],{"class":50},[33,157591,221],{"class":167},[33,157593,157594,157597,157599,157601,157603],{"class":35,"line":201},[33,157595,157596],{"class":167},"landscape_section.page_height  ",[33,157598,242],{"class":163},[33,157600,156603],{"class":167},[33,157602,156606],{"class":50},[33,157604,221],{"class":167},[33,157606,157607,157610,157612,157614,157617],{"class":35,"line":206},[33,157608,157609],{"class":167},"landscape_section.left_margin  ",[33,157611,242],{"class":163},[33,157613,156603],{"class":167},[33,157615,157616],{"class":50},"0.75",[33,157618,221],{"class":167},[33,157620,157621,157624,157626,157628,157630],{"class":35,"line":224},[33,157622,157623],{"class":167},"landscape_section.right_margin ",[33,157625,242],{"class":163},[33,157627,156603],{"class":167},[33,157629,157616],{"class":50},[33,157631,221],{"class":167},[33,157633,157634],{"class":35,"line":229},[33,157635,92],{"emptyLinePlaceholder":91},[33,157637,157638,157640,157643,157645,157647,157649,157651],{"class":35,"line":235},[33,157639,156723],{"class":167},[33,157641,157642],{"class":54},"\"Wide Data Table\"",[33,157644,365],{"class":167},[33,157646,18267],{"class":238},[33,157648,242],{"class":163},[33,157650,734],{"class":50},[33,157652,221],{"class":167},[14,157654,157655,157657,157658,157660,157661,365,157664,365,157667,157670,157671,3035],{},[30,157656,157557],{}," values: ",[30,157659,157562],{}," (most common), ",[30,157662,157663],{},"EVEN_PAGE",[30,157665,157666],{},"ODD_PAGE",[30,157668,157669],{},"CONTINUOUS"," (no page break), ",[30,157672,157673],{},"NEW_COLUMN",[424,157675,157677],{"id":157676},"step-6-headers-and-footers","Step 6 — Headers and Footers",[23,157679,157681],{"className":126,"code":157680,"language":47,"meta":28,"style":28},"# pip install python-docx\nsection  = doc.sections[0]\nheader   = section.header\nfooter   = section.footer\n\n# Clear any default 
content\nheader.paragraphs[0].clear()\nfooter.paragraphs[0].clear()\n\nheader.paragraphs[0].add_run(\"Acme Corp — Confidential\").bold = True\n\nfooter_para = footer.paragraphs[0]\nfooter_para.add_run(\"Generated by reporting pipeline  |  \")\n\n# Add a PAGE field for automatic page numbering\nfrom docx.oxml import OxmlElement\nfldChar1 = OxmlElement(\"w:fldChar\")\nfldChar1.set(qn(\"w:fldCharType\"), \"begin\")\ninstrText = OxmlElement(\"w:instrText\")\ninstrText.text = \"PAGE\"\nfldChar2 = OxmlElement(\"w:fldChar\")\nfldChar2.set(qn(\"w:fldCharType\"), \"end\")\n\nrun = footer_para.add_run()\nrun._r.append(fldChar1)\nrun._r.append(instrText)\nrun._r.append(fldChar2)\n",[30,157682,157683,157687,157700,157710,157720,157724,157729,157739,157748,157752,157771,157775,157789,157799,157803,157808,157819,157833,157848,157862,157872,157885,157899,157903,157913,157918,157923],{"__ignoreMap":28},[33,157684,157685],{"class":35,"line":36},[33,157686,156213],{"class":39},[33,157688,157689,157692,157694,157696,157698],{"class":35,"line":43},[33,157690,157691],{"class":167},"section  ",[33,157693,242],{"class":163},[33,157695,156589],{"class":167},[33,157697,748],{"class":50},[33,157699,9202],{"class":167},[33,157701,157702,157705,157707],{"class":35,"line":61},[33,157703,157704],{"class":167},"header   ",[33,157706,242],{"class":163},[33,157708,157709],{"class":167}," section.header\n",[33,157711,157712,157715,157717],{"class":35,"line":73},[33,157713,157714],{"class":167},"footer   ",[33,157716,242],{"class":163},[33,157718,157719],{"class":167}," section.footer\n",[33,157721,157722],{"class":35,"line":88},[33,157723,92],{"emptyLinePlaceholder":91},[33,157725,157726],{"class":35,"line":95},[33,157727,157728],{"class":39},"# Clear any default 
content\n",[33,157730,157731,157734,157736],{"class":35,"line":101},[33,157732,157733],{"class":167},"header.paragraphs[",[33,157735,748],{"class":50},[33,157737,157738],{"class":167},"].clear()\n",[33,157740,157741,157744,157746],{"class":35,"line":171},[33,157742,157743],{"class":167},"footer.paragraphs[",[33,157745,748],{"class":50},[33,157747,157738],{"class":167},[33,157749,157750],{"class":35,"line":179},[33,157751,92],{"emptyLinePlaceholder":91},[33,157753,157754,157756,157758,157761,157764,157767,157769],{"class":35,"line":187},[33,157755,157733],{"class":167},[33,157757,748],{"class":50},[33,157759,157760],{"class":167},"].add_run(",[33,157762,157763],{"class":54},"\"Acme Corp — Confidential\"",[33,157765,157766],{"class":167},").bold ",[33,157768,242],{"class":163},[33,157770,2887],{"class":50},[33,157772,157773],{"class":35,"line":201},[33,157774,92],{"emptyLinePlaceholder":91},[33,157776,157777,157780,157782,157785,157787],{"class":35,"line":206},[33,157778,157779],{"class":167},"footer_para ",[33,157781,242],{"class":163},[33,157783,157784],{"class":167}," footer.paragraphs[",[33,157786,748],{"class":50},[33,157788,9202],{"class":167},[33,157790,157791,157794,157797],{"class":35,"line":224},[33,157792,157793],{"class":167},"footer_para.add_run(",[33,157795,157796],{"class":54},"\"Generated by reporting pipeline  |  \"",[33,157798,221],{"class":167},[33,157800,157801],{"class":35,"line":229},[33,157802,92],{"emptyLinePlaceholder":91},[33,157804,157805],{"class":35,"line":235},[33,157806,157807],{"class":39},"# Add a PAGE field for automatic page numbering\n",[33,157809,157810,157812,157815,157817],{"class":35,"line":250},[33,157811,190],{"class":163},[33,157813,157814],{"class":167}," docx.oxml ",[33,157816,164],{"class":163},[33,157818,157072],{"class":167},[33,157820,157821,157824,157826,157828,157831],{"class":35,"line":266},[33,157822,157823],{"class":167},"fldChar1 
",[33,157825,242],{"class":163},[33,157827,157476],{"class":167},[33,157829,157830],{"class":54},"\"w:fldChar\"",[33,157832,221],{"class":167},[33,157834,157835,157838,157841,157843,157846],{"class":35,"line":290},[33,157836,157837],{"class":167},"fldChar1.set(qn(",[33,157839,157840],{"class":54},"\"w:fldCharType\"",[33,157842,18525],{"class":167},[33,157844,157845],{"class":54},"\"begin\"",[33,157847,221],{"class":167},[33,157849,157850,157853,157855,157857,157860],{"class":35,"line":295},[33,157851,157852],{"class":167},"instrText ",[33,157854,242],{"class":163},[33,157856,157476],{"class":167},[33,157858,157859],{"class":54},"\"w:instrText\"",[33,157861,221],{"class":167},[33,157863,157864,157867,157869],{"class":35,"line":300},[33,157865,157866],{"class":167},"instrText.text ",[33,157868,242],{"class":163},[33,157870,157871],{"class":54}," \"PAGE\"\n",[33,157873,157874,157877,157879,157881,157883],{"class":35,"line":317},[33,157875,157876],{"class":167},"fldChar2 ",[33,157878,242],{"class":163},[33,157880,157476],{"class":167},[33,157882,157830],{"class":54},[33,157884,221],{"class":167},[33,157886,157887,157890,157892,157894,157897],{"class":35,"line":332},[33,157888,157889],{"class":167},"fldChar2.set(qn(",[33,157891,157840],{"class":54},[33,157893,18525],{"class":167},[33,157895,157896],{"class":54},"\"end\"",[33,157898,221],{"class":167},[33,157900,157901],{"class":35,"line":347},[33,157902,92],{"emptyLinePlaceholder":91},[33,157904,157905,157908,157910],{"class":35,"line":374},[33,157906,157907],{"class":167},"run ",[33,157909,242],{"class":163},[33,157911,157912],{"class":167}," 
footer_para.add_run()\n",[33,157914,157915],{"class":35,"line":397},[33,157916,157917],{"class":167},"run._r.append(fldChar1)\n",[33,157919,157920],{"class":35,"line":653},[33,157921,157922],{"class":167},"run._r.append(instrText)\n",[33,157924,157925],{"class":35,"line":667},[33,157926,157927],{"class":167},"run._r.append(fldChar2)\n",[424,157929,157931],{"id":157930},"step-7-save","Step 7 — Save",[23,157933,157935],{"className":126,"code":157934,"language":47,"meta":28,"style":28},"try:\n    doc.save(OUTPUT)\n    print(f\"Saved: {OUTPUT}\")\nexcept PermissionError:\n    print(f\"File is open in Word — close it and retry: {OUTPUT}\")\nexcept OSError as exc:\n    print(f\"Save failed: {exc}\")\n",[30,157936,157937,157943,157951,157967,157975,157992,158002],{"__ignoreMap":28},[33,157938,157939,157941],{"class":35,"line":36},[33,157940,35574],{"class":163},[33,157942,574],{"class":167},[33,157944,157945,157947,157949],{"class":35,"line":43},[33,157946,85716],{"class":167},[33,157948,96935],{"class":50},[33,157950,221],{"class":167},[33,157952,157953,157955,157957,157959,157961,157963,157965],{"class":35,"line":61},[33,157954,7268],{"class":50},[33,157956,602],{"class":167},[33,157958,4059],{"class":163},[33,157960,97737],{"class":54},[33,157962,97684],{"class":50},[33,157964,274],{"class":54},[33,157966,221],{"class":167},[33,157968,157969,157971,157973],{"class":35,"line":73},[33,157970,35726],{"class":163},[33,157972,17393],{"class":50},[33,157974,574],{"class":167},[33,157976,157977,157979,157981,157983,157986,157988,157990],{"class":35,"line":88},[33,157978,7268],{"class":50},[33,157980,602],{"class":167},[33,157982,4059],{"class":163},[33,157984,157985],{"class":54},"\"File is open in Word — close it and retry: 
",[33,157987,97684],{"class":50},[33,157989,274],{"class":54},[33,157991,221],{"class":167},[33,157993,157994,157996,157998,158000],{"class":35,"line":95},[33,157995,35726],{"class":163},[33,157997,107953],{"class":50},[33,157999,1852],{"class":163},[33,158001,1855],{"class":167},[33,158003,158004,158006,158008,158010,158013,158015,158017,158019,158021],{"class":35,"line":101},[33,158005,7268],{"class":50},[33,158007,602],{"class":167},[33,158009,4059],{"class":163},[33,158011,158012],{"class":54},"\"Save failed: ",[33,158014,1115],{"class":50},[33,158016,6565],{"class":167},[33,158018,1121],{"class":50},[33,158020,274],{"class":54},[33,158022,221],{"class":167},[2537,158024],{},[18,158026,158028],{"id":158027},"_3-document-elements-docx-how-it-fits-together","3. Document() → Elements → .docx: How It Fits Together",[14,158030,158031,158032,158034,158035,158037],{},"The diagram traces the call sequence from ",[30,158033,156261],{}," through the OOXML element tree to the saved ",[30,158036,18051],{}," ZIP archive.",[2540,158039,2547,158041,2547,158044,2547,158047,2547,2547,158061,2547,158063,2547,158065,2547,158068,2547,158070,2547,158074,2547,158077,2547,158079,2547,158082,2547,158085,2547,158087,2547,158091,2547,158094,2547,158096,2547,158099,2547,2547,158102,2547,158106,2547,158109,2547,158111,2547,158114,2547,2547,158116,2547,158119,2547,158122,2547,158125,2547,158128,2547,158131,2547,158134,2547,2547,158137,2547,158139,2547,2547,158142,2547,158144,2547,158148,2547,158151,2547,158154],{"viewBox":2542,"role":2543,"ariaLabel":158040,"xmlns":2545,"style":2546},"python-docx build flow: Document() to saved .docx",[2549,158042,158043],{},"python-docx document build flow",[2553,158045,158046],{},"Flowchart showing how Document(), add_heading, add_paragraph, add_table, and add_section calls compose the OOXML element tree that is serialised to a .docx ZIP 
archive.",[2557,158048,2559,158049,2559,158056,2547],{},[2561,158050,2564,158052,2564,158054,2559],{"id":158051,"x1":748,"y1":748,"x2":734,"y2":748},"word-create-grad",[2566,158053],{"offset":748,"style":2568},[2566,158055],{"offset":734,"style":2571},[2573,158057,2564,158059,2559],{"id":158058,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"word-create-arrow",[2580,158060],{"d":2582,"fill":2583},[2585,158062],{"x":24213,"y":2587,"width":2610,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,158064,156261],{"x":38741,"y":120779,"fill":2599,"style":59932},[2000,158066,158067],{"x":38741,"y":38740,"fill":2583,"style":2605},"new or from template",[2585,158069],{"x":24213,"y":17018,"width":2610,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,158071,158073],{"x":38741,"y":158072,"fill":2599,"style":2600},"103","add_heading()",[2000,158075,158076],{"x":38741,"y":2589,"fill":2583,"style":2605},"level 0–9 → style name",[2585,158078],{"x":24213,"y":11194,"width":2610,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,158080,158081],{"x":38741,"y":142716,"fill":2599,"style":2600},"add_paragraph()",[2000,158083,158084],{"x":38741,"y":2643,"fill":2583,"style":2605},"+ add_run() per span",[2585,158086],{"x":24213,"y":110852,"width":2610,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,158088,158090],{"x":38741,"y":158089,"fill":2599,"style":2600},"231","add_table()",[2000,158092,158093],{"x":38741,"y":38722,"fill":2583,"style":2605},"rows \u002F cols \u002F style",[2585,158095],{"x":24213,"y":11148,"width":2610,"height":26341,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,158097,158098],{"x":38741,"y":107631,"fill":2599,"style":2600},"add_section()",[2000,158100,158101],{"x":38741,"y":89111,"fill":2583,"style":2605},"margins \u002F 
orientation",[35,158103],{"x1":158104,"y1":49816,"x2":49839,"y2":2588,"stroke":2583,"markerEnd":158105,"style":2594},"176","url(#word-create-arrow)",[35,158107],{"x1":158104,"y1":158108,"x2":49839,"y2":11194,"stroke":2583,"markerEnd":158105,"style":2594},"106",[35,158110],{"x1":158104,"y1":11115,"x2":49839,"y2":11115,"stroke":2583,"markerEnd":158105,"style":2594},[35,158112],{"x1":158104,"y1":158113,"x2":49839,"y2":16997,"stroke":2583,"markerEnd":158105,"style":2594},"234",[35,158115],{"x1":158104,"y1":64888,"x2":49839,"y2":58401,"stroke":2583,"markerEnd":158105,"style":2594},[2585,158117],{"x":49839,"y":2650,"width":58337,"height":2610,"rx":2591,"fill":158118,"stroke":2593,"style":2594},"url(#word-create-grad)",[2000,158120,158121],{"x":152431,"y":26402,"fill":2599,"style":59932},"OOXML element tree",[2000,158123,158124],{"x":152431,"y":11194,"fill":2599,"style":2685},"w:document",[2000,158126,158127],{"x":152431,"y":2639,"fill":2599,"style":2685},"└ w:body",[2000,158129,158130],{"x":152431,"y":125458,"fill":2599,"style":2685},"   ├ w:p  (paragraphs)",[2000,158132,158133],{"x":152431,"y":102566,"fill":2599,"style":2685},"   ├ w:tbl (tables)",[2000,158135,158136],{"x":152431,"y":11126,"fill":2599,"style":2685},"   └ w:sectPr",[35,158138],{"x1":64900,"y1":58337,"x2":49853,"y2":58337,"stroke":2583,"markerEnd":158105,"style":2594},[2000,158140,158141],{"x":2647,"y":2639,"fill":2583,"style":2605},"doc.save()",[2585,158143],{"x":49853,"y":2589,"width":58337,"height":2589,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,158145,158147],{"x":158146,"y":11194,"fill":2599,"style":59932},"650","report.docx 
(ZIP)",[2000,158149,158150],{"x":158146,"y":2604,"fill":2583,"style":2685},"word\u002Fdocument.xml",[2000,158152,158153],{"x":158146,"y":64936,"fill":2583,"style":2685},"word\u002Fstyles.xml",[2000,158155,158156],{"x":158146,"y":110852,"fill":2583,"style":2685},"word\u002Fmedia\u002F*",[14,158158,39550,158159,158161,158162,158164,158165,158167,158168,158170,158171,158174,158175,8877,158177,158180],{},[30,158160,18051],{}," format is a ZIP archive. ",[30,158163,158141],{}," serialises the in-memory element tree to ",[30,158166,158150],{},", writes styles to ",[30,158169,158153],{},", and bundles embedded media into ",[30,158172,158173],{},"word\u002Fmedia\u002F",". Unzip any ",[30,158176,18051],{},[30,158178,158179],{},"unzip -d out report.docx"," to inspect the raw XML when debugging.",[2537,158182],{},[18,158184,158186],{"id":158185},"_4-edge-cases-and-variants","4. Edge Cases and Variants",[424,158188,158190],{"id":158189},"variant-a-continuing-a-paragraphs-formatting-across-multiple-runs","Variant A — Continuing a Paragraph's Formatting Across Multiple Runs",[14,158192,158193,158194,158196,158197,158200],{},"When you need a single paragraph with mixed inline styles, build it from multiple runs on the same ",[30,158195,57653],{}," object. 
Calling ",[30,158198,158199],{},"doc.add_paragraph()"," repeatedly creates separate paragraphs, each with its own spacing and style.",[23,158202,158204],{"className":126,"code":158203,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Pt\n\nOUTPUT = Path(\"output\u002Fword\u002Fmixed_runs.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\ndoc  = Document()\npara = doc.add_paragraph(style=\"Body Text\")\npara.paragraph_format.space_after = Pt(8)\n\nrun_a = para.add_run(\"Status: \")\nrun_b = para.add_run(\"APPROVED\")\nrun_b.bold            = True\nrun_b.font.color.rgb  = __import__(\"docx.shared\", fromlist=[\"RGBColor\"]).RGBColor(0x16, 0x65, 0x34)\nrun_c = para.add_run(\" — routed to finance on 2026-06-15.\")\n\ntry:\n    doc.save(OUTPUT)\n    print(f\"Saved: {OUTPUT}\")\nexcept OSError as exc:\n    print(f\"Save failed: {exc}\")\n",[30,158205,158206,158210,158220,158230,158241,158245,158258,158280,158284,158293,158311,158324,158328,158342,158356,158365,158413,158427,158431,158437,158445,158461,158471],{"__ignoreMap":28},[33,158207,158208],{"class":35,"line":36},[33,158209,156213],{"class":39},[33,158211,158212,158214,158216,158218],{"class":35,"line":43},[33,158213,190],{"class":163},[33,158215,193],{"class":167},[33,158217,164],{"class":163},[33,158219,198],{"class":167},[33,158221,158222,158224,158226,158228],{"class":35,"line":61},[33,158223,190],{"class":163},[33,158225,18092],{"class":167},[33,158227,164],{"class":163},[33,158229,18097],{"class":167},[33,158231,158232,158234,158236,158238],{"class":35,"line":73},[33,158233,190],{"class":163},[33,158235,18104],{"class":167},[33,158237,164],{"class":163},[33,158239,158240],{"class":167}," 
Pt\n",[33,158242,158243],{"class":35,"line":88},[33,158244,92],{"emptyLinePlaceholder":91},[33,158246,158247,158249,158251,158253,158256],{"class":35,"line":95},[33,158248,96935],{"class":50},[33,158250,212],{"class":163},[33,158252,215],{"class":167},[33,158254,158255],{"class":54},"\"output\u002Fword\u002Fmixed_runs.docx\"",[33,158257,221],{"class":167},[33,158259,158260,158262,158264,158266,158268,158270,158272,158274,158276,158278],{"class":35,"line":101},[33,158261,96935],{"class":50},[33,158263,866],{"class":167},[33,158265,869],{"class":238},[33,158267,242],{"class":163},[33,158269,855],{"class":50},[33,158271,365],{"class":167},[33,158273,878],{"class":238},[33,158275,242],{"class":163},[33,158277,855],{"class":50},[33,158279,221],{"class":167},[33,158281,158282],{"class":35,"line":171},[33,158283,92],{"emptyLinePlaceholder":91},[33,158285,158286,158289,158291],{"class":35,"line":179},[33,158287,158288],{"class":167},"doc  ",[33,158290,242],{"class":163},[33,158292,18229],{"class":167},[33,158294,158295,158298,158300,158302,158304,158306,158309],{"class":35,"line":187},[33,158296,158297],{"class":167},"para ",[33,158299,242],{"class":163},[33,158301,156861],{"class":167},[33,158303,6953],{"class":238},[33,158305,242],{"class":163},[33,158307,158308],{"class":54},"\"Body Text\"",[33,158310,221],{"class":167},[33,158312,158313,158316,158318,158320,158322],{"class":35,"line":201},[33,158314,158315],{"class":167},"para.paragraph_format.space_after ",[33,158317,242],{"class":163},[33,158319,18472],{"class":167},[33,158321,2591],{"class":50},[33,158323,221],{"class":167},[33,158325,158326],{"class":35,"line":206},[33,158327,92],{"emptyLinePlaceholder":91},[33,158329,158330,158333,158335,158337,158340],{"class":35,"line":224},[33,158331,158332],{"class":167},"run_a ",[33,158334,242],{"class":163},[33,158336,156898],{"class":167},[33,158338,158339],{"class":54},"\"Status: 
\"",[33,158341,221],{"class":167},[33,158343,158344,158347,158349,158351,158354],{"class":35,"line":229},[33,158345,158346],{"class":167},"run_b ",[33,158348,242],{"class":163},[33,158350,156898],{"class":167},[33,158352,158353],{"class":54},"\"APPROVED\"",[33,158355,221],{"class":167},[33,158357,158358,158361,158363],{"class":35,"line":235},[33,158359,158360],{"class":167},"run_b.bold            ",[33,158362,242],{"class":163},[33,158364,2887],{"class":50},[33,158366,158367,158370,158372,158375,158377,158380,158382,158385,158387,158389,158392,158395,158397,158399,158401,158403,158405,158407,158409,158411],{"class":35,"line":250},[33,158368,158369],{"class":167},"run_b.font.color.rgb  ",[33,158371,242],{"class":163},[33,158373,158374],{"class":50}," __import__",[33,158376,602],{"class":167},[33,158378,158379],{"class":54},"\"docx.shared\"",[33,158381,365],{"class":167},[33,158383,158384],{"class":238},"fromlist",[33,158386,242],{"class":163},[33,158388,8309],{"class":167},[33,158390,158391],{"class":54},"\"RGBColor\"",[33,158393,158394],{"class":167},"]).RGBColor(",[33,158396,18291],{"class":163},[33,158398,24213],{"class":50},[33,158400,365],{"class":167},[33,158402,18291],{"class":163},[33,158404,2653],{"class":50},[33,158406,365],{"class":167},[33,158408,18291],{"class":163},[33,158410,38717],{"class":50},[33,158412,221],{"class":167},[33,158414,158415,158418,158420,158422,158425],{"class":35,"line":266},[33,158416,158417],{"class":167},"run_c ",[33,158419,242],{"class":163},[33,158421,156898],{"class":167},[33,158423,158424],{"class":54},"\" — routed to finance on 
2026-06-15.\"",[33,158426,221],{"class":167},[33,158428,158429],{"class":35,"line":290},[33,158430,92],{"emptyLinePlaceholder":91},[33,158432,158433,158435],{"class":35,"line":295},[33,158434,35574],{"class":163},[33,158436,574],{"class":167},[33,158438,158439,158441,158443],{"class":35,"line":300},[33,158440,85716],{"class":167},[33,158442,96935],{"class":50},[33,158444,221],{"class":167},[33,158446,158447,158449,158451,158453,158455,158457,158459],{"class":35,"line":317},[33,158448,7268],{"class":50},[33,158450,602],{"class":167},[33,158452,4059],{"class":163},[33,158454,97737],{"class":54},[33,158456,97684],{"class":50},[33,158458,274],{"class":54},[33,158460,221],{"class":167},[33,158462,158463,158465,158467,158469],{"class":35,"line":332},[33,158464,35726],{"class":163},[33,158466,107953],{"class":50},[33,158468,1852],{"class":163},[33,158470,1855],{"class":167},[33,158472,158473,158475,158477,158479,158481,158483,158485,158487,158489],{"class":35,"line":347},[33,158474,7268],{"class":50},[33,158476,602],{"class":167},[33,158478,4059],{"class":163},[33,158480,158012],{"class":54},[33,158482,1115],{"class":50},[33,158484,6565],{"class":167},[33,158486,1121],{"class":50},[33,158488,274],{"class":54},[33,158490,221],{"class":167},[424,158492,158494],{"id":158493},"variant-b-multi-column-section-layout","Variant B — Multi-Column Section Layout",[14,158496,158497,158498,158501],{},"Some reports need a two-column layout for dense reference material or side-by-side comparisons. 
The ",[30,158499,158500],{},"w:cols"," element is not exposed through the python-docx high-level API and must be set via oxml:",[23,158503,158505],{"className":126,"code":158504,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.oxml.ns import qn\nfrom docx.oxml   import OxmlElement\n\nOUTPUT = Path(\"output\u002Fword\u002Ftwo_columns.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\ndoc  = Document()\nsect = doc.sections[0]\n\ncols = OxmlElement(\"w:cols\")\ncols.set(qn(\"w:num\"),   \"2\")      # 2 equal columns\ncols.set(qn(\"w:space\"), \"720\")    # 0.5-inch gutter (720 twips)\nsect._sectPr.append(cols)\n\ndoc.add_paragraph(\n    \"Text in a two-column section flows automatically from the bottom of the \"\n    \"first column to the top of the second.\"\n)\n\ntry:\n    doc.save(OUTPUT)\n    print(f\"Saved: {OUTPUT}\")\nexcept OSError as exc:\n    print(f\"Save failed: {exc}\")\n",[30,158506,158507,158511,158521,158531,158541,158551,158555,158568,158590,158594,158602,158615,158619,158633,158650,158667,158672,158676,158681,158686,158691,158695,158699,158705,158713,158729,158739],{"__ignoreMap":28},[33,158508,158509],{"class":35,"line":36},[33,158510,156213],{"class":39},[33,158512,158513,158515,158517,158519],{"class":35,"line":43},[33,158514,190],{"class":163},[33,158516,193],{"class":167},[33,158518,164],{"class":163},[33,158520,198],{"class":167},[33,158522,158523,158525,158527,158529],{"class":35,"line":61},[33,158524,190],{"class":163},[33,158526,18092],{"class":167},[33,158528,164],{"class":163},[33,158530,18097],{"class":167},[33,158532,158533,158535,158537,158539],{"class":35,"line":73},[33,158534,190],{"class":163},[33,158536,157055],{"class":167},[33,158538,164],{"class":163},[33,158540,157060],{"class":167},[33,158542,158543,158545,158547,158549],{"class":35,"line":88},[33,158544,190],{"class":163},[33,158546,157067],{"class":167},[33,158548,164],{"class":163},[33,158
550,157072],{"class":167},[33,158552,158553],{"class":35,"line":95},[33,158554,92],{"emptyLinePlaceholder":91},[33,158556,158557,158559,158561,158563,158566],{"class":35,"line":101},[33,158558,96935],{"class":50},[33,158560,212],{"class":163},[33,158562,215],{"class":167},[33,158564,158565],{"class":54},"\"output\u002Fword\u002Ftwo_columns.docx\"",[33,158567,221],{"class":167},[33,158569,158570,158572,158574,158576,158578,158580,158582,158584,158586,158588],{"class":35,"line":171},[33,158571,96935],{"class":50},[33,158573,866],{"class":167},[33,158575,869],{"class":238},[33,158577,242],{"class":163},[33,158579,855],{"class":50},[33,158581,365],{"class":167},[33,158583,878],{"class":238},[33,158585,242],{"class":163},[33,158587,855],{"class":50},[33,158589,221],{"class":167},[33,158591,158592],{"class":35,"line":179},[33,158593,92],{"emptyLinePlaceholder":91},[33,158595,158596,158598,158600],{"class":35,"line":187},[33,158597,158288],{"class":167},[33,158599,242],{"class":163},[33,158601,18229],{"class":167},[33,158603,158604,158607,158609,158611,158613],{"class":35,"line":201},[33,158605,158606],{"class":167},"sect ",[33,158608,242],{"class":163},[33,158610,156589],{"class":167},[33,158612,748],{"class":50},[33,158614,9202],{"class":167},[33,158616,158617],{"class":35,"line":206},[33,158618,92],{"emptyLinePlaceholder":91},[33,158620,158621,158624,158626,158628,158631],{"class":35,"line":224},[33,158622,158623],{"class":167},"cols ",[33,158625,242],{"class":163},[33,158627,157476],{"class":167},[33,158629,158630],{"class":54},"\"w:cols\"",[33,158632,221],{"class":167},[33,158634,158635,158638,158641,158643,158645,158647],{"class":35,"line":229},[33,158636,158637],{"class":167},"cols.set(qn(",[33,158639,158640],{"class":54},"\"w:num\"",[33,158642,122176],{"class":167},[33,158644,13395],{"class":54},[33,158646,54109],{"class":167},[33,158648,158649],{"class":39},"# 2 equal 
columns\n",[33,158651,158652,158654,158657,158659,158662,158664],{"class":35,"line":235},[33,158653,158637],{"class":167},[33,158655,158656],{"class":54},"\"w:space\"",[33,158658,18525],{"class":167},[33,158660,158661],{"class":54},"\"720\"",[33,158663,101057],{"class":167},[33,158665,158666],{"class":39},"# 0.5-inch gutter (720 twips)\n",[33,158668,158669],{"class":35,"line":250},[33,158670,158671],{"class":167},"sect._sectPr.append(cols)\n",[33,158673,158674],{"class":35,"line":266},[33,158675,92],{"emptyLinePlaceholder":91},[33,158677,158678],{"class":35,"line":290},[33,158679,158680],{"class":167},"doc.add_paragraph(\n",[33,158682,158683],{"class":35,"line":295},[33,158684,158685],{"class":54},"    \"Text in a two-column section flows automatically from the bottom of the \"\n",[33,158687,158688],{"class":35,"line":300},[33,158689,158690],{"class":54},"    \"first column to the top of the second.\"\n",[33,158692,158693],{"class":35,"line":317},[33,158694,221],{"class":167},[33,158696,158697],{"class":35,"line":332},[33,158698,92],{"emptyLinePlaceholder":91},[33,158700,158701,158703],{"class":35,"line":347},[33,158702,35574],{"class":163},[33,158704,574],{"class":167},[33,158706,158707,158709,158711],{"class":35,"line":374},[33,158708,85716],{"class":167},[33,158710,96935],{"class":50},[33,158712,221],{"class":167},[33,158714,158715,158717,158719,158721,158723,158725,158727],{"class":35,"line":397},[33,158716,7268],{"class":50},[33,158718,602],{"class":167},[33,158720,4059],{"class":163},[33,158722,97737],{"class":54},[33,158724,97684],{"class":50},[33,158726,274],{"class":54},[33,158728,221],{"class":167},[33,158730,158731,158733,158735,158737],{"class":35,"line":653},[33,158732,35726],{"class":163},[33,158734,107953],{"class":50},[33,158736,1852],{"class":163},[33,158738,1855],{"class":167},[33,158740,158741,158743,158745,158747,158749,158751,158753,158755,158757],{"class":35,"line":667},[33,158742,7268],{"class":50},[33,158744,602],{"class":167},[33,158746,4059
],{"class":163},[33,158748,158012],{"class":54},[33,158750,1115],{"class":50},[33,158752,6565],{"class":167},[33,158754,1121],{"class":50},[33,158756,274],{"class":54},[33,158758,221],{"class":167},[424,158760,158762],{"id":158761},"variant-c-merging-table-cells","Variant C — Merging Table Cells",[14,158764,158765],{},"Merged cells are common in report headers (spanning a label across multiple columns) and in structured forms:",[23,158767,158769],{"className":126,"code":158768,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\n\nOUTPUT = Path(\"output\u002Fword\u002Fmerged_cells.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\ndoc   = Document()\ntable = doc.add_table(rows=3, cols=4)\ntable.style = \"Table Grid\"\n\n# Merge the first two cells in row 0 (spans columns 0 and 1)\na = table.cell(0, 0)\nb = table.cell(0, 1)\nmerged = a.merge(b)\nmerged.text = \"Merged header\"\n\n# Standard cells for the remainder\nfor col_idx in range(2, 4):\n    table.cell(0, col_idx).text = f\"Col {col_idx}\"\n\nfor row_idx in range(1, 3):\n    for col_idx in range(4):\n        table.cell(row_idx, col_idx).text = f\"R{row_idx}C{col_idx}\"\n\ntry:\n    doc.save(OUTPUT)\n    print(f\"Saved: {OUTPUT}\")\nexcept OSError as exc:\n    print(f\"Save failed: 
{exc}\")\n",[30,158770,158771,158775,158785,158795,158805,158809,158822,158844,158848,158857,158881,158889,158893,158898,158916,158933,158943,158953,158957,158962,158982,159007,159011,159032,159048,159078,159082,159088,159096,159112,159122],{"__ignoreMap":28},[33,158772,158773],{"class":35,"line":36},[33,158774,156213],{"class":39},[33,158776,158777,158779,158781,158783],{"class":35,"line":43},[33,158778,190],{"class":163},[33,158780,193],{"class":167},[33,158782,164],{"class":163},[33,158784,198],{"class":167},[33,158786,158787,158789,158791,158793],{"class":35,"line":61},[33,158788,190],{"class":163},[33,158790,18092],{"class":167},[33,158792,164],{"class":163},[33,158794,18097],{"class":167},[33,158796,158797,158799,158801,158803],{"class":35,"line":73},[33,158798,190],{"class":163},[33,158800,18104],{"class":167},[33,158802,164],{"class":163},[33,158804,157048],{"class":167},[33,158806,158807],{"class":35,"line":88},[33,158808,92],{"emptyLinePlaceholder":91},[33,158810,158811,158813,158815,158817,158820],{"class":35,"line":95},[33,158812,96935],{"class":50},[33,158814,212],{"class":163},[33,158816,215],{"class":167},[33,158818,158819],{"class":54},"\"output\u002Fword\u002Fmerged_cells.docx\"",[33,158821,221],{"class":167},[33,158823,158824,158826,158828,158830,158832,158834,158836,158838,158840,158842],{"class":35,"line":101},[33,158825,96935],{"class":50},[33,158827,866],{"class":167},[33,158829,869],{"class":238},[33,158831,242],{"class":163},[33,158833,855],{"class":50},[33,158835,365],{"class":167},[33,158837,878],{"class":238},[33,158839,242],{"class":163},[33,158841,855],{"class":50},[33,158843,221],{"class":167},[33,158845,158846],{"class":35,"line":171},[33,158847,92],{"emptyLinePlaceholder":91},[33,158849,158850,158853,158855],{"class":35,"line":179},[33,158851,158852],{"class":167},"doc   
",[33,158854,242],{"class":163},[33,158856,18229],{"class":167},[33,158858,158859,158861,158863,158865,158867,158869,158871,158873,158875,158877,158879],{"class":35,"line":187},[33,158860,157220],{"class":167},[33,158862,242],{"class":163},[33,158864,18626],{"class":167},[33,158866,18629],{"class":238},[33,158868,242],{"class":163},[33,158870,10258],{"class":50},[33,158872,365],{"class":167},[33,158874,18638],{"class":238},[33,158876,242],{"class":163},[33,158878,1503],{"class":50},[33,158880,221],{"class":167},[33,158882,158883,158885,158887],{"class":35,"line":201},[33,158884,157246],{"class":167},[33,158886,242],{"class":163},[33,158888,18655],{"class":54},[33,158890,158891],{"class":35,"line":206},[33,158892,92],{"emptyLinePlaceholder":91},[33,158894,158895],{"class":35,"line":224},[33,158896,158897],{"class":39},"# Merge the first two cells in row 0 (spans columns 0 and 1)\n",[33,158899,158900,158903,158905,158908,158910,158912,158914],{"class":35,"line":229},[33,158901,158902],{"class":167},"a ",[33,158904,242],{"class":163},[33,158906,158907],{"class":167}," table.cell(",[33,158909,748],{"class":50},[33,158911,365],{"class":167},[33,158913,748],{"class":50},[33,158915,221],{"class":167},[33,158917,158918,158921,158923,158925,158927,158929,158931],{"class":35,"line":235},[33,158919,158920],{"class":167},"b ",[33,158922,242],{"class":163},[33,158924,158907],{"class":167},[33,158926,748],{"class":50},[33,158928,365],{"class":167},[33,158930,734],{"class":50},[33,158932,221],{"class":167},[33,158934,158935,158938,158940],{"class":35,"line":250},[33,158936,158937],{"class":167},"merged ",[33,158939,242],{"class":163},[33,158941,158942],{"class":167}," a.merge(b)\n",[33,158944,158945,158948,158950],{"class":35,"line":266},[33,158946,158947],{"class":167},"merged.text ",[33,158949,242],{"class":163},[33,158951,158952],{"class":54}," \"Merged 
header\"\n",[33,158954,158955],{"class":35,"line":290},[33,158956,92],{"emptyLinePlaceholder":91},[33,158958,158959],{"class":35,"line":295},[33,158960,158961],{"class":39},"# Standard cells for the remainder\n",[33,158963,158964,158966,158968,158970,158972,158974,158976,158978,158980],{"class":35,"line":300},[33,158965,6124],{"class":163},[33,158967,17741],{"class":167},[33,158969,662],{"class":163},[33,158971,1801],{"class":50},[33,158973,602],{"class":167},[33,158975,1533],{"class":50},[33,158977,365],{"class":167},[33,158979,1503],{"class":50},[33,158981,1737],{"class":167},[33,158983,158984,158987,158989,158992,158994,158996,158998,159000,159003,159005],{"class":35,"line":317},[33,158985,158986],{"class":167},"    table.cell(",[33,158988,748],{"class":50},[33,158990,158991],{"class":167},", col_idx).text ",[33,158993,242],{"class":163},[33,158995,1110],{"class":163},[33,158997,46256],{"class":54},[33,158999,1115],{"class":50},[33,159001,159002],{"class":167},"col_idx",[33,159004,1121],{"class":50},[33,159006,7504],{"class":54},[33,159008,159009],{"class":35,"line":332},[33,159010,92],{"emptyLinePlaceholder":91},[33,159012,159013,159015,159018,159020,159022,159024,159026,159028,159030],{"class":35,"line":347},[33,159014,6124],{"class":163},[33,159016,159017],{"class":167}," row_idx ",[33,159019,662],{"class":163},[33,159021,1801],{"class":50},[33,159023,602],{"class":167},[33,159025,734],{"class":50},[33,159027,365],{"class":167},[33,159029,10258],{"class":50},[33,159031,1737],{"class":167},[33,159033,159034,159036,159038,159040,159042,159044,159046],{"class":35,"line":374},[33,159035,656],{"class":163},[33,159037,17741],{"class":167},[33,159039,662],{"class":163},[33,159041,1801],{"class":50},[33,159043,602],{"class":167},[33,159045,1503],{"class":50},[33,159047,1737],{"class":167},[33,159049,159050,159053,159055,159057,159060,159062,159065,159067,159070,159072,159074,159076],{"class":35,"line":397},[33,159051,159052],{"class":167},"        table.cell(row_idx, 
col_idx).text ",[33,159054,242],{"class":163},[33,159056,1110],{"class":163},[33,159058,159059],{"class":54},"\"R",[33,159061,1115],{"class":50},[33,159063,159064],{"class":167},"row_idx",[33,159066,1121],{"class":50},[33,159068,159069],{"class":54},"C",[33,159071,1115],{"class":50},[33,159073,159002],{"class":167},[33,159075,1121],{"class":50},[33,159077,7504],{"class":54},[33,159079,159080],{"class":35,"line":653},[33,159081,92],{"emptyLinePlaceholder":91},[33,159083,159084,159086],{"class":35,"line":667},[33,159085,35574],{"class":163},[33,159087,574],{"class":167},[33,159089,159090,159092,159094],{"class":35,"line":675},[33,159091,85716],{"class":167},[33,159093,96935],{"class":50},[33,159095,221],{"class":167},[33,159097,159098,159100,159102,159104,159106,159108,159110],{"class":35,"line":689},[33,159099,7268],{"class":50},[33,159101,602],{"class":167},[33,159103,4059],{"class":163},[33,159105,97737],{"class":54},[33,159107,97684],{"class":50},[33,159109,274],{"class":54},[33,159111,221],{"class":167},[33,159113,159114,159116,159118,159120],{"class":35,"line":703},[33,159115,35726],{"class":163},[33,159117,107953],{"class":50},[33,159119,1852],{"class":163},[33,159121,1855],{"class":167},[33,159123,159124,159126,159128,159130,159132,159134,159136,159138,159140],{"class":35,"line":714},[33,159125,7268],{"class":50},[33,159127,602],{"class":167},[33,159129,4059],{"class":163},[33,159131,158012],{"class":54},[33,159133,1115],{"class":50},[33,159135,6565],{"class":167},[33,159137,1121],{"class":50},[33,159139,274],{"class":54},[33,159141,221],{"class":167},[2537,159143],{},[18,159145,159147],{"id":159146},"_5-validation","5. 
Validation",[14,159149,159150],{},"After generating a file, parse it back and assert structural integrity before delivering it:",[23,159152,159154],{"className":126,"code":159153,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\n\ndef validate_docx(path: Path, min_paragraphs: int = 3, min_tables: int = 0) -> bool:\n    \"\"\"Return True if the file opens cleanly, has enough content, and no unrendered placeholders.\"\"\"\n    try:\n        doc = Document(path)\n    except Exception as exc:\n        print(f\"[FAIL] Cannot open {path.name}: {exc}\")\n        return False\n\n    n_paragraphs = len(doc.paragraphs)\n    n_tables     = len(doc.tables)\n\n    if n_paragraphs \u003C min_paragraphs:\n        print(f\"[WARN] {path.name}: only {n_paragraphs} paragraphs — possible truncation\")\n        return False\n\n    if n_tables \u003C min_tables:\n        print(f\"[WARN] {path.name}: expected {min_tables} table(s), found {n_tables}\")\n        return False\n\n    # Scan for unrendered Jinja2 or placeholder syntax\n    full_text = \" \".join(p.text for p in doc.paragraphs)\n    if \"{{\" in full_text or \"}}\" in full_text:\n        print(f\"[WARN] {path.name}: unrendered placeholder detected\")\n        return False\n\n    print(f\"[OK] {path.name}: {n_paragraphs} paragraphs, {n_tables} tables\")\n    return True\n\nvalidate_docx(Path(\"output\u002Fword\u002Freport.docx\"), min_paragraphs=5, 
min_tables=1)\n",[30,159155,159156,159160,159170,159180,159184,159215,159220,159226,159235,159245,159274,159280,159284,159296,159308,159312,159324,159356,159362,159366,159378,159418,159424,159428,159433,159454,159484,159505,159511,159515,159554,159560,159564],{"__ignoreMap":28},[33,159157,159158],{"class":35,"line":36},[33,159159,156213],{"class":39},[33,159161,159162,159164,159166,159168],{"class":35,"line":43},[33,159163,190],{"class":163},[33,159165,193],{"class":167},[33,159167,164],{"class":163},[33,159169,198],{"class":167},[33,159171,159172,159174,159176,159178],{"class":35,"line":61},[33,159173,190],{"class":163},[33,159175,18092],{"class":167},[33,159177,164],{"class":163},[33,159179,18097],{"class":167},[33,159181,159182],{"class":35,"line":73},[33,159183,92],{"emptyLinePlaceholder":91},[33,159185,159186,159188,159191,159194,159196,159198,159200,159203,159205,159207,159209,159211,159213],{"class":35,"line":88},[33,159187,562],{"class":163},[33,159189,159190],{"class":46}," validate_docx",[33,159192,159193],{"class":167},"(path: Path, min_paragraphs: ",[33,159195,1059],{"class":50},[33,159197,212],{"class":163},[33,159199,1714],{"class":50},[33,159201,159202],{"class":167},", min_tables: ",[33,159204,1059],{"class":50},[33,159206,212],{"class":163},[33,159208,10791],{"class":50},[33,159210,1617],{"class":167},[33,159212,2821],{"class":50},[33,159214,574],{"class":167},[33,159216,159217],{"class":35,"line":95},[33,159218,159219],{"class":54},"    \"\"\"Return True if the file opens cleanly, has enough content, and no unrendered placeholders.\"\"\"\n",[33,159221,159222,159224],{"class":35,"line":101},[33,159223,2424],{"class":163},[33,159225,574],{"class":167},[33,159227,159228,159230,159232],{"class":35,"line":171},[33,159229,20077],{"class":167},[33,159231,242],{"class":163},[33,159233,159234],{"class":167}," 
Document(path)\n",[33,159236,159237,159239,159241,159243],{"class":35,"line":179},[33,159238,2449],{"class":163},[33,159240,783],{"class":50},[33,159242,1852],{"class":163},[33,159244,1855],{"class":167},[33,159246,159247,159249,159251,159253,159256,159258,159260,159262,159264,159266,159268,159270,159272],{"class":35,"line":187},[33,159248,9414],{"class":50},[33,159250,602],{"class":167},[33,159252,4059],{"class":163},[33,159254,159255],{"class":54},"\"[FAIL] Cannot open ",[33,159257,1115],{"class":50},[33,159259,57398],{"class":167},[33,159261,1121],{"class":50},[33,159263,2079],{"class":54},[33,159265,1115],{"class":50},[33,159267,6565],{"class":167},[33,159269,1121],{"class":50},[33,159271,274],{"class":54},[33,159273,221],{"class":167},[33,159275,159276,159278],{"class":35,"line":201},[33,159277,1659],{"class":163},[33,159279,2903],{"class":50},[33,159281,159282],{"class":35,"line":206},[33,159283,92],{"emptyLinePlaceholder":91},[33,159285,159286,159289,159291,159293],{"class":35,"line":224},[33,159287,159288],{"class":167},"    n_paragraphs ",[33,159290,242],{"class":163},[33,159292,4037],{"class":50},[33,159294,159295],{"class":167},"(doc.paragraphs)\n",[33,159297,159298,159301,159303,159305],{"class":35,"line":229},[33,159299,159300],{"class":167},"    n_tables     ",[33,159302,242],{"class":163},[33,159304,4037],{"class":50},[33,159306,159307],{"class":167},"(doc.tables)\n",[33,159309,159310],{"class":35,"line":235},[33,159311,92],{"emptyLinePlaceholder":91},[33,159313,159314,159316,159319,159321],{"class":35,"line":250},[33,159315,617],{"class":163},[33,159317,159318],{"class":167}," n_paragraphs ",[33,159320,4043],{"class":163},[33,159322,159323],{"class":167}," min_paragraphs:\n",[33,159325,159326,159328,159330,159332,159335,159337,159339,159341,159344,159346,159349,159351,159354],{"class":35,"line":266},[33,159327,9414],{"class":50},[33,159329,602],{"class":167},[33,159331,4059],{"class":163},[33,159333,159334],{"class":54},"\"[WARN] 
",[33,159336,1115],{"class":50},[33,159338,57398],{"class":167},[33,159340,1121],{"class":50},[33,159342,159343],{"class":54},": only ",[33,159345,1115],{"class":50},[33,159347,159348],{"class":167},"n_paragraphs",[33,159350,1121],{"class":50},[33,159352,159353],{"class":54}," paragraphs — possible truncation\"",[33,159355,221],{"class":167},[33,159357,159358,159360],{"class":35,"line":290},[33,159359,1659],{"class":163},[33,159361,2903],{"class":50},[33,159363,159364],{"class":35,"line":295},[33,159365,92],{"emptyLinePlaceholder":91},[33,159367,159368,159370,159373,159375],{"class":35,"line":300},[33,159369,617],{"class":163},[33,159371,159372],{"class":167}," n_tables ",[33,159374,4043],{"class":163},[33,159376,159377],{"class":167}," min_tables:\n",[33,159379,159380,159382,159384,159386,159388,159390,159392,159394,159397,159399,159402,159404,159407,159409,159412,159414,159416],{"class":35,"line":317},[33,159381,9414],{"class":50},[33,159383,602],{"class":167},[33,159385,4059],{"class":163},[33,159387,159334],{"class":54},[33,159389,1115],{"class":50},[33,159391,57398],{"class":167},[33,159393,1121],{"class":50},[33,159395,159396],{"class":54},": expected ",[33,159398,1115],{"class":50},[33,159400,159401],{"class":167},"min_tables",[33,159403,1121],{"class":50},[33,159405,159406],{"class":54}," table(s), found ",[33,159408,1115],{"class":50},[33,159410,159411],{"class":167},"n_tables",[33,159413,1121],{"class":50},[33,159415,274],{"class":54},[33,159417,221],{"class":167},[33,159419,159420,159422],{"class":35,"line":332},[33,159421,1659],{"class":163},[33,159423,2903],{"class":50},[33,159425,159426],{"class":35,"line":347},[33,159427,92],{"emptyLinePlaceholder":91},[33,159429,159430],{"class":35,"line":374},[33,159431,159432],{"class":39},"    # Scan for unrendered Jinja2 or placeholder syntax\n",[33,159434,159435,159438,159440,159442,159445,159447,159449,159451],{"class":35,"line":397},[33,159436,159437],{"class":167},"    full_text 
",[33,159439,242],{"class":163},[33,159441,57412],{"class":54},[33,159443,159444],{"class":167},".join(p.text ",[33,159446,6124],{"class":163},[33,159448,6127],{"class":167},[33,159450,662],{"class":163},[33,159452,159453],{"class":167}," doc.paragraphs)\n",[33,159455,159456,159458,159460,159463,159465,159467,159470,159472,159474,159477,159479,159481],{"class":35,"line":653},[33,159457,617],{"class":163},[33,159459,44625],{"class":54},[33,159461,159462],{"class":50},"{{",[33,159464,274],{"class":54},[33,159466,8002],{"class":163},[33,159468,159469],{"class":167}," full_text ",[33,159471,7162],{"class":163},[33,159473,44625],{"class":54},[33,159475,159476],{"class":50},"}}",[33,159478,274],{"class":54},[33,159480,8002],{"class":163},[33,159482,159483],{"class":167}," full_text:\n",[33,159485,159486,159488,159490,159492,159494,159496,159498,159500,159503],{"class":35,"line":667},[33,159487,9414],{"class":50},[33,159489,602],{"class":167},[33,159491,4059],{"class":163},[33,159493,159334],{"class":54},[33,159495,1115],{"class":50},[33,159497,57398],{"class":167},[33,159499,1121],{"class":50},[33,159501,159502],{"class":54},": unrendered placeholder detected\"",[33,159504,221],{"class":167},[33,159506,159507,159509],{"class":35,"line":675},[33,159508,1659],{"class":163},[33,159510,2903],{"class":50},[33,159512,159513],{"class":35,"line":689},[33,159514,92],{"emptyLinePlaceholder":91},[33,159516,159517,159519,159521,159523,159526,159528,159530,159532,159534,159536,159538,159540,159543,159545,159547,159549,159552],{"class":35,"line":703},[33,159518,7268],{"class":50},[33,159520,602],{"class":167},[33,159522,4059],{"class":163},[33,159524,159525],{"class":54},"\"[OK] ",[33,159527,1115],{"class":50},[33,159529,57398],{"class":167},[33,159531,1121],{"class":50},[33,159533,2079],{"class":54},[33,159535,1115],{"class":50},[33,159537,159348],{"class":167},[33,159539,1121],{"class":50},[33,159541,159542],{"class":54}," paragraphs, 
",[33,159544,1115],{"class":50},[33,159546,159411],{"class":167},[33,159548,1121],{"class":50},[33,159550,159551],{"class":54}," tables\"",[33,159553,221],{"class":167},[33,159555,159556,159558],{"class":35,"line":714},[33,159557,1332],{"class":163},[33,159559,2887],{"class":50},[33,159561,159562],{"class":35,"line":723},[33,159563,92],{"emptyLinePlaceholder":91},[33,159565,159566,159569,159571,159573,159576,159578,159580,159582,159584,159586,159588],{"class":35,"line":754},[33,159567,159568],{"class":167},"validate_docx(Path(",[33,159570,156533],{"class":54},[33,159572,18525],{"class":167},[33,159574,159575],{"class":238},"min_paragraphs",[33,159577,242],{"class":163},[33,159579,1153],{"class":50},[33,159581,365],{"class":167},[33,159583,159401],{"class":238},[33,159585,242],{"class":163},[33,159587,734],{"class":50},[33,159589,221],{"class":167},[14,159591,159592],{},"For stricter validation, spot-check cell content by index:",[23,159594,159596],{"className":126,"code":159595,"language":47,"meta":28,"style":28},"from docx import Document\nfrom pathlib import Path\n\ndoc  = Document(Path(\"output\u002Fword\u002Freport.docx\"))\ntbl  = doc.tables[0]\nassert tbl.cell(0, 0).text == \"Region\", f\"Unexpected header: {tbl.cell(0,0).text}\"\nassert tbl.cell(1, 0).text == \"APAC\",   f\"Unexpected first row: {tbl.cell(1,0).text}\"\nprint(\"Table spot-check 
passed.\")\n",[30,159597,159598,159608,159618,159622,159635,159649,159694,159736],{"__ignoreMap":28},[33,159599,159600,159602,159604,159606],{"class":35,"line":36},[33,159601,190],{"class":163},[33,159603,18092],{"class":167},[33,159605,164],{"class":163},[33,159607,18097],{"class":167},[33,159609,159610,159612,159614,159616],{"class":35,"line":43},[33,159611,190],{"class":163},[33,159613,193],{"class":167},[33,159615,164],{"class":163},[33,159617,198],{"class":167},[33,159619,159620],{"class":35,"line":61},[33,159621,92],{"emptyLinePlaceholder":91},[33,159623,159624,159626,159628,159631,159633],{"class":35,"line":73},[33,159625,158288],{"class":167},[33,159627,242],{"class":163},[33,159629,159630],{"class":167}," Document(Path(",[33,159632,156533],{"class":54},[33,159634,371],{"class":167},[33,159636,159637,159640,159642,159645,159647],{"class":35,"line":88},[33,159638,159639],{"class":167},"tbl  ",[33,159641,242],{"class":163},[33,159643,159644],{"class":167}," doc.tables[",[33,159646,748],{"class":50},[33,159648,9202],{"class":167},[33,159650,159651,159653,159656,159658,159660,159662,159665,159667,159670,159672,159674,159676,159678,159681,159683,159685,159687,159690,159692],{"class":35,"line":95},[33,159652,36397],{"class":163},[33,159654,159655],{"class":167}," tbl.cell(",[33,159657,748],{"class":50},[33,159659,365],{"class":167},[33,159661,748],{"class":50},[33,159663,159664],{"class":167},").text ",[33,159666,1865],{"class":163},[33,159668,159669],{"class":54}," 
\"Region\"",[33,159671,365],{"class":167},[33,159673,4059],{"class":163},[33,159675,98403],{"class":54},[33,159677,1115],{"class":50},[33,159679,159680],{"class":167},"tbl.cell(",[33,159682,748],{"class":50},[33,159684,63503],{"class":167},[33,159686,748],{"class":50},[33,159688,159689],{"class":167},").text",[33,159691,1121],{"class":50},[33,159693,7504],{"class":54},[33,159695,159696,159698,159700,159702,159704,159706,159708,159710,159713,159715,159717,159720,159722,159724,159726,159728,159730,159732,159734],{"class":35,"line":101},[33,159697,36397],{"class":163},[33,159699,159655],{"class":167},[33,159701,734],{"class":50},[33,159703,365],{"class":167},[33,159705,748],{"class":50},[33,159707,159664],{"class":167},[33,159709,1865],{"class":163},[33,159711,159712],{"class":54}," \"APAC\"",[33,159714,1166],{"class":167},[33,159716,4059],{"class":163},[33,159718,159719],{"class":54},"\"Unexpected first row: ",[33,159721,1115],{"class":50},[33,159723,159680],{"class":167},[33,159725,734],{"class":50},[33,159727,63503],{"class":167},[33,159729,748],{"class":50},[33,159731,159689],{"class":167},[33,159733,1121],{"class":50},[33,159735,7504],{"class":54},[33,159737,159738,159740,159742,159745],{"class":35,"line":171},[33,159739,13474],{"class":50},[33,159741,602],{"class":167},[33,159743,159744],{"class":54},"\"Table spot-check passed.\"",[33,159746,221],{"class":167},[2537,159748],{},[18,159750,159752],{"id":159751},"_6-performance-and-scale","6. Performance and Scale",[14,159754,159755,159757,159758,159760,159761,159763],{},[1974,159756,4218],{},": each ",[30,159759,156261],{}," object holds the full XML tree in RAM. 
For batches of 1 000 or more documents, instantiate a fresh ",[30,159762,156261],{}," per record and let it go out of scope after saving — do not accumulate open document objects in a list.",[14,159765,159766,159768,159769,159771,159772,159774],{},[1974,159767,4237],{},": python-docx itself is not thread-safe at the document level (each document object manipulates a shared lxml element tree). Use ",[30,159770,4240],{}," for CPU-bound generation tasks. Reserve ",[30,159773,84758],{}," for I\u002FO-bound bottlenecks such as writing to a slow network share or uploading completed files to object storage.",[14,159776,159777,159780,159781,159784,159785,159787],{},[1974,159778,159779],{},"Atomic writes",": write to a ",[30,159782,159783],{},"tempfile.NamedTemporaryFile"," in the same directory as the target, then rename. This avoids delivering a partial (unreadable) ",[30,159786,18051],{}," if the process is interrupted mid-save:",[23,159789,159791],{"className":126,"code":159790,"language":47,"meta":28,"style":28},"# pip install python-docx\nimport shutil\nimport tempfile\nfrom pathlib import Path\nfrom docx import Document\n\ndef save_atomic(doc: Document, final_path: Path) -> None:\n    \"\"\"Write doc to a temp file in the same directory, then rename to final_path.\"\"\"\n    final_path.parent.mkdir(parents=True, exist_ok=True)\n    with tempfile.NamedTemporaryFile(\n        suffix=\".docx\", dir=final_path.parent, delete=False\n    ) as tmp:\n        tmp_path = Path(tmp.name)\n    try:\n        doc.save(tmp_path)\n        shutil.move(str(tmp_path), str(final_path))\n    except Exception:\n        tmp_path.unlink(missing_ok=True)\n        
raise\n",[30,159792,159793,159797,159803,159809,159819,159829,159833,159847,159852,159873,159880,159907,159916,159926,159932,159937,159952,159960,159973],{"__ignoreMap":28},[33,159794,159795],{"class":35,"line":36},[33,159796,156213],{"class":39},[33,159798,159799,159801],{"class":35,"line":43},[33,159800,164],{"class":163},[33,159802,41706],{"class":167},[33,159804,159805,159807],{"class":35,"line":61},[33,159806,164],{"class":163},[33,159808,70055],{"class":167},[33,159810,159811,159813,159815,159817],{"class":35,"line":73},[33,159812,190],{"class":163},[33,159814,193],{"class":167},[33,159816,164],{"class":163},[33,159818,198],{"class":167},[33,159820,159821,159823,159825,159827],{"class":35,"line":88},[33,159822,190],{"class":163},[33,159824,18092],{"class":167},[33,159826,164],{"class":163},[33,159828,18097],{"class":167},[33,159830,159831],{"class":35,"line":95},[33,159832,92],{"emptyLinePlaceholder":91},[33,159834,159835,159837,159840,159843,159845],{"class":35,"line":101},[33,159836,562],{"class":163},[33,159838,159839],{"class":46}," save_atomic",[33,159841,159842],{"class":167},"(doc: Document, final_path: Path) -> ",[33,159844,571],{"class":50},[33,159846,574],{"class":167},[33,159848,159849],{"class":35,"line":171},[33,159850,159851],{"class":54},"    \"\"\"Write doc to a temp file in the same directory, then rename to final_path.\"\"\"\n",[33,159853,159854,159857,159859,159861,159863,159865,159867,159869,159871],{"class":35,"line":179},[33,159855,159856],{"class":167},"    final_path.parent.mkdir(",[33,159858,869],{"class":238},[33,159860,242],{"class":163},[33,159862,855],{"class":50},[33,159864,365],{"class":167},[33,159866,878],{"class":238},[33,159868,242],{"class":163},[33,159870,855],{"class":50},[33,159872,221],{"class":167},[33,159874,159875,159877],{"class":35,"line":187},[33,159876,1635],{"class":163},[33,159878,159879],{"class":167}," 
tempfile.NamedTemporaryFile(\n",[33,159881,159882,159885,159887,159890,159892,159895,159897,159900,159903,159905],{"class":35,"line":201},[33,159883,159884],{"class":238},"        suffix",[33,159886,242],{"class":163},[33,159888,159889],{"class":54},"\".docx\"",[33,159891,365],{"class":167},[33,159893,159894],{"class":238},"dir",[33,159896,242],{"class":163},[33,159898,159899],{"class":167},"final_path.parent, ",[33,159901,159902],{"class":238},"delete",[33,159904,242],{"class":163},[33,159906,8339],{"class":50},[33,159908,159909,159911,159913],{"class":35,"line":206},[33,159910,35761],{"class":167},[33,159912,495],{"class":163},[33,159914,159915],{"class":167}," tmp:\n",[33,159917,159918,159921,159923],{"class":35,"line":224},[33,159919,159920],{"class":167},"        tmp_path ",[33,159922,242],{"class":163},[33,159924,159925],{"class":167}," Path(tmp.name)\n",[33,159927,159928,159930],{"class":35,"line":229},[33,159929,2424],{"class":163},[33,159931,574],{"class":167},[33,159933,159934],{"class":35,"line":235},[33,159935,159936],{"class":167},"        doc.save(tmp_path)\n",[33,159938,159939,159942,159944,159947,159949],{"class":35,"line":250},[33,159940,159941],{"class":167},"        shutil.move(",[33,159943,1053],{"class":50},[33,159945,159946],{"class":167},"(tmp_path), ",[33,159948,1053],{"class":50},[33,159950,159951],{"class":167},"(final_path))\n",[33,159953,159954,159956,159958],{"class":35,"line":266},[33,159955,2449],{"class":163},[33,159957,783],{"class":50},[33,159959,574],{"class":167},[33,159961,159962,159965,159967,159969,159971],{"class":35,"line":290},[33,159963,159964],{"class":167},"        tmp_path.unlink(",[33,159966,67210],{"class":238},[33,159968,242],{"class":163},[33,159970,855],{"class":50},[33,159972,221],{"class":167},[33,159974,159975],{"class":35,"line":295},[33,159976,65922],{"class":163},[14,159978,159979,159982],{},[1974,159980,159981],{},"Chunking large datasets",": if the source data is a large CSV or database query, stream it in 
chunks rather than loading the full dataset into memory before generation begins:",[23,159984,159986],{"className":126,"code":159985,"language":47,"meta":28,"style":28},"import pandas as pd\nfrom pathlib import Path\n\nCHUNK_SIZE = 500\nfor i, chunk in enumerate(pd.read_csv(\"data\u002Frecords.csv\", chunksize=CHUNK_SIZE)):\n    # build and save one document per chunk\n    pass\n",[30,159987,159988,159998,160008,160012,160021,160046,160051],{"__ignoreMap":28},[33,159989,159990,159992,159994,159996],{"class":35,"line":36},[33,159991,164],{"class":163},[33,159993,492],{"class":167},[33,159995,495],{"class":163},[33,159997,498],{"class":167},[33,159999,160000,160002,160004,160006],{"class":35,"line":43},[33,160001,190],{"class":163},[33,160003,193],{"class":167},[33,160005,164],{"class":163},[33,160007,198],{"class":167},[33,160009,160010],{"class":35,"line":61},[33,160011,92],{"emptyLinePlaceholder":91},[33,160013,160014,160016,160018],{"class":35,"line":73},[33,160015,123425],{"class":50},[33,160017,212],{"class":163},[33,160019,160020],{"class":50}," 500\n",[33,160022,160023,160025,160027,160029,160031,160033,160036,160038,160040,160042,160044],{"class":35,"line":88},[33,160024,6124],{"class":163},[33,160026,115785],{"class":167},[33,160028,662],{"class":163},[33,160030,7403],{"class":50},[33,160032,123499],{"class":167},[33,160034,160035],{"class":54},"\"data\u002Frecords.csv\"",[33,160037,365],{"class":167},[33,160039,21944],{"class":238},[33,160041,242],{"class":163},[33,160043,123425],{"class":50},[33,160045,8687],{"class":167},[33,160047,160048],{"class":35,"line":95},[33,160049,160050],{"class":39},"    # build and save one document per chunk\n",[33,160052,160053],{"class":35,"line":101},[33,160054,160055],{"class":163},"    pass\n",[2537,160057],{},[18,160059,160061],{"id":160060},"_7-troubleshooting","7. 
Troubleshooting",[4273,160063,160064,160074],{},[4276,160065,160066],{},[4279,160067,160068,160070,160072],{},[4282,160069,4284],{},[4282,160071,4287],{},[4282,160073,4290],{},[4292,160075,160076,160100,160115,160131,160150,160164],{},[4279,160077,160078,160083,160092],{},[4297,160079,160080],{},[30,160081,160082],{},"PackageNotFoundError: Package not found at ...",[4297,160084,160085,160086,160089,160090],{},"Passing a ",[30,160087,160088],{},".doc"," or corrupt ZIP to ",[30,160091,156261],{},[4297,160093,160094,160095,36661,160097,160099],{},"Convert ",[30,160096,160088],{},[30,160098,18051],{}," first (LibreOffice headless); validate extension before opening",[4279,160101,160102,160105,160111],{},[4297,160103,160104],{},"Font name set but has no effect in Word",[4297,160106,160107,160108,160110],{},"Run-level font overridden by style-level or ",[30,160109,157020],{}," not set for CJK",[4297,160112,6571,160113],{},[940,160114,157016],{"href":157015},[4279,160116,160117,160120,160126],{},[4297,160118,160119],{},"Table row count is wrong at runtime",[4297,160121,160122,160125],{},[30,160123,160124],{},"table.add_row()"," called outside the data loop or skipped on some iterations",[4297,160127,74566,160128,160130],{},[30,160129,160124],{}," exactly once per data record",[4279,160132,160133,160141,160144],{},[4297,160134,160135,42706,160138],{},[30,160136,160137],{},"InvalidSpanError",[30,160139,160140],{},"cell.merge()",[4297,160142,160143],{},"Merging already-merged cells or passing out-of-range indices",[4297,160145,67848,160146,160149],{},[30,160147,160148],{},"cell._tc.tcPr"," for existing merge flags; restructure the merge order",[4279,160151,160152,160158,160161],{},[4297,160153,160154,42706,160156],{},[30,160155,68035],{},[30,160157,158141],{},[4297,160159,160160],{},"Target file is open in Word",[4297,160162,160163],{},"Close the file in Word, or save to a temp path and rename",[4279,160165,160166,160169,160175],{},[4297,160167,160168],{},"Section 
orientation not applied",[4297,160170,160171,160172],{},"Width\u002Fheight not swapped after setting ",[30,160173,160174],{},"WD_ORIENT.LANDSCAPE",[4297,160176,160177,160178,10065,160181,160184],{},"Set both ",[30,160179,160180],{},"page_width = Inches(11)",[30,160182,160183],{},"page_height = Inches(8.5)"," explicitly after changing orientation",[2537,160186],{},[18,160188,160190],{"id":160189},"_8-complete-working-script","8. Complete Working Script",[23,160192,160194],{"className":126,"code":160193,"language":47,"meta":28,"style":28},"# pip install python-docx pandas\n\"\"\"\nbuild_report.py — generate a sales report .docx from a CSV.\n\nUsage:\n    python build_report.py --data sales.csv --out output\u002Freport.docx --title \"Q4 Report\"\n\"\"\"\nimport argparse\nimport shutil\nimport sys\nimport tempfile\nfrom pathlib import Path\n\nimport pandas as pd\nfrom docx import Document\nfrom docx.shared import Inches, Pt, RGBColor\nfrom docx.oxml.ns import qn\nfrom docx.oxml   import OxmlElement\n\n\ndef build_document(df: pd.DataFrame, title: str) -> Document:\n    doc = Document()\n\n    # Margins\n    section = doc.sections[0]\n    for attr in (\"top_margin\", \"bottom_margin\", \"left_margin\", \"right_margin\"):\n        setattr(section, attr, Inches(1))\n\n    doc.add_heading(title, level=0)\n    doc.add_heading(\"Summary\", level=1)\n\n    intro     = doc.add_paragraph()\n    intro.add_run(\"Records in dataset: \")\n    count_run = intro.add_run(str(len(df)))\n    count_run.bold = True\n\n    if df.empty:\n        doc.add_paragraph(\"No records to display.\")\n        return doc\n\n    doc.add_heading(\"Data\", level=1)\n\n    table = doc.add_table(rows=1, cols=len(df.columns))\n    table.style = \"Table Grid\"\n\n    # Mark header row as repeating\n    tr   = table.rows[0]._tr\n    trPr = tr.get_or_add_trPr()\n    trPr.append(OxmlElement(\"w:tblHeader\"))\n\n    hdr = table.rows[0].cells\n    for i, col in enumerate(df.columns):\n        hdr[i].text = 
str(col)\n        hdr[i].paragraphs[0].runs[0].bold = True\n\n    for _, row in df.iterrows():\n        cells = table.add_row().cells\n        for i, val in enumerate(row):\n            cells[i].text = str(val)\n\n    return doc\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Generate a .docx sales report from CSV.\")\n    parser.add_argument(\"--data\",  required=True, type=Path, help=\"Input CSV path\")\n    parser.add_argument(\"--out\",   required=True, type=Path, help=\"Output .docx path\")\n    parser.add_argument(\"--title\", default=\"Report\",          help=\"Document title\")\n    args = parser.parse_args()\n\n    try:\n        df = pd.read_csv(args.data)\n    except FileNotFoundError:\n        sys.exit(f\"[ERROR] CSV not found: {args.data}\")\n    except pd.errors.EmptyDataError:\n        sys.exit(\"[ERROR] CSV is empty.\")\n\n    doc      = build_document(df, args.title)\n    out_path = args.out\n    out_path.parent.mkdir(parents=True, exist_ok=True)\n\n    with tempfile.NamedTemporaryFile(\n        suffix=\".docx\", dir=out_path.parent, delete=False\n    ) as tmp:\n        tmp_path = Path(tmp.name)\n\n    try:\n        doc.save(tmp_path)\n        shutil.move(str(tmp_path), str(out_path))\n        print(f\"[OK] Report saved: {out_path}\")\n    except Exception as exc:\n        tmp_path.unlink(missing_ok=True)\n        sys.exit(f\"[ERROR] Save failed: {exc}\")\n\n\nif __name__ == \"__main__\":\n    
main()\n",[30,160195,160196,160201,160205,160210,160214,160218,160223,160227,160233,160239,160245,160251,160261,160265,160275,160285,160296,160306,160316,160320,160324,160339,160347,160351,160356,160369,160400,160412,160416,160429,160445,160449,160459,160469,160488,160497,160501,160507,160517,160524,160528,160544,160548,160573,160581,160585,160590,160603,160612,160621,160625,160637,160650,160661,160677,160681,160691,160699,160713,160723,160727,160733,160737,160741,160753,160770,160800,160831,160858,160866,160870,160876,160885,160893,160913,160920,160929,160933,160943,160951,160971,160975,160981,161004,161012,161020,161024,161030,161034,161047,161068,161078,161090,161109,161113,161117,161129],{"__ignoreMap":28},[33,160197,160198],{"class":35,"line":36},[33,160199,160200],{"class":39},"# pip install python-docx pandas\n",[33,160202,160203],{"class":35,"line":43},[33,160204,139],{"class":54},[33,160206,160207],{"class":35,"line":61},[33,160208,160209],{"class":54},"build_report.py — generate a sales report .docx from a CSV.\n",[33,160211,160212],{"class":35,"line":73},[33,160213,92],{"emptyLinePlaceholder":91},[33,160215,160216],{"class":35,"line":88},[33,160217,4435],{"class":54},[33,160219,160220],{"class":35,"line":95},[33,160221,160222],{"class":54},"    python build_report.py --data sales.csv --out output\u002Freport.docx --title \"Q4 
Report\"\n",[33,160224,160225],{"class":35,"line":101},[33,160226,139],{"class":54},[33,160228,160229,160231],{"class":35,"line":171},[33,160230,164],{"class":163},[33,160232,4461],{"class":167},[33,160234,160235,160237],{"class":35,"line":179},[33,160236,164],{"class":163},[33,160238,41706],{"class":167},[33,160240,160241,160243],{"class":35,"line":187},[33,160242,164],{"class":163},[33,160244,168],{"class":167},[33,160246,160247,160249],{"class":35,"line":201},[33,160248,164],{"class":163},[33,160250,70055],{"class":167},[33,160252,160253,160255,160257,160259],{"class":35,"line":206},[33,160254,190],{"class":163},[33,160256,193],{"class":167},[33,160258,164],{"class":163},[33,160260,198],{"class":167},[33,160262,160263],{"class":35,"line":224},[33,160264,92],{"emptyLinePlaceholder":91},[33,160266,160267,160269,160271,160273],{"class":35,"line":229},[33,160268,164],{"class":163},[33,160270,492],{"class":167},[33,160272,495],{"class":163},[33,160274,498],{"class":167},[33,160276,160277,160279,160281,160283],{"class":35,"line":235},[33,160278,190],{"class":163},[33,160280,18092],{"class":167},[33,160282,164],{"class":163},[33,160284,18097],{"class":167},[33,160286,160287,160289,160291,160293],{"class":35,"line":250},[33,160288,190],{"class":163},[33,160290,18104],{"class":167},[33,160292,164],{"class":163},[33,160294,160295],{"class":167}," Inches, Pt, 
RGBColor\n",[33,160297,160298,160300,160302,160304],{"class":35,"line":266},[33,160299,190],{"class":163},[33,160301,157055],{"class":167},[33,160303,164],{"class":163},[33,160305,157060],{"class":167},[33,160307,160308,160310,160312,160314],{"class":35,"line":290},[33,160309,190],{"class":163},[33,160311,157067],{"class":167},[33,160313,164],{"class":163},[33,160315,157072],{"class":167},[33,160317,160318],{"class":35,"line":295},[33,160319,92],{"emptyLinePlaceholder":91},[33,160321,160322],{"class":35,"line":300},[33,160323,92],{"emptyLinePlaceholder":91},[33,160325,160326,160328,160331,160334,160336],{"class":35,"line":317},[33,160327,562],{"class":163},[33,160329,160330],{"class":46}," build_document",[33,160332,160333],{"class":167},"(df: pd.DataFrame, title: ",[33,160335,1053],{"class":50},[33,160337,160338],{"class":167},") -> Document:\n",[33,160340,160341,160343,160345],{"class":35,"line":332},[33,160342,18224],{"class":167},[33,160344,242],{"class":163},[33,160346,18229],{"class":167},[33,160348,160349],{"class":35,"line":347},[33,160350,92],{"emptyLinePlaceholder":91},[33,160352,160353],{"class":35,"line":374},[33,160354,160355],{"class":39},"    # Margins\n",[33,160357,160358,160361,160363,160365,160367],{"class":35,"line":397},[33,160359,160360],{"class":167},"    section ",[33,160362,242],{"class":163},[33,160364,156589],{"class":167},[33,160366,748],{"class":50},[33,160368,9202],{"class":167},[33,160370,160371,160373,160376,160378,160380,160383,160385,160388,160390,160393,160395,160398],{"class":35,"line":653},[33,160372,656],{"class":163},[33,160374,160375],{"class":167}," attr 
",[33,160377,662],{"class":163},[33,160379,17583],{"class":167},[33,160381,160382],{"class":54},"\"top_margin\"",[33,160384,365],{"class":167},[33,160386,160387],{"class":54},"\"bottom_margin\"",[33,160389,365],{"class":167},[33,160391,160392],{"class":54},"\"left_margin\"",[33,160394,365],{"class":167},[33,160396,160397],{"class":54},"\"right_margin\"",[33,160399,1737],{"class":167},[33,160401,160402,160405,160408,160410],{"class":35,"line":667},[33,160403,160404],{"class":50},"        setattr",[33,160406,160407],{"class":167},"(section, attr, Inches(",[33,160409,734],{"class":50},[33,160411,371],{"class":167},[33,160413,160414],{"class":35,"line":675},[33,160415,92],{"emptyLinePlaceholder":91},[33,160417,160418,160421,160423,160425,160427],{"class":35,"line":689},[33,160419,160420],{"class":167},"    doc.add_heading(title, ",[33,160422,18267],{"class":238},[33,160424,242],{"class":163},[33,160426,748],{"class":50},[33,160428,221],{"class":167},[33,160430,160431,160433,160435,160437,160439,160441,160443],{"class":35,"line":703},[33,160432,18591],{"class":167},[33,160434,103086],{"class":54},[33,160436,365],{"class":167},[33,160438,18267],{"class":238},[33,160440,242],{"class":163},[33,160442,734],{"class":50},[33,160444,221],{"class":167},[33,160446,160447],{"class":35,"line":714},[33,160448,92],{"emptyLinePlaceholder":91},[33,160450,160451,160454,160456],{"class":35,"line":723},[33,160452,160453],{"class":167},"    intro     ",[33,160455,242],{"class":163},[33,160457,160458],{"class":167}," doc.add_paragraph()\n",[33,160460,160461,160464,160467],{"class":35,"line":754},[33,160462,160463],{"class":167},"    intro.add_run(",[33,160465,160466],{"class":54},"\"Records in dataset: \"",[33,160468,221],{"class":167},[33,160470,160471,160474,160476,160479,160481,160483,160485],{"class":35,"line":771},[33,160472,160473],{"class":167},"    count_run ",[33,160475,242],{"class":163},[33,160477,160478],{"class":167}," 
intro.add_run(",[33,160480,1053],{"class":50},[33,160482,602],{"class":167},[33,160484,928],{"class":50},[33,160486,160487],{"class":167},"(df)))\n",[33,160489,160490,160493,160495],{"class":35,"line":777},[33,160491,160492],{"class":167},"    count_run.bold ",[33,160494,242],{"class":163},[33,160496,2887],{"class":50},[33,160498,160499],{"class":35,"line":788},[33,160500,92],{"emptyLinePlaceholder":91},[33,160502,160503,160505],{"class":35,"line":804},[33,160504,617],{"class":163},[33,160506,27514],{"class":167},[33,160508,160509,160512,160515],{"class":35,"line":809},[33,160510,160511],{"class":167},"        doc.add_paragraph(",[33,160513,160514],{"class":54},"\"No records to display.\"",[33,160516,221],{"class":167},[33,160518,160519,160521],{"class":35,"line":819},[33,160520,1659],{"class":163},[33,160522,160523],{"class":167}," doc\n",[33,160525,160526],{"class":35,"line":829},[33,160527,92],{"emptyLinePlaceholder":91},[33,160529,160530,160532,160534,160536,160538,160540,160542],{"class":35,"line":834},[33,160531,18591],{"class":167},[33,160533,17376],{"class":54},[33,160535,365],{"class":167},[33,160537,18267],{"class":238},[33,160539,242],{"class":163},[33,160541,734],{"class":50},[33,160543,221],{"class":167},[33,160545,160546],{"class":35,"line":839},[33,160547,92],{"emptyLinePlaceholder":91},[33,160549,160550,160552,160554,160556,160558,160560,160562,160564,160566,160568,160570],{"class":35,"line":860},[33,160551,18621],{"class":167},[33,160553,242],{"class":163},[33,160555,18626],{"class":167},[33,160557,18629],{"class":238},[33,160559,242],{"class":163},[33,160561,734],{"class":50},[33,160563,365],{"class":167},[33,160565,18638],{"class":238},[33,160567,242],{"class":163},[33,160569,928],{"class":50},[33,160571,160572],{"class":167},"(df.columns))\n",[33,160574,160575,160577,160579],{"class":35,"line":887},[33,160576,18650],{"class":167},[33,160578,242],{"class":163},[33,160580,18655],{"class":54},[33,160582,160583],{"class":35,"line":907},[33,160584,92]
,{"emptyLinePlaceholder":91},[33,160586,160587],{"class":35,"line":1826},[33,160588,160589],{"class":39},"    # Mark header row as repeating\n",[33,160591,160592,160595,160597,160599,160601],{"class":35,"line":1844},[33,160593,160594],{"class":167},"    tr   ",[33,160596,242],{"class":163},[33,160598,18674],{"class":167},[33,160600,748],{"class":50},[33,160602,157456],{"class":167},[33,160604,160605,160608,160610],{"class":35,"line":1858},[33,160606,160607],{"class":167},"    trPr ",[33,160609,242],{"class":163},[33,160611,157466],{"class":167},[33,160613,160614,160617,160619],{"class":35,"line":1871},[33,160615,160616],{"class":167},"    trPr.append(OxmlElement(",[33,160618,157479],{"class":54},[33,160620,371],{"class":167},[33,160622,160623],{"class":35,"line":1877},[33,160624,92],{"emptyLinePlaceholder":91},[33,160626,160627,160629,160631,160633,160635],{"class":35,"line":1883},[33,160628,18669],{"class":167},[33,160630,242],{"class":163},[33,160632,18674],{"class":167},[33,160634,748],{"class":50},[33,160636,18679],{"class":167},[33,160638,160639,160641,160643,160645,160647],{"class":35,"line":1915},[33,160640,656],{"class":163},[33,160642,18686],{"class":167},[33,160644,662],{"class":163},[33,160646,7403],{"class":50},[33,160648,160649],{"class":167},"(df.columns):\n",[33,160651,160652,160654,160656,160658],{"class":35,"line":1926},[33,160653,18698],{"class":167},[33,160655,242],{"class":163},[33,160657,7887],{"class":50},[33,160659,160660],{"class":167},"(col)\n",[33,160662,160663,160665,160667,160669,160671,160673,160675],{"class":35,"line":1932},[33,160664,18708],{"class":167},[33,160666,748],{"class":50},[33,160668,18713],{"class":167},[33,160670,748],{"class":50},[33,160672,157374],{"class":167},[33,160674,242],{"class":163},[33,160676,2887],{"class":50},[33,160678,160679],{"class":35,"line":1938},[33,160680,92],{"emptyLinePlaceholder":91},[33,160682,160683,160685,160687,160689],{"class":35,"line":1950},[33,160684,656],{"class":163},[33,160686,8560],{"clas
s":167},[33,160688,662],{"class":163},[33,160690,8565],{"class":167},[33,160692,160693,160695,160697],{"class":35,"line":1958},[33,160694,18747],{"class":167},[33,160696,242],{"class":163},[33,160698,18752],{"class":167},[33,160700,160701,160703,160706,160708,160710],{"class":35,"line":4904},[33,160702,5973],{"class":163},[33,160704,160705],{"class":167}," i, val ",[33,160707,662],{"class":163},[33,160709,7403],{"class":50},[33,160711,160712],{"class":167},"(row):\n",[33,160714,160715,160717,160719,160721],{"class":35,"line":4909},[33,160716,23937],{"class":167},[33,160718,242],{"class":163},[33,160720,7887],{"class":50},[33,160722,24016],{"class":167},[33,160724,160725],{"class":35,"line":4915},[33,160726,92],{"emptyLinePlaceholder":91},[33,160728,160729,160731],{"class":35,"line":4925},[33,160730,1332],{"class":163},[33,160732,160523],{"class":167},[33,160734,160735],{"class":35,"line":4935},[33,160736,92],{"emptyLinePlaceholder":91},[33,160738,160739],{"class":35,"line":4941},[33,160740,92],{"emptyLinePlaceholder":91},[33,160742,160743,160745,160747,160749,160751],{"class":35,"line":4950},[33,160744,562],{"class":163},[33,160746,6636],{"class":46},[33,160748,568],{"class":167},[33,160750,571],{"class":50},[33,160752,574],{"class":167},[33,160754,160755,160757,160759,160761,160763,160765,160768],{"class":35,"line":4960},[33,160756,6648],{"class":167},[33,160758,242],{"class":163},[33,160760,6653],{"class":167},[33,160762,6656],{"class":238},[33,160764,242],{"class":163},[33,160766,160767],{"class":54},"\"Generate a .docx sales report from 
CSV.\"",[33,160769,221],{"class":167},[33,160771,160772,160774,160776,160778,160780,160782,160784,160786,160788,160790,160792,160794,160796,160798],{"class":35,"line":4965},[33,160773,6669],{"class":167},[33,160775,64452],{"class":54},[33,160777,25480],{"class":167},[33,160779,25448],{"class":238},[33,160781,242],{"class":163},[33,160783,855],{"class":50},[33,160785,365],{"class":167},[33,160787,6677],{"class":238},[33,160789,242],{"class":163},[33,160791,6682],{"class":167},[33,160793,25463],{"class":238},[33,160795,242],{"class":163},[33,160797,107039],{"class":54},[33,160799,221],{"class":167},[33,160801,160802,160804,160806,160808,160810,160812,160814,160816,160818,160820,160822,160824,160826,160829],{"class":35,"line":4971},[33,160803,6669],{"class":167},[33,160805,41152],{"class":54},[33,160807,1166],{"class":167},[33,160809,25448],{"class":238},[33,160811,242],{"class":163},[33,160813,855],{"class":50},[33,160815,365],{"class":167},[33,160817,6677],{"class":238},[33,160819,242],{"class":163},[33,160821,6682],{"class":167},[33,160823,25463],{"class":238},[33,160825,242],{"class":163},[33,160827,160828],{"class":54},"\"Output .docx path\"",[33,160830,221],{"class":167},[33,160832,160833,160835,160838,160840,160842,160844,160847,160849,160851,160853,160856],{"class":35,"line":4983},[33,160834,6669],{"class":167},[33,160836,160837],{"class":54},"\"--title\"",[33,160839,365],{"class":167},[33,160841,6685],{"class":238},[33,160843,242],{"class":163},[33,160845,160846],{"class":54},"\"Report\"",[33,160848,98374],{"class":167},[33,160850,25463],{"class":238},[33,160852,242],{"class":163},[33,160854,160855],{"class":54},"\"Document 
title\"",[33,160857,221],{"class":167},[33,160859,160860,160862,160864],{"class":35,"line":4988},[33,160861,6766],{"class":167},[33,160863,242],{"class":163},[33,160865,6771],{"class":167},[33,160867,160868],{"class":35,"line":4993},[33,160869,92],{"emptyLinePlaceholder":91},[33,160871,160872,160874],{"class":35,"line":5003},[33,160873,2424],{"class":163},[33,160875,574],{"class":167},[33,160877,160878,160880,160882],{"class":35,"line":5008},[33,160879,7930],{"class":167},[33,160881,242],{"class":163},[33,160883,160884],{"class":167}," pd.read_csv(args.data)\n",[33,160886,160887,160889,160891],{"class":35,"line":5014},[33,160888,2449],{"class":163},[33,160890,2945],{"class":50},[33,160892,574],{"class":167},[33,160894,160895,160897,160899,160902,160904,160907,160909,160911],{"class":35,"line":5019},[33,160896,2995],{"class":167},[33,160898,4059],{"class":163},[33,160900,160901],{"class":54},"\"[ERROR] CSV not found: ",[33,160903,1115],{"class":50},[33,160905,160906],{"class":167},"args.data",[33,160908,1121],{"class":50},[33,160910,274],{"class":54},[33,160912,221],{"class":167},[33,160914,160915,160917],{"class":35,"line":5032},[33,160916,2449],{"class":163},[33,160918,160919],{"class":167}," pd.errors.EmptyDataError:\n",[33,160921,160922,160924,160927],{"class":35,"line":5039},[33,160923,2995],{"class":167},[33,160925,160926],{"class":54},"\"[ERROR] CSV is empty.\"",[33,160928,221],{"class":167},[33,160930,160931],{"class":35,"line":5068},[33,160932,92],{"emptyLinePlaceholder":91},[33,160934,160935,160938,160940],{"class":35,"line":5077},[33,160936,160937],{"class":167},"    doc      ",[33,160939,242],{"class":163},[33,160941,160942],{"class":167}," build_document(df, 
args.title)\n",[33,160944,160945,160947,160949],{"class":35,"line":5082},[33,160946,6388],{"class":167},[33,160948,242],{"class":163},[33,160950,124878],{"class":167},[33,160952,160953,160955,160957,160959,160961,160963,160965,160967,160969],{"class":35,"line":5089},[33,160954,64564],{"class":167},[33,160956,869],{"class":238},[33,160958,242],{"class":163},[33,160960,855],{"class":50},[33,160962,365],{"class":167},[33,160964,878],{"class":238},[33,160966,242],{"class":163},[33,160968,855],{"class":50},[33,160970,221],{"class":167},[33,160972,160973],{"class":35,"line":5098},[33,160974,92],{"emptyLinePlaceholder":91},[33,160976,160977,160979],{"class":35,"line":5105},[33,160978,1635],{"class":163},[33,160980,159879],{"class":167},[33,160982,160983,160985,160987,160989,160991,160993,160995,160998,161000,161002],{"class":35,"line":5110},[33,160984,159884],{"class":238},[33,160986,242],{"class":163},[33,160988,159889],{"class":54},[33,160990,365],{"class":167},[33,160992,159894],{"class":238},[33,160994,242],{"class":163},[33,160996,160997],{"class":167},"out_path.parent, 
",[33,160999,159902],{"class":238},[33,161001,242],{"class":163},[33,161003,8339],{"class":50},[33,161005,161006,161008,161010],{"class":35,"line":5115},[33,161007,35761],{"class":167},[33,161009,495],{"class":163},[33,161011,159915],{"class":167},[33,161013,161014,161016,161018],{"class":35,"line":5128},[33,161015,159920],{"class":167},[33,161017,242],{"class":163},[33,161019,159925],{"class":167},[33,161021,161022],{"class":35,"line":5135},[33,161023,92],{"emptyLinePlaceholder":91},[33,161025,161026,161028],{"class":35,"line":5142},[33,161027,2424],{"class":163},[33,161029,574],{"class":167},[33,161031,161032],{"class":35,"line":5151},[33,161033,159936],{"class":167},[33,161035,161036,161038,161040,161042,161044],{"class":35,"line":5156},[33,161037,159941],{"class":167},[33,161039,1053],{"class":50},[33,161041,159946],{"class":167},[33,161043,1053],{"class":50},[33,161045,161046],{"class":167},"(out_path))\n",[33,161048,161049,161051,161053,161055,161058,161060,161062,161064,161066],{"class":35,"line":5161},[33,161050,9414],{"class":50},[33,161052,602],{"class":167},[33,161054,4059],{"class":163},[33,161056,161057],{"class":54},"\"[OK] Report saved: ",[33,161059,1115],{"class":50},[33,161061,40722],{"class":167},[33,161063,1121],{"class":50},[33,161065,274],{"class":54},[33,161067,221],{"class":167},[33,161069,161070,161072,161074,161076],{"class":35,"line":5167},[33,161071,2449],{"class":163},[33,161073,783],{"class":50},[33,161075,1852],{"class":163},[33,161077,1855],{"class":167},[33,161079,161080,161082,161084,161086,161088],{"class":35,"line":5172},[33,161081,159964],{"class":167},[33,161083,67210],{"class":238},[33,161085,242],{"class":163},[33,161087,855],{"class":50},[33,161089,221],{"class":167},[33,161091,161092,161094,161096,161099,161101,161103,161105,161107],{"class":35,"line":5182},[33,161093,2995],{"class":167},[33,161095,4059],{"class":163},[33,161097,161098],{"class":54},"\"[ERROR] Save failed: 
",[33,161100,1115],{"class":50},[33,161102,6565],{"class":167},[33,161104,1121],{"class":50},[33,161106,274],{"class":54},[33,161108,221],{"class":167},[33,161110,161111],{"class":35,"line":5195},[33,161112,92],{"emptyLinePlaceholder":91},[33,161114,161115],{"class":35,"line":5200},[33,161116,92],{"emptyLinePlaceholder":91},[33,161118,161119,161121,161123,161125,161127],{"class":35,"line":5205},[33,161120,2491],{"class":163},[33,161122,2494],{"class":50},[33,161124,2497],{"class":163},[33,161126,2500],{"class":54},[33,161128,574],{"class":167},[33,161130,161131],{"class":35,"line":5210},[33,161132,6914],{"class":167},[2537,161134],{},[18,161136,88566],{"id":29183},[14,161138,161139,161142,161143,161145],{},[1974,161140,161141],{},"Does python-docx require Microsoft Word to be installed?","\nNo. It reads and writes the OOXML (",[30,161144,18051],{},") ZIP format entirely in Python — no COM automation, no Office installation, no Windows dependency. The library works identically on Linux, macOS, and Windows.",[14,161147,161148,161151,161152,161155,161156,36608,161159,161162,161163,69863,161166,69863,161169,161172,161173,161176],{},[1974,161149,161150],{},"How do I add page numbers to the footer?","\nPage numbers in Word are implemented as a ",[30,161153,161154],{},"PAGE"," field in the ",[30,161157,161158],{},"w:fldChar",[30,161160,161161],{},"w:instrText"," XML pattern — there is no high-level python-docx method for it. Insert the three-element XML sequence (",[30,161164,161165],{},"fldChar begin",[30,161167,161168],{},"instrText PAGE",[30,161170,161171],{},"fldChar end",") into a run in ",[30,161174,161175],{},"section.footer.paragraphs[0]"," as shown in the Headers and Footers step above.",[14,161178,161179,161182,161183,161186,161187,161190,161191,3035],{},[1974,161180,161181],{},"Can I copy a styled paragraph from one document into another?","\nNot via the high-level API. 
You must clone the underlying ",[30,161184,161185],{},"_p"," XML element: ",[30,161188,161189],{},"from copy import deepcopy; target_doc.element.body.append(deepcopy(src_para._p))",". This copies the element structure but does not copy style definitions — any styles referenced must already exist in the target document's ",[30,161192,161193],{},"styles.xml",[14,161195,161196,107278,161199,365,161202,71132,161205,161208,161209,161211,161212,3035],{},[1974,161197,161198],{},"How do I set font family, size, and color on runs?",[30,161200,161201],{},"run.font.name",[30,161203,161204],{},"run.font.size = Pt(12)",[30,161206,161207],{},"run.font.color.rgb = RGBColor(r, g, b)",". East-Asian font names require an extra ",[30,161210,157020],{}," XML attribute not exposed by the API — the oxml workaround and named style approach are covered in detail in ",[940,161213,157016],{"href":157015},[14,161215,161216,161219,161221,161222,161224,161225,161227,161228,161230,161231,161233,161234,3035],{},[1974,161217,161218],{},"What is the difference between python-docx and docxtpl?",[30,161220,18041],{}," builds documents element by element in code; ",[30,161223,18047],{}," renders a Jinja2-annotated ",[30,161226,18051],{}," template against a data dictionary. Use ",[30,161229,18041],{}," when the document structure itself is data-driven (variable number of sections, tables whose shape changes per record); use ",[30,161232,18047],{}," when the layout is fixed and only the values change — see ",[940,161235,26185],{"href":18040},[14,161237,161238,161241,161242,161245,161246,65087,161248,161251],{},[1974,161239,161240],{},"How do I generate a table of contents?","\npython-docx cannot build a live, auto-updating TOC — that requires Word's field calculation engine. 
You can insert a ",[30,161243,161244],{},"TOC"," field placeholder (",[30,161247,161158],{},[30,161249,161250],{},"w:instrText 'TOC \\\\o \"1-3\"'",") that Word will update when the user opens the document and accepts the prompt to update fields.",[2537,161253],{},[18,161255,6918],{"id":6917},[4211,161257,161258,161263,161268,161273],{},[4214,161259,161260,161262],{},[940,161261,157016],{"href":157015}," — fix run-level font changes that have no visible effect, including the East-Asian oxml workaround",[4214,161264,161265,161267],{},[940,161266,26185],{"href":18040}," — Jinja2 template loops for high-volume, layout-fixed document generation",[4214,161269,161270,161272],{},[940,161271,156178],{"href":156177}," — embed charts and logos with correct sizing and DPI",[4214,161274,161275,161279],{},[940,161276,161278],{"href":161277},"\u002Fword-document-templating-batch-processing\u002Fconverting-docx-to-pdf-with-python\u002F","Converting DOCX to PDF with Python"," — chain generation with headless PDF conversion for print-ready 
output",[14,161281,6947,161282,3035],{},[940,161283,26263],{"href":26262},[6953,161285,26204],{},{"title":28,"searchDepth":43,"depth":43,"links":161287},[161288,161289,161290,161299,161300,161305,161306,161307,161308,161309,161310],{"id":20,"depth":43,"text":21},{"id":156267,"depth":43,"text":156268},{"id":156453,"depth":43,"text":156454,"children":161291},[161292,161293,161294,161295,161296,161297,161298],{"id":156457,"depth":61,"text":156458},{"id":156712,"depth":61,"text":156713},{"id":156815,"depth":61,"text":156816},{"id":157024,"depth":61,"text":157025},{"id":157489,"depth":61,"text":157490},{"id":157676,"depth":61,"text":157677},{"id":157930,"depth":61,"text":157931},{"id":158027,"depth":43,"text":158028},{"id":158185,"depth":43,"text":158186,"children":161301},[161302,161303,161304],{"id":158189,"depth":61,"text":158190},{"id":158493,"depth":61,"text":158494},{"id":158761,"depth":61,"text":158762},{"id":159146,"depth":43,"text":159147},{"id":159751,"depth":43,"text":159752},{"id":160060,"depth":43,"text":160061},{"id":160189,"depth":43,"text":160190},{"id":29183,"depth":43,"text":88566},{"id":6917,"depth":43,"text":6918},"Word Doc Creation",{},"\u002Fword-document-templating-batch-processing\u002Fautomating-word-document-creation",{"title":156152,"description":161315},{"Build ":161316,"date":46387,"updatedAt":6978,"tags":161318},{"docx files programmatically with python-docx":161317},"paragraphs, headings, runs, tables, page breaks, sections, and batch output at scale.",[47,18041,161319,6989],"docx","Automate Word Document Creation with 
python-docx","word-document-templating-batch-processing\u002Fautomating-word-document-creation\u002Findex","wKz3SzM5zrBKWhYYyaJS05jSDfnUn23boBKe7pQk6Fk",{"id":161324,"title":157016,"body":161325,"breadcrumbTitle":163888,"canonical":6977,"date":6978,"description":163889,"draft":6980,"extension":6981,"image":6977,"meta":163890,"navigation":91,"path":163891,"robots":6977,"seo":163892,"seoTitle":157016,"stem":163893,"tags":163894,"updatedAt":6978,"__hash__":163895},"content\u002Fword-document-templating-batch-processing\u002Fautomating-word-document-creation\u002Fset-fonts-and-styles-with-python-docx\u002Findex.md",{"type":7,"value":161326,"toc":163877},[161327,161330,161342,161351,161353,161355,161358,161405,161425,161447,161449,161451,161458,161687,161693,161738,161750,161752,161756,161773,162036,162053,162055,162059,162066,162357,162360,162369,162372,162375,162481,162483,162487,162490,162874,162884,162886,162890,162896,163228,163240,163291,163293,163297,163300,163534,163537,163539,163541,163544,163824,163827,163833,163850,163852,163854,163871,163875],[10,161328,157016],{"id":161329},"set-fonts-and-styles-with-python-docx",[14,161331,161332,161335,161336,128085,161339,161341],{},[30,161333,161334],{},"run.font.name = \"Arial\""," runs without error, you save the file, open it in Word — and the font is still Calibri. Or you set a Chinese font name and the CJK characters still render in the default fallback face. 
Both symptoms share the same root cause: ",[1974,161337,161338],{},"run-level formatting only wins when no style higher in the inheritance chain explicitly overrides it",[30,161340,157020],{}," XML attribute that controls East-Asian character rendering is not exposed through the python-docx high-level API.",[14,161343,161344,161345,161347,161348,161350],{},"This guide explains the root cause, provides a minimal diagnostic to confirm it, then shows four targeted fixes: run-level font properties for Latin text, the ",[30,161346,157020],{}," oxml workaround for CJK characters, defining a reusable named character style, and modifying the ",[30,161349,99685],{}," paragraph style's font and spacing globally.",[2537,161352],{},[18,161354,7021],{"id":7020},[14,161356,161357],{},"Word resolves the final rendered font through a precedence chain:",[35387,161359,161360,161376,161385,161396],{},[4214,161361,161362,161365,161366,161369,161370,365,161372,161375],{},[1974,161363,161364],{},"Run direct formatting"," — properties set on a ",[30,161367,161368],{},"Run"," object (",[30,161371,161201],{},[30,161373,161374],{},"run.bold",", etc.)",[4214,161377,161378,161381,161382,12027],{},[1974,161379,161380],{},"Character style"," — a named character style applied to the run (",[30,161383,161384],{},"run.style = ...",[4214,161386,161387,161390,161391,365,161393,161375],{},[1974,161388,161389],{},"Paragraph style"," — the style of the paragraph the run belongs to (",[30,161392,99685],{},[30,161394,161395],{},"Body Text",[4214,161397,161398,161401,161402],{},[1974,161399,161400],{},"Document defaults"," — the document's default font and size in ",[30,161403,161404],{},"word\u002Fstyles.xml",[14,161406,36018,161407,161409,161410,10065,161413,161416,161417,161420,161421,161424],{},[30,161408,161334],{}," writes ",[30,161411,161412],{},"w:ascii=\"Arial\"",[30,161414,161415],{},"w:hAnsi=\"Arial\""," to the run's ",[30,161418,161419],{},"\u003Cw:rFonts>"," element. 
This correctly overrides the style chain ",[1974,161422,161423],{},"for the Latin character range",". However:",[4211,161426,161427,161433],{},[4214,161428,161429,161430,161432],{},"If you set ",[30,161431,161201],{}," but the paragraph style defines the same font explicitly, some renderers still show the style value (the run direct-formatting flag must be set, not just the value).",[4214,161434,39550,161435,161437,161438,161441,161442,161444,161445,3035],{},[30,161436,157020],{}," attribute — which Word uses for Han, Hiragana, Katakana, Hangul, and related ranges — is ",[1974,161439,161440],{},"never written by the python-docx API"," when you assign ",[30,161443,161201],{},". So CJK characters always fall through to the style or document default, regardless of what you put in ",[30,161446,161201],{},[2537,161448],{},[18,161450,35017],{"id":35016},[14,161452,161453,161454,161457],{},"Run this to see exactly what XML python-docx produces for a run with ",[30,161455,161456],{},"font.name"," assigned:",[23,161459,161461],{"className":126,"code":161460,"language":47,"meta":28,"style":28},"# pip install python-docx lxml\nfrom pathlib import Path\nfrom docx import Document\nfrom lxml import etree\n\nOUTPUT = Path(\"output\u002Fdiag_fonts.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\ndoc  = Document()\npara = doc.add_paragraph()\nrun  = para.add_run(\"Hello, 世界 — Latin and CJK mixed\")\nrun.font.name = \"Noto Sans SC\"   # intended CJK font\nrun.font.size = __import__(\"docx.shared\", fromlist=[\"Pt\"]).Pt(12)\n\ndoc.save(OUTPUT)\n\n# Re-open and inspect the raw XML of the first run\ndoc2    = Document(OUTPUT)\nfirst_r = doc2.paragraphs[0].runs[0]\nprint(etree.tostring(first_r._r, pretty_print=True).decode())\n",[30,161462,161463,161468,161478,161488,161500,161504,161517,161539,161543,161551,161559,161573,161586,161617,161621,161630,161634,161639,161652,161670],{"__ignoreMap":28},[33,161464,161465],{"class":35,"line":36},[33,161466,161467],{"class":39},"# 
pip install python-docx lxml\n",[33,161469,161470,161472,161474,161476],{"class":35,"line":43},[33,161471,190],{"class":163},[33,161473,193],{"class":167},[33,161475,164],{"class":163},[33,161477,198],{"class":167},[33,161479,161480,161482,161484,161486],{"class":35,"line":61},[33,161481,190],{"class":163},[33,161483,18092],{"class":167},[33,161485,164],{"class":163},[33,161487,18097],{"class":167},[33,161489,161490,161492,161495,161497],{"class":35,"line":73},[33,161491,190],{"class":163},[33,161493,161494],{"class":167}," lxml ",[33,161496,164],{"class":163},[33,161498,161499],{"class":167}," etree\n",[33,161501,161502],{"class":35,"line":88},[33,161503,92],{"emptyLinePlaceholder":91},[33,161505,161506,161508,161510,161512,161515],{"class":35,"line":95},[33,161507,96935],{"class":50},[33,161509,212],{"class":163},[33,161511,215],{"class":167},[33,161513,161514],{"class":54},"\"output\u002Fdiag_fonts.docx\"",[33,161516,221],{"class":167},[33,161518,161519,161521,161523,161525,161527,161529,161531,161533,161535,161537],{"class":35,"line":101},[33,161520,96935],{"class":50},[33,161522,866],{"class":167},[33,161524,869],{"class":238},[33,161526,242],{"class":163},[33,161528,855],{"class":50},[33,161530,365],{"class":167},[33,161532,878],{"class":238},[33,161534,242],{"class":163},[33,161536,855],{"class":50},[33,161538,221],{"class":167},[33,161540,161541],{"class":35,"line":171},[33,161542,92],{"emptyLinePlaceholder":91},[33,161544,161545,161547,161549],{"class":35,"line":179},[33,161546,158288],{"class":167},[33,161548,242],{"class":163},[33,161550,18229],{"class":167},[33,161552,161553,161555,161557],{"class":35,"line":187},[33,161554,158297],{"class":167},[33,161556,242],{"class":163},[33,161558,160458],{"class":167},[33,161560,161561,161564,161566,161568,161571],{"class":35,"line":201},[33,161562,161563],{"class":167},"run  ",[33,161565,242],{"class":163},[33,161567,156898],{"class":167},[33,161569,161570],{"class":54},"\"Hello, 世界 — Latin and CJK 
mixed\"",[33,161572,221],{"class":167},[33,161574,161575,161578,161580,161583],{"class":35,"line":206},[33,161576,161577],{"class":167},"run.font.name ",[33,161579,242],{"class":163},[33,161581,161582],{"class":54}," \"Noto Sans SC\"",[33,161584,161585],{"class":39},"   # intended CJK font\n",[33,161587,161588,161591,161593,161595,161597,161599,161601,161603,161605,161607,161610,161613,161615],{"class":35,"line":224},[33,161589,161590],{"class":167},"run.font.size ",[33,161592,242],{"class":163},[33,161594,158374],{"class":50},[33,161596,602],{"class":167},[33,161598,158379],{"class":54},[33,161600,365],{"class":167},[33,161602,158384],{"class":238},[33,161604,242],{"class":163},[33,161606,8309],{"class":167},[33,161608,161609],{"class":54},"\"Pt\"",[33,161611,161612],{"class":167},"]).Pt(",[33,161614,55650],{"class":50},[33,161616,221],{"class":167},[33,161618,161619],{"class":35,"line":229},[33,161620,92],{"emptyLinePlaceholder":91},[33,161622,161623,161626,161628],{"class":35,"line":235},[33,161624,161625],{"class":167},"doc.save(",[33,161627,96935],{"class":50},[33,161629,221],{"class":167},[33,161631,161632],{"class":35,"line":250},[33,161633,92],{"emptyLinePlaceholder":91},[33,161635,161636],{"class":35,"line":266},[33,161637,161638],{"class":39},"# Re-open and inspect the raw XML of the first run\n",[33,161640,161641,161644,161646,161648,161650],{"class":35,"line":290},[33,161642,161643],{"class":167},"doc2    ",[33,161645,242],{"class":163},[33,161647,156340],{"class":167},[33,161649,96935],{"class":50},[33,161651,221],{"class":167},[33,161653,161654,161657,161659,161662,161664,161666,161668],{"class":35,"line":295},[33,161655,161656],{"class":167},"first_r ",[33,161658,242],{"class":163},[33,161660,161661],{"class":167}," 
doc2.paragraphs[",[33,161663,748],{"class":50},[33,161665,18713],{"class":167},[33,161667,748],{"class":50},[33,161669,9202],{"class":167},[33,161671,161672,161674,161677,161680,161682,161684],{"class":35,"line":300},[33,161673,13474],{"class":50},[33,161675,161676],{"class":167},"(etree.tostring(first_r._r, ",[33,161678,161679],{"class":238},"pretty_print",[33,161681,242],{"class":163},[33,161683,855],{"class":50},[33,161685,161686],{"class":167},").decode())\n",[14,161688,161689,161690,161692],{},"Expected (broken) output — notice ",[30,161691,157020],{}," is absent:",[23,161694,161696],{"className":147140,"code":161695,"language":147142,"meta":28,"style":28},"\u003Cw:r xmlns:w=\"...\">\n  \u003Cw:rPr>\n    \u003Cw:rFonts w:ascii=\"Noto Sans SC\" w:hAnsi=\"Noto Sans SC\"\u002F>\n    \u003Cw:sz w:val=\"24\"\u002F>\n    \u003Cw:szCs w:val=\"24\"\u002F>\n  \u003C\u002Fw:rPr>\n  \u003Cw:t xml:space=\"preserve\">Hello, 世界 — Latin and CJK mixed\u003C\u002Fw:t>\n\u003C\u002Fw:r>\n",[30,161697,161698,161703,161708,161713,161718,161723,161728,161733],{"__ignoreMap":28},[33,161699,161700],{"class":35,"line":36},[33,161701,161702],{},"\u003Cw:r xmlns:w=\"...\">\n",[33,161704,161705],{"class":35,"line":43},[33,161706,161707],{},"  \u003Cw:rPr>\n",[33,161709,161710],{"class":35,"line":61},[33,161711,161712],{},"    \u003Cw:rFonts w:ascii=\"Noto Sans SC\" w:hAnsi=\"Noto Sans SC\"\u002F>\n",[33,161714,161715],{"class":35,"line":73},[33,161716,161717],{},"    \u003Cw:sz w:val=\"24\"\u002F>\n",[33,161719,161720],{"class":35,"line":88},[33,161721,161722],{},"    \u003Cw:szCs w:val=\"24\"\u002F>\n",[33,161724,161725],{"class":35,"line":95},[33,161726,161727],{},"  \u003C\u002Fw:rPr>\n",[33,161729,161730],{"class":35,"line":101},[33,161731,161732],{},"  \u003Cw:t xml:space=\"preserve\">Hello, 世界 — Latin and CJK 
mixed\u003C\u002Fw:t>\n",[33,161734,161735],{"class":35,"line":171},[33,161736,161737],{},"\u003C\u002Fw:r>\n",[14,161739,161740,10065,161743,161746,161747,161749],{},[30,161741,161742],{},"w:ascii",[30,161744,161745],{},"w:hAnsi"," are set, but ",[30,161748,157020],{}," is missing. Word falls back to its default CJK face (SimSun on Windows, Heiti SC on macOS) for the CJK codepoints.",[2537,161751],{},[18,161753,161755],{"id":161754},"fix-1-run-level-font-properties-latin-text","Fix 1 — Run-Level Font Properties (Latin Text)",[14,161757,161758,161759,10065,161761,161764,161765,365,161767,71132,161770,161772],{},"For Latin scripts, setting ",[30,161760,161201],{},[30,161762,161763],{},"run.font.size"," is sufficient as long as the paragraph's named style does not also set an explicit font. Add ",[30,161766,17236],{},[30,161768,161769],{},"italic",[30,161771,17245],{}," on the same object:",[23,161774,161776],{"className":126,"code":161775,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Pt, RGBColor\n\nOUTPUT = Path(\"output\u002Fword\u002Fstyled_run.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\ndoc  = Document()\npara = doc.add_paragraph()\nrun  = para.add_run(\"Critical notice — Latin text only\")\n\nrun.font.name      = \"Georgia\"                       # Latin typeface\nrun.font.size      = Pt(13)                          # explicit size in points\nrun.bold           = True                            # shortcut for run.font.bold = True\nrun.font.italic    = True\nrun.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)     # dark red (hex RGB)\n\ntry:\n    doc.save(OUTPUT)\n    print(f\"Saved: {OUTPUT}\")\nexcept OSError as exc:\n    print(f\"Save failed: 
{exc}\")\n",[30,161777,161778,161782,161792,161802,161812,161816,161829,161851,161855,161863,161871,161884,161888,161901,161918,161930,161939,161972,161976,161982,161990,162006,162016],{"__ignoreMap":28},[33,161779,161780],{"class":35,"line":36},[33,161781,156213],{"class":39},[33,161783,161784,161786,161788,161790],{"class":35,"line":43},[33,161785,190],{"class":163},[33,161787,193],{"class":167},[33,161789,164],{"class":163},[33,161791,198],{"class":167},[33,161793,161794,161796,161798,161800],{"class":35,"line":61},[33,161795,190],{"class":163},[33,161797,18092],{"class":167},[33,161799,164],{"class":163},[33,161801,18097],{"class":167},[33,161803,161804,161806,161808,161810],{"class":35,"line":73},[33,161805,190],{"class":163},[33,161807,18104],{"class":167},[33,161809,164],{"class":163},[33,161811,22662],{"class":167},[33,161813,161814],{"class":35,"line":88},[33,161815,92],{"emptyLinePlaceholder":91},[33,161817,161818,161820,161822,161824,161827],{"class":35,"line":95},[33,161819,96935],{"class":50},[33,161821,212],{"class":163},[33,161823,215],{"class":167},[33,161825,161826],{"class":54},"\"output\u002Fword\u002Fstyled_run.docx\"",[33,161828,221],{"class":167},[33,161830,161831,161833,161835,161837,161839,161841,161843,161845,161847,161849],{"class":35,"line":101},[33,161832,96935],{"class":50},[33,161834,866],{"class":167},[33,161836,869],{"class":238},[33,161838,242],{"class":163},[33,161840,855],{"class":50},[33,161842,365],{"class":167},[33,161844,878],{"class":238},[33,161846,242],{"class":163},[33,161848,855],{"class":50},[33,161850,221],{"class":167},[33,161852,161853],{"class":35,"line":171},[33,161854,92],{"emptyLinePlaceholder":91},[33,161856,161857,161859,161861],{"class":35,"line":179},[33,161858,158288],{"class":167},[33,161860,242],{"class":163},[33,161862,18229],{"class":167},[33,161864,161865,161867,161869],{"class":35,"line":187},[33,161866,158297],{"class":167},[33,161868,242],{"class":163},[33,161870,160458],{"class":167},[33,161872,161873
,161875,161877,161879,161882],{"class":35,"line":201},[33,161874,161563],{"class":167},[33,161876,242],{"class":163},[33,161878,156898],{"class":167},[33,161880,161881],{"class":54},"\"Critical notice — Latin text only\"",[33,161883,221],{"class":167},[33,161885,161886],{"class":35,"line":206},[33,161887,92],{"emptyLinePlaceholder":91},[33,161889,161890,161893,161895,161898],{"class":35,"line":224},[33,161891,161892],{"class":167},"run.font.name      ",[33,161894,242],{"class":163},[33,161896,161897],{"class":54}," \"Georgia\"",[33,161899,161900],{"class":39},"                       # Latin typeface\n",[33,161902,161903,161906,161908,161910,161912,161915],{"class":35,"line":229},[33,161904,161905],{"class":167},"run.font.size      ",[33,161907,242],{"class":163},[33,161909,18472],{"class":167},[33,161911,83532],{"class":50},[33,161913,161914],{"class":167},")                          ",[33,161916,161917],{"class":39},"# explicit size in points\n",[33,161919,161920,161923,161925,161927],{"class":35,"line":235},[33,161921,161922],{"class":167},"run.bold           ",[33,161924,242],{"class":163},[33,161926,2519],{"class":50},[33,161928,161929],{"class":39},"                            # shortcut for run.font.bold = True\n",[33,161931,161932,161935,161937],{"class":35,"line":250},[33,161933,161934],{"class":167},"run.font.italic    ",[33,161936,242],{"class":163},[33,161938,2887],{"class":50},[33,161940,161941,161944,161946,161948,161950,161953,161955,161957,161959,161961,161963,161966,161969],{"class":35,"line":266},[33,161942,161943],{"class":167},"run.font.color.rgb ",[33,161945,242],{"class":163},[33,161947,18288],{"class":167},[33,161949,18291],{"class":163},[33,161951,161952],{"class":50},"C0",[33,161954,365],{"class":167},[33,161956,18291],{"class":163},[33,161958,120779],{"class":50},[33,161960,365],{"class":167},[33,161962,18291],{"class":163},[33,161964,161965],{"class":50},"2B",[33,161967,161968],{"class":167},")     ",[33,161970,161971],{"class":39},"# dark 
red (hex RGB)\n",[33,161973,161974],{"class":35,"line":290},[33,161975,92],{"emptyLinePlaceholder":91},[33,161977,161978,161980],{"class":35,"line":295},[33,161979,35574],{"class":163},[33,161981,574],{"class":167},[33,161983,161984,161986,161988],{"class":35,"line":300},[33,161985,85716],{"class":167},[33,161987,96935],{"class":50},[33,161989,221],{"class":167},[33,161991,161992,161994,161996,161998,162000,162002,162004],{"class":35,"line":317},[33,161993,7268],{"class":50},[33,161995,602],{"class":167},[33,161997,4059],{"class":163},[33,161999,97737],{"class":54},[33,162001,97684],{"class":50},[33,162003,274],{"class":54},[33,162005,221],{"class":167},[33,162007,162008,162010,162012,162014],{"class":35,"line":332},[33,162009,35726],{"class":163},[33,162011,107953],{"class":50},[33,162013,1852],{"class":163},[33,162015,1855],{"class":167},[33,162017,162018,162020,162022,162024,162026,162028,162030,162032,162034],{"class":35,"line":347},[33,162019,7268],{"class":50},[33,162021,602],{"class":167},[33,162023,4059],{"class":163},[33,162025,158012],{"class":54},[33,162027,1115],{"class":50},[33,162029,6565],{"class":167},[33,162031,1121],{"class":50},[33,162033,274],{"class":54},[33,162035,221],{"class":167},[14,162037,162038,10065,162040,162043,162044,162046,162047,162049,162050,162052],{},[30,162039,161374],{},[30,162041,162042],{},"run.font.bold"," are equivalent. Prefer ",[30,162045,162042],{}," for consistency with other font properties. Setting a value to ",[30,162048,571],{}," explicitly resets it to inherit from the style chain; setting ",[30,162051,902],{}," explicitly suppresses bold even if the style requests it.",[2537,162054],{},[18,162056,162058],{"id":162057},"fix-2-east-asian-font-via-oxml-the-weastasia-workaround","Fix 2 — East-Asian Font via oxml (the w:eastAsia Workaround)",[14,162060,162061,162062,162065],{},"python-docx has no ",[30,162063,162064],{},"run.font.east_asian_name"," property. 
Set it by reaching into the underlying lxml element directly:",[23,162067,162069],{"className":126,"code":162068,"language":47,"meta":28,"style":28},"# pip install python-docx lxml\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.oxml.ns import qn\nfrom docx.shared import Pt\n\nOUTPUT    = Path(\"output\u002Fword\u002Fcjk_font.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\nCJK_FONT   = \"Noto Sans SC\"\nLATIN_FONT = \"Arial\"\n\ndoc  = Document()\npara = doc.add_paragraph()\nrun  = para.add_run(\"Hello, 世界 — mixed Latin and CJK text.\")\n\n# Set the Latin\u002FASCII font via the high-level API\nrun.font.name = LATIN_FONT\nrun.font.size = Pt(12)\n\n# Add w:eastAsia to the rFonts element — the missing step\nr_pr    = run._r.get_or_add_rPr()       # \u003Cw:rPr> element, created if absent\nr_fonts = r_pr.get_or_add_rFonts()      # \u003Cw:rFonts> element, created if absent\nr_fonts.set(qn(\"w:eastAsia\"), CJK_FONT) # write the attribute python-docx omits\n\ntry:\n    doc.save(OUTPUT)\n    print(f\"Saved: {OUTPUT}\")\nexcept OSError as exc:\n    print(f\"Save failed: 
{exc}\")\n",[30,162070,162071,162075,162085,162095,162105,162115,162119,162132,162154,162158,162168,162178,162182,162190,162198,162211,162215,162220,162229,162241,162245,162250,162263,162276,162293,162297,162303,162311,162327,162337],{"__ignoreMap":28},[33,162072,162073],{"class":35,"line":36},[33,162074,161467],{"class":39},[33,162076,162077,162079,162081,162083],{"class":35,"line":43},[33,162078,190],{"class":163},[33,162080,193],{"class":167},[33,162082,164],{"class":163},[33,162084,198],{"class":167},[33,162086,162087,162089,162091,162093],{"class":35,"line":61},[33,162088,190],{"class":163},[33,162090,18092],{"class":167},[33,162092,164],{"class":163},[33,162094,18097],{"class":167},[33,162096,162097,162099,162101,162103],{"class":35,"line":73},[33,162098,190],{"class":163},[33,162100,157055],{"class":167},[33,162102,164],{"class":163},[33,162104,157060],{"class":167},[33,162106,162107,162109,162111,162113],{"class":35,"line":88},[33,162108,190],{"class":163},[33,162110,18104],{"class":167},[33,162112,164],{"class":163},[33,162114,158240],{"class":167},[33,162116,162117],{"class":35,"line":95},[33,162118,92],{"emptyLinePlaceholder":91},[33,162120,162121,162123,162125,162127,162130],{"class":35,"line":101},[33,162122,96935],{"class":50},[33,162124,20470],{"class":163},[33,162126,215],{"class":167},[33,162128,162129],{"class":54},"\"output\u002Fword\u002Fcjk_font.docx\"",[33,162131,221],{"class":167},[33,162133,162134,162136,162138,162140,162142,162144,162146,162148,162150,162152],{"class":35,"line":171},[33,162135,96935],{"class":50},[33,162137,866],{"class":167},[33,162139,869],{"class":238},[33,162141,242],{"class":163},[33,162143,855],{"class":50},[33,162145,365],{"class":167},[33,162147,878],{"class":238},[33,162149,242],{"class":163},[33,162151,855],{"class":50},[33,162153,221],{"class":167},[33,162155,162156],{"class":35,"line":179},[33,162157,92],{"emptyLinePlaceholder":91},[33,162159,162160,162163,162165],{"class":35,"line":187},[33,162161,162162],{"clas
s":50},"CJK_FONT",[33,162164,21012],{"class":163},[33,162166,162167],{"class":54}," \"Noto Sans SC\"\n",[33,162169,162170,162173,162175],{"class":35,"line":201},[33,162171,162172],{"class":50},"LATIN_FONT",[33,162174,212],{"class":163},[33,162176,162177],{"class":54}," \"Arial\"\n",[33,162179,162180],{"class":35,"line":206},[33,162181,92],{"emptyLinePlaceholder":91},[33,162183,162184,162186,162188],{"class":35,"line":224},[33,162185,158288],{"class":167},[33,162187,242],{"class":163},[33,162189,18229],{"class":167},[33,162191,162192,162194,162196],{"class":35,"line":229},[33,162193,158297],{"class":167},[33,162195,242],{"class":163},[33,162197,160458],{"class":167},[33,162199,162200,162202,162204,162206,162209],{"class":35,"line":235},[33,162201,161563],{"class":167},[33,162203,242],{"class":163},[33,162205,156898],{"class":167},[33,162207,162208],{"class":54},"\"Hello, 世界 — mixed Latin and CJK text.\"",[33,162210,221],{"class":167},[33,162212,162213],{"class":35,"line":250},[33,162214,92],{"emptyLinePlaceholder":91},[33,162216,162217],{"class":35,"line":266},[33,162218,162219],{"class":39},"# Set the Latin\u002FASCII font via the high-level API\n",[33,162221,162222,162224,162226],{"class":35,"line":290},[33,162223,161577],{"class":167},[33,162225,242],{"class":163},[33,162227,162228],{"class":50}," LATIN_FONT\n",[33,162230,162231,162233,162235,162237,162239],{"class":35,"line":295},[33,162232,161590],{"class":167},[33,162234,242],{"class":163},[33,162236,18472],{"class":167},[33,162238,55650],{"class":50},[33,162240,221],{"class":167},[33,162242,162243],{"class":35,"line":300},[33,162244,92],{"emptyLinePlaceholder":91},[33,162246,162247],{"class":35,"line":317},[33,162248,162249],{"class":39},"# Add w:eastAsia to the rFonts element — the missing step\n",[33,162251,162252,162255,162257,162260],{"class":35,"line":332},[33,162253,162254],{"class":167},"r_pr    ",[33,162256,242],{"class":163},[33,162258,162259],{"class":167}," run._r.get_or_add_rPr()       
",[33,162261,162262],{"class":39},"# \u003Cw:rPr> element, created if absent\n",[33,162264,162265,162268,162270,162273],{"class":35,"line":347},[33,162266,162267],{"class":167},"r_fonts ",[33,162269,242],{"class":163},[33,162271,162272],{"class":167}," r_pr.get_or_add_rFonts()      ",[33,162274,162275],{"class":39},"# \u003Cw:rFonts> element, created if absent\n",[33,162277,162278,162281,162284,162286,162288,162290],{"class":35,"line":374},[33,162279,162280],{"class":167},"r_fonts.set(qn(",[33,162282,162283],{"class":54},"\"w:eastAsia\"",[33,162285,18525],{"class":167},[33,162287,162162],{"class":50},[33,162289,1649],{"class":167},[33,162291,162292],{"class":39},"# write the attribute python-docx omits\n",[33,162294,162295],{"class":35,"line":397},[33,162296,92],{"emptyLinePlaceholder":91},[33,162298,162299,162301],{"class":35,"line":653},[33,162300,35574],{"class":163},[33,162302,574],{"class":167},[33,162304,162305,162307,162309],{"class":35,"line":667},[33,162306,85716],{"class":167},[33,162308,96935],{"class":50},[33,162310,221],{"class":167},[33,162312,162313,162315,162317,162319,162321,162323,162325],{"class":35,"line":675},[33,162314,7268],{"class":50},[33,162316,602],{"class":167},[33,162318,4059],{"class":163},[33,162320,97737],{"class":54},[33,162322,97684],{"class":50},[33,162324,274],{"class":54},[33,162326,221],{"class":167},[33,162328,162329,162331,162333,162335],{"class":35,"line":689},[33,162330,35726],{"class":163},[33,162332,107953],{"class":50},[33,162334,1852],{"class":163},[33,162336,1855],{"class":167},[33,162338,162339,162341,162343,162345,162347,162349,162351,162353,162355],{"class":35,"line":703},[33,162340,7268],{"class":50},[33,162342,602],{"class":167},[33,162344,4059],{"class":163},[33,162346,158012],{"class":54},[33,162348,1115],{"class":50},[33,162350,6565],{"class":167},[33,162352,1121],{"class":50},[33,162354,274],{"class":54},[33,162356,221],{"class":167},[14,162358,162359],{},"After this fix, the XML for the run looks 
like:",[23,162361,162363],{"className":147140,"code":162362,"language":147142,"meta":28,"style":28},"\u003Cw:rFonts w:ascii=\"Arial\" w:hAnsi=\"Arial\" w:eastAsia=\"Noto Sans SC\"\u002F>\n",[30,162364,162365],{"__ignoreMap":28},[33,162366,162367],{"class":35,"line":36},[33,162368,162362],{},[14,162370,162371],{},"Word now selects \"Noto Sans SC\" for the CJK codepoints (U+4E00–U+9FFF and related ranges) and \"Arial\" for the Latin characters. Both fonts must be installed on the machine that opens the document, or Word will substitute silently.",[14,162373,162374],{},"A reusable helper to apply this pattern to any run:",[23,162376,162378],{"className":126,"code":162377,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom docx.oxml.ns import qn\n\ndef set_run_fonts(run, latin: str, cjk: str | None = None) -> None:\n    \"\"\"Set latin font (and optionally CJK font) on a run.\"\"\"\n    run.font.name = latin\n    if cjk:\n        rpr    = run._r.get_or_add_rPr()\n        rfonts = rpr.get_or_add_rFonts()\n        rfonts.set(qn(\"w:eastAsia\"), cjk)\n",[30,162379,162380,162384,162394,162398,162429,162434,162444,162451,162461,162471],{"__ignoreMap":28},[33,162381,162382],{"class":35,"line":36},[33,162383,156213],{"class":39},[33,162385,162386,162388,162390,162392],{"class":35,"line":43},[33,162387,190],{"class":163},[33,162389,157055],{"class":167},[33,162391,164],{"class":163},[33,162393,157060],{"class":167},[33,162395,162396],{"class":35,"line":61},[33,162397,92],{"emptyLinePlaceholder":91},[33,162399,162400,162402,162405,162408,162410,162413,162415,162417,162419,162421,162423,162425,162427],{"class":35,"line":73},[33,162401,562],{"class":163},[33,162403,162404],{"class":46}," set_run_fonts",[33,162406,162407],{"class":167},"(run, latin: ",[33,162409,1053],{"class":50},[33,162411,162412],{"class":167},", cjk: 
",[33,162414,1053],{"class":50},[33,162416,2850],{"class":163},[33,162418,7657],{"class":50},[33,162420,212],{"class":163},[33,162422,7657],{"class":50},[33,162424,1617],{"class":167},[33,162426,571],{"class":50},[33,162428,574],{"class":167},[33,162430,162431],{"class":35,"line":88},[33,162432,162433],{"class":54},"    \"\"\"Set latin font (and optionally CJK font) on a run.\"\"\"\n",[33,162435,162436,162439,162441],{"class":35,"line":95},[33,162437,162438],{"class":167},"    run.font.name ",[33,162440,242],{"class":163},[33,162442,162443],{"class":167}," latin\n",[33,162445,162446,162448],{"class":35,"line":101},[33,162447,617],{"class":163},[33,162449,162450],{"class":167}," cjk:\n",[33,162452,162453,162456,162458],{"class":35,"line":171},[33,162454,162455],{"class":167},"        rpr    ",[33,162457,242],{"class":163},[33,162459,162460],{"class":167}," run._r.get_or_add_rPr()\n",[33,162462,162463,162466,162468],{"class":35,"line":179},[33,162464,162465],{"class":167},"        rfonts ",[33,162467,242],{"class":163},[33,162469,162470],{"class":167}," rpr.get_or_add_rFonts()\n",[33,162472,162473,162476,162478],{"class":35,"line":187},[33,162474,162475],{"class":167},"        rfonts.set(qn(",[33,162477,162283],{"class":54},[33,162479,162480],{"class":167},"), cjk)\n",[2537,162482],{},[18,162484,162486],{"id":162485},"fix-3-define-a-named-character-style","Fix 3 — Define a Named Character Style",[14,162488,162489],{},"Applying direct formatting run-by-run is verbose and fragile at scale. 
Define a named character style once on the document and apply it by name:",[23,162491,162493],{"className":126,"code":162492,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.oxml.ns import qn\nfrom docx.shared import Pt, RGBColor\nfrom docx.enum.style import WD_STYLE_TYPE\n\nOUTPUT = Path(\"output\u002Fword\u002Fnamed_style.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\ndoc = Document()\n\n# Create a character style\nchar_style            = doc.styles.add_style(\"BrandHighlight\", WD_STYLE_TYPE.CHARACTER)\nchar_style.font.name  = \"Trebuchet MS\"\nchar_style.font.size  = Pt(12)\nchar_style.font.bold  = True\nchar_style.font.color.rgb = RGBColor(0x1D, 0x4E, 0xD8)   # brand blue\n\n# Add East-Asian coverage to the style via oxml\nrpr    = char_style.element.get_or_add_rPr()\nrfonts = rpr.get_or_add_rFonts()\nrfonts.set(qn(\"w:eastAsia\"), \"Noto Sans SC\")\n\n# Apply by name to any run\npara         = doc.add_paragraph(\"Total revenue: \")\nhighlight    = para.add_run(\"$1.4 M\")\nhighlight.style = doc.styles[\"BrandHighlight\"]   # apply the character style\n\npara.add_run(\" — up 12 % year-over-year.\")\n\ntry:\n    doc.save(OUTPUT)\n    print(f\"Saved: {OUTPUT}\")\nexcept OSError as exc:\n    print(f\"Save failed: 
{exc}\")\n",[30,162494,162495,162499,162509,162519,162529,162539,162551,162555,162568,162590,162594,162602,162606,162611,162636,162646,162659,162668,162701,162705,162710,162720,162729,162743,162747,162752,162766,162779,162796,162800,162810,162814,162820,162828,162844,162854],{"__ignoreMap":28},[33,162496,162497],{"class":35,"line":36},[33,162498,156213],{"class":39},[33,162500,162501,162503,162505,162507],{"class":35,"line":43},[33,162502,190],{"class":163},[33,162504,193],{"class":167},[33,162506,164],{"class":163},[33,162508,198],{"class":167},[33,162510,162511,162513,162515,162517],{"class":35,"line":61},[33,162512,190],{"class":163},[33,162514,18092],{"class":167},[33,162516,164],{"class":163},[33,162518,18097],{"class":167},[33,162520,162521,162523,162525,162527],{"class":35,"line":73},[33,162522,190],{"class":163},[33,162524,157055],{"class":167},[33,162526,164],{"class":163},[33,162528,157060],{"class":167},[33,162530,162531,162533,162535,162537],{"class":35,"line":88},[33,162532,190],{"class":163},[33,162534,18104],{"class":167},[33,162536,164],{"class":163},[33,162538,22662],{"class":167},[33,162540,162541,162543,162546,162548],{"class":35,"line":95},[33,162542,190],{"class":163},[33,162544,162545],{"class":167}," docx.enum.style ",[33,162547,164],{"class":163},[33,162549,162550],{"class":50}," 
WD_STYLE_TYPE\n",[33,162552,162553],{"class":35,"line":101},[33,162554,92],{"emptyLinePlaceholder":91},[33,162556,162557,162559,162561,162563,162566],{"class":35,"line":171},[33,162558,96935],{"class":50},[33,162560,212],{"class":163},[33,162562,215],{"class":167},[33,162564,162565],{"class":54},"\"output\u002Fword\u002Fnamed_style.docx\"",[33,162567,221],{"class":167},[33,162569,162570,162572,162574,162576,162578,162580,162582,162584,162586,162588],{"class":35,"line":179},[33,162571,96935],{"class":50},[33,162573,866],{"class":167},[33,162575,869],{"class":238},[33,162577,242],{"class":163},[33,162579,855],{"class":50},[33,162581,365],{"class":167},[33,162583,878],{"class":238},[33,162585,242],{"class":163},[33,162587,855],{"class":50},[33,162589,221],{"class":167},[33,162591,162592],{"class":35,"line":187},[33,162593,92],{"emptyLinePlaceholder":91},[33,162595,162596,162598,162600],{"class":35,"line":201},[33,162597,156566],{"class":167},[33,162599,242],{"class":163},[33,162601,18229],{"class":167},[33,162603,162604],{"class":35,"line":206},[33,162605,92],{"emptyLinePlaceholder":91},[33,162607,162608],{"class":35,"line":224},[33,162609,162610],{"class":39},"# Create a character style\n",[33,162612,162613,162616,162618,162621,162624,162626,162629,162631,162634],{"class":35,"line":229},[33,162614,162615],{"class":167},"char_style            ",[33,162617,242],{"class":163},[33,162619,162620],{"class":167}," doc.styles.add_style(",[33,162622,162623],{"class":54},"\"BrandHighlight\"",[33,162625,365],{"class":167},[33,162627,162628],{"class":50},"WD_STYLE_TYPE",[33,162630,3035],{"class":167},[33,162632,162633],{"class":50},"CHARACTER",[33,162635,221],{"class":167},[33,162637,162638,162641,162643],{"class":35,"line":235},[33,162639,162640],{"class":167},"char_style.font.name  ",[33,162642,242],{"class":163},[33,162644,162645],{"class":54}," \"Trebuchet 
MS\"\n",[33,162647,162648,162651,162653,162655,162657],{"class":35,"line":250},[33,162649,162650],{"class":167},"char_style.font.size  ",[33,162652,242],{"class":163},[33,162654,18472],{"class":167},[33,162656,55650],{"class":50},[33,162658,221],{"class":167},[33,162660,162661,162664,162666],{"class":35,"line":266},[33,162662,162663],{"class":167},"char_style.font.bold  ",[33,162665,242],{"class":163},[33,162667,2887],{"class":50},[33,162669,162670,162673,162675,162677,162679,162682,162684,162686,162689,162691,162693,162696,162698],{"class":35,"line":290},[33,162671,162672],{"class":167},"char_style.font.color.rgb ",[33,162674,242],{"class":163},[33,162676,18288],{"class":167},[33,162678,18291],{"class":163},[33,162680,162681],{"class":50},"1D",[33,162683,365],{"class":167},[33,162685,18291],{"class":163},[33,162687,162688],{"class":50},"4E",[33,162690,365],{"class":167},[33,162692,18291],{"class":163},[33,162694,162695],{"class":50},"D8",[33,162697,12000],{"class":167},[33,162699,162700],{"class":39},"# brand blue\n",[33,162702,162703],{"class":35,"line":295},[33,162704,92],{"emptyLinePlaceholder":91},[33,162706,162707],{"class":35,"line":300},[33,162708,162709],{"class":39},"# Add East-Asian coverage to the style via oxml\n",[33,162711,162712,162715,162717],{"class":35,"line":317},[33,162713,162714],{"class":167},"rpr    ",[33,162716,242],{"class":163},[33,162718,162719],{"class":167}," char_style.element.get_or_add_rPr()\n",[33,162721,162722,162725,162727],{"class":35,"line":332},[33,162723,162724],{"class":167},"rfonts ",[33,162726,242],{"class":163},[33,162728,162470],{"class":167},[33,162730,162731,162734,162736,162738,162741],{"class":35,"line":347},[33,162732,162733],{"class":167},"rfonts.set(qn(",[33,162735,162283],{"class":54},[33,162737,18525],{"class":167},[33,162739,162740],{"class":54},"\"Noto Sans 
SC\"",[33,162742,221],{"class":167},[33,162744,162745],{"class":35,"line":374},[33,162746,92],{"emptyLinePlaceholder":91},[33,162748,162749],{"class":35,"line":397},[33,162750,162751],{"class":39},"# Apply by name to any run\n",[33,162753,162754,162757,162759,162761,162764],{"class":35,"line":653},[33,162755,162756],{"class":167},"para         ",[33,162758,242],{"class":163},[33,162760,156861],{"class":167},[33,162762,162763],{"class":54},"\"Total revenue: \"",[33,162765,221],{"class":167},[33,162767,162768,162771,162773,162775,162777],{"class":35,"line":667},[33,162769,162770],{"class":167},"highlight    ",[33,162772,242],{"class":163},[33,162774,156898],{"class":167},[33,162776,157200],{"class":54},[33,162778,221],{"class":167},[33,162780,162781,162784,162786,162789,162791,162793],{"class":35,"line":675},[33,162782,162783],{"class":167},"highlight.style ",[33,162785,242],{"class":163},[33,162787,162788],{"class":167}," doc.styles[",[33,162790,162623],{"class":54},[33,162792,48135],{"class":167},[33,162794,162795],{"class":39},"# apply the character style\n",[33,162797,162798],{"class":35,"line":689},[33,162799,92],{"emptyLinePlaceholder":91},[33,162801,162802,162805,162808],{"class":35,"line":703},[33,162803,162804],{"class":167},"para.add_run(",[33,162806,162807],{"class":54},"\" — up 12 % 
year-over-year.\"",[33,162809,221],{"class":167},[33,162811,162812],{"class":35,"line":714},[33,162813,92],{"emptyLinePlaceholder":91},[33,162815,162816,162818],{"class":35,"line":723},[33,162817,35574],{"class":163},[33,162819,574],{"class":167},[33,162821,162822,162824,162826],{"class":35,"line":754},[33,162823,85716],{"class":167},[33,162825,96935],{"class":50},[33,162827,221],{"class":167},[33,162829,162830,162832,162834,162836,162838,162840,162842],{"class":35,"line":771},[33,162831,7268],{"class":50},[33,162833,602],{"class":167},[33,162835,4059],{"class":163},[33,162837,97737],{"class":54},[33,162839,97684],{"class":50},[33,162841,274],{"class":54},[33,162843,221],{"class":167},[33,162845,162846,162848,162850,162852],{"class":35,"line":777},[33,162847,35726],{"class":163},[33,162849,107953],{"class":50},[33,162851,1852],{"class":163},[33,162853,1855],{"class":167},[33,162855,162856,162858,162860,162862,162864,162866,162868,162870,162872],{"class":35,"line":788},[33,162857,7268],{"class":50},[33,162859,602],{"class":167},[33,162861,4059],{"class":163},[33,162863,158012],{"class":54},[33,162865,1115],{"class":50},[33,162867,6565],{"class":167},[33,162869,1121],{"class":50},[33,162871,274],{"class":54},[33,162873,221],{"class":167},[14,162875,162876,162877,162879,162880,162883],{},"Named styles are stored in ",[30,162878,158153],{}," and travel with the document. When a user opens the document in Word and modifies the ",[30,162881,162882],{},"BrandHighlight"," style definition, every run that references it updates automatically — a key advantage over run-level direct formatting.",[2537,162885],{},[18,162887,162889],{"id":162888},"fix-4-modify-the-normal-style-and-paragraph-spacing","Fix 4 — Modify the Normal Style and Paragraph Spacing",[14,162891,162892,162893,162895],{},"If every paragraph in the document should use a different font or line spacing baseline, override the ",[30,162894,99685],{}," paragraph style globally. 
This avoids setting properties on every individual paragraph or run:",[23,162897,162899],{"className":126,"code":162898,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Pt\nfrom docx.oxml.ns import qn\n\nOUTPUT = Path(\"output\u002Fword\u002Fnormal_override.docx\")\nOUTPUT.parent.mkdir(parents=True, exist_ok=True)\n\ndoc          = Document()\nnormal_style = doc.styles[\"Normal\"]\n\n# Change the default font for the whole document\nnormal_style.font.name = \"Source Sans Pro\"\nnormal_style.font.size = Pt(11)\n\n# Apply East-Asian font to the Normal style\nrpr    = normal_style.element.get_or_add_rPr()\nrfonts = rpr.get_or_add_rFonts()\nrfonts.set(qn(\"w:eastAsia\"), \"Noto Sans SC\")\n\n# Remove default space-after so paragraphs don't balloon with extra whitespace\npf = normal_style.paragraph_format\npf.space_after  = Pt(0)\npf.space_before = Pt(0)\npf.line_spacing = Pt(15)   # 15pt leading for 11pt body text\n\ndoc.add_paragraph(\"This paragraph inherits the new Normal style baseline.\")\ndoc.add_paragraph(\"So does this one — no per-run overrides needed.\")\n\ntry:\n    doc.save(OUTPUT)\n    print(f\"Saved: {OUTPUT}\")\nexcept OSError as exc:\n    print(f\"Save failed: 
{exc}\")\n",[30,162900,162901,162905,162915,162925,162935,162945,162949,162962,162984,162988,162997,163010,163014,163019,163029,163042,163046,163051,163060,163068,163080,163084,163089,163099,163112,163125,163141,163145,163155,163164,163168,163174,163182,163198,163208],{"__ignoreMap":28},[33,162902,162903],{"class":35,"line":36},[33,162904,156213],{"class":39},[33,162906,162907,162909,162911,162913],{"class":35,"line":43},[33,162908,190],{"class":163},[33,162910,193],{"class":167},[33,162912,164],{"class":163},[33,162914,198],{"class":167},[33,162916,162917,162919,162921,162923],{"class":35,"line":61},[33,162918,190],{"class":163},[33,162920,18092],{"class":167},[33,162922,164],{"class":163},[33,162924,18097],{"class":167},[33,162926,162927,162929,162931,162933],{"class":35,"line":73},[33,162928,190],{"class":163},[33,162930,18104],{"class":167},[33,162932,164],{"class":163},[33,162934,158240],{"class":167},[33,162936,162937,162939,162941,162943],{"class":35,"line":88},[33,162938,190],{"class":163},[33,162940,157055],{"class":167},[33,162942,164],{"class":163},[33,162944,157060],{"class":167},[33,162946,162947],{"class":35,"line":95},[33,162948,92],{"emptyLinePlaceholder":91},[33,162950,162951,162953,162955,162957,162960],{"class":35,"line":101},[33,162952,96935],{"class":50},[33,162954,212],{"class":163},[33,162956,215],{"class":167},[33,162958,162959],{"class":54},"\"output\u002Fword\u002Fnormal_override.docx\"",[33,162961,221],{"class":167},[33,162963,162964,162966,162968,162970,162972,162974,162976,162978,162980,162982],{"class":35,"line":171},[33,162965,96935],{"class":50},[33,162967,866],{"class":167},[33,162969,869],{"class":238},[33,162971,242],{"class":163},[33,162973,855],{"class":50},[33,162975,365],{"class":167},[33,162977,878],{"class":238},[33,162979,242],{"class":163},[33,162981,855],{"class":50},[33,162983,221],{"class":167},[33,162985,162986],{"class":35,"line":179},[33,162987,92],{"emptyLinePlaceholder":91},[33,162989,162990,162993,162995],{"class":
35,"line":187},[33,162991,162992],{"class":167},"doc          ",[33,162994,242],{"class":163},[33,162996,18229],{"class":167},[33,162998,162999,163002,163004,163006,163008],{"class":35,"line":201},[33,163000,163001],{"class":167},"normal_style ",[33,163003,242],{"class":163},[33,163005,162788],{"class":167},[33,163007,19348],{"class":54},[33,163009,9202],{"class":167},[33,163011,163012],{"class":35,"line":206},[33,163013,92],{"emptyLinePlaceholder":91},[33,163015,163016],{"class":35,"line":224},[33,163017,163018],{"class":39},"# Change the default font for the whole document\n",[33,163020,163021,163024,163026],{"class":35,"line":229},[33,163022,163023],{"class":167},"normal_style.font.name ",[33,163025,242],{"class":163},[33,163027,163028],{"class":54}," \"Source Sans Pro\"\n",[33,163030,163031,163034,163036,163038,163040],{"class":35,"line":235},[33,163032,163033],{"class":167},"normal_style.font.size ",[33,163035,242],{"class":163},[33,163037,18472],{"class":167},[33,163039,17260],{"class":50},[33,163041,221],{"class":167},[33,163043,163044],{"class":35,"line":250},[33,163045,92],{"emptyLinePlaceholder":91},[33,163047,163048],{"class":35,"line":266},[33,163049,163050],{"class":39},"# Apply East-Asian font to the Normal style\n",[33,163052,163053,163055,163057],{"class":35,"line":290},[33,163054,162714],{"class":167},[33,163056,242],{"class":163},[33,163058,163059],{"class":167}," normal_style.element.get_or_add_rPr()\n",[33,163061,163062,163064,163066],{"class":35,"line":295},[33,163063,162724],{"class":167},[33,163065,242],{"class":163},[33,163067,162470],{"class":167},[33,163069,163070,163072,163074,163076,163078],{"class":35,"line":300},[33,163071,162733],{"class":167},[33,163073,162283],{"class":54},[33,163075,18525],{"class":167},[33,163077,162740],{"class":54},[33,163079,221],{"class":167},[33,163081,163082],{"class":35,"line":317},[33,163083,92],{"emptyLinePlaceholder":91},[33,163085,163086],{"class":35,"line":332},[33,163087,163088],{"class":39},"# Remove 
default space-after so paragraphs don't balloon with extra whitespace\n",[33,163090,163091,163094,163096],{"class":35,"line":347},[33,163092,163093],{"class":167},"pf ",[33,163095,242],{"class":163},[33,163097,163098],{"class":167}," normal_style.paragraph_format\n",[33,163100,163101,163104,163106,163108,163110],{"class":35,"line":374},[33,163102,163103],{"class":167},"pf.space_after  ",[33,163105,242],{"class":163},[33,163107,18472],{"class":167},[33,163109,748],{"class":50},[33,163111,221],{"class":167},[33,163113,163114,163117,163119,163121,163123],{"class":35,"line":397},[33,163115,163116],{"class":167},"pf.space_before ",[33,163118,242],{"class":163},[33,163120,18472],{"class":167},[33,163122,748],{"class":50},[33,163124,221],{"class":167},[33,163126,163127,163130,163132,163134,163136,163138],{"class":35,"line":653},[33,163128,163129],{"class":167},"pf.line_spacing ",[33,163131,242],{"class":163},[33,163133,18472],{"class":167},[33,163135,1646],{"class":50},[33,163137,12000],{"class":167},[33,163139,163140],{"class":39},"# 15pt leading for 11pt body text\n",[33,163142,163143],{"class":35,"line":667},[33,163144,92],{"emptyLinePlaceholder":91},[33,163146,163147,163150,163153],{"class":35,"line":675},[33,163148,163149],{"class":167},"doc.add_paragraph(",[33,163151,163152],{"class":54},"\"This paragraph inherits the new Normal style baseline.\"",[33,163154,221],{"class":167},[33,163156,163157,163159,163162],{"class":35,"line":689},[33,163158,163149],{"class":167},[33,163160,163161],{"class":54},"\"So does this one — no per-run overrides 
needed.\"",[33,163163,221],{"class":167},[33,163165,163166],{"class":35,"line":703},[33,163167,92],{"emptyLinePlaceholder":91},[33,163169,163170,163172],{"class":35,"line":714},[33,163171,35574],{"class":163},[33,163173,574],{"class":167},[33,163175,163176,163178,163180],{"class":35,"line":723},[33,163177,85716],{"class":167},[33,163179,96935],{"class":50},[33,163181,221],{"class":167},[33,163183,163184,163186,163188,163190,163192,163194,163196],{"class":35,"line":754},[33,163185,7268],{"class":50},[33,163187,602],{"class":167},[33,163189,4059],{"class":163},[33,163191,97737],{"class":54},[33,163193,97684],{"class":50},[33,163195,274],{"class":54},[33,163197,221],{"class":167},[33,163199,163200,163202,163204,163206],{"class":35,"line":771},[33,163201,35726],{"class":163},[33,163203,107953],{"class":50},[33,163205,1852],{"class":163},[33,163207,1855],{"class":167},[33,163209,163210,163212,163214,163216,163218,163220,163222,163224,163226],{"class":35,"line":777},[33,163211,7268],{"class":50},[33,163213,602],{"class":167},[33,163215,4059],{"class":163},[33,163217,158012],{"class":54},[33,163219,1115],{"class":50},[33,163221,6565],{"class":167},[33,163223,1121],{"class":50},[33,163225,274],{"class":54},[33,163227,221],{"class":167},[14,163229,163230,163231,163233,163234,365,163237,163239],{},"Modifying ",[30,163232,99685],{}," cascades to every paragraph that inherits from it. Custom styles that explicitly define their own font (",[30,163235,163236],{},"Heading 1",[30,163238,161395],{},", etc.) are unaffected. 
To change Heading 1's font independently:",[23,163241,163243],{"className":126,"code":163242,"language":47,"meta":28,"style":28},"h1 = doc.styles[\"Heading 1\"]\nh1.font.name = \"Montserrat\"\nh1.font.size = Pt(18)\nh1.font.bold = True\n",[30,163244,163245,163259,163269,163282],{"__ignoreMap":28},[33,163246,163247,163250,163252,163254,163257],{"class":35,"line":36},[33,163248,163249],{"class":167},"h1 ",[33,163251,242],{"class":163},[33,163253,162788],{"class":167},[33,163255,163256],{"class":54},"\"Heading 1\"",[33,163258,9202],{"class":167},[33,163260,163261,163264,163266],{"class":35,"line":43},[33,163262,163263],{"class":167},"h1.font.name ",[33,163265,242],{"class":163},[33,163267,163268],{"class":54}," \"Montserrat\"\n",[33,163270,163271,163274,163276,163278,163280],{"class":35,"line":61},[33,163272,163273],{"class":167},"h1.font.size ",[33,163275,242],{"class":163},[33,163277,18472],{"class":167},[33,163279,19300],{"class":50},[33,163281,221],{"class":167},[33,163283,163284,163287,163289],{"class":35,"line":73},[33,163285,163286],{"class":167},"h1.font.bold ",[33,163288,242],{"class":163},[33,163290,2887],{"class":50},[2537,163292],{},[18,163294,163296],{"id":163295},"variant-stripping-direct-formatting-from-an-existing-document","Variant — Stripping Direct Formatting From an Existing Document",[14,163298,163299],{},"When a document acquired from an external source has stale run-level font overrides that resist style updates, clear them:",[23,163301,163303],{"className":126,"code":163302,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.oxml.ns import qn\n\nPATH = Path(\"output\u002Fword\u002Fexisting.docx\")\n\ntry:\n    doc = Document(PATH)\nexcept FileNotFoundError:\n    raise SystemExit(f\"File not found: {PATH}\")\n\nfor para in doc.paragraphs:\n    for run in para.runs:\n        rpr = run._r.find(qn(\"w:rPr\"))\n        if rpr is not None:\n            # Remove direct font 
specification — runs will now inherit the style\n            for rfonts in rpr.findall(qn(\"w:rFonts\")):\n                rpr.remove(rfonts)\n            for sz in rpr.findall(qn(\"w:sz\")):\n                rpr.remove(sz)\n\ndoc.save(PATH)\nprint(\"Direct font overrides cleared.\")\n",[30,163304,163305,163309,163319,163329,163339,163343,163356,163360,163366,163378,163386,163405,163409,163421,163433,163448,163463,163468,163485,163490,163506,163511,163515,163523],{"__ignoreMap":28},[33,163306,163307],{"class":35,"line":36},[33,163308,156213],{"class":39},[33,163310,163311,163313,163315,163317],{"class":35,"line":43},[33,163312,190],{"class":163},[33,163314,193],{"class":167},[33,163316,164],{"class":163},[33,163318,198],{"class":167},[33,163320,163321,163323,163325,163327],{"class":35,"line":61},[33,163322,190],{"class":163},[33,163324,18092],{"class":167},[33,163326,164],{"class":163},[33,163328,18097],{"class":167},[33,163330,163331,163333,163335,163337],{"class":35,"line":73},[33,163332,190],{"class":163},[33,163334,157055],{"class":167},[33,163336,164],{"class":163},[33,163338,157060],{"class":167},[33,163340,163341],{"class":35,"line":88},[33,163342,92],{"emptyLinePlaceholder":91},[33,163344,163345,163347,163349,163351,163354],{"class":35,"line":95},[33,163346,122],{"class":50},[33,163348,212],{"class":163},[33,163350,215],{"class":167},[33,163352,163353],{"class":54},"\"output\u002Fword\u002Fexisting.docx\"",[33,163355,221],{"class":167},[33,163357,163358],{"class":35,"line":101},[33,163359,92],{"emptyLinePlaceholder":91},[33,163361,163362,163364],{"class":35,"line":171},[33,163363,35574],{"class":163},[33,163365,574],{"class":167},[33,163367,163368,163370,163372,163374,163376],{"class":35,"line":179},[33,163369,18224],{"class":167},[33,163371,242],{"class":163},[33,163373,156340],{"class":167},[33,163375,122],{"class":50},[33,163377,221],{"class":167},[33,163379,163380,163382,163384],{"class":35,"line":187},[33,163381,35726],{"class":163},[33,163383,2945],{"c
lass":50},[33,163385,574],{"class":167},[33,163387,163388,163390,163392,163394,163396,163398,163401,163403],{"class":35,"line":201},[33,163389,35742],{"class":163},[33,163391,16617],{"class":50},[33,163393,602],{"class":167},[33,163395,4059],{"class":163},[33,163397,15677],{"class":54},[33,163399,163400],{"class":50},"{PATH}",[33,163402,274],{"class":54},[33,163404,221],{"class":167},[33,163406,163407],{"class":35,"line":206},[33,163408,92],{"emptyLinePlaceholder":91},[33,163410,163411,163413,163416,163418],{"class":35,"line":224},[33,163412,6124],{"class":163},[33,163414,163415],{"class":167}," para ",[33,163417,662],{"class":163},[33,163419,163420],{"class":167}," doc.paragraphs:\n",[33,163422,163423,163425,163428,163430],{"class":35,"line":229},[33,163424,656],{"class":163},[33,163426,163427],{"class":167}," run ",[33,163429,662],{"class":163},[33,163431,163432],{"class":167}," para.runs:\n",[33,163434,163435,163438,163440,163443,163446],{"class":35,"line":235},[33,163436,163437],{"class":167},"        rpr ",[33,163439,242],{"class":163},[33,163441,163442],{"class":167}," run._r.find(qn(",[33,163444,163445],{"class":54},"\"w:rPr\"",[33,163447,371],{"class":167},[33,163449,163450,163452,163455,163457,163459,163461],{"class":35,"line":250},[33,163451,8221],{"class":163},[33,163453,163454],{"class":167}," rpr ",[33,163456,3847],{"class":163},[33,163458,620],{"class":163},[33,163460,7657],{"class":50},[33,163462,574],{"class":167},[33,163464,163465],{"class":35,"line":266},[33,163466,163467],{"class":39},"            # Remove direct font specification — runs will now inherit the style\n",[33,163469,163470,163472,163475,163477,163480,163483],{"class":35,"line":290},[33,163471,1793],{"class":163},[33,163473,163474],{"class":167}," rfonts ",[33,163476,662],{"class":163},[33,163478,163479],{"class":167}," 
rpr.findall(qn(",[33,163481,163482],{"class":54},"\"w:rFonts\"",[33,163484,8687],{"class":167},[33,163486,163487],{"class":35,"line":295},[33,163488,163489],{"class":167},"                rpr.remove(rfonts)\n",[33,163491,163492,163494,163497,163499,163501,163504],{"class":35,"line":300},[33,163493,1793],{"class":163},[33,163495,163496],{"class":167}," sz ",[33,163498,662],{"class":163},[33,163500,163479],{"class":167},[33,163502,163503],{"class":54},"\"w:sz\"",[33,163505,8687],{"class":167},[33,163507,163508],{"class":35,"line":317},[33,163509,163510],{"class":167},"                rpr.remove(sz)\n",[33,163512,163513],{"class":35,"line":332},[33,163514,92],{"emptyLinePlaceholder":91},[33,163516,163517,163519,163521],{"class":35,"line":347},[33,163518,161625],{"class":167},[33,163520,122],{"class":50},[33,163522,221],{"class":167},[33,163524,163525,163527,163529,163532],{"class":35,"line":374},[33,163526,13474],{"class":50},[33,163528,602],{"class":167},[33,163530,163531],{"class":54},"\"Direct font overrides cleared.\"",[33,163533,221],{"class":167},[14,163535,163536],{},"Use this when a style update at the document level is not flowing through to runs that were formatted individually.",[2537,163538],{},[18,163540,9247],{"id":9246},[14,163542,163543],{},"Confirm the font attributes are written correctly by re-opening the file and printing the XML:",[23,163545,163547],{"className":126,"code":163546,"language":47,"meta":28,"style":28},"# pip install python-docx lxml\nfrom pathlib import Path\nfrom docx import Document\nfrom lxml import etree\n\ndef check_run_fonts(path: Path) -> None:\n    \"\"\"Print the w:rFonts attributes for every run in the document.\"\"\"\n    doc = Document(path)\n    W = \"http:\u002F\u002Fschemas.openxmlformats.org\u002Fwordprocessingml\u002F2006\u002Fmain\"\n\n    for i, para in enumerate(doc.paragraphs):\n        for j, run in enumerate(para.runs):\n            rpr = run._r.find(f\"{{{W}}}rPr\")\n            if rpr is None:\n               
 continue\n            rf = rpr.find(f\"{{{W}}}rFonts\")\n            if rf is not None:\n                attrs = {k.split(\"}\")[-1]: v for k, v in rf.attrib.items()}\n                print(f\"Para {i}, Run {j}: {attrs}\")\n\ncheck_run_fonts(Path(\"output\u002Fword\u002Fcjk_font.docx\"))\n",[30,163548,163549,163553,163563,163573,163583,163587,163600,163605,163613,163623,163627,163641,163655,163683,163695,163699,163724,163739,163771,163811,163815],{"__ignoreMap":28},[33,163550,163551],{"class":35,"line":36},[33,163552,161467],{"class":39},[33,163554,163555,163557,163559,163561],{"class":35,"line":43},[33,163556,190],{"class":163},[33,163558,193],{"class":167},[33,163560,164],{"class":163},[33,163562,198],{"class":167},[33,163564,163565,163567,163569,163571],{"class":35,"line":61},[33,163566,190],{"class":163},[33,163568,18092],{"class":167},[33,163570,164],{"class":163},[33,163572,18097],{"class":167},[33,163574,163575,163577,163579,163581],{"class":35,"line":73},[33,163576,190],{"class":163},[33,163578,161494],{"class":167},[33,163580,164],{"class":163},[33,163582,161499],{"class":167},[33,163584,163585],{"class":35,"line":88},[33,163586,92],{"emptyLinePlaceholder":91},[33,163588,163589,163591,163594,163596,163598],{"class":35,"line":95},[33,163590,562],{"class":163},[33,163592,163593],{"class":46}," check_run_fonts",[33,163595,3743],{"class":167},[33,163597,571],{"class":50},[33,163599,574],{"class":167},[33,163601,163602],{"class":35,"line":101},[33,163603,163604],{"class":54},"    \"\"\"Print the w:rFonts attributes for every run in the document.\"\"\"\n",[33,163606,163607,163609,163611],{"class":35,"line":171},[33,163608,18224],{"class":167},[33,163610,242],{"class":163},[33,163612,159234],{"class":167},[33,163614,163615,163618,163620],{"class":35,"line":179},[33,163616,163617],{"class":167},"    W ",[33,163619,242],{"class":163},[33,163621,163622],{"class":54}," 
\"http:\u002F\u002Fschemas.openxmlformats.org\u002Fwordprocessingml\u002F2006\u002Fmain\"\n",[33,163624,163625],{"class":35,"line":187},[33,163626,92],{"emptyLinePlaceholder":91},[33,163628,163629,163631,163634,163636,163638],{"class":35,"line":201},[33,163630,656],{"class":163},[33,163632,163633],{"class":167}," i, para ",[33,163635,662],{"class":163},[33,163637,7403],{"class":50},[33,163639,163640],{"class":167},"(doc.paragraphs):\n",[33,163642,163643,163645,163648,163650,163652],{"class":35,"line":206},[33,163644,5973],{"class":163},[33,163646,163647],{"class":167}," j, run ",[33,163649,662],{"class":163},[33,163651,7403],{"class":50},[33,163653,163654],{"class":167},"(para.runs):\n",[33,163656,163657,163660,163662,163665,163667,163669,163672,163675,163678,163681],{"class":35,"line":224},[33,163658,163659],{"class":167},"            rpr ",[33,163661,242],{"class":163},[33,163663,163664],{"class":167}," run._r.find(",[33,163666,4059],{"class":163},[33,163668,274],{"class":54},[33,163670,163671],{"class":50},"{{{",[33,163673,163674],{"class":167},"W",[33,163676,163677],{"class":50},"}}}",[33,163679,163680],{"class":54},"rPr\"",[33,163682,221],{"class":167},[33,163684,163685,163687,163689,163691,163693],{"class":35,"line":229},[33,163686,5995],{"class":163},[33,163688,163454],{"class":167},[33,163690,3847],{"class":163},[33,163692,7657],{"class":50},[33,163694,574],{"class":167},[33,163696,163697],{"class":35,"line":235},[33,163698,12315],{"class":163},[33,163700,163701,163704,163706,163709,163711,163713,163715,163717,163719,163722],{"class":35,"line":250},[33,163702,163703],{"class":167},"            rf ",[33,163705,242],{"class":163},[33,163707,163708],{"class":167}," 
rpr.find(",[33,163710,4059],{"class":163},[33,163712,274],{"class":54},[33,163714,163671],{"class":50},[33,163716,163674],{"class":167},[33,163718,163677],{"class":50},[33,163720,163721],{"class":54},"rFonts\"",[33,163723,221],{"class":167},[33,163725,163726,163728,163731,163733,163735,163737],{"class":35,"line":266},[33,163727,5995],{"class":163},[33,163729,163730],{"class":167}," rf ",[33,163732,3847],{"class":163},[33,163734,620],{"class":163},[33,163736,7657],{"class":50},[33,163738,574],{"class":167},[33,163740,163741,163744,163746,163749,163752,163754,163756,163758,163761,163763,163766,163768],{"class":35,"line":290},[33,163742,163743],{"class":167},"                attrs ",[33,163745,242],{"class":163},[33,163747,163748],{"class":167}," {k.split(",[33,163750,163751],{"class":54},"\"}\"",[33,163753,109745],{"class":167},[33,163755,4126],{"class":163},[33,163757,734],{"class":50},[33,163759,163760],{"class":167},"]: v ",[33,163762,6124],{"class":163},[33,163764,163765],{"class":167}," k, v ",[33,163767,662],{"class":163},[33,163769,163770],{"class":167}," rf.attrib.items()}\n",[33,163772,163773,163775,163777,163779,163782,163784,163786,163788,163791,163793,163796,163798,163800,163802,163805,163807,163809],{"class":35,"line":295},[33,163774,8264],{"class":50},[33,163776,602],{"class":167},[33,163778,4059],{"class":163},[33,163780,163781],{"class":54},"\"Para ",[33,163783,1115],{"class":50},[33,163785,7499],{"class":167},[33,163787,1121],{"class":50},[33,163789,163790],{"class":54},", Run 
",[33,163792,1115],{"class":50},[33,163794,163795],{"class":167},"j",[33,163797,1121],{"class":50},[33,163799,2079],{"class":54},[33,163801,1115],{"class":50},[33,163803,163804],{"class":167},"attrs",[33,163806,1121],{"class":50},[33,163808,274],{"class":54},[33,163810,221],{"class":167},[33,163812,163813],{"class":35,"line":300},[33,163814,92],{"emptyLinePlaceholder":91},[33,163816,163817,163820,163822],{"class":35,"line":317},[33,163818,163819],{"class":167},"check_run_fonts(Path(",[33,163821,162129],{"class":54},[33,163823,371],{"class":167},[14,163825,163826],{},"For Fix 2 the output should be:",[23,163828,163831],{"className":163829,"code":163830,"language":2000},[1998],"Para 0, Run 0: {'ascii': 'Arial', 'hAnsi': 'Arial', 'eastAsia': 'Noto Sans SC'}\n",[30,163832,163830],{"__ignoreMap":28},[14,163834,41963,163835,163838,163839,163842,163843,163846,163847,163849],{},[30,163836,163837],{},"eastAsia"," is missing, the ",[30,163840,163841],{},"r_fonts.set(qn(\"w:eastAsia\"), ...)"," line did not execute. 
Add a ",[30,163844,163845],{},"print(etree.tostring(r_fonts, ...))"," immediately after it to confirm the attribute was written before ",[30,163848,158141],{}," is called.",[2537,163851],{},[18,163853,6918],{"id":6917},[4211,163855,163856,163861,163866],{},[4214,163857,163858,163860],{},[940,163859,156152],{"href":26562}," — full guide to paragraphs, headings, tables, and page breaks with python-docx",[4214,163862,163863,163865],{},[940,163864,26185],{"href":18040}," — Jinja2 template rendering for batch document generation where styles are set in the template file",[4214,163867,163868,163870],{},[940,163869,156178],{"href":156177}," — embed images with correct DPI and sizing alongside styled text",[14,163872,6947,163873,3035],{},[940,163874,156152],{"href":26562},[6953,163876,59405],{},{"title":28,"searchDepth":43,"depth":43,"links":163878},[163879,163880,163881,163882,163883,163884,163885,163886,163887],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":161754,"depth":43,"text":161755},{"id":162057,"depth":43,"text":162058},{"id":162485,"depth":43,"text":162486},{"id":162888,"depth":43,"text":162889},{"id":163295,"depth":43,"text":163296},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fonts & Styles","Font name, size, bold, and color changes in python-docx have no visible effect when run-level and style-level formatting conflict. 
Fix it with oxml for East-Asian fonts too.",{},"\u002Fword-document-templating-batch-processing\u002Fautomating-word-document-creation\u002Fset-fonts-and-styles-with-python-docx",{"title":157016,"description":163889},"word-document-templating-batch-processing\u002Fautomating-word-document-creation\u002Fset-fonts-and-styles-with-python-docx\u002Findex",[47,18041,59426,161319],"aI3msdAd0Pkcbm-Efq1HewwqE7Q5xu2Ge7jSxm3xNSU",{"id":163897,"title":163898,"body":163899,"breadcrumbTitle":166419,"canonical":6977,"date":6978,"description":166420,"draft":6980,"extension":6981,"image":6977,"meta":166421,"navigation":91,"path":166422,"robots":6977,"seo":166423,"seoTitle":166424,"stem":166425,"tags":166426,"updatedAt":6978,"__hash__":166428},"content\u002Fword-document-templating-batch-processing\u002Fconverting-docx-to-pdf-with-python\u002Ffix-docx2pdf-error-on-linux\u002Findex.md","Fix docx2pdf Error on Linux",{"type":7,"value":163900,"toc":166403},[163901,163904,163910,163916,163923,163929,163932,163934,163951,163957,163959,163962,164103,164106,164112,164116,164125,164129,164200,164204,164646,164650,164658,165252,165256,165265,165484,165494,165498,165501,165507,165517,165531,165537,165552,165562,165566,165571,165577,165844,165846,165875,165879,165885,165888,165950,165953,166016,166022,166026,166143,166145,166148,166363,166377,166379,166396,166400],[10,163902,163898],{"id":163903},"fix-docx2pdf-error-on-linux",[14,163905,138217,163906,163909],{},[30,163907,163908],{},"docx2pdf"," on a Linux machine raises one of two errors immediately:",[23,163911,163914],{"className":163912,"code":163913,"language":2000},[1998],"NotImplementedError: docx2pdf is not implemented for linux as it requires Microsoft Word to be installed\n",[30,163915,163913],{"__ignoreMap":28},[14,163917,163918,163919,163922],{},"or, if you have imported ",[30,163920,163921],{},"win32com"," directly:",[23,163924,163927],{"className":163925,"code":163926,"language":2000},[1998],"ModuleNotFoundError: No module named 
'win32com'\n",[30,163928,163926],{"__ignoreMap":28},[14,163930,163931],{},"Both errors have the same root cause.",[18,163933,7021],{"id":7020},[14,163935,163936,150718,163938,163940,163941,163944,163945,163947,163948,163950],{},[30,163937,163908],{},[30,163939,18051],{}," to PDF by driving Microsoft Word's COM automation layer — on Windows via ",[30,163942,163943],{},"win32com.client",", on macOS via AppleScript. Neither mechanism exists on Linux. The library contains a hard platform check that raises ",[30,163946,86120],{}," before attempting any conversion. There is no configuration flag or workaround that makes ",[30,163949,163908],{}," work on Linux; the dependency on Word is architectural.",[14,163952,163953,163954,163956],{},"If you reached this page by running a script copied from a Windows developer's machine, or by deploying to a Linux server without changing the conversion call, you need to replace ",[30,163955,163908],{}," with LibreOffice headless for the Linux execution path.",[18,163958,35017],{"id":35016},[14,163960,163961],{},"The following snippet confirms the error without touching any files:",[23,163963,163965],{"className":126,"code":163964,"language":47,"meta":28,"style":28},"# pip install docx2pdf\nimport platform\n\nprint(f\"Platform: {platform.system()}\")   # expect 'Linux'\n\ntry:\n    from docx2pdf import convert\n    convert(\"dummy.docx\")\nexcept NotImplementedError as exc:\n    print(f\"Confirmed root cause: {exc}\")\nexcept Exception as exc:\n    print(f\"Other error: {exc}\")\n",[30,163966,163967,163972,163979,163983,164008,164012,164018,164030,164040,164051,164072,164082],{"__ignoreMap":28},[33,163968,163969],{"class":35,"line":36},[33,163970,163971],{"class":39},"# pip install docx2pdf\n",[33,163973,163974,163976],{"class":35,"line":43},[33,163975,164],{"class":163},[33,163977,163978],{"class":167}," 
platform\n",[33,163980,163981],{"class":35,"line":61},[33,163982,92],{"emptyLinePlaceholder":91},[33,163984,163985,163987,163989,163991,163994,163996,163999,164001,164003,164005],{"class":35,"line":73},[33,163986,13474],{"class":50},[33,163988,602],{"class":167},[33,163990,4059],{"class":163},[33,163992,163993],{"class":54},"\"Platform: ",[33,163995,1115],{"class":50},[33,163997,163998],{"class":167},"platform.system()",[33,164000,1121],{"class":50},[33,164002,274],{"class":54},[33,164004,12000],{"class":167},[33,164006,164007],{"class":39},"# expect 'Linux'\n",[33,164009,164010],{"class":35,"line":88},[33,164011,92],{"emptyLinePlaceholder":91},[33,164013,164014,164016],{"class":35,"line":95},[33,164015,35574],{"class":163},[33,164017,574],{"class":167},[33,164019,164020,164022,164025,164027],{"class":35,"line":101},[33,164021,3878],{"class":163},[33,164023,164024],{"class":167}," docx2pdf ",[33,164026,164],{"class":163},[33,164028,164029],{"class":167}," convert\n",[33,164031,164032,164035,164038],{"class":35,"line":171},[33,164033,164034],{"class":167},"    convert(",[33,164036,164037],{"class":54},"\"dummy.docx\"",[33,164039,221],{"class":167},[33,164041,164042,164044,164047,164049],{"class":35,"line":179},[33,164043,35726],{"class":163},[33,164045,164046],{"class":50}," NotImplementedError",[33,164048,1852],{"class":163},[33,164050,1855],{"class":167},[33,164052,164053,164055,164057,164059,164062,164064,164066,164068,164070],{"class":35,"line":187},[33,164054,7268],{"class":50},[33,164056,602],{"class":167},[33,164058,4059],{"class":163},[33,164060,164061],{"class":54},"\"Confirmed root cause: 
",[33,164063,1115],{"class":50},[33,164065,6565],{"class":167},[33,164067,1121],{"class":50},[33,164069,274],{"class":54},[33,164071,221],{"class":167},[33,164073,164074,164076,164078,164080],{"class":35,"line":201},[33,164075,35726],{"class":163},[33,164077,783],{"class":50},[33,164079,1852],{"class":163},[33,164081,1855],{"class":167},[33,164083,164084,164086,164088,164090,164093,164095,164097,164099,164101],{"class":35,"line":206},[33,164085,7268],{"class":50},[33,164087,602],{"class":167},[33,164089,4059],{"class":163},[33,164091,164092],{"class":54},"\"Other error: ",[33,164094,1115],{"class":50},[33,164096,6565],{"class":167},[33,164098,1121],{"class":50},[33,164100,274],{"class":54},[33,164102,221],{"class":167},[14,164104,164105],{},"Expected output on Linux:",[23,164107,164110],{"className":164108,"code":164109,"language":2000},[1998],"Platform: Linux\nConfirmed root cause: docx2pdf is not implemented for linux as it requires Microsoft Word to be installed\n",[30,164111,164109],{"__ignoreMap":28},[18,164113,164115],{"id":164114},"fix-use-libreoffice-headless","Fix: Use LibreOffice Headless",[14,164117,164118,164119,164122,164123,3035],{},"LibreOffice ships a headless conversion mode via the ",[30,164120,164121],{},"soffice"," binary. 
It requires no Python package — call it with ",[30,164124,35794],{},[424,164126,164128],{"id":164127},"step-1-install-libreoffice","Step 1 — Install LibreOffice",[23,164130,164132],{"className":25,"code":164131,"language":27,"meta":28,"style":28},"# Ubuntu \u002F Debian\nsudo apt update && sudo apt install -y libreoffice\n\n# RHEL \u002F CentOS \u002F Rocky\nsudo yum install -y libreoffice\n\n# Verify\nsoffice --version\n# LibreOffice 7.x.x ...\n",[30,164133,164134,164139,164159,164163,164168,164181,164185,164189,164195],{"__ignoreMap":28},[33,164135,164136],{"class":35,"line":36},[33,164137,164138],{"class":39},"# Ubuntu \u002F Debian\n",[33,164140,164141,164143,164145,164147,164149,164151,164153,164155,164157],{"class":35,"line":43},[33,164142,9669],{"class":46},[33,164144,57878],{"class":54},[33,164146,35211],{"class":54},[33,164148,35214],{"class":167},[33,164150,9669],{"class":46},[33,164152,57878],{"class":54},[33,164154,79],{"class":54},[33,164156,20912],{"class":50},[33,164158,26696],{"class":54},[33,164160,164161],{"class":35,"line":61},[33,164162,92],{"emptyLinePlaceholder":91},[33,164164,164165],{"class":35,"line":73},[33,164166,164167],{"class":39},"# RHEL \u002F CentOS \u002F Rocky\n",[33,164169,164170,164172,164175,164177,164179],{"class":35,"line":88},[33,164171,9669],{"class":46},[33,164173,164174],{"class":54}," yum",[33,164176,79],{"class":54},[33,164178,20912],{"class":50},[33,164180,26696],{"class":54},[33,164182,164183],{"class":35,"line":95},[33,164184,92],{"emptyLinePlaceholder":91},[33,164186,164187],{"class":35,"line":101},[33,164188,98],{"class":39},[33,164190,164191,164193],{"class":35,"line":171},[33,164192,164121],{"class":46},[33,164194,41864],{"class":50},[33,164196,164197],{"class":35,"line":179},[33,164198,164199],{"class":39},"# LibreOffice 7.x.x ...\n",[424,164201,164203],{"id":164202},"step-2-replace-the-docx2pdf-call","Step 2 — Replace the docx2pdf 
call",[23,164205,164207],{"className":126,"code":164206,"language":47,"meta":28,"style":28},"# No pip package needed — requires soffice on PATH\nimport subprocess\nfrom pathlib import Path\n\ndef convert_with_libreoffice(docx_path: Path, output_dir: Path) -> Path:\n    \"\"\"Convert a .docx file to PDF using LibreOffice headless.\"\"\"\n    docx_path = docx_path.resolve()   # soffice requires an absolute path\n    output_dir = output_dir.resolve()\n    output_dir.mkdir(parents=True, exist_ok=True)\n\n    result = subprocess.run(\n        [\n            \"soffice\",\n            \"--headless\",\n            \"--convert-to\", \"pdf\",  # target format\n            \"--outdir\", str(output_dir),  # where the PDF lands\n            str(docx_path),\n        ],\n        capture_output=True,\n        text=True,\n        timeout=120,  # seconds; increase for large documents\n    )\n\n    if result.returncode != 0:\n        raise RuntimeError(\n            f\"LibreOffice conversion failed:\\n{result.stderr.strip()}\"\n        )\n\n    pdf_path = output_dir \u002F (docx_path.stem + \".pdf\")\n    if not pdf_path.exists():\n        raise FileNotFoundError(f\"Expected output not found: {pdf_path}\")\n\n    return pdf_path\n\n\n# Usage\ninput_file = Path(\"documents\u002Freport.docx\")\noutput_dir = Path(\"output_pdfs\")\n\ntry:\n    pdf = convert_with_libreoffice(input_file, output_dir)\n    print(f\"Converted: {pdf}\")\nexcept FileNotFoundError as exc:\n    print(f\"Input missing: {exc}\")\nexcept RuntimeError as exc:\n    print(f\"Conversion error: 
{exc}\")\n",[30,164208,164209,164214,164220,164230,164234,164244,164249,164261,164270,164290,164294,164302,164306,164313,164320,164333,164347,164354,164358,164368,164378,164391,164395,164399,164411,164419,164435,164439,164443,164463,164471,164494,164498,164505,164509,164513,164517,164531,164545,164549,164555,164564,164585,164595,164615,164625],{"__ignoreMap":28},[33,164210,164211],{"class":35,"line":36},[33,164212,164213],{"class":39},"# No pip package needed — requires soffice on PATH\n",[33,164215,164216,164218],{"class":35,"line":43},[33,164217,164],{"class":163},[33,164219,35040],{"class":167},[33,164221,164222,164224,164226,164228],{"class":35,"line":61},[33,164223,190],{"class":163},[33,164225,193],{"class":167},[33,164227,164],{"class":163},[33,164229,198],{"class":167},[33,164231,164232],{"class":35,"line":73},[33,164233,92],{"emptyLinePlaceholder":91},[33,164235,164236,164238,164241],{"class":35,"line":88},[33,164237,562],{"class":163},[33,164239,164240],{"class":46}," convert_with_libreoffice",[33,164242,164243],{"class":167},"(docx_path: Path, output_dir: Path) -> Path:\n",[33,164245,164246],{"class":35,"line":95},[33,164247,164248],{"class":54},"    \"\"\"Convert a .docx file to PDF using LibreOffice headless.\"\"\"\n",[33,164250,164251,164253,164255,164258],{"class":35,"line":101},[33,164252,21681],{"class":167},[33,164254,242],{"class":163},[33,164256,164257],{"class":167}," docx_path.resolve()   ",[33,164259,164260],{"class":39},"# soffice requires an absolute path\n",[33,164262,164263,164265,164267],{"class":35,"line":171},[33,164264,22180],{"class":167},[33,164266,242],{"class":163},[33,164268,164269],{"class":167}," 
output_dir.resolve()\n",[33,164271,164272,164274,164276,164278,164280,164282,164284,164286,164288],{"class":35,"line":179},[33,164273,6346],{"class":167},[33,164275,869],{"class":238},[33,164277,242],{"class":163},[33,164279,855],{"class":50},[33,164281,365],{"class":167},[33,164283,878],{"class":238},[33,164285,242],{"class":163},[33,164287,855],{"class":50},[33,164289,221],{"class":167},[33,164291,164292],{"class":35,"line":187},[33,164293,92],{"emptyLinePlaceholder":91},[33,164295,164296,164298,164300],{"class":35,"line":201},[33,164297,8842],{"class":167},[33,164299,242],{"class":163},[33,164301,35060],{"class":167},[33,164303,164304],{"class":35,"line":206},[33,164305,19619],{"class":167},[33,164307,164308,164311],{"class":35,"line":224},[33,164309,164310],{"class":54},"            \"soffice\"",[33,164312,247],{"class":167},[33,164314,164315,164318],{"class":35,"line":229},[33,164316,164317],{"class":54},"            \"--headless\"",[33,164319,247],{"class":167},[33,164321,164322,164324,164326,164328,164330],{"class":35,"line":235},[33,164323,148389],{"class":54},[33,164325,365],{"class":167},[33,164327,15519],{"class":54},[33,164329,25480],{"class":167},[33,164331,164332],{"class":39},"# target format\n",[33,164334,164335,164337,164339,164341,164344],{"class":35,"line":250},[33,164336,148401],{"class":54},[33,164338,365],{"class":167},[33,164340,1053],{"class":50},[33,164342,164343],{"class":167},"(output_dir),  ",[33,164345,164346],{"class":39},"# where the PDF 
lands\n",[33,164348,164349,164351],{"class":35,"line":266},[33,164350,10673],{"class":50},[33,164352,164353],{"class":167},"(docx_path),\n",[33,164355,164356],{"class":35,"line":290},[33,164357,20776],{"class":167},[33,164359,164360,164362,164364,164366],{"class":35,"line":295},[33,164361,148430],{"class":238},[33,164363,242],{"class":163},[33,164365,855],{"class":50},[33,164367,247],{"class":167},[33,164369,164370,164372,164374,164376],{"class":35,"line":300},[33,164371,148441],{"class":238},[33,164373,242],{"class":163},[33,164375,855],{"class":50},[33,164377,247],{"class":167},[33,164379,164380,164382,164384,164386,164388],{"class":35,"line":317},[33,164381,148452],{"class":238},[33,164383,242],{"class":163},[33,164385,2589],{"class":50},[33,164387,25480],{"class":167},[33,164389,164390],{"class":39},"# seconds; increase for large documents\n",[33,164392,164393],{"class":35,"line":332},[33,164394,1202],{"class":167},[33,164396,164397],{"class":35,"line":347},[33,164398,92],{"emptyLinePlaceholder":91},[33,164400,164401,164403,164405,164407,164409],{"class":35,"line":374},[33,164402,617],{"class":163},[33,164404,35108],{"class":167},[33,164406,17877],{"class":163},[33,164408,10791],{"class":50},[33,164410,574],{"class":167},[33,164412,164413,164415,164417],{"class":35,"line":397},[33,164414,4051],{"class":163},[33,164416,7590],{"class":50},[33,164418,7637],{"class":167},[33,164420,164421,164423,164426,164428,164431,164433],{"class":35,"line":653},[33,164422,12744],{"class":163},[33,164424,164425],{"class":54},"\"LibreOffice conversion 
failed:",[33,164427,5793],{"class":50},[33,164429,164430],{"class":167},"result.stderr.strip()",[33,164432,1121],{"class":50},[33,164434,7504],{"class":54},[33,164436,164437],{"class":35,"line":667},[33,164438,5867],{"class":167},[33,164440,164441],{"class":35,"line":675},[33,164442,92],{"emptyLinePlaceholder":91},[33,164444,164445,164447,164449,164451,164453,164456,164458,164461],{"class":35,"line":689},[33,164446,21570],{"class":167},[33,164448,242],{"class":163},[33,164450,6393],{"class":167},[33,164452,1351],{"class":163},[33,164454,164455],{"class":167}," (docx_path.stem ",[33,164457,1811],{"class":163},[33,164459,164460],{"class":54}," \".pdf\"",[33,164462,221],{"class":167},[33,164464,164465,164467,164469],{"class":35,"line":703},[33,164466,617],{"class":163},[33,164468,620],{"class":163},[33,164470,21595],{"class":167},[33,164472,164473,164475,164477,164479,164481,164484,164486,164488,164490,164492],{"class":35,"line":714},[33,164474,4051],{"class":163},[33,164476,2945],{"class":50},[33,164478,602],{"class":167},[33,164480,4059],{"class":163},[33,164482,164483],{"class":54},"\"Expected output not found: ",[33,164485,1115],{"class":50},[33,164487,27069],{"class":167},[33,164489,1121],{"class":50},[33,164491,274],{"class":54},[33,164493,221],{"class":167},[33,164495,164496],{"class":35,"line":723},[33,164497,92],{"emptyLinePlaceholder":91},[33,164499,164500,164502],{"class":35,"line":754},[33,164501,1332],{"class":163},[33,164503,164504],{"class":167}," pdf_path\n",[33,164506,164507],{"class":35,"line":771},[33,164508,92],{"emptyLinePlaceholder":91},[33,164510,164511],{"class":35,"line":777},[33,164512,92],{"emptyLinePlaceholder":91},[33,164514,164515],{"class":35,"line":788},[33,164516,136030],{"class":39},[33,164518,164519,164522,164524,164526,164529],{"class":35,"line":804},[33,164520,164521],{"class":167},"input_file 
",[33,164523,242],{"class":163},[33,164525,215],{"class":167},[33,164527,164528],{"class":54},"\"documents\u002Freport.docx\"",[33,164530,221],{"class":167},[33,164532,164533,164536,164538,164540,164543],{"class":35,"line":809},[33,164534,164535],{"class":167},"output_dir ",[33,164537,242],{"class":163},[33,164539,215],{"class":167},[33,164541,164542],{"class":54},"\"output_pdfs\"",[33,164544,221],{"class":167},[33,164546,164547],{"class":35,"line":819},[33,164548,92],{"emptyLinePlaceholder":91},[33,164550,164551,164553],{"class":35,"line":829},[33,164552,35574],{"class":163},[33,164554,574],{"class":167},[33,164556,164557,164559,164561],{"class":35,"line":834},[33,164558,46704],{"class":167},[33,164560,242],{"class":163},[33,164562,164563],{"class":167}," convert_with_libreoffice(input_file, output_dir)\n",[33,164565,164566,164568,164570,164572,164575,164577,164579,164581,164583],{"class":35,"line":839},[33,164567,7268],{"class":50},[33,164569,602],{"class":167},[33,164571,4059],{"class":163},[33,164573,164574],{"class":54},"\"Converted: 
",[33,164576,1115],{"class":50},[33,164578,9631],{"class":167},[33,164580,1121],{"class":50},[33,164582,274],{"class":54},[33,164584,221],{"class":167},[33,164586,164587,164589,164591,164593],{"class":35,"line":860},[33,164588,35726],{"class":163},[33,164590,2945],{"class":50},[33,164592,1852],{"class":163},[33,164594,1855],{"class":167},[33,164596,164597,164599,164601,164603,164605,164607,164609,164611,164613],{"class":35,"line":887},[33,164598,7268],{"class":50},[33,164600,602],{"class":167},[33,164602,4059],{"class":163},[33,164604,126056],{"class":54},[33,164606,1115],{"class":50},[33,164608,6565],{"class":167},[33,164610,1121],{"class":50},[33,164612,274],{"class":54},[33,164614,221],{"class":167},[33,164616,164617,164619,164621,164623],{"class":35,"line":907},[33,164618,35726],{"class":163},[33,164620,7590],{"class":50},[33,164622,1852],{"class":163},[33,164624,1855],{"class":167},[33,164626,164627,164629,164631,164633,164636,164638,164640,164642,164644],{"class":35,"line":1826},[33,164628,7268],{"class":50},[33,164630,602],{"class":167},[33,164632,4059],{"class":163},[33,164634,164635],{"class":54},"\"Conversion error: ",[33,164637,1115],{"class":50},[33,164639,6565],{"class":167},[33,164641,1121],{"class":50},[33,164643,274],{"class":54},[33,164645,221],{"class":167},[18,164647,164649],{"id":164648},"cross-platform-wrapper","Cross-Platform Wrapper",[14,164651,164652,164653,164655,164656,3035],{},"If your codebase runs on both Windows\u002FmacOS (where ",[30,164654,163908],{}," is correct) and Linux\u002FCI servers (where LibreOffice is correct), use this wrapper instead of calling either library directly. 
The full batch workflow using this pattern is in ",[940,164657,161278],{"href":161277},[23,164659,164661],{"className":126,"code":164660,"language":47,"meta":28,"style":28},"# pip install docx2pdf   (Windows\u002FmacOS only; harmless to install on Linux)\n# Linux requires: sudo apt install libreoffice\nimport platform\nimport subprocess\nimport tempfile\nfrom pathlib import Path\n\n_SYSTEM = platform.system()\n\n\ndef convert_docx_to_pdf(docx_path: Path, output_dir: Path) -> Path:\n    \"\"\"\n    Convert a single .docx to PDF.\n\n    - Windows \u002F macOS: uses docx2pdf (requires Microsoft Word)\n    - Linux \u002F other:   uses LibreOffice headless (requires soffice on PATH)\n    \"\"\"\n    docx_path = docx_path.resolve()\n    output_dir = output_dir.resolve()\n    output_dir.mkdir(parents=True, exist_ok=True)\n    pdf_path = output_dir \u002F (docx_path.stem + \".pdf\")\n\n    if _SYSTEM in (\"Windows\", \"Darwin\"):\n        # Windows\u002FmacOS path — drives Microsoft Word via COM \u002F AppleScript\n        from docx2pdf import convert  # noqa: PLC0415\n        convert(docx_path, pdf_path)\n    else:\n        # Linux \u002F server path — drives LibreOffice headless\n        _soffice_convert(docx_path, output_dir)\n\n    if not pdf_path.exists():\n        raise FileNotFoundError(f\"Conversion produced no output at {pdf_path}\")\n\n    return pdf_path\n\n\ndef _soffice_convert(docx_path: Path, output_dir: Path) -> None:\n    \"\"\"Internal: run soffice with an isolated user profile to allow parallelism.\"\"\"\n    with tempfile.TemporaryDirectory() as tmp:\n        profile = Path(tmp) \u002F \"lo_profile\"\n        profile.mkdir()\n        result = subprocess.run(\n            [\n                \"soffice\",\n                f\"-env:UserInstallation=file:\u002F\u002F{profile}\",  # isolate profile\n                \"--headless\",\n                \"--convert-to\", \"pdf\",\n                \"--outdir\", str(output_dir),\n                str(docx_path),\n      
      ],\n            capture_output=True,\n            text=True,\n            timeout=120,\n        )\n        if result.returncode != 0:\n            raise RuntimeError(result.stderr.strip() or \"soffice returned non-zero exit code\")\n\n\n# Example usage\nif __name__ == \"__main__\":\n    try:\n        out = convert_docx_to_pdf(\n            docx_path=Path(\"documents\u002Fcontract.docx\"),\n            output_dir=Path(\"output_pdfs\"),\n        )\n        print(f\"Success: {out}\")\n    except Exception as exc:\n        print(f\"Failed: {exc}\")\n",[30,164662,164663,164668,164673,164679,164685,164691,164701,164705,164715,164719,164723,164732,164736,164741,164745,164750,164755,164759,164768,164776,164796,164814,164818,164839,164844,164859,164864,164870,164875,164880,164884,164892,164915,164919,164925,164929,164933,164947,164952,164963,164978,164983,164991,164996,165003,165024,165031,165042,165054,165060,165064,165075,165086,165097,165101,165113,165129,165133,165137,165142,165154,165160,165169,165183,165196,165200,165221,165231],{"__ignoreMap":28},[33,164664,164665],{"class":35,"line":36},[33,164666,164667],{"class":39},"# pip install docx2pdf   (Windows\u002FmacOS only; harmless to install on Linux)\n",[33,164669,164670],{"class":35,"line":43},[33,164671,164672],{"class":39},"# Linux requires: sudo apt install 
libreoffice\n",[33,164674,164675,164677],{"class":35,"line":61},[33,164676,164],{"class":163},[33,164678,163978],{"class":167},[33,164680,164681,164683],{"class":35,"line":73},[33,164682,164],{"class":163},[33,164684,35040],{"class":167},[33,164686,164687,164689],{"class":35,"line":88},[33,164688,164],{"class":163},[33,164690,70055],{"class":167},[33,164692,164693,164695,164697,164699],{"class":35,"line":95},[33,164694,190],{"class":163},[33,164696,193],{"class":167},[33,164698,164],{"class":163},[33,164700,198],{"class":167},[33,164702,164703],{"class":35,"line":101},[33,164704,92],{"emptyLinePlaceholder":91},[33,164706,164707,164710,164712],{"class":35,"line":171},[33,164708,164709],{"class":50},"_SYSTEM",[33,164711,212],{"class":163},[33,164713,164714],{"class":167}," platform.system()\n",[33,164716,164717],{"class":35,"line":179},[33,164718,92],{"emptyLinePlaceholder":91},[33,164720,164721],{"class":35,"line":187},[33,164722,92],{"emptyLinePlaceholder":91},[33,164724,164725,164727,164730],{"class":35,"line":201},[33,164726,562],{"class":163},[33,164728,164729],{"class":46}," convert_docx_to_pdf",[33,164731,164243],{"class":167},[33,164733,164734],{"class":35,"line":206},[33,164735,7673],{"class":54},[33,164737,164738],{"class":35,"line":224},[33,164739,164740],{"class":54},"    Convert a single .docx to PDF.\n",[33,164742,164743],{"class":35,"line":229},[33,164744,92],{"emptyLinePlaceholder":91},[33,164746,164747],{"class":35,"line":235},[33,164748,164749],{"class":54},"    - Windows \u002F macOS: uses docx2pdf (requires Microsoft Word)\n",[33,164751,164752],{"class":35,"line":250},[33,164753,164754],{"class":54},"    - Linux \u002F other:   uses LibreOffice headless (requires soffice on PATH)\n",[33,164756,164757],{"class":35,"line":266},[33,164758,7673],{"class":54},[33,164760,164761,164763,164765],{"class":35,"line":290},[33,164762,21681],{"class":167},[33,164764,242],{"class":163},[33,164766,164767],{"class":167}," 
docx_path.resolve()\n",[33,164769,164770,164772,164774],{"class":35,"line":295},[33,164771,22180],{"class":167},[33,164773,242],{"class":163},[33,164775,164269],{"class":167},[33,164777,164778,164780,164782,164784,164786,164788,164790,164792,164794],{"class":35,"line":300},[33,164779,6346],{"class":167},[33,164781,869],{"class":238},[33,164783,242],{"class":163},[33,164785,855],{"class":50},[33,164787,365],{"class":167},[33,164789,878],{"class":238},[33,164791,242],{"class":163},[33,164793,855],{"class":50},[33,164795,221],{"class":167},[33,164797,164798,164800,164802,164804,164806,164808,164810,164812],{"class":35,"line":317},[33,164799,21570],{"class":167},[33,164801,242],{"class":163},[33,164803,6393],{"class":167},[33,164805,1351],{"class":163},[33,164807,164455],{"class":167},[33,164809,1811],{"class":163},[33,164811,164460],{"class":54},[33,164813,221],{"class":167},[33,164815,164816],{"class":35,"line":332},[33,164817,92],{"emptyLinePlaceholder":91},[33,164819,164820,164822,164825,164827,164829,164832,164834,164837],{"class":35,"line":347},[33,164821,617],{"class":163},[33,164823,164824],{"class":50}," _SYSTEM",[33,164826,8002],{"class":163},[33,164828,17583],{"class":167},[33,164830,164831],{"class":54},"\"Windows\"",[33,164833,365],{"class":167},[33,164835,164836],{"class":54},"\"Darwin\"",[33,164838,1737],{"class":167},[33,164840,164841],{"class":35,"line":374},[33,164842,164843],{"class":39},"        # Windows\u002FmacOS path — drives Microsoft Word via COM \u002F AppleScript\n",[33,164845,164846,164849,164851,164853,164856],{"class":35,"line":397},[33,164847,164848],{"class":163},"        from",[33,164850,164024],{"class":167},[33,164852,164],{"class":163},[33,164854,164855],{"class":167}," convert  ",[33,164857,164858],{"class":39},"# noqa: PLC0415\n",[33,164860,164861],{"class":35,"line":653},[33,164862,164863],{"class":167},"        convert(docx_path, 
pdf_path)\n",[33,164865,164866,164868],{"class":35,"line":667},[33,164867,6864],{"class":163},[33,164869,574],{"class":167},[33,164871,164872],{"class":35,"line":675},[33,164873,164874],{"class":39},"        # Linux \u002F server path — drives LibreOffice headless\n",[33,164876,164877],{"class":35,"line":689},[33,164878,164879],{"class":167},"        _soffice_convert(docx_path, output_dir)\n",[33,164881,164882],{"class":35,"line":703},[33,164883,92],{"emptyLinePlaceholder":91},[33,164885,164886,164888,164890],{"class":35,"line":714},[33,164887,617],{"class":163},[33,164889,620],{"class":163},[33,164891,21595],{"class":167},[33,164893,164894,164896,164898,164900,164902,164905,164907,164909,164911,164913],{"class":35,"line":723},[33,164895,4051],{"class":163},[33,164897,2945],{"class":50},[33,164899,602],{"class":167},[33,164901,4059],{"class":163},[33,164903,164904],{"class":54},"\"Conversion produced no output at ",[33,164906,1115],{"class":50},[33,164908,27069],{"class":167},[33,164910,1121],{"class":50},[33,164912,274],{"class":54},[33,164914,221],{"class":167},[33,164916,164917],{"class":35,"line":754},[33,164918,92],{"emptyLinePlaceholder":91},[33,164920,164921,164923],{"class":35,"line":771},[33,164922,1332],{"class":163},[33,164924,164504],{"class":167},[33,164926,164927],{"class":35,"line":777},[33,164928,92],{"emptyLinePlaceholder":91},[33,164930,164931],{"class":35,"line":788},[33,164932,92],{"emptyLinePlaceholder":91},[33,164934,164935,164937,164940,164943,164945],{"class":35,"line":804},[33,164936,562],{"class":163},[33,164938,164939],{"class":46}," _soffice_convert",[33,164941,164942],{"class":167},"(docx_path: Path, output_dir: Path) -> ",[33,164944,571],{"class":50},[33,164946,574],{"class":167},[33,164948,164949],{"class":35,"line":809},[33,164950,164951],{"class":54},"    \"\"\"Internal: run soffice with an isolated user profile to allow 
parallelism.\"\"\"\n",[33,164953,164954,164956,164959,164961],{"class":35,"line":819},[33,164955,1635],{"class":163},[33,164957,164958],{"class":167}," tempfile.TemporaryDirectory() ",[33,164960,495],{"class":163},[33,164962,159915],{"class":167},[33,164964,164965,164968,164970,164973,164975],{"class":35,"line":829},[33,164966,164967],{"class":167},"        profile ",[33,164969,242],{"class":163},[33,164971,164972],{"class":167}," Path(tmp) ",[33,164974,1351],{"class":163},[33,164976,164977],{"class":54}," \"lo_profile\"\n",[33,164979,164980],{"class":35,"line":834},[33,164981,164982],{"class":167},"        profile.mkdir()\n",[33,164984,164985,164987,164989],{"class":35,"line":839},[33,164986,87961],{"class":167},[33,164988,242],{"class":163},[33,164990,35060],{"class":167},[33,164992,164993],{"class":35,"line":860},[33,164994,164995],{"class":167},"            [\n",[33,164997,164998,165001],{"class":35,"line":887},[33,164999,165000],{"class":54},"                \"soffice\"",[33,165002,247],{"class":167},[33,165004,165005,165007,165010,165012,165015,165017,165019,165021],{"class":35,"line":907},[33,165006,23946],{"class":163},[33,165008,165009],{"class":54},"\"-env:UserInstallation=file:\u002F\u002F",[33,165011,1115],{"class":50},[33,165013,165014],{"class":167},"profile",[33,165016,1121],{"class":50},[33,165018,274],{"class":54},[33,165020,25480],{"class":167},[33,165022,165023],{"class":39},"# isolate profile\n",[33,165025,165026,165029],{"class":35,"line":1826},[33,165027,165028],{"class":54},"                \"--headless\"",[33,165030,247],{"class":167},[33,165032,165033,165036,165038,165040],{"class":35,"line":1844},[33,165034,165035],{"class":54},"                \"--convert-to\"",[33,165037,365],{"class":167},[33,165039,15519],{"class":54},[33,165041,247],{"class":167},[33,165043,165044,165047,165049,165051],{"class":35,"line":1858},[33,165045,165046],{"class":54},"                
\"--outdir\"",[33,165048,365],{"class":167},[33,165050,1053],{"class":50},[33,165052,165053],{"class":167},"(output_dir),\n",[33,165055,165056,165058],{"class":35,"line":1871},[33,165057,7879],{"class":50},[33,165059,164353],{"class":167},[33,165061,165062],{"class":35,"line":1877},[33,165063,11436],{"class":167},[33,165065,165066,165069,165071,165073],{"class":35,"line":1883},[33,165067,165068],{"class":238},"            capture_output",[33,165070,242],{"class":163},[33,165072,855],{"class":50},[33,165074,247],{"class":167},[33,165076,165077,165080,165082,165084],{"class":35,"line":1915},[33,165078,165079],{"class":238},"            text",[33,165081,242],{"class":163},[33,165083,855],{"class":50},[33,165085,247],{"class":167},[33,165087,165088,165091,165093,165095],{"class":35,"line":1926},[33,165089,165090],{"class":238},"            timeout",[33,165092,242],{"class":163},[33,165094,2589],{"class":50},[33,165096,247],{"class":167},[33,165098,165099],{"class":35,"line":1932},[33,165100,5867],{"class":167},[33,165102,165103,165105,165107,165109,165111],{"class":35,"line":1938},[33,165104,8221],{"class":163},[33,165106,35108],{"class":167},[33,165108,17877],{"class":163},[33,165110,10791],{"class":50},[33,165112,574],{"class":167},[33,165114,165115,165117,165119,165122,165124,165127],{"class":35,"line":1950},[33,165116,59715],{"class":163},[33,165118,7590],{"class":50},[33,165120,165121],{"class":167},"(result.stderr.strip() ",[33,165123,7162],{"class":163},[33,165125,165126],{"class":54}," \"soffice returned non-zero exit code\"",[33,165128,221],{"class":167},[33,165130,165131],{"class":35,"line":1958},[33,165132,92],{"emptyLinePlaceholder":91},[33,165134,165135],{"class":35,"line":4904},[33,165136,92],{"emptyLinePlaceholder":91},[33,165138,165139],{"class":35,"line":4909},[33,165140,165141],{"class":39},"# Example 
usage\n",[33,165143,165144,165146,165148,165150,165152],{"class":35,"line":4915},[33,165145,2491],{"class":163},[33,165147,2494],{"class":50},[33,165149,2497],{"class":163},[33,165151,2500],{"class":54},[33,165153,574],{"class":167},[33,165155,165156,165158],{"class":35,"line":4925},[33,165157,2424],{"class":163},[33,165159,574],{"class":167},[33,165161,165162,165164,165166],{"class":35,"line":4935},[33,165163,50344],{"class":167},[33,165165,242],{"class":163},[33,165167,165168],{"class":167}," convert_docx_to_pdf(\n",[33,165170,165171,165174,165176,165178,165181],{"class":35,"line":4941},[33,165172,165173],{"class":238},"            docx_path",[33,165175,242],{"class":163},[33,165177,15641],{"class":167},[33,165179,165180],{"class":54},"\"documents\u002Fcontract.docx\"",[33,165182,1506],{"class":167},[33,165184,165185,165188,165190,165192,165194],{"class":35,"line":4950},[33,165186,165187],{"class":238},"            output_dir",[33,165189,242],{"class":163},[33,165191,15641],{"class":167},[33,165193,164542],{"class":54},[33,165195,1506],{"class":167},[33,165197,165198],{"class":35,"line":4960},[33,165199,5867],{"class":167},[33,165201,165202,165204,165206,165208,165211,165213,165215,165217,165219],{"class":35,"line":4965},[33,165203,9414],{"class":50},[33,165205,602],{"class":167},[33,165207,4059],{"class":163},[33,165209,165210],{"class":54},"\"Success: ",[33,165212,1115],{"class":50},[33,165214,18014],{"class":167},[33,165216,1121],{"class":50},[33,165218,274],{"class":54},[33,165220,221],{"class":167},[33,165222,165223,165225,165227,165229],{"class":35,"line":4971},[33,165224,2449],{"class":163},[33,165226,783],{"class":50},[33,165228,1852],{"class":163},[33,165230,1855],{"class":167},[33,165232,165233,165235,165237,165239,165242,165244,165246,165248,165250],{"class":35,"line":4983},[33,165234,9414],{"class":50},[33,165236,602],{"class":167},[33,165238,4059],{"class":163},[33,165240,165241],{"class":54},"\"Failed: 
",[33,165243,1115],{"class":50},[33,165245,6565],{"class":167},[33,165247,1121],{"class":50},[33,165249,274],{"class":54},[33,165251,221],{"class":167},[18,165253,165255],{"id":165254},"variant-fix-a-soffice-not-found-on-path","Variant Fix A: soffice Not Found on PATH",[14,165257,165258,165259,165261,165262,3035],{},"If LibreOffice is installed but ",[30,165260,164121],{}," is not on your PATH, the subprocess call raises ",[30,165263,165264],{},"FileNotFoundError: [Errno 2] No such file or directory: 'soffice'",[23,165266,165268],{"className":126,"code":165267,"language":47,"meta":28,"style":28},"# pip install nothing — stdlib only\nimport shutil\nfrom pathlib import Path\n\ndef find_soffice() -> str:\n    \"\"\"Return the absolute path to soffice, or raise if not found.\"\"\"\n    candidate = shutil.which(\"soffice\") or shutil.which(\"libreoffice\")\n    if candidate:\n        return candidate\n\n    # Common non-PATH install locations on Linux\n    for fallback in [\n        \"\u002Fusr\u002Fbin\u002Fsoffice\",\n        \"\u002Fusr\u002Flib\u002Flibreoffice\u002Fprogram\u002Fsoffice\",\n        \"\u002Fopt\u002Flibreoffice\u002Fprogram\u002Fsoffice\",\n    ]:\n        if Path(fallback).exists():\n            return fallback\n\n    raise FileNotFoundError(\n        \"soffice not found. 
Install LibreOffice:\\n\"\n        \"  Ubuntu\u002FDebian: sudo apt install libreoffice\\n\"\n        \"  RHEL\u002FCentOS:   sudo yum install libreoffice\"\n    )\n\nsoffice_bin = find_soffice()\nprint(f\"Using: {soffice_bin}\")\n",[30,165269,165270,165275,165281,165291,165295,165308,165313,165336,165343,165350,165354,165359,165370,165377,165384,165391,165395,165402,165409,165413,165421,165430,165439,165444,165448,165452,165462],{"__ignoreMap":28},[33,165271,165272],{"class":35,"line":36},[33,165273,165274],{"class":39},"# pip install nothing — stdlib only\n",[33,165276,165277,165279],{"class":35,"line":43},[33,165278,164],{"class":163},[33,165280,41706],{"class":167},[33,165282,165283,165285,165287,165289],{"class":35,"line":61},[33,165284,190],{"class":163},[33,165286,193],{"class":167},[33,165288,164],{"class":163},[33,165290,198],{"class":167},[33,165292,165293],{"class":35,"line":73},[33,165294,92],{"emptyLinePlaceholder":91},[33,165296,165297,165299,165302,165304,165306],{"class":35,"line":88},[33,165298,562],{"class":163},[33,165300,165301],{"class":46}," find_soffice",[33,165303,568],{"class":167},[33,165305,1053],{"class":50},[33,165307,574],{"class":167},[33,165309,165310],{"class":35,"line":95},[33,165311,165312],{"class":54},"    \"\"\"Return the absolute path to soffice, or raise if not found.\"\"\"\n",[33,165314,165315,165318,165320,165322,165325,165327,165329,165331,165334],{"class":35,"line":101},[33,165316,165317],{"class":167},"    candidate ",[33,165319,242],{"class":163},[33,165321,41716],{"class":167},[33,165323,165324],{"class":54},"\"soffice\"",[33,165326,1649],{"class":167},[33,165328,7162],{"class":163},[33,165330,41716],{"class":167},[33,165332,165333],{"class":54},"\"libreoffice\"",[33,165335,221],{"class":167},[33,165337,165338,165340],{"class":35,"line":171},[33,165339,617],{"class":163},[33,165341,165342],{"class":167}," 
candidate:\n",[33,165344,165345,165347],{"class":35,"line":179},[33,165346,1659],{"class":163},[33,165348,165349],{"class":167}," candidate\n",[33,165351,165352],{"class":35,"line":187},[33,165353,92],{"emptyLinePlaceholder":91},[33,165355,165356],{"class":35,"line":201},[33,165357,165358],{"class":39},"    # Common non-PATH install locations on Linux\n",[33,165360,165361,165363,165366,165368],{"class":35,"line":206},[33,165362,656],{"class":163},[33,165364,165365],{"class":167}," fallback ",[33,165367,662],{"class":163},[33,165369,7473],{"class":167},[33,165371,165372,165375],{"class":35,"line":224},[33,165373,165374],{"class":54},"        \"\u002Fusr\u002Fbin\u002Fsoffice\"",[33,165376,247],{"class":167},[33,165378,165379,165382],{"class":35,"line":229},[33,165380,165381],{"class":54},"        \"\u002Fusr\u002Flib\u002Flibreoffice\u002Fprogram\u002Fsoffice\"",[33,165383,247],{"class":167},[33,165385,165386,165389],{"class":35,"line":235},[33,165387,165388],{"class":54},"        \"\u002Fopt\u002Flibreoffice\u002Fprogram\u002Fsoffice\"",[33,165390,247],{"class":167},[33,165392,165393],{"class":35,"line":250},[33,165394,154791],{"class":167},[33,165396,165397,165399],{"class":35,"line":266},[33,165398,8221],{"class":163},[33,165400,165401],{"class":167}," Path(fallback).exists():\n",[33,165403,165404,165406],{"class":35,"line":290},[33,165405,28782],{"class":163},[33,165407,165408],{"class":167}," fallback\n",[33,165410,165411],{"class":35,"line":295},[33,165412,92],{"emptyLinePlaceholder":91},[33,165414,165415,165417,165419],{"class":35,"line":300},[33,165416,35742],{"class":163},[33,165418,2945],{"class":50},[33,165420,7637],{"class":167},[33,165422,165423,165426,165428],{"class":35,"line":317},[33,165424,165425],{"class":54},"        \"soffice not found. 
Install LibreOffice:",[33,165427,25830],{"class":50},[33,165429,7504],{"class":54},[33,165431,165432,165435,165437],{"class":35,"line":332},[33,165433,165434],{"class":54},"        \"  Ubuntu\u002FDebian: sudo apt install libreoffice",[33,165436,25830],{"class":50},[33,165438,7504],{"class":54},[33,165440,165441],{"class":35,"line":347},[33,165442,165443],{"class":54},"        \"  RHEL\u002FCentOS:   sudo yum install libreoffice\"\n",[33,165445,165446],{"class":35,"line":374},[33,165447,1202],{"class":167},[33,165449,165450],{"class":35,"line":397},[33,165451,92],{"emptyLinePlaceholder":91},[33,165453,165454,165457,165459],{"class":35,"line":653},[33,165455,165456],{"class":167},"soffice_bin ",[33,165458,242],{"class":163},[33,165460,165461],{"class":167}," find_soffice()\n",[33,165463,165464,165466,165468,165470,165473,165475,165478,165480,165482],{"class":35,"line":667},[33,165465,13474],{"class":50},[33,165467,602],{"class":167},[33,165469,4059],{"class":163},[33,165471,165472],{"class":54},"\"Using: ",[33,165474,1115],{"class":50},[33,165476,165477],{"class":167},"soffice_bin",[33,165479,1121],{"class":50},[33,165481,274],{"class":54},[33,165483,221],{"class":167},[14,165485,165486,165487,42238,165489,58767,165491,165493],{},"Then substitute ",[30,165488,165477],{},[30,165490,165324],{},[30,165492,149485],{}," call above.",[18,165495,165497],{"id":165496},"variant-fix-b-libreoffice-profile-lock-tmp-permission-issues","Variant Fix B: LibreOffice Profile Lock \u002F tmp Permission Issues",[14,165499,165500],{},"On headless servers, multiple processes sharing a single LibreOffice user profile directory cause:",[23,165502,165505],{"className":165503,"code":165504,"language":2000},[1998],"[Java framework] Error in function createSettingsDocument (elements.cxx)\nuser installation could not be completed\n",[30,165506,165504],{"__ignoreMap":28},[14,165508,165509,165510,165513,165514,165516],{},"Or a conversion appears to succeed (exit code 0) but produces no PDF. 
The root cause is that LibreOffice tries to write its configuration to ",[30,165511,165512],{},"~\u002F.config\u002Flibreoffice\u002F"," and either the directory is locked by another ",[30,165515,164121],{}," process or the running user has no home directory (common in Docker containers).",[14,165518,165519,165520,165523,165524,165527,165528,165530],{},"The fix is already shown in the cross-platform wrapper above: pass ",[30,165521,165522],{},"-env:UserInstallation=file:\u002F\u002F…"," pointing to a fresh ",[30,165525,165526],{},"tempfile.TemporaryDirectory()"," for each conversion. This makes every ",[30,165529,164121],{}," invocation fully independent.",[14,165532,165533,165534,165536],{},"For Docker environments where ",[30,165535,7938],{}," does not exist, also set:",[23,165538,165540],{"className":36048,"code":165539,"language":36050,"meta":28,"style":28},"ENV HOME=\u002Ftmp\nRUN mkdir -p \u002Ftmp\u002F.config\u002Flibreoffice\n",[30,165541,165542,165547],{"__ignoreMap":28},[33,165543,165544],{"class":35,"line":36},[33,165545,165546],{},"ENV HOME=\u002Ftmp\n",[33,165548,165549],{"class":35,"line":43},[33,165550,165551],{},"RUN mkdir -p \u002Ftmp\u002F.config\u002Flibreoffice\n",[14,165553,165554,165555,165558,165559,3035],{},"Or use the ",[30,165556,165557],{},"-env:UserInstallation"," flag with an explicit writable path rather than relying on ",[30,165560,165561],{},"$HOME",[18,165563,165565],{"id":165564},"variant-fix-c-github-actions-and-cicd-pipelines","Variant Fix C: GitHub Actions and CI\u002FCD Pipelines",[14,165567,165568,165570],{},[30,165569,163908],{}," is especially problematic in CI environments because the commonly used GitHub-hosted ubuntu-latest runners run Linux, and GitHub-hosted Windows runners do not have Microsoft Word installed. 
The correct approach is to install LibreOffice on the CI runner and use the cross-platform wrapper shown above.",[14,165572,165573,165574,20891],{},"Add to your workflow ",[30,165575,165576],{},".yml",[23,165578,165580],{"className":2062,"code":165579,"language":2064,"meta":28,"style":28},"# .github\u002Fworkflows\u002Fconvert.yml\nname: Convert DOCX to PDF\n\non:\n  push:\n    paths:\n      - 'documents\u002F**\u002F*.docx'\n\njobs:\n  convert:\n    runs-on: ubuntu-latest  # Linux runner; docx2pdf will NOT work here\n\n    steps:\n      - uses: actions\u002Fcheckout@v4\n\n      - name: Install LibreOffice\n        run: |\n          sudo apt-get update\n          sudo apt-get install -y libreoffice ttf-mscorefonts-installer\n          sudo fc-cache -f -v\n\n      - name: Set up Python\n        uses: actions\u002Fsetup-python@v5\n        with:\n          python-version: '3.12'\n\n      - name: Install Python deps\n        run: pip install pypdf\n\n      - name: Convert documents\n        run: python scripts\u002Fbatch_convert.py documents\u002F output_pdfs\u002F\n\n      - name: Upload PDFs\n        uses: actions\u002Fupload-artifact@v4\n        with:\n          name: converted-pdfs\n          path: output_pdfs\u002F\n",[30,165581,165582,165587,165596,165600,165606,165613,165620,165627,165631,165637,165643,165654,165658,165664,165674,165678,165689,165697,165702,165707,165712,165716,165727,165735,165741,165750,165754,165764,165773,165777,165788,165797,165801,165812,165820,165826,165835],{"__ignoreMap":28},[33,165583,165584],{"class":35,"line":36},[33,165585,165586],{"class":39},"# .github\u002Fworkflows\u002Fconvert.yml\n",[33,165588,165589,165591,165593],{"class":35,"line":43},[33,165590,1118],{"class":2076},[33,165592,2079],{"class":167},[33,165594,165595],{"class":54},"Convert DOCX to 
PDF\n",[33,165597,165598],{"class":35,"line":61},[33,165599,92],{"emptyLinePlaceholder":91},[33,165601,165602,165604],{"class":35,"line":73},[33,165603,2091],{"class":50},[33,165605,574],{"class":167},[33,165607,165608,165611],{"class":35,"line":88},[33,165609,165610],{"class":2076},"  push",[33,165612,574],{"class":167},[33,165614,165615,165618],{"class":35,"line":95},[33,165616,165617],{"class":2076},"    paths",[33,165619,574],{"class":167},[33,165621,165622,165624],{"class":35,"line":101},[33,165623,2167],{"class":167},[33,165625,165626],{"class":54},"'documents\u002F**\u002F*.docx'\n",[33,165628,165629],{"class":35,"line":171},[33,165630,92],{"emptyLinePlaceholder":91},[33,165632,165633,165635],{"class":35,"line":179},[33,165634,2136],{"class":2076},[33,165636,574],{"class":167},[33,165638,165639,165641],{"class":35,"line":187},[33,165640,71216],{"class":2076},[33,165642,574],{"class":167},[33,165644,165645,165647,165649,165651],{"class":35,"line":201},[33,165646,2150],{"class":2076},[33,165648,2079],{"class":167},[33,165650,36696],{"class":54},[33,165652,165653],{"class":39},"  # Linux runner; docx2pdf will NOT work here\n",[33,165655,165656],{"class":35,"line":206},[33,165657,92],{"emptyLinePlaceholder":91},[33,165659,165660,165662],{"class":35,"line":224},[33,165661,2160],{"class":2076},[33,165663,574],{"class":167},[33,165665,165666,165668,165670,165672],{"class":35,"line":229},[33,165667,2167],{"class":167},[33,165669,2170],{"class":2076},[33,165671,2079],{"class":167},[33,165673,2175],{"class":54},[33,165675,165676],{"class":35,"line":235},[33,165677,92],{"emptyLinePlaceholder":91},[33,165679,165680,165682,165684,165686],{"class":35,"line":250},[33,165681,2167],{"class":167},[33,165683,1118],{"class":2076},[33,165685,2079],{"class":167},[33,165687,165688],{"class":54},"Install 
LibreOffice\n",[33,165690,165691,165693,165695],{"class":35,"line":266},[33,165692,2219],{"class":2076},[33,165694,2079],{"class":167},[33,165696,80950],{"class":163},[33,165698,165699],{"class":35,"line":290},[33,165700,165701],{"class":54},"          sudo apt-get update\n",[33,165703,165704],{"class":35,"line":295},[33,165705,165706],{"class":54},"          sudo apt-get install -y libreoffice ttf-mscorefonts-installer\n",[33,165708,165709],{"class":35,"line":300},[33,165710,165711],{"class":54},"          sudo fc-cache -f -v\n",[33,165713,165714],{"class":35,"line":317},[33,165715,92],{"emptyLinePlaceholder":91},[33,165717,165718,165720,165722,165724],{"class":35,"line":332},[33,165719,2167],{"class":167},[33,165721,1118],{"class":2076},[33,165723,2079],{"class":167},[33,165725,165726],{"class":54},"Set up Python\n",[33,165728,165729,165731,165733],{"class":35,"line":347},[33,165730,2287],{"class":2076},[33,165732,2079],{"class":167},[33,165734,2186],{"class":54},[33,165736,165737,165739],{"class":35,"line":374},[33,165738,2191],{"class":2076},[33,165740,574],{"class":167},[33,165742,165743,165745,165747],{"class":35,"line":397},[33,165744,2198],{"class":2076},[33,165746,2079],{"class":167},[33,165748,165749],{"class":54},"'3.12'\n",[33,165751,165752],{"class":35,"line":653},[33,165753,92],{"emptyLinePlaceholder":91},[33,165755,165756,165758,165760,165762],{"class":35,"line":667},[33,165757,2167],{"class":167},[33,165759,1118],{"class":2076},[33,165761,2079],{"class":167},[33,165763,80980],{"class":54},[33,165765,165766,165768,165770],{"class":35,"line":675},[33,165767,2219],{"class":2076},[33,165769,2079],{"class":167},[33,165771,165772],{"class":54},"pip install pypdf\n",[33,165774,165775],{"class":35,"line":689},[33,165776,92],{"emptyLinePlaceholder":91},[33,165778,165779,165781,165783,165785],{"class":35,"line":703},[33,165780,2167],{"class":167},[33,165782,1118],{"class":2076},[33,165784,2079],{"class":167},[33,165786,165787],{"class":54},"Convert 
documents\n",[33,165789,165790,165792,165794],{"class":35,"line":714},[33,165791,2219],{"class":2076},[33,165793,2079],{"class":167},[33,165795,165796],{"class":54},"python scripts\u002Fbatch_convert.py documents\u002F output_pdfs\u002F\n",[33,165798,165799],{"class":35,"line":723},[33,165800,92],{"emptyLinePlaceholder":91},[33,165802,165803,165805,165807,165809],{"class":35,"line":754},[33,165804,2167],{"class":167},[33,165806,1118],{"class":2076},[33,165808,2079],{"class":167},[33,165810,165811],{"class":54},"Upload PDFs\n",[33,165813,165814,165816,165818],{"class":35,"line":771},[33,165815,2287],{"class":2076},[33,165817,2079],{"class":167},[33,165819,2292],{"class":54},[33,165821,165822,165824],{"class":35,"line":777},[33,165823,2191],{"class":2076},[33,165825,574],{"class":167},[33,165827,165828,165830,165832],{"class":35,"line":788},[33,165829,2303],{"class":2076},[33,165831,2079],{"class":167},[33,165833,165834],{"class":54},"converted-pdfs\n",[33,165836,165837,165839,165841],{"class":35,"line":804},[33,165838,2313],{"class":2076},[33,165840,2079],{"class":167},[33,165842,165843],{"class":54},"output_pdfs\u002F\n",[14,165845,150300],{},[4211,165847,165848,165863,165869],{},[4214,165849,165850,165851,165853,165854,165856,165857,165859,165860,3035],{},"Do not install ",[30,165852,163908],{}," in the CI ",[30,165855,36846],{}," step — it imports cleanly on Linux but raises ",[30,165858,86120],{}," at runtime. If other parts of your codebase import it, guard the import with ",[30,165861,165862],{},"if platform.system() != \"Linux\"",[4214,165864,165865,165868],{},[30,165866,165867],{},"ttf-mscorefonts-installer"," installs Arial, Times New Roman, Courier New, and other common fonts. 
Without it, LibreOffice substitutes its own fonts and text reflows.",[4214,165870,165871,165872,165874],{},"For self-hosted runners on Windows, switch to ",[30,165873,163908],{}," and ensure Word is installed on the runner machine.",[18,165876,165878],{"id":165877},"variant-fix-d-checking-libreoffice-font-rendering","Variant Fix D: Checking LibreOffice Font Rendering",[14,165880,165881,165882,165884],{},"Even after installing ",[30,165883,165867],{},", some documents use fonts not in that package (Calibri, Cambria, Wingdings, custom brand fonts). LibreOffice silently substitutes them, which shifts text and can cause overflows or empty pages.",[14,165886,165887],{},"Diagnose font substitution by checking LibreOffice's font list:",[23,165889,165891],{"className":25,"code":165890,"language":27,"meta":28,"style":28},"# List fonts LibreOffice can see\nsoffice --headless --infilter=\"impress8\" --convert-to txt \u002Fdev\u002Fnull 2>&1 | grep -i \"font\"\n# Or more directly:\nfc-list | grep -i \"arial\"\n",[30,165892,165893,165898,165931,165936],{"__ignoreMap":28},[33,165894,165895],{"class":35,"line":36},[33,165896,165897],{"class":39},"# List fonts LibreOffice can see\n",[33,165899,165900,165902,165905,165908,165911,165914,165917,165920,165922,165924,165926,165928],{"class":35,"line":43},[33,165901,164121],{"class":46},[33,165903,165904],{"class":50}," --headless",[33,165906,165907],{"class":50}," --infilter=",[33,165909,165910],{"class":54},"\"impress8\"",[33,165912,165913],{"class":50}," --convert-to",[33,165915,165916],{"class":54}," txt",[33,165918,165919],{"class":54}," \u002Fdev\u002Fnull",[33,165921,81324],{"class":163},[33,165923,2850],{"class":163},[33,165925,41954],{"class":46},[33,165927,41957],{"class":50},[33,165929,165930],{"class":54}," \"font\"\n",[33,165932,165933],{"class":35,"line":61},[33,165934,165935],{"class":39},"# Or more 
directly:\n",[33,165937,165938,165941,165943,165945,165947],{"class":35,"line":73},[33,165939,165940],{"class":46},"fc-list",[33,165942,2850],{"class":163},[33,165944,41954],{"class":46},[33,165946,41957],{"class":50},[33,165948,165949],{"class":54}," \"arial\"\n",[14,165951,165952],{},"To add a custom font on a server:",[23,165954,165956],{"className":25,"code":165955,"language":27,"meta":28,"style":28},"# Copy the .ttf or .otf file\nsudo mkdir -p \u002Fusr\u002Fshare\u002Ffonts\u002Fcustom\nsudo cp BrandFont-Regular.ttf \u002Fusr\u002Fshare\u002Ffonts\u002Fcustom\u002F\nsudo fc-cache -f -v\n# Verify it is now visible\nfc-list | grep BrandFont\n",[30,165957,165958,165963,165975,165988,166000,166005],{"__ignoreMap":28},[33,165959,165960],{"class":35,"line":36},[33,165961,165962],{"class":39},"# Copy the .ttf or .otf file\n",[33,165964,165965,165967,165970,165972],{"class":35,"line":43},[33,165966,9669],{"class":46},[33,165968,165969],{"class":54}," mkdir",[33,165971,59504],{"class":50},[33,165973,165974],{"class":54}," \u002Fusr\u002Fshare\u002Ffonts\u002Fcustom\n",[33,165976,165977,165979,165982,165985],{"class":35,"line":61},[33,165978,9669],{"class":46},[33,165980,165981],{"class":54}," cp",[33,165983,165984],{"class":54}," BrandFont-Regular.ttf",[33,165986,165987],{"class":54}," \u002Fusr\u002Fshare\u002Ffonts\u002Fcustom\u002F\n",[33,165989,165990,165992,165995,165997],{"class":35,"line":73},[33,165991,9669],{"class":46},[33,165993,165994],{"class":54}," fc-cache",[33,165996,35263],{"class":50},[33,165998,165999],{"class":50}," -v\n",[33,166001,166002],{"class":35,"line":88},[33,166003,166004],{"class":39},"# Verify it is now visible\n",[33,166006,166007,166009,166011,166013],{"class":35,"line":95},[33,166008,165940],{"class":46},[33,166010,2850],{"class":163},[33,166012,41954],{"class":46},[33,166014,166015],{"class":54}," BrandFont\n",[14,166017,166018,166019,166021],{},"Then re-run the conversion. 
For production pipelines that must guarantee exact font rendering, consider running conversions on a macOS or Windows CI runner where ",[30,166020,163908],{}," can use Word's own rendering engine.",[18,166023,166025],{"id":166024},"troubleshooting-quick-reference","Troubleshooting Quick Reference",[4273,166027,166028,166038],{},[4276,166029,166030],{},[4279,166031,166032,166034,166036],{},[4282,166033,4284],{"align":128901},[4282,166035,101762],{"align":128901},[4282,166037,4290],{"align":128901},[4292,166039,166040,166053,166066,166085,166100,166114,166132],{},[4279,166041,166042,166047,166050],{},[4297,166043,166044],{"align":128901},[30,166045,166046],{},"NotImplementedError: docx2pdf is not implemented for linux",[4297,166048,166049],{"align":128901},"Hard platform check in docx2pdf",[4297,166051,166052],{"align":128901},"Replace with LibreOffice headless (Step 2 above)",[4279,166054,166055,166060,166063],{},[4297,166056,166057],{"align":128901},[30,166058,166059],{},"ModuleNotFoundError: No module named 'win32com'",[4297,166061,166062],{"align":128901},"win32com is a Windows-only package",[4297,166064,166065],{"align":128901},"Same — switch to LibreOffice on Linux",[4279,166067,166068,166073,166076],{},[4297,166069,166070],{"align":128901},[30,166071,166072],{},"FileNotFoundError: soffice",[4297,166074,166075],{"align":128901},"LibreOffice not installed or not on PATH",[4297,166077,166078,14391,166081,166084],{"align":128901},[30,166079,166080],{},"sudo apt install libreoffice",[30,166082,166083],{},"find_soffice()"," (Variant A)",[4279,166086,166087,166092,166095],{},[4297,166088,166089],{"align":128901},[30,166090,166091],{},"user installation could not be completed",[4297,166093,166094],{"align":128901},"LibreOffice profile dir locked or missing",[4297,166096,17059,166097,166099],{"align":128901},[30,166098,165557],{}," with a temp dir (Variant B)",[4279,166101,166102,166105,166108],{},[4297,166103,166104],{"align":128901},"PDF produced but has wrong fonts 
\u002F text overflow",[4297,166106,166107],{"align":128901},"Required fonts not installed on server",[4297,166109,166110,166111,166113],{"align":128901},"Install ",[30,166112,165867],{}," and custom fonts (Variant D)",[4279,166115,166116,166119,166122],{},[4297,166117,166118],{"align":128901},"Exit code 0 but no PDF file in outdir",[4297,166120,166121],{"align":128901},"soffice silently failed (corrupt docx, missing font)",[4297,166123,67848,166124,166127,166128,8877,166130,146166],{"align":128901},[30,166125,166126],{},"stderr","; validate source ",[30,166129,18051],{},[30,166131,18041],{},[4279,166133,166134,166137,166140],{},[4297,166135,166136],{"align":128901},"GitHub Actions CI fails on ubuntu runner",[4297,166138,166139],{"align":128901},"CI uses Linux, docx2pdf hard-fails",[4297,166141,166142],{"align":128901},"Install LibreOffice on runner; use cross-platform wrapper (Variant C)",[18,166144,9247],{"id":9246},[14,166146,166147],{},"After applying the fix, run the diagnostic again and check that a real PDF is produced:",[23,166149,166151],{"className":126,"code":166150,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader\nfrom your_module import convert_docx_to_pdf  # the wrapper above\n\ndocx = Path(\"documents\u002Ftest.docx\")\nout_dir = Path(\"output_pdfs\")\n\ntry:\n    pdf = convert_docx_to_pdf(docx, out_dir)\n    reader = PdfReader(pdf)\n    print(f\"PDF pages: {len(reader.pages)}\")   # should be > 0\n    assert len(reader.pages) > 0, \"Empty PDF — conversion failed silently\"\n    print(\"Conversion verified.\")\nexcept AssertionError as exc:\n    print(f\"Verification failed: {exc}\")\nexcept Exception as exc:\n    print(f\"Error: 
{exc}\")\n",[30,166152,166153,166157,166167,166177,166192,166196,166210,166223,166227,166233,166242,166250,166274,166291,166302,166312,166333,166343],{"__ignoreMap":28},[33,166154,166155],{"class":35,"line":36},[33,166156,57316],{"class":39},[33,166158,166159,166161,166163,166165],{"class":35,"line":43},[33,166160,190],{"class":163},[33,166162,193],{"class":167},[33,166164,164],{"class":163},[33,166166,198],{"class":167},[33,166168,166169,166171,166173,166175],{"class":35,"line":61},[33,166170,190],{"class":163},[33,166172,57333],{"class":167},[33,166174,164],{"class":163},[33,166176,57338],{"class":167},[33,166178,166179,166181,166184,166186,166189],{"class":35,"line":73},[33,166180,190],{"class":163},[33,166182,166183],{"class":167}," your_module ",[33,166185,164],{"class":163},[33,166187,166188],{"class":167}," convert_docx_to_pdf  ",[33,166190,166191],{"class":39},"# the wrapper above\n",[33,166193,166194],{"class":35,"line":88},[33,166195,92],{"emptyLinePlaceholder":91},[33,166197,166198,166201,166203,166205,166208],{"class":35,"line":95},[33,166199,166200],{"class":167},"docx ",[33,166202,242],{"class":163},[33,166204,215],{"class":167},[33,166206,166207],{"class":54},"\"documents\u002Ftest.docx\"",[33,166209,221],{"class":167},[33,166211,166212,166215,166217,166219,166221],{"class":35,"line":101},[33,166213,166214],{"class":167},"out_dir ",[33,166216,242],{"class":163},[33,166218,215],{"class":167},[33,166220,164542],{"class":54},[33,166222,221],{"class":167},[33,166224,166225],{"class":35,"line":171},[33,166226,92],{"emptyLinePlaceholder":91},[33,166228,166229,166231],{"class":35,"line":179},[33,166230,35574],{"class":163},[33,166232,574],{"class":167},[33,166234,166235,166237,166239],{"class":35,"line":187},[33,166236,46704],{"class":167},[33,166238,242],{"class":163},[33,166240,166241],{"class":167}," convert_docx_to_pdf(docx, 
out_dir)\n",[33,166243,166244,166246,166248],{"class":35,"line":201},[33,166245,57365],{"class":167},[33,166247,242],{"class":163},[33,166249,87564],{"class":167},[33,166251,166252,166254,166256,166258,166261,166263,166265,166267,166269,166271],{"class":35,"line":206},[33,166253,7268],{"class":50},[33,166255,602],{"class":167},[33,166257,4059],{"class":163},[33,166259,166260],{"class":54},"\"PDF pages: ",[33,166262,4065],{"class":50},[33,166264,59322],{"class":167},[33,166266,1121],{"class":50},[33,166268,274],{"class":54},[33,166270,12000],{"class":167},[33,166272,166273],{"class":39},"# should be > 0\n",[33,166275,166276,166278,166280,166282,166284,166286,166288],{"class":35,"line":224},[33,166277,9228],{"class":163},[33,166279,4037],{"class":50},[33,166281,57383],{"class":167},[33,166283,6009],{"class":163},[33,166285,10791],{"class":50},[33,166287,365],{"class":167},[33,166289,166290],{"class":54},"\"Empty PDF — conversion failed silently\"\n",[33,166292,166293,166295,166297,166300],{"class":35,"line":229},[33,166294,7268],{"class":50},[33,166296,602],{"class":167},[33,166298,166299],{"class":54},"\"Conversion verified.\"",[33,166301,221],{"class":167},[33,166303,166304,166306,166308,166310],{"class":35,"line":235},[33,166305,35726],{"class":163},[33,166307,9445],{"class":50},[33,166309,1852],{"class":163},[33,166311,1855],{"class":167},[33,166313,166314,166316,166318,166320,166323,166325,166327,166329,166331],{"class":35,"line":250},[33,166315,7268],{"class":50},[33,166317,602],{"class":167},[33,166319,4059],{"class":163},[33,166321,166322],{"class":54},"\"Verification failed: 
",[33,166324,1115],{"class":50},[33,166326,6565],{"class":167},[33,166328,1121],{"class":50},[33,166330,274],{"class":54},[33,166332,221],{"class":167},[33,166334,166335,166337,166339,166341],{"class":35,"line":266},[33,166336,35726],{"class":163},[33,166338,783],{"class":50},[33,166340,1852],{"class":163},[33,166342,1855],{"class":167},[33,166344,166345,166347,166349,166351,166353,166355,166357,166359,166361],{"class":35,"line":290},[33,166346,7268],{"class":50},[33,166348,602],{"class":167},[33,166350,4059],{"class":163},[33,166352,39108],{"class":54},[33,166354,1115],{"class":50},[33,166356,6565],{"class":167},[33,166358,1121],{"class":50},[33,166360,274],{"class":54},[33,166362,221],{"class":167},[14,166364,166365,166366,166368,166369,166371,166372,166374,166375,3035],{},"A page count greater than zero confirms LibreOffice rendered the document correctly. If you are generating the source ",[30,166367,18051],{}," files programmatically, see ",[940,166370,156152],{"href":26562}," for the templating patterns that pair with this conversion step. 
For building PDFs from data without the ",[30,166373,18051],{}," intermediate, see ",[940,166376,26191],{"href":19001},[18,166378,6918],{"id":6917},[4211,166380,166381,166386,166391],{},[4214,166382,166383,166385],{},[940,166384,161278],{"href":161277}," — full guide covering both engines, batch conversion, font fidelity, and Docker deployment",[4214,166387,166388,166390],{},[940,166389,156152],{"href":26562}," — create the .docx source files before converting them",[4214,166392,166393,166395],{},[940,166394,26191],{"href":19001}," — alternative: generate PDFs directly from data using ReportLab or WeasyPrint",[14,166397,6947,166398,3035],{},[940,166399,161278],{"href":161277},[6953,166401,166402],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .shJU0, html code.shiki 
.shJU0{--shiki-default:#22863A}",{"title":28,"searchDepth":43,"depth":43,"links":166404},[166405,166406,166407,166411,166412,166413,166414,166415,166416,166417,166418],{"id":7020,"depth":43,"text":7021},{"id":35016,"depth":43,"text":35017},{"id":164114,"depth":43,"text":164115,"children":166408},[166409,166410],{"id":164127,"depth":61,"text":164128},{"id":164202,"depth":61,"text":164203},{"id":164648,"depth":43,"text":164649},{"id":165254,"depth":43,"text":165255},{"id":165496,"depth":43,"text":165497},{"id":165564,"depth":43,"text":165565},{"id":165877,"depth":43,"text":165878},{"id":166024,"depth":43,"text":166025},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fix docx2pdf on Linux","Fix \"docx2pdf is not implemented for linux\" and win32com errors by switching to LibreOffice headless subprocess — includes a cross-platform wrapper and soffice PATH fixes.",{},"\u002Fword-document-templating-batch-processing\u002Fconverting-docx-to-pdf-with-python\u002Ffix-docx2pdf-error-on-linux",{"title":163898,"description":166420},"Fix docx2pdf Error on Linux — NotImplementedError & COM 
Errors","word-document-templating-batch-processing\u002Fconverting-docx-to-pdf-with-python\u002Ffix-docx2pdf-error-on-linux\u002Findex",[47,163908,166427,43144,81739],"libreoffice","zdpixVVOSoVInTFT_zSsFyBjB44xjJebVQ-rKbG7ovw",{"id":166430,"title":161278,"body":166431,"breadcrumbTitle":170107,"canonical":6977,"date":6978,"description":170108,"draft":6980,"extension":6981,"image":6977,"meta":170109,"navigation":91,"path":170110,"robots":6977,"seo":170111,"seoTitle":170112,"stem":170113,"tags":170114,"updatedAt":6978,"__hash__":170116},"content\u002Fword-document-templating-batch-processing\u002Fconverting-docx-to-pdf-with-python\u002Findex.md",{"type":7,"value":166432,"toc":170085},[166433,166436,166441,166454,166465,166467,166510,166513,166535,166539,166545,166731,166822,166826,166831,167022,167031,167035,167049,167354,167363,167367,167459,167465,167469,167472,168036,168040,168044,168052,168058,168088,168106,168110,168113,168131,168143,168147,168151,168157,168303,168307,168312,168376,168380,168383,168429,168433,168439,168670,168674,168683,168867,168879,168888,168892,168998,169002,170052,170054,170079,170083],[10,166434,161278],{"id":166435},"converting-docx-to-pdf-with-python",[14,166437,166438,166440],{},[30,166439,18051],{}," is the source format; PDF is what you send. Doing this manually — open Word, File → Export, repeat — falls apart the moment you have a folder of 200 contracts. Python can drive the conversion engine directly, whether that engine is Microsoft Word on Windows\u002FmacOS or LibreOffice headless on a Linux server.",[14,166442,166443,166444,166446,166447,166450,166451,166453],{},"Two common tools exist: ",[30,166445,163908],{},", which wraps Word's COM automation, and LibreOffice's ",[30,166448,166449],{},"soffice --headless"," command, which runs the LibreOffice rendering engine without a GUI. Choosing the wrong one for your environment is the single most common conversion failure — ",[30,166452,163908],{}," simply will not run on Linux. 
This guide diagnoses your environment first, then provides battle-tested code for each path.",[14,166455,166456,166457,166459,166460,166462,166463,3035],{},"For background on dynamically generating the ",[30,166458,18051],{}," files you will be converting, see ",[940,166461,156152],{"href":26562},". Once converted, if you need to assemble the resulting PDFs into a single deliverable, see ",[940,166464,26191],{"href":19001},[18,166466,21],{"id":20},[23,166468,166470],{"className":25,"code":166469,"language":27,"meta":28,"style":28},"# Windows \u002F macOS (requires Microsoft Word installed)\npip install docx2pdf\n\n# Linux \u002F server environments (requires LibreOffice)\n# Ubuntu\u002FDebian: sudo apt install libreoffice\n# RHEL\u002FCentOS:   sudo yum install libreoffice\n# Verify: soffice --version\n",[30,166471,166472,166477,166486,166490,166495,166500,166505],{"__ignoreMap":28},[33,166473,166474],{"class":35,"line":36},[33,166475,166476],{"class":39},"# Windows \u002F macOS (requires Microsoft Word installed)\n",[33,166478,166479,166481,166483],{"class":35,"line":43},[33,166480,76],{"class":46},[33,166482,79],{"class":54},[33,166484,166485],{"class":54}," docx2pdf\n",[33,166487,166488],{"class":35,"line":61},[33,166489,92],{"emptyLinePlaceholder":91},[33,166491,166492],{"class":35,"line":73},[33,166493,166494],{"class":39},"# Linux \u002F server environments (requires LibreOffice)\n",[33,166496,166497],{"class":35,"line":88},[33,166498,166499],{"class":39},"# Ubuntu\u002FDebian: sudo apt install libreoffice\n",[33,166501,166502],{"class":35,"line":95},[33,166503,166504],{"class":39},"# RHEL\u002FCentOS:   sudo yum install libreoffice\n",[33,166506,166507],{"class":35,"line":101},[33,166508,166509],{"class":39},"# Verify: soffice --version\n",[14,166511,166512],{},"Set up a test file before proceeding:",[23,166514,166516],{"className":25,"code":166515,"language":27,"meta":28,"style":28},"mkdir -p input_docs output_pdfs\n# Place a sample.docx in input_docs\u002F 
before running the examples below\n",[30,166517,166518,166530],{"__ignoreMap":28},[33,166519,166520,166522,166524,166527],{"class":35,"line":36},[33,166521,59501],{"class":46},[33,166523,59504],{"class":50},[33,166525,166526],{"class":54}," input_docs",[33,166528,166529],{"class":54}," output_pdfs\n",[33,166531,166532],{"class":35,"line":43},[33,166533,166534],{"class":39},"# Place a sample.docx in input_docs\u002F before running the examples below\n",[18,166536,166538],{"id":166537},"_1-detect-your-environment","1. Detect Your Environment",[14,166540,166541,166542,166544],{},"Before writing a single conversion line, check which engine is available. Running docx2pdf on Linux raises ",[30,166543,86120],{}," immediately; running LibreOffice on a machine where only Word is available wastes time. The snippet below makes the engine explicit at startup.",[23,166546,166548],{"className":126,"code":166547,"language":47,"meta":28,"style":28},"# pip install docx2pdf  (Windows\u002FmacOS only)\nimport platform\nimport shutil\nfrom pathlib import Path\n\ndef detect_engine() -> str:\n    \"\"\"Return 'docx2pdf' on Windows\u002FmacOS if Word is accessible, else 'libreoffice'.\"\"\"\n    system = platform.system()\n    if system in (\"Windows\", \"Darwin\"):\n        try:\n            import docx2pdf  # noqa: F401\n            return \"docx2pdf\"\n        except ImportError:\n            pass\n    soffice = shutil.which(\"soffice\") or shutil.which(\"libreoffice\")\n    if soffice:\n        return \"libreoffice\"\n    raise RuntimeError(\n        \"No conversion engine found. 
\"\n        \"Install docx2pdf (Windows\u002FmacOS) or LibreOffice (Linux\u002Fserver).\"\n    )\n\nprint(detect_engine())\n",[30,166549,166550,166555,166561,166567,166577,166581,166594,166599,166608,166627,166633,166644,166651,166659,166663,166684,166691,166698,166706,166711,166716,166720,166724],{"__ignoreMap":28},[33,166551,166552],{"class":35,"line":36},[33,166553,166554],{"class":39},"# pip install docx2pdf  (Windows\u002FmacOS only)\n",[33,166556,166557,166559],{"class":35,"line":43},[33,166558,164],{"class":163},[33,166560,163978],{"class":167},[33,166562,166563,166565],{"class":35,"line":61},[33,166564,164],{"class":163},[33,166566,41706],{"class":167},[33,166568,166569,166571,166573,166575],{"class":35,"line":73},[33,166570,190],{"class":163},[33,166572,193],{"class":167},[33,166574,164],{"class":163},[33,166576,198],{"class":167},[33,166578,166579],{"class":35,"line":88},[33,166580,92],{"emptyLinePlaceholder":91},[33,166582,166583,166585,166588,166590,166592],{"class":35,"line":95},[33,166584,562],{"class":163},[33,166586,166587],{"class":46}," detect_engine",[33,166589,568],{"class":167},[33,166591,1053],{"class":50},[33,166593,574],{"class":167},[33,166595,166596],{"class":35,"line":101},[33,166597,166598],{"class":54},"    \"\"\"Return 'docx2pdf' on Windows\u002FmacOS if Word is accessible, else 'libreoffice'.\"\"\"\n",[33,166600,166601,166604,166606],{"class":35,"line":171},[33,166602,166603],{"class":167},"    system ",[33,166605,242],{"class":163},[33,166607,164714],{"class":167},[33,166609,166610,166612,166615,166617,166619,166621,166623,166625],{"class":35,"line":179},[33,166611,617],{"class":163},[33,166613,166614],{"class":167}," system 
",[33,166616,662],{"class":163},[33,166618,17583],{"class":167},[33,166620,164831],{"class":54},[33,166622,365],{"class":167},[33,166624,164836],{"class":54},[33,166626,1737],{"class":167},[33,166628,166629,166631],{"class":35,"line":187},[33,166630,670],{"class":163},[33,166632,574],{"class":167},[33,166634,166635,166638,166641],{"class":35,"line":201},[33,166636,166637],{"class":163},"            import",[33,166639,166640],{"class":167}," docx2pdf  ",[33,166642,166643],{"class":39},"# noqa: F401\n",[33,166645,166646,166648],{"class":35,"line":206},[33,166647,28782],{"class":163},[33,166649,166650],{"class":54}," \"docx2pdf\"\n",[33,166652,166653,166655,166657],{"class":35,"line":224},[33,166654,780],{"class":163},[33,166656,40488],{"class":50},[33,166658,574],{"class":167},[33,166660,166661],{"class":35,"line":229},[33,166662,3552],{"class":163},[33,166664,166665,166668,166670,166672,166674,166676,166678,166680,166682],{"class":35,"line":235},[33,166666,166667],{"class":167},"    soffice ",[33,166669,242],{"class":163},[33,166671,41716],{"class":167},[33,166673,165324],{"class":54},[33,166675,1649],{"class":167},[33,166677,7162],{"class":163},[33,166679,41716],{"class":167},[33,166681,165333],{"class":54},[33,166683,221],{"class":167},[33,166685,166686,166688],{"class":35,"line":250},[33,166687,617],{"class":163},[33,166689,166690],{"class":167}," soffice:\n",[33,166692,166693,166695],{"class":35,"line":266},[33,166694,1659],{"class":163},[33,166696,166697],{"class":54}," \"libreoffice\"\n",[33,166699,166700,166702,166704],{"class":35,"line":290},[33,166701,35742],{"class":163},[33,166703,7590],{"class":50},[33,166705,7637],{"class":167},[33,166707,166708],{"class":35,"line":295},[33,166709,166710],{"class":54},"        \"No conversion engine found. 
\"\n",[33,166712,166713],{"class":35,"line":300},[33,166714,166715],{"class":54},"        \"Install docx2pdf (Windows\u002FmacOS) or LibreOffice (Linux\u002Fserver).\"\n",[33,166717,166718],{"class":35,"line":317},[33,166719,1202],{"class":167},[33,166721,166722],{"class":35,"line":332},[33,166723,92],{"emptyLinePlaceholder":91},[33,166725,166726,166728],{"class":35,"line":347},[33,166727,13474],{"class":50},[33,166729,166730],{"class":167},"(detect_engine())\n",[2540,166732,2547,166734,2547,166737,2547,166740,2547,2547,166754,2547,166756,2547,166759,2547,2547,166762,2547,2547,166765,2547,166768,2547,166771,2547,2547,166773,2547,166775,2547,2547,166778,2547,166781,2547,166784,2547,166787,2547,2547,166789,2547,166791,2547,2547,166795,2547,166797,2547,166801,2547,166804,2547,2547,166807,2547,2547,166809,2547,2547,166811,2547,166813,2547,166815,2547,2547,166817,2547,166819],{"viewBox":11071,"role":2543,"ariaLabel":166733,"xmlns":2545,"style":2546},"Decision tree for choosing a DOCX to PDF conversion engine",[2549,166735,166736],{},"DOCX to PDF engine decision tree",[2553,166738,166739],{},"A decision tree showing that a .docx file feeds into an OS check; Windows or macOS leads to docx2pdf via Word COM, while Linux\u002Fserver leads to LibreOffice headless soffice, and both produce a PDF output.",[2557,166741,2559,166742,2559,166749,2547],{},[2561,166743,2564,166745,2564,166747,2559],{"id":166744,"x1":748,"y1":748,"x2":734,"y2":748},"docx2pdf-grad",[2566,166746],{"offset":748,"style":2568},[2566,166748],{"offset":734,"style":2571},[2573,166750,2564,166752,2559],{"id":166751,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"docx2pdf-arrow",[2580,166753],{"d":2582,"fill":2583},[2585,166755],{"x":11231,"y":2587,"width":58337,"height":2680,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,166757,166758],{"x":2626,"y":26323,"fill":2599,"style":59932},".docx 
file",[2000,166760,166761],{"x":2626,"y":71517,"fill":2583,"style":2605},"source document",[35,166763],{"x1":2626,"y1":38748,"x2":2626,"y2":2679,"stroke":2583,"markerEnd":166764,"style":2594},"url(#docx2pdf-arrow)",[49826,166766],{"points":166767,"fill":2615,"stroke":2593,"style":2594},"380,110 480,155 380,200 280,155",[2000,166769,166770],{"x":2626,"y":2635,"fill":2599,"style":2685},"OS check",[2000,166772,163998],{"x":2626,"y":142716,"fill":2583,"style":2605},[35,166774],{"x1":49839,"y1":2598,"x2":2588,"y2":2598,"stroke":2583,"markerEnd":166764,"style":2594},[2000,166776,166777],{"x":2693,"y":11194,"fill":11166,"style":2605},"Windows \u002F macOS",[2585,166779],{"x":2587,"y":2589,"width":2679,"height":38748,"rx":2591,"fill":166780,"stroke":2593,"style":2594},"url(#docx2pdf-grad)",[2000,166782,163908],{"x":16991,"y":166783,"fill":2599,"style":59932},"149",[2000,166785,166786],{"x":16991,"y":59956,"fill":2599,"style":2605},"Word COM",[2000,166788,6989],{"x":16991,"y":38806,"fill":2583,"style":11127},[35,166790],{"x1":49852,"y1":2598,"x2":110848,"y2":2598,"stroke":2583,"markerEnd":166764,"style":2594},[2000,166792,166794],{"x":166793,"y":11194,"fill":11166,"style":2605},"555","Linux \u002F server",[2585,166796],{"x":110848,"y":2589,"width":2679,"height":38748,"rx":2591,"fill":166780,"stroke":2593,"style":2594},[2000,166798,166800],{"x":166799,"y":166783,"fill":2599,"style":59932},"685","LibreOffice",[2000,166802,166803],{"x":166799,"y":59956,"fill":2599,"style":2605},"headless",[2000,166805,166806],{"x":166799,"y":38806,"fill":2583,"style":11127},"soffice 
CLI",[35,166808],{"x1":16991,"y1":2697,"x2":16991,"y2":49869,"stroke":2583,"markerEnd":166764,"style":2594},[35,166810],{"x1":166799,"y1":2697,"x2":166799,"y2":49869,"stroke":2583,"markerEnd":166764,"style":2594},[35,166812],{"x1":16991,"y1":71592,"x2":26433,"y2":71592,"stroke":2583,"style":2594},[35,166814],{"x1":166799,"y1":71592,"x2":64900,"y2":71592,"stroke":2583,"style":2594},[35,166816],{"x1":2626,"y1":71592,"x2":2626,"y2":125489,"stroke":2583,"markerEnd":166764,"style":2594},[2585,166818],{"x":11231,"y":125489,"width":58337,"height":11103,"rx":2681,"fill":11165,"stroke":11166,"style":2594},[2000,166820,59987],{"x":2626,"y":166821,"fill":2599,"style":59932},"301",[18,166823,166825],{"id":166824},"_2-convert-a-single-file-with-docx2pdf-windows-macos","2. Convert a Single File with docx2pdf (Windows \u002F macOS)",[14,166827,166828,166830],{},[30,166829,163908],{}," calls into the running Word instance via COM automation on Windows and via AppleScript on macOS. The output PDF preserves all Word styles, embedded fonts, and track-changes markup exactly as Word renders them.",[23,166832,166834],{"className":126,"code":166833,"language":47,"meta":28,"style":28},"# pip install docx2pdf\nfrom pathlib import Path\nfrom docx2pdf import convert\n\nINPUT = Path(\"input_docs\u002Fcontract.docx\")\nOUTPUT = Path(\"output_pdfs\u002Fcontract.pdf\")\n\ntry:\n    OUTPUT.parent.mkdir(parents=True, exist_ok=True)\n    convert(INPUT, OUTPUT)\n    print(f\"Converted: {OUTPUT}\")\nexcept FileNotFoundError as exc:\n    print(f\"Input not found: {exc}\")\nexcept Exception as exc:\n    # On Linux this raises NotImplementedError; on Windows it may raise\n    # com_error if Word is not installed.\n    print(f\"Conversion failed: 
{exc}\")\n",[30,166835,166836,166840,166850,166860,166864,166877,166890,166894,166900,166923,166935,166951,166961,166981,166991,166996,167001],{"__ignoreMap":28},[33,166837,166838],{"class":35,"line":36},[33,166839,163971],{"class":39},[33,166841,166842,166844,166846,166848],{"class":35,"line":43},[33,166843,190],{"class":163},[33,166845,193],{"class":167},[33,166847,164],{"class":163},[33,166849,198],{"class":167},[33,166851,166852,166854,166856,166858],{"class":35,"line":61},[33,166853,190],{"class":163},[33,166855,164024],{"class":167},[33,166857,164],{"class":163},[33,166859,164029],{"class":167},[33,166861,166862],{"class":35,"line":73},[33,166863,92],{"emptyLinePlaceholder":91},[33,166865,166866,166868,166870,166872,166875],{"class":35,"line":88},[33,166867,102346],{"class":50},[33,166869,212],{"class":163},[33,166871,215],{"class":167},[33,166873,166874],{"class":54},"\"input_docs\u002Fcontract.docx\"",[33,166876,221],{"class":167},[33,166878,166879,166881,166883,166885,166888],{"class":35,"line":95},[33,166880,96935],{"class":50},[33,166882,212],{"class":163},[33,166884,215],{"class":167},[33,166886,166887],{"class":54},"\"output_pdfs\u002Fcontract.pdf\"",[33,166889,221],{"class":167},[33,166891,166892],{"class":35,"line":101},[33,166893,92],{"emptyLinePlaceholder":91},[33,166895,166896,166898],{"class":35,"line":171},[33,166897,35574],{"class":163},[33,166899,574],{"class":167},[33,166901,166902,166905,166907,166909,166911,166913,166915,166917,166919,166921],{"class":35,"line":179},[33,166903,166904],{"class":50},"    
OUTPUT",[33,166906,866],{"class":167},[33,166908,869],{"class":238},[33,166910,242],{"class":163},[33,166912,855],{"class":50},[33,166914,365],{"class":167},[33,166916,878],{"class":238},[33,166918,242],{"class":163},[33,166920,855],{"class":50},[33,166922,221],{"class":167},[33,166924,166925,166927,166929,166931,166933],{"class":35,"line":187},[33,166926,164034],{"class":167},[33,166928,102346],{"class":50},[33,166930,365],{"class":167},[33,166932,96935],{"class":50},[33,166934,221],{"class":167},[33,166936,166937,166939,166941,166943,166945,166947,166949],{"class":35,"line":201},[33,166938,7268],{"class":50},[33,166940,602],{"class":167},[33,166942,4059],{"class":163},[33,166944,164574],{"class":54},[33,166946,97684],{"class":50},[33,166948,274],{"class":54},[33,166950,221],{"class":167},[33,166952,166953,166955,166957,166959],{"class":35,"line":206},[33,166954,35726],{"class":163},[33,166956,2945],{"class":50},[33,166958,1852],{"class":163},[33,166960,1855],{"class":167},[33,166962,166963,166965,166967,166969,166971,166973,166975,166977,166979],{"class":35,"line":224},[33,166964,7268],{"class":50},[33,166966,602],{"class":167},[33,166968,4059],{"class":163},[33,166970,16624],{"class":54},[33,166972,1115],{"class":50},[33,166974,6565],{"class":167},[33,166976,1121],{"class":50},[33,166978,274],{"class":54},[33,166980,221],{"class":167},[33,166982,166983,166985,166987,166989],{"class":35,"line":229},[33,166984,35726],{"class":163},[33,166986,783],{"class":50},[33,166988,1852],{"class":163},[33,166990,1855],{"class":167},[33,166992,166993],{"class":35,"line":235},[33,166994,166995],{"class":39},"    # On Linux this raises NotImplementedError; on Windows it may raise\n",[33,166997,166998],{"class":35,"line":250},[33,166999,167000],{"class":39},"    # com_error if Word is not 
installed.\n",[33,167002,167003,167005,167007,167009,167012,167014,167016,167018,167020],{"class":35,"line":266},[33,167004,7268],{"class":50},[33,167006,602],{"class":167},[33,167008,4059],{"class":163},[33,167010,167011],{"class":54},"\"Conversion failed: ",[33,167013,1115],{"class":50},[33,167015,6565],{"class":167},[33,167017,1121],{"class":50},[33,167019,274],{"class":54},[33,167021,221],{"class":167},[14,167023,167024,167027,167028,167030],{},[30,167025,167026],{},"convert()"," accepts a file path or a directory path. When you pass a directory it converts every ",[30,167029,18051],{}," it finds into the same folder. For custom output directories use the two-argument form shown above.",[18,167032,167034],{"id":167033},"_3-convert-a-single-file-with-libreoffice-headless-linux-server","3. Convert a Single File with LibreOffice Headless (Linux \u002F Server)",[14,167036,167037,167038,167041,167042,167045,167046,167048],{},"LibreOffice's ",[30,167039,167040],{},"--headless"," flag runs the full rendering pipeline without spawning a window. The ",[30,167043,167044],{},"--outdir"," flag controls where the PDF lands. 
Use ",[30,167047,35794],{}," so Python captures errors and exit codes.",[23,167050,167052],{"className":126,"code":167051,"language":47,"meta":28,"style":28},"# No pip install needed — requires: sudo apt install libreoffice\nimport subprocess\nfrom pathlib import Path\n\nINPUT = Path(\"input_docs\u002Fcontract.docx\")\nOUTPUT_DIR = Path(\"output_pdfs\")\n\ntry:\n    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n    result = subprocess.run(\n        [\n            \"soffice\",\n            \"--headless\",\n            \"--convert-to\", \"pdf\",\n            \"--outdir\", str(OUTPUT_DIR),\n            str(INPUT),\n        ],\n        capture_output=True,\n        text=True,\n        timeout=60,\n    )\n    if result.returncode != 0:\n        raise RuntimeError(result.stderr.strip())\n    print(f\"Converted: {OUTPUT_DIR \u002F (INPUT.stem + '.pdf')}\")\nexcept FileNotFoundError:\n    print(\"soffice not found — install LibreOffice and ensure it is on PATH.\")\nexcept subprocess.TimeoutExpired:\n    print(\"Conversion timed out. 
File may be corrupt or very large.\")\nexcept Exception as exc:\n    print(f\"Conversion failed: {exc}\")\n",[30,167053,167054,167059,167065,167075,167079,167091,167103,167107,167113,167136,167144,167148,167154,167160,167170,167184,167194,167198,167208,167218,167228,167232,167244,167253,167288,167296,167307,167313,167324,167334],{"__ignoreMap":28},[33,167055,167056],{"class":35,"line":36},[33,167057,167058],{"class":39},"# No pip install needed — requires: sudo apt install libreoffice\n",[33,167060,167061,167063],{"class":35,"line":43},[33,167062,164],{"class":163},[33,167064,35040],{"class":167},[33,167066,167067,167069,167071,167073],{"class":35,"line":61},[33,167068,190],{"class":163},[33,167070,193],{"class":167},[33,167072,164],{"class":163},[33,167074,198],{"class":167},[33,167076,167077],{"class":35,"line":73},[33,167078,92],{"emptyLinePlaceholder":91},[33,167080,167081,167083,167085,167087,167089],{"class":35,"line":88},[33,167082,102346],{"class":50},[33,167084,212],{"class":163},[33,167086,215],{"class":167},[33,167088,166874],{"class":54},[33,167090,221],{"class":167},[33,167092,167093,167095,167097,167099,167101],{"class":35,"line":95},[33,167094,4615],{"class":50},[33,167096,212],{"class":163},[33,167098,215],{"class":167},[33,167100,164542],{"class":54},[33,167102,221],{"class":167},[33,167104,167105],{"class":35,"line":101},[33,167106,92],{"emptyLinePlaceholder":91},[33,167108,167109,167111],{"class":35,"line":171},[33,167110,35574],{"class":163},[33,167112,574],{"class":167},[33,167114,167115,167118,167120,167122,167124,167126,167128,167130,167132,167134],{"class":35,"line":179},[33,167116,167117],{"class":50},"    
OUTPUT_DIR",[33,167119,1078],{"class":167},[33,167121,869],{"class":238},[33,167123,242],{"class":163},[33,167125,855],{"class":50},[33,167127,365],{"class":167},[33,167129,878],{"class":238},[33,167131,242],{"class":163},[33,167133,855],{"class":50},[33,167135,221],{"class":167},[33,167137,167138,167140,167142],{"class":35,"line":187},[33,167139,8842],{"class":167},[33,167141,242],{"class":163},[33,167143,35060],{"class":167},[33,167145,167146],{"class":35,"line":201},[33,167147,19619],{"class":167},[33,167149,167150,167152],{"class":35,"line":206},[33,167151,164310],{"class":54},[33,167153,247],{"class":167},[33,167155,167156,167158],{"class":35,"line":224},[33,167157,164317],{"class":54},[33,167159,247],{"class":167},[33,167161,167162,167164,167166,167168],{"class":35,"line":229},[33,167163,148389],{"class":54},[33,167165,365],{"class":167},[33,167167,15519],{"class":54},[33,167169,247],{"class":167},[33,167171,167172,167174,167176,167178,167180,167182],{"class":35,"line":235},[33,167173,148401],{"class":54},[33,167175,365],{"class":167},[33,167177,1053],{"class":50},[33,167179,602],{"class":167},[33,167181,4615],{"class":50},[33,167183,1506],{"class":167},[33,167185,167186,167188,167190,167192],{"class":35,"line":250},[33,167187,10673],{"class":50},[33,167189,602],{"class":167},[33,167191,102346],{"class":50},[33,167193,1506],{"class":167},[33,167195,167196],{"class":35,"line":266},[33,167197,20776],{"class":167},[33,167199,167200,167202,167204,167206],{"class":35,"line":290},[33,167201,148430],{"class":238},[33,167203,242],{"class":163},[33,167205,855],{"class":50},[33,167207,247],{"class":167},[33,167209,167210,167212,167214,167216],{"class":35,"line":295},[33,167211,148441],{"class":238},[33,167213,242],{"class":163},[33,167215,855],{"class":50},[33,167217,247],{"class":167},[33,167219,167220,167222,167224,167226],{"class":35,"line":300},[33,167221,148452],{"class":238},[33,167223,242],{"class":163},[33,167225,2590],{"class":50},[33,167227,247],{"class":167},
[33,167229,167230],{"class":35,"line":317},[33,167231,1202],{"class":167},[33,167233,167234,167236,167238,167240,167242],{"class":35,"line":332},[33,167235,617],{"class":163},[33,167237,35108],{"class":167},[33,167239,17877],{"class":163},[33,167241,10791],{"class":50},[33,167243,574],{"class":167},[33,167245,167246,167248,167250],{"class":35,"line":347},[33,167247,4051],{"class":163},[33,167249,7590],{"class":50},[33,167251,167252],{"class":167},"(result.stderr.strip())\n",[33,167254,167255,167257,167259,167261,167263,167266,167268,167270,167272,167275,167277,167280,167282,167284,167286],{"class":35,"line":374},[33,167256,7268],{"class":50},[33,167258,602],{"class":167},[33,167260,4059],{"class":163},[33,167262,164574],{"class":54},[33,167264,167265],{"class":50},"{OUTPUT_DIR",[33,167267,1107],{"class":163},[33,167269,17583],{"class":167},[33,167271,102346],{"class":50},[33,167273,167274],{"class":167},".stem ",[33,167276,1811],{"class":163},[33,167278,167279],{"class":54}," '.pdf'",[33,167281,12027],{"class":167},[33,167283,1121],{"class":50},[33,167285,274],{"class":54},[33,167287,221],{"class":167},[33,167289,167290,167292,167294],{"class":35,"line":397},[33,167291,35726],{"class":163},[33,167293,2945],{"class":50},[33,167295,574],{"class":167},[33,167297,167298,167300,167302,167305],{"class":35,"line":653},[33,167299,7268],{"class":50},[33,167301,602],{"class":167},[33,167303,167304],{"class":54},"\"soffice not found — install LibreOffice and ensure it is on PATH.\"",[33,167306,221],{"class":167},[33,167308,167309,167311],{"class":35,"line":667},[33,167310,35726],{"class":163},[33,167312,148550],{"class":167},[33,167314,167315,167317,167319,167322],{"class":35,"line":675},[33,167316,7268],{"class":50},[33,167318,602],{"class":167},[33,167320,167321],{"class":54},"\"Conversion timed out. 
File may be corrupt or very large.\"",[33,167323,221],{"class":167},[33,167325,167326,167328,167330,167332],{"class":35,"line":689},[33,167327,35726],{"class":163},[33,167329,783],{"class":50},[33,167331,1852],{"class":163},[33,167333,1855],{"class":167},[33,167335,167336,167338,167340,167342,167344,167346,167348,167350,167352],{"class":35,"line":703},[33,167337,7268],{"class":50},[33,167339,602],{"class":167},[33,167341,4059],{"class":163},[33,167343,167011],{"class":54},[33,167345,1115],{"class":50},[33,167347,6565],{"class":167},[33,167349,1121],{"class":50},[33,167351,274],{"class":54},[33,167353,221],{"class":167},[14,167355,167356,167357,167359,167360,3035],{},"Note: LibreOffice creates a user profile directory on first run. On headless servers this can default to a locked or read-only location. If you see ",[30,167358,166091],{}," errors, see the variant fix in ",[940,167361,163898],{"href":167362},"\u002Fword-document-templating-batch-processing\u002Fconverting-docx-to-pdf-with-python\u002Ffix-docx2pdf-error-on-linux\u002F",[18,167364,167366],{"id":167365},"_4-docx2pdf-vs-libreoffice-vs-cloud-engine-comparison","4. 
docx2pdf vs LibreOffice vs Cloud — Engine Comparison",[4273,167368,167369,167386],{},[4276,167370,167371],{},[4279,167372,167373,167375,167378,167381,167383],{},[4282,167374,142769],{"align":128901},[4282,167376,167377],{"align":128901},"Platform",[4282,167379,167380],{"align":128901},"Fidelity",[4282,167382,107515],{"align":128901},[4282,167384,167385],{"align":128901},"Requires",[4292,167387,167388,167406,167425,167443],{},[4279,167389,167390,167394,167397,167400,167403],{},[4297,167391,167392],{"align":128901},[1974,167393,163908],{},[4297,167395,167396],{"align":128901},"Windows, macOS",[4297,167398,167399],{"align":128901},"Highest — Word renders it",[4297,167401,167402],{"align":128901},"Medium (COM overhead)",[4297,167404,167405],{"align":128901},"Microsoft Word installed",[4279,167407,167408,167413,167416,167419,167422],{},[4297,167409,167410],{"align":128901},[1974,167411,167412],{},"LibreOffice headless",[4297,167414,167415],{"align":128901},"Linux, macOS, Windows",[4297,167417,167418],{"align":128901},"Good — minor font diffs possible",[4297,167420,167421],{"align":128901},"Fast, parallelisable",[4297,167423,167424],{"align":128901},"LibreOffice ≥ 7 on PATH",[4279,167426,167427,167432,167434,167437,167440],{},[4297,167428,167429,167431],{"align":128901},[1974,167430,84302],{}," (Adobe, Zamzar, GroupDocs)",[4297,167433,123744],{"align":128901},[4297,167435,167436],{"align":128901},"Varies by vendor",[4297,167438,167439],{"align":128901},"Network-bound",[4297,167441,167442],{"align":128901},"API key + internet",[4279,167444,167445,167449,167451,167454,167456],{},[4297,167446,167447],{"align":128901},[1974,167448,20883],{},[4297,167450,123744],{"align":128901},[4297,167452,167453],{"align":128901},"HTML\u002FCSS only",[4297,167455,38586],{"align":128901},[4297,167457,167458],{"align":128901},"HTML intermediate step",[14,167460,167461,167462,167464],{},"If your documents use complex Word-specific features — mail merge fields, ActiveX controls, or custom VBA 
macros — only ",[30,167463,163908],{}," (which uses Word itself) will reproduce them faithfully. LibreOffice handles standard paragraph styles, tables, headers\u002Ffooters, and embedded images reliably; fidelity degrades with advanced typography or proprietary OOXML extensions.",[18,167466,167468],{"id":167467},"_5-batch-convert-a-folder","5. Batch Convert a Folder",[14,167470,167471],{},"Converting a folder preserves the relative directory structure in the output. Both engines handle this differently; the snippet below wraps both in a unified interface.",[23,167473,167475],{"className":126,"code":167474,"language":47,"meta":28,"style":28},"# pip install docx2pdf   (Windows\u002FmacOS path)\n# Linux path requires: sudo apt install libreoffice\nimport platform\nimport shutil\nimport subprocess\nfrom pathlib import Path\n\ndef batch_convert(input_dir: Path, output_dir: Path) -> list[Path]:\n    \"\"\"Convert all .docx files in input_dir to PDF and write them to output_dir.\"\"\"\n    input_dir = input_dir.resolve()\n    output_dir = output_dir.resolve()\n    output_dir.mkdir(parents=True, exist_ok=True)\n\n    docx_files = list(input_dir.rglob(\"*.docx\"))\n    if not docx_files:\n        print(\"No .docx files found.\")\n        return []\n\n    system = platform.system()\n    converted: list[Path] = []\n\n    for docx_path in docx_files:\n        # Mirror subdirectory structure\n        rel = docx_path.relative_to(input_dir)\n        out_subdir = output_dir \u002F rel.parent\n        out_subdir.mkdir(parents=True, exist_ok=True)\n        pdf_path = out_subdir \u002F (docx_path.stem + \".pdf\")\n\n        try:\n            if system in (\"Windows\", \"Darwin\"):\n                from docx2pdf import convert\n                convert(docx_path, pdf_path)\n            else:\n                result = subprocess.run(\n                    [\"soffice\", \"--headless\", \"--convert-to\", \"pdf\",\n                     \"--outdir\", str(out_subdir), str(docx_path)],\n       
             capture_output=True, text=True, timeout=120,\n                )\n                if result.returncode != 0:\n                    raise RuntimeError(result.stderr.strip())\n            converted.append(pdf_path)\n            print(f\"  OK: {rel}\")\n        except Exception as exc:\n            print(f\"  FAIL: {rel} — {exc}\")\n\n    return converted\n\n\nif __name__ == \"__main__\":\n    results = batch_convert(\n        input_dir=Path(\"input_docs\"),\n        output_dir=Path(\"output_pdfs\"),\n    )\n    print(f\"\\nConverted {len(results)} file(s).\")\n",[30,167476,167477,167482,167487,167493,167499,167505,167515,167519,167529,167534,167543,167551,167571,167575,167592,167601,167612,167618,167622,167630,167639,167643,167654,167659,167669,167683,167704,167724,167728,167734,167752,167763,167768,167774,167783,167805,167822,167849,167853,167865,167873,167878,167899,167909,167938,167942,167949,167953,167957,167969,167978,167992,168005,168009],{"__ignoreMap":28},[33,167478,167479],{"class":35,"line":36},[33,167480,167481],{"class":39},"# pip install docx2pdf   (Windows\u002FmacOS path)\n",[33,167483,167484],{"class":35,"line":43},[33,167485,167486],{"class":39},"# Linux path requires: sudo apt install libreoffice\n",[33,167488,167489,167491],{"class":35,"line":61},[33,167490,164],{"class":163},[33,167492,163978],{"class":167},[33,167494,167495,167497],{"class":35,"line":73},[33,167496,164],{"class":163},[33,167498,41706],{"class":167},[33,167500,167501,167503],{"class":35,"line":88},[33,167502,164],{"class":163},[33,167504,35040],{"class":167},[33,167506,167507,167509,167511,167513],{"class":35,"line":95},[33,167508,190],{"class":163},[33,167510,193],{"class":167},[33,167512,164],{"class":163},[33,167514,198],{"class":167},[33,167516,167517],{"class":35,"line":101},[33,167518,92],{"emptyLinePlaceholder":91},[33,167520,167521,167523,167526],{"class":35,"line":171},[33,167522,562],{"class":163},[33,167524,167525],{"class":46}," 
batch_convert",[33,167527,167528],{"class":167},"(input_dir: Path, output_dir: Path) -> list[Path]:\n",[33,167530,167531],{"class":35,"line":179},[33,167532,167533],{"class":54},"    \"\"\"Convert all .docx files in input_dir to PDF and write them to output_dir.\"\"\"\n",[33,167535,167536,167538,167540],{"class":35,"line":187},[33,167537,69437],{"class":167},[33,167539,242],{"class":163},[33,167541,167542],{"class":167}," input_dir.resolve()\n",[33,167544,167545,167547,167549],{"class":35,"line":201},[33,167546,22180],{"class":167},[33,167548,242],{"class":163},[33,167550,164269],{"class":167},[33,167552,167553,167555,167557,167559,167561,167563,167565,167567,167569],{"class":35,"line":206},[33,167554,6346],{"class":167},[33,167556,869],{"class":238},[33,167558,242],{"class":163},[33,167560,855],{"class":50},[33,167562,365],{"class":167},[33,167564,878],{"class":238},[33,167566,242],{"class":163},[33,167568,855],{"class":50},[33,167570,221],{"class":167},[33,167572,167573],{"class":35,"line":224},[33,167574,92],{"emptyLinePlaceholder":91},[33,167576,167577,167580,167582,167584,167587,167590],{"class":35,"line":229},[33,167578,167579],{"class":167},"    docx_files ",[33,167581,242],{"class":163},[33,167583,599],{"class":50},[33,167585,167586],{"class":167},"(input_dir.rglob(",[33,167588,167589],{"class":54},"\"*.docx\"",[33,167591,371],{"class":167},[33,167593,167594,167596,167598],{"class":35,"line":235},[33,167595,617],{"class":163},[33,167597,620],{"class":163},[33,167599,167600],{"class":167}," docx_files:\n",[33,167602,167603,167605,167607,167610],{"class":35,"line":250},[33,167604,9414],{"class":50},[33,167606,602],{"class":167},[33,167608,167609],{"class":54},"\"No .docx files 
found.\"",[33,167611,221],{"class":167},[33,167613,167614,167616],{"class":35,"line":266},[33,167615,1659],{"class":163},[33,167617,589],{"class":167},[33,167619,167620],{"class":35,"line":290},[33,167621,92],{"emptyLinePlaceholder":91},[33,167623,167624,167626,167628],{"class":35,"line":295},[33,167625,166603],{"class":167},[33,167627,242],{"class":163},[33,167629,164714],{"class":167},[33,167631,167632,167635,167637],{"class":35,"line":300},[33,167633,167634],{"class":167},"    converted: list[Path] ",[33,167636,242],{"class":163},[33,167638,589],{"class":167},[33,167640,167641],{"class":35,"line":317},[33,167642,92],{"emptyLinePlaceholder":91},[33,167644,167645,167647,167650,167652],{"class":35,"line":332},[33,167646,656],{"class":163},[33,167648,167649],{"class":167}," docx_path ",[33,167651,662],{"class":163},[33,167653,167600],{"class":167},[33,167655,167656],{"class":35,"line":347},[33,167657,167658],{"class":39},"        # Mirror subdirectory structure\n",[33,167660,167661,167664,167666],{"class":35,"line":374},[33,167662,167663],{"class":167},"        rel ",[33,167665,242],{"class":163},[33,167667,167668],{"class":167}," docx_path.relative_to(input_dir)\n",[33,167670,167671,167674,167676,167678,167680],{"class":35,"line":397},[33,167672,167673],{"class":167},"        out_subdir ",[33,167675,242],{"class":163},[33,167677,6393],{"class":167},[33,167679,1351],{"class":163},[33,167681,167682],{"class":167}," rel.parent\n",[33,167684,167685,167688,167690,167692,167694,167696,167698,167700,167702],{"class":35,"line":653},[33,167686,167687],{"class":167},"        out_subdir.mkdir(",[33,167689,869],{"class":238},[33,167691,242],{"class":163},[33,167693,855],{"class":50},[33,167695,365],{"class":167},[33,167697,878],{"class":238},[33,167699,242],{"class":163},[33,167701,855],{"class":50},[33,167703,221],{"class":167},[33,167705,167706,167709,167711,167714,167716,167718,167720,167722],{"class":35,"line":667},[33,167707,167708],{"class":167},"        pdf_path 
",[33,167710,242],{"class":163},[33,167712,167713],{"class":167}," out_subdir ",[33,167715,1351],{"class":163},[33,167717,164455],{"class":167},[33,167719,1811],{"class":163},[33,167721,164460],{"class":54},[33,167723,221],{"class":167},[33,167725,167726],{"class":35,"line":675},[33,167727,92],{"emptyLinePlaceholder":91},[33,167729,167730,167732],{"class":35,"line":689},[33,167731,670],{"class":163},[33,167733,574],{"class":167},[33,167735,167736,167738,167740,167742,167744,167746,167748,167750],{"class":35,"line":703},[33,167737,5995],{"class":163},[33,167739,166614],{"class":167},[33,167741,662],{"class":163},[33,167743,17583],{"class":167},[33,167745,164831],{"class":54},[33,167747,365],{"class":167},[33,167749,164836],{"class":54},[33,167751,1737],{"class":167},[33,167753,167754,167757,167759,167761],{"class":35,"line":714},[33,167755,167756],{"class":163},"                from",[33,167758,164024],{"class":167},[33,167760,164],{"class":163},[33,167762,164029],{"class":167},[33,167764,167765],{"class":35,"line":723},[33,167766,167767],{"class":167},"                convert(docx_path, pdf_path)\n",[33,167769,167770,167772],{"class":35,"line":754},[33,167771,8705],{"class":163},[33,167773,574],{"class":167},[33,167775,167776,167779,167781],{"class":35,"line":771},[33,167777,167778],{"class":167},"                result ",[33,167780,242],{"class":163},[33,167782,35060],{"class":167},[33,167784,167785,167788,167790,167792,167794,167796,167799,167801,167803],{"class":35,"line":777},[33,167786,167787],{"class":167},"                    [",[33,167789,165324],{"class":54},[33,167791,365],{"class":167},[33,167793,148375],{"class":54},[33,167795,365],{"class":167},[33,167797,167798],{"class":54},"\"--convert-to\"",[33,167800,365],{"class":167},[33,167802,15519],{"class":54},[33,167804,247],{"class":167},[33,167806,167807,167810,167812,167814,167817,167819],{"class":35,"line":788},[33,167808,167809],{"class":54},"                     
\"--outdir\"",[33,167811,365],{"class":167},[33,167813,1053],{"class":50},[33,167815,167816],{"class":167},"(out_subdir), ",[33,167818,1053],{"class":50},[33,167820,167821],{"class":167},"(docx_path)],\n",[33,167823,167824,167827,167829,167831,167833,167835,167837,167839,167841,167843,167845,167847],{"class":35,"line":804},[33,167825,167826],{"class":238},"                    capture_output",[33,167828,242],{"class":163},[33,167830,855],{"class":50},[33,167832,365],{"class":167},[33,167834,2000],{"class":238},[33,167836,242],{"class":163},[33,167838,855],{"class":50},[33,167840,365],{"class":167},[33,167842,1641],{"class":238},[33,167844,242],{"class":163},[33,167846,2589],{"class":50},[33,167848,247],{"class":167},[33,167850,167851],{"class":35,"line":809},[33,167852,97316],{"class":167},[33,167854,167855,167857,167859,167861,167863],{"class":35,"line":819},[33,167856,7170],{"class":163},[33,167858,35108],{"class":167},[33,167860,17877],{"class":163},[33,167862,10791],{"class":50},[33,167864,574],{"class":167},[33,167866,167867,167869,167871],{"class":35,"line":829},[33,167868,72798],{"class":163},[33,167870,7590],{"class":50},[33,167872,167252],{"class":167},[33,167874,167875],{"class":35,"line":834},[33,167876,167877],{"class":167},"            
converted.append(pdf_path)\n",[33,167879,167880,167882,167884,167886,167888,167890,167893,167895,167897],{"class":35,"line":839},[33,167881,9364],{"class":50},[33,167883,602],{"class":167},[33,167885,4059],{"class":163},[33,167887,94447],{"class":54},[33,167889,1115],{"class":50},[33,167891,167892],{"class":167},"rel",[33,167894,1121],{"class":50},[33,167896,274],{"class":54},[33,167898,221],{"class":167},[33,167900,167901,167903,167905,167907],{"class":35,"line":860},[33,167902,780],{"class":163},[33,167904,783],{"class":50},[33,167906,1852],{"class":163},[33,167908,1855],{"class":167},[33,167910,167911,167913,167915,167917,167920,167922,167924,167926,167928,167930,167932,167934,167936],{"class":35,"line":887},[33,167912,9364],{"class":50},[33,167914,602],{"class":167},[33,167916,4059],{"class":163},[33,167918,167919],{"class":54},"\"  FAIL: ",[33,167921,1115],{"class":50},[33,167923,167892],{"class":167},[33,167925,1121],{"class":50},[33,167927,6242],{"class":54},[33,167929,1115],{"class":50},[33,167931,6565],{"class":167},[33,167933,1121],{"class":50},[33,167935,274],{"class":54},[33,167937,221],{"class":167},[33,167939,167940],{"class":35,"line":907},[33,167941,92],{"emptyLinePlaceholder":91},[33,167943,167944,167946],{"class":35,"line":1826},[33,167945,1332],{"class":163},[33,167947,167948],{"class":167}," converted\n",[33,167950,167951],{"class":35,"line":1844},[33,167952,92],{"emptyLinePlaceholder":91},[33,167954,167955],{"class":35,"line":1858},[33,167956,92],{"emptyLinePlaceholder":91},[33,167958,167959,167961,167963,167965,167967],{"class":35,"line":1871},[33,167960,2491],{"class":163},[33,167962,2494],{"class":50},[33,167964,2497],{"class":163},[33,167966,2500],{"class":54},[33,167968,574],{"class":167},[33,167970,167971,167973,167975],{"class":35,"line":1877},[33,167972,37112],{"class":167},[33,167974,242],{"class":163},[33,167976,167977],{"class":167}," 
batch_convert(\n",[33,167979,167980,167983,167985,167987,167990],{"class":35,"line":1883},[33,167981,167982],{"class":238},"        input_dir",[33,167984,242],{"class":163},[33,167986,15641],{"class":167},[33,167988,167989],{"class":54},"\"input_docs\"",[33,167991,1506],{"class":167},[33,167993,167994,167997,167999,168001,168003],{"class":35,"line":1915},[33,167995,167996],{"class":238},"        output_dir",[33,167998,242],{"class":163},[33,168000,15641],{"class":167},[33,168002,164542],{"class":54},[33,168004,1506],{"class":167},[33,168006,168007],{"class":35,"line":1926},[33,168008,1202],{"class":167},[33,168010,168011,168013,168015,168017,168019,168021,168024,168026,168029,168031,168034],{"class":35,"line":1932},[33,168012,7268],{"class":50},[33,168014,602],{"class":167},[33,168016,4059],{"class":163},[33,168018,274],{"class":54},[33,168020,25830],{"class":50},[33,168022,168023],{"class":54},"Converted ",[33,168025,4065],{"class":50},[33,168027,168028],{"class":167},"(results)",[33,168030,1121],{"class":50},[33,168032,168033],{"class":54}," file(s).\"",[33,168035,221],{"class":167},[18,168037,168039],{"id":168038},"_6-font-and-layout-fidelity-caveats","6. Font and Layout Fidelity Caveats",[424,168041,168043],{"id":168042},"embedded-vs-system-fonts","Embedded vs. System Fonts",[14,168045,168046,168048,168049,168051],{},[30,168047,163908],{}," asks Word to render the document, so every font Word can access — including fonts embedded in the ",[30,168050,18051],{}," — is available. 
LibreOffice renders with its own font engine; fonts embedded in the OOXML container are extracted at conversion time but system fonts referenced by name must be installed on the server.",[14,168053,168054,168057],{},[1974,168055,168056],{},"Action",": On Linux servers, install the Microsoft core fonts package to cover the most common Word typefaces:",[23,168059,168061],{"className":25,"code":168060,"language":27,"meta":28,"style":28},"# Ubuntu\u002FDebian\nsudo apt install ttf-mscorefonts-installer\nsudo fc-cache -f -v\n",[30,168062,168063,168067,168078],{"__ignoreMap":28},[33,168064,168065],{"class":35,"line":36},[33,168066,20901],{"class":39},[33,168068,168069,168071,168073,168075],{"class":35,"line":43},[33,168070,9669],{"class":46},[33,168072,57878],{"class":54},[33,168074,79],{"class":54},[33,168076,168077],{"class":54}," ttf-mscorefonts-installer\n",[33,168079,168080,168082,168084,168086],{"class":35,"line":61},[33,168081,9669],{"class":46},[33,168083,165994],{"class":54},[33,168085,35263],{"class":50},[33,168087,165999],{"class":50},[14,168089,168090,168091,1351,168094,168097,168098,168101,168102,168105],{},"If documents use custom brand fonts, copy the ",[30,168092,168093],{},".ttf",[30,168095,168096],{},".otf"," files to ",[30,168099,168100],{},"\u002Fusr\u002Fshare\u002Ffonts\u002Fcustom\u002F"," and run ",[30,168103,168104],{},"fc-cache"," again before converting.",[424,168107,168109],{"id":168108},"complex-layouts","Complex Layouts",[14,168111,168112],{},"LibreOffice may misplace:",[4211,168114,168115,168118,168121,168128],{},[4214,168116,168117],{},"Text boxes anchored to a character position (rather than the page)",[4214,168119,168120],{},"Word Art and SmartArt graphics (rendered as bitmaps at low resolution)",[4214,168122,168123,168124,168127],{},"Tables that span a page break with ",[30,168125,168126],{},"Keep together"," enabled",[4214,168129,168130],{},"Headers\u002Ffooters using linked-story 
chains",[14,168132,168133,168134,168136,168137,168139,168140,168142],{},"For documents with these features, prefer ",[30,168135,163908],{}," on a Windows CI runner, or convert ",[30,168138,18051],{}," → HTML via ",[30,168141,18041],{}," first and then use WeasyPrint for the PDF step — though HTML conversion itself loses complex formatting.",[18,168144,168146],{"id":168145},"_7-edge-cases-and-variants","7. Edge Cases and Variants",[424,168148,168150],{"id":168149},"password-protected-docx-files","Password-Protected DOCX Files",[14,168152,168153,168154,168156],{},"Both engines will fail silently or raise on encrypted ",[30,168155,18051],{}," files. Strip the password first:",[23,168158,168160],{"className":126,"code":168159,"language":47,"meta":28,"style":28},"# pip install msoffcrypto-tool\nimport msoffcrypto\nfrom pathlib import Path\nimport io\n\ndef decrypt_docx(encrypted_path: Path, password: str) -> bytes:\n    \"\"\"Return decrypted .docx bytes, ready for conversion.\"\"\"\n    try:\n        with open(encrypted_path, \"rb\") as f:\n            office_file = msoffcrypto.OfficeFile(f)\n            office_file.load_key(password=password)\n            output = io.BytesIO()\n            office_file.decrypt(output)\n            return output.getvalue()\n    except msoffcrypto.exceptions.InvalidKeyError:\n        raise ValueError(\"Incorrect password for the .docx file.\")\n",[30,168161,168162,168167,168174,168184,168190,168194,168212,168217,168223,168240,168250,168262,168271,168276,168283,168290],{"__ignoreMap":28},[33,168163,168164],{"class":35,"line":36},[33,168165,168166],{"class":39},"# pip install msoffcrypto-tool\n",[33,168168,168169,168171],{"class":35,"line":43},[33,168170,164],{"class":163},[33,168172,168173],{"class":167}," 
msoffcrypto\n",[33,168175,168176,168178,168180,168182],{"class":35,"line":61},[33,168177,190],{"class":163},[33,168179,193],{"class":167},[33,168181,164],{"class":163},[33,168183,198],{"class":167},[33,168185,168186,168188],{"class":35,"line":73},[33,168187,164],{"class":163},[33,168189,60058],{"class":167},[33,168191,168192],{"class":35,"line":88},[33,168193,92],{"emptyLinePlaceholder":91},[33,168195,168196,168198,168201,168204,168206,168208,168210],{"class":35,"line":95},[33,168197,562],{"class":163},[33,168199,168200],{"class":46}," decrypt_docx",[33,168202,168203],{"class":167},"(encrypted_path: Path, password: ",[33,168205,1053],{"class":50},[33,168207,1617],{"class":167},[33,168209,1620],{"class":50},[33,168211,574],{"class":167},[33,168213,168214],{"class":35,"line":101},[33,168215,168216],{"class":54},"    \"\"\"Return decrypted .docx bytes, ready for conversion.\"\"\"\n",[33,168218,168219,168221],{"class":35,"line":171},[33,168220,2424],{"class":163},[33,168222,574],{"class":167},[33,168224,168225,168227,168229,168232,168234,168236,168238],{"class":35,"line":179},[33,168226,2191],{"class":163},[33,168228,68213],{"class":50},[33,168230,168231],{"class":167},"(encrypted_path, ",[33,168233,68219],{"class":54},[33,168235,1649],{"class":167},[33,168237,495],{"class":163},[33,168239,77684],{"class":167},[33,168241,168242,168245,168247],{"class":35,"line":187},[33,168243,168244],{"class":167},"            office_file ",[33,168246,242],{"class":163},[33,168248,168249],{"class":167}," msoffcrypto.OfficeFile(f)\n",[33,168251,168252,168255,168257,168259],{"class":35,"line":201},[33,168253,168254],{"class":167},"            office_file.load_key(",[33,168256,39563],{"class":238},[33,168258,242],{"class":163},[33,168260,168261],{"class":167},"password)\n",[33,168263,168264,168267,168269],{"class":35,"line":206},[33,168265,168266],{"class":167},"            output 
",[33,168268,242],{"class":163},[33,168270,61918],{"class":167},[33,168272,168273],{"class":35,"line":224},[33,168274,168275],{"class":167},"            office_file.decrypt(output)\n",[33,168277,168278,168280],{"class":35,"line":229},[33,168279,28782],{"class":163},[33,168281,168282],{"class":167}," output.getvalue()\n",[33,168284,168285,168287],{"class":35,"line":235},[33,168286,2449],{"class":163},[33,168288,168289],{"class":167}," msoffcrypto.exceptions.InvalidKeyError:\n",[33,168291,168292,168294,168296,168298,168301],{"class":35,"line":250},[33,168293,4051],{"class":163},[33,168295,4054],{"class":50},[33,168297,602],{"class":167},[33,168299,168300],{"class":54},"\"Incorrect password for the .docx file.\"",[33,168302,221],{"class":167},[424,168304,168306],{"id":168305},"skipping-already-converted-files","Skipping Already-Converted Files",[14,168308,168309,168310,20891],{},"In incremental runs you want to skip files whose PDF is newer than the source ",[30,168311,18051],{},[23,168313,168315],{"className":126,"code":168314,"language":47,"meta":28,"style":28},"from pathlib import Path\n\ndef needs_conversion(docx_path: Path, pdf_path: Path) -> bool:\n    \"\"\"Return True if the PDF is missing or older than the .docx.\"\"\"\n    if not pdf_path.exists():\n        return True\n    return docx_path.stat().st_mtime > pdf_path.stat().st_mtime\n",[30,168316,168317,168327,168331,168345,168350,168358,168364],{"__ignoreMap":28},[33,168318,168319,168321,168323,168325],{"class":35,"line":36},[33,168320,190],{"class":163},[33,168322,193],{"class":167},[33,168324,164],{"class":163},[33,168326,198],{"class":167},[33,168328,168329],{"class":35,"line":43},[33,168330,92],{"emptyLinePlaceholder":91},[33,168332,168333,168335,168338,168341,168343],{"class":35,"line":61},[33,168334,562],{"class":163},[33,168336,168337],{"class":46}," needs_conversion",[33,168339,168340],{"class":167},"(docx_path: Path, pdf_path: Path) -> 
",[33,168342,2821],{"class":50},[33,168344,574],{"class":167},[33,168346,168347],{"class":35,"line":73},[33,168348,168349],{"class":54},"    \"\"\"Return True if the PDF is missing or older than the .docx.\"\"\"\n",[33,168351,168352,168354,168356],{"class":35,"line":88},[33,168353,617],{"class":163},[33,168355,620],{"class":163},[33,168357,21595],{"class":167},[33,168359,168360,168362],{"class":35,"line":95},[33,168361,1659],{"class":163},[33,168363,2887],{"class":50},[33,168365,168366,168368,168371,168373],{"class":35,"line":101},[33,168367,1332],{"class":163},[33,168369,168370],{"class":167}," docx_path.stat().st_mtime ",[33,168372,6009],{"class":163},[33,168374,168375],{"class":167}," pdf_path.stat().st_mtime\n",[424,168377,168379],{"id":168378},"running-libreoffice-in-docker","Running LibreOffice in Docker",[14,168381,168382],{},"For reproducible server deployments avoid relying on system LibreOffice. A minimal Dockerfile:",[23,168384,168386],{"className":36048,"code":168385,"language":36050,"meta":28,"style":28},"FROM python:3.12-slim\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n    libreoffice \\\n    ttf-mscorefonts-installer \\\n    && rm -rf \u002Fvar\u002Flib\u002Fapt\u002Flists\u002F*\nWORKDIR \u002Fapp\nCOPY . 
.\nRUN pip install --no-cache-dir docx2pdf  # kept for Windows\u002FmacOS parity; Linux path uses soffice\nCMD [\"python\", \"batch_convert.py\"]\n",[30,168387,168388,168392,168396,168401,168406,168410,168414,168419,168424],{"__ignoreMap":28},[33,168389,168390],{"class":35,"line":36},[33,168391,36057],{},[33,168393,168394],{"class":35,"line":43},[33,168395,42440],{},[33,168397,168398],{"class":35,"line":61},[33,168399,168400],{},"    libreoffice \\\n",[33,168402,168403],{"class":35,"line":73},[33,168404,168405],{},"    ttf-mscorefonts-installer \\\n",[33,168407,168408],{"class":35,"line":88},[33,168409,42460],{},[33,168411,168412],{"class":35,"line":95},[33,168413,36128],{},[33,168415,168416],{"class":35,"line":101},[33,168417,168418],{},"COPY . .\n",[33,168420,168421],{"class":35,"line":171},[33,168422,168423],{},"RUN pip install --no-cache-dir docx2pdf  # kept for Windows\u002FmacOS parity; Linux path uses soffice\n",[33,168425,168426],{"class":35,"line":179},[33,168427,168428],{},"CMD [\"python\", \"batch_convert.py\"]\n",[18,168430,168432],{"id":168431},"_8-validation","8. 
Validation",[14,168434,168435,168436,168438],{},"After conversion, open-check the PDFs with ",[30,168437,65045],{}," to confirm page count and that the file is not corrupt:",[23,168440,168442],{"className":126,"code":168441,"language":47,"meta":28,"style":28},"# pip install pypdf\nfrom pathlib import Path\nfrom pypdf import PdfReader\n\ndef validate_pdf(pdf_path: Path, expected_min_pages: int = 1) -> bool:\n    \"\"\"Return True if the PDF is readable and has at least expected_min_pages.\"\"\"\n    try:\n        reader = PdfReader(pdf_path)\n        actual = len(reader.pages)\n        if actual \u003C expected_min_pages:\n            print(f\"[WARN] {pdf_path.name}: only {actual} page(s), expected ≥ {expected_min_pages}\")\n            return False\n        return True\n    except Exception as exc:\n        print(f\"[FAIL] {pdf_path.name}: {exc}\")\n        return False\n\n# Quick batch check\noutput_dir = Path(\"output_pdfs\")\nfor pdf in output_dir.rglob(\"*.pdf\"):\n    validate_pdf(pdf)\n",[30,168443,168444,168448,168458,168468,168472,168493,168498,168504,168513,168523,168534,168572,168578,168584,168594,168623,168629,168633,168638,168650,168665],{"__ignoreMap":28},[33,168445,168446],{"class":35,"line":36},[33,168447,57316],{"class":39},[33,168449,168450,168452,168454,168456],{"class":35,"line":43},[33,168451,190],{"class":163},[33,168453,193],{"class":167},[33,168455,164],{"class":163},[33,168457,198],{"class":167},[33,168459,168460,168462,168464,168466],{"class":35,"line":61},[33,168461,190],{"class":163},[33,168463,57333],{"class":167},[33,168465,164],{"class":163},[33,168467,57338],{"class":167},[33,168469,168470],{"class":35,"line":73},[33,168471,92],{"emptyLinePlaceholder":91},[33,168473,168474,168476,168478,168481,168483,168485,168487,168489,168491],{"class":35,"line":88},[33,168475,562],{"class":163},[33,168477,62458],{"class":46},[33,168479,168480],{"class":167},"(pdf_path: Path, expected_min_pages: 
",[33,168482,1059],{"class":50},[33,168484,212],{"class":163},[33,168486,1814],{"class":50},[33,168488,1617],{"class":167},[33,168490,2821],{"class":50},[33,168492,574],{"class":167},[33,168494,168495],{"class":35,"line":95},[33,168496,168497],{"class":54},"    \"\"\"Return True if the PDF is readable and has at least expected_min_pages.\"\"\"\n",[33,168499,168500,168502],{"class":35,"line":101},[33,168501,2424],{"class":163},[33,168503,574],{"class":167},[33,168505,168506,168508,168510],{"class":35,"line":171},[33,168507,62484],{"class":167},[33,168509,242],{"class":163},[33,168511,168512],{"class":167}," PdfReader(pdf_path)\n",[33,168514,168515,168517,168519,168521],{"class":35,"line":179},[33,168516,25149],{"class":167},[33,168518,242],{"class":163},[33,168520,4037],{"class":50},[33,168522,70691],{"class":167},[33,168524,168525,168527,168529,168531],{"class":35,"line":187},[33,168526,8221],{"class":163},[33,168528,25170],{"class":167},[33,168530,4043],{"class":163},[33,168532,168533],{"class":167}," expected_min_pages:\n",[33,168535,168536,168538,168540,168542,168544,168546,168548,168550,168552,168554,168556,168558,168561,168563,168566,168568,168570],{"class":35,"line":201},[33,168537,9364],{"class":50},[33,168539,602],{"class":167},[33,168541,4059],{"class":163},[33,168543,159334],{"class":54},[33,168545,1115],{"class":50},[33,168547,27186],{"class":167},[33,168549,1121],{"class":50},[33,168551,159343],{"class":54},[33,168553,1115],{"class":50},[33,168555,25201],{"class":167},[33,168557,1121],{"class":50},[33,168559,168560],{"class":54}," page(s), expected ≥ 
",[33,168562,1115],{"class":50},[33,168564,168565],{"class":167},"expected_min_pages",[33,168567,1121],{"class":50},[33,168569,274],{"class":54},[33,168571,221],{"class":167},[33,168573,168574,168576],{"class":35,"line":206},[33,168575,28782],{"class":163},[33,168577,2903],{"class":50},[33,168579,168580,168582],{"class":35,"line":224},[33,168581,1659],{"class":163},[33,168583,2887],{"class":50},[33,168585,168586,168588,168590,168592],{"class":35,"line":229},[33,168587,2449],{"class":163},[33,168589,783],{"class":50},[33,168591,1852],{"class":163},[33,168593,1855],{"class":167},[33,168595,168596,168598,168600,168602,168605,168607,168609,168611,168613,168615,168617,168619,168621],{"class":35,"line":235},[33,168597,9414],{"class":50},[33,168599,602],{"class":167},[33,168601,4059],{"class":163},[33,168603,168604],{"class":54},"\"[FAIL] ",[33,168606,1115],{"class":50},[33,168608,27186],{"class":167},[33,168610,1121],{"class":50},[33,168612,2079],{"class":54},[33,168614,1115],{"class":50},[33,168616,6565],{"class":167},[33,168618,1121],{"class":50},[33,168620,274],{"class":54},[33,168622,221],{"class":167},[33,168624,168625,168627],{"class":35,"line":250},[33,168626,1659],{"class":163},[33,168628,2903],{"class":50},[33,168630,168631],{"class":35,"line":266},[33,168632,92],{"emptyLinePlaceholder":91},[33,168634,168635],{"class":35,"line":290},[33,168636,168637],{"class":39},"# Quick batch check\n",[33,168639,168640,168642,168644,168646,168648],{"class":35,"line":295},[33,168641,164535],{"class":167},[33,168643,242],{"class":163},[33,168645,215],{"class":167},[33,168647,164542],{"class":54},[33,168649,221],{"class":167},[33,168651,168652,168654,168656,168658,168661,168663],{"class":35,"line":300},[33,168653,6124],{"class":163},[33,168655,67712],{"class":167},[33,168657,662],{"class":163},[33,168659,168660],{"class":167}," 
output_dir.rglob(",[33,168662,610],{"class":54},[33,168664,1737],{"class":167},[33,168666,168667],{"class":35,"line":317},[33,168668,168669],{"class":167},"    validate_pdf(pdf)\n",[18,168671,168673],{"id":168672},"_9-performance-and-scale","9. Performance and Scale",[14,168675,168676,168679,168680,168682],{},[1974,168677,168678],{},"LibreOffice parallelism",": Each ",[30,168681,164121],{}," process locks its user-profile directory. Running multiple conversions in parallel fails unless each process gets its own profile:",[23,168684,168686],{"className":126,"code":168685,"language":47,"meta":28,"style":28},"import subprocess\nimport tempfile\nfrom pathlib import Path\n\ndef soffice_convert_isolated(docx_path: Path, output_dir: Path) -> None:\n    \"\"\"Run soffice with a per-process user profile to allow parallelism.\"\"\"\n    with tempfile.TemporaryDirectory() as tmp:\n        profile_dir = Path(tmp) \u002F \"lo_profile\"\n        profile_dir.mkdir()\n        subprocess.run(\n            [\n                \"soffice\",\n                f\"-env:UserInstallation=file:\u002F\u002F{profile_dir}\",\n                \"--headless\",\n                \"--convert-to\", \"pdf\",\n                \"--outdir\", str(output_dir),\n                str(docx_path),\n            ],\n            capture_output=True, text=True, timeout=120, check=True,\n        
)\n",[30,168687,168688,168694,168700,168710,168714,168727,168732,168742,168755,168760,168765,168769,168775,168792,168798,168808,168818,168824,168828,168863],{"__ignoreMap":28},[33,168689,168690,168692],{"class":35,"line":36},[33,168691,164],{"class":163},[33,168693,35040],{"class":167},[33,168695,168696,168698],{"class":35,"line":43},[33,168697,164],{"class":163},[33,168699,70055],{"class":167},[33,168701,168702,168704,168706,168708],{"class":35,"line":61},[33,168703,190],{"class":163},[33,168705,193],{"class":167},[33,168707,164],{"class":163},[33,168709,198],{"class":167},[33,168711,168712],{"class":35,"line":73},[33,168713,92],{"emptyLinePlaceholder":91},[33,168715,168716,168718,168721,168723,168725],{"class":35,"line":88},[33,168717,562],{"class":163},[33,168719,168720],{"class":46}," soffice_convert_isolated",[33,168722,164942],{"class":167},[33,168724,571],{"class":50},[33,168726,574],{"class":167},[33,168728,168729],{"class":35,"line":95},[33,168730,168731],{"class":54},"    \"\"\"Run soffice with a per-process user profile to allow parallelism.\"\"\"\n",[33,168733,168734,168736,168738,168740],{"class":35,"line":101},[33,168735,1635],{"class":163},[33,168737,164958],{"class":167},[33,168739,495],{"class":163},[33,168741,159915],{"class":167},[33,168743,168744,168747,168749,168751,168753],{"class":35,"line":171},[33,168745,168746],{"class":167},"        profile_dir ",[33,168748,242],{"class":163},[33,168750,164972],{"class":167},[33,168752,1351],{"class":163},[33,168754,164977],{"class":54},[33,168756,168757],{"class":35,"line":179},[33,168758,168759],{"class":167},"        profile_dir.mkdir()\n",[33,168761,168762],{"class":35,"line":187},[33,168763,168764],{"class":167},"        
subprocess.run(\n",[33,168766,168767],{"class":35,"line":201},[33,168768,164995],{"class":167},[33,168770,168771,168773],{"class":35,"line":206},[33,168772,165000],{"class":54},[33,168774,247],{"class":167},[33,168776,168777,168779,168781,168783,168786,168788,168790],{"class":35,"line":224},[33,168778,23946],{"class":163},[33,168780,165009],{"class":54},[33,168782,1115],{"class":50},[33,168784,168785],{"class":167},"profile_dir",[33,168787,1121],{"class":50},[33,168789,274],{"class":54},[33,168791,247],{"class":167},[33,168793,168794,168796],{"class":35,"line":229},[33,168795,165028],{"class":54},[33,168797,247],{"class":167},[33,168799,168800,168802,168804,168806],{"class":35,"line":235},[33,168801,165035],{"class":54},[33,168803,365],{"class":167},[33,168805,15519],{"class":54},[33,168807,247],{"class":167},[33,168809,168810,168812,168814,168816],{"class":35,"line":250},[33,168811,165046],{"class":54},[33,168813,365],{"class":167},[33,168815,1053],{"class":50},[33,168817,165053],{"class":167},[33,168819,168820,168822],{"class":35,"line":266},[33,168821,7879],{"class":50},[33,168823,164353],{"class":167},[33,168825,168826],{"class":35,"line":290},[33,168827,11436],{"class":167},[33,168829,168830,168832,168834,168836,168838,168840,168842,168844,168846,168848,168850,168852,168854,168857,168859,168861],{"class":35,"line":295},[33,168831,165068],{"class":238},[33,168833,242],{"class":163},[33,168835,855],{"class":50},[33,168837,365],{"class":167},[33,168839,2000],{"class":238},[33,168841,242],{"class":163},[33,168843,855],{"class":50},[33,168845,365],{"class":167},[33,168847,1641],{"class":238},[33,168849,242],{"class":163},[33,168851,2589],{"class":50},[33,168853,365],{"class":167},[33,168855,168856],{"class":238},"check",[33,168858,242],{"class":163},[33,168860,855],{"class":50},[33,168862,247],{"class":167},[33,168864,168865],{"class":35,"line":300},[33,168866,5867],{"class":167},[14,168868,168869,168872,168873,168875,168876,168878],{},[1974,168870,168871],{},"docx2
pdf parallelism",": On Windows, ",[30,168874,163908],{}," supports passing a directory path. For true parallelism, spawn multiple Word processes via separate COM instances — but in practice this is rarely stable. Prefer sequential conversion with ",[30,168877,163908],{}," and parallel conversion with LibreOffice.",[14,168880,168881,168883,168884,168887],{},[1974,168882,4218],{},": LibreOffice loads the full document model into memory. Documents with many embedded images can exceed 2 GB RAM per process. Monitor with ",[30,168885,168886],{},"psutil"," and cap concurrent workers accordingly.",[18,168889,168891],{"id":168890},"_10-troubleshooting","10. Troubleshooting",[4273,168893,168894,168904],{},[4276,168895,168896],{},[4279,168897,168898,168900,168902],{},[4282,168899,14317],{"align":128901},[4282,168901,4287],{"align":128901},[4282,168903,4290],{"align":128901},[4292,168905,168906,168922,168935,168950,168964,168979],{},[4279,168907,168908,168912,168917],{},[4297,168909,168910],{"align":128901},[30,168911,166046],{},[4297,168913,168914,168916],{"align":128901},[30,168915,163908],{}," requires Word, unavailable on Linux",[4297,168918,168919,168920],{"align":128901},"Switch to LibreOffice headless; see ",[940,168921,163898],{"href":167362},[4279,168923,168924,168929,168932],{},[4297,168925,168926],{"align":128901},[30,168927,168928],{},"com_error: -2147221005",[4297,168930,168931],{"align":128901},"Word COM server not registered \u002F Word not installed",[4297,168933,168934],{"align":128901},"Install Microsoft Word or use LibreOffice",[4279,168936,168937,168942,168945],{},[4297,168938,168939],{"align":128901},[30,168940,168941],{},"soffice: command not found",[4297,168943,168944],{"align":128901},"LibreOffice not on PATH",[4297,168946,168947,168949],{"align":128901},[30,168948,166080],{}," or add LibreOffice bin dir to 
PATH",[4279,168951,168952,168956,168959],{},[4297,168953,168954],{"align":128901},[30,168955,166091],{},[4297,168957,168958],{"align":128901},"LibreOffice profile dir locked or read-only",[4297,168960,17059,168961,168963],{"align":128901},[30,168962,165557],{}," with a per-run temp dir (see section 9)",[4279,168965,168966,168969,168972],{},[4297,168967,168968],{"align":128901},"Garbled or missing text in PDF",[4297,168970,168971],{"align":128901},"Font not installed on Linux server",[4297,168973,166110,168974,168976,168977],{"align":128901},[30,168975,165867],{}," and custom fonts; run ",[30,168978,168104],{},[4279,168980,168981,168984,168989],{},[4297,168982,168983],{"align":128901},"PDF page count is 0",[4297,168985,168986,168987],{"align":128901},"Empty or corrupt ",[30,168988,18051],{},[4297,168990,168991,168992,168994,168995],{"align":128901},"Validate with ",[30,168993,18041],{}," before converting: ",[30,168996,168997],{},"Document(path).paragraphs",[18,168999,169001],{"id":169000},"_11-complete-script","11. 
Complete Script",[23,169003,169005],{"className":126,"code":169004,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n\"\"\"\nbatch_docx_to_pdf.py — Convert a folder of .docx files to PDF.\n\nUsage:\n    python batch_docx_to_pdf.py input_docs\u002F output_pdfs\u002F\n\nEngines:\n    Windows \u002F macOS: docx2pdf (requires Microsoft Word)\n    Linux \u002F server:  LibreOffice headless (requires soffice on PATH)\n\npip install docx2pdf pypdf    # docx2pdf only needed on Windows\u002FmacOS\n\"\"\"\nimport argparse\nimport platform\nimport subprocess\nimport tempfile\nfrom pathlib import Path\n\ntry:\n    from pypdf import PdfReader\n    PYPDF_AVAILABLE = True\nexcept ImportError:\n    PYPDF_AVAILABLE = False\n\nSYSTEM = platform.system()\n\n\ndef soffice_convert(docx_path: Path, output_dir: Path) -> None:\n    \"\"\"Convert via LibreOffice headless with an isolated user profile.\"\"\"\n    with tempfile.TemporaryDirectory() as tmp:\n        profile = Path(tmp) \u002F \"lo_profile\"\n        profile.mkdir()\n        result = subprocess.run(\n            [\n                \"soffice\",\n                f\"-env:UserInstallation=file:\u002F\u002F{profile}\",\n                \"--headless\",\n                \"--convert-to\", \"pdf\",\n                \"--outdir\", str(output_dir),\n                str(docx_path),\n            ],\n            capture_output=True, text=True, timeout=120,\n        )\n        if result.returncode != 0:\n            raise RuntimeError(result.stderr.strip() or \"soffice exited non-zero\")\n\n\ndef docx2pdf_convert(docx_path: Path, output_path: Path) -> None:\n    \"\"\"Convert via docx2pdf (Windows\u002FmacOS only).\"\"\"\n    from docx2pdf import convert  # noqa: PLC0415\n    convert(docx_path, output_path)\n\n\ndef validate(pdf_path: Path) -> bool:\n    if not PYPDF_AVAILABLE:\n        return pdf_path.exists() and pdf_path.stat().st_size > 0\n    try:\n        return len(PdfReader(pdf_path).pages) > 0\n    except 
Exception:\n        return False\n\n\ndef batch_convert(input_dir: Path, output_dir: Path, skip_existing: bool = True) -> None:\n    input_dir = input_dir.resolve()\n    output_dir = output_dir.resolve()\n    output_dir.mkdir(parents=True, exist_ok=True)\n\n    files = sorted(input_dir.rglob(\"*.docx\"))\n    if not files:\n        print(\"No .docx files found.\")\n        return\n\n    ok = fail = skipped = 0\n\n    for docx_path in files:\n        rel = docx_path.relative_to(input_dir)\n        out_subdir = output_dir \u002F rel.parent\n        out_subdir.mkdir(parents=True, exist_ok=True)\n        pdf_path = out_subdir \u002F (docx_path.stem + \".pdf\")\n\n        if skip_existing and pdf_path.exists() and pdf_path.stat().st_mtime >= docx_path.stat().st_mtime:\n            skipped += 1\n            continue\n\n        try:\n            if SYSTEM in (\"Windows\", \"Darwin\"):\n                docx2pdf_convert(docx_path, pdf_path)\n            else:\n                soffice_convert(docx_path, out_subdir)\n\n            if validate(pdf_path):\n                print(f\"  OK:   {rel}\")\n                ok += 1\n            else:\n                print(f\"  WARN: {rel} — PDF validation failed\")\n                fail += 1\n        except Exception as exc:\n            print(f\"  FAIL: {rel} — {exc}\")\n            fail += 1\n\n    print(f\"\\nDone: {ok} converted, {fail} failed, {skipped} skipped.\")\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Batch convert .docx to PDF\")\n    parser.add_argument(\"input_dir\", type=Path)\n    parser.add_argument(\"output_dir\", type=Path)\n    parser.add_argument(\"--no-skip\", action=\"store_true\", help=\"Re-convert even if PDF exists\")\n    args = parser.parse_args()\n    batch_convert(args.input_dir, args.output_dir, skip_existing=not args.no_skip)\n\n\nif __name__ == \"__main__\":\n    
main()\n",[30,169006,169007,169011,169015,169020,169024,169028,169033,169037,169042,169047,169052,169056,169061,169065,169071,169077,169083,169089,169099,169103,169109,169119,169128,169136,169144,169148,169157,169161,169165,169178,169183,169193,169205,169209,169217,169221,169227,169243,169249,169259,169269,169275,169279,169305,169309,169321,169336,169340,169344,169358,169363,169375,169380,169384,169388,169400,169411,169427,169433,169446,169454,169460,169464,169468,169489,169497,169505,169525,169529,169543,169551,169561,169565,169569,169587,169591,169601,169609,169621,169641,169659,169663,169684,169693,169697,169701,169707,169726,169731,169737,169742,169746,169753,169774,169783,169789,169811,169820,169830,169858,169867,169871,169915,169919,169923,169935,169952,169966,169981,170007,170015,170028,170032,170036,170048],{"__ignoreMap":28},[33,169008,169009],{"class":35,"line":36},[33,169010,14447],{"class":39},[33,169012,169013],{"class":35,"line":43},[33,169014,139],{"class":54},[33,169016,169017],{"class":35,"line":61},[33,169018,169019],{"class":54},"batch_docx_to_pdf.py — Convert a folder of .docx files to PDF.\n",[33,169021,169022],{"class":35,"line":73},[33,169023,92],{"emptyLinePlaceholder":91},[33,169025,169026],{"class":35,"line":88},[33,169027,4435],{"class":54},[33,169029,169030],{"class":35,"line":95},[33,169031,169032],{"class":54},"    python batch_docx_to_pdf.py input_docs\u002F output_pdfs\u002F\n",[33,169034,169035],{"class":35,"line":101},[33,169036,92],{"emptyLinePlaceholder":91},[33,169038,169039],{"class":35,"line":171},[33,169040,169041],{"class":54},"Engines:\n",[33,169043,169044],{"class":35,"line":179},[33,169045,169046],{"class":54},"    Windows \u002F macOS: docx2pdf (requires Microsoft Word)\n",[33,169048,169049],{"class":35,"line":187},[33,169050,169051],{"class":54},"    Linux \u002F server:  LibreOffice headless (requires soffice on 
PATH)\n",[33,169053,169054],{"class":35,"line":201},[33,169055,92],{"emptyLinePlaceholder":91},[33,169057,169058],{"class":35,"line":206},[33,169059,169060],{"class":54},"pip install docx2pdf pypdf    # docx2pdf only needed on Windows\u002FmacOS\n",[33,169062,169063],{"class":35,"line":224},[33,169064,139],{"class":54},[33,169066,169067,169069],{"class":35,"line":229},[33,169068,164],{"class":163},[33,169070,4461],{"class":167},[33,169072,169073,169075],{"class":35,"line":235},[33,169074,164],{"class":163},[33,169076,163978],{"class":167},[33,169078,169079,169081],{"class":35,"line":250},[33,169080,164],{"class":163},[33,169082,35040],{"class":167},[33,169084,169085,169087],{"class":35,"line":266},[33,169086,164],{"class":163},[33,169088,70055],{"class":167},[33,169090,169091,169093,169095,169097],{"class":35,"line":290},[33,169092,190],{"class":163},[33,169094,193],{"class":167},[33,169096,164],{"class":163},[33,169098,198],{"class":167},[33,169100,169101],{"class":35,"line":295},[33,169102,92],{"emptyLinePlaceholder":91},[33,169104,169105,169107],{"class":35,"line":300},[33,169106,35574],{"class":163},[33,169108,574],{"class":167},[33,169110,169111,169113,169115,169117],{"class":35,"line":317},[33,169112,3878],{"class":163},[33,169114,57333],{"class":167},[33,169116,164],{"class":163},[33,169118,57338],{"class":167},[33,169120,169121,169124,169126],{"class":35,"line":332},[33,169122,169123],{"class":50},"    
PYPDF_AVAILABLE",[33,169125,212],{"class":163},[33,169127,2887],{"class":50},[33,169129,169130,169132,169134],{"class":35,"line":347},[33,169131,35726],{"class":163},[33,169133,40488],{"class":50},[33,169135,574],{"class":167},[33,169137,169138,169140,169142],{"class":35,"line":374},[33,169139,169123],{"class":50},[33,169141,212],{"class":163},[33,169143,2903],{"class":50},[33,169145,169146],{"class":35,"line":397},[33,169147,92],{"emptyLinePlaceholder":91},[33,169149,169150,169153,169155],{"class":35,"line":653},[33,169151,169152],{"class":50},"SYSTEM",[33,169154,212],{"class":163},[33,169156,164714],{"class":167},[33,169158,169159],{"class":35,"line":667},[33,169160,92],{"emptyLinePlaceholder":91},[33,169162,169163],{"class":35,"line":675},[33,169164,92],{"emptyLinePlaceholder":91},[33,169166,169167,169169,169172,169174,169176],{"class":35,"line":689},[33,169168,562],{"class":163},[33,169170,169171],{"class":46}," soffice_convert",[33,169173,164942],{"class":167},[33,169175,571],{"class":50},[33,169177,574],{"class":167},[33,169179,169180],{"class":35,"line":703},[33,169181,169182],{"class":54},"    \"\"\"Convert via LibreOffice headless with an isolated user 
profile.\"\"\"\n",[33,169184,169185,169187,169189,169191],{"class":35,"line":714},[33,169186,1635],{"class":163},[33,169188,164958],{"class":167},[33,169190,495],{"class":163},[33,169192,159915],{"class":167},[33,169194,169195,169197,169199,169201,169203],{"class":35,"line":723},[33,169196,164967],{"class":167},[33,169198,242],{"class":163},[33,169200,164972],{"class":167},[33,169202,1351],{"class":163},[33,169204,164977],{"class":54},[33,169206,169207],{"class":35,"line":754},[33,169208,164982],{"class":167},[33,169210,169211,169213,169215],{"class":35,"line":771},[33,169212,87961],{"class":167},[33,169214,242],{"class":163},[33,169216,35060],{"class":167},[33,169218,169219],{"class":35,"line":777},[33,169220,164995],{"class":167},[33,169222,169223,169225],{"class":35,"line":788},[33,169224,165000],{"class":54},[33,169226,247],{"class":167},[33,169228,169229,169231,169233,169235,169237,169239,169241],{"class":35,"line":804},[33,169230,23946],{"class":163},[33,169232,165009],{"class":54},[33,169234,1115],{"class":50},[33,169236,165014],{"class":167},[33,169238,1121],{"class":50},[33,169240,274],{"class":54},[33,169242,247],{"class":167},[33,169244,169245,169247],{"class":35,"line":809},[33,169246,165028],{"class":54},[33,169248,247],{"class":167},[33,169250,169251,169253,169255,169257],{"class":35,"line":819},[33,169252,165035],{"class":54},[33,169254,365],{"class":167},[33,169256,15519],{"class":54},[33,169258,247],{"class":167},[33,169260,169261,169263,169265,169267],{"class":35,"line":829},[33,169262,165046],{"class":54},[33,169264,365],{"class":167},[33,169266,1053],{"class":50},[33,169268,165053],{"class":167},[33,169270,169271,169273],{"class":35,"line":834},[33,169272,7879],{"class":50},[33,169274,164353],{"class":167},[33,169276,169277],{"class":35,"line":839},[33,169278,11436],{"class":167},[33,169280,169281,169283,169285,169287,169289,169291,169293,169295,169297,169299,169301,169303],{"class":35,"line":860},[33,169282,165068],{"class":238},[33,169284,242],
{"class":163},[33,169286,855],{"class":50},[33,169288,365],{"class":167},[33,169290,2000],{"class":238},[33,169292,242],{"class":163},[33,169294,855],{"class":50},[33,169296,365],{"class":167},[33,169298,1641],{"class":238},[33,169300,242],{"class":163},[33,169302,2589],{"class":50},[33,169304,247],{"class":167},[33,169306,169307],{"class":35,"line":887},[33,169308,5867],{"class":167},[33,169310,169311,169313,169315,169317,169319],{"class":35,"line":907},[33,169312,8221],{"class":163},[33,169314,35108],{"class":167},[33,169316,17877],{"class":163},[33,169318,10791],{"class":50},[33,169320,574],{"class":167},[33,169322,169323,169325,169327,169329,169331,169334],{"class":35,"line":1826},[33,169324,59715],{"class":163},[33,169326,7590],{"class":50},[33,169328,165121],{"class":167},[33,169330,7162],{"class":163},[33,169332,169333],{"class":54}," \"soffice exited non-zero\"",[33,169335,221],{"class":167},[33,169337,169338],{"class":35,"line":1844},[33,169339,92],{"emptyLinePlaceholder":91},[33,169341,169342],{"class":35,"line":1858},[33,169343,92],{"emptyLinePlaceholder":91},[33,169345,169346,169348,169351,169354,169356],{"class":35,"line":1871},[33,169347,562],{"class":163},[33,169349,169350],{"class":46}," docx2pdf_convert",[33,169352,169353],{"class":167},"(docx_path: Path, output_path: Path) -> ",[33,169355,571],{"class":50},[33,169357,574],{"class":167},[33,169359,169360],{"class":35,"line":1877},[33,169361,169362],{"class":54},"    \"\"\"Convert via docx2pdf (Windows\u002FmacOS only).\"\"\"\n",[33,169364,169365,169367,169369,169371,169373],{"class":35,"line":1883},[33,169366,3878],{"class":163},[33,169368,164024],{"class":167},[33,169370,164],{"class":163},[33,169372,164855],{"class":167},[33,169374,164858],{"class":39},[33,169376,169377],{"class":35,"line":1915},[33,169378,169379],{"class":167},"    convert(docx_path, 
output_path)\n",[33,169381,169382],{"class":35,"line":1926},[33,169383,92],{"emptyLinePlaceholder":91},[33,169385,169386],{"class":35,"line":1932},[33,169387,92],{"emptyLinePlaceholder":91},[33,169389,169390,169392,169394,169396,169398],{"class":35,"line":1938},[33,169391,562],{"class":163},[33,169393,25052],{"class":46},[33,169395,37097],{"class":167},[33,169397,2821],{"class":50},[33,169399,574],{"class":167},[33,169401,169402,169404,169406,169409],{"class":35,"line":1950},[33,169403,617],{"class":163},[33,169405,620],{"class":163},[33,169407,169408],{"class":50}," PYPDF_AVAILABLE",[33,169410,574],{"class":167},[33,169412,169413,169415,169418,169420,169423,169425],{"class":35,"line":1958},[33,169414,1659],{"class":163},[33,169416,169417],{"class":167}," pdf_path.exists() ",[33,169419,6001],{"class":163},[33,169421,169422],{"class":167}," pdf_path.stat().st_size ",[33,169424,6009],{"class":163},[33,169426,28914],{"class":50},[33,169428,169429,169431],{"class":35,"line":4904},[33,169430,2424],{"class":163},[33,169432,574],{"class":167},[33,169434,169435,169437,169439,169442,169444],{"class":35,"line":4909},[33,169436,1659],{"class":163},[33,169438,4037],{"class":50},[33,169440,169441],{"class":167},"(PdfReader(pdf_path).pages) ",[33,169443,6009],{"class":163},[33,169445,28914],{"class":50},[33,169447,169448,169450,169452],{"class":35,"line":4915},[33,169449,2449],{"class":163},[33,169451,783],{"class":50},[33,169453,574],{"class":167},[33,169455,169456,169458],{"class":35,"line":4925},[33,169457,1659],{"class":163},[33,169459,2903],{"class":50},[33,169461,169462],{"class":35,"line":4935},[33,169463,92],{"emptyLinePlaceholder":91},[33,169465,169466],{"class":35,"line":4941},[33,169467,92],{"emptyLinePlaceholder":91},[33,169469,169470,169472,169474,169477,169479,169481,169483,169485,169487],{"class":35,"line":4950},[33,169471,562],{"class":163},[33,169473,167525],{"class":46},[33,169475,169476],{"class":167},"(input_dir: Path, output_dir: Path, skip_existing: 
",[33,169478,2821],{"class":50},[33,169480,212],{"class":163},[33,169482,2519],{"class":50},[33,169484,1617],{"class":167},[33,169486,571],{"class":50},[33,169488,574],{"class":167},[33,169490,169491,169493,169495],{"class":35,"line":4960},[33,169492,69437],{"class":167},[33,169494,242],{"class":163},[33,169496,167542],{"class":167},[33,169498,169499,169501,169503],{"class":35,"line":4965},[33,169500,22180],{"class":167},[33,169502,242],{"class":163},[33,169504,164269],{"class":167},[33,169506,169507,169509,169511,169513,169515,169517,169519,169521,169523],{"class":35,"line":4971},[33,169508,6346],{"class":167},[33,169510,869],{"class":238},[33,169512,242],{"class":163},[33,169514,855],{"class":50},[33,169516,365],{"class":167},[33,169518,878],{"class":238},[33,169520,242],{"class":163},[33,169522,855],{"class":50},[33,169524,221],{"class":167},[33,169526,169527],{"class":35,"line":4983},[33,169528,92],{"emptyLinePlaceholder":91},[33,169530,169531,169533,169535,169537,169539,169541],{"class":35,"line":4988},[33,169532,74826],{"class":167},[33,169534,242],{"class":163},[33,169536,28924],{"class":50},[33,169538,167586],{"class":167},[33,169540,167589],{"class":54},[33,169542,371],{"class":167},[33,169544,169545,169547,169549],{"class":35,"line":4993},[33,169546,617],{"class":163},[33,169548,620],{"class":163},[33,169550,74854],{"class":167},[33,169552,169553,169555,169557,169559],{"class":35,"line":5003},[33,169554,9414],{"class":50},[33,169556,602],{"class":167},[33,169558,167609],{"class":54},[33,169560,221],{"class":167},[33,169562,169563],{"class":35,"line":5008},[33,169564,646],{"class":163},[33,169566,169567],{"class":35,"line":5014},[33,169568,92],{"emptyLinePlaceholder":91},[33,169570,169571,169573,169575,169578,169580,169583,169585],{"class":35,"line":5019},[33,169572,79390],{"class":167},[33,169574,242],{"class":163},[33,169576,169577],{"class":167}," fail ",[33,169579,242],{"class":163},[33,169581,169582],{"class":167}," skipped 
",[33,169584,242],{"class":163},[33,169586,28914],{"class":50},[33,169588,169589],{"class":35,"line":5032},[33,169590,92],{"emptyLinePlaceholder":91},[33,169592,169593,169595,169597,169599],{"class":35,"line":5039},[33,169594,656],{"class":163},[33,169596,167649],{"class":167},[33,169598,662],{"class":163},[33,169600,74854],{"class":167},[33,169602,169603,169605,169607],{"class":35,"line":5068},[33,169604,167663],{"class":167},[33,169606,242],{"class":163},[33,169608,167668],{"class":167},[33,169610,169611,169613,169615,169617,169619],{"class":35,"line":5077},[33,169612,167673],{"class":167},[33,169614,242],{"class":163},[33,169616,6393],{"class":167},[33,169618,1351],{"class":163},[33,169620,167682],{"class":167},[33,169622,169623,169625,169627,169629,169631,169633,169635,169637,169639],{"class":35,"line":5082},[33,169624,167687],{"class":167},[33,169626,869],{"class":238},[33,169628,242],{"class":163},[33,169630,855],{"class":50},[33,169632,365],{"class":167},[33,169634,878],{"class":238},[33,169636,242],{"class":163},[33,169638,855],{"class":50},[33,169640,221],{"class":167},[33,169642,169643,169645,169647,169649,169651,169653,169655,169657],{"class":35,"line":5089},[33,169644,167708],{"class":167},[33,169646,242],{"class":163},[33,169648,167713],{"class":167},[33,169650,1351],{"class":163},[33,169652,164455],{"class":167},[33,169654,1811],{"class":163},[33,169656,164460],{"class":54},[33,169658,221],{"class":167},[33,169660,169661],{"class":35,"line":5098},[33,169662,92],{"emptyLinePlaceholder":91},[33,169664,169665,169667,169670,169672,169674,169676,169679,169681],{"class":35,"line":5105},[33,169666,8221],{"class":163},[33,169668,169669],{"class":167}," skip_existing ",[33,169671,6001],{"class":163},[33,169673,169417],{"class":167},[33,169675,6001],{"class":163},[33,169677,169678],{"class":167}," pdf_path.stat().st_mtime ",[33,169680,43000],{"class":163},[33,169682,169683],{"class":167}," 
docx_path.stat().st_mtime:\n",[33,169685,169686,169689,169691],{"class":35,"line":5110},[33,169687,169688],{"class":167},"            skipped ",[33,169690,28976],{"class":163},[33,169692,17709],{"class":50},[33,169694,169695],{"class":35,"line":5115},[33,169696,9330],{"class":163},[33,169698,169699],{"class":35,"line":5128},[33,169700,92],{"emptyLinePlaceholder":91},[33,169702,169703,169705],{"class":35,"line":5135},[33,169704,670],{"class":163},[33,169706,574],{"class":167},[33,169708,169709,169711,169714,169716,169718,169720,169722,169724],{"class":35,"line":5142},[33,169710,5995],{"class":163},[33,169712,169713],{"class":50}," SYSTEM",[33,169715,8002],{"class":163},[33,169717,17583],{"class":167},[33,169719,164831],{"class":54},[33,169721,365],{"class":167},[33,169723,164836],{"class":54},[33,169725,1737],{"class":167},[33,169727,169728],{"class":35,"line":5151},[33,169729,169730],{"class":167},"                docx2pdf_convert(docx_path, pdf_path)\n",[33,169732,169733,169735],{"class":35,"line":5156},[33,169734,8705],{"class":163},[33,169736,574],{"class":167},[33,169738,169739],{"class":35,"line":5161},[33,169740,169741],{"class":167},"                soffice_convert(docx_path, out_subdir)\n",[33,169743,169744],{"class":35,"line":5167},[33,169745,92],{"emptyLinePlaceholder":91},[33,169747,169748,169750],{"class":35,"line":5172},[33,169749,5995],{"class":163},[33,169751,169752],{"class":167}," validate(pdf_path):\n",[33,169754,169755,169757,169759,169761,169764,169766,169768,169770,169772],{"class":35,"line":5182},[33,169756,8264],{"class":50},[33,169758,602],{"class":167},[33,169760,4059],{"class":163},[33,169762,169763],{"class":54},"\"  OK:   ",[33,169765,1115],{"class":50},[33,169767,167892],{"class":167},[33,169769,1121],{"class":50},[33,169771,274],{"class":54},[33,169773,221],{"class":167},[33,169775,169776,169779,169781],{"class":35,"line":5195},[33,169777,169778],{"class":167},"                ok 
",[33,169780,28976],{"class":163},[33,169782,17709],{"class":50},[33,169784,169785,169787],{"class":35,"line":5200},[33,169786,8705],{"class":163},[33,169788,574],{"class":167},[33,169790,169791,169793,169795,169797,169800,169802,169804,169806,169809],{"class":35,"line":5205},[33,169792,8264],{"class":50},[33,169794,602],{"class":167},[33,169796,4059],{"class":163},[33,169798,169799],{"class":54},"\"  WARN: ",[33,169801,1115],{"class":50},[33,169803,167892],{"class":167},[33,169805,1121],{"class":50},[33,169807,169808],{"class":54}," — PDF validation failed\"",[33,169810,221],{"class":167},[33,169812,169813,169816,169818],{"class":35,"line":5210},[33,169814,169815],{"class":167},"                fail ",[33,169817,28976],{"class":163},[33,169819,17709],{"class":50},[33,169821,169822,169824,169826,169828],{"class":35,"line":5215},[33,169823,780],{"class":163},[33,169825,783],{"class":50},[33,169827,1852],{"class":163},[33,169829,1855],{"class":167},[33,169831,169832,169834,169836,169838,169840,169842,169844,169846,169848,169850,169852,169854,169856],{"class":35,"line":5220},[33,169833,9364],{"class":50},[33,169835,602],{"class":167},[33,169837,4059],{"class":163},[33,169839,167919],{"class":54},[33,169841,1115],{"class":50},[33,169843,167892],{"class":167},[33,169845,1121],{"class":50},[33,169847,6242],{"class":54},[33,169849,1115],{"class":50},[33,169851,6565],{"class":167},[33,169853,1121],{"class":50},[33,169855,274],{"class":54},[33,169857,221],{"class":167},[33,169859,169860,169863,169865],{"class":35,"line":5227},[33,169861,169862],{"class":167},"            fail 
",[33,169864,28976],{"class":163},[33,169866,17709],{"class":50},[33,169868,169869],{"class":35,"line":5232},[33,169870,92],{"emptyLinePlaceholder":91},[33,169872,169873,169875,169877,169879,169881,169883,169885,169887,169889,169891,169894,169896,169899,169901,169904,169906,169908,169910,169913],{"class":35,"line":5237},[33,169874,7268],{"class":50},[33,169876,602],{"class":167},[33,169878,4059],{"class":163},[33,169880,274],{"class":54},[33,169882,25830],{"class":50},[33,169884,87732],{"class":54},[33,169886,1115],{"class":50},[33,169888,87737],{"class":167},[33,169890,1121],{"class":50},[33,169892,169893],{"class":54}," converted, ",[33,169895,1115],{"class":50},[33,169897,169898],{"class":167},"fail",[33,169900,1121],{"class":50},[33,169902,169903],{"class":54}," failed, ",[33,169905,1115],{"class":50},[33,169907,94574],{"class":167},[33,169909,1121],{"class":50},[33,169911,169912],{"class":54}," skipped.\"",[33,169914,221],{"class":167},[33,169916,169917],{"class":35,"line":5251},[33,169918,92],{"emptyLinePlaceholder":91},[33,169920,169921],{"class":35,"line":5259},[33,169922,92],{"emptyLinePlaceholder":91},[33,169924,169925,169927,169929,169931,169933],{"class":35,"line":5264},[33,169926,562],{"class":163},[33,169928,6636],{"class":46},[33,169930,568],{"class":167},[33,169932,571],{"class":50},[33,169934,574],{"class":167},[33,169936,169937,169939,169941,169943,169945,169947,169950],{"class":35,"line":5269},[33,169938,6648],{"class":167},[33,169940,242],{"class":163},[33,169942,6653],{"class":167},[33,169944,6656],{"class":238},[33,169946,242],{"class":163},[33,169948,169949],{"class":54},"\"Batch convert .docx to 
PDF\"",[33,169951,221],{"class":167},[33,169953,169954,169956,169958,169960,169962,169964],{"class":35,"line":5283},[33,169955,6669],{"class":167},[33,169957,137636],{"class":54},[33,169959,365],{"class":167},[33,169961,6677],{"class":238},[33,169963,242],{"class":163},[33,169965,15528],{"class":167},[33,169967,169968,169970,169973,169975,169977,169979],{"class":35,"line":5293},[33,169969,6669],{"class":167},[33,169971,169972],{"class":54},"\"output_dir\"",[33,169974,365],{"class":167},[33,169976,6677],{"class":238},[33,169978,242],{"class":163},[33,169980,15528],{"class":167},[33,169982,169983,169985,169988,169990,169992,169994,169996,169998,170000,170002,170005],{"class":35,"line":5303},[33,169984,6669],{"class":167},[33,169986,169987],{"class":54},"\"--no-skip\"",[33,169989,365],{"class":167},[33,169991,124563],{"class":238},[33,169993,242],{"class":163},[33,169995,6740],{"class":54},[33,169997,365],{"class":167},[33,169999,25463],{"class":238},[33,170001,242],{"class":163},[33,170003,170004],{"class":54},"\"Re-convert even if PDF exists\"",[33,170006,221],{"class":167},[33,170008,170009,170011,170013],{"class":35,"line":5313},[33,170010,6766],{"class":167},[33,170012,242],{"class":163},[33,170014,6771],{"class":167},[33,170016,170017,170020,170023,170025],{"class":35,"line":5320},[33,170018,170019],{"class":167},"    batch_convert(args.input_dir, args.output_dir, ",[33,170021,170022],{"class":238},"skip_existing",[33,170024,124906],{"class":163},[33,170026,170027],{"class":167}," 
args.no_skip)\n",[33,170029,170030],{"class":35,"line":5325},[33,170031,92],{"emptyLinePlaceholder":91},[33,170033,170034],{"class":35,"line":5330},[33,170035,92],{"emptyLinePlaceholder":91},[33,170037,170038,170040,170042,170044,170046],{"class":35,"line":5344},[33,170039,2491],{"class":163},[33,170041,2494],{"class":50},[33,170043,2497],{"class":163},[33,170045,2500],{"class":54},[33,170047,574],{"class":167},[33,170049,170050],{"class":35,"line":5349},[33,170051,6914],{"class":167},[18,170053,6918],{"id":6917},[4211,170055,170056,170064,170069,170074],{},[4214,170057,170058,170060,170061,170063],{},[940,170059,163898],{"href":167362}," — detailed fix for the ",[30,170062,86120],{}," and COM errors on Linux",[4214,170065,170066,170068],{},[940,170067,156152],{"href":26562}," — generate the .docx files you will be converting",[4214,170070,170071,170073],{},[940,170072,26191],{"href":19001}," — build PDFs directly from data without the DOCX intermediate step",[4214,170075,170076,170078],{},[940,170077,52682],{"href":52681}," — combine the PDFs produced by batch 
conversion",[14,170080,6947,170081,3035],{},[940,170082,26263],{"href":26262},[6953,170084,64775],{},{"title":28,"searchDepth":43,"depth":43,"links":170086},[170087,170088,170089,170090,170091,170092,170093,170097,170102,170103,170104,170105,170106],{"id":20,"depth":43,"text":21},{"id":166537,"depth":43,"text":166538},{"id":166824,"depth":43,"text":166825},{"id":167033,"depth":43,"text":167034},{"id":167365,"depth":43,"text":167366},{"id":167467,"depth":43,"text":167468},{"id":168038,"depth":43,"text":168039,"children":170094},[170095,170096],{"id":168042,"depth":61,"text":168043},{"id":168108,"depth":61,"text":168109},{"id":168145,"depth":43,"text":168146,"children":170098},[170099,170100,170101],{"id":168149,"depth":61,"text":168150},{"id":168305,"depth":61,"text":168306},{"id":168378,"depth":61,"text":168379},{"id":168431,"depth":43,"text":168432},{"id":168672,"depth":43,"text":168673},{"id":168890,"depth":43,"text":168891},{"id":169000,"depth":43,"text":169001},{"id":6917,"depth":43,"text":6918},"DOCX to PDF","Convert .docx files to PDF in Python using docx2pdf on Windows\u002FmacOS and LibreOffice headless on Linux. 
Covers batch folders, font fidelity, and engine selection.",{},"\u002Fword-document-templating-batch-processing\u002Fconverting-docx-to-pdf-with-python",{"title":161278,"description":170108},"Convert DOCX to PDF with Python — docx2pdf & LibreOffice","word-document-templating-batch-processing\u002Fconverting-docx-to-pdf-with-python\u002Findex",[47,163908,166427,170115,9631],"word","yXNuyITe59GYk95FuGWoKkbhT-gcKQ19OzhaXHNbFOE",{"id":170118,"title":170119,"body":170120,"breadcrumbTitle":6977,"canonical":6977,"date":6977,"description":172218,"draft":6980,"extension":6981,"image":6977,"meta":172219,"navigation":91,"path":172220,"robots":6977,"seo":172221,"seoTitle":107412,"stem":172222,"tags":6977,"updatedAt":6977,"__hash__":172223},"content\u002Fword-document-templating-batch-processing\u002Fdynamic-mail-merge-with-python\u002Ffix-docxtpl-jinja2-undefined-error\u002Findex.md","Fix docxtpl Jinja2 UndefinedError",{"type":7,"value":170121,"toc":172201},[170122,170125,170146,170160,170162,170180,170186,170193,170195,170198,170400,170406,170410,170413,170622,170638,170642,170649,170814,170826,170834,170840,170846,170859,170869,170876,170881,170887,170890,170967,170977,170981,170994,171000,171003,171150,171156,171160,171172,171567,171574,171588,171594,171608,171611,171746,171752,171758,171762,171765,171771,171782,171820,171823,171939,171941,171944,172170,172175,172177,172194,172198],[10,170123,170119],{"id":170124},"fix-docxtpl-jinja2-undefinederror",[14,170126,170127,170130,170131,170134,170135,170138,170139,170141,170142,170145],{},[30,170128,170129],{},"jinja2.exceptions.UndefinedError: 'xxx' is undefined"," surfaces when ",[30,170132,170133],{},"template.render(context)"," encounters a ",[30,170136,170137],{},"{{ xxx }}"," placeholder in the ",[30,170140,18051],{}," template whose name is not present as a key in the ",[30,170143,170144],{},"context"," dict. 
The exception aborts the render entirely — no output file is written.",[14,170147,170148,170149,170151,170152,170155,170156,170159],{},"The three most common causes are: a typo between a template placeholder and a CSV column header, a column that is present in some rows but ",[30,170150,8884],{}," in others, and an attribute access on a dict key that does not exist (e.g. ",[30,170153,170154],{},"{{ item.price }}"," when ",[30,170157,170158],{},"item"," is a plain string).",[18,170161,7021],{"id":7020},[14,170163,170164,170165,170168,170169,170172,170173,170176,170177,170179],{},"Jinja2's default ",[30,170166,170167],{},"Undefined"," strategy is ",[1974,170170,170171],{},"strict",": referencing a missing name raises ",[30,170174,170175],{},"UndefinedError"," immediately rather than rendering an empty string. ",[30,170178,18047],{}," inherits this behavior unchanged. The full traceback points to the exact variable name in the exception message, making diagnosis straightforward once you know where to look.",[23,170181,170184],{"className":170182,"code":170183,"language":2000},[1998],"jinja2.exceptions.UndefinedError: 'client_tier' is undefined\n",[30,170185,170183],{"__ignoreMap":28},[14,170187,170188,170189,170192],{},"The name after the colon (",[30,170190,170191],{},"client_tier",") is the key your template expects but your context dict does not contain.",[18,170194,99786],{"id":54445},[14,170196,170197],{},"Isolate the problem by rendering a single row with an explicit context dict and printing what you have vs what the template expects.",[23,170199,170201],{"className":126,"code":170200,"language":47,"meta":28,"style":28},"# pip install docxtpl\nfrom pathlib import Path\nfrom docxtpl import DocxTemplate\n\nTEMPLATE = Path(\"templates\u002Fletter_template.docx\")\n\n# Step 1 — print every placeholder the template uses\ntpl = DocxTemplate(str(TEMPLATE))\nprint(\"Template variables:\", tpl.get_undeclared_template_variables())\n\n# Step 2 — print what your context 
actually contains\ncontext = {\n    \"first_name\": \"Alice\",\n    \"last_name\":  \"Smith\",\n    # \"client_tier\" is intentionally missing to reproduce the error\n}\nprint(\"Context keys:\", list(context.keys()))\n\n# Step 3 — attempt render; the UndefinedError names the missing key\ntry:\n    tpl.render(context)\nexcept Exception as exc:\n    print(f\"Error: {exc}\")\n",[30,170202,170203,170208,170218,170228,170232,170245,170249,170254,170272,170284,170288,170293,170302,170313,170325,170330,170334,170350,170354,170359,170365,170370,170380],{"__ignoreMap":28},[33,170204,170205],{"class":35,"line":36},[33,170206,170207],{"class":39},"# pip install docxtpl\n",[33,170209,170210,170212,170214,170216],{"class":35,"line":43},[33,170211,190],{"class":163},[33,170213,193],{"class":167},[33,170215,164],{"class":163},[33,170217,198],{"class":167},[33,170219,170220,170222,170224,170226],{"class":35,"line":61},[33,170221,190],{"class":163},[33,170223,20437],{"class":167},[33,170225,164],{"class":163},[33,170227,20442],{"class":167},[33,170229,170230],{"class":35,"line":73},[33,170231,92],{"emptyLinePlaceholder":91},[33,170233,170234,170236,170238,170240,170243],{"class":35,"line":88},[33,170235,97915],{"class":50},[33,170237,212],{"class":163},[33,170239,215],{"class":167},[33,170241,170242],{"class":54},"\"templates\u002Fletter_template.docx\"",[33,170244,221],{"class":167},[33,170246,170247],{"class":35,"line":95},[33,170248,92],{"emptyLinePlaceholder":91},[33,170250,170251],{"class":35,"line":101},[33,170252,170253],{"class":39},"# Step 1 — print every placeholder the template uses\n",[33,170255,170256,170259,170261,170264,170266,170268,170270],{"class":35,"line":171},[33,170257,170258],{"class":167},"tpl ",[33,170260,242],{"class":163},[33,170262,170263],{"class":167}," 
DocxTemplate(",[33,170265,1053],{"class":50},[33,170267,602],{"class":167},[33,170269,97915],{"class":50},[33,170271,371],{"class":167},[33,170273,170274,170276,170278,170281],{"class":35,"line":179},[33,170275,13474],{"class":50},[33,170277,602],{"class":167},[33,170279,170280],{"class":54},"\"Template variables:\"",[33,170282,170283],{"class":167},", tpl.get_undeclared_template_variables())\n",[33,170285,170286],{"class":35,"line":187},[33,170287,92],{"emptyLinePlaceholder":91},[33,170289,170290],{"class":35,"line":201},[33,170291,170292],{"class":39},"# Step 2 — print what your context actually contains\n",[33,170294,170295,170298,170300],{"class":35,"line":206},[33,170296,170297],{"class":167},"context ",[33,170299,242],{"class":163},[33,170301,16265],{"class":167},[33,170303,170304,170307,170309,170311],{"class":35,"line":224},[33,170305,170306],{"class":54},"    \"first_name\"",[33,170308,2079],{"class":167},[33,170310,140023],{"class":54},[33,170312,247],{"class":167},[33,170314,170315,170318,170320,170323],{"class":35,"line":229},[33,170316,170317],{"class":54},"    \"last_name\"",[33,170319,20627],{"class":167},[33,170321,170322],{"class":54},"\"Smith\"",[33,170324,247],{"class":167},[33,170326,170327],{"class":35,"line":235},[33,170328,170329],{"class":39},"    # \"client_tier\" is intentionally missing to reproduce the error\n",[33,170331,170332],{"class":35,"line":250},[33,170333,4113],{"class":167},[33,170335,170336,170338,170340,170343,170345,170347],{"class":35,"line":266},[33,170337,13474],{"class":50},[33,170339,602],{"class":167},[33,170341,170342],{"class":54},"\"Context keys:\"",[33,170344,365],{"class":167},[33,170346,25066],{"class":50},[33,170348,170349],{"class":167},"(context.keys()))\n",[33,170351,170352],{"class":35,"line":290},[33,170353,92],{"emptyLinePlaceholder":91},[33,170355,170356],{"class":35,"line":295},[33,170357,170358],{"class":39},"# Step 3 — attempt render; the UndefinedError names the missing 
key\n",[33,170360,170361,170363],{"class":35,"line":300},[33,170362,35574],{"class":163},[33,170364,574],{"class":167},[33,170366,170367],{"class":35,"line":317},[33,170368,170369],{"class":167},"    tpl.render(context)\n",[33,170371,170372,170374,170376,170378],{"class":35,"line":332},[33,170373,35726],{"class":163},[33,170375,783],{"class":50},[33,170377,1852],{"class":163},[33,170379,1855],{"class":167},[33,170381,170382,170384,170386,170388,170390,170392,170394,170396,170398],{"class":35,"line":347},[33,170383,7268],{"class":50},[33,170385,602],{"class":167},[33,170387,4059],{"class":163},[33,170389,39108],{"class":54},[33,170391,1115],{"class":50},[33,170393,6565],{"class":167},[33,170395,1121],{"class":50},[33,170397,274],{"class":54},[33,170399,221],{"class":167},[14,170401,170402,170405],{},[30,170403,170404],{},"get_undeclared_template_variables()"," returns the set of all names the template references. Diff that set against your context keys to find every gap at once, not just the first failure.",[18,170407,170409],{"id":170408},"fix-1-align-context-keys-with-template-placeholders","Fix 1 — Align Context Keys with Template Placeholders",[14,170411,170412],{},"The most direct fix: make sure every name the template uses exists in the context dict.",[23,170414,170416],{"className":126,"code":170415,"language":47,"meta":28,"style":28},"# pip install docxtpl pandas\nimport pandas as pd\nfrom docxtpl import DocxTemplate\nfrom pathlib import Path\n\nTEMPLATE = Path(\"templates\u002Fletter_template.docx\")\nDATA     = Path(\"data\u002Frecipients.csv\")\n\ntpl      = DocxTemplate(str(TEMPLATE))\nrequired = tpl.get_undeclared_template_variables()   # {'first_name', 'client_tier', ...}\n\ndf = pd.read_csv(DATA)\nprint(\"CSV columns :\", df.columns.tolist())\nprint(\"Template vars:\", sorted(required))\n\n# Find the gap\nmissing = required - set(df.columns.tolist())\nif missing:\n    raise ValueError(\n        f\"Template references variables not in CSV: {missing}. 
\"\n        \"Rename the CSV column or the template placeholder.\"\n    )\n",[30,170417,170418,170423,170433,170443,170453,170457,170469,170482,170486,170503,170516,170520,170532,170543,170560,170564,170569,170584,170590,170598,170613,170618],{"__ignoreMap":28},[33,170419,170420],{"class":35,"line":36},[33,170421,170422],{"class":39},"# pip install docxtpl pandas\n",[33,170424,170425,170427,170429,170431],{"class":35,"line":43},[33,170426,164],{"class":163},[33,170428,492],{"class":167},[33,170430,495],{"class":163},[33,170432,498],{"class":167},[33,170434,170435,170437,170439,170441],{"class":35,"line":61},[33,170436,190],{"class":163},[33,170438,20437],{"class":167},[33,170440,164],{"class":163},[33,170442,20442],{"class":167},[33,170444,170445,170447,170449,170451],{"class":35,"line":73},[33,170446,190],{"class":163},[33,170448,193],{"class":167},[33,170450,164],{"class":163},[33,170452,198],{"class":167},[33,170454,170455],{"class":35,"line":88},[33,170456,92],{"emptyLinePlaceholder":91},[33,170458,170459,170461,170463,170465,170467],{"class":35,"line":95},[33,170460,97915],{"class":50},[33,170462,212],{"class":163},[33,170464,215],{"class":167},[33,170466,170242],{"class":54},[33,170468,221],{"class":167},[33,170470,170471,170473,170475,170477,170480],{"class":35,"line":101},[33,170472,59605],{"class":50},[33,170474,96938],{"class":163},[33,170476,215],{"class":167},[33,170478,170479],{"class":54},"\"data\u002Frecipients.csv\"",[33,170481,221],{"class":167},[33,170483,170484],{"class":35,"line":171},[33,170485,92],{"emptyLinePlaceholder":91},[33,170487,170488,170491,170493,170495,170497,170499,170501],{"class":35,"line":179},[33,170489,170490],{"class":167},"tpl      ",[33,170492,242],{"class":163},[33,170494,170263],{"class":167},[33,170496,1053],{"class":50},[33,170498,602],{"class":167},[33,170500,97915],{"class":50},[33,170502,371],{"class":167},[33,170504,170505,170508,170510,170513],{"class":35,"line":187},[33,170506,170507],{"class":167},"required 
",[33,170509,242],{"class":163},[33,170511,170512],{"class":167}," tpl.get_undeclared_template_variables()   ",[33,170514,170515],{"class":39},"# {'first_name', 'client_tier', ...}\n",[33,170517,170518],{"class":35,"line":201},[33,170519,92],{"emptyLinePlaceholder":91},[33,170521,170522,170524,170526,170528,170530],{"class":35,"line":206},[33,170523,13459],{"class":167},[33,170525,242],{"class":163},[33,170527,9481],{"class":167},[33,170529,59605],{"class":50},[33,170531,221],{"class":167},[33,170533,170534,170536,170538,170541],{"class":35,"line":224},[33,170535,13474],{"class":50},[33,170537,602],{"class":167},[33,170539,170540],{"class":54},"\"CSV columns :\"",[33,170542,119004],{"class":167},[33,170544,170545,170547,170549,170552,170554,170557],{"class":35,"line":229},[33,170546,13474],{"class":50},[33,170548,602],{"class":167},[33,170550,170551],{"class":54},"\"Template vars:\"",[33,170553,365],{"class":167},[33,170555,170556],{"class":50},"sorted",[33,170558,170559],{"class":167},"(required))\n",[33,170561,170562],{"class":35,"line":235},[33,170563,92],{"emptyLinePlaceholder":91},[33,170565,170566],{"class":35,"line":250},[33,170567,170568],{"class":39},"# Find the gap\n",[33,170570,170571,170574,170576,170578,170580,170582],{"class":35,"line":266},[33,170572,170573],{"class":167},"missing ",[33,170575,242],{"class":163},[33,170577,59698],{"class":167},[33,170579,4126],{"class":163},[33,170581,4129],{"class":50},[33,170583,140713],{"class":167},[33,170585,170586,170588],{"class":35,"line":290},[33,170587,2491],{"class":163},[33,170589,4139],{"class":167},[33,170591,170592,170594,170596],{"class":35,"line":295},[33,170593,35742],{"class":163},[33,170595,4054],{"class":50},[33,170597,7637],{"class":167},[33,170599,170600,170602,170605,170607,170609,170611],{"class":35,"line":300},[33,170601,9533],{"class":163},[33,170603,170604],{"class":54},"\"Template references variables not in CSV: 
",[33,170606,1115],{"class":50},[33,170608,4157],{"class":167},[33,170610,1121],{"class":50},[33,170612,52129],{"class":54},[33,170614,170615],{"class":35,"line":317},[33,170616,170617],{"class":54},"        \"Rename the CSV column or the template placeholder.\"\n",[33,170619,170620],{"class":35,"line":332},[33,170621,1202],{"class":167},[14,170623,170624,170625,69863,170627,170630,170631,69863,170634,170637],{},"Running this before the render loop surfaces every mismatch immediately. Fix by either renaming the column in the CSV (",[30,170626,170191],{},[30,170628,170629],{},"tier",") or updating the template placeholder (",[30,170632,170633],{},"{{ client_tier }}",[30,170635,170636],{},"{{ tier }}","). Renaming in the template is safer when you do not control the CSV schema.",[18,170639,170641],{"id":170640},"fix-2-provide-defaults-for-optional-keys","Fix 2 — Provide Defaults for Optional Keys",[14,170643,170644,170645,170648],{},"When a key is genuinely optional (some rows may not have it), supply a fallback in ",[30,170646,170647],{},"build_context"," rather than patching the template.",[23,170650,170652],{"className":126,"code":170651,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef build_context(row: pd.Series) -> dict:\n    ctx: dict = row.to_dict()\n\n    # Replace NaN with empty string for every value in one pass\n    ctx = {k: (\"\" if pd.isna(v) else v) for k, v in ctx.items()}   # NaN → \"\"\n\n    # Explicit defaults for keys the template always references\n    ctx.setdefault(\"client_tier\", \"standard\")      # template: {{ client_tier }}\n    ctx.setdefault(\"discount_pct\", 0)              # template: {{ discount_pct }}\n    ctx.setdefault(\"notes\", \"\")                    # template: {{ notes }}\n\n    return 
ctx\n",[30,170653,170654,170658,170668,170672,170676,170690,170702,170706,170711,170745,170749,170754,170772,170788,170803,170807],{"__ignoreMap":28},[33,170655,170656],{"class":35,"line":36},[33,170657,8895],{"class":39},[33,170659,170660,170662,170664,170666],{"class":35,"line":43},[33,170661,164],{"class":163},[33,170663,492],{"class":167},[33,170665,495],{"class":163},[33,170667,498],{"class":167},[33,170669,170670],{"class":35,"line":61},[33,170671,92],{"emptyLinePlaceholder":91},[33,170673,170674],{"class":35,"line":73},[33,170675,92],{"emptyLinePlaceholder":91},[33,170677,170678,170680,170683,170686,170688],{"class":35,"line":88},[33,170679,562],{"class":163},[33,170681,170682],{"class":46}," build_context",[33,170684,170685],{"class":167},"(row: pd.Series) -> ",[33,170687,37100],{"class":50},[33,170689,574],{"class":167},[33,170691,170692,170695,170697,170699],{"class":35,"line":95},[33,170693,170694],{"class":167},"    ctx: ",[33,170696,37100],{"class":50},[33,170698,212],{"class":163},[33,170700,170701],{"class":167}," row.to_dict()\n",[33,170703,170704],{"class":35,"line":101},[33,170705,92],{"emptyLinePlaceholder":91},[33,170707,170708],{"class":35,"line":171},[33,170709,170710],{"class":39},"    # Replace NaN with empty string for every value in one pass\n",[33,170712,170713,170716,170718,170721,170723,170725,170728,170730,170733,170735,170737,170739,170742],{"class":35,"line":179},[33,170714,170715],{"class":167},"    ctx ",[33,170717,242],{"class":163},[33,170719,170720],{"class":167}," {k: (",[33,170722,3198],{"class":54},[33,170724,9994],{"class":163},[33,170726,170727],{"class":167}," pd.isna(v) ",[33,170729,7489],{"class":163},[33,170731,170732],{"class":167}," v) ",[33,170734,6124],{"class":163},[33,170736,163765],{"class":167},[33,170738,662],{"class":163},[33,170740,170741],{"class":167}," ctx.items()}   ",[33,170743,170744],{"class":39},"# NaN → 
\"\"\n",[33,170746,170747],{"class":35,"line":187},[33,170748,92],{"emptyLinePlaceholder":91},[33,170750,170751],{"class":35,"line":201},[33,170752,170753],{"class":39},"    # Explicit defaults for keys the template always references\n",[33,170755,170756,170759,170762,170764,170767,170769],{"class":35,"line":206},[33,170757,170758],{"class":167},"    ctx.setdefault(",[33,170760,170761],{"class":54},"\"client_tier\"",[33,170763,365],{"class":167},[33,170765,170766],{"class":54},"\"standard\"",[33,170768,54109],{"class":167},[33,170770,170771],{"class":39},"# template: {{ client_tier }}\n",[33,170773,170774,170776,170779,170781,170783,170785],{"class":35,"line":224},[33,170775,170758],{"class":167},[33,170777,170778],{"class":54},"\"discount_pct\"",[33,170780,365],{"class":167},[33,170782,748],{"class":50},[33,170784,67217],{"class":167},[33,170786,170787],{"class":39},"# template: {{ discount_pct }}\n",[33,170789,170790,170792,170794,170796,170798,170800],{"class":35,"line":229},[33,170791,170758],{"class":167},[33,170793,131398],{"class":54},[33,170795,365],{"class":167},[33,170797,3198],{"class":54},[33,170799,58134],{"class":167},[33,170801,170802],{"class":39},"# template: {{ notes }}\n",[33,170804,170805],{"class":35,"line":235},[33,170806,92],{"emptyLinePlaceholder":91},[33,170808,170809,170811],{"class":35,"line":250},[33,170810,1332],{"class":163},[33,170812,170813],{"class":167}," ctx\n",[14,170815,39550,170816,170819,170820,170822,170823,170825],{},[30,170817,170818],{},"pd.isna"," sweep handles ",[30,170821,129041],{}," that pandas injects for empty CSV cells — those are truthy in Python but raise ",[30,170824,170175],{}," when Jinja2 tries to render them as strings.",[18,170827,170829,170830,170833],{"id":170828},"fix-3-use-default-in-the-template-itself","Fix 3 — Use ",[30,170831,170832],{},"| default()"," in the Template Itself",[14,170835,170836,170837,170839],{},"When you cannot change the Python code (e.g. 
the template is maintained by a different team), add a Jinja2 filter directly in the ",[30,170838,18051]," placeholder:",[23,170841,170844],{"className":170842,"code":170843,"language":2000},[1998],"{{ client_tier | default('standard') }}\n{{ discount_pct | default(0) }}\n{{ notes | default('') }}\n",[30,170845,170843],{"__ignoreMap":28},[14,170847,39550,170848,170851,170852,170854,170855,170858],{},[30,170849,170850],{},"| default(value)"," filter returns ",[30,170853,67110]," whenever the left-hand side is undefined. To also substitute the default when the variable is defined but falsy (an empty string, 0 or None), use ",[30,170856,170857],"| default('', boolean=True)"," — but the plain form covers the common case.",[14,170860,170861,170864,170865,170868],{},[1974,170862,170863],{},"Important",": retype each placeholder in Word after adding the filter. Word often splits ",[30,170866,170867],{},"| default("," across separate XML runs, causing a silent render failure rather than the filter being applied.",[18,170870,170872,170873],{"id":170871},"variant-whitespace-or-typo-inside","Variant — Whitespace or Typo Inside ",[30,170874,170875],{},"{{ }}",[14,170877,170878,170879,3035],{},"Jinja2 is case-sensitive and whitespace inside the delimiters matters in one specific way: leading\u002Ftrailing spaces are stripped, but a wrong character anywhere in the name still raises ",[30,170880,170175],{},[23,170882,170885],{"className":170883,"code":170884,"language":2000},[1998],"{{ FirstName }}   →  UndefinedError: 'FirstName' is undefined  (context key is 'first_name')\n{{ first name }}  →  TemplateSyntaxError (space in identifier)\n{{first_name}}    →  renders correctly  (no spaces required around the name)\n",[30,170886,170884],{"__ignoreMap":28},[14,170888,170889],{},"Verify with a quick diagnostic:",[23,170891,170893],{"className":126,"code":170892,"language":47,"meta":28,"style":28},"# pip install docxtpl\nfrom docxtpl import DocxTemplate\nfrom pathlib import 
Path\n\ntpl = DocxTemplate(str(Path(\"templates\u002Fletter_template.docx\")))\nfor var in sorted(tpl.get_undeclared_template_variables()):\n    print(repr(var))   # repr() exposes hidden whitespace or non-ASCII chars\n",[30,170894,170895,170899,170909,170919,170923,170939,170953],{"__ignoreMap":28},[33,170896,170897],{"class":35,"line":36},[33,170898,170207],{"class":39},[33,170900,170901,170903,170905,170907],{"class":35,"line":43},[33,170902,190],{"class":163},[33,170904,20437],{"class":167},[33,170906,164],{"class":163},[33,170908,20442],{"class":167},[33,170910,170911,170913,170915,170917],{"class":35,"line":61},[33,170912,190],{"class":163},[33,170914,193],{"class":167},[33,170916,164],{"class":163},[33,170918,198],{"class":167},[33,170920,170921],{"class":35,"line":73},[33,170922,92],{"emptyLinePlaceholder":91},[33,170924,170925,170927,170929,170931,170933,170935,170937],{"class":35,"line":88},[33,170926,170258],{"class":167},[33,170928,242],{"class":163},[33,170930,170263],{"class":167},[33,170932,1053],{"class":50},[33,170934,62344],{"class":167},[33,170936,170242],{"class":54},[33,170938,23269],{"class":167},[33,170940,170941,170943,170946,170948,170950],{"class":35,"line":95},[33,170942,6124],{"class":163},[33,170944,170945],{"class":167}," var ",[33,170947,662],{"class":163},[33,170949,28924],{"class":50},[33,170951,170952],{"class":167},"(tpl.get_undeclared_template_variables()):\n",[33,170954,170955,170957,170959,170961,170964],{"class":35,"line":101},[33,170956,7268],{"class":50},[33,170958,602],{"class":167},[33,170960,45192],{"class":50},[33,170962,170963],{"class":167},"(var))   ",[33,170965,170966],{"class":39},"# repr() exposes hidden whitespace or non-ASCII chars\n",[14,170968,170969,170972,170973,170976],{},[30,170970,170971],{},"repr()"," will show ",[30,170974,170975],{},"'\\\\xa0first_name'"," if a non-breaking space crept into the placeholder text — a common copy-paste artefact from 
Word.",[18,170978,170980],{"id":170979},"variant-attribute-access-on-a-missing-dict-key","Variant — Attribute Access on a Missing Dict Key",[14,170982,170983,170984,170987,170988,170990,170991,170993],{},"When the template iterates a list (",[30,170985,170986],{},"{%tr for item in line_items %}",") and accesses ",[30,170989,170154],", Jinja2 first looks for an attribute named ",[30,170992,116742],", then falls back to a dict key of the same name. If neither exists you get:",[23,170995,170998],{"className":170996,"code":170997,"language":2000},[1998],"jinja2.exceptions.UndefinedError: 'price' is undefined\n",[30,170999,170997],{"__ignoreMap":28},[14,171001,171002],{},"Fix by normalising each item dict before passing it to the context:",[23,171004,171006],{"className":126,"code":171005,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef normalise_item(raw: dict) -> dict:\n    \"\"\"Ensure every key the template references exists in the item dict.\"\"\"\n    return {\n        \"description\": str(raw.get(\"description\", \"—\")),\n        \"qty\":         str(raw.get(\"qty\", raw.get(\"quantity\", \"1\"))),  # tolerate both names\n        \"unit_price\":  f\"${float(raw.get('unit_price', raw.get('price', 0))):,.2f}\",\n    }\n",[30,171007,171008,171012,171022,171026,171030,171048,171053,171059,171080,171109,171146],{"__ignoreMap":28},[33,171009,171010],{"class":35,"line":36},[33,171011,8895],{"class":39},[33,171013,171014,171016,171018,171020],{"class":35,"line":43},[33,171015,164],{"class":163},[33,171017,492],{"class":167},[33,171019,495],{"class":163},[33,171021,498],{"class":167},[33,171023,171024],{"class":35,"line":61},[33,171025,92],{"emptyLinePlaceholder":91},[33,171027,171028],{"class":35,"line":73},[33,171029,92],{"emptyLinePlaceholder":91},[33,171031,171032,171034,171037,171040,171042,171044,171046],{"class":35,"line":88},[33,171033,562],{"class":163},[33,171035,171036],{"class":46}," 
normalise_item",[33,171038,171039],{"class":167},"(raw: ",[33,171041,37100],{"class":50},[33,171043,1617],{"class":167},[33,171045,37100],{"class":50},[33,171047,574],{"class":167},[33,171049,171050],{"class":35,"line":95},[33,171051,171052],{"class":54},"    \"\"\"Ensure every key the template references exists in the item dict.\"\"\"\n",[33,171054,171055,171057],{"class":35,"line":101},[33,171056,1332],{"class":163},[33,171058,16265],{"class":167},[33,171060,171061,171064,171066,171068,171071,171074,171076,171078],{"class":35,"line":171},[33,171062,171063],{"class":54},"        \"description\"",[33,171065,2079],{"class":167},[33,171067,1053],{"class":50},[33,171069,171070],{"class":167},"(raw.get(",[33,171072,171073],{"class":54},"\"description\"",[33,171075,365],{"class":167},[33,171077,20352],{"class":54},[33,171079,1571],{"class":167},[33,171081,171082,171085,171088,171090,171092,171094,171097,171099,171101,171103,171106],{"class":35,"line":179},[33,171083,171084],{"class":54},"        \"qty\"",[33,171086,171087],{"class":167},":         ",[33,171089,1053],{"class":50},[33,171091,171070],{"class":167},[33,171093,54232],{"class":54},[33,171095,171096],{"class":167},", raw.get(",[33,171098,114399],{"class":54},[33,171100,365],{"class":167},[33,171102,35984],{"class":54},[33,171104,171105],{"class":167},"))),  ",[33,171107,171108],{"class":39},"# tolerate both 
names\n",[33,171110,171111,171113,171115,171117,171119,171121,171123,171126,171128,171131,171133,171135,171138,171140,171142,171144],{"class":35,"line":187},[33,171112,114039],{"class":54},[33,171114,20627],{"class":167},[33,171116,4059],{"class":163},[33,171118,18820],{"class":54},[33,171120,88861],{"class":50},[33,171122,171070],{"class":167},[33,171124,171125],{"class":54},"'unit_price'",[33,171127,171096],{"class":167},[33,171129,171130],{"class":54},"'price'",[33,171132,365],{"class":167},[33,171134,748],{"class":50},[33,171136,171137],{"class":167},")))",[33,171139,28440],{"class":163},[33,171141,1121],{"class":50},[33,171143,274],{"class":54},[33,171145,247],{"class":167},[33,171147,171148],{"class":35,"line":201},[33,171149,20781],{"class":167},[14,171151,79527,171152,171155],{},[30,171153,171154],{},".get()"," with a fallback prevents the error even when upstream data is inconsistent.",[18,171157,171159],{"id":171158},"variant-validate-before-rendering","Variant — Validate Before Rendering",[14,171161,171162,171163,171165,171166,171168,171169,171171],{},"When data comes from an external API or unreliable CSV, validate the full context before handing it to ",[30,171164,18047],{},". 
A ",[30,171167,95615],{}," with a clear message is better than a cryptic ",[30,171170,170175],{}," mid-batch.",[23,171173,171175],{"className":126,"code":171174,"language":47,"meta":28,"style":28},"# pip install docxtpl pandas\nimport pandas as pd\nfrom docxtpl import DocxTemplate\nfrom pathlib import Path\n\nTEMPLATE = Path(\"templates\u002Fletter_template.docx\")\n\n\ndef validate_context(ctx: dict, template_path: Path) -> None:\n    \"\"\"Raise ValueError listing every variable the template needs but context lacks.\"\"\"\n    tpl      = DocxTemplate(str(template_path))\n    required = tpl.get_undeclared_template_variables()\n    missing  = [k for k in required if k not in ctx]\n    if missing:\n        raise ValueError(\n            f\"Context is missing {len(missing)} required key(s): {missing}\\n\"\n            f\"Available keys: {sorted(ctx.keys())}\"\n        )\n\n\ndef render_safe(row: pd.Series, template_path: Path, out_path: Path) -> bool:\n    ctx = row.to_dict()\n    ctx = {k: (\"\" if pd.isna(v) else v) for k, v in ctx.items()}\n    ctx.setdefault(\"client_tier\", \"standard\")\n\n    try:\n        validate_context(ctx, template_path)   # raises ValueError if keys are missing\n        tpl = DocxTemplate(str(template_path))\n        tpl.render(ctx)\n        out_path.parent.mkdir(parents=True, exist_ok=True)\n        tpl.save(str(out_path))\n        return True\n    except (ValueError, Exception) as exc:\n        print(f\"Skipped {out_path.name}: {exc}\")\n        return 
False\n",[30,171176,171177,171181,171191,171201,171211,171215,171227,171231,171235,171254,171259,171273,171283,171313,171319,171327,171352,171369,171373,171377,171381,171395,171403,171430,171442,171446,171452,171460,171473,171478,171499,171508,171514,171532,171561],{"__ignoreMap":28},[33,171178,171179],{"class":35,"line":36},[33,171180,170422],{"class":39},[33,171182,171183,171185,171187,171189],{"class":35,"line":43},[33,171184,164],{"class":163},[33,171186,492],{"class":167},[33,171188,495],{"class":163},[33,171190,498],{"class":167},[33,171192,171193,171195,171197,171199],{"class":35,"line":61},[33,171194,190],{"class":163},[33,171196,20437],{"class":167},[33,171198,164],{"class":163},[33,171200,20442],{"class":167},[33,171202,171203,171205,171207,171209],{"class":35,"line":73},[33,171204,190],{"class":163},[33,171206,193],{"class":167},[33,171208,164],{"class":163},[33,171210,198],{"class":167},[33,171212,171213],{"class":35,"line":88},[33,171214,92],{"emptyLinePlaceholder":91},[33,171216,171217,171219,171221,171223,171225],{"class":35,"line":95},[33,171218,97915],{"class":50},[33,171220,212],{"class":163},[33,171222,215],{"class":167},[33,171224,170242],{"class":54},[33,171226,221],{"class":167},[33,171228,171229],{"class":35,"line":101},[33,171230,92],{"emptyLinePlaceholder":91},[33,171232,171233],{"class":35,"line":171},[33,171234,92],{"emptyLinePlaceholder":91},[33,171236,171237,171239,171242,171245,171247,171250,171252],{"class":35,"line":179},[33,171238,562],{"class":163},[33,171240,171241],{"class":46}," validate_context",[33,171243,171244],{"class":167},"(ctx: ",[33,171246,37100],{"class":50},[33,171248,171249],{"class":167},", template_path: Path) -> ",[33,171251,571],{"class":50},[33,171253,574],{"class":167},[33,171255,171256],{"class":35,"line":187},[33,171257,171258],{"class":54},"    \"\"\"Raise ValueError listing every variable the template needs but context 
lacks.\"\"\"\n",[33,171260,171261,171264,171266,171268,171270],{"class":35,"line":201},[33,171262,171263],{"class":167},"    tpl      ",[33,171265,242],{"class":163},[33,171267,170263],{"class":167},[33,171269,1053],{"class":50},[33,171271,171272],{"class":167},"(template_path))\n",[33,171274,171275,171278,171280],{"class":35,"line":206},[33,171276,171277],{"class":167},"    required ",[33,171279,242],{"class":163},[33,171281,171282],{"class":167}," tpl.get_undeclared_template_variables()\n",[33,171284,171285,171288,171290,171293,171295,171298,171300,171302,171304,171306,171308,171310],{"class":35,"line":224},[33,171286,171287],{"class":167},"    missing  ",[33,171289,242],{"class":163},[33,171291,171292],{"class":167}," [k ",[33,171294,6124],{"class":163},[33,171296,171297],{"class":167}," k ",[33,171299,662],{"class":163},[33,171301,59698],{"class":167},[33,171303,2491],{"class":163},[33,171305,171297],{"class":167},[33,171307,7999],{"class":163},[33,171309,8002],{"class":163},[33,171311,171312],{"class":167}," ctx]\n",[33,171314,171315,171317],{"class":35,"line":229},[33,171316,617],{"class":163},[33,171318,4139],{"class":167},[33,171320,171321,171323,171325],{"class":35,"line":235},[33,171322,4051],{"class":163},[33,171324,4054],{"class":50},[33,171326,7637],{"class":167},[33,171328,171329,171331,171334,171336,171339,171341,171344,171346,171348,171350],{"class":35,"line":250},[33,171330,12744],{"class":163},[33,171332,171333],{"class":54},"\"Context is missing ",[33,171335,4065],{"class":50},[33,171337,171338],{"class":167},"(missing)",[33,171340,1121],{"class":50},[33,171342,171343],{"class":54}," required key(s): ",[33,171345,1115],{"class":50},[33,171347,4157],{"class":167},[33,171349,6568],{"class":50},[33,171351,7504],{"class":54},[33,171353,171354,171356,171359,171362,171365,171367],{"class":35,"line":266},[33,171355,12744],{"class":163},[33,171357,171358],{"class":54},"\"Available keys: 
",[33,171360,171361],{"class":50},"{sorted",[33,171363,171364],{"class":167},"(ctx.keys())",[33,171366,1121],{"class":50},[33,171368,7504],{"class":54},[33,171370,171371],{"class":35,"line":290},[33,171372,5867],{"class":167},[33,171374,171375],{"class":35,"line":295},[33,171376,92],{"emptyLinePlaceholder":91},[33,171378,171379],{"class":35,"line":300},[33,171380,92],{"emptyLinePlaceholder":91},[33,171382,171383,171385,171388,171391,171393],{"class":35,"line":317},[33,171384,562],{"class":163},[33,171386,171387],{"class":46}," render_safe",[33,171389,171390],{"class":167},"(row: pd.Series, template_path: Path, out_path: Path) -> ",[33,171392,2821],{"class":50},[33,171394,574],{"class":167},[33,171396,171397,171399,171401],{"class":35,"line":332},[33,171398,170715],{"class":167},[33,171400,242],{"class":163},[33,171402,170701],{"class":167},[33,171404,171405,171407,171409,171411,171413,171415,171417,171419,171421,171423,171425,171427],{"class":35,"line":347},[33,171406,170715],{"class":167},[33,171408,242],{"class":163},[33,171410,170720],{"class":167},[33,171412,3198],{"class":54},[33,171414,9994],{"class":163},[33,171416,170727],{"class":167},[33,171418,7489],{"class":163},[33,171420,170732],{"class":167},[33,171422,6124],{"class":163},[33,171424,163765],{"class":167},[33,171426,662],{"class":163},[33,171428,171429],{"class":167}," ctx.items()}\n",[33,171431,171432,171434,171436,171438,171440],{"class":35,"line":374},[33,171433,170758],{"class":167},[33,171435,170761],{"class":54},[33,171437,365],{"class":167},[33,171439,170766],{"class":54},[33,171441,221],{"class":167},[33,171443,171444],{"class":35,"line":397},[33,171445,92],{"emptyLinePlaceholder":91},[33,171447,171448,171450],{"class":35,"line":653},[33,171449,2424],{"class":163},[33,171451,574],{"class":167},[33,171453,171454,171457],{"class":35,"line":667},[33,171455,171456],{"class":167},"        validate_context(ctx, template_path)   ",[33,171458,171459],{"class":39},"# raises ValueError if keys are 
missing\n",[33,171461,171462,171465,171467,171469,171471],{"class":35,"line":675},[33,171463,171464],{"class":167},"        tpl ",[33,171466,242],{"class":163},[33,171468,170263],{"class":167},[33,171470,1053],{"class":50},[33,171472,171272],{"class":167},[33,171474,171475],{"class":35,"line":689},[33,171476,171477],{"class":167},"        tpl.render(ctx)\n",[33,171479,171480,171483,171485,171487,171489,171491,171493,171495,171497],{"class":35,"line":703},[33,171481,171482],{"class":167},"        out_path.parent.mkdir(",[33,171484,869],{"class":238},[33,171486,242],{"class":163},[33,171488,855],{"class":50},[33,171490,365],{"class":167},[33,171492,878],{"class":238},[33,171494,242],{"class":163},[33,171496,855],{"class":50},[33,171498,221],{"class":167},[33,171500,171501,171504,171506],{"class":35,"line":714},[33,171502,171503],{"class":167},"        tpl.save(",[33,171505,1053],{"class":50},[33,171507,161046],{"class":167},[33,171509,171510,171512],{"class":35,"line":723},[33,171511,1659],{"class":163},[33,171513,2887],{"class":50},[33,171515,171516,171518,171520,171522,171524,171526,171528,171530],{"class":35,"line":754},[33,171517,2449],{"class":163},[33,171519,17583],{"class":167},[33,171521,95615],{"class":50},[33,171523,365],{"class":167},[33,171525,108194],{"class":50},[33,171527,1649],{"class":167},[33,171529,495],{"class":163},[33,171531,1855],{"class":167},[33,171533,171534,171536,171538,171540,171543,171545,171547,171549,171551,171553,171555,171557,171559],{"class":35,"line":771},[33,171535,9414],{"class":50},[33,171537,602],{"class":167},[33,171539,4059],{"class":163},[33,171541,171542],{"class":54},"\"Skipped 
",[33,171544,1115],{"class":50},[33,171546,75469],{"class":167},[33,171548,1121],{"class":50},[33,171550,2079],{"class":54},[33,171552,1115],{"class":50},[33,171554,6565],{"class":167},[33,171556,1121],{"class":50},[33,171558,274],{"class":54},[33,171560,221],{"class":167},[33,171562,171563,171565],{"class":35,"line":777},[33,171564,1659],{"class":163},[33,171566,2903],{"class":50},[18,171568,171570,171571,171573],{"id":171569},"variant-undefinederror-inside-a-conditional-block","Variant — ",[30,171572,170175],{}," Inside a Conditional Block",[14,171575,171576,171577,171579,171580,171583,171584,171587],{},"A subtler case: ",[30,171578,20399],{}," sits inside a ",[30,171581,171582],{},"{% if condition %}"," block and you assume it will never be evaluated when ",[30,171585,171586],{},"condition"," is false. Jinja2 does not short-circuit attribute lookups on undefined variables — it still resolves every name in the block regardless of the conditional outcome.",[23,171589,171592],{"className":171590,"code":171591,"language":2000},[1998],"{% if show_discount %}\n  Discount: {{ discount_code }} — {{ discount_pct }}%\n{% endif %}\n",[30,171593,171591],{"__ignoreMap":28},[14,171595,41963,171596,171599,171600,171602,171603,4348,171606,3035],{},[30,171597,171598],{},"discount_code"," is absent from the context, Jinja2 raises ",[30,171601,170175],{}," even when ",[30,171604,171605],{},"show_discount",[30,171607,902],{},[14,171609,171610],{},"Fix: supply the default in the context dict regardless of whether the condition will be true:",[23,171612,171614],{"className":126,"code":171613,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef build_context(row: pd.Series) -> dict:\n    ctx: dict = row.to_dict()\n    ctx = {k: (\"\" if pd.isna(v) else v) for k, v in ctx.items()}\n\n    # Always provide keys referenced inside conditionals\n    ctx.setdefault(\"show_discount\", False)\n    ctx.setdefault(\"discount_code\", \"\")    # needed even when 
show_discount is False\n    ctx.setdefault(\"discount_pct\", 0)\n\n    return ctx\n",[30,171615,171616,171620,171630,171634,171638,171650,171660,171686,171690,171695,171708,171724,171736,171740],{"__ignoreMap":28},[33,171617,171618],{"class":35,"line":36},[33,171619,8895],{"class":39},[33,171621,171622,171624,171626,171628],{"class":35,"line":43},[33,171623,164],{"class":163},[33,171625,492],{"class":167},[33,171627,495],{"class":163},[33,171629,498],{"class":167},[33,171631,171632],{"class":35,"line":61},[33,171633,92],{"emptyLinePlaceholder":91},[33,171635,171636],{"class":35,"line":73},[33,171637,92],{"emptyLinePlaceholder":91},[33,171639,171640,171642,171644,171646,171648],{"class":35,"line":88},[33,171641,562],{"class":163},[33,171643,170682],{"class":46},[33,171645,170685],{"class":167},[33,171647,37100],{"class":50},[33,171649,574],{"class":167},[33,171651,171652,171654,171656,171658],{"class":35,"line":95},[33,171653,170694],{"class":167},[33,171655,37100],{"class":50},[33,171657,212],{"class":163},[33,171659,170701],{"class":167},[33,171661,171662,171664,171666,171668,171670,171672,171674,171676,171678,171680,171682,171684],{"class":35,"line":101},[33,171663,170715],{"class":167},[33,171665,242],{"class":163},[33,171667,170720],{"class":167},[33,171669,3198],{"class":54},[33,171671,9994],{"class":163},[33,171673,170727],{"class":167},[33,171675,7489],{"class":163},[33,171677,170732],{"class":167},[33,171679,6124],{"class":163},[33,171681,163765],{"class":167},[33,171683,662],{"class":163},[33,171685,171429],{"class":167},[33,171687,171688],{"class":35,"line":171},[33,171689,92],{"emptyLinePlaceholder":91},[33,171691,171692],{"class":35,"line":179},[33,171693,171694],{"class":39},"    # Always provide keys referenced inside 
conditionals\n",[33,171696,171697,171699,171702,171704,171706],{"class":35,"line":187},[33,171698,170758],{"class":167},[33,171700,171701],{"class":54},"\"show_discount\"",[33,171703,365],{"class":167},[33,171705,902],{"class":50},[33,171707,221],{"class":167},[33,171709,171710,171712,171715,171717,171719,171721],{"class":35,"line":201},[33,171711,170758],{"class":167},[33,171713,171714],{"class":54},"\"discount_code\"",[33,171716,365],{"class":167},[33,171718,3198],{"class":54},[33,171720,101057],{"class":167},[33,171722,171723],{"class":39},"# needed even when show_discount is False\n",[33,171725,171726,171728,171730,171732,171734],{"class":35,"line":206},[33,171727,170758],{"class":167},[33,171729,170778],{"class":54},[33,171731,365],{"class":167},[33,171733,748],{"class":50},[33,171735,221],{"class":167},[33,171737,171738],{"class":35,"line":224},[33,171739,92],{"emptyLinePlaceholder":91},[33,171741,171742,171744],{"class":35,"line":229},[33,171743,1332],{"class":163},[33,171745,170813],{"class":167},[14,171747,171748,171749,171751],{},"Alternatively, guard with the ",[30,171750,170832],{}," filter directly in the template:",[23,171753,171756],{"className":171754,"code":171755,"language":2000},[1998],"{% if show_discount %}\n  Discount: {{ discount_code | default('N\u002FA') }} — {{ discount_pct | default(0) }}%\n{% endif %}\n",[30,171757,171755],{"__ignoreMap":28},[18,171759,171761],{"id":171760},"variant-accessing-a-nested-key-that-does-not-exist","Variant — Accessing a Nested Key That Does Not Exist",[14,171763,171764],{},"When a context value is itself a dict and the template accesses a sub-key that is absent, the error reads:",[23,171766,171769],{"className":171767,"code":171768,"language":2000},[1998],"jinja2.exceptions.UndefinedError: 'address' has no attribute 'city'\n",[30,171770,171768],{"__ignoreMap":28},[14,171772,171773,171774,171777,171778,171781],{},"This happens when ",[30,171775,171776],{},"address"," is in the context but was constructed 
without a ",[30,171779,171780],{},"city"," key:",[23,171783,171785],{"className":126,"code":171784,"language":47,"meta":28,"style":28},"ctx[\"address\"] = {\"street\": \"123 Main St\"}   # no 'city'\n# template: {{ address.city }}  → UndefinedError\n",[30,171786,171787,171815],{"__ignoreMap":28},[33,171788,171789,171792,171795,171797,171799,171801,171804,171806,171809,171812],{"class":35,"line":36},[33,171790,171791],{"class":167},"ctx[",[33,171793,171794],{"class":54},"\"address\"",[33,171796,763],{"class":167},[33,171798,242],{"class":163},[33,171800,4098],{"class":167},[33,171802,171803],{"class":54},"\"street\"",[33,171805,2079],{"class":167},[33,171807,171808],{"class":54},"\"123 Main St\"",[33,171810,171811],{"class":167},"}   ",[33,171813,171814],{"class":39},"# no 'city'\n",[33,171816,171817],{"class":35,"line":43},[33,171818,171819],{"class":39},"# template: {{ address.city }}  → UndefinedError\n",[14,171821,171822],{},"Fix by normalising nested dicts with explicit defaults:",[23,171824,171826],{"className":126,"code":171825,"language":47,"meta":28,"style":28},"# pip install pandas\ndef normalise_address(raw: dict) -> dict:\n    \"\"\"Return an address dict with all template-required keys present.\"\"\"\n    return {\n        \"street\":  raw.get(\"street\", \"\"),\n        \"city\":    raw.get(\"city\", \"\"),\n        \"state\":   raw.get(\"state\", \"\"),\n        \"postcode\": raw.get(\"postcode\", raw.get(\"zip\", \"\")),  # tolerate both field names\n    }\n",[30,171827,171828,171832,171849,171854,171860,171876,171893,171910,171935],{"__ignoreMap":28},[33,171829,171830],{"class":35,"line":36},[33,171831,8895],{"class":39},[33,171833,171834,171836,171839,171841,171843,171845,171847],{"class":35,"line":43},[33,171835,562],{"class":163},[33,171837,171838],{"class":46}," 
normalise_address",[33,171840,171039],{"class":167},[33,171842,37100],{"class":50},[33,171844,1617],{"class":167},[33,171846,37100],{"class":50},[33,171848,574],{"class":167},[33,171850,171851],{"class":35,"line":61},[33,171852,171853],{"class":54},"    \"\"\"Return an address dict with all template-required keys present.\"\"\"\n",[33,171855,171856,171858],{"class":35,"line":73},[33,171857,1332],{"class":163},[33,171859,16265],{"class":167},[33,171861,171862,171865,171868,171870,171872,171874],{"class":35,"line":88},[33,171863,171864],{"class":54},"        \"street\"",[33,171866,171867],{"class":167},":  raw.get(",[33,171869,171803],{"class":54},[33,171871,365],{"class":167},[33,171873,3198],{"class":54},[33,171875,1506],{"class":167},[33,171877,171878,171881,171884,171887,171889,171891],{"class":35,"line":95},[33,171879,171880],{"class":54},"        \"city\"",[33,171882,171883],{"class":167},":    raw.get(",[33,171885,171886],{"class":54},"\"city\"",[33,171888,365],{"class":167},[33,171890,3198],{"class":54},[33,171892,1506],{"class":167},[33,171894,171895,171898,171901,171904,171906,171908],{"class":35,"line":101},[33,171896,171897],{"class":54},"        \"state\"",[33,171899,171900],{"class":167},":   raw.get(",[33,171902,171903],{"class":54},"\"state\"",[33,171905,365],{"class":167},[33,171907,3198],{"class":54},[33,171909,1506],{"class":167},[33,171911,171912,171915,171918,171921,171923,171925,171927,171929,171932],{"class":35,"line":171},[33,171913,171914],{"class":54},"        \"postcode\"",[33,171916,171917],{"class":167},": raw.get(",[33,171919,171920],{"class":54},"\"postcode\"",[33,171922,171096],{"class":167},[33,171924,125904],{"class":54},[33,171926,365],{"class":167},[33,171928,3198],{"class":54},[33,171930,171931],{"class":167},")),  ",[33,171933,171934],{"class":39},"# tolerate both field names\n",[33,171936,171937],{"class":35,"line":179},[33,171938,20781],{"class":167},[18,171940,9247],{"id":9246},[14,171942,171943],{},"After applying a fix, run 
the following assertions to confirm the error is gone and the output contains real values:",[23,171945,171947],{"className":126,"code":171946,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom docx import Document\nfrom pathlib import Path\nimport re\n\nout = Path(\"output\u002Fdoc_test.docx\")\nassert out.exists(), f\"Output file not written: {out}\"\nassert out.stat().st_size > 1000, \"Output file suspiciously small — likely corrupt\"\n\ndoc       = Document(str(out))\nfull_text = \" \".join(p.text for p in doc.paragraphs)\n\n# No unrendered placeholders remain\nleftover = re.findall(r\"\\{\\{.*?\\}\\}\", full_text)\nassert not leftover, f\"Unrendered placeholders found: {leftover}\"\n\n# At least one expected value is present\nassert \"Alice\" in full_text, \"Expected first name not found in output\"\n\nprint(\"Verification passed.\")\n",[30,171948,171949,171953,171963,171973,171979,171983,171997,172017,172034,172038,172051,172070,172074,172079,172109,172132,172136,172141,172156,172160],{"__ignoreMap":28},[33,171950,171951],{"class":35,"line":36},[33,171952,156213],{"class":39},[33,171954,171955,171957,171959,171961],{"class":35,"line":43},[33,171956,190],{"class":163},[33,171958,18092],{"class":167},[33,171960,164],{"class":163},[33,171962,18097],{"class":167},[33,171964,171965,171967,171969,171971],{"class":35,"line":61},[33,171966,190],{"class":163},[33,171968,193],{"class":167},[33,171970,164],{"class":163},[33,171972,198],{"class":167},[33,171974,171975,171977],{"class":35,"line":73},[33,171976,164],{"class":163},[33,171978,11917],{"class":167},[33,171980,171981],{"class":35,"line":88},[33,171982,92],{"emptyLinePlaceholder":91},[33,171984,171985,171988,171990,171992,171995],{"class":35,"line":95},[33,171986,171987],{"class":167},"out 
",[33,171989,242],{"class":163},[33,171991,215],{"class":167},[33,171993,171994],{"class":54},"\"output\u002Fdoc_test.docx\"",[33,171996,221],{"class":167},[33,171998,171999,172001,172004,172006,172009,172011,172013,172015],{"class":35,"line":101},[33,172000,36397],{"class":163},[33,172002,172003],{"class":167}," out.exists(), ",[33,172005,4059],{"class":163},[33,172007,172008],{"class":54},"\"Output file not written: ",[33,172010,1115],{"class":50},[33,172012,18014],{"class":167},[33,172014,1121],{"class":50},[33,172016,7504],{"class":54},[33,172018,172019,172021,172024,172026,172029,172031],{"class":35,"line":171},[33,172020,36397],{"class":163},[33,172022,172023],{"class":167}," out.stat().st_size ",[33,172025,6009],{"class":163},[33,172027,172028],{"class":50}," 1000",[33,172030,365],{"class":167},[33,172032,172033],{"class":54},"\"Output file suspiciously small — likely corrupt\"\n",[33,172035,172036],{"class":35,"line":179},[33,172037,92],{"emptyLinePlaceholder":91},[33,172039,172040,172043,172045,172047,172049],{"class":35,"line":187},[33,172041,172042],{"class":167},"doc       ",[33,172044,242],{"class":163},[33,172046,156340],{"class":167},[33,172048,1053],{"class":50},[33,172050,55133],{"class":167},[33,172052,172053,172056,172058,172060,172062,172064,172066,172068],{"class":35,"line":201},[33,172054,172055],{"class":167},"full_text ",[33,172057,242],{"class":163},[33,172059,57412],{"class":54},[33,172061,159444],{"class":167},[33,172063,6124],{"class":163},[33,172065,6127],{"class":167},[33,172067,662],{"class":163},[33,172069,159453],{"class":167},[33,172071,172072],{"class":35,"line":206},[33,172073,92],{"emptyLinePlaceholder":91},[33,172075,172076],{"class":35,"line":224},[33,172077,172078],{"class":39},"# No unrendered placeholders remain\n",[33,172080,172081,172084,172086,172089,172091,172093,172096,172098,172101,172104,172106],{"class":35,"line":229},[33,172082,172083],{"class":167},"leftover 
",[33,172085,242],{"class":163},[33,172087,172088],{"class":167}," re.findall(",[33,172090,11977],{"class":163},[33,172092,274],{"class":54},[33,172094,172095],{"class":12018},"\\{\\{",[33,172097,3035],{"class":50},[33,172099,172100],{"class":163},"*?",[33,172102,172103],{"class":12018},"\\}\\}",[33,172105,274],{"class":54},[33,172107,172108],{"class":167},", full_text)\n",[33,172110,172111,172113,172115,172118,172120,172123,172125,172128,172130],{"class":35,"line":235},[33,172112,36397],{"class":163},[33,172114,620],{"class":163},[33,172116,172117],{"class":167}," leftover, ",[33,172119,4059],{"class":163},[33,172121,172122],{"class":54},"\"Unrendered placeholders found: ",[33,172124,1115],{"class":50},[33,172126,172127],{"class":167},"leftover",[33,172129,1121],{"class":50},[33,172131,7504],{"class":54},[33,172133,172134],{"class":35,"line":250},[33,172135,92],{"emptyLinePlaceholder":91},[33,172137,172138],{"class":35,"line":266},[33,172139,172140],{"class":39},"# At least one expected value is present\n",[33,172142,172143,172145,172148,172150,172153],{"class":35,"line":290},[33,172144,36397],{"class":163},[33,172146,172147],{"class":54}," \"Alice\"",[33,172149,8002],{"class":163},[33,172151,172152],{"class":167}," full_text, ",[33,172154,172155],{"class":54},"\"Expected first name not found in output\"\n",[33,172157,172158],{"class":35,"line":295},[33,172159,92],{"emptyLinePlaceholder":91},[33,172161,172162,172164,172166,172168],{"class":35,"line":300},[33,172163,13474],{"class":50},[33,172165,602],{"class":167},[33,172167,9569],{"class":54},[33,172169,221],{"class":167},[14,172171,41963,172172,172174],{},[30,172173,172127],{}," is non-empty, the placeholder was in a run that Jinja2 never processed — most likely because it was split across XML runs in the template. 
Retype it in Word.",[18,172176,6918],{"id":6917},[4211,172178,172179,172184,172189],{},[4214,172180,172181,172183],{},[940,172182,26185],{"href":18040}," — full workflow: template authoring, context building, batch render loop, and output naming",[4214,172185,172186,172188],{},[940,172187,156152],{"href":26562}," — foundational python-docx patterns for programmatic document construction",[4214,172190,172191,172193],{},[940,172192,107447],{"href":9598}," — fix NaN, encoding, and mixed-type columns before feeding data into the render loop",[14,172195,6947,172196,3035],{},[940,172197,26185],{"href":18040},[6953,172199,172200],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .s691h, html code.shiki 
.s691h{--shiki-default:#22863A;--shiki-default-font-weight:bold}",{"title":28,"searchDepth":43,"depth":43,"links":172202},[172203,172204,172205,172206,172207,172209,172211,172212,172213,172215,172216,172217],{"id":7020,"depth":43,"text":7021},{"id":54445,"depth":43,"text":99786},{"id":170408,"depth":43,"text":170409},{"id":170640,"depth":43,"text":170641},{"id":170828,"depth":43,"text":172208},"Fix 3 — Use | default() in the Template Itself",{"id":170871,"depth":43,"text":172210},"Variant — Whitespace or Typo Inside {{ }}",{"id":170979,"depth":43,"text":170980},{"id":171158,"depth":43,"text":171159},{"id":171569,"depth":43,"text":172214},"Variant — UndefinedError Inside a Conditional Block",{"id":171760,"depth":43,"text":171761},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"jinja2.exceptions.UndefinedError: 'xxx' is undefined surfaces when template.render(context) encounters a {{ xxx }} placeholder in the .docx template whose name is not present as a key in the context dict. 
The exception aborts the render entirely — no output file is written.",{},"\u002Fword-document-templating-batch-processing\u002Fdynamic-mail-merge-with-python\u002Ffix-docxtpl-jinja2-undefined-error",{"title":170119,"description":172218},"word-document-templating-batch-processing\u002Fdynamic-mail-merge-with-python\u002Ffix-docxtpl-jinja2-undefined-error\u002Findex","X7M9xmY4vJD7l6adXas6_-SHoHto-VqY399lsW59bRY",{"id":172225,"title":26185,"body":172226,"breadcrumbTitle":6977,"canonical":6977,"date":6977,"description":176319,"draft":6980,"extension":6981,"image":6977,"meta":176320,"navigation":91,"path":176321,"robots":6977,"seo":176322,"seoTitle":107412,"stem":176323,"tags":6977,"updatedAt":6977,"__hash__":176324},"content\u002Fword-document-templating-batch-processing\u002Fdynamic-mail-merge-with-python\u002Findex.md",{"type":7,"value":172227,"toc":176299},[172228,172231,172237,172250,172252,172274,172288,172291,172305,172308,172331,172335,172341,172481,172493,172496,172502,172505,172509,172519,172525,172528,172534,172537,172556,172561,172565,172572,172852,172865,172869,172957,172961,173749,173756,173760,173763,173766,173772,173777,173910,173922,173926,173933,173953,173959,173966,174228,174234,174236,174240,174245,174352,174357,174361,174367,174757,174761,174767,174924,174928,174931,175147,175152,175156,175191,175193,175294,175296,176209,176212,176264,176266,176292,176296],[10,172229,26185],{"id":172230},"dynamic-mail-merge-with-python",[14,172232,172233,172234,172236],{},"Word's built-in mail merge tops out fast: it requires a running Word instance, chokes on conditional logic beyond simple if\u002Felse, and produces no audit trail. 
Python with ",[940,172235,18047],{"href":26562}," replaces it with a repeatable script that renders hundreds of documents in a single pass — no GUI, no manual field mapping, no version drift between runs.",[14,172238,172239,172240,172242,172243,172246,172247,172249],{},"This guide covers the full workflow: authoring a ",[30,172241,18051],{}," template with ",[30,172244,172245],{},"{{ placeholders }}",", loading a CSV or Excel data source with ",[940,172248,9630],{"href":99576},", rendering one document per data row, using conditional sections and in-template loops, and naming the output files deterministically.",[18,172251,21],{"id":20},[23,172253,172255],{"className":25,"code":172254,"language":27,"meta":28,"style":28},"# pip install docxtpl pandas openpyxl\npip install docxtpl pandas openpyxl\n",[30,172256,172257,172262],{"__ignoreMap":28},[33,172258,172259],{"class":35,"line":36},[33,172260,172261],{"class":39},"# pip install docxtpl pandas openpyxl\n",[33,172263,172264,172266,172268,172270,172272],{"class":35,"line":43},[33,172265,76],{"class":46},[33,172267,79],{"class":54},[33,172269,16195],{"class":54},[33,172271,16183],{"class":54},[33,172273,95887],{"class":54},[14,172275,172276,172278,172279,172281,172282,172284,172285,172287],{},[30,172277,18047],{}," depends on ",[30,172280,18041],{}," internally — you do not install it separately. 
",[30,172283,22009],{}," is needed only when the data source is an ",[30,172286,26542],{}," file.",[14,172289,172290],{},"Create the working tree before running any script:",[23,172292,172294],{"className":25,"code":172293,"language":27,"meta":28,"style":28},"mkdir -p project\u002F{templates,data,output}\n",[30,172295,172296],{"__ignoreMap":28},[33,172297,172298,172300,172302],{"class":35,"line":36},[33,172299,59501],{"class":46},[33,172301,59504],{"class":50},[33,172303,172304],{"class":54}," project\u002F{templates,data,output}\n",[14,172306,172307],{},"Expected files:",[4211,172309,172310,172316,172322],{},[4214,172311,172312,172315],{},[30,172313,172314],{},"project\u002Ftemplates\u002Fletter_template.docx"," — your Jinja2-tagged Word template",[4214,172317,172318,172321],{},[30,172319,172320],{},"project\u002Fdata\u002Frecipients.csv"," — one row per output document",[4214,172323,172324,172327,172328,172330],{},[30,172325,172326],{},"project\u002Foutput\u002F"," — rendered ",[30,172329,18051],{}," files land here",[18,172332,172334],{"id":172333},"inspect-the-data-source-first","Inspect the Data Source First",[14,172336,172337,172338,172340],{},"Before writing the template, confirm exact column names. 
Typos between a CSV header and a template placeholder are the leading cause of ",[30,172339,170175],{}," at render time.",[23,172342,172344],{"className":126,"code":172343,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nfrom pathlib import Path\n\nDATA = Path(\"project\u002Fdata\u002Frecipients.csv\")\n\ntry:\n    df = pd.read_csv(DATA)\nexcept FileNotFoundError as exc:\n    raise SystemExit(f\"Data file not found: {exc}\")\n\nprint(\"Columns:\", df.columns.tolist())\nprint(\"Rows:\", len(df))\nprint(df.head(3).to_string())\n",[30,172345,172346,172350,172360,172370,172374,172387,172391,172397,172409,172419,172441,172445,172455,172470],{"__ignoreMap":28},[33,172347,172348],{"class":35,"line":36},[33,172349,3952],{"class":39},[33,172351,172352,172354,172356,172358],{"class":35,"line":43},[33,172353,164],{"class":163},[33,172355,492],{"class":167},[33,172357,495],{"class":163},[33,172359,498],{"class":167},[33,172361,172362,172364,172366,172368],{"class":35,"line":61},[33,172363,190],{"class":163},[33,172365,193],{"class":167},[33,172367,164],{"class":163},[33,172369,198],{"class":167},[33,172371,172372],{"class":35,"line":73},[33,172373,92],{"emptyLinePlaceholder":91},[33,172375,172376,172378,172380,172382,172385],{"class":35,"line":88},[33,172377,59605],{"class":50},[33,172379,212],{"class":163},[33,172381,215],{"class":167},[33,172383,172384],{"class":54},"\"project\u002Fdata\u002Frecipients.csv\"",[33,172386,221],{"class":167},[33,172388,172389],{"class":35,"line":95},[33,172390,92],{"emptyLinePlaceholder":91},[33,172392,172393,172395],{"class":35,"line":101},[33,172394,35574],{"class":163},[33,172396,574],{"class":167},[33,172398,172399,172401,172403,172405,172407],{"class":35,"line":171},[33,172400,4025],{"class":167},[33,172402,242],{"class":163},[33,172404,9481],{"class":167},[33,172406,59605],{"class":50},[33,172408,221],{"class":167},[33,172410,172411,172413,172415,172417],{"class":35,"line":179},[33,172412,35726],{"cl
ass":163},[33,172414,2945],{"class":50},[33,172416,1852],{"class":163},[33,172418,1855],{"class":167},[33,172420,172421,172423,172425,172427,172429,172431,172433,172435,172437,172439],{"class":35,"line":187},[33,172422,35742],{"class":163},[33,172424,16617],{"class":50},[33,172426,602],{"class":167},[33,172428,4059],{"class":163},[33,172430,59825],{"class":54},[33,172432,1115],{"class":50},[33,172434,6565],{"class":167},[33,172436,1121],{"class":50},[33,172438,274],{"class":54},[33,172440,221],{"class":167},[33,172442,172443],{"class":35,"line":201},[33,172444,92],{"emptyLinePlaceholder":91},[33,172446,172447,172449,172451,172453],{"class":35,"line":206},[33,172448,13474],{"class":50},[33,172450,602],{"class":167},[33,172452,119773],{"class":54},[33,172454,119004],{"class":167},[33,172456,172457,172459,172461,172464,172466,172468],{"class":35,"line":224},[33,172458,13474],{"class":50},[33,172460,602],{"class":167},[33,172462,172463],{"class":54},"\"Rows:\"",[33,172465,365],{"class":167},[33,172467,928],{"class":50},[33,172469,128027],{"class":167},[33,172471,172472,172474,172476,172478],{"class":35,"line":229},[33,172473,13474],{"class":50},[33,172475,35717],{"class":167},[33,172477,10258],{"class":50},[33,172479,172480],{"class":167},").to_string())\n",[14,172482,172483,172484,42238,172486,172489,172490,172492],{},"For an Excel source swap ",[30,172485,123327],{},[30,172487,172488],{},"pd.read_excel(DATA, engine=\"openpyxl\")",". The rest of the pipeline is identical. 
See ",[940,172491,99577],{"href":99576}," for multi-sheet and engine selection details.",[14,172494,172495],{},"Sample output you want to verify before proceeding:",[23,172497,172500],{"className":172498,"code":172499,"language":2000},[1998],"Columns: ['first_name', 'last_name', 'company', 'invoice_date', 'total_due', 'tier']\nRows: 42\n  first_name last_name        company invoice_date  total_due    tier\n0      Alice     Smith  Acme Corp Ltd   2026-06-01    1250.00  premium\n1        Bob     Jones      Beta LLC   2026-06-03     480.50  standard\n2    Charlie     Brown    Gamma Inc.   2026-06-05    3200.00  premium\n",[30,172501,172499],{"__ignoreMap":28},[14,172503,172504],{},"Write down these exact column names — they become your template placeholder names.",[18,172506,172508],{"id":172507},"step-1-author-the-word-template","Step 1 — Author the Word Template",[14,172510,172511,172512,172514,172515,172518],{},"Open a new ",[30,172513,18051],{}," in Word (or LibreOffice Writer). Type your letter content and insert ",[30,172516,172517],{},"{{ column_name }}"," exactly where each data value belongs. The delimiters must appear as a single continuous text run; if Word splits them across formatting runs the placeholder will silently fail to render.",[14,172520,172521,172524],{},[1974,172522,172523],{},"Tip:"," Type the placeholder, select it, and apply a character style. 
This forces Word to keep it in one XML run.",[14,172526,172527],{},"A minimal invoice letter template body:",[23,172529,172532],{"className":172530,"code":172531,"language":2000},[1998],"Dear {{ first_name }} {{ last_name }},\n\nThank you for your business with {{ company }}.\n\n{% if tier == 'premium' %}\nAs a Premium client you receive priority processing on all orders.\n{% endif %}\n\nInvoice date: {{ invoice_date }}\nAmount due:   {{ total_due }}\n\nItems ordered:\n{%tr for item in line_items %}\n  {{ item.description }}   Qty: {{ item.qty }}   Price: {{ item.unit_price }}\n{%tr endfor %}\n\nRegards,\nBilling Team\n",[30,172533,172531],{"__ignoreMap":28},[14,172535,172536],{},"Key syntax:",[4211,172538,172539,172544,172550],{},[4214,172540,172541,172543],{},[30,172542,20399],{}," — scalar substitution",[4214,172545,172546,172549],{},[30,172547,172548],{},"{% if condition %}...{% endif %}"," — conditional section (the entire paragraph is removed when false)",[4214,172551,172552,172555],{},[30,172553,172554],{},"{%tr for item in items %}...{%tr endfor %}"," — table row loop; must live inside a Word table row, not a paragraph",[14,172557,172558,172559,3035],{},"Save as ",[30,172560,172314],{},[18,172562,172564],{"id":172563},"step-2-build-the-context-dictionary","Step 2 — Build the Context Dictionary",[14,172566,172567,172568,172571],{},"Each call to ",[30,172569,172570],{},"template.render()"," takes a plain Python dict. 
Build one per data row, converting types that Jinja2 cannot format automatically.",[23,172573,172575],{"className":126,"code":172574,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef build_context(row: pd.Series) -> dict:\n    \"\"\"Convert a DataFrame row to a render-ready context dict.\"\"\"\n    ctx: dict = row.to_dict()\n\n    # Format date — raw pandas Timestamp renders as ugly repr\n    try:\n        ctx[\"invoice_date\"] = pd.to_datetime(ctx[\"invoice_date\"]).strftime(\"%B %d, %Y\")\n    except (ValueError, TypeError):\n        ctx[\"invoice_date\"] = str(ctx.get(\"invoice_date\", \"\"))\n\n    # Format currency\n    try:\n        ctx[\"total_due\"] = f\"${float(ctx['total_due']):,.2f}\"\n    except (ValueError, TypeError):\n        ctx[\"total_due\"] = str(ctx.get(\"total_due\", \"0.00\"))\n\n    # Provide a default for every optional key the template references\n    ctx.setdefault(\"tier\", \"standard\")\n\n    # Nested list for the table loop — empty list collapses the loop cleanly\n    ctx[\"line_items\"] = ctx.get(\"line_items\", [])\n\n    return 
ctx\n",[30,172576,172577,172581,172591,172595,172599,172611,172616,172626,172630,172635,172641,172671,172685,172708,172712,172717,172723,172754,172768,172791,172795,172800,172813,172817,172822,172842,172846],{"__ignoreMap":28},[33,172578,172579],{"class":35,"line":36},[33,172580,8895],{"class":39},[33,172582,172583,172585,172587,172589],{"class":35,"line":43},[33,172584,164],{"class":163},[33,172586,492],{"class":167},[33,172588,495],{"class":163},[33,172590,498],{"class":167},[33,172592,172593],{"class":35,"line":61},[33,172594,92],{"emptyLinePlaceholder":91},[33,172596,172597],{"class":35,"line":73},[33,172598,92],{"emptyLinePlaceholder":91},[33,172600,172601,172603,172605,172607,172609],{"class":35,"line":88},[33,172602,562],{"class":163},[33,172604,170682],{"class":46},[33,172606,170685],{"class":167},[33,172608,37100],{"class":50},[33,172610,574],{"class":167},[33,172612,172613],{"class":35,"line":95},[33,172614,172615],{"class":54},"    \"\"\"Convert a DataFrame row to a render-ready context dict.\"\"\"\n",[33,172617,172618,172620,172622,172624],{"class":35,"line":101},[33,172619,170694],{"class":167},[33,172621,37100],{"class":50},[33,172623,212],{"class":163},[33,172625,170701],{"class":167},[33,172627,172628],{"class":35,"line":171},[33,172629,92],{"emptyLinePlaceholder":91},[33,172631,172632],{"class":35,"line":179},[33,172633,172634],{"class":39},"    # Format date — raw pandas Timestamp renders as ugly repr\n",[33,172636,172637,172639],{"class":35,"line":187},[33,172638,2424],{"class":163},[33,172640,574],{"class":167},[33,172642,172643,172646,172649,172651,172653,172656,172658,172661,172664,172666,172669],{"class":35,"line":201},[33,172644,172645],{"class":167},"        ctx[",[33,172647,172648],{"class":54},"\"invoice_date\"",[33,172650,763],{"class":167},[33,172652,242],{"class":163},[33,172654,172655],{"class":167}," 
pd.to_datetime(ctx[",[33,172657,172648],{"class":54},[33,172659,172660],{"class":167},"]).strftime(",[33,172662,172663],{"class":54},"\"%B ",[33,172665,916],{"class":50},[33,172667,172668],{"class":54},", %Y\"",[33,172670,221],{"class":167},[33,172672,172673,172675,172677,172679,172681,172683],{"class":35,"line":206},[33,172674,2449],{"class":163},[33,172676,17583],{"class":167},[33,172678,95615],{"class":50},[33,172680,365],{"class":167},[33,172682,86188],{"class":50},[33,172684,1737],{"class":167},[33,172686,172687,172689,172691,172693,172695,172697,172700,172702,172704,172706],{"class":35,"line":224},[33,172688,172645],{"class":167},[33,172690,172648],{"class":54},[33,172692,763],{"class":167},[33,172694,242],{"class":163},[33,172696,7887],{"class":50},[33,172698,172699],{"class":167},"(ctx.get(",[33,172701,172648],{"class":54},[33,172703,365],{"class":167},[33,172705,3198],{"class":54},[33,172707,371],{"class":167},[33,172709,172710],{"class":35,"line":229},[33,172711,92],{"emptyLinePlaceholder":91},[33,172713,172714],{"class":35,"line":235},[33,172715,172716],{"class":39},"    # Format 
currency\n",[33,172718,172719,172721],{"class":35,"line":250},[33,172720,2424],{"class":163},[33,172722,574],{"class":167},[33,172724,172725,172727,172730,172732,172734,172736,172738,172740,172743,172746,172748,172750,172752],{"class":35,"line":266},[33,172726,172645],{"class":167},[33,172728,172729],{"class":54},"\"total_due\"",[33,172731,763],{"class":167},[33,172733,242],{"class":163},[33,172735,1110],{"class":163},[33,172737,18820],{"class":54},[33,172739,88861],{"class":50},[33,172741,172742],{"class":167},"(ctx[",[33,172744,172745],{"class":54},"'total_due'",[33,172747,18798],{"class":167},[33,172749,28440],{"class":163},[33,172751,1121],{"class":50},[33,172753,7504],{"class":54},[33,172755,172756,172758,172760,172762,172764,172766],{"class":35,"line":290},[33,172757,2449],{"class":163},[33,172759,17583],{"class":167},[33,172761,95615],{"class":50},[33,172763,365],{"class":167},[33,172765,86188],{"class":50},[33,172767,1737],{"class":167},[33,172769,172770,172772,172774,172776,172778,172780,172782,172784,172786,172789],{"class":35,"line":295},[33,172771,172645],{"class":167},[33,172773,172729],{"class":54},[33,172775,763],{"class":167},[33,172777,242],{"class":163},[33,172779,7887],{"class":50},[33,172781,172699],{"class":167},[33,172783,172729],{"class":54},[33,172785,365],{"class":167},[33,172787,172788],{"class":54},"\"0.00\"",[33,172790,371],{"class":167},[33,172792,172793],{"class":35,"line":300},[33,172794,92],{"emptyLinePlaceholder":91},[33,172796,172797],{"class":35,"line":317},[33,172798,172799],{"class":39},"    # Provide a default for every optional key the template 
references\n",[33,172801,172802,172804,172807,172809,172811],{"class":35,"line":332},[33,172803,170758],{"class":167},[33,172805,172806],{"class":54},"\"tier\"",[33,172808,365],{"class":167},[33,172810,170766],{"class":54},[33,172812,221],{"class":167},[33,172814,172815],{"class":35,"line":347},[33,172816,92],{"emptyLinePlaceholder":91},[33,172818,172819],{"class":35,"line":374},[33,172820,172821],{"class":39},"    # Nested list for the table loop — empty list collapses the loop cleanly\n",[33,172823,172824,172827,172830,172832,172834,172837,172839],{"class":35,"line":397},[33,172825,172826],{"class":167},"    ctx[",[33,172828,172829],{"class":54},"\"line_items\"",[33,172831,763],{"class":167},[33,172833,242],{"class":163},[33,172835,172836],{"class":167}," ctx.get(",[33,172838,172829],{"class":54},[33,172840,172841],{"class":167},", [])\n",[33,172843,172844],{"class":35,"line":653},[33,172845,92],{"emptyLinePlaceholder":91},[33,172847,172848,172850],{"class":35,"line":667},[33,172849,1332],{"class":163},[33,172851,170813],{"class":167},[14,172853,172854,172855,172858,172859,172861,172862,3035],{},"Providing defaults with ",[30,172856,172857],{},"setdefault"," prevents ",[30,172860,170175],{}," when a CSV row has a missing value. 
For a full explanation of that error and all its variants see ",[940,172863,170119],{"href":172864},"\u002Fword-document-templating-batch-processing\u002Fdynamic-mail-merge-with-python\u002Ffix-docxtpl-jinja2-undefined-error\u002F",[18,172866,172868],{"id":172867},"svg-mail-merge-data-flow","SVG — Mail Merge Data Flow",[2540,172870,2547,172872,2547,172875,2547,172878,2547,2547,172892,2547,172894,2547,172897,2547,2547,172899,2547,172901,2547,172904,2547,2547,172907,2547,172910,2547,2547,172912,2547,172915,2547,172918,2547,172921,2547,2547,172924,2547,172926,2547,172928,2547,2547,172930,2547,172932,2547,172935,2547,172938,2547,172940,2547,172943,2547,172946,2547,172948,2547,172951,2547,172954],{"viewBox":11071,"role":2543,"ariaLabel":172871,"xmlns":2545,"style":2546},"Mail merge data flow: template.docx and CSV rows feed a render loop that produces many .docx output files",[2549,172873,172874],{},"Mail Merge Data Flow",[2553,172876,172877],{},"Diagram showing template.docx and a CSV data source feeding a Python render loop that outputs one .docx file per row.",[2557,172879,2559,172880,2559,172887,2547],{},[2561,172881,2564,172883,2564,172885,2559],{"id":172882,"x1":748,"y1":748,"x2":734,"y2":748},"mailmerge-grad",[2566,172884],{"offset":748,"style":2568},[2566,172886],{"offset":734,"style":2571},[2573,172888,2564,172890,2559],{"id":172889,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"mailmerge-arrow",[2580,172891],{"d":2582,"fill":2583},[2585,172893],{"x":2587,"y":1543,"width":2610,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,172895,172896],{"x":2650,"y":58333,"fill":2599,"style":38718},"template.docx",[2000,172898,172245],{"x":2650,"y":26326,"fill":2583,"style":2605},[2585,172900],{"x":2587,"y":2588,"width":2610,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,172902,172903],{"x":2650,"y":2598,"fill":2599,"style":38718},"CSV \u002F 
Excel",[2000,172905,172906],{"x":2650,"y":82445,"fill":2583,"style":2605},"row 1 … row N",[35,172908],{"x1":58337,"y1":2590,"x2":11231,"y2":2679,"stroke":2583,"markerEnd":172909,"style":2594},"url(#mailmerge-arrow)",[35,172911],{"x1":58337,"y1":2610,"x2":11231,"y2":2635,"stroke":2583,"markerEnd":172909,"style":2594},[2585,172913],{"x":11231,"y":2650,"width":58337,"height":38748,"rx":2591,"fill":172914,"stroke":2593,"style":2594},"url(#mailmerge-grad)",[2000,172916,172917],{"x":2626,"y":2588,"fill":2599,"style":38718},"Render loop",[2000,172919,172920],{"x":2626,"y":2635,"fill":2599,"style":2605},"build_context(row)",[2000,172922,172923],{"x":2626,"y":11198,"fill":2599,"style":2605},"template.render(ctx)",[35,172925],{"x1":58352,"y1":125429,"x2":49853,"y2":2630,"stroke":2583,"markerEnd":172909,"style":2594},[35,172927],{"x1":58352,"y1":125429,"x2":49853,"y2":2609,"stroke":2583,"markerEnd":172909,"style":2594},[35,172929],{"x1":58352,"y1":125429,"x2":49853,"y2":2697,"stroke":2583,"markerEnd":172909,"style":2594},[2585,172931],{"x":49853,"y":58333,"width":2610,"height":2680,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,172933,172934],{"x":71573,"y":16991,"fill":2599,"style":2685},"invoice_ID_001.docx",[2000,172936,172937],{"x":71573,"y":38752,"fill":2583,"style":2605},"row 1 rendered",[2585,172939],{"x":49853,"y":2629,"width":2610,"height":2680,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,172941,172942],{"x":71573,"y":125429,"fill":2599,"style":2685},"invoice_ID_002.docx",[2000,172944,172945],{"x":71573,"y":114598,"fill":2583,"style":2605},"row 2 rendered",[2585,172947],{"x":49853,"y":2664,"width":2610,"height":2680,"rx":2591,"fill":2615,"stroke":2593,"style":2594},[2000,172949,172950],{"x":71573,"y":16997,"fill":2599,"style":2685},"invoice_ID_N.docx",[2000,172952,172953],{"x":71573,"y":110835,"fill":2583,"style":2605},"row N rendered",[2000,172955,172956],{"x":2626,"y":107631,"fill":2583,"style":2685},"\none output file per data 
row\n",[18,172958,172960],{"id":172959},"step-3-render-one-document-per-row","Step 3 — Render One Document per Row",[23,172962,172964],{"className":126,"code":172963,"language":47,"meta":28,"style":28},"# pip install docxtpl pandas openpyxl\nimport logging\nfrom pathlib import Path\n\nimport pandas as pd\nfrom docxtpl import DocxTemplate\n\nlogging.basicConfig(\n    level=logging.INFO,\n    format=\"%(asctime)s | %(levelname)s | %(message)s\",\n    handlers=[logging.FileHandler(\"merge.log\"), logging.StreamHandler()],\n)\n\nTEMPLATE = Path(\"project\u002Ftemplates\u002Fletter_template.docx\")\nDATA     = Path(\"project\u002Fdata\u002Frecipients.csv\")\nOUT_DIR  = Path(\"project\u002Foutput\")\n\n\ndef build_context(row: pd.Series) -> dict:\n    ctx: dict = row.to_dict()\n    try:\n        ctx[\"invoice_date\"] = pd.to_datetime(ctx[\"invoice_date\"]).strftime(\"%B %d, %Y\")\n    except (ValueError, TypeError):\n        ctx[\"invoice_date\"] = str(ctx.get(\"invoice_date\", \"\"))\n    try:\n        ctx[\"total_due\"] = f\"${float(ctx['total_due']):,.2f}\"\n    except (ValueError, TypeError):\n        ctx[\"total_due\"] = \"0.00\"\n    ctx.setdefault(\"tier\", \"standard\")\n    ctx[\"line_items\"] = ctx.get(\"line_items\", [])\n    return ctx\n\n\ndef render_row(template_path: Path, ctx: dict, output_path: Path) -> bool:\n    try:\n        tpl = DocxTemplate(str(template_path))   # fresh instance per row — prevents state bleed\n        tpl.render(ctx)\n        output_path.parent.mkdir(parents=True, exist_ok=True)\n        tpl.save(str(output_path))\n        return True\n    except Exception as exc:\n        logging.error(\"Render failed for %s: %s\", output_path.name, exc)\n        return False\n\n\ndef run_batch() -> None:\n    if not TEMPLATE.exists():\n        raise FileNotFoundError(f\"Template not found: {TEMPLATE}\")\n\n    try:\n        df = pd.read_csv(DATA)\n    except FileNotFoundError as exc:\n        raise SystemExit(f\"Data file missing: {exc}\") from 
exc\n\n    logging.info(\"Loaded %d rows from %s\", len(df), DATA)\n    ok = 0\n\n    for idx, row in df.iterrows():\n        ctx = build_context(row)\n        # Deterministic filename: sanitize the primary-key column\n        safe_id = str(ctx.get(\"client_id\", idx)).replace(\" \", \"_\").replace(\"\u002F\", \"-\")\n        out = OUT_DIR \u002F f\"invoice_{safe_id}.docx\"\n\n        if render_row(TEMPLATE, ctx, out):\n            ok += 1\n            logging.info(\"OK  %s\", out.name)\n\n    logging.info(\"Done: %d\u002F%d succeeded\", ok, len(df))\n\n\nif __name__ == \"__main__\":\n    run_batch()\n",[30,172965,172966,172970,172976,172986,172990,173000,173010,173014,173018,173030,173052,173065,173069,173073,173086,173098,173111,173115,173119,173131,173141,173147,173171,173185,173207,173213,173241,173255,173268,173280,173296,173302,173306,173310,173329,173335,173351,173355,173375,173384,173390,173400,173419,173425,173429,173433,173445,173456,173475,173479,173485,173497,173507,173534,173538,173564,173572,173576,173587,173597,173602,173637,173661,173665,173677,173685,173699,173703,173724,173728,173732,173744],{"__ignoreMap":28},[33,172967,172968],{"class":35,"line":36},[33,172969,172261],{"class":39},[33,172971,172972,172974],{"class":35,"line":43},[33,172973,164],{"class":163},[33,172975,184],{"class":167},[33,172977,172978,172980,172982,172984],{"class":35,"line":61},[33,172979,190],{"class":163},[33,172981,193],{"class":167},[33,172983,164],{"class":163},[33,172985,198],{"class":167},[33,172987,172988],{"class":35,"line":73},[33,172989,92],{"emptyLinePlaceholder":91},[33,172991,172992,172994,172996,172998],{"class":35,"line":88},[33,172993,164],{"class":163},[33,172995,492],{"class":167},[33,172997,495],{"class":163},[33,172999,498],{"class":167},[33,173001,173002,173004,173006,173008],{"class":35,"line":95},[33,173003,190],{"class":163},[33,173005,20437],{"class":167},[33,173007,164],{"class":163},[33,173009,20442],{"class":167},[33,173011,173012],{"class":35,"l
ine":101},[33,173013,92],{"emptyLinePlaceholder":91},[33,173015,173016],{"class":35,"line":171},[33,173017,232],{"class":167},[33,173019,173020,173022,173024,173026,173028],{"class":35,"line":179},[33,173021,253],{"class":238},[33,173023,242],{"class":163},[33,173025,258],{"class":167},[33,173027,1067],{"class":50},[33,173029,247],{"class":167},[33,173031,173032,173034,173036,173038,173040,173042,173044,173046,173048,173050],{"class":35,"line":187},[33,173033,269],{"class":238},[33,173035,242],{"class":163},[33,173037,274],{"class":54},[33,173039,277],{"class":50},[33,173041,26814],{"class":54},[33,173043,26817],{"class":50},[33,173045,26814],{"class":54},[33,173047,26827],{"class":50},[33,173049,274],{"class":54},[33,173051,247],{"class":167},[33,173053,173054,173056,173058,173060,173063],{"class":35,"line":201},[33,173055,26852],{"class":238},[33,173057,242],{"class":163},[33,173059,127802],{"class":167},[33,173061,173062],{"class":54},"\"merge.log\"",[33,173064,127808],{"class":167},[33,173066,173067],{"class":35,"line":206},[33,173068,221],{"class":167},[33,173070,173071],{"class":35,"line":224},[33,173072,92],{"emptyLinePlaceholder":91},[33,173074,173075,173077,173079,173081,173084],{"class":35,"line":229},[33,173076,97915],{"class":50},[33,173078,212],{"class":163},[33,173080,215],{"class":167},[33,173082,173083],{"class":54},"\"project\u002Ftemplates\u002Fletter_template.docx\"",[33,173085,221],{"class":167},[33,173087,173088,173090,173092,173094,173096],{"class":35,"line":235},[33,173089,59605],{"class":50},[33,173091,96938],{"class":163},[33,173093,215],{"class":167},[33,173095,172384],{"class":54},[33,173097,221],{"class":167},[33,173099,173100,173102,173104,173106,173109],{"class":35,"line":250},[33,173101,40018],{"class":50},[33,173103,17208],{"class":163},[33,173105,215],{"class":167},[33,173107,173108],{"class":54},"\"project\u002Foutput\"",[33,173110,221],{"class":167},[33,173112,173113],{"class":35,"line":266},[33,173114,92],{"emptyLinePlaceholder":9
1},[33,173116,173117],{"class":35,"line":290},[33,173118,92],{"emptyLinePlaceholder":91},[33,173120,173121,173123,173125,173127,173129],{"class":35,"line":295},[33,173122,562],{"class":163},[33,173124,170682],{"class":46},[33,173126,170685],{"class":167},[33,173128,37100],{"class":50},[33,173130,574],{"class":167},[33,173132,173133,173135,173137,173139],{"class":35,"line":300},[33,173134,170694],{"class":167},[33,173136,37100],{"class":50},[33,173138,212],{"class":163},[33,173140,170701],{"class":167},[33,173142,173143,173145],{"class":35,"line":317},[33,173144,2424],{"class":163},[33,173146,574],{"class":167},[33,173148,173149,173151,173153,173155,173157,173159,173161,173163,173165,173167,173169],{"class":35,"line":332},[33,173150,172645],{"class":167},[33,173152,172648],{"class":54},[33,173154,763],{"class":167},[33,173156,242],{"class":163},[33,173158,172655],{"class":167},[33,173160,172648],{"class":54},[33,173162,172660],{"class":167},[33,173164,172663],{"class":54},[33,173166,916],{"class":50},[33,173168,172668],{"class":54},[33,173170,221],{"class":167},[33,173172,173173,173175,173177,173179,173181,173183],{"class":35,"line":347},[33,173174,2449],{"class":163},[33,173176,17583],{"class":167},[33,173178,95615],{"class":50},[33,173180,365],{"class":167},[33,173182,86188],{"class":50},[33,173184,1737],{"class":167},[33,173186,173187,173189,173191,173193,173195,173197,173199,173201,173203,173205],{"class":35,"line":374},[33,173188,172645],{"class":167},[33,173190,172648],{"class":54},[33,173192,763],{"class":167},[33,173194,242],{"class":163},[33,173196,7887],{"class":50},[33,173198,172699],{"class":167},[33,173200,172648],{"class":54},[33,173202,365],{"class":167},[33,173204,3198],{"class":54},[33,173206,371],{"class":167},[33,173208,173209,173211],{"class":35,"line":397},[33,173210,2424],{"class":163},[33,173212,574],{"class":167},[33,173214,173215,173217,173219,173221,173223,173225,173227,173229,173231,173233,173235,173237,173239],{"class":35,"line":653},[33,1
73216,172645],{"class":167},[33,173218,172729],{"class":54},[33,173220,763],{"class":167},[33,173222,242],{"class":163},[33,173224,1110],{"class":163},[33,173226,18820],{"class":54},[33,173228,88861],{"class":50},[33,173230,172742],{"class":167},[33,173232,172745],{"class":54},[33,173234,18798],{"class":167},[33,173236,28440],{"class":163},[33,173238,1121],{"class":50},[33,173240,7504],{"class":54},[33,173242,173243,173245,173247,173249,173251,173253],{"class":35,"line":667},[33,173244,2449],{"class":163},[33,173246,17583],{"class":167},[33,173248,95615],{"class":50},[33,173250,365],{"class":167},[33,173252,86188],{"class":50},[33,173254,1737],{"class":167},[33,173256,173257,173259,173261,173263,173265],{"class":35,"line":675},[33,173258,172645],{"class":167},[33,173260,172729],{"class":54},[33,173262,763],{"class":167},[33,173264,242],{"class":163},[33,173266,173267],{"class":54}," \"0.00\"\n",[33,173269,173270,173272,173274,173276,173278],{"class":35,"line":689},[33,173271,170758],{"class":167},[33,173273,172806],{"class":54},[33,173275,365],{"class":167},[33,173277,170766],{"class":54},[33,173279,221],{"class":167},[33,173281,173282,173284,173286,173288,173290,173292,173294],{"class":35,"line":703},[33,173283,172826],{"class":167},[33,173285,172829],{"class":54},[33,173287,763],{"class":167},[33,173289,242],{"class":163},[33,173291,172836],{"class":167},[33,173293,172829],{"class":54},[33,173295,172841],{"class":167},[33,173297,173298,173300],{"class":35,"line":714},[33,173299,1332],{"class":163},[33,173301,170813],{"class":167},[33,173303,173304],{"class":35,"line":723},[33,173305,92],{"emptyLinePlaceholder":91},[33,173307,173308],{"class":35,"line":754},[33,173309,92],{"emptyLinePlaceholder":91},[33,173311,173312,173314,173317,173320,173322,173325,173327],{"class":35,"line":771},[33,173313,562],{"class":163},[33,173315,173316],{"class":46}," render_row",[33,173318,173319],{"class":167},"(template_path: Path, ctx: 
",[33,173321,37100],{"class":50},[33,173323,173324],{"class":167},", output_path: Path) -> ",[33,173326,2821],{"class":50},[33,173328,574],{"class":167},[33,173330,173331,173333],{"class":35,"line":777},[33,173332,2424],{"class":163},[33,173334,574],{"class":167},[33,173336,173337,173339,173341,173343,173345,173348],{"class":35,"line":788},[33,173338,171464],{"class":167},[33,173340,242],{"class":163},[33,173342,170263],{"class":167},[33,173344,1053],{"class":50},[33,173346,173347],{"class":167},"(template_path))   ",[33,173349,173350],{"class":39},"# fresh instance per row — prevents state bleed\n",[33,173352,173353],{"class":35,"line":804},[33,173354,171477],{"class":167},[33,173356,173357,173359,173361,173363,173365,173367,173369,173371,173373],{"class":35,"line":809},[33,173358,71854],{"class":167},[33,173360,869],{"class":238},[33,173362,242],{"class":163},[33,173364,855],{"class":50},[33,173366,365],{"class":167},[33,173368,878],{"class":238},[33,173370,242],{"class":163},[33,173372,855],{"class":50},[33,173374,221],{"class":167},[33,173376,173377,173379,173381],{"class":35,"line":819},[33,173378,171503],{"class":167},[33,173380,1053],{"class":50},[33,173382,173383],{"class":167},"(output_path))\n",[33,173385,173386,173388],{"class":35,"line":829},[33,173387,1659],{"class":163},[33,173389,2887],{"class":50},[33,173391,173392,173394,173396,173398],{"class":35,"line":834},[33,173393,2449],{"class":163},[33,173395,783],{"class":50},[33,173397,1852],{"class":163},[33,173399,1855],{"class":167},[33,173401,173402,173405,173408,173410,173412,173414,173416],{"class":35,"line":839},[33,173403,173404],{"class":167},"        logging.error(",[33,173406,173407],{"class":54},"\"Render failed for ",[33,173409,309],{"class":50},[33,173411,2079],{"class":54},[33,173413,309],{"class":50},[33,173415,274],{"class":54},[33,173417,173418],{"class":167},", output_path.name, 
exc)\n",[33,173420,173421,173423],{"class":35,"line":860},[33,173422,1659],{"class":163},[33,173424,2903],{"class":50},[33,173426,173427],{"class":35,"line":887},[33,173428,92],{"emptyLinePlaceholder":91},[33,173430,173431],{"class":35,"line":907},[33,173432,92],{"emptyLinePlaceholder":91},[33,173434,173435,173437,173439,173441,173443],{"class":35,"line":1826},[33,173436,562],{"class":163},[33,173438,28887],{"class":46},[33,173440,568],{"class":167},[33,173442,571],{"class":50},[33,173444,574],{"class":167},[33,173446,173447,173449,173451,173454],{"class":35,"line":1844},[33,173448,617],{"class":163},[33,173450,620],{"class":163},[33,173452,173453],{"class":50}," TEMPLATE",[33,173455,58027],{"class":167},[33,173457,173458,173460,173462,173464,173466,173468,173471,173473],{"class":35,"line":1858},[33,173459,4051],{"class":163},[33,173461,2945],{"class":50},[33,173463,602],{"class":167},[33,173465,4059],{"class":163},[33,173467,20538],{"class":54},[33,173469,173470],{"class":50},"{TEMPLATE}",[33,173472,274],{"class":54},[33,173474,221],{"class":167},[33,173476,173477],{"class":35,"line":1871},[33,173478,92],{"emptyLinePlaceholder":91},[33,173480,173481,173483],{"class":35,"line":1877},[33,173482,2424],{"class":163},[33,173484,574],{"class":167},[33,173486,173487,173489,173491,173493,173495],{"class":35,"line":1883},[33,173488,7930],{"class":167},[33,173490,242],{"class":163},[33,173492,9481],{"class":167},[33,173494,59605],{"class":50},[33,173496,221],{"class":167},[33,173498,173499,173501,173503,173505],{"class":35,"line":1915},[33,173500,2449],{"class":163},[33,173502,2945],{"class":50},[33,173504,1852],{"class":163},[33,173506,1855],{"class":167},[33,173508,173509,173511,173513,173515,173517,173520,173522,173524,173526,173528,173530,173532],{"class":35,"line":1926},[33,173510,4051],{"class":163},[33,173512,16617],{"class":50},[33,173514,602],{"class":167},[33,173516,4059],{"class":163},[33,173518,173519],{"class":54},"\"Data file missing: 
",[33,173521,1115],{"class":50},[33,173523,6565],{"class":167},[33,173525,1121],{"class":50},[33,173527,274],{"class":54},[33,173529,1649],{"class":167},[33,173531,190],{"class":163},[33,173533,20843],{"class":167},[33,173535,173536],{"class":35,"line":1932},[33,173537,92],{"emptyLinePlaceholder":91},[33,173539,173540,173543,173545,173547,173549,173551,173553,173555,173557,173560,173562],{"class":35,"line":1938},[33,173541,173542],{"class":167},"    logging.info(",[33,173544,96187],{"class":54},[33,173546,916],{"class":50},[33,173548,96199],{"class":54},[33,173550,309],{"class":50},[33,173552,274],{"class":54},[33,173554,365],{"class":167},[33,173556,928],{"class":50},[33,173558,173559],{"class":167},"(df), ",[33,173561,59605],{"class":50},[33,173563,221],{"class":167},[33,173565,173566,173568,173570],{"class":35,"line":1950},[33,173567,79390],{"class":167},[33,173569,242],{"class":163},[33,173571,28914],{"class":50},[33,173573,173574],{"class":35,"line":1958},[33,173575,92],{"emptyLinePlaceholder":91},[33,173577,173578,173580,173583,173585],{"class":35,"line":4904},[33,173579,656],{"class":163},[33,173581,173582],{"class":167}," idx, row ",[33,173584,662],{"class":163},[33,173586,8565],{"class":167},[33,173588,173589,173592,173594],{"class":35,"line":4909},[33,173590,173591],{"class":167},"        ctx ",[33,173593,242],{"class":163},[33,173595,173596],{"class":167}," build_context(row)\n",[33,173598,173599],{"class":35,"line":4915},[33,173600,173601],{"class":39},"        # Deterministic filename: sanitize the primary-key column\n",[33,173603,173604,173607,173609,173611,173613,173616,173619,173621,173623,173625,173628,173631,173633,173635],{"class":35,"line":4925},[33,173605,173606],{"class":167},"        safe_id ",[33,173608,242],{"class":163},[33,173610,7887],{"class":50},[33,173612,172699],{"class":167},[33,173614,173615],{"class":54},"\"client_id\"",[33,173617,173618],{"class":167},", 
idx)).replace(",[33,173620,17294],{"class":54},[33,173622,365],{"class":167},[33,173624,7764],{"class":54},[33,173626,173627],{"class":167},").replace(",[33,173629,173630],{"class":54},"\"\u002F\"",[33,173632,365],{"class":167},[33,173634,75122],{"class":54},[33,173636,221],{"class":167},[33,173638,173639,173641,173643,173645,173647,173649,173652,173654,173657,173659],{"class":35,"line":4935},[33,173640,50344],{"class":167},[33,173642,242],{"class":163},[33,173644,40144],{"class":50},[33,173646,1107],{"class":163},[33,173648,1110],{"class":163},[33,173650,173651],{"class":54},"\"invoice_",[33,173653,1115],{"class":50},[33,173655,173656],{"class":167},"safe_id",[33,173658,1121],{"class":50},[33,173660,18215],{"class":54},[33,173662,173663],{"class":35,"line":4941},[33,173664,92],{"emptyLinePlaceholder":91},[33,173666,173667,173669,173672,173674],{"class":35,"line":4950},[33,173668,8221],{"class":163},[33,173670,173671],{"class":167}," render_row(",[33,173673,97915],{"class":50},[33,173675,173676],{"class":167},", ctx, out):\n",[33,173678,173679,173681,173683],{"class":35,"line":4960},[33,173680,87640],{"class":167},[33,173682,28976],{"class":163},[33,173684,17709],{"class":50},[33,173686,173687,173689,173692,173694,173696],{"class":35,"line":4965},[33,173688,134649],{"class":167},[33,173690,173691],{"class":54},"\"OK  ",[33,173693,309],{"class":50},[33,173695,274],{"class":54},[33,173697,173698],{"class":167},", out.name)\n",[33,173700,173701],{"class":35,"line":4971},[33,173702,92],{"emptyLinePlaceholder":91},[33,173704,173705,173707,173709,173711,173713,173715,173717,173720,173722],{"class":35,"line":4983},[33,173706,173542],{"class":167},[33,173708,22340],{"class":54},[33,173710,916],{"class":50},[33,173712,1351],{"class":54},[33,173714,916],{"class":50},[33,173716,92860],{"class":54},[33,173718,173719],{"class":167},", ok, 
",[33,173721,928],{"class":50},[33,173723,128027],{"class":167},[33,173725,173726],{"class":35,"line":4988},[33,173727,92],{"emptyLinePlaceholder":91},[33,173729,173730],{"class":35,"line":4993},[33,173731,92],{"emptyLinePlaceholder":91},[33,173733,173734,173736,173738,173740,173742],{"class":35,"line":5003},[33,173735,2491],{"class":163},[33,173737,2494],{"class":50},[33,173739,2497],{"class":163},[33,173741,2500],{"class":54},[33,173743,574],{"class":167},[33,173745,173746],{"class":35,"line":5008},[33,173747,173748],{"class":167},"    run_batch()\n",[14,173750,173751,173752,173755],{},"The template is re-opened per iteration (",[30,173753,173754],{},"DocxTemplate(...)"," inside the loop). This is intentional — reusing a single instance can carry rendered content from one row into the next.",[18,173757,173759],{"id":173758},"step-4-conditional-sections","Step 4 — Conditional Sections",[14,173761,173762],{},"A conditional section shows or hides a paragraph depending on a value in the context dict.",[14,173764,173765],{},"In the template (one paragraph per branch):",[23,173767,173770],{"className":173768,"code":173769,"language":2000},[1998],"{% if tier == 'premium' %}\nAs a Premium client you receive priority SLA and a dedicated account manager.\n{% endif %}\n\n{% if total_overdue_days > 30 %}\nIMPORTANT: Your account is {{ total_overdue_days }} days overdue. 
Please remit immediately.\n{% endif %}\n",[30,173771,173769],{"__ignoreMap":28},[14,173773,100563,173774,173776],{},[30,173775,170647],{},", map the raw column to a typed Python value:",[23,173778,173780],{"className":126,"code":173779,"language":47,"meta":28,"style":28},"# pip install pandas\nimport pandas as pd\n\n\ndef build_context(row: pd.Series) -> dict:\n    ctx: dict = row.to_dict()\n    ctx[\"tier\"] = str(ctx.get(\"tier\", \"standard\")).lower()\n    try:\n        ctx[\"total_overdue_days\"] = int(ctx.get(\"total_overdue_days\", 0))\n    except (ValueError, TypeError):\n        ctx[\"total_overdue_days\"] = 0\n    return ctx\n",[30,173781,173782,173786,173796,173800,173804,173816,173826,173849,173855,173878,173892,173904],{"__ignoreMap":28},[33,173783,173784],{"class":35,"line":36},[33,173785,8895],{"class":39},[33,173787,173788,173790,173792,173794],{"class":35,"line":43},[33,173789,164],{"class":163},[33,173791,492],{"class":167},[33,173793,495],{"class":163},[33,173795,498],{"class":167},[33,173797,173798],{"class":35,"line":61},[33,173799,92],{"emptyLinePlaceholder":91},[33,173801,173802],{"class":35,"line":73},[33,173803,92],{"emptyLinePlaceholder":91},[33,173805,173806,173808,173810,173812,173814],{"class":35,"line":88},[33,173807,562],{"class":163},[33,173809,170682],{"class":46},[33,173811,170685],{"class":167},[33,173813,37100],{"class":50},[33,173815,574],{"class":167},[33,173817,173818,173820,173822,173824],{"class":35,"line":95},[33,173819,170694],{"class":167},[33,173821,37100],{"class":50},[33,173823,212],{"class":163},[33,173825,170701],{"class":167},[33,173827,173828,173830,173832,173834,173836,173838,173840,173842,173844,173846],{"class":35,"line":101},[33,173829,172826],{"class":167},[33,173831,172806],{"class":54},[33,173833,763],{"class":167},[33,173835,242],{"class":163},[33,173837,7887],{"class":50},[33,173839,172699],{"class":167},[33,173841,172806],{"class":54},[33,173843,365],{"class":167},[33,173845,170766],{"class":54},[33,173847
,173848],{"class":167},")).lower()\n",[33,173850,173851,173853],{"class":35,"line":171},[33,173852,2424],{"class":163},[33,173854,574],{"class":167},[33,173856,173857,173859,173862,173864,173866,173868,173870,173872,173874,173876],{"class":35,"line":179},[33,173858,172645],{"class":167},[33,173860,173861],{"class":54},"\"total_overdue_days\"",[33,173863,763],{"class":167},[33,173865,242],{"class":163},[33,173867,3149],{"class":50},[33,173869,172699],{"class":167},[33,173871,173861],{"class":54},[33,173873,365],{"class":167},[33,173875,748],{"class":50},[33,173877,371],{"class":167},[33,173879,173880,173882,173884,173886,173888,173890],{"class":35,"line":187},[33,173881,2449],{"class":163},[33,173883,17583],{"class":167},[33,173885,95615],{"class":50},[33,173887,365],{"class":167},[33,173889,86188],{"class":50},[33,173891,1737],{"class":167},[33,173893,173894,173896,173898,173900,173902],{"class":35,"line":201},[33,173895,172645],{"class":167},[33,173897,173861],{"class":54},[33,173899,763],{"class":167},[33,173901,242],{"class":163},[33,173903,28914],{"class":50},[33,173905,173906,173908],{"class":35,"line":206},[33,173907,1332],{"class":163},[33,173909,170813],{"class":167},[14,173911,39550,173912,173915,173916,173918,173919,173921],{},[30,173913,173914],{},"{% if %}"," tag evaluates Python truthiness, so passing an integer ",[30,173917,748],{}," correctly suppresses the overdue block while ",[30,173920,83012],{}," renders it.",[18,173923,173925],{"id":173924},"step-5-table-row-loops","Step 5 — Table Row Loops",[14,173927,173928,173929,173932],{},"When each recipient has a variable number of line items the template needs a ",[30,173930,173931],{},"{%tr %}"," loop. 
Place the loop tag in a Word table:",[14,173934,173935,173936,173938,173939,26814,173942,26814,173945,173948,173949,173952],{},"| ",[30,173937,170986],{}," | | |\n| ",[30,173940,173941],{},"{{ item.description }}",[30,173943,173944],{},"{{ item.qty }}",[30,173946,173947],{},"{{ item.unit_price }}"," |\n| ",[30,173950,173951],{},"{%tr endfor %}"," | | |",[14,173954,173955,173956,173958],{},"(In the actual ",[30,173957,18051],{}," the tags occupy a full table row each, not a Markdown table.)",[14,173960,173961,173962,173965],{},"Build the ",[30,173963,173964],{},"line_items"," list from a second CSV or a JSON column:",[23,173967,173969],{"className":126,"code":173968,"language":47,"meta":28,"style":28},"# pip install pandas\nimport json\nimport pandas as pd\n\n\ndef build_context_with_items(row: pd.Series) -> dict:\n    ctx: dict = row.to_dict()\n\n    # Option A: items encoded as JSON string in a CSV column\n    raw_items = ctx.get(\"line_items_json\", \"[]\")\n    try:\n        items = json.loads(raw_items) if isinstance(raw_items, str) else []\n    except json.JSONDecodeError:\n        items = []\n\n    # Option B: pull from a related DataFrame keyed on client_id\n    # items_df = all_items_df[all_items_df[\"client_id\"] == row[\"client_id\"]]\n    # items = items_df[[\"description\",\"qty\",\"unit_price\"]].to_dict(\"records\")\n\n    ctx[\"line_items\"] = [\n        {\n            \"description\": str(i.get(\"description\", \"\")),\n            \"qty\":         str(i.get(\"qty\", \"\")),\n            \"unit_price\":  f\"${float(i.get('unit_price', 0)):,.2f}\",\n        }\n        for i in items\n    ]\n    return 
ctx\n",[30,173970,173971,173975,173981,173991,173995,173999,174012,174022,174026,174031,174049,174055,174079,174086,174094,174098,174103,174108,174113,174117,174129,174134,174154,174172,174203,174207,174218,174222],{"__ignoreMap":28},[33,173972,173973],{"class":35,"line":36},[33,173974,8895],{"class":39},[33,173976,173977,173979],{"class":35,"line":43},[33,173978,164],{"class":163},[33,173980,3081],{"class":167},[33,173982,173983,173985,173987,173989],{"class":35,"line":61},[33,173984,164],{"class":163},[33,173986,492],{"class":167},[33,173988,495],{"class":163},[33,173990,498],{"class":167},[33,173992,173993],{"class":35,"line":73},[33,173994,92],{"emptyLinePlaceholder":91},[33,173996,173997],{"class":35,"line":88},[33,173998,92],{"emptyLinePlaceholder":91},[33,174000,174001,174003,174006,174008,174010],{"class":35,"line":95},[33,174002,562],{"class":163},[33,174004,174005],{"class":46}," build_context_with_items",[33,174007,170685],{"class":167},[33,174009,37100],{"class":50},[33,174011,574],{"class":167},[33,174013,174014,174016,174018,174020],{"class":35,"line":101},[33,174015,170694],{"class":167},[33,174017,37100],{"class":50},[33,174019,212],{"class":163},[33,174021,170701],{"class":167},[33,174023,174024],{"class":35,"line":171},[33,174025,92],{"emptyLinePlaceholder":91},[33,174027,174028],{"class":35,"line":179},[33,174029,174030],{"class":39},"    # Option A: items encoded as JSON string in a CSV column\n",[33,174032,174033,174036,174038,174040,174043,174045,174047],{"class":35,"line":187},[33,174034,174035],{"class":167},"    raw_items 
",[33,174037,242],{"class":163},[33,174039,172836],{"class":167},[33,174041,174042],{"class":54},"\"line_items_json\"",[33,174044,365],{"class":167},[33,174046,54164],{"class":54},[33,174048,221],{"class":167},[33,174050,174051,174053],{"class":35,"line":201},[33,174052,2424],{"class":163},[33,174054,574],{"class":167},[33,174056,174057,174059,174061,174064,174066,174068,174071,174073,174075,174077],{"class":35,"line":206},[33,174058,72084],{"class":167},[33,174060,242],{"class":163},[33,174062,174063],{"class":167}," json.loads(raw_items) ",[33,174065,2491],{"class":163},[33,174067,36538],{"class":50},[33,174069,174070],{"class":167},"(raw_items, ",[33,174072,1053],{"class":50},[33,174074,1649],{"class":167},[33,174076,7489],{"class":163},[33,174078,589],{"class":167},[33,174080,174081,174083],{"class":35,"line":224},[33,174082,2449],{"class":163},[33,174084,174085],{"class":167}," json.JSONDecodeError:\n",[33,174087,174088,174090,174092],{"class":35,"line":229},[33,174089,72084],{"class":167},[33,174091,242],{"class":163},[33,174093,589],{"class":167},[33,174095,174096],{"class":35,"line":235},[33,174097,92],{"emptyLinePlaceholder":91},[33,174099,174100],{"class":35,"line":250},[33,174101,174102],{"class":39},"    # Option B: pull from a related DataFrame keyed on client_id\n",[33,174104,174105],{"class":35,"line":266},[33,174106,174107],{"class":39},"    # items_df = all_items_df[all_items_df[\"client_id\"] == row[\"client_id\"]]\n",[33,174109,174110],{"class":35,"line":290},[33,174111,174112],{"class":39},"    # items = 
items_df[[\"description\",\"qty\",\"unit_price\"]].to_dict(\"records\")\n",[33,174114,174115],{"class":35,"line":295},[33,174116,92],{"emptyLinePlaceholder":91},[33,174118,174119,174121,174123,174125,174127],{"class":35,"line":300},[33,174120,172826],{"class":167},[33,174122,172829],{"class":54},[33,174124,763],{"class":167},[33,174126,242],{"class":163},[33,174128,7473],{"class":167},[33,174130,174131],{"class":35,"line":317},[33,174132,174133],{"class":167},"        {\n",[33,174135,174136,174139,174141,174143,174146,174148,174150,174152],{"class":35,"line":332},[33,174137,174138],{"class":54},"            \"description\"",[33,174140,2079],{"class":167},[33,174142,1053],{"class":50},[33,174144,174145],{"class":167},"(i.get(",[33,174147,171073],{"class":54},[33,174149,365],{"class":167},[33,174151,3198],{"class":54},[33,174153,1571],{"class":167},[33,174155,174156,174158,174160,174162,174164,174166,174168,174170],{"class":35,"line":347},[33,174157,144195],{"class":54},[33,174159,171087],{"class":167},[33,174161,1053],{"class":50},[33,174163,174145],{"class":167},[33,174165,54232],{"class":54},[33,174167,365],{"class":167},[33,174169,3198],{"class":54},[33,174171,1571],{"class":167},[33,174173,174174,174177,174179,174181,174183,174185,174187,174189,174191,174193,174195,174197,174199,174201],{"class":35,"line":374},[33,174175,174176],{"class":54},"            
\"unit_price\"",[33,174178,20627],{"class":167},[33,174180,4059],{"class":163},[33,174182,18820],{"class":54},[33,174184,88861],{"class":50},[33,174186,174145],{"class":167},[33,174188,171125],{"class":54},[33,174190,365],{"class":167},[33,174192,748],{"class":50},[33,174194,76917],{"class":167},[33,174196,28440],{"class":163},[33,174198,1121],{"class":50},[33,174200,274],{"class":54},[33,174202,247],{"class":167},[33,174204,174205],{"class":35,"line":397},[33,174206,71399],{"class":167},[33,174208,174209,174211,174213,174215],{"class":35,"line":653},[33,174210,5973],{"class":163},[33,174212,47269],{"class":167},[33,174214,662],{"class":163},[33,174216,174217],{"class":167}," items\n",[33,174219,174220],{"class":35,"line":667},[33,174221,19559],{"class":167},[33,174223,174224,174226],{"class":35,"line":675},[33,174225,1332],{"class":163},[33,174227,170813],{"class":167},[14,174229,128348,174230,174233],{},[30,174231,174232],{},"line_items=[]"," when there are no items collapses the loop without raising an error.",[18,174235,12944],{"id":12943},[424,174237,174239],{"id":174238},"excel-as-data-source","Excel as Data Source",[14,174241,174242,174243,20891],{},"Swap one line in ",[30,174244,29210],{},[23,174246,174248],{"className":126,"code":174247,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nimport pandas as pd\nfrom pathlib import Path\n\nDATA = Path(\"project\u002Fdata\u002Frecipients.xlsx\")\n\ndf = pd.read_excel(DATA, engine=\"openpyxl\", sheet_name=\"Sheet1\")\n# dtype normalization — Excel dates arrive as datetime objects, not strings\ndf[\"invoice_date\"] = pd.to_datetime(df[\"invoice_date\"], 
errors=\"coerce\")\n",[30,174249,174250,174254,174264,174274,174278,174291,174295,174323,174328],{"__ignoreMap":28},[33,174251,174252],{"class":35,"line":36},[33,174253,3952],{"class":39},[33,174255,174256,174258,174260,174262],{"class":35,"line":43},[33,174257,164],{"class":163},[33,174259,492],{"class":167},[33,174261,495],{"class":163},[33,174263,498],{"class":167},[33,174265,174266,174268,174270,174272],{"class":35,"line":61},[33,174267,190],{"class":163},[33,174269,193],{"class":167},[33,174271,164],{"class":163},[33,174273,198],{"class":167},[33,174275,174276],{"class":35,"line":73},[33,174277,92],{"emptyLinePlaceholder":91},[33,174279,174280,174282,174284,174286,174289],{"class":35,"line":88},[33,174281,59605],{"class":50},[33,174283,212],{"class":163},[33,174285,215],{"class":167},[33,174287,174288],{"class":54},"\"project\u002Fdata\u002Frecipients.xlsx\"",[33,174290,221],{"class":167},[33,174292,174293],{"class":35,"line":95},[33,174294,92],{"emptyLinePlaceholder":91},[33,174296,174297,174299,174301,174303,174305,174307,174309,174311,174313,174315,174317,174319,174321],{"class":35,"line":101},[33,174298,13459],{"class":167},[33,174300,242],{"class":163},[33,174302,126254],{"class":167},[33,174304,59605],{"class":50},[33,174306,365],{"class":167},[33,174308,17351],{"class":238},[33,174310,242],{"class":163},[33,174312,17356],{"class":54},[33,174314,365],{"class":167},[33,174316,17371],{"class":238},[33,174318,242],{"class":163},[33,174320,147353],{"class":54},[33,174322,221],{"class":167},[33,174324,174325],{"class":35,"line":171},[33,174326,174327],{"class":39},"# dtype normalization — Excel dates arrive as datetime objects, not 
strings\n",[33,174329,174330,174332,174334,174336,174338,174340,174342,174344,174346,174348,174350],{"class":35,"line":179},[33,174331,11038],{"class":167},[33,174333,172648],{"class":54},[33,174335,763],{"class":167},[33,174337,242],{"class":163},[33,174339,27668],{"class":167},[33,174341,172648],{"class":54},[33,174343,8314],{"class":167},[33,174345,8317],{"class":238},[33,174347,242],{"class":163},[33,174349,12107],{"class":54},[33,174351,221],{"class":167},[14,174353,6571,174354,174356],{},[940,174355,107447],{"href":9598}," for handling encoding issues and mixed-type columns in flat files.",[424,174358,174360],{"id":174359},"parallel-rendering-for-large-batches","Parallel Rendering for Large Batches",[14,174362,174363,174364,174366],{},"For datasets over 500 rows the GIL is not the bottleneck — disk I\u002FO is. ",[30,174365,84758],{}," gives a meaningful speedup:",[23,174368,174370],{"className":126,"code":174369,"language":47,"meta":28,"style":28},"# pip install docxtpl pandas\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom pathlib import Path\nimport pandas as pd\nfrom docxtpl import DocxTemplate\nimport logging\n\nTEMPLATE = Path(\"project\u002Ftemplates\u002Fletter_template.docx\")\nOUT_DIR  = Path(\"project\u002Foutput\")\n\n\ndef render_one(args: tuple) -> tuple[int, bool]:\n    idx, ctx = args\n    safe_id  = str(ctx.get(\"client_id\", idx)).replace(\" \", \"_\")\n    out      = OUT_DIR \u002F f\"invoice_{safe_id}.docx\"\n    try:\n        tpl = DocxTemplate(str(TEMPLATE))\n        tpl.render(ctx)\n        out.parent.mkdir(parents=True, exist_ok=True)\n        tpl.save(str(out))\n        return idx, True\n    except Exception as exc:\n        logging.error(\"Row %d failed: %s\", idx, exc)\n        return idx, False\n\n\ndef run_parallel(contexts: list[dict]) -> None:\n    with ThreadPoolExecutor(max_workers=8) as pool:\n        futures = {pool.submit(render_one, (i, ctx)): i for i, ctx in enumerate(contexts)}\n        ok = sum(1 
for f in as_completed(futures) if f.result()[1])\n    logging.info(\"Parallel batch: %d\u002F%d OK\", ok, len(contexts))\n",[30,174371,174372,174376,174387,174397,174407,174417,174423,174427,174439,174451,174455,174459,174482,174491,174514,174537,174543,174559,174563,174584,174592,174601,174611,174629,174637,174641,174645,174663,174682,174703,174734],{"__ignoreMap":28},[33,174373,174374],{"class":35,"line":36},[33,174375,170422],{"class":39},[33,174377,174378,174380,174382,174384],{"class":35,"line":43},[33,174379,190],{"class":163},[33,174381,13880],{"class":167},[33,174383,164],{"class":163},[33,174385,174386],{"class":167}," ThreadPoolExecutor, as_completed\n",[33,174388,174389,174391,174393,174395],{"class":35,"line":61},[33,174390,190],{"class":163},[33,174392,193],{"class":167},[33,174394,164],{"class":163},[33,174396,198],{"class":167},[33,174398,174399,174401,174403,174405],{"class":35,"line":73},[33,174400,164],{"class":163},[33,174402,492],{"class":167},[33,174404,495],{"class":163},[33,174406,498],{"class":167},[33,174408,174409,174411,174413,174415],{"class":35,"line":88},[33,174410,190],{"class":163},[33,174412,20437],{"class":167},[33,174414,164],{"class":163},[33,174416,20442],{"class":167},[33,174418,174419,174421],{"class":35,"line":95},[33,174420,164],{"class":163},[33,174422,184],{"class":167},[33,174424,174425],{"class":35,"line":101},[33,174426,92],{"emptyLinePlaceholder":91},[33,174428,174429,174431,174433,174435,174437],{"class":35,"line":171},[33,174430,97915],{"class":50},[33,174432,212],{"class":163},[33,174434,215],{"class":167},[33,174436,173083],{"class":54},[33,174438,221],{"class":167},[33,174440,174441,174443,174445,174447,174449],{"class":35,"line":179},[33,174442,40018],{"class":50},[33,174444,17208],{"class":163},[33,174446,215],{"class":167},[33,174448,173108],{"class":54},[33,174450,221],{"class":167},[33,174452,174453],{"class":35,"line":187},[33,174454,92],{"emptyLinePlaceholder":91},[33,174456,174457],{"class":35,"line":201},[
33,174458,92],{"emptyLinePlaceholder":91},[33,174460,174461,174463,174466,174469,174471,174474,174476,174478,174480],{"class":35,"line":206},[33,174462,562],{"class":163},[33,174464,174465],{"class":46}," render_one",[33,174467,174468],{"class":167},"(args: ",[33,174470,89239],{"class":50},[33,174472,174473],{"class":167},") -> tuple[",[33,174475,1059],{"class":50},[33,174477,365],{"class":167},[33,174479,2821],{"class":50},[33,174481,17477],{"class":167},[33,174483,174484,174487,174489],{"class":35,"line":224},[33,174485,174486],{"class":167},"    idx, ctx ",[33,174488,242],{"class":163},[33,174490,22127],{"class":167},[33,174492,174493,174496,174498,174500,174502,174504,174506,174508,174510,174512],{"class":35,"line":229},[33,174494,174495],{"class":167},"    safe_id  ",[33,174497,242],{"class":163},[33,174499,7887],{"class":50},[33,174501,172699],{"class":167},[33,174503,173615],{"class":54},[33,174505,173618],{"class":167},[33,174507,17294],{"class":54},[33,174509,365],{"class":167},[33,174511,7764],{"class":54},[33,174513,221],{"class":167},[33,174515,174516,174519,174521,174523,174525,174527,174529,174531,174533,174535],{"class":35,"line":235},[33,174517,174518],{"class":167},"    out      
",[33,174520,242],{"class":163},[33,174522,40144],{"class":50},[33,174524,1107],{"class":163},[33,174526,1110],{"class":163},[33,174528,173651],{"class":54},[33,174530,1115],{"class":50},[33,174532,173656],{"class":167},[33,174534,1121],{"class":50},[33,174536,18215],{"class":54},[33,174538,174539,174541],{"class":35,"line":250},[33,174540,2424],{"class":163},[33,174542,574],{"class":167},[33,174544,174545,174547,174549,174551,174553,174555,174557],{"class":35,"line":266},[33,174546,171464],{"class":167},[33,174548,242],{"class":163},[33,174550,170263],{"class":167},[33,174552,1053],{"class":50},[33,174554,602],{"class":167},[33,174556,97915],{"class":50},[33,174558,371],{"class":167},[33,174560,174561],{"class":35,"line":290},[33,174562,171477],{"class":167},[33,174564,174565,174568,174570,174572,174574,174576,174578,174580,174582],{"class":35,"line":295},[33,174566,174567],{"class":167},"        out.parent.mkdir(",[33,174569,869],{"class":238},[33,174571,242],{"class":163},[33,174573,855],{"class":50},[33,174575,365],{"class":167},[33,174577,878],{"class":238},[33,174579,242],{"class":163},[33,174581,855],{"class":50},[33,174583,221],{"class":167},[33,174585,174586,174588,174590],{"class":35,"line":300},[33,174587,171503],{"class":167},[33,174589,1053],{"class":50},[33,174591,55133],{"class":167},[33,174593,174594,174596,174599],{"class":35,"line":317},[33,174595,1659],{"class":163},[33,174597,174598],{"class":167}," idx, ",[33,174600,135488],{"class":50},[33,174602,174603,174605,174607,174609],{"class":35,"line":332},[33,174604,2449],{"class":163},[33,174606,783],{"class":50},[33,174608,1852],{"class":163},[33,174610,1855],{"class":167},[33,174612,174613,174615,174618,174620,174622,174624,174626],{"class":35,"line":347},[33,174614,173404],{"class":167},[33,174616,174617],{"class":54},"\"Row ",[33,174619,916],{"class":50},[33,174621,1899],{"class":54},[33,174623,309],{"class":50},[33,174625,274],{"class":54},[33,174627,174628],{"class":167},", idx, 
exc)\n",[33,174630,174631,174633,174635],{"class":35,"line":374},[33,174632,1659],{"class":163},[33,174634,174598],{"class":167},[33,174636,8339],{"class":50},[33,174638,174639],{"class":35,"line":397},[33,174640,92],{"emptyLinePlaceholder":91},[33,174642,174643],{"class":35,"line":653},[33,174644,92],{"emptyLinePlaceholder":91},[33,174646,174647,174649,174652,174655,174657,174659,174661],{"class":35,"line":667},[33,174648,562],{"class":163},[33,174650,174651],{"class":46}," run_parallel",[33,174653,174654],{"class":167},"(contexts: list[",[33,174656,37100],{"class":50},[33,174658,28895],{"class":167},[33,174660,571],{"class":50},[33,174662,574],{"class":167},[33,174664,174665,174667,174670,174672,174674,174676,174678,174680],{"class":35,"line":675},[33,174666,1635],{"class":163},[33,174668,174669],{"class":167}," ThreadPoolExecutor(",[33,174671,22277],{"class":238},[33,174673,242],{"class":163},[33,174675,2591],{"class":50},[33,174677,1649],{"class":167},[33,174679,495],{"class":163},[33,174681,14105],{"class":167},[33,174683,174684,174686,174688,174691,174693,174696,174698,174700],{"class":35,"line":689},[33,174685,14110],{"class":167},[33,174687,242],{"class":163},[33,174689,174690],{"class":167}," {pool.submit(render_one, (i, ctx)): i ",[33,174692,6124],{"class":163},[33,174694,174695],{"class":167}," i, ctx ",[33,174697,662],{"class":163},[33,174699,7403],{"class":50},[33,174701,174702],{"class":167},"(contexts)}\n",[33,174704,174705,174708,174710,174712,174714,174716,174718,174720,174722,174725,174727,174730,174732],{"class":35,"line":703},[33,174706,174707],{"class":167},"        ok ",[33,174709,242],{"class":163},[33,174711,46601],{"class":50},[33,174713,602],{"class":167},[33,174715,734],{"class":50},[33,174717,14766],{"class":163},[33,174719,8832],{"class":167},[33,174721,662],{"class":163},[33,174723,174724],{"class":167}," as_completed(futures) ",[33,174726,2491],{"class":163},[33,174728,174729],{"class":167}," 
f.result()[",[33,174731,734],{"class":50},[33,174733,751],{"class":167},[33,174735,174736,174738,174741,174743,174745,174747,174750,174752,174754],{"class":35,"line":714},[33,174737,173542],{"class":167},[33,174739,174740],{"class":54},"\"Parallel batch: ",[33,174742,916],{"class":50},[33,174744,1351],{"class":54},[33,174746,916],{"class":50},[33,174748,174749],{"class":54}," OK\"",[33,174751,173719],{"class":167},[33,174753,928],{"class":50},[33,174755,174756],{"class":167},"(contexts))\n",[424,174758,174760],{"id":174759},"rendering-a-docx-template-from-an-in-memory-buffer","Rendering a docx Template from an In-Memory Buffer",[14,174762,174763,174764,174766],{},"When the template is fetched from cloud storage (S3, Azure Blob) you can pass a ",[30,174765,61504],{}," object directly:",[23,174768,174770],{"className":126,"code":174769,"language":47,"meta":28,"style":28},"# pip install docxtpl boto3\nimport io\nimport boto3\nfrom docxtpl import DocxTemplate\nfrom pathlib import Path\n\ns3 = boto3.client(\"s3\")\nobj = s3.get_object(Bucket=\"my-bucket\", Key=\"templates\u002Fletter_template.docx\")\nbuf = io.BytesIO(obj[\"Body\"].read())\n\ntpl = DocxTemplate(buf)\ntpl.render({\"first_name\": \"Alice\", \"last_name\": \"Smith\"})\ntpl.save(str(Path(\"project\u002Foutput\u002Falice_smith.docx\")))\n",[30,174771,174772,174777,174783,174790,174800,174810,174814,174829,174858,174874,174878,174887,174910],{"__ignoreMap":28},[33,174773,174774],{"class":35,"line":36},[33,174775,174776],{"class":39},"# pip install docxtpl boto3\n",[33,174778,174779,174781],{"class":35,"line":43},[33,174780,164],{"class":163},[33,174782,60058],{"class":167},[33,174784,174785,174787],{"class":35,"line":61},[33,174786,164],{"class":163},[33,174788,174789],{"class":167}," 
boto3\n",[33,174791,174792,174794,174796,174798],{"class":35,"line":73},[33,174793,190],{"class":163},[33,174795,20437],{"class":167},[33,174797,164],{"class":163},[33,174799,20442],{"class":167},[33,174801,174802,174804,174806,174808],{"class":35,"line":88},[33,174803,190],{"class":163},[33,174805,193],{"class":167},[33,174807,164],{"class":163},[33,174809,198],{"class":167},[33,174811,174812],{"class":35,"line":95},[33,174813,92],{"emptyLinePlaceholder":91},[33,174815,174816,174819,174821,174824,174827],{"class":35,"line":101},[33,174817,174818],{"class":167},"s3 ",[33,174820,242],{"class":163},[33,174822,174823],{"class":167}," boto3.client(",[33,174825,174826],{"class":54},"\"s3\"",[33,174828,221],{"class":167},[33,174830,174831,174834,174836,174839,174842,174844,174847,174849,174852,174854,174856],{"class":35,"line":171},[33,174832,174833],{"class":167},"obj ",[33,174835,242],{"class":163},[33,174837,174838],{"class":167}," s3.get_object(",[33,174840,174841],{"class":238},"Bucket",[33,174843,242],{"class":163},[33,174845,174846],{"class":54},"\"my-bucket\"",[33,174848,365],{"class":167},[33,174850,174851],{"class":238},"Key",[33,174853,242],{"class":163},[33,174855,170242],{"class":54},[33,174857,221],{"class":167},[33,174859,174860,174863,174865,174868,174871],{"class":35,"line":179},[33,174861,174862],{"class":167},"buf ",[33,174864,242],{"class":163},[33,174866,174867],{"class":167}," io.BytesIO(obj[",[33,174869,174870],{"class":54},"\"Body\"",[33,174872,174873],{"class":167},"].read())\n",[33,174875,174876],{"class":35,"line":187},[33,174877,92],{"emptyLinePlaceholder":91},[33,174879,174880,174882,174884],{"class":35,"line":201},[33,174881,170258],{"class":167},[33,174883,242],{"class":163},[33,174885,174886],{"class":167}," 
DocxTemplate(buf)\n",[33,174888,174889,174892,174895,174897,174899,174901,174904,174906,174908],{"class":35,"line":206},[33,174890,174891],{"class":167},"tpl.render({",[33,174893,174894],{"class":54},"\"first_name\"",[33,174896,2079],{"class":167},[33,174898,140023],{"class":54},[33,174900,365],{"class":167},[33,174902,174903],{"class":54},"\"last_name\"",[33,174905,2079],{"class":167},[33,174907,170322],{"class":54},[33,174909,103249],{"class":167},[33,174911,174912,174915,174917,174919,174922],{"class":35,"line":224},[33,174913,174914],{"class":167},"tpl.save(",[33,174916,1053],{"class":50},[33,174918,62344],{"class":167},[33,174920,174921],{"class":54},"\"project\u002Foutput\u002Falice_smith.docx\"",[33,174923,23269],{"class":167},[18,174925,174927],{"id":174926},"validation-confirm-output-correctness","Validation — Confirm Output Correctness",[14,174929,174930],{},"After the batch completes, spot-check that placeholders were resolved:",[23,174932,174934],{"className":126,"code":174933,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom docx import Document\nfrom pathlib import Path\nimport re\n\nOUT_DIR = Path(\"project\u002Foutput\")\n\nunrendered_pattern = re.compile(r\"\\{\\{.*?\\}\\}\")\n\nfor docx_file in sorted(OUT_DIR.glob(\"*.docx\"))[:5]:\n    doc = Document(str(docx_file))\n    full_text = \" \".join(p.text for p in doc.paragraphs)\n    matches = unrendered_pattern.findall(full_text)\n    if matches:\n        print(f\"UNRENDERED in {docx_file.name}: {matches}\")\n    else:\n        print(f\"OK: 
{docx_file.name}\")\n",[30,174935,174936,174940,174950,174960,174966,174970,174982,174986,175012,175016,175042,175055,175073,175083,175090,175121,175127],{"__ignoreMap":28},[33,174937,174938],{"class":35,"line":36},[33,174939,156213],{"class":39},[33,174941,174942,174944,174946,174948],{"class":35,"line":43},[33,174943,190],{"class":163},[33,174945,18092],{"class":167},[33,174947,164],{"class":163},[33,174949,18097],{"class":167},[33,174951,174952,174954,174956,174958],{"class":35,"line":61},[33,174953,190],{"class":163},[33,174955,193],{"class":167},[33,174957,164],{"class":163},[33,174959,198],{"class":167},[33,174961,174962,174964],{"class":35,"line":73},[33,174963,164],{"class":163},[33,174965,11917],{"class":167},[33,174967,174968],{"class":35,"line":88},[33,174969,92],{"emptyLinePlaceholder":91},[33,174971,174972,174974,174976,174978,174980],{"class":35,"line":95},[33,174973,40018],{"class":50},[33,174975,212],{"class":163},[33,174977,215],{"class":167},[33,174979,173108],{"class":54},[33,174981,221],{"class":167},[33,174983,174984],{"class":35,"line":101},[33,174985,92],{"emptyLinePlaceholder":91},[33,174987,174988,174991,174993,174996,174998,175000,175002,175004,175006,175008,175010],{"class":35,"line":171},[33,174989,174990],{"class":167},"unrendered_pattern ",[33,174992,242],{"class":163},[33,174994,174995],{"class":167}," re.compile(",[33,174997,11977],{"class":163},[33,174999,274],{"class":54},[33,175001,172095],{"class":12018},[33,175003,3035],{"class":50},[33,175005,172100],{"class":163},[33,175007,172103],{"class":12018},[33,175009,274],{"class":54},[33,175011,221],{"class":167},[33,175013,175014],{"class":35,"line":179},[33,175015,92],{"emptyLinePlaceholder":91},[33,175017,175018,175020,175023,175025,175027,175029,175031,175033,175035,175038,175040],{"class":35,"line":187},[33,175019,6124],{"class":163},[33,175021,175022],{"class":167}," docx_file 
",[33,175024,662],{"class":163},[33,175026,28924],{"class":50},[33,175028,602],{"class":167},[33,175030,40018],{"class":50},[33,175032,607],{"class":167},[33,175034,167589],{"class":54},[33,175036,175037],{"class":167},"))[:",[33,175039,1153],{"class":50},[33,175041,17477],{"class":167},[33,175043,175044,175046,175048,175050,175052],{"class":35,"line":201},[33,175045,18224],{"class":167},[33,175047,242],{"class":163},[33,175049,156340],{"class":167},[33,175051,1053],{"class":50},[33,175053,175054],{"class":167},"(docx_file))\n",[33,175056,175057,175059,175061,175063,175065,175067,175069,175071],{"class":35,"line":206},[33,175058,159437],{"class":167},[33,175060,242],{"class":163},[33,175062,57412],{"class":54},[33,175064,159444],{"class":167},[33,175066,6124],{"class":163},[33,175068,6127],{"class":167},[33,175070,662],{"class":163},[33,175072,159453],{"class":167},[33,175074,175075,175078,175080],{"class":35,"line":224},[33,175076,175077],{"class":167},"    matches ",[33,175079,242],{"class":163},[33,175081,175082],{"class":167}," unrendered_pattern.findall(full_text)\n",[33,175084,175085,175087],{"class":35,"line":229},[33,175086,617],{"class":163},[33,175088,175089],{"class":167}," matches:\n",[33,175091,175092,175094,175096,175098,175101,175103,175106,175108,175110,175112,175115,175117,175119],{"class":35,"line":235},[33,175093,9414],{"class":50},[33,175095,602],{"class":167},[33,175097,4059],{"class":163},[33,175099,175100],{"class":54},"\"UNRENDERED in 
",[33,175102,1115],{"class":50},[33,175104,175105],{"class":167},"docx_file.name",[33,175107,1121],{"class":50},[33,175109,2079],{"class":54},[33,175111,1115],{"class":50},[33,175113,175114],{"class":167},"matches",[33,175116,1121],{"class":50},[33,175118,274],{"class":54},[33,175120,221],{"class":167},[33,175122,175123,175125],{"class":35,"line":250},[33,175124,6864],{"class":163},[33,175126,574],{"class":167},[33,175128,175129,175131,175133,175135,175137,175139,175141,175143,175145],{"class":35,"line":266},[33,175130,9414],{"class":50},[33,175132,602],{"class":167},[33,175134,4059],{"class":163},[33,175136,57480],{"class":54},[33,175138,1115],{"class":50},[33,175140,175105],{"class":167},[33,175142,1121],{"class":50},[33,175144,274],{"class":54},[33,175146,221],{"class":167},[14,175148,41801,175149,175151],{},[30,175150,170875],{}," remaining in the output means a context key was missing at render time. Cross-reference column names printed in the diagnostic step.",[18,175153,175155],{"id":175154},"performance-scale-notes","Performance & Scale Notes",[4211,175157,175158,175169,175175,175181],{},[4214,175159,175160,2079,175162,175165,175166,175168],{},[1974,175161,4218],{},[30,175163,175164],{},"DocxTemplate"," loads the whole ",[30,175167,18051],{}," XML into memory. A typical 50 KB template uses ~2 MB of working memory after parsing. 500 parallel renders sit comfortably inside 1 GB.",[4214,175170,175171,175174],{},[1974,175172,175173],{},"Disk",": Each output file is roughly the size of the template plus text expansion. Budget ~70 KB per rendered invoice.",[4214,175176,175177,175180],{},[1974,175178,175179],{},"Bottleneck",": Disk write throughput dominates at scale, not CPU. 
Threads beat processes for I\u002FO-bound work.",[4214,175182,175183,175186,175187,175190],{},[1974,175184,175185],{},"Very large batches",": Chunk the DataFrame with ",[30,175188,175189],{},"pd.read_csv(DATA, chunksize=200)"," and process one chunk at a time to cap peak memory.",[18,175192,4271],{"id":4270},[4273,175194,175195,175205],{},[4276,175196,175197],{},[4279,175198,175199,175201,175203],{},[4282,175200,14317],{},[4282,175202,4287],{},[4282,175204,4290],{},[4292,175206,175207,175224,175241,175257,175274],{},[4279,175208,175209,175213,175216],{},[4297,175210,175211],{},[30,175212,170129],{},[4297,175214,175215],{},"Context dict missing a key that the template references",[4297,175217,4358,175218,175221,175222],{},[30,175219,175220],{},"ctx.setdefault('xxx', '')"," or align column names; see ",[940,175223,170119],{"href":172864},[4279,175225,175226,175232,175235],{},[4297,175227,175228,175231],{},[30,175229,175230],{},"XMLSyntaxError"," on render",[4297,175233,175234],{},"Placeholder split across XML runs in the template",[4297,175236,175237,175238,175240],{},"Retype the ",[30,175239,170875],{}," tag as one continuous run in Word",[4279,175242,175243,175245,175251],{},[4297,175244,74636],{},[4297,175246,175247,175250],{},[30,175248,175249],{},"tpl.save()"," path parent directory does not exist",[4297,175252,74566,175253,175256],{},[30,175254,175255],{},"output_path.parent.mkdir(parents=True, exist_ok=True)"," before saving",[4279,175258,175259,175262,175265],{},[4297,175260,175261],{},"Table loop only renders one row",[4297,175263,175264],{},"Loop tag is in a paragraph, not a table row",[4297,175266,175267,175268,36608,175271,175273],{},"Move ",[30,175269,175270],{},"{%tr for %}",[30,175272,173951],{}," into separate Word table rows",[4279,175275,175276,175282,175287],{},[4297,175277,175278,175279],{},"Date column renders as ",[30,175280,175281],{},"2026-06-01 00:00:00",[4297,175283,118012,175284,175286],{},[30,175285,129076],{}," not formatted before 
render",[4297,175288,11870,175289,8363,175292],{},[30,175290,175291],{},".strftime(\"%B %d, %Y\")",[30,175293,170647],{},[18,175295,4402],{"id":4401},[23,175297,175299],{"className":126,"code":175298,"language":47,"meta":28,"style":28},"# pip install docxtpl pandas openpyxl\n\"\"\"\nmailmerge_batch.py — render one .docx per CSV\u002FExcel row.\nUsage: python mailmerge_batch.py --data recipients.csv --template letter_template.docx --out output\u002F\n\"\"\"\nimport argparse\nimport logging\nfrom pathlib import Path\n\nimport pandas as pd\nfrom docxtpl import DocxTemplate\n\nlogging.basicConfig(\n    level=logging.INFO,\n    format=\"%(asctime)s | %(levelname)s | %(message)s\",\n    handlers=[logging.FileHandler(\"merge.log\"), logging.StreamHandler()],\n)\n\n\ndef build_context(row: pd.Series) -> dict:\n    ctx: dict = row.to_dict()\n    try:\n        ctx[\"invoice_date\"] = pd.to_datetime(ctx[\"invoice_date\"]).strftime(\"%B %d, %Y\")\n    except (ValueError, TypeError, KeyError):\n        ctx[\"invoice_date\"] = str(ctx.get(\"invoice_date\", \"\"))\n    try:\n        ctx[\"total_due\"] = f\"${float(ctx['total_due']):,.2f}\"\n    except (ValueError, TypeError, KeyError):\n        ctx[\"total_due\"] = \"$0.00\"\n    ctx.setdefault(\"tier\", \"standard\")\n    ctx[\"line_items\"] = ctx.get(\"line_items\", [])\n    return ctx\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Batch mail merge with docxtpl\")\n    parser.add_argument(\"--data\",     required=True, help=\"CSV or .xlsx data source\")\n    parser.add_argument(\"--template\", required=True, help=\"Path to .docx template\")\n    parser.add_argument(\"--out\",      required=True, help=\"Output directory\")\n    parser.add_argument(\"--key\",      default=\"client_id\", help=\"Column for output filename\")\n    args = parser.parse_args()\n\n    template_path = Path(args.template)\n    out_dir       = Path(args.out)\n    data_path     = Path(args.data)\n    out_dir.mkdir(parents=True, 
exist_ok=True)\n\n    if not template_path.exists():\n        raise FileNotFoundError(f\"Template not found: {template_path}\")\n\n    # Load data — supports both CSV and Excel\n    try:\n        if data_path.suffix.lower() in (\".xlsx\", \".xls\"):\n            df = pd.read_excel(data_path, engine=\"openpyxl\")\n        else:\n            df = pd.read_csv(data_path)\n    except Exception as exc:\n        raise SystemExit(f\"Failed to load data: {exc}\") from exc\n\n    logging.info(\"Loaded %d rows from %s\", len(df), data_path)\n    ok = 0\n\n    for idx, row in df.iterrows():\n        ctx      = build_context(row)\n        safe_key = str(ctx.get(args.key, idx)).replace(\" \", \"_\").replace(\"\u002F\", \"-\")\n        out_path = out_dir \u002F f\"doc_{safe_key}.docx\"\n        try:\n            tpl = DocxTemplate(str(template_path))\n            tpl.render(ctx)\n            tpl.save(str(out_path))\n            ok += 1\n            logging.info(\"OK  %s\", out_path.name)\n        except Exception as exc:\n            logging.error(\"FAIL row %d (%s): %s\", idx, safe_key, exc)\n\n    logging.info(\"Batch complete: %d\u002F%d rendered\", ok, len(df))\n\n\nif __name__ == \"__main__\":\n    
main()\n",[30,175300,175301,175305,175309,175314,175319,175323,175329,175335,175345,175349,175359,175369,175373,175377,175389,175411,175423,175427,175431,175435,175447,175457,175463,175487,175505,175527,175533,175561,175579,175592,175604,175620,175626,175630,175634,175646,175663,175688,175714,175738,175764,175772,175776,175786,175795,175804,175824,175828,175837,175860,175864,175869,175875,175895,175912,175918,175927,175937,175964,175968,175989,175997,176001,176011,176020,176048,176072,176078,176091,176096,176105,176113,176126,176136,176159,176163,176185,176189,176193,176205],{"__ignoreMap":28},[33,175302,175303],{"class":35,"line":36},[33,175304,172261],{"class":39},[33,175306,175307],{"class":35,"line":43},[33,175308,139],{"class":54},[33,175310,175311],{"class":35,"line":61},[33,175312,175313],{"class":54},"mailmerge_batch.py — render one .docx per CSV\u002FExcel row.\n",[33,175315,175316],{"class":35,"line":73},[33,175317,175318],{"class":54},"Usage: python mailmerge_batch.py --data recipients.csv --template letter_template.docx --out 
output\u002F\n",[33,175320,175321],{"class":35,"line":88},[33,175322,139],{"class":54},[33,175324,175325,175327],{"class":35,"line":95},[33,175326,164],{"class":163},[33,175328,4461],{"class":167},[33,175330,175331,175333],{"class":35,"line":101},[33,175332,164],{"class":163},[33,175334,184],{"class":167},[33,175336,175337,175339,175341,175343],{"class":35,"line":171},[33,175338,190],{"class":163},[33,175340,193],{"class":167},[33,175342,164],{"class":163},[33,175344,198],{"class":167},[33,175346,175347],{"class":35,"line":179},[33,175348,92],{"emptyLinePlaceholder":91},[33,175350,175351,175353,175355,175357],{"class":35,"line":187},[33,175352,164],{"class":163},[33,175354,492],{"class":167},[33,175356,495],{"class":163},[33,175358,498],{"class":167},[33,175360,175361,175363,175365,175367],{"class":35,"line":201},[33,175362,190],{"class":163},[33,175364,20437],{"class":167},[33,175366,164],{"class":163},[33,175368,20442],{"class":167},[33,175370,175371],{"class":35,"line":206},[33,175372,92],{"emptyLinePlaceholder":91},[33,175374,175375],{"class":35,"line":224},[33,175376,232],{"class":167},[33,175378,175379,175381,175383,175385,175387],{"class":35,"line":229},[33,175380,253],{"class":238},[33,175382,242],{"class":163},[33,175384,258],{"class":167},[33,175386,1067],{"class":50},[33,175388,247],{"class":167},[33,175390,175391,175393,175395,175397,175399,175401,175403,175405,175407,175409],{"class":35,"line":235},[33,175392,269],{"class":238},[33,175394,242],{"class":163},[33,175396,274],{"class":54},[33,175398,277],{"class":50},[33,175400,26814],{"class":54},[33,175402,26817],{"class":50},[33,175404,26814],{"class":54},[33,175406,26827],{"class":50},[33,175408,274],{"class":54},[33,175410,247],{"class":167},[33,175412,175413,175415,175417,175419,175421],{"class":35,"line":250},[33,175414,26852],{"class":238},[33,175416,242],{"class":163},[33,175418,127802],{"class":167},[33,175420,173062],{"class":54},[33,175422,127808],{"class":167},[33,175424,175425],{"class":35,"l
ine":266},[33,175426,221],{"class":167},[33,175428,175429],{"class":35,"line":290},[33,175430,92],{"emptyLinePlaceholder":91},[33,175432,175433],{"class":35,"line":295},[33,175434,92],{"emptyLinePlaceholder":91},[33,175436,175437,175439,175441,175443,175445],{"class":35,"line":300},[33,175438,562],{"class":163},[33,175440,170682],{"class":46},[33,175442,170685],{"class":167},[33,175444,37100],{"class":50},[33,175446,574],{"class":167},[33,175448,175449,175451,175453,175455],{"class":35,"line":317},[33,175450,170694],{"class":167},[33,175452,37100],{"class":50},[33,175454,212],{"class":163},[33,175456,170701],{"class":167},[33,175458,175459,175461],{"class":35,"line":332},[33,175460,2424],{"class":163},[33,175462,574],{"class":167},[33,175464,175465,175467,175469,175471,175473,175475,175477,175479,175481,175483,175485],{"class":35,"line":347},[33,175466,172645],{"class":167},[33,175468,172648],{"class":54},[33,175470,763],{"class":167},[33,175472,242],{"class":163},[33,175474,172655],{"class":167},[33,175476,172648],{"class":54},[33,175478,172660],{"class":167},[33,175480,172663],{"class":54},[33,175482,916],{"class":50},[33,175484,172668],{"class":54},[33,175486,221],{"class":167},[33,175488,175489,175491,175493,175495,175497,175499,175501,175503],{"class":35,"line":374},[33,175490,2449],{"class":163},[33,175492,17583],{"class":167},[33,175494,95615],{"class":50},[33,175496,365],{"class":167},[33,175498,86188],{"class":50},[33,175500,365],{"class":167},[33,175502,8377],{"class":50},[33,175504,1737],{"class":167},[33,175506,175507,175509,175511,175513,175515,175517,175519,175521,175523,175525],{"class":35,"line":397},[33,175508,172645],{"class":167},[33,175510,172648],{"class":54},[33,175512,763],{"class":167},[33,175514,242],{"class":163},[33,175516,7887],{"class":50},[33,175518,172699],{"class":167},[33,175520,172648],{"class":54},[33,175522,365],{"class":167},[33,175524,3198],{"class":54},[33,175526,371],{"class":167},[33,175528,175529,175531],{"class":35,"line":6
53},[33,175530,2424],{"class":163},[33,175532,574],{"class":167},[33,175534,175535,175537,175539,175541,175543,175545,175547,175549,175551,175553,175555,175557,175559],{"class":35,"line":667},[33,175536,172645],{"class":167},[33,175538,172729],{"class":54},[33,175540,763],{"class":167},[33,175542,242],{"class":163},[33,175544,1110],{"class":163},[33,175546,18820],{"class":54},[33,175548,88861],{"class":50},[33,175550,172742],{"class":167},[33,175552,172745],{"class":54},[33,175554,18798],{"class":167},[33,175556,28440],{"class":163},[33,175558,1121],{"class":50},[33,175560,7504],{"class":54},[33,175562,175563,175565,175567,175569,175571,175573,175575,175577],{"class":35,"line":675},[33,175564,2449],{"class":163},[33,175566,17583],{"class":167},[33,175568,95615],{"class":50},[33,175570,365],{"class":167},[33,175572,86188],{"class":50},[33,175574,365],{"class":167},[33,175576,8377],{"class":50},[33,175578,1737],{"class":167},[33,175580,175581,175583,175585,175587,175589],{"class":35,"line":689},[33,175582,172645],{"class":167},[33,175584,172729],{"class":54},[33,175586,763],{"class":167},[33,175588,242],{"class":163},[33,175590,175591],{"class":54}," 
\"$0.00\"\n",[33,175593,175594,175596,175598,175600,175602],{"class":35,"line":703},[33,175595,170758],{"class":167},[33,175597,172806],{"class":54},[33,175599,365],{"class":167},[33,175601,170766],{"class":54},[33,175603,221],{"class":167},[33,175605,175606,175608,175610,175612,175614,175616,175618],{"class":35,"line":714},[33,175607,172826],{"class":167},[33,175609,172829],{"class":54},[33,175611,763],{"class":167},[33,175613,242],{"class":163},[33,175615,172836],{"class":167},[33,175617,172829],{"class":54},[33,175619,172841],{"class":167},[33,175621,175622,175624],{"class":35,"line":723},[33,175623,1332],{"class":163},[33,175625,170813],{"class":167},[33,175627,175628],{"class":35,"line":754},[33,175629,92],{"emptyLinePlaceholder":91},[33,175631,175632],{"class":35,"line":771},[33,175633,92],{"emptyLinePlaceholder":91},[33,175635,175636,175638,175640,175642,175644],{"class":35,"line":777},[33,175637,562],{"class":163},[33,175639,6636],{"class":46},[33,175641,568],{"class":167},[33,175643,571],{"class":50},[33,175645,574],{"class":167},[33,175647,175648,175650,175652,175654,175656,175658,175661],{"class":35,"line":788},[33,175649,6648],{"class":167},[33,175651,242],{"class":163},[33,175653,6653],{"class":167},[33,175655,6656],{"class":238},[33,175657,242],{"class":163},[33,175659,175660],{"class":54},"\"Batch mail merge with docxtpl\"",[33,175662,221],{"class":167},[33,175664,175665,175667,175669,175671,175673,175675,175677,175679,175681,175683,175686],{"class":35,"line":804},[33,175666,6669],{"class":167},[33,175668,64452],{"class":54},[33,175670,25539],{"class":167},[33,175672,25448],{"class":238},[33,175674,242],{"class":163},[33,175676,855],{"class":50},[33,175678,365],{"class":167},[33,175680,25463],{"class":238},[33,175682,242],{"class":163},[33,175684,175685],{"class":54},"\"CSV or .xlsx data 
source\"",[33,175687,221],{"class":167},[33,175689,175690,175692,175695,175697,175699,175701,175703,175705,175707,175709,175712],{"class":35,"line":809},[33,175691,6669],{"class":167},[33,175693,175694],{"class":54},"\"--template\"",[33,175696,365],{"class":167},[33,175698,25448],{"class":238},[33,175700,242],{"class":163},[33,175702,855],{"class":50},[33,175704,365],{"class":167},[33,175706,25463],{"class":238},[33,175708,242],{"class":163},[33,175710,175711],{"class":54},"\"Path to .docx template\"",[33,175713,221],{"class":167},[33,175715,175716,175718,175720,175722,175724,175726,175728,175730,175732,175734,175736],{"class":35,"line":819},[33,175717,6669],{"class":167},[33,175719,41152],{"class":54},[33,175721,121141],{"class":167},[33,175723,25448],{"class":238},[33,175725,242],{"class":163},[33,175727,855],{"class":50},[33,175729,365],{"class":167},[33,175731,25463],{"class":238},[33,175733,242],{"class":163},[33,175735,25501],{"class":54},[33,175737,221],{"class":167},[33,175739,175740,175742,175745,175747,175749,175751,175753,175755,175757,175759,175762],{"class":35,"line":829},[33,175741,6669],{"class":167},[33,175743,175744],{"class":54},"\"--key\"",[33,175746,121141],{"class":167},[33,175748,6685],{"class":238},[33,175750,242],{"class":163},[33,175752,173615],{"class":54},[33,175754,365],{"class":167},[33,175756,25463],{"class":238},[33,175758,242],{"class":163},[33,175760,175761],{"class":54},"\"Column for output filename\"",[33,175763,221],{"class":167},[33,175765,175766,175768,175770],{"class":35,"line":834},[33,175767,6766],{"class":167},[33,175769,242],{"class":163},[33,175771,6771],{"class":167},[33,175773,175774],{"class":35,"line":839},[33,175775,92],{"emptyLinePlaceholder":91},[33,175777,175778,175781,175783],{"class":35,"line":860},[33,175779,175780],{"class":167},"    template_path ",[33,175782,242],{"class":163},[33,175784,175785],{"class":167}," 
Path(args.template)\n",[33,175787,175788,175791,175793],{"class":35,"line":887},[33,175789,175790],{"class":167},"    out_dir       ",[33,175792,242],{"class":163},[33,175794,64559],{"class":167},[33,175796,175797,175800,175802],{"class":35,"line":907},[33,175798,175799],{"class":167},"    data_path     ",[33,175801,242],{"class":163},[33,175803,64550],{"class":167},[33,175805,175806,175808,175810,175812,175814,175816,175818,175820,175822],{"class":35,"line":1826},[33,175807,28258],{"class":167},[33,175809,869],{"class":238},[33,175811,242],{"class":163},[33,175813,855],{"class":50},[33,175815,365],{"class":167},[33,175817,878],{"class":238},[33,175819,242],{"class":163},[33,175821,855],{"class":50},[33,175823,221],{"class":167},[33,175825,175826],{"class":35,"line":1844},[33,175827,92],{"emptyLinePlaceholder":91},[33,175829,175830,175832,175834],{"class":35,"line":1858},[33,175831,617],{"class":163},[33,175833,620],{"class":163},[33,175835,175836],{"class":167}," template_path.exists():\n",[33,175838,175839,175841,175843,175845,175847,175849,175851,175854,175856,175858],{"class":35,"line":1871},[33,175840,4051],{"class":163},[33,175842,2945],{"class":50},[33,175844,602],{"class":167},[33,175846,4059],{"class":163},[33,175848,20538],{"class":54},[33,175850,1115],{"class":50},[33,175852,175853],{"class":167},"template_path",[33,175855,1121],{"class":50},[33,175857,274],{"class":54},[33,175859,221],{"class":167},[33,175861,175862],{"class":35,"line":1877},[33,175863,92],{"emptyLinePlaceholder":91},[33,175865,175866],{"class":35,"line":1883},[33,175867,175868],{"class":39},"    # Load data — supports both CSV and Excel\n",[33,175870,175871,175873],{"class":35,"line":1915},[33,175872,2424],{"class":163},[33,175874,574],{"class":167},[33,175876,175877,175879,175882,175884,175886,175888,175890,175893],{"class":35,"line":1926},[33,175878,8221],{"class":163},[33,175880,175881],{"class":167}," data_path.suffix.lower() 
",[33,175883,662],{"class":163},[33,175885,17583],{"class":167},[33,175887,27374],{"class":54},[33,175889,365],{"class":167},[33,175891,175892],{"class":54},"\".xls\"",[33,175894,1737],{"class":167},[33,175896,175897,175899,175901,175904,175906,175908,175910],{"class":35,"line":1932},[33,175898,51528],{"class":167},[33,175900,242],{"class":163},[33,175902,175903],{"class":167}," pd.read_excel(data_path, ",[33,175905,17351],{"class":238},[33,175907,242],{"class":163},[33,175909,17356],{"class":54},[33,175911,221],{"class":167},[33,175913,175914,175916],{"class":35,"line":1938},[33,175915,41290],{"class":163},[33,175917,574],{"class":167},[33,175919,175920,175922,175924],{"class":35,"line":1950},[33,175921,51528],{"class":167},[33,175923,242],{"class":163},[33,175925,175926],{"class":167}," pd.read_csv(data_path)\n",[33,175928,175929,175931,175933,175935],{"class":35,"line":1958},[33,175930,2449],{"class":163},[33,175932,783],{"class":50},[33,175934,1852],{"class":163},[33,175936,1855],{"class":167},[33,175938,175939,175941,175943,175945,175947,175950,175952,175954,175956,175958,175960,175962],{"class":35,"line":4904},[33,175940,4051],{"class":163},[33,175942,16617],{"class":50},[33,175944,602],{"class":167},[33,175946,4059],{"class":163},[33,175948,175949],{"class":54},"\"Failed to load data: ",[33,175951,1115],{"class":50},[33,175953,6565],{"class":167},[33,175955,1121],{"class":50},[33,175957,274],{"class":54},[33,175959,1649],{"class":167},[33,175961,190],{"class":163},[33,175963,20843],{"class":167},[33,175965,175966],{"class":35,"line":4909},[33,175967,92],{"emptyLinePlaceholder":91},[33,175969,175970,175972,175974,175976,175978,175980,175982,175984,175986],{"class":35,"line":4915},[33,175971,173542],{"class":167},[33,175973,96187],{"class":54},[33,175975,916],{"class":50},[33,175977,96199],{"class":54},[33,175979,309],{"class":50},[33,175981,274],{"class":54},[33,175983,365],{"class":167},[33,175985,928],{"class":50},[33,175987,175988],{"class":167},"(df), 
data_path)\n",[33,175990,175991,175993,175995],{"class":35,"line":4925},[33,175992,79390],{"class":167},[33,175994,242],{"class":163},[33,175996,28914],{"class":50},[33,175998,175999],{"class":35,"line":4935},[33,176000,92],{"emptyLinePlaceholder":91},[33,176002,176003,176005,176007,176009],{"class":35,"line":4941},[33,176004,656],{"class":163},[33,176006,173582],{"class":167},[33,176008,662],{"class":163},[33,176010,8565],{"class":167},[33,176012,176013,176016,176018],{"class":35,"line":4950},[33,176014,176015],{"class":167},"        ctx      ",[33,176017,242],{"class":163},[33,176019,173596],{"class":167},[33,176021,176022,176025,176027,176029,176032,176034,176036,176038,176040,176042,176044,176046],{"class":35,"line":4960},[33,176023,176024],{"class":167},"        safe_key ",[33,176026,242],{"class":163},[33,176028,7887],{"class":50},[33,176030,176031],{"class":167},"(ctx.get(args.key, idx)).replace(",[33,176033,17294],{"class":54},[33,176035,365],{"class":167},[33,176037,7764],{"class":54},[33,176039,173627],{"class":167},[33,176041,173630],{"class":54},[33,176043,365],{"class":167},[33,176045,75122],{"class":54},[33,176047,221],{"class":167},[33,176049,176050,176052,176054,176056,176058,176060,176063,176065,176068,176070],{"class":35,"line":4965},[33,176051,79122],{"class":167},[33,176053,242],{"class":163},[33,176055,40669],{"class":167},[33,176057,1351],{"class":163},[33,176059,1110],{"class":163},[33,176061,176062],{"class":54},"\"doc_",[33,176064,1115],{"class":50},[33,176066,176067],{"class":167},"safe_key",[33,176069,1121],{"class":50},[33,176071,18215],{"class":54},[33,176073,176074,176076],{"class":35,"line":4971},[33,176075,670],{"class":163},[33,176077,574],{"class":167},[33,176079,176080,176083,176085,176087,176089],{"class":35,"line":4983},[33,176081,176082],{"class":167},"            tpl 
",[33,176084,242],{"class":163},[33,176086,170263],{"class":167},[33,176088,1053],{"class":50},[33,176090,171272],{"class":167},[33,176092,176093],{"class":35,"line":4988},[33,176094,176095],{"class":167},"            tpl.render(ctx)\n",[33,176097,176098,176101,176103],{"class":35,"line":4993},[33,176099,176100],{"class":167},"            tpl.save(",[33,176102,1053],{"class":50},[33,176104,161046],{"class":167},[33,176106,176107,176109,176111],{"class":35,"line":5003},[33,176108,87640],{"class":167},[33,176110,28976],{"class":163},[33,176112,17709],{"class":50},[33,176114,176115,176117,176119,176121,176123],{"class":35,"line":5008},[33,176116,134649],{"class":167},[33,176118,173691],{"class":54},[33,176120,309],{"class":50},[33,176122,274],{"class":54},[33,176124,176125],{"class":167},", out_path.name)\n",[33,176127,176128,176130,176132,176134],{"class":35,"line":5014},[33,176129,780],{"class":163},[33,176131,783],{"class":50},[33,176133,1852],{"class":163},[33,176135,1855],{"class":167},[33,176137,176138,176141,176144,176146,176148,176150,176152,176154,176156],{"class":35,"line":5019},[33,176139,176140],{"class":167},"            logging.error(",[33,176142,176143],{"class":54},"\"FAIL row ",[33,176145,916],{"class":50},[33,176147,17583],{"class":54},[33,176149,309],{"class":50},[33,176151,86841],{"class":54},[33,176153,309],{"class":50},[33,176155,274],{"class":54},[33,176157,176158],{"class":167},", idx, safe_key, exc)\n",[33,176160,176161],{"class":35,"line":5032},[33,176162,92],{"emptyLinePlaceholder":91},[33,176164,176165,176167,176170,176172,176174,176176,176179,176181,176183],{"class":35,"line":5039},[33,176166,173542],{"class":167},[33,176168,176169],{"class":54},"\"Batch complete: ",[33,176171,916],{"class":50},[33,176173,1351],{"class":54},[33,176175,916],{"class":50},[33,176177,176178],{"class":54}," 
rendered\"",[33,176180,173719],{"class":167},[33,176182,928],{"class":50},[33,176184,128027],{"class":167},[33,176186,176187],{"class":35,"line":5068},[33,176188,92],{"emptyLinePlaceholder":91},[33,176190,176191],{"class":35,"line":5077},[33,176192,92],{"emptyLinePlaceholder":91},[33,176194,176195,176197,176199,176201,176203],{"class":35,"line":5082},[33,176196,2491],{"class":163},[33,176198,2494],{"class":50},[33,176200,2497],{"class":163},[33,176202,2500],{"class":54},[33,176204,574],{"class":167},[33,176206,176207],{"class":35,"line":5089},[33,176208,6914],{"class":167},[14,176210,176211],{},"Run:",[23,176213,176215],{"className":25,"code":176214,"language":27,"meta":28,"style":28},"python mailmerge_batch.py \\\n  --data project\u002Fdata\u002Frecipients.csv \\\n  --template project\u002Ftemplates\u002Fletter_template.docx \\\n  --out project\u002Foutput \\\n  --key client_id\n",[30,176216,176217,176226,176236,176246,176256],{"__ignoreMap":28},[33,176218,176219,176221,176224],{"class":35,"line":36},[33,176220,47],{"class":46},[33,176222,176223],{"class":54}," mailmerge_batch.py",[33,176225,26120],{"class":50},[33,176227,176228,176231,176234],{"class":35,"line":43},[33,176229,176230],{"class":50},"  --data",[33,176232,176233],{"class":54}," project\u002Fdata\u002Frecipients.csv",[33,176235,26120],{"class":50},[33,176237,176238,176241,176244],{"class":35,"line":61},[33,176239,176240],{"class":50},"  --template",[33,176242,176243],{"class":54}," project\u002Ftemplates\u002Fletter_template.docx",[33,176245,26120],{"class":50},[33,176247,176248,176251,176254],{"class":35,"line":73},[33,176249,176250],{"class":50},"  --out",[33,176252,176253],{"class":54}," project\u002Foutput",[33,176255,26120],{"class":50},[33,176257,176258,176261],{"class":35,"line":88},[33,176259,176260],{"class":50},"  --key",[33,176262,176263],{"class":54}," 
client_id\n",[18,176265,6918],{"id":6917},[4211,176267,176268,176277,176282,176287],{},[4214,176269,176270,176272,176273,176276],{},[940,176271,170119],{"href":172864}," — diagnose and fix ",[30,176274,176275],{},"UndefinedError: 'xxx' is undefined"," during render",[4214,176278,176279,176281],{},[940,176280,156152],{"href":26562}," — foundational python-docx patterns for building documents programmatically",[4214,176283,176284,176286],{},[940,176285,99577],{"href":99576}," — engine selection and multi-sheet loading for Excel data sources",[4214,176288,176289,176291],{},[940,176290,107447],{"href":9598}," — fix encoding issues and mixed-type columns before feeding into the render loop",[14,176293,6947,176294,3035],{},[940,176295,26263],{"href":26262},[6953,176297,176298],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .s691h, html code.shiki 
.s691h{--shiki-default:#22863A;--shiki-default-font-weight:bold}",{"title":28,"searchDepth":43,"depth":43,"links":176300},[176301,176302,176303,176304,176305,176306,176307,176308,176309,176314,176315,176316,176317,176318],{"id":20,"depth":43,"text":21},{"id":172333,"depth":43,"text":172334},{"id":172507,"depth":43,"text":172508},{"id":172563,"depth":43,"text":172564},{"id":172867,"depth":43,"text":172868},{"id":172959,"depth":43,"text":172960},{"id":173758,"depth":43,"text":173759},{"id":173924,"depth":43,"text":173925},{"id":12943,"depth":43,"text":12944,"children":176310},[176311,176312,176313],{"id":174238,"depth":61,"text":174239},{"id":174359,"depth":61,"text":174360},{"id":174759,"depth":61,"text":174760},{"id":174926,"depth":43,"text":174927},{"id":175154,"depth":43,"text":175155},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":6917,"depth":43,"text":6918},"Word's built-in mail merge tops out fast: it requires a running Word instance, chokes on conditional logic beyond simple if\u002Felse, and produces no audit trail. 
Python with docxtpl replaces it with a repeatable script that renders hundreds of documents in a single pass — no GUI, no manual field mapping, no version drift between runs.",{},"\u002Fword-document-templating-batch-processing\u002Fdynamic-mail-merge-with-python",{"title":26185,"description":176319},"word-document-templating-batch-processing\u002Fdynamic-mail-merge-with-python\u002Findex","1oqVSWZIfATEqlRzHwiMUwwuNittopoYUtwEKOfi2Vg",{"id":176326,"title":26263,"body":176327,"breadcrumbTitle":179024,"canonical":6977,"date":46387,"description":179025,"draft":6980,"extension":6981,"image":6977,"meta":179026,"navigation":91,"path":179027,"robots":6977,"seo":179028,"seoTitle":179029,"stem":179030,"tags":179031,"updatedAt":6978,"__hash__":179032},"content\u002Fword-document-templating-batch-processing\u002Findex.md",{"type":7,"value":176328,"toc":179010},[176329,176332,176338,176342,176345,176434,176440,176443,176445,176448,176585,176588,176590,176596,176659,176665,176668,176748,176752,176775,176778,176820,176824,176833,177201,177217,177244,177248,177268,177532,177553,177557,177564,178084,178109,178115,178125,178129,178137,178435,178449,178451,178454,178490,178816,178819,178831,178833,178928,178930,178939,178948,178962,178971,178980,178982,179004,179008],[10,176330,26263],{"id":176331},"word-document-templating-batch-processing",[14,176333,176334,176335,176337],{},"Manual Word workflows fail the moment volume arrives. Copy a template, paste in a client name, fix the date, save with a unique filename, repeat 400 times — and every repetition is a chance for a stale figure, a broken style, or a typo in a contract clause. The work does not scale linearly: it scales with the number of fields times the number of records, and human attention degrades long before the batch finishes. Python replaces that loop with a deterministic pipeline. 
You author one template, bind it to a row of structured data, and render an identical-quality ",[30,176336,18051],{}," per record — auditable, repeatable, and fast. This guide covers the full path: designing templates, choosing the right library, ingesting CSV\u002FExcel\u002FJSON data, binding context, looping over records, exporting to PDF, and hardening the job for unattended scheduled runs.",[18,176339,176341],{"id":176340},"how-the-pipeline-fits-together","How the pipeline fits together",[14,176343,176344],{},"Every batch job is the same shape: a static template plus a table of variable data, fed through a render step, emitted as one file per row, with an optional PDF conversion at the end. Hold this diagram in mind for the rest of the page — each later section maps to one box.",[2540,176346,2547,176348,2547,176351,2547,176354,2547,176368,2547,176370,2547,176372,2547,176374,2547,176376,2547,176379,2547,176382,2547,176385,2547,176387,2547,176390,2547,176393,2547,176396,2547,176399,2547,176401,2547,176403,2547,176406,2547,176409,2547,176412,2547,176414,2547,176416,2547,176419,2547,176422,2547,176425,2547,176427,2547,176431],{"viewBox":11071,"role":2543,"ariaLabel":176347,"xmlns":2545,"style":2546},"Word batch pipeline: template and data sources flow into a render loop that emits one docx per row, then optional PDF conversion",[2549,176349,176350],{},"Word templating and batch pipeline",[2553,176352,176353],{},"A .docx template and CSV, Excel, or JSON data feed a render loop that produces one Word document per row, with an optional headless PDF conversion 
step.",[2557,176355,2559,176356,2559,176363,2547],{},[2561,176357,2564,176359,2564,176361,2559],{"id":176358,"x1":748,"y1":748,"x2":734,"y2":748},"word-pillar-grad",[2566,176360],{"offset":748,"style":2568},[2566,176362],{"offset":734,"style":2571},[2573,176364,2564,176366,2559],{"id":176365,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"word-pillar-arrow",[2580,176367],{"d":2582,"fill":2583},[2585,176369],{"x":2587,"y":1543,"width":58337,"height":2590,"rx":3545,"fill":2592,"stroke":2593,"style":2594},[2000,176371,172896],{"x":2679,"y":38740,"fill":2599,"style":16983},[2000,176373,172245],{"x":2679,"y":26326,"fill":2583,"style":2685},[2585,176375],{"x":2587,"y":2589,"width":58337,"height":2590,"rx":3545,"fill":2592,"stroke":2593,"style":2594},[2000,176377,176378],{"x":2679,"y":11218,"fill":2599,"style":16983},"data source",[2000,176380,176381],{"x":2679,"y":152449,"fill":2583,"style":2685},"CSV \u002F Excel \u002F JSON",[35,176383],{"x1":2611,"y1":2590,"x2":11231,"y2":2679,"stroke":2583,"markerEnd":176384,"style":2594},"url(#word-pillar-arrow)",[35,176386],{"x1":2611,"y1":2635,"x2":11231,"y2":2589,"stroke":2583,"markerEnd":176384,"style":2594},[2585,176388],{"x":11231,"y":16991,"width":58337,"height":2597,"rx":3545,"fill":176389,"stroke":2593,"style":2594},"url(#word-pillar-grad)",[2000,176391,176392],{"x":2626,"y":102546,"fill":2599,"style":16983},"render loop",[2000,176394,176395],{"x":2626,"y":26402,"fill":2583,"style":2685},"bind context",[2000,176397,176398],{"x":2626,"y":2609,"fill":2583,"style":2685},"per row",[35,176400],{"x1":58352,"y1":2629,"x2":49853,"y2":2629,"stroke":2583,"markerEnd":176384,"style":2594},[2585,176402],{"x":49853,"y":16991,"width":58337,"height":2597,"rx":3545,"fill":2592,"stroke":2593,"style":2594},[2000,176404,176405],{"x":158146,"y":102546,"fill":2599,"style":16983},"out\u002FINV_001.docx",[2000,176407,176408],{"x":158146,"y":26402,"fill":2583,"style":2685},"one 
file",[2000,176410,176411],{"x":158146,"y":2609,"fill":2583,"style":2685},"per record",[35,176413],{"x1":158146,"y1":2598,"x2":158146,"y2":2701,"stroke":2583,"markerEnd":176384,"style":2594},[2585,176415],{"x":26414,"y":2701,"width":2611,"height":38748,"rx":3545,"fill":2615,"stroke":2593,"style":2694},[2000,176417,176418],{"x":71573,"y":38722,"fill":2599,"style":16983},"optional: PDF",[2000,176420,167412],{"x":71573,"y":176421,"fill":2583,"style":2685},"266",[2000,176423,176424],{"x":71573,"y":64939,"fill":2583,"style":2685},"or docx2pdf",[2585,176426],{"x":2587,"y":17008,"width":59959,"height":2680,"rx":3545,"fill":2615,"stroke":2593,"style":2594},[2000,176428,176430],{"x":26354,"y":176429,"fill":2599,"style":2600},"262","production hardening",[2000,176432,176433],{"x":26354,"y":49839,"fill":2583,"style":2685},"cron \u002F CI · logging · retries · dedup",[14,176435,176436,176437,176439],{},"The four phases — template design, data binding, render loop, output — stay constant whether you generate one offer letter or fifty thousand. What changes at scale is everything around the loop: memory discipline, logging, idempotent naming, and recovery from partial failure. A single-document script and a nightly batch share the same render call but almost nothing else; the gap between them is where most teams lose time. If you are still finding your footing with the single-document case, ",[940,176438,156152],{"href":26562}," covers library selection and the structural API before you scale it into a batch.",[14,176441,176442],{},"The mental model worth internalizing is separation of concerns. The template owns layout and styling. The data source owns content. The script owns the mapping between them and the orchestration around the loop. When those three stay decoupled, a marketing change to the letterhead never touches your code, a new column in the data never breaks rendering, and a bug in the loop never corrupts the template. 
The sections below walk each box of the diagram in that order, with one substantial, runnable snippet per phase that you can lift directly into a project.",[18,176444,26469],{"id":26468},[14,176446,176447],{},"There is no single \"Word library\" in Python. The ecosystem splits cleanly by job: structural editing versus templating versus format conversion. Pick by the question you are answering, not by popularity.",[4273,176449,176450,176462],{},[4276,176451,176452],{},[4279,176453,176454,176456,176458,176460],{},[4282,176455,26485],{},[4282,176457,64975],{},[4282,176459,26491],{},[4282,176461,26494],{},[4292,176463,176464,176483,176507,176529,176548,176567],{},[4279,176465,176466,176470,176476,176480],{},[4297,176467,176468],{},[940,176469,18041],{"href":26562},[4297,176471,176472,176473,176475],{},"Building or editing ",[30,176474,18051],{}," structurally — paragraphs, tables, runs, styles, core metadata",[4297,176477,176478],{},[30,176479,26570],{},[4297,176481,176482],{},"Filling a designer-authored template; its API rebuilds documents node by node and loses layout nuance",[4279,176484,176485,176489,176499,176504],{},[4297,176486,176487],{},[940,176488,18047],{"href":18040},[4297,176490,176491,176492,10065,176495,176498],{},"Rendering a Word-authored template with ",[30,176493,176494],{},"{{ vars }}",[30,176496,176497],{},"{% loops %}"," while preserving every style",[4297,176500,176501],{},[30,176502,176503],{},"pip install docxtpl",[4297,176505,176506],{},"Generating documents from scratch with no template; you would be fighting it",[4279,176508,176509,176514,176517,176523],{},[4297,176510,176511],{},[940,176512,176513],{"href":18040},"Jinja2",[4297,176515,176516],{},"The expression and control-flow engine inside docxtpl (filters, conditionals, loops)",[4297,176518,176519,176522],{},[30,176520,176521],{},"pip install jinja2"," (pulled in by docxtpl)",[4297,176524,176525,176526,176528],{},"Direct use against ",[30,176527,18051],{}," — Jinja2 only understands 
text, not the document XML wrapper",[4279,176530,176531,176533,176540,176545],{},[4297,176532,163908],{},[4297,176534,176535,176536,176539],{},"Quick ",[30,176537,176538],{},".docx → .pdf"," on a machine with Microsoft Word installed (Windows\u002FmacOS)",[4297,176541,176542],{},[30,176543,176544],{},"pip install docx2pdf",[4297,176546,176547],{},"Linux servers or any host without Word; it drives Word via COM\u002FAppleScript and will fail headless",[4279,176549,176550,176555,176558,176564],{},[4297,176551,176552,176553,12027],{},"LibreOffice (",[30,176554,164121],{},[4297,176556,176557],{},"Cross-platform, server-safe headless PDF conversion",[4297,176559,176560,176563],{},[30,176561,176562],{},"apt install libreoffice"," (system package)",[4297,176565,176566],{},"Pixel-perfect fidelity to Word's renderer; minor layout drift is possible",[4279,176568,176569,176574,176577,176582],{},[4297,176570,176571],{},[940,176572,176573],{"href":18040},"mailmerge",[4297,176575,176576],{},"Filling Word's native MERGEFIELD merge fields without rewriting the template",[4297,176578,176579],{},[30,176580,176581],{},"pip install docx-mailmerge",[4297,176583,176584],{},"Conditional logic or loops — it has no expression engine; reach for docxtpl instead",[14,176586,176587],{},"For most batch work the answer is docxtpl for rendering plus LibreOffice for PDF. python-docx joins in when you need post-render structural edits or metadata. Reserve docx2pdf for Windows desktops and mailmerge for legacy templates that already carry MERGEFIELD markers.",[18,176589,26619],{"id":26618},[14,176591,176592,176593,176595],{},"Isolate the project so a global package upgrade never silently changes your rendered output. 
Pin versions in ",[30,176594,26625],{}," so a colleague — or a scheduled job six months from now — reproduces the same rendered output.",[23,176597,176599],{"className":25,"code":176598,"language":27,"meta":28,"style":28},"# Create and activate an isolated environment\npython -m venv .venv\nsource .venv\u002Fbin\u002Factivate          # Windows: .venv\\Scripts\\activate\npip install -r requirements.txt\n\n# Headless PDF conversion needs the system package (not pip)\n# Debian\u002FUbuntu:\nsudo apt-get install -y libreoffice\n",[30,176600,176601,176605,176615,176623,176633,176637,176642,176647],{"__ignoreMap":28},[33,176602,176603],{"class":35,"line":36},[33,176604,26635],{"class":39},[33,176606,176607,176609,176611,176613],{"class":35,"line":43},[33,176608,47],{"class":46},[33,176610,51],{"class":50},[33,176612,55],{"class":54},[33,176614,58],{"class":54},[33,176616,176617,176619,176621],{"class":35,"line":61},[33,176618,64],{"class":50},[33,176620,67],{"class":54},[33,176622,70],{"class":39},[33,176624,176625,176627,176629,176631],{"class":35,"line":73},[33,176626,76],{"class":46},[33,176628,79],{"class":54},[33,176630,26709],{"class":50},[33,176632,26712],{"class":54},[33,176634,176635],{"class":35,"line":88},[33,176636,92],{"emptyLinePlaceholder":91},[33,176638,176639],{"class":35,"line":95},[33,176640,176641],{"class":39},"# Headless PDF conversion needs the system package (not pip)\n",[33,176643,176644],{"class":35,"line":101},[33,176645,176646],{"class":39},"# Debian\u002FUbuntu:\n",[33,176648,176649,176651,176653,176655,176657],{"class":35,"line":171},[33,176650,9669],{"class":46},[33,176652,9672],{"class":54},[33,176654,79],{"class":54},[33,176656,20912],{"class":50},[33,176658,26696],{"class":54},[23,176660,176663],{"className":176661,"code":176662,"language":2000,"meta":28},[1998],"# requirements.txt — pin everything for reproducible batches\npython-docx==1.1.2\ndocxtpl==0.18.0\njinja2==3.1.4\npandas==2.2.2\nopenpyxl==3.1.5          # Excel ingestion 
engine for pandas\ndocx2pdf==0.1.8          # only on Windows\u002FmacOS with Word installed\n",[30,176664,176662],{"__ignoreMap":28},[14,176666,176667],{},"Verify the toolchain before you write a loop. A quick import-and-version check now saves a failed overnight batch later.",[23,176669,176671],{"className":126,"code":176670,"language":47,"meta":28,"style":28},"# pip install python-docx docxtpl pandas\nfrom pathlib import Path\nimport docxtpl\nimport docx\n\nprint(\"docxtpl\", docxtpl.__version__)\nprint(\"python-docx\", docx.__version__)\nprint(\"cwd\", Path.cwd())\n",[30,176672,176673,176678,176688,176695,176701,176705,176721,176737],{"__ignoreMap":28},[33,176674,176675],{"class":35,"line":36},[33,176676,176677],{"class":39},"# pip install python-docx docxtpl pandas\n",[33,176679,176680,176682,176684,176686],{"class":35,"line":43},[33,176681,190],{"class":163},[33,176683,193],{"class":167},[33,176685,164],{"class":163},[33,176687,198],{"class":167},[33,176689,176690,176692],{"class":35,"line":61},[33,176691,164],{"class":163},[33,176693,176694],{"class":167}," docxtpl\n",[33,176696,176697,176699],{"class":35,"line":73},[33,176698,164],{"class":163},[33,176700,156220],{"class":167},[33,176702,176703],{"class":35,"line":88},[33,176704,92],{"emptyLinePlaceholder":91},[33,176706,176707,176709,176711,176714,176717,176719],{"class":35,"line":95},[33,176708,13474],{"class":50},[33,176710,602],{"class":167},[33,176712,176713],{"class":54},"\"docxtpl\"",[33,176715,176716],{"class":167},", docxtpl.",[33,176718,37016],{"class":50},[33,176720,221],{"class":167},[33,176722,176723,176725,176727,176730,176733,176735],{"class":35,"line":101},[33,176724,13474],{"class":50},[33,176726,602],{"class":167},[33,176728,176729],{"class":54},"\"python-docx\"",[33,176731,176732],{"class":167},", 
docx.",[33,176734,37016],{"class":50},[33,176736,221],{"class":167},[33,176738,176739,176741,176743,176746],{"class":35,"line":171},[33,176740,13474],{"class":50},[33,176742,602],{"class":167},[33,176744,176745],{"class":54},"\"cwd\"",[33,176747,344],{"class":167},[18,176749,176751],{"id":176750},"designing-the-template","Designing the template",[14,176753,176754,176755,176757,176758,176760,176761,176764,176765,46332,176768,46332,176771,176774],{},"Reliable rendering depends entirely on how the ",[30,176756,18051],{}," is authored. Word stores text as a sequence of ",[26245,176759,156822],{}," — styled spans — and it will silently split a placeholder like ",[30,176762,176763],{},"{{ client_name }}"," across several runs if you type it with autocorrect on, or edit it mid-word. docxtpl then sees ",[30,176766,176767],{},"{{ clie",[30,176769,176770],{},"nt_na",[30,176772,176773],{},"me }}"," and renders nothing.",[14,176776,176777],{},"Three rules keep templates render-safe:",[4211,176779,176780,176789,176803],{},[4214,176781,176782,176785,176786,176788],{},[1974,176783,176784],{},"Type placeholders in one pass."," Open the template, type the full ",[30,176787,20399],{}," without backspacing or letting autocorrect touch it. If a placeholder fails to render, select it, delete it, and retype it cleanly to collapse the runs.",[4214,176790,176791,176794,176795,176798,176799,176802],{},[1974,176792,176793],{},"Match names to your data exactly."," A ",[30,176796,176797],{},"{{ invoice_total }}"," placeholder needs an ",[30,176800,176801],{},"invoice_total"," key in the context. Align placeholder names to your CSV\u002FExcel column headers or JSON keys up front — see the ingestion section for normalizing messy headers.",[4214,176804,176805,176808,176809,176812,176813,176815,176816,176819],{},[1974,176806,176807],{},"Keep static content static."," Headers, footers, logos, and boilerplate clauses stay as plain Word content. 
Only the cells and paragraphs that vary become placeholders. For table rows that grow with the data, use docxtpl's row loop: ",[30,176810,176811],{},"{%tr for item in items %}"," … ",[30,176814,173951],{},", which clones the ",[30,176817,176818],{},"\u003Cw:tr>"," XML node per item and preserves borders and the header row.",[18,176821,176823],{"id":176822},"ingestion-loading-templates-and-data-sources","Ingestion: loading templates and data sources",[14,176825,176826,176827,176829,176830,176832],{},"The data side of the pipeline is just structured rows. Whether the source is CSV, Excel, or JSON, normalize it to a list of dictionaries — one dict per document — so the render loop stays format-agnostic. ",[940,176828,9630],{"href":9598}," handles all three with a uniform interface; the data-cleaning patterns in ",[940,176831,9599],{"href":9598}," apply directly here, since a malformed source row becomes a malformed document.",[23,176834,176836],{"className":126,"code":176835,"language":47,"meta":28,"style":28},"# pip install pandas openpyxl\nfrom pathlib import Path\nimport json\nimport pandas as pd\n\ndef load_records(source: Path) -> list[dict]:\n    \"\"\"Load CSV, Excel, or JSON into a uniform list of context dicts.\"\"\"\n    suffix = source.suffix.lower()\n    try:\n        if suffix == \".csv\":\n            df = pd.read_csv(source, dtype=str, keep_default_na=False)\n        elif suffix in {\".xlsx\", \".xls\"}:\n            df = pd.read_excel(source, dtype=str, engine=\"openpyxl\")\n        elif suffix == \".json\":\n            return json.loads(source.read_text(encoding=\"utf-8\"))\n        else:\n            raise ValueError(f\"Unsupported source type: {suffix}\")\n    except FileNotFoundError:\n        raise SystemExit(f\"Data source not found: {source}\")\n\n    # Normalize headers -> snake_case keys matching the template placeholders\n    df.columns = [c.strip().lower().replace(\" \", \"_\") for c in df.columns]\n    return 
df.to_dict(orient=\"records\")\n\n\nif __name__ == \"__main__\":\n    records = load_records(Path(\"data\") \u002F \"clients.csv\")\n    print(f\"Loaded {len(records)} records; first keys: {list(records[0])}\")\n",[30,176837,176838,176842,176852,176858,176868,176872,176886,176891,176901,176907,176920,176946,176964,176989,177002,177017,177023,177047,177055,177078,177082,177087,177111,177125,177129,177133,177145,177166],{"__ignoreMap":28},[33,176839,176840],{"class":35,"line":36},[33,176841,3952],{"class":39},[33,176843,176844,176846,176848,176850],{"class":35,"line":43},[33,176845,190],{"class":163},[33,176847,193],{"class":167},[33,176849,164],{"class":163},[33,176851,198],{"class":167},[33,176853,176854,176856],{"class":35,"line":61},[33,176855,164],{"class":163},[33,176857,3081],{"class":167},[33,176859,176860,176862,176864,176866],{"class":35,"line":73},[33,176861,164],{"class":163},[33,176863,492],{"class":167},[33,176865,495],{"class":163},[33,176867,498],{"class":167},[33,176869,176870],{"class":35,"line":88},[33,176871,92],{"emptyLinePlaceholder":91},[33,176873,176874,176876,176879,176882,176884],{"class":35,"line":95},[33,176875,562],{"class":163},[33,176877,176878],{"class":46}," load_records",[33,176880,176881],{"class":167},"(source: Path) -> list[",[33,176883,37100],{"class":50},[33,176885,17477],{"class":167},[33,176887,176888],{"class":35,"line":101},[33,176889,176890],{"class":54},"    \"\"\"Load CSV, Excel, or JSON into a uniform list of context dicts.\"\"\"\n",[33,176892,176893,176896,176898],{"class":35,"line":171},[33,176894,176895],{"class":167},"    suffix ",[33,176897,242],{"class":163},[33,176899,176900],{"class":167}," source.suffix.lower()\n",[33,176902,176903,176905],{"class":35,"line":179},[33,176904,2424],{"class":163},[33,176906,574],{"class":167},[33,176908,176909,176911,176914,176916,176918],{"class":35,"line":187},[33,176910,8221],{"class":163},[33,176912,176913],{"class":167}," suffix 
",[33,176915,1865],{"class":163},[33,176917,136594],{"class":54},[33,176919,574],{"class":167},[33,176921,176922,176924,176926,176929,176931,176933,176935,176937,176940,176942,176944],{"class":35,"line":201},[33,176923,51528],{"class":167},[33,176925,242],{"class":163},[33,176927,176928],{"class":167}," pd.read_csv(source, ",[33,176930,23262],{"class":238},[33,176932,242],{"class":163},[33,176934,1053],{"class":50},[33,176936,365],{"class":167},[33,176938,176939],{"class":238},"keep_default_na",[33,176941,242],{"class":163},[33,176943,902],{"class":50},[33,176945,221],{"class":167},[33,176947,176948,176950,176952,176954,176956,176958,176960,176962],{"class":35,"line":206},[33,176949,17895],{"class":163},[33,176951,176913],{"class":167},[33,176953,662],{"class":163},[33,176955,4098],{"class":167},[33,176957,27374],{"class":54},[33,176959,365],{"class":167},[33,176961,175892],{"class":54},[33,176963,27382],{"class":167},[33,176965,176966,176968,176970,176973,176975,176977,176979,176981,176983,176985,176987],{"class":35,"line":224},[33,176967,51528],{"class":167},[33,176969,242],{"class":163},[33,176971,176972],{"class":167}," pd.read_excel(source, ",[33,176974,23262],{"class":238},[33,176976,242],{"class":163},[33,176978,1053],{"class":50},[33,176980,365],{"class":167},[33,176982,17351],{"class":238},[33,176984,242],{"class":163},[33,176986,17356],{"class":54},[33,176988,221],{"class":167},[33,176990,176991,176993,176995,176997,177000],{"class":35,"line":229},[33,176992,17895],{"class":163},[33,176994,176913],{"class":167},[33,176996,1865],{"class":163},[33,176998,176999],{"class":54}," \".json\"",[33,177001,574],{"class":167},[33,177003,177004,177006,177009,177011,177013,177015],{"class":35,"line":235},[33,177005,28782],{"class":163},[33,177007,177008],{"class":167}," 
json.loads(source.read_text(",[33,177010,27249],{"class":238},[33,177012,242],{"class":163},[33,177014,1195],{"class":54},[33,177016,371],{"class":167},[33,177018,177019,177021],{"class":35,"line":250},[33,177020,41290],{"class":163},[33,177022,574],{"class":167},[33,177024,177025,177027,177029,177031,177033,177036,177038,177041,177043,177045],{"class":35,"line":266},[33,177026,59715],{"class":163},[33,177028,4054],{"class":50},[33,177030,602],{"class":167},[33,177032,4059],{"class":163},[33,177034,177035],{"class":54},"\"Unsupported source type: ",[33,177037,1115],{"class":50},[33,177039,177040],{"class":167},"suffix",[33,177042,1121],{"class":50},[33,177044,274],{"class":54},[33,177046,221],{"class":167},[33,177048,177049,177051,177053],{"class":35,"line":290},[33,177050,2449],{"class":163},[33,177052,2945],{"class":50},[33,177054,574],{"class":167},[33,177056,177057,177059,177061,177063,177065,177068,177070,177072,177074,177076],{"class":35,"line":295},[33,177058,4051],{"class":163},[33,177060,16617],{"class":50},[33,177062,602],{"class":167},[33,177064,4059],{"class":163},[33,177066,177067],{"class":54},"\"Data source not found: ",[33,177069,1115],{"class":50},[33,177071,64],{"class":167},[33,177073,1121],{"class":50},[33,177075,274],{"class":54},[33,177077,221],{"class":167},[33,177079,177080],{"class":35,"line":300},[33,177081,92],{"emptyLinePlaceholder":91},[33,177083,177084],{"class":35,"line":317},[33,177085,177086],{"class":39},"    # Normalize headers -> snake_case keys matching the template 
placeholders\n",[33,177088,177089,177091,177093,177095,177097,177099,177101,177103,177105,177107,177109],{"class":35,"line":332},[33,177090,27546],{"class":167},[33,177092,242],{"class":163},[33,177094,27551],{"class":167},[33,177096,17294],{"class":54},[33,177098,365],{"class":167},[33,177100,7764],{"class":54},[33,177102,1649],{"class":167},[33,177104,6124],{"class":163},[33,177106,7486],{"class":167},[33,177108,662],{"class":163},[33,177110,12624],{"class":167},[33,177112,177113,177115,177117,177119,177121,177123],{"class":35,"line":347},[33,177114,1332],{"class":163},[33,177116,54131],{"class":167},[33,177118,22169],{"class":238},[33,177120,242],{"class":163},[33,177122,21222],{"class":54},[33,177124,221],{"class":167},[33,177126,177127],{"class":35,"line":374},[33,177128,92],{"emptyLinePlaceholder":91},[33,177130,177131],{"class":35,"line":397},[33,177132,92],{"emptyLinePlaceholder":91},[33,177134,177135,177137,177139,177141,177143],{"class":35,"line":653},[33,177136,2491],{"class":163},[33,177138,2494],{"class":50},[33,177140,2497],{"class":163},[33,177142,2500],{"class":54},[33,177144,574],{"class":167},[33,177146,177147,177150,177152,177155,177157,177159,177161,177164],{"class":35,"line":667},[33,177148,177149],{"class":167},"    records ",[33,177151,242],{"class":163},[33,177153,177154],{"class":167}," load_records(Path(",[33,177156,95970],{"class":54},[33,177158,1649],{"class":167},[33,177160,1351],{"class":163},[33,177162,177163],{"class":54}," \"clients.csv\"",[33,177165,221],{"class":167},[33,177167,177168,177170,177172,177174,177176,177178,177181,177183,177186,177188,177191,177193,177195,177197,177199],{"class":35,"line":675},[33,177169,7268],{"class":50},[33,177171,602],{"class":167},[33,177173,4059],{"class":163},[33,177175,96187],{"class":54},[33,177177,4065],{"class":50},[33,177179,177180],{"class":167},"(records)",[33,177182,1121],{"class":50},[33,177184,177185],{"class":54}," records; first keys: 
",[33,177187,16875],{"class":50},[33,177189,177190],{"class":167},"(records[",[33,177192,748],{"class":50},[33,177194,18798],{"class":167},[33,177196,1121],{"class":50},[33,177198,274],{"class":54},[33,177200,221],{"class":167},[14,177202,177203,177204,10065,177207,177209,177210,177212,177213,177216],{},"Reading with ",[30,177205,177206],{},"dtype=str",[30,177208,146892],{}," is deliberate: it stops pandas from turning an empty cell into the float ",[30,177211,8884],{},", which would render as the literal text ",[30,177214,177215],{},"nan"," in your document. Cast specific numeric or date fields explicitly in the transformation step instead.",[14,177218,177219,177220,365,177223,177226,177227,10065,177229,177232,177233,10065,177236,36661,177238,177240,177241,177243],{},"The header-normalization line is the seam between data and template. Source spreadsheets arrive with headers like ",[30,177221,177222],{},"Invoice Total",[30,177224,177225],{},"Client Name",", or trailing whitespace from a careless export, while your placeholders read ",[30,177228,176801],{},[30,177230,177231],{},"client_name",". Lowercasing, stripping, and replacing spaces with underscores collapses that variability into one predictable key shape, so the same template renders against a CSV from accounting and an Excel export from a different team without per-source special-casing. If a source uses wildly different header text, add an explicit rename map rather than relying on the placeholder names to drift toward the data — the template is the contract, and the ingestion layer adapts to it. 
For Excel sources with multiple sheets or a header row that is not the first row, pass ",[30,177234,177235],{},"sheet_name=",[30,177237,136639],{},[30,177239,57240],{},"; the broader reading patterns in ",[940,177242,99577],{"href":99576}," cover the messier real-world layouts.",[18,177245,177247],{"id":177246},"transformation-binding-context-and-styles","Transformation: binding context and styles",[14,177249,177250,177251,177254,177255,177258,177259,177261,177262,177265,177266,3035],{},"Raw cells rarely render cleanly. A currency column arrives as ",[30,177252,177253],{},"1234.5"," and needs to read ",[30,177256,177257],{},"$1,234.50","; a date arrives as ",[30,177260,6978],{}," and should read ",[30,177263,177264],{},"June 18, 2026",". Do this shaping in Python before binding, not inside the template, so formatting logic lives in version control rather than buried in a ",[30,177267,18051],{},[23,177269,177271],{"className":126,"code":177270,"language":47,"meta":28,"style":28},"# pip install docxtpl\nfrom datetime import datetime\nfrom pathlib import Path\nfrom docxtpl import DocxTemplate\n\ndef build_context(row: dict) -> dict:\n    \"\"\"Coerce raw strings into display-ready values for the template.\"\"\"\n    ctx = dict(row)  # copy so the source record is untouched\n    if row.get(\"invoice_total\"):\n        ctx[\"invoice_total\"] = f\"${float(row['invoice_total']):,.2f}\"\n    if row.get(\"issued_on\"):\n        ctx[\"issued_on\"] = datetime.strptime(\n            row[\"issued_on\"], \"%Y-%m-%d\"\n        ).strftime(\"%B %d, %Y\")\n    # docxtpl renders a missing key as empty unless you guard it\n    ctx.setdefault(\"notes\", \"\")\n    return ctx\n\n\ndef render_one(template: Path, row: dict, out_path: Path) -> None:\n    tpl = DocxTemplate(template)            # fresh instance per document\n    tpl.render(build_context(row))          # bind and substitute\n    out_path.parent.mkdir(parents=True, exist_ok=True)\n    
tpl.save(out_path)\n",[30,177272,177273,177277,177287,177297,177307,177311,177328,177333,177347,177359,177388,177399,177412,177426,177439,177444,177456,177462,177466,177470,177487,177499,177507,177527],{"__ignoreMap":28},[33,177274,177275],{"class":35,"line":36},[33,177276,170207],{"class":39},[33,177278,177279,177281,177283,177285],{"class":35,"line":43},[33,177280,190],{"class":163},[33,177282,3881],{"class":167},[33,177284,164],{"class":163},[33,177286,96864],{"class":167},[33,177288,177289,177291,177293,177295],{"class":35,"line":61},[33,177290,190],{"class":163},[33,177292,193],{"class":167},[33,177294,164],{"class":163},[33,177296,198],{"class":167},[33,177298,177299,177301,177303,177305],{"class":35,"line":73},[33,177300,190],{"class":163},[33,177302,20437],{"class":167},[33,177304,164],{"class":163},[33,177306,20442],{"class":167},[33,177308,177309],{"class":35,"line":88},[33,177310,92],{"emptyLinePlaceholder":91},[33,177312,177313,177315,177317,177320,177322,177324,177326],{"class":35,"line":95},[33,177314,562],{"class":163},[33,177316,170682],{"class":46},[33,177318,177319],{"class":167},"(row: ",[33,177321,37100],{"class":50},[33,177323,1617],{"class":167},[33,177325,37100],{"class":50},[33,177327,574],{"class":167},[33,177329,177330],{"class":35,"line":101},[33,177331,177332],{"class":54},"    \"\"\"Coerce raw strings into display-ready values for the template.\"\"\"\n",[33,177334,177335,177337,177339,177341,177344],{"class":35,"line":171},[33,177336,170715],{"class":167},[33,177338,242],{"class":163},[33,177340,85015],{"class":50},[33,177342,177343],{"class":167},"(row)  ",[33,177345,177346],{"class":39},"# copy so the source record is untouched\n",[33,177348,177349,177351,177354,177357],{"class":35,"line":179},[33,177350,617],{"class":163},[33,177352,177353],{"class":167}," 
row.get(",[33,177355,177356],{"class":54},"\"invoice_total\"",[33,177358,1737],{"class":167},[33,177360,177361,177363,177365,177367,177369,177371,177373,177375,177377,177380,177382,177384,177386],{"class":35,"line":187},[33,177362,172645],{"class":167},[33,177364,177356],{"class":54},[33,177366,763],{"class":167},[33,177368,242],{"class":163},[33,177370,1110],{"class":163},[33,177372,18820],{"class":54},[33,177374,88861],{"class":50},[33,177376,18769],{"class":167},[33,177378,177379],{"class":54},"'invoice_total'",[33,177381,18798],{"class":167},[33,177383,28440],{"class":163},[33,177385,1121],{"class":50},[33,177387,7504],{"class":54},[33,177389,177390,177392,177394,177397],{"class":35,"line":201},[33,177391,617],{"class":163},[33,177393,177353],{"class":167},[33,177395,177396],{"class":54},"\"issued_on\"",[33,177398,1737],{"class":167},[33,177400,177401,177403,177405,177407,177409],{"class":35,"line":206},[33,177402,172645],{"class":167},[33,177404,177396],{"class":54},[33,177406,763],{"class":167},[33,177408,242],{"class":163},[33,177410,177411],{"class":167}," datetime.strptime(\n",[33,177413,177414,177416,177418,177420,177422,177424],{"class":35,"line":224},[33,177415,19624],{"class":167},[33,177417,177396],{"class":54},[33,177419,8314],{"class":167},[33,177421,1244],{"class":54},[33,177423,916],{"class":50},[33,177425,7504],{"class":54},[33,177427,177428,177431,177433,177435,177437],{"class":35,"line":229},[33,177429,177430],{"class":167},"        ).strftime(",[33,177432,172663],{"class":54},[33,177434,916],{"class":50},[33,177436,172668],{"class":54},[33,177438,221],{"class":167},[33,177440,177441],{"class":35,"line":235},[33,177442,177443],{"class":39},"    # docxtpl renders a missing key as empty unless you guard 
it\n",[33,177445,177446,177448,177450,177452,177454],{"class":35,"line":250},[33,177447,170758],{"class":167},[33,177449,131398],{"class":54},[33,177451,365],{"class":167},[33,177453,3198],{"class":54},[33,177455,221],{"class":167},[33,177457,177458,177460],{"class":35,"line":266},[33,177459,1332],{"class":163},[33,177461,170813],{"class":167},[33,177463,177464],{"class":35,"line":290},[33,177465,92],{"emptyLinePlaceholder":91},[33,177467,177468],{"class":35,"line":295},[33,177469,92],{"emptyLinePlaceholder":91},[33,177471,177472,177474,177476,177479,177481,177483,177485],{"class":35,"line":300},[33,177473,562],{"class":163},[33,177475,174465],{"class":46},[33,177477,177478],{"class":167},"(template: Path, row: ",[33,177480,37100],{"class":50},[33,177482,66956],{"class":167},[33,177484,571],{"class":50},[33,177486,574],{"class":167},[33,177488,177489,177491,177493,177496],{"class":35,"line":317},[33,177490,20597],{"class":167},[33,177492,242],{"class":163},[33,177494,177495],{"class":167}," DocxTemplate(template)            ",[33,177497,177498],{"class":39},"# fresh instance per document\n",[33,177500,177501,177504],{"class":35,"line":332},[33,177502,177503],{"class":167},"    tpl.render(build_context(row))          ",[33,177505,177506],{"class":39},"# bind and substitute\n",[33,177508,177509,177511,177513,177515,177517,177519,177521,177523,177525],{"class":35,"line":347},[33,177510,64564],{"class":167},[33,177512,869],{"class":238},[33,177514,242],{"class":163},[33,177516,855],{"class":50},[33,177518,365],{"class":167},[33,177520,878],{"class":238},[33,177522,242],{"class":163},[33,177524,855],{"class":50},[33,177526,221],{"class":167},[33,177528,177529],{"class":35,"line":374},[33,177530,177531],{"class":167},"    tpl.save(out_path)\n",[14,177533,177534,177535,177537,177538,177540,177541,177544,177545,365,177547,177549,177550,177552],{},"Re-instantiating ",[30,177536,175164],{}," inside the loop is not optional. 
A ",[30,177539,175164],{}," object mutates in place on ",[30,177542,177543],{},"render()",", so reusing one instance across rows leaks the previous document's content into the next — a classic source of every output file containing the first record's data. Styling is inherited from the template itself: define ",[30,177546,163236],{},[30,177548,99685],{},", and any custom table styles in Word, reference them by name, and avoid inline formatting inside loops. For finer control over fonts and runs after rendering, ",[940,177551,18041],{"href":26562}," can reopen the saved file and adjust styles programmatically.",[18,177554,177556],{"id":177555},"consolidation-the-batch-loop","Consolidation: the batch loop",[14,177558,177559,177560,177563],{},"The batch loop ties ingestion, transformation, and rendering together. Two concerns dominate at scale: deterministic naming and resilience. Filenames must be unique and idempotent so a re-run overwrites cleanly rather than producing ",[30,177561,177562],{},"doc_1 (2).docx"," collisions, and a single bad row must not abort the whole run.",[23,177565,177567],{"className":126,"code":177566,"language":47,"meta":28,"style":28},"# pip install docxtpl pandas\nimport logging\nimport re\nfrom pathlib import Path\nfrom docxtpl import DocxTemplate\n\nlogging.basicConfig(\n    level=logging.INFO,\n    format=\"%(asctime)s | %(levelname)s | %(message)s\",\n)\n\ndef safe_name(value: str) -> str:\n    \"\"\"Filesystem-safe slug for deterministic, dedup-friendly filenames.\"\"\"\n    slug = re.sub(r\"[^A-Za-z0-9_-]+\", \"_\", value.strip())\n    return slug.strip(\"_\") or \"unnamed\"\n\n\ndef process_batch(template: Path, records: list[dict], out_dir: Path) -> dict:\n    out_dir.mkdir(parents=True, exist_ok=True)\n    seen: set[str] = set()\n    ok, failed = 0, 0\n\n    for idx, row in enumerate(records):\n        # Tie the filename to a stable business key, not the loop index\n        key = safe_name(row.get(\"invoice_id\") or 
f\"row_{idx}\")\n        # Dedup: if the key repeats, disambiguate instead of overwriting\n        name = key\n        n = 1\n        while name in seen:\n            n += 1\n            name = f\"{key}_{n}\"\n        seen.add(name)\n\n        out_path = out_dir \u002F f\"{name}.docx\"\n        try:\n            tpl = DocxTemplate(template)\n            tpl.render(build_context(row))   # from the transformation step\n            tpl.save(out_path)\n            logging.info(\"rendered %s\", out_path.name)\n            ok += 1\n        except Exception as exc:             # isolate the failing row\n            logging.error(\"row %s (%s) failed: %s\", idx, key, exc)\n            failed += 1\n\n    logging.info(\"done: %s ok, %s failed\", ok, failed)\n    return {\"ok\": ok, \"failed\": failed}\n",[30,177568,177569,177573,177579,177585,177595,177605,177609,177613,177625,177647,177651,177655,177672,177677,177709,177725,177729,177733,177752,177772,177787,177799,177803,177816,177821,177852,177857,177867,177875,177887,177895,177922,177927,177931,177953,177959,177967,177975,177980,177993,178001,178015,178038,178046,178050,178068],{"__ignoreMap":28},[33,177570,177571],{"class":35,"line":36},[33,177572,170422],{"class":39},[33,177574,177575,177577],{"class":35,"line":43},[33,177576,164],{"class":163},[33,177578,184],{"class":167},[33,177580,177581,177583],{"class":35,"line":61},[33,177582,164],{"class":163},[33,177584,11917],{"class":167},[33,177586,177587,177589,177591,177593],{"class":35,"line":73},[33,177588,190],{"class":163},[33,177590,193],{"class":167},[33,177592,164],{"class":163},[33,177594,198],{"class":167},[33,177596,177597,177599,177601,177603],{"class":35,"line":88},[33,177598,190],{"class":163},[33,177600,20437],{"class":167},[33,177602,164],{"class":163},[33,177604,20442],{"class":167},[33,177606,177607],{"class":35,"line":95},[33,177608,92],{"emptyLinePlaceholder":91},[33,177610,177611],{"class":35,"line":101},[33,177612,232],{"class":167},[33,177614,177615,1
77617,177619,177621,177623],{"class":35,"line":171},[33,177616,253],{"class":238},[33,177618,242],{"class":163},[33,177620,258],{"class":167},[33,177622,1067],{"class":50},[33,177624,247],{"class":167},[33,177626,177627,177629,177631,177633,177635,177637,177639,177641,177643,177645],{"class":35,"line":179},[33,177628,269],{"class":238},[33,177630,242],{"class":163},[33,177632,274],{"class":54},[33,177634,277],{"class":50},[33,177636,26814],{"class":54},[33,177638,26817],{"class":50},[33,177640,26814],{"class":54},[33,177642,26827],{"class":50},[33,177644,274],{"class":54},[33,177646,247],{"class":167},[33,177648,177649],{"class":35,"line":187},[33,177650,221],{"class":167},[33,177652,177653],{"class":35,"line":201},[33,177654,92],{"emptyLinePlaceholder":91},[33,177656,177657,177659,177661,177664,177666,177668,177670],{"class":35,"line":206},[33,177658,562],{"class":163},[33,177660,22737],{"class":46},[33,177662,177663],{"class":167},"(value: ",[33,177665,1053],{"class":50},[33,177667,1617],{"class":167},[33,177669,1053],{"class":50},[33,177671,574],{"class":167},[33,177673,177674],{"class":35,"line":224},[33,177675,177676],{"class":54},"    \"\"\"Filesystem-safe slug for deterministic, dedup-friendly filenames.\"\"\"\n",[33,177678,177679,177682,177684,177687,177689,177691,177693,177695,177698,177700,177702,177704,177706],{"class":35,"line":229},[33,177680,177681],{"class":167},"    slug ",[33,177683,242],{"class":163},[33,177685,177686],{"class":167}," re.sub(",[33,177688,11977],{"class":163},[33,177690,274],{"class":54},[33,177692,8309],{"class":50},[33,177694,113780],{"class":163},[33,177696,177697],{"class":50},"A-Za-z0-9_-]",[33,177699,1811],{"class":163},[33,177701,274],{"class":54},[33,177703,365],{"class":167},[33,177705,7764],{"class":54},[33,177707,177708],{"class":167},", value.strip())\n",[33,177710,177711,177713,177716,177718,177720,177722],{"class":35,"line":235},[33,177712,1332],{"class":163},[33,177714,177715],{"class":167}," 
slug.strip(",[33,177717,7764],{"class":54},[33,177719,1649],{"class":167},[33,177721,7162],{"class":163},[33,177723,177724],{"class":54}," \"unnamed\"\n",[33,177726,177727],{"class":35,"line":250},[33,177728,92],{"emptyLinePlaceholder":91},[33,177730,177731],{"class":35,"line":266},[33,177732,92],{"emptyLinePlaceholder":91},[33,177734,177735,177737,177740,177743,177745,177748,177750],{"class":35,"line":290},[33,177736,562],{"class":163},[33,177738,177739],{"class":46}," process_batch",[33,177741,177742],{"class":167},"(template: Path, records: list[",[33,177744,37100],{"class":50},[33,177746,177747],{"class":167},"], out_dir: Path) -> ",[33,177749,37100],{"class":50},[33,177751,574],{"class":167},[33,177753,177754,177756,177758,177760,177762,177764,177766,177768,177770],{"class":35,"line":295},[33,177755,28258],{"class":167},[33,177757,869],{"class":238},[33,177759,242],{"class":163},[33,177761,855],{"class":50},[33,177763,365],{"class":167},[33,177765,878],{"class":238},[33,177767,242],{"class":163},[33,177769,855],{"class":50},[33,177771,221],{"class":167},[33,177773,177774,177777,177779,177781,177783,177785],{"class":35,"line":300},[33,177775,177776],{"class":167},"    seen: set[",[33,177778,1053],{"class":50},[33,177780,763],{"class":167},[33,177782,242],{"class":163},[33,177784,4129],{"class":50},[33,177786,134865],{"class":167},[33,177788,177789,177791,177793,177795,177797],{"class":35,"line":317},[33,177790,87507],{"class":167},[33,177792,242],{"class":163},[33,177794,10791],{"class":50},[33,177796,365],{"class":167},[33,177798,87516],{"class":50},[33,177800,177801],{"class":35,"line":332},[33,177802,92],{"emptyLinePlaceholder":91},[33,177804,177805,177807,177809,177811,177813],{"class":35,"line":347},[33,177806,656],{"class":163},[33,177808,173582],{"class":167},[33,177810,662],{"class":163},[33,177812,7403],{"class":50},[33,177814,177815],{"class":167},"(records):\n",[33,177817,177818],{"class":35,"line":374},[33,177819,177820],{"class":39},"        # Tie 
the filename to a stable business key, not the loop index\n",[33,177822,177823,177826,177828,177831,177833,177835,177837,177839,177842,177844,177846,177848,177850],{"class":35,"line":397},[33,177824,177825],{"class":167},"        key ",[33,177827,242],{"class":163},[33,177829,177830],{"class":167}," safe_name(row.get(",[33,177832,27353],{"class":54},[33,177834,1649],{"class":167},[33,177836,7162],{"class":163},[33,177838,1110],{"class":163},[33,177840,177841],{"class":54},"\"row_",[33,177843,1115],{"class":50},[33,177845,72912],{"class":167},[33,177847,1121],{"class":50},[33,177849,274],{"class":54},[33,177851,221],{"class":167},[33,177853,177854],{"class":35,"line":653},[33,177855,177856],{"class":39},"        # Dedup: if the key repeats, disambiguate instead of overwriting\n",[33,177858,177859,177862,177864],{"class":35,"line":667},[33,177860,177861],{"class":167},"        name ",[33,177863,242],{"class":163},[33,177865,177866],{"class":167}," key\n",[33,177868,177869,177871,177873],{"class":35,"line":675},[33,177870,104230],{"class":167},[33,177872,242],{"class":163},[33,177874,17709],{"class":50},[33,177876,177877,177879,177882,177884],{"class":35,"line":689},[33,177878,6838],{"class":163},[33,177880,177881],{"class":167}," name ",[33,177883,662],{"class":163},[33,177885,177886],{"class":167}," seen:\n",[33,177888,177889,177891,177893],{"class":35,"line":703},[33,177890,75154],{"class":167},[33,177892,28976],{"class":163},[33,177894,17709],{"class":50},[33,177896,177897,177900,177902,177904,177906,177908,177910,177912,177914,177916,177918,177920],{"class":35,"line":714},[33,177898,177899],{"class":167},"            name 
",[33,177901,242],{"class":163},[33,177903,1110],{"class":163},[33,177905,274],{"class":54},[33,177907,1115],{"class":50},[33,177909,44114],{"class":167},[33,177911,1121],{"class":50},[33,177913,78824],{"class":54},[33,177915,1115],{"class":50},[33,177917,22354],{"class":167},[33,177919,1121],{"class":50},[33,177921,7504],{"class":54},[33,177923,177924],{"class":35,"line":723},[33,177925,177926],{"class":167},"        seen.add(name)\n",[33,177928,177929],{"class":35,"line":754},[33,177930,92],{"emptyLinePlaceholder":91},[33,177932,177933,177935,177937,177939,177941,177943,177945,177947,177949,177951],{"class":35,"line":771},[33,177934,79122],{"class":167},[33,177936,242],{"class":163},[33,177938,40669],{"class":167},[33,177940,1351],{"class":163},[33,177942,1110],{"class":163},[33,177944,274],{"class":54},[33,177946,1115],{"class":50},[33,177948,1118],{"class":167},[33,177950,1121],{"class":50},[33,177952,18215],{"class":54},[33,177954,177955,177957],{"class":35,"line":777},[33,177956,670],{"class":163},[33,177958,574],{"class":167},[33,177960,177961,177963,177965],{"class":35,"line":788},[33,177962,176082],{"class":167},[33,177964,242],{"class":163},[33,177966,20602],{"class":167},[33,177968,177969,177972],{"class":35,"line":804},[33,177970,177971],{"class":167},"            tpl.render(build_context(row))   ",[33,177973,177974],{"class":39},"# from the transformation step\n",[33,177976,177977],{"class":35,"line":809},[33,177978,177979],{"class":167},"            tpl.save(out_path)\n",[33,177981,177982,177984,177987,177989,177991],{"class":35,"line":819},[33,177983,134649],{"class":167},[33,177985,177986],{"class":54},"\"rendered 
",[33,177988,309],{"class":50},[33,177990,274],{"class":54},[33,177992,176125],{"class":167},[33,177994,177995,177997,177999],{"class":35,"line":829},[33,177996,87640],{"class":167},[33,177998,28976],{"class":163},[33,178000,17709],{"class":50},[33,178002,178003,178005,178007,178009,178012],{"class":35,"line":834},[33,178004,780],{"class":163},[33,178006,783],{"class":50},[33,178008,1852],{"class":163},[33,178010,178011],{"class":167}," exc:             ",[33,178013,178014],{"class":39},"# isolate the failing row\n",[33,178016,178017,178019,178022,178024,178026,178028,178031,178033,178035],{"class":35,"line":839},[33,178018,176140],{"class":167},[33,178020,178021],{"class":54},"\"row ",[33,178023,309],{"class":50},[33,178025,17583],{"class":54},[33,178027,309],{"class":50},[33,178029,178030],{"class":54},") failed: ",[33,178032,309],{"class":50},[33,178034,274],{"class":54},[33,178036,178037],{"class":167},", idx, key, exc)\n",[33,178039,178040,178042,178044],{"class":35,"line":860},[33,178041,87680],{"class":167},[33,178043,28976],{"class":163},[33,178045,17709],{"class":50},[33,178047,178048],{"class":35,"line":887},[33,178049,92],{"emptyLinePlaceholder":91},[33,178051,178052,178054,178057,178059,178061,178063,178065],{"class":35,"line":907},[33,178053,173542],{"class":167},[33,178055,178056],{"class":54},"\"done: ",[33,178058,309],{"class":50},[33,178060,29010],{"class":54},[33,178062,309],{"class":50},[33,178064,29015],{"class":54},[33,178066,178067],{"class":167},", ok, failed)\n",[33,178069,178070,178072,178074,178076,178079,178081],{"class":35,"line":1826},[33,178071,1332],{"class":163},[33,178073,4098],{"class":167},[33,178075,57024],{"class":54},[33,178077,178078],{"class":167},": ok, ",[33,178080,57029],{"class":54},[33,178082,178083],{"class":167},": failed}\n",[14,178085,178086,178087,365,178090,178093,178094,178097,178098,178101,178102,365,178105,178108],{},"Naming deserves more thought than it usually gets. 
Tying the filename to the loop index (",[30,178088,178089],{},"doc_0.docx",[30,178091,178092],{},"doc_1.docx",") is fragile: re-run the batch after the source gains a row and every file shifts by one, breaking any downstream reference. Tie it to a stable business key instead — an invoice ID, a client code, a contract number — so the same record always produces the same filename across runs. The ",[30,178095,178096],{},"safe_name"," helper strips out anything a filesystem or downstream system (SharePoint, a network share, a CI artifact store) would choke on, and the ",[30,178099,178100],{},"seen"," set guarantees that two records sharing a key disambiguate rather than silently overwrite each other. That combination makes the batch both idempotent and dedup-safe: a clean re-run replaces outputs in place, while genuine duplicates surface as ",[30,178103,178104],{},"_2",[30,178106,178107],{},"_3"," suffixes you can investigate.",[14,178110,178111,178112,178114],{},"Resilience is the other half. Wrapping each render in its own ",[30,178113,29157],{}," isolates a single malformed row — a bad date, a missing required field — so it logs and the batch continues, rather than one corrupt record aborting 9,999 good documents at 3 a.m. Collect the failed keys and re-run just those once the source is fixed; because the job is idempotent, re-processing the full batch would also work and produce identical output.",[14,178116,178117,178118,178121,178122,178124],{},"For very large inputs, stream rows instead of materializing the whole table. pandas ",[30,178119,178120],{},"read_csv(..., chunksize=1000)"," yields DataFrame chunks you can iterate without loading the full file into RAM, which keeps memory flat regardless of batch size. 
Wrap every file write in the implicit context management that ",[30,178123,175249],{}," already provides, and never hold more than one rendered document in memory at a time.",[18,178126,178128],{"id":178127},"output-and-serialization-converting-to-pdf","Output and serialization: converting to PDF",[14,178130,178131,178132,178134,178135,3035],{},"Distribution almost always means PDF — it is immutable, renders identically everywhere, and cannot be accidentally edited. The conversion strategy depends on the host. On a Linux server or CI runner, drive LibreOffice headless; on a Windows or macOS desktop with Word installed, docx2pdf is simpler. The same docx-to-PDF tradeoffs are covered in depth in ",[940,178133,161278],{"href":161277},", and the resulting PDFs slot directly into the report flows in ",[940,178136,26191],{"href":19001},[23,178138,178140],{"className":126,"code":178139,"language":47,"meta":28,"style":28},"# pip install docx2pdf   (Windows\u002FmacOS only)\nimport shutil\nimport subprocess\nfrom pathlib import Path\n\ndef docx_to_pdf(docx_path: Path, out_dir: Path) -> Path:\n    \"\"\"Convert via LibreOffice headless on Linux\u002Fservers, docx2pdf elsewhere.\"\"\"\n    out_dir.mkdir(parents=True, exist_ok=True)\n    soffice = shutil.which(\"soffice\") or shutil.which(\"libreoffice\")\n\n    if soffice:  # cross-platform, server-safe path\n        subprocess.run(\n            [soffice, \"--headless\", \"--convert-to\", \"pdf\",\n             \"--outdir\", str(out_dir), str(docx_path)],\n            check=True, capture_output=True, timeout=120,\n        )\n    else:        # desktop fallback (requires Microsoft Word)\n        from docx2pdf import convert\n        convert(str(docx_path), str(out_dir \u002F f\"{docx_path.stem}.pdf\"))\n\n    pdf_path = out_dir \u002F f\"{docx_path.stem}.pdf\"\n    if not pdf_path.exists() or pdf_path.stat().st_size == 0:\n        raise RuntimeError(f\"PDF conversion produced no output for {docx_path.name}\")\n    return 
pdf_path\n",[30,178141,178142,178147,178153,178159,178169,178173,178183,178188,178208,178228,178232,178242,178246,178263,178279,178306,178310,178319,178329,178361,178365,178387,178405,178429],{"__ignoreMap":28},[33,178143,178144],{"class":35,"line":36},[33,178145,178146],{"class":39},"# pip install docx2pdf   (Windows\u002FmacOS only)\n",[33,178148,178149,178151],{"class":35,"line":43},[33,178150,164],{"class":163},[33,178152,41706],{"class":167},[33,178154,178155,178157],{"class":35,"line":61},[33,178156,164],{"class":163},[33,178158,35040],{"class":167},[33,178160,178161,178163,178165,178167],{"class":35,"line":73},[33,178162,190],{"class":163},[33,178164,193],{"class":167},[33,178166,164],{"class":163},[33,178168,198],{"class":167},[33,178170,178171],{"class":35,"line":88},[33,178172,92],{"emptyLinePlaceholder":91},[33,178174,178175,178177,178180],{"class":35,"line":95},[33,178176,562],{"class":163},[33,178178,178179],{"class":46}," docx_to_pdf",[33,178181,178182],{"class":167},"(docx_path: Path, out_dir: Path) -> Path:\n",[33,178184,178185],{"class":35,"line":101},[33,178186,178187],{"class":54},"    \"\"\"Convert via LibreOffice headless on Linux\u002Fservers, docx2pdf 
elsewhere.\"\"\"\n",[33,178189,178190,178192,178194,178196,178198,178200,178202,178204,178206],{"class":35,"line":171},[33,178191,28258],{"class":167},[33,178193,869],{"class":238},[33,178195,242],{"class":163},[33,178197,855],{"class":50},[33,178199,365],{"class":167},[33,178201,878],{"class":238},[33,178203,242],{"class":163},[33,178205,855],{"class":50},[33,178207,221],{"class":167},[33,178209,178210,178212,178214,178216,178218,178220,178222,178224,178226],{"class":35,"line":179},[33,178211,166667],{"class":167},[33,178213,242],{"class":163},[33,178215,41716],{"class":167},[33,178217,165324],{"class":54},[33,178219,1649],{"class":167},[33,178221,7162],{"class":163},[33,178223,41716],{"class":167},[33,178225,165333],{"class":54},[33,178227,221],{"class":167},[33,178229,178230],{"class":35,"line":187},[33,178231,92],{"emptyLinePlaceholder":91},[33,178233,178234,178236,178239],{"class":35,"line":201},[33,178235,617],{"class":163},[33,178237,178238],{"class":167}," soffice:  ",[33,178240,178241],{"class":39},"# cross-platform, server-safe path\n",[33,178243,178244],{"class":35,"line":206},[33,178245,168764],{"class":167},[33,178247,178248,178251,178253,178255,178257,178259,178261],{"class":35,"line":224},[33,178249,178250],{"class":167},"            [soffice, ",[33,178252,148375],{"class":54},[33,178254,365],{"class":167},[33,178256,167798],{"class":54},[33,178258,365],{"class":167},[33,178260,15519],{"class":54},[33,178262,247],{"class":167},[33,178264,178265,178268,178270,178272,178275,178277],{"class":35,"line":229},[33,178266,178267],{"class":54},"             \"--outdir\"",[33,178269,365],{"class":167},[33,178271,1053],{"class":50},[33,178273,178274],{"class":167},"(out_dir), ",[33,178276,1053],{"class":50},[33,178278,167821],{"class":167},[33,178280,178281,178284,178286,178288,178290,178292,178294,178296,178298,178300,178302,178304],{"class":35,"line":235},[33,178282,178283],{"class":238},"            
check",[33,178285,242],{"class":163},[33,178287,855],{"class":50},[33,178289,365],{"class":167},[33,178291,36378],{"class":238},[33,178293,242],{"class":163},[33,178295,855],{"class":50},[33,178297,365],{"class":167},[33,178299,1641],{"class":238},[33,178301,242],{"class":163},[33,178303,2589],{"class":50},[33,178305,247],{"class":167},[33,178307,178308],{"class":35,"line":250},[33,178309,5867],{"class":167},[33,178311,178312,178314,178316],{"class":35,"line":266},[33,178313,6864],{"class":163},[33,178315,104759],{"class":167},[33,178317,178318],{"class":39},"# desktop fallback (requires Microsoft Word)\n",[33,178320,178321,178323,178325,178327],{"class":35,"line":290},[33,178322,164848],{"class":163},[33,178324,164024],{"class":167},[33,178326,164],{"class":163},[33,178328,164029],{"class":167},[33,178330,178331,178334,178336,178339,178341,178343,178345,178347,178349,178351,178354,178356,178359],{"class":35,"line":295},[33,178332,178333],{"class":167},"        convert(",[33,178335,1053],{"class":50},[33,178337,178338],{"class":167},"(docx_path), 
",[33,178340,1053],{"class":50},[33,178342,28482],{"class":167},[33,178344,1351],{"class":163},[33,178346,1110],{"class":163},[33,178348,274],{"class":54},[33,178350,1115],{"class":50},[33,178352,178353],{"class":167},"docx_path.stem",[33,178355,1121],{"class":50},[33,178357,178358],{"class":54},".pdf\"",[33,178360,371],{"class":167},[33,178362,178363],{"class":35,"line":300},[33,178364,92],{"emptyLinePlaceholder":91},[33,178366,178367,178369,178371,178373,178375,178377,178379,178381,178383,178385],{"class":35,"line":317},[33,178368,21570],{"class":167},[33,178370,242],{"class":163},[33,178372,40669],{"class":167},[33,178374,1351],{"class":163},[33,178376,1110],{"class":163},[33,178378,274],{"class":54},[33,178380,1115],{"class":50},[33,178382,178353],{"class":167},[33,178384,1121],{"class":50},[33,178386,19246],{"class":54},[33,178388,178389,178391,178393,178395,178397,178399,178401,178403],{"class":35,"line":332},[33,178390,617],{"class":163},[33,178392,620],{"class":163},[33,178394,169417],{"class":167},[33,178396,7162],{"class":163},[33,178398,169422],{"class":167},[33,178400,1865],{"class":163},[33,178402,10791],{"class":50},[33,178404,574],{"class":167},[33,178406,178407,178409,178411,178413,178415,178418,178420,178423,178425,178427],{"class":35,"line":347},[33,178408,4051],{"class":163},[33,178410,7590],{"class":50},[33,178412,602],{"class":167},[33,178414,4059],{"class":163},[33,178416,178417],{"class":54},"\"PDF conversion produced no output for ",[33,178419,1115],{"class":50},[33,178421,178422],{"class":167},"docx_path.name",[33,178424,1121],{"class":50},[33,178426,274],{"class":54},[33,178428,221],{"class":167},[33,178430,178431,178433],{"class":35,"line":374},[33,178432,1332],{"class":163},[33,178434,164504],{"class":167},[14,178436,178437,178438,178440,178441,178443,178444,2012,178446,178448],{},"The post-conversion check matters: LibreOffice can exit ",[30,178439,748],{}," yet silently emit nothing if the input is locked or malformed, so assert the 
file exists and is non-empty. Validate the ",[30,178442,18051],{}," itself the same way — scan the rendered output for stray ",[30,178445,159462],{},[30,178447,159476],{}," markers, which signal an unbound placeholder that slipped through.",[18,178450,28616],{"id":28615},[14,178452,178453],{},"A batch job that runs once on your laptop and one that runs unattended every night are different programs. The unattended version needs scheduling, durable logging, and recovery from transient failures.",[4211,178455,178456,178471,178484],{},[4214,178457,178458,178460,178461,178464,178465,178468,178469,3035],{},[1974,178459,28626],{}," On Linux, a cron entry (",[30,178462,178463],{},"0 6 * * 1 \u002Fpath\u002F.venv\u002Fbin\u002Fpython \u002Fpath\u002Frun_batch.py",") runs the job weekly. In CI, a scheduled GitHub Actions workflow with a ",[30,178466,178467],{},"cron:"," trigger gives you logs, artifacts, and a clean environment for free. The end-to-end scheduling and logging patterns generalize across document types in ",[940,178470,5],{"href":26465},[4214,178472,178473,178476,178477,178480,178481,178483],{},[1974,178474,178475],{},"Logging to a file."," Replace ",[30,178478,178479],{},"StreamHandler"," with a ",[30,178482,963],{}," so each run appends to a rotating log you can inspect after the fact. Log the source filename, record count, and per-row outcomes — not just \"done\".",[4214,178485,178486,178489],{},[1974,178487,178488],{},"Retries for I\u002FO."," PDF conversion and network-mounted output drives fail transiently. 
Wrap the conversion call in a small retry with backoff so a momentary file lock does not kill an otherwise-good document.",[23,178491,178493],{"className":126,"code":178492,"language":47,"meta":28,"style":28},"# pip install (stdlib only)\nimport logging\nimport time\nfrom logging.handlers import RotatingFileHandler\nfrom pathlib import Path\n\ndef configure_logging(log_file: Path) -> None:\n    log_file.parent.mkdir(parents=True, exist_ok=True)\n    handler = RotatingFileHandler(log_file, maxBytes=1_000_000, backupCount=3)\n    handler.setFormatter(logging.Formatter(\"%(asctime)s | %(levelname)s | %(message)s\"))\n    logging.basicConfig(level=logging.INFO, handlers=[handler])\n\n\ndef with_retries(func, *args, attempts: int = 3, base_delay: float = 1.0):\n    \"\"\"Retry a flaky I\u002FO operation with exponential backoff.\"\"\"\n    for attempt in range(1, attempts + 1):\n        try:\n            return func(*args)\n        except Exception as exc:\n            if attempt == attempts:\n                logging.error(\"gave up after %s attempts: %s\", attempts, exc)\n                raise\n            wait = base_delay * 2 ** (attempt - 1)\n            logging.warning(\"attempt %s failed (%s); retrying in %.1fs\", attempt, exc, wait)\n            time.sleep(wait)\n",[30,178494,178495,178499,178505,178511,178523,178533,178537,178550,178571,178599,178620,178643,178647,178651,178681,178686,178708,178714,178725,178735,178745,178763,178767,178789,178811],{"__ignoreMap":28},[33,178496,178497],{"class":35,"line":36},[33,178498,26734],{"class":39},[33,178500,178501,178503],{"class":35,"line":43},[33,178502,164],{"class":163},[33,178504,184],{"class":167},[33,178506,178507,178509],{"class":35,"line":61},[33,178508,164],{"class":163},[33,178510,1689],{"class":167},[33,178512,178513,178515,178518,178520],{"class":35,"line":73},[33,178514,190],{"class":163},[33,178516,178517],{"class":167}," logging.handlers ",[33,178519,164],{"class":163},[33,178521,178522],{"class":167}," 
RotatingFileHandler\n",[33,178524,178525,178527,178529,178531],{"class":35,"line":88},[33,178526,190],{"class":163},[33,178528,193],{"class":167},[33,178530,164],{"class":163},[33,178532,198],{"class":167},[33,178534,178535],{"class":35,"line":95},[33,178536,92],{"emptyLinePlaceholder":91},[33,178538,178539,178541,178543,178546,178548],{"class":35,"line":101},[33,178540,562],{"class":163},[33,178542,1047],{"class":46},[33,178544,178545],{"class":167},"(log_file: Path) -> ",[33,178547,571],{"class":50},[33,178549,574],{"class":167},[33,178551,178552,178555,178557,178559,178561,178563,178565,178567,178569],{"class":35,"line":171},[33,178553,178554],{"class":167},"    log_file.parent.mkdir(",[33,178556,869],{"class":238},[33,178558,242],{"class":163},[33,178560,855],{"class":50},[33,178562,365],{"class":167},[33,178564,878],{"class":238},[33,178566,242],{"class":163},[33,178568,855],{"class":50},[33,178570,221],{"class":167},[33,178572,178573,178575,178577,178580,178583,178585,178588,178590,178593,178595,178597],{"class":35,"line":179},[33,178574,1133],{"class":167},[33,178576,242],{"class":163},[33,178578,178579],{"class":167}," RotatingFileHandler(log_file, ",[33,178581,178582],{"class":238},"maxBytes",[33,178584,242],{"class":163},[33,178586,178587],{"class":50},"1_000_000",[33,178589,365],{"class":167},[33,178591,178592],{"class":238},"backupCount",[33,178594,242],{"class":163},[33,178596,10258],{"class":50},[33,178598,221],{"class":167},[33,178600,178601,178604,178606,178608,178610,178612,178614,178616,178618],{"class":35,"line":187},[33,178602,178603],{"class":167},"    
handler.setFormatter(logging.Formatter(",[33,178605,274],{"class":54},[33,178607,277],{"class":50},[33,178609,26814],{"class":54},[33,178611,26817],{"class":50},[33,178613,26814],{"class":54},[33,178615,26827],{"class":50},[33,178617,274],{"class":54},[33,178619,371],{"class":167},[33,178621,178622,178625,178627,178629,178631,178633,178635,178638,178640],{"class":35,"line":201},[33,178623,178624],{"class":167},"    logging.basicConfig(",[33,178626,18267],{"class":238},[33,178628,242],{"class":163},[33,178630,258],{"class":167},[33,178632,1067],{"class":50},[33,178634,365],{"class":167},[33,178636,178637],{"class":238},"handlers",[33,178639,242],{"class":163},[33,178641,178642],{"class":167},"[handler])\n",[33,178644,178645],{"class":35,"line":206},[33,178646,92],{"emptyLinePlaceholder":91},[33,178648,178649],{"class":35,"line":224},[33,178650,92],{"emptyLinePlaceholder":91},[33,178652,178653,178655,178658,178661,178663,178665,178667,178669,178671,178673,178675,178677,178679],{"class":35,"line":229},[33,178654,562],{"class":163},[33,178656,178657],{"class":46}," with_retries",[33,178659,178660],{"class":167},"(func, ",[33,178662,1769],{"class":163},[33,178664,28726],{"class":167},[33,178666,1059],{"class":50},[33,178668,212],{"class":163},[33,178670,1714],{"class":50},[33,178672,28735],{"class":167},[33,178674,1720],{"class":50},[33,178676,212],{"class":163},[33,178678,28742],{"class":50},[33,178680,1737],{"class":167},[33,178682,178683],{"class":35,"line":235},[33,178684,178685],{"class":54},"    \"\"\"Retry a flaky I\u002FO operation with exponential 
backoff.\"\"\"\n",[33,178687,178688,178690,178692,178694,178696,178698,178700,178702,178704,178706],{"class":35,"line":250},[33,178689,656],{"class":163},[33,178691,1796],{"class":167},[33,178693,662],{"class":163},[33,178695,1801],{"class":50},[33,178697,602],{"class":167},[33,178699,734],{"class":50},[33,178701,1808],{"class":167},[33,178703,1811],{"class":163},[33,178705,1814],{"class":50},[33,178707,1737],{"class":167},[33,178709,178710,178712],{"class":35,"line":266},[33,178711,670],{"class":163},[33,178713,574],{"class":167},[33,178715,178716,178718,178721,178723],{"class":35,"line":290},[33,178717,28782],{"class":163},[33,178719,178720],{"class":167}," func(",[33,178722,1769],{"class":163},[33,178724,28789],{"class":167},[33,178726,178727,178729,178731,178733],{"class":35,"line":295},[33,178728,780],{"class":163},[33,178730,783],{"class":50},[33,178732,1852],{"class":163},[33,178734,1855],{"class":167},[33,178736,178737,178739,178741,178743],{"class":35,"line":300},[33,178738,5995],{"class":163},[33,178740,1796],{"class":167},[33,178742,1865],{"class":163},[33,178744,1868],{"class":167},[33,178746,178747,178750,178753,178755,178757,178759,178761],{"class":35,"line":317},[33,178748,178749],{"class":167},"                logging.error(",[33,178751,178752],{"class":54},"\"gave up after 
",[33,178754,309],{"class":50},[33,178756,67588],{"class":54},[33,178758,309],{"class":50},[33,178760,274],{"class":54},[33,178762,67595],{"class":167},[33,178764,178765],{"class":35,"line":332},[33,178766,28814],{"class":163},[33,178768,178769,178771,178773,178775,178777,178779,178781,178783,178785,178787],{"class":35,"line":347},[33,178770,1783],{"class":167},[33,178772,242],{"class":163},[33,178774,28824],{"class":167},[33,178776,1769],{"class":163},[33,178778,7451],{"class":50},[33,178780,28833],{"class":163},[33,178782,28836],{"class":167},[33,178784,4126],{"class":163},[33,178786,1814],{"class":50},[33,178788,221],{"class":167},[33,178790,178791,178793,178796,178798,178800,178802,178804,178806,178808],{"class":35,"line":374},[33,178792,134681],{"class":167},[33,178794,178795],{"class":54},"\"attempt ",[33,178797,309],{"class":50},[33,178799,28855],{"class":54},[33,178801,309],{"class":50},[33,178803,28860],{"class":54},[33,178805,1907],{"class":50},[33,178807,1910],{"class":54},[33,178809,178810],{"class":167},", attempt, exc, wait)\n",[33,178812,178813],{"class":35,"line":397},[33,178814,178815],{"class":167},"            time.sleep(wait)\n",[14,178817,178818],{},"Make the whole job idempotent: deterministic filenames mean a re-run after a crash overwrites the same outputs rather than duplicating them, so you can safely restart a failed batch from the top.",[14,178820,178821,178822,178825,178826,2012,178828,178830],{},"One more production concern is observability after the fact. A scheduled job that runs while no one watches needs to leave a trail you can reconstruct a problem from days later. Log the source file path and its modification time, the record count read, and a per-row success or failure line keyed by the same business identifier used in the filename — that way a complaint about \"the wrong total on invoice INV-2207\" maps straight to a log line and the source row behind it. 
Emit a single summary line at the end (",[30,178823,178824],{},"done: 412 ok, 3 failed",") and, in CI, fail the job's exit status when the failure count crosses a threshold so a broken upstream feed raises an alert instead of quietly shipping blanks. Treat the rendered documents themselves as the final validation gate: a quick pass that opens each output and asserts no ",[30,178827,159462],{},[30,178829,159476],{}," survived catches unbound placeholders before they reach a client's inbox.",[18,178832,29071],{"id":29070},[4273,178834,178835,178845],{},[4276,178836,178837],{},[4279,178838,178839,178841,178843],{},[4282,178840,29080],{},[4282,178842,4287],{},[4282,178844,4290],{},[4292,178846,178847,178865,178879,178897,178912],{},[4279,178848,178849,178852,178858],{},[4297,178850,178851],{},"Every output file contains the first row's data",[4297,178853,178854,178855,178857],{},"A single ",[30,178856,175164],{}," instance reused across the loop mutates in place",[4297,178859,178860,178861,178864],{},"Re-instantiate ",[30,178862,178863],{},"DocxTemplate(template)"," inside the loop for each record",[4279,178866,178867,178873,178876],{},[4297,178868,178869,178870],{},"Placeholder renders as literal ",[30,178871,178872],{},"{{ name }}",[4297,178874,178875],{},"The placeholder text is split across multiple Word runs by autocorrect or mid-word edits",[4297,178877,178878],{},"Delete and retype the placeholder in one clean pass to collapse the runs",[4279,178880,178881,178886,178891],{},[4297,178882,178883,178884],{},"Empty cells render as ",[30,178885,177215],{},[4297,178887,178888,178889,154487],{},"pandas converts blank cells to the float ",[30,178890,8884],{},[4297,178892,154440,178893,178896],{},[30,178894,178895],{},"dtype=str, keep_default_na=False",", then cast specific columns explicitly",[4279,178898,178899,178902,178905],{},[4297,178900,178901],{},"PDF conversion fails on the server",[4297,178903,178904],{},"docx2pdf drives Microsoft Word, which is absent on 
Linux",[4297,178906,178907,178908,178911],{},"Use LibreOffice ",[30,178909,178910],{},"--headless --convert-to pdf"," on servers; reserve docx2pdf for desktops",[4279,178913,178914,178918,178921],{},[4297,178915,178916,70954],{},[30,178917,70953],{},[4297,178919,178920],{},"The entire data file is loaded and all documents held in memory",[4297,178922,178923,178924,178927],{},"Stream with ",[30,178925,178926],{},"read_csv(chunksize=...)"," and write each document before rendering the next",[18,178929,88566],{"id":29183},[14,178931,178932,178935,178936,178938],{},[1974,178933,178934],{},"Which library should I use — python-docx or docxtpl?","\nUse docxtpl when you fill a Word-authored template with ",[30,178937,172245],{},"; it preserves every style and layout detail. Use python-docx when you build or restructure a document programmatically, or to edit metadata and styles after rendering. Most batch jobs use docxtpl for the render and python-docx only for post-processing.",[14,178940,178941,178944,178945,178947],{},[1974,178942,178943],{},"Can I generate thousands of documents without crashing?","\nYes. Stream the data with pandas ",[30,178946,21944],{}," instead of loading it all at once, re-instantiate the template per row, and write each file before moving to the next so only one document is in memory at a time. The bottleneck at scale is usually PDF conversion, not rendering — parallelize that step across a process pool if it dominates runtime.",[14,178949,178950,178953,178954,176812,178956,178958,178959,178961],{},[1974,178951,178952],{},"How do I build tables whose row count varies per document?","\nUse docxtpl's row loop, ",[30,178955,176811],{},[30,178957,173951],{},", placed inside the template table. 
It clones the underlying ",[30,178960,176818],{}," table-row XML for each item in the bound list, preserving the header row and cell borders, so a record with three line items and one with thirty both render correctly.",[14,178963,178964,178967,178968,178970],{},[1974,178965,178966],{},"Does this work on Linux and macOS, or only Windows?","\nTemplate rendering and data binding are fully cross-platform — they only touch the ",[30,178969,18051],{}," XML. The platform-dependent step is PDF conversion: docx2pdf needs Microsoft Word (Windows\u002FmacOS only), while LibreOffice headless runs anywhere, including Linux servers and CI runners.",[14,178972,178973,178976,178977,178979],{},[1974,178974,178975],{},"Where should data formatting live — in Python or in the template?","\nIn Python. Coerce currency, dates, and number formats into display-ready strings before binding the context. Keeping formatting in version-controlled code rather than inside the binary ",[30,178978,18051],{}," makes the logic reviewable, testable, and consistent across every document in the batch.",[18,178981,6918],{"id":6917},[4211,178983,178984,178989,178994,178999],{},[4214,178985,178986,178988],{},[940,178987,156152],{"href":26562}," — the python-docx structural API and library selection before you scale to batches",[4214,178990,178991,178993],{},[940,178992,26185],{"href":18040}," — conditional blocks, nested data, and personalized bulk output with docxtpl and Jinja2",[4214,178995,178996,178998],{},[940,178997,156178],{"href":156177}," — embedding logos and per-record images, with sizing that survives rendering",[4214,179000,179001,179003],{},[940,179002,161278],{"href":161277}," — headless LibreOffice versus docx2pdf, fidelity tradeoffs, and batch 
conversion",[14,179005,6947,179006,3035],{},[940,179007,29264],{"href":1351},[6953,179009,29267],{},{"title":28,"searchDepth":43,"depth":43,"links":179011},[179012,179013,179014,179015,179016,179017,179018,179019,179020,179021,179022,179023],{"id":176340,"depth":43,"text":176341},{"id":26468,"depth":43,"text":26469},{"id":26618,"depth":43,"text":26619},{"id":176750,"depth":43,"text":176751},{"id":176822,"depth":43,"text":176823},{"id":177246,"depth":43,"text":177247},{"id":177555,"depth":43,"text":177556},{"id":178127,"depth":43,"text":178128},{"id":28615,"depth":43,"text":28616},{"id":29070,"depth":43,"text":29071},{"id":29183,"depth":43,"text":88566},{"id":6917,"depth":43,"text":6918},"Word Automation","Generate hundreds of consistent Word documents from CSV, Excel, or JSON with Python. Covers docxtpl, python-docx, Jinja2, batch loops, PDF export, and production hardening.",{},"\u002Fword-document-templating-batch-processing",{"title":26263,"description":179025},"Python Word Templating & Batch Processing","word-document-templating-batch-processing\u002Findex",[170115,47,18047,18041,75762],"KKEBjndL6YdJKisXiP_JYnp6TaK7mn8MsQfq07wP9DA",{"id":179034,"title":179035,"body":179036,"breadcrumbTitle":181614,"canonical":6977,"date":6978,"description":181615,"draft":6980,"extension":6981,"image":6977,"meta":181616,"navigation":91,"path":181617,"robots":6977,"seo":181618,"seoTitle":181619,"stem":181620,"tags":181621,"updatedAt":6978,"__hash__":181623},"content\u002Fword-document-templating-batch-processing\u002Finserting-images-into-word-documents\u002Ffix-image-too-large-in-python-docx\u002Findex.md","Fix Images Too Large in 
python-docx",{"type":7,"value":179037,"toc":181601},[179038,179041,179052,179054,179057,179063,179070,179073,179075,179078,179356,179359,179363,179375,179601,179610,179614,179621,179902,179919,179923,179932,179935,180140,180144,180147,180496,180506,180510,180517,180520,180526,180826,180832,181045,181048,181052,181058,181138,181140,181237,181239,181242,181558,181575,181577,181594,181598],[10,179039,179035],{"id":179040},"fix-image-too-large-in-python-docx",[14,179042,43155,179043,179046,179047,2012,179049,179051],{},[30,179044,179045],{},"doc.add_picture(\"logo.png\")"," without a ",[30,179048,56684],{},[30,179050,61972],{}," argument inserts the image at its native physical size derived from the file's DPI metadata. A 2000 x 1500 px image saved at 72 DPI renders as a roughly 27.8 x 20.8 inch shape — it overflows the page margins, pushes all subsequent content down, and frequently causes Word to flag the document as corrupt on open.",[18,179053,7021],{"id":7020},[14,179055,179056],{},"python-docx converts pixel dimensions to EMU (English Metric Units) using the image's embedded DPI:",[23,179058,179061],{"className":179059,"code":179060,"language":2000},[1998],"width_emu = pixel_width \u002F dpi * 914400\n",[30,179062,179060],{"__ignoreMap":28},[14,179064,179065,179066,179069],{},"A 2000 px wide image at 72 DPI produces ",[30,179067,179068],{},"2000 \u002F 72 * 914400 = 25,400,000 EMU = 27.8 inches",". 
A standard US Letter page is only 8.5 inches wide, so the image blows past both margins.",[14,179071,179072],{},"The same logic applies in reverse for high-DPI images: a 600 x 450 px image at 300 DPI renders at 2 x 1.5 inches, which may be smaller than expected.",[18,179074,99786],{"id":54445},[14,179076,179077],{},"Run this to confirm your image's native physical size before inserting it:",[23,179079,179081],{"className":126,"code":179080,"language":47,"meta":28,"style":28},"# pip install Pillow\nfrom PIL import Image\nfrom pathlib import Path\n\nimg_path = Path(\"assets\u002Flogo.png\")\ntry:\n    with Image.open(img_path) as img:\n        w, h = img.size\n        dpi = img.info.get(\"dpi\", (72, 72))\n        print(f\"Pixels: {w} x {h}\")\n        print(f\"DPI: {dpi[0]} x {dpi[1]}\")\n        print(f\"Native size: {w\u002Fdpi[0]:.2f} x {h\u002Fdpi[1]:.2f} inches\")\nexcept FileNotFoundError:\n    print(f\"File not found: {img_path}\")\nexcept Exception as e:\n    print(f\"Could not read image: {e}\")\n",[30,179082,179083,179088,179098,179108,179112,179126,179132,179144,179154,179177,179208,179246,179296,179304,179325,179335],{"__ignoreMap":28},[33,179084,179085],{"class":35,"line":36},[33,179086,179087],{"class":39},"# pip install Pillow\n",[33,179089,179090,179092,179094,179096],{"class":35,"line":43},[33,179091,190],{"class":163},[33,179093,46889],{"class":50},[33,179095,46892],{"class":163},[33,179097,47171],{"class":167},[33,179099,179100,179102,179104,179106],{"class":35,"line":61},[33,179101,190],{"class":163},[33,179103,193],{"class":167},[33,179105,164],{"class":163},[33,179107,198],{"class":167},[33,179109,179110],{"class":35,"line":73},[33,179111,92],{"emptyLinePlaceholder":91},[33,179113,179114,179117,179119,179121,179124],{"class":35,"line":88},[33,179115,179116],{"class":167},"img_path 
",[33,179118,242],{"class":163},[33,179120,215],{"class":167},[33,179122,179123],{"class":54},"\"assets\u002Flogo.png\"",[33,179125,221],{"class":167},[33,179127,179128,179130],{"class":35,"line":95},[33,179129,35574],{"class":163},[33,179131,574],{"class":167},[33,179133,179134,179136,179139,179141],{"class":35,"line":101},[33,179135,1635],{"class":163},[33,179137,179138],{"class":167}," Image.open(img_path) ",[33,179140,495],{"class":163},[33,179142,179143],{"class":167}," img:\n",[33,179145,179146,179149,179151],{"class":35,"line":171},[33,179147,179148],{"class":167},"        w, h ",[33,179150,242],{"class":163},[33,179152,179153],{"class":167}," img.size\n",[33,179155,179156,179159,179161,179164,179167,179169,179171,179173,179175],{"class":35,"line":179},[33,179157,179158],{"class":167},"        dpi ",[33,179160,242],{"class":163},[33,179162,179163],{"class":167}," img.info.get(",[33,179165,179166],{"class":54},"\"dpi\"",[33,179168,19953],{"class":167},[33,179170,49823],{"class":50},[33,179172,365],{"class":167},[33,179174,49823],{"class":50},[33,179176,371],{"class":167},[33,179178,179179,179181,179183,179185,179188,179190,179193,179195,179198,179200,179202,179204,179206],{"class":35,"line":187},[33,179180,9414],{"class":50},[33,179182,602],{"class":167},[33,179184,4059],{"class":163},[33,179186,179187],{"class":54},"\"Pixels: ",[33,179189,1115],{"class":50},[33,179191,179192],{"class":167},"w",[33,179194,1121],{"class":50},[33,179196,179197],{"class":54}," x ",[33,179199,1115],{"class":50},[33,179201,82663],{"class":167},[33,179203,1121],{"class":50},[33,179205,274],{"class":54},[33,179207,221],{"class":167},[33,179209,179210,179212,179214,179216,179219,179221,179224,179226,179228,179230,179232,179234,179236,179238,179240,179242,179244],{"class":35,"line":201},[33,179211,9414],{"class":50},[33,179213,602],{"class":167},[33,179215,4059],{"class":163},[33,179217,179218],{"class":54},"\"DPI: 
",[33,179220,1115],{"class":50},[33,179222,179223],{"class":167},"dpi[",[33,179225,748],{"class":50},[33,179227,9546],{"class":167},[33,179229,1121],{"class":50},[33,179231,179197],{"class":54},[33,179233,1115],{"class":50},[33,179235,179223],{"class":167},[33,179237,734],{"class":50},[33,179239,9546],{"class":167},[33,179241,1121],{"class":50},[33,179243,274],{"class":54},[33,179245,221],{"class":167},[33,179247,179248,179250,179252,179254,179257,179259,179261,179263,179265,179267,179269,179271,179273,179275,179277,179279,179281,179283,179285,179287,179289,179291,179294],{"class":35,"line":206},[33,179249,9414],{"class":50},[33,179251,602],{"class":167},[33,179253,4059],{"class":163},[33,179255,179256],{"class":54},"\"Native size: ",[33,179258,1115],{"class":50},[33,179260,179192],{"class":167},[33,179262,1351],{"class":163},[33,179264,179223],{"class":167},[33,179266,748],{"class":50},[33,179268,9546],{"class":167},[33,179270,55819],{"class":163},[33,179272,1121],{"class":50},[33,179274,179197],{"class":54},[33,179276,1115],{"class":50},[33,179278,82663],{"class":167},[33,179280,1351],{"class":163},[33,179282,179223],{"class":167},[33,179284,734],{"class":50},[33,179286,9546],{"class":167},[33,179288,55819],{"class":163},[33,179290,1121],{"class":50},[33,179292,179293],{"class":54}," 
inches\"",[33,179295,221],{"class":167},[33,179297,179298,179300,179302],{"class":35,"line":224},[33,179299,35726],{"class":163},[33,179301,2945],{"class":50},[33,179303,574],{"class":167},[33,179305,179306,179308,179310,179312,179314,179316,179319,179321,179323],{"class":35,"line":229},[33,179307,7268],{"class":50},[33,179309,602],{"class":167},[33,179311,4059],{"class":163},[33,179313,15677],{"class":54},[33,179315,1115],{"class":50},[33,179317,179318],{"class":167},"img_path",[33,179320,1121],{"class":50},[33,179322,274],{"class":54},[33,179324,221],{"class":167},[33,179326,179327,179329,179331,179333],{"class":35,"line":235},[33,179328,35726],{"class":163},[33,179330,783],{"class":50},[33,179332,1852],{"class":163},[33,179334,7583],{"class":167},[33,179336,179337,179339,179341,179343,179346,179348,179350,179352,179354],{"class":35,"line":250},[33,179338,7268],{"class":50},[33,179340,602],{"class":167},[33,179342,4059],{"class":163},[33,179344,179345],{"class":54},"\"Could not read image: ",[33,179347,1115],{"class":50},[33,179349,7602],{"class":167},[33,179351,1121],{"class":50},[33,179353,274],{"class":54},[33,179355,221],{"class":167},[14,179357,179358],{},"If \"Native size\" is larger than your usable page area (typically 6–7 inches for a standard letter\u002FA4 page with 1-inch margins), the image will overflow without an explicit size argument.",[18,179360,179362],{"id":179361},"fix-always-pass-an-explicit-width","Fix: Always Pass an Explicit Width",[14,179364,179365,179366,36604,179368,179370,179371,179374],{},"The primary fix is to always supply ",[30,179367,56684],{},[30,179369,61972],{},") to ",[30,179372,179373],{},"add_picture",". 
python-docx preserves the aspect ratio when only one dimension is given.",[23,179376,179378],{"className":126,"code":179377,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\n\nIMAGE = Path(\"assets\u002Flogo.png\")\nOUTPUT = Path(\"output\u002Ffixed_image.docx\")\n\ndoc = Document()\ndoc.add_heading(\"Fixed Report\", level=1)\n\ntry:\n    # Pass explicit width — aspect ratio is preserved automatically\n    doc.add_picture(str(IMAGE), width=Inches(3.0))  # never omit this argument\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept FileNotFoundError:\n    print(f\"Image not found: {IMAGE}\")\nexcept Exception as e:\n    print(f\"Insertion failed: {e}\")\n",[30,179379,179380,179384,179394,179404,179414,179418,179431,179444,179448,179456,179473,179477,179483,179488,179516,179528,179544,179552,179570,179580],{"__ignoreMap":28},[33,179381,179382],{"class":35,"line":36},[33,179383,156213],{"class":39},[33,179385,179386,179388,179390,179392],{"class":35,"line":43},[33,179387,190],{"class":163},[33,179389,193],{"class":167},[33,179391,164],{"class":163},[33,179393,198],{"class":167},[33,179395,179396,179398,179400,179402],{"class":35,"line":61},[33,179397,190],{"class":163},[33,179399,18092],{"class":167},[33,179401,164],{"class":163},[33,179403,18097],{"class":167},[33,179405,179406,179408,179410,179412],{"class":35,"line":73},[33,179407,190],{"class":163},[33,179409,18104],{"class":167},[33,179411,164],{"class":163},[33,179413,157048],{"class":167},[33,179415,179416],{"class":35,"line":88},[33,179417,92],{"emptyLinePlaceholder":91},[33,179419,179420,179423,179425,179427,179429],{"class":35,"line":95},[33,179421,179422],{"class":50},"IMAGE",[33,179424,212],{"class":163},[33,179426,215],{"class":167},[33,179428,179123],{"class":54},[33,179430,221],{"class":167},[33,179432,179433,179435,179437,179439,179442],{"class":35,"line":101},[33,179434,96935],{"class":50}
,[33,179436,212],{"class":163},[33,179438,215],{"class":167},[33,179440,179441],{"class":54},"\"output\u002Ffixed_image.docx\"",[33,179443,221],{"class":167},[33,179445,179446],{"class":35,"line":171},[33,179447,92],{"emptyLinePlaceholder":91},[33,179449,179450,179452,179454],{"class":35,"line":179},[33,179451,156566],{"class":167},[33,179453,242],{"class":163},[33,179455,18229],{"class":167},[33,179457,179458,179460,179463,179465,179467,179469,179471],{"class":35,"line":187},[33,179459,156723],{"class":167},[33,179461,179462],{"class":54},"\"Fixed Report\"",[33,179464,365],{"class":167},[33,179466,18267],{"class":238},[33,179468,242],{"class":163},[33,179470,734],{"class":50},[33,179472,221],{"class":167},[33,179474,179475],{"class":35,"line":201},[33,179476,92],{"emptyLinePlaceholder":91},[33,179478,179479,179481],{"class":35,"line":206},[33,179480,35574],{"class":163},[33,179482,574],{"class":167},[33,179484,179485],{"class":35,"line":224},[33,179486,179487],{"class":39},"    # Pass explicit width — aspect ratio is preserved automatically\n",[33,179489,179490,179493,179495,179497,179499,179501,179503,179505,179508,179511,179513],{"class":35,"line":229},[33,179491,179492],{"class":167},"    doc.add_picture(",[33,179494,1053],{"class":50},[33,179496,602],{"class":167},[33,179498,179422],{"class":50},[33,179500,18525],{"class":167},[33,179502,56684],{"class":238},[33,179504,242],{"class":163},[33,179506,179507],{"class":167},"Inches(",[33,179509,179510],{"class":50},"3.0",[33,179512,58831],{"class":167},[33,179514,179515],{"class":39},"# never omit this 
argument\n",[33,179517,179518,179520,179522,179524,179526],{"class":35,"line":235},[33,179519,85716],{"class":167},[33,179521,1053],{"class":50},[33,179523,602],{"class":167},[33,179525,96935],{"class":50},[33,179527,371],{"class":167},[33,179529,179530,179532,179534,179536,179538,179540,179542],{"class":35,"line":250},[33,179531,7268],{"class":50},[33,179533,602],{"class":167},[33,179535,4059],{"class":163},[33,179537,97737],{"class":54},[33,179539,97684],{"class":50},[33,179541,274],{"class":54},[33,179543,221],{"class":167},[33,179545,179546,179548,179550],{"class":35,"line":266},[33,179547,35726],{"class":163},[33,179549,2945],{"class":50},[33,179551,574],{"class":167},[33,179553,179554,179556,179558,179560,179563,179566,179568],{"class":35,"line":290},[33,179555,7268],{"class":50},[33,179557,602],{"class":167},[33,179559,4059],{"class":163},[33,179561,179562],{"class":54},"\"Image not found: ",[33,179564,179565],{"class":50},"{IMAGE}",[33,179567,274],{"class":54},[33,179569,221],{"class":167},[33,179571,179572,179574,179576,179578],{"class":35,"line":295},[33,179573,35726],{"class":163},[33,179575,783],{"class":50},[33,179577,1852],{"class":163},[33,179579,7583],{"class":167},[33,179581,179582,179584,179586,179588,179591,179593,179595,179597,179599],{"class":35,"line":300},[33,179583,7268],{"class":50},[33,179585,602],{"class":167},[33,179587,4059],{"class":163},[33,179589,179590],{"class":54},"\"Insertion failed: ",[33,179592,1115],{"class":50},[33,179594,7602],{"class":167},[33,179596,1121],{"class":50},[33,179598,274],{"class":54},[33,179600,221],{"class":167},[14,179602,17059,179603,49047,179606,179609],{},[30,179604,179605],{},"Cm(n)",[30,179607,179608],{},"Inches(n)"," if your document uses metric measurements. 
Both are valid arguments; they are just different units for the same EMU value.",[18,179611,179613],{"id":179612},"fix-compute-width-to-fit-the-usable-page-area","Fix: Compute Width to Fit the Usable Page Area",[14,179615,179616,179617,179620],{},"Hard-coding ",[30,179618,179619],{},"Inches(3.0)"," works for a single document but breaks when the template uses non-standard margins. Derive the usable width from the section geometry:",[23,179622,179624],{"className":126,"code":179623,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\n\nIMAGE = Path(\"assets\u002Flogo.png\")\nOUTPUT = Path(\"output\u002Fauto_width_image.docx\")\n\ndoc = Document()\nsection = doc.sections[0]\n\n# Compute usable width in EMU — works for any margin configuration\nusable_width = (\n    section.page_width        # total page width in EMU\n    - section.left_margin     # subtract left margin\n    - section.right_margin    # subtract right margin\n)\n\ndoc.add_heading(\"Auto-Fitted Image\", level=1)\ntry:\n    doc.add_picture(str(IMAGE), width=usable_width)  # fills the text column exactly\n    doc.save(str(OUTPUT))\n    print(f\"Usable width: {usable_width \u002F 914400:.2f} in — saved {OUTPUT}\")\nexcept FileNotFoundError:\n    print(f\"Image not found: {IMAGE}\")\nexcept Exception as e:\n    print(f\"Error: 
{e}\")\n",[30,179625,179626,179630,179640,179650,179654,179666,179679,179683,179691,179703,179707,179712,179721,179729,179740,179750,179754,179758,179775,179781,179803,179815,179848,179856,179872,179882],{"__ignoreMap":28},[33,179627,179628],{"class":35,"line":36},[33,179629,156213],{"class":39},[33,179631,179632,179634,179636,179638],{"class":35,"line":43},[33,179633,190],{"class":163},[33,179635,193],{"class":167},[33,179637,164],{"class":163},[33,179639,198],{"class":167},[33,179641,179642,179644,179646,179648],{"class":35,"line":61},[33,179643,190],{"class":163},[33,179645,18092],{"class":167},[33,179647,164],{"class":163},[33,179649,18097],{"class":167},[33,179651,179652],{"class":35,"line":73},[33,179653,92],{"emptyLinePlaceholder":91},[33,179655,179656,179658,179660,179662,179664],{"class":35,"line":88},[33,179657,179422],{"class":50},[33,179659,212],{"class":163},[33,179661,215],{"class":167},[33,179663,179123],{"class":54},[33,179665,221],{"class":167},[33,179667,179668,179670,179672,179674,179677],{"class":35,"line":95},[33,179669,96935],{"class":50},[33,179671,212],{"class":163},[33,179673,215],{"class":167},[33,179675,179676],{"class":54},"\"output\u002Fauto_width_image.docx\"",[33,179678,221],{"class":167},[33,179680,179681],{"class":35,"line":101},[33,179682,92],{"emptyLinePlaceholder":91},[33,179684,179685,179687,179689],{"class":35,"line":171},[33,179686,156566],{"class":167},[33,179688,242],{"class":163},[33,179690,18229],{"class":167},[33,179692,179693,179695,179697,179699,179701],{"class":35,"line":179},[33,179694,156584],{"class":167},[33,179696,242],{"class":163},[33,179698,156589],{"class":167},[33,179700,748],{"class":50},[33,179702,9202],{"class":167},[33,179704,179705],{"class":35,"line":187},[33,179706,92],{"emptyLinePlaceholder":91},[33,179708,179709],{"class":35,"line":201},[33,179710,179711],{"class":39},"# Compute usable width in EMU — works for any margin 
configuration\n",[33,179713,179714,179717,179719],{"class":35,"line":206},[33,179715,179716],{"class":167},"usable_width ",[33,179718,242],{"class":163},[33,179720,1415],{"class":167},[33,179722,179723,179726],{"class":35,"line":224},[33,179724,179725],{"class":167},"    section.page_width        ",[33,179727,179728],{"class":39},"# total page width in EMU\n",[33,179730,179731,179734,179737],{"class":35,"line":229},[33,179732,179733],{"class":163},"    -",[33,179735,179736],{"class":167}," section.left_margin     ",[33,179738,179739],{"class":39},"# subtract left margin\n",[33,179741,179742,179744,179747],{"class":35,"line":235},[33,179743,179733],{"class":163},[33,179745,179746],{"class":167}," section.right_margin    ",[33,179748,179749],{"class":39},"# subtract right margin\n",[33,179751,179752],{"class":35,"line":250},[33,179753,221],{"class":167},[33,179755,179756],{"class":35,"line":266},[33,179757,92],{"emptyLinePlaceholder":91},[33,179759,179760,179762,179765,179767,179769,179771,179773],{"class":35,"line":290},[33,179761,156723],{"class":167},[33,179763,179764],{"class":54},"\"Auto-Fitted Image\"",[33,179766,365],{"class":167},[33,179768,18267],{"class":238},[33,179770,242],{"class":163},[33,179772,734],{"class":50},[33,179774,221],{"class":167},[33,179776,179777,179779],{"class":35,"line":295},[33,179778,35574],{"class":163},[33,179780,574],{"class":167},[33,179782,179783,179785,179787,179789,179791,179793,179795,179797,179800],{"class":35,"line":300},[33,179784,179492],{"class":167},[33,179786,1053],{"class":50},[33,179788,602],{"class":167},[33,179790,179422],{"class":50},[33,179792,18525],{"class":167},[33,179794,56684],{"class":238},[33,179796,242],{"class":163},[33,179798,179799],{"class":167},"usable_width)  ",[33,179801,179802],{"class":39},"# fills the text column 
exactly\n",[33,179804,179805,179807,179809,179811,179813],{"class":35,"line":317},[33,179806,85716],{"class":167},[33,179808,1053],{"class":50},[33,179810,602],{"class":167},[33,179812,96935],{"class":50},[33,179814,371],{"class":167},[33,179816,179817,179819,179821,179823,179826,179828,179830,179832,179835,179837,179839,179842,179844,179846],{"class":35,"line":332},[33,179818,7268],{"class":50},[33,179820,602],{"class":167},[33,179822,4059],{"class":163},[33,179824,179825],{"class":54},"\"Usable width: ",[33,179827,1115],{"class":50},[33,179829,179716],{"class":167},[33,179831,1351],{"class":163},[33,179833,179834],{"class":50}," 914400",[33,179836,55819],{"class":163},[33,179838,1121],{"class":50},[33,179840,179841],{"class":54}," in — saved ",[33,179843,97684],{"class":50},[33,179845,274],{"class":54},[33,179847,221],{"class":167},[33,179849,179850,179852,179854],{"class":35,"line":347},[33,179851,35726],{"class":163},[33,179853,2945],{"class":50},[33,179855,574],{"class":167},[33,179857,179858,179860,179862,179864,179866,179868,179870],{"class":35,"line":374},[33,179859,7268],{"class":50},[33,179861,602],{"class":167},[33,179863,4059],{"class":163},[33,179865,179562],{"class":54},[33,179867,179565],{"class":50},[33,179869,274],{"class":54},[33,179871,221],{"class":167},[33,179873,179874,179876,179878,179880],{"class":35,"line":397},[33,179875,35726],{"class":163},[33,179877,783],{"class":50},[33,179879,1852],{"class":163},[33,179881,7583],{"class":167},[33,179883,179884,179886,179888,179890,179892,179894,179896,179898,179900],{"class":35,"line":653},[33,179885,7268],{"class":50},[33,179887,602],{"class":167},[33,179889,4059],{"class":163},[33,179891,39108],{"class":54},[33,179893,1115],{"class":50},[33,179895,7602],{"class":167},[33,179897,1121],{"class":50},[33,179899,274],{"class":54},[33,179901,221],{"class":167},[14,179903,179904,179905,365,179908,365,179911,179914,179915,179918],{},"All three values 
(",[30,179906,179907],{},"page_width",[30,179909,179910],{},"left_margin",[30,179912,179913],{},"right_margin",") are already in EMU, so arithmetic is exact. Pass the result directly to ",[30,179916,179917],{},"width="," — no unit conversion needed.",[18,179920,179922],{"id":179921},"variant-high-dpi-images-come-out-too-small","Variant: High-DPI Images Come Out Too Small",[14,179924,179925,179926,179929,179930,3035],{},"The opposite problem occurs with 300 DPI assets. A 1200 px wide image works out to ",[30,179927,179928],{},"1200 \u002F 300 * 914400 = 3,657,600 EMU = 4 inches",". That is a reasonable size but it may be too small for a full-width chart or too large for a logo corner. The fix is identical — always pass an explicit ",[30,179931,56684],{},[14,179933,179934],{},"The following helper reads the image's DPI and returns a safe EMU width:",[23,179936,179938],{"className":126,"code":179937,"language":47,"meta":28,"style":28},"# pip install python-docx Pillow\nfrom pathlib import Path\nfrom docx.shared import Inches\nfrom PIL import Image\n\ndef safe_image_width(img_path: Path, max_inches: float = 4.0) -> int:\n    \"\"\"Return an EMU width that never exceeds max_inches, preserving aspect ratio.\"\"\"\n    try:\n        with Image.open(img_path) as img:\n            w, h = img.size\n            dpi = img.info.get(\"dpi\", (96, 96))\n            native_inches = w \u002F dpi[0]\n            # Clamp to max_inches\n            target_inches = min(native_inches, max_inches)\n            return int(target_inches * 914400)  # convert to EMU\n    except Exception:\n        return int(Inches(max_inches))  # fallback to max\n\n# Usage:\n# doc.add_picture(str(img_path), width=safe_image_width(img_path, max_inches=3.0))\n",[30,179939,179940,179945,179955,179965,179975,179979,180002,180007,180013,180023,180032,180053,180071,180076,180088,180106,180114,180126,180130,180135],{"__ignoreMap":28},[33,179941,179942],{"class":35,"line":36},[33,179943,179944],{"class":39},"# pip install python-docx 
Pillow\n",[33,179946,179947,179949,179951,179953],{"class":35,"line":43},[33,179948,190],{"class":163},[33,179950,193],{"class":167},[33,179952,164],{"class":163},[33,179954,198],{"class":167},[33,179956,179957,179959,179961,179963],{"class":35,"line":61},[33,179958,190],{"class":163},[33,179960,18104],{"class":167},[33,179962,164],{"class":163},[33,179964,157048],{"class":167},[33,179966,179967,179969,179971,179973],{"class":35,"line":73},[33,179968,190],{"class":163},[33,179970,46889],{"class":50},[33,179972,46892],{"class":163},[33,179974,47171],{"class":167},[33,179976,179977],{"class":35,"line":88},[33,179978,92],{"emptyLinePlaceholder":91},[33,179980,179981,179983,179986,179989,179991,179993,179996,179998,180000],{"class":35,"line":95},[33,179982,562],{"class":163},[33,179984,179985],{"class":46}," safe_image_width",[33,179987,179988],{"class":167},"(img_path: Path, max_inches: ",[33,179990,1720],{"class":50},[33,179992,212],{"class":163},[33,179994,179995],{"class":50}," 4.0",[33,179997,1617],{"class":167},[33,179999,1059],{"class":50},[33,180001,574],{"class":167},[33,180003,180004],{"class":35,"line":101},[33,180005,180006],{"class":54},"    \"\"\"Return an EMU width that never exceeds max_inches, preserving aspect ratio.\"\"\"\n",[33,180008,180009,180011],{"class":35,"line":171},[33,180010,2424],{"class":163},[33,180012,574],{"class":167},[33,180014,180015,180017,180019,180021],{"class":35,"line":179},[33,180016,2191],{"class":163},[33,180018,179138],{"class":167},[33,180020,495],{"class":163},[33,180022,179143],{"class":167},[33,180024,180025,180028,180030],{"class":35,"line":187},[33,180026,180027],{"class":167},"            w, h ",[33,180029,242],{"class":163},[33,180031,179153],{"class":167},[33,180033,180034,180037,180039,180041,180043,180045,180047,180049,180051],{"class":35,"line":201},[33,180035,180036],{"class":167},"            dpi 
",[33,180038,242],{"class":163},[33,180040,179163],{"class":167},[33,180042,179166],{"class":54},[33,180044,19953],{"class":167},[33,180046,38741],{"class":50},[33,180048,365],{"class":167},[33,180050,38741],{"class":50},[33,180052,371],{"class":167},[33,180054,180055,180058,180060,180062,180064,180067,180069],{"class":35,"line":206},[33,180056,180057],{"class":167},"            native_inches ",[33,180059,242],{"class":163},[33,180061,43419],{"class":167},[33,180063,1351],{"class":163},[33,180065,180066],{"class":167}," dpi[",[33,180068,748],{"class":50},[33,180070,9202],{"class":167},[33,180072,180073],{"class":35,"line":224},[33,180074,180075],{"class":39},"            # Clamp to max_inches\n",[33,180077,180078,180081,180083,180085],{"class":35,"line":229},[33,180079,180080],{"class":167},"            target_inches ",[33,180082,242],{"class":163},[33,180084,73775],{"class":50},[33,180086,180087],{"class":167},"(native_inches, max_inches)\n",[33,180089,180090,180092,180094,180097,180099,180101,180103],{"class":35,"line":235},[33,180091,28782],{"class":163},[33,180093,3149],{"class":50},[33,180095,180096],{"class":167},"(target_inches ",[33,180098,1769],{"class":163},[33,180100,179834],{"class":50},[33,180102,10922],{"class":167},[33,180104,180105],{"class":39},"# convert to EMU\n",[33,180107,180108,180110,180112],{"class":35,"line":250},[33,180109,2449],{"class":163},[33,180111,783],{"class":50},[33,180113,574],{"class":167},[33,180115,180116,180118,180120,180123],{"class":35,"line":266},[33,180117,1659],{"class":163},[33,180119,3149],{"class":50},[33,180121,180122],{"class":167},"(Inches(max_inches))  ",[33,180124,180125],{"class":39},"# fallback to max\n",[33,180127,180128],{"class":35,"line":290},[33,180129,92],{"emptyLinePlaceholder":91},[33,180131,180132],{"class":35,"line":295},[33,180133,180134],{"class":39},"# Usage:\n",[33,180136,180137],{"class":35,"line":300},[33,180138,180139],{"class":39},"# doc.add_picture(str(img_path), 
width=safe_image_width(img_path, max_inches=3.0))\n",[18,180141,180143],{"id":180142},"variant-oversized-image-inside-a-table-cell","Variant: Oversized Image Inside a Table Cell",[14,180145,180146],{},"Inside a table cell, the image must fit within the cell width. Cell width is accessible via the XML, but the simplest reliable approach is to compute a target width based on the number of columns and the usable page width:",[23,180148,180150],{"className":126,"code":180149,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\n\nLOGO = Path(\"assets\u002Flogo.png\")\nOUTPUT = Path(\"output\u002Ftable_constrained.docx\")\n\ndoc = Document()\nsection = doc.sections[0]\nusable = section.page_width - section.left_margin - section.right_margin\n\nnum_cols = 3\n# Leave a small gutter; divide evenly across columns\ncell_image_width = int((usable \u002F num_cols) * 0.85)  # 85% of the cell share\n\ntable = doc.add_table(rows=1, cols=num_cols)\nfor i in range(num_cols):\n    cell = table.cell(0, i)\n    para = cell.paragraphs[0]\n    run = para.add_run()\n    try:\n        run.add_picture(str(LOGO), width=cell_image_width)  # constrained to cell share\n    except FileNotFoundError:\n        cell.text = \"[logo missing]\"\n\ntry:\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept Exception as e:\n    print(f\"Save failed: 
{e}\")\n",[30,180151,180152,180156,180166,180176,180186,180190,180202,180215,180219,180227,180239,180259,180263,180272,180277,180304,180308,180331,180344,180357,180371,180381,180387,180410,180418,180428,180432,180438,180450,180466,180476],{"__ignoreMap":28},[33,180153,180154],{"class":35,"line":36},[33,180155,156213],{"class":39},[33,180157,180158,180160,180162,180164],{"class":35,"line":43},[33,180159,190],{"class":163},[33,180161,193],{"class":167},[33,180163,164],{"class":163},[33,180165,198],{"class":167},[33,180167,180168,180170,180172,180174],{"class":35,"line":61},[33,180169,190],{"class":163},[33,180171,18092],{"class":167},[33,180173,164],{"class":163},[33,180175,18097],{"class":167},[33,180177,180178,180180,180182,180184],{"class":35,"line":73},[33,180179,190],{"class":163},[33,180181,18104],{"class":167},[33,180183,164],{"class":163},[33,180185,157048],{"class":167},[33,180187,180188],{"class":35,"line":88},[33,180189,92],{"emptyLinePlaceholder":91},[33,180191,180192,180194,180196,180198,180200],{"class":35,"line":95},[33,180193,91271],{"class":50},[33,180195,212],{"class":163},[33,180197,215],{"class":167},[33,180199,179123],{"class":54},[33,180201,221],{"class":167},[33,180203,180204,180206,180208,180210,180213],{"class":35,"line":101},[33,180205,96935],{"class":50},[33,180207,212],{"class":163},[33,180209,215],{"class":167},[33,180211,180212],{"class":54},"\"output\u002Ftable_constrained.docx\"",[33,180214,221],{"class":167},[33,180216,180217],{"class":35,"line":171},[33,180218,92],{"emptyLinePlaceholder":91},[33,180220,180221,180223,180225],{"class":35,"line":179},[33,180222,156566],{"class":167},[33,180224,242],{"class":163},[33,180226,18229],{"class":167},[33,180228,180229,180231,180233,180235,180237],{"class":35,"line":187},[33,180230,156584],{"class":167},[33,180232,242],{"class":163},[33,180234,156589],{"class":167},[33,180236,748],{"class":50},[33,180238,9202],{"class":167},[33,180240,180241,180244,180246,180249,180251,180254,180256],{"class":35
,"line":201},[33,180242,180243],{"class":167},"usable ",[33,180245,242],{"class":163},[33,180247,180248],{"class":167}," section.page_width ",[33,180250,4126],{"class":163},[33,180252,180253],{"class":167}," section.left_margin ",[33,180255,4126],{"class":163},[33,180257,180258],{"class":167}," section.right_margin\n",[33,180260,180261],{"class":35,"line":206},[33,180262,92],{"emptyLinePlaceholder":91},[33,180264,180265,180268,180270],{"class":35,"line":224},[33,180266,180267],{"class":167},"num_cols ",[33,180269,242],{"class":163},[33,180271,103763],{"class":50},[33,180273,180274],{"class":35,"line":229},[33,180275,180276],{"class":39},"# Leave a small gutter; divide evenly across columns\n",[33,180278,180279,180282,180284,180286,180289,180291,180294,180296,180299,180301],{"class":35,"line":235},[33,180280,180281],{"class":167},"cell_image_width ",[33,180283,242],{"class":163},[33,180285,3149],{"class":50},[33,180287,180288],{"class":167},"((usable ",[33,180290,1351],{"class":163},[33,180292,180293],{"class":167}," num_cols) ",[33,180295,1769],{"class":163},[33,180297,180298],{"class":50}," 0.85",[33,180300,10922],{"class":167},[33,180302,180303],{"class":39},"# 85% of the cell 
share\n",[33,180305,180306],{"class":35,"line":250},[33,180307,92],{"emptyLinePlaceholder":91},[33,180309,180310,180312,180314,180316,180318,180320,180322,180324,180326,180328],{"class":35,"line":266},[33,180311,157220],{"class":167},[33,180313,242],{"class":163},[33,180315,18626],{"class":167},[33,180317,18629],{"class":238},[33,180319,242],{"class":163},[33,180321,734],{"class":50},[33,180323,365],{"class":167},[33,180325,18638],{"class":238},[33,180327,242],{"class":163},[33,180329,180330],{"class":167},"num_cols)\n",[33,180332,180333,180335,180337,180339,180341],{"class":35,"line":290},[33,180334,6124],{"class":163},[33,180336,47269],{"class":167},[33,180338,662],{"class":163},[33,180340,1801],{"class":50},[33,180342,180343],{"class":167},"(num_cols):\n",[33,180345,180346,180348,180350,180352,180354],{"class":35,"line":295},[33,180347,152795],{"class":167},[33,180349,242],{"class":163},[33,180351,158907],{"class":167},[33,180353,748],{"class":50},[33,180355,180356],{"class":167},", i)\n",[33,180358,180359,180362,180364,180367,180369],{"class":35,"line":300},[33,180360,180361],{"class":167},"    para ",[33,180363,242],{"class":163},[33,180365,180366],{"class":167}," cell.paragraphs[",[33,180368,748],{"class":50},[33,180370,9202],{"class":167},[33,180372,180373,180376,180378],{"class":35,"line":317},[33,180374,180375],{"class":167},"    run ",[33,180377,242],{"class":163},[33,180379,180380],{"class":167}," para.add_run()\n",[33,180382,180383,180385],{"class":35,"line":332},[33,180384,2424],{"class":163},[33,180386,574],{"class":167},[33,180388,180389,180392,180394,180396,180398,180400,180402,180404,180407],{"class":35,"line":347},[33,180390,180391],{"class":167},"        run.add_picture(",[33,180393,1053],{"class":50},[33,180395,602],{"class":167},[33,180397,91271],{"class":50},[33,180399,18525],{"class":167},[33,180401,56684],{"class":238},[33,180403,242],{"class":163},[33,180405,180406],{"class":167},"cell_image_width)  ",[33,180408,180409],{"class":39},"# 
constrained to cell share\n",[33,180411,180412,180414,180416],{"class":35,"line":374},[33,180413,2449],{"class":163},[33,180415,2945],{"class":50},[33,180417,574],{"class":167},[33,180419,180420,180423,180425],{"class":35,"line":397},[33,180421,180422],{"class":167},"        cell.text ",[33,180424,242],{"class":163},[33,180426,180427],{"class":54}," \"[logo missing]\"\n",[33,180429,180430],{"class":35,"line":653},[33,180431,92],{"emptyLinePlaceholder":91},[33,180433,180434,180436],{"class":35,"line":667},[33,180435,35574],{"class":163},[33,180437,574],{"class":167},[33,180439,180440,180442,180444,180446,180448],{"class":35,"line":675},[33,180441,85716],{"class":167},[33,180443,1053],{"class":50},[33,180445,602],{"class":167},[33,180447,96935],{"class":50},[33,180449,371],{"class":167},[33,180451,180452,180454,180456,180458,180460,180462,180464],{"class":35,"line":689},[33,180453,7268],{"class":50},[33,180455,602],{"class":167},[33,180457,4059],{"class":163},[33,180459,97737],{"class":54},[33,180461,97684],{"class":50},[33,180463,274],{"class":54},[33,180465,221],{"class":167},[33,180467,180468,180470,180472,180474],{"class":35,"line":703},[33,180469,35726],{"class":163},[33,180471,783],{"class":50},[33,180473,1852],{"class":163},[33,180475,7583],{"class":167},[33,180477,180478,180480,180482,180484,180486,180488,180490,180492,180494],{"class":35,"line":714},[33,180479,7268],{"class":50},[33,180481,602],{"class":167},[33,180483,4059],{"class":163},[33,180485,158012],{"class":54},[33,180487,1115],{"class":50},[33,180489,7602],{"class":167},[33,180491,1121],{"class":50},[33,180493,274],{"class":54},[33,180495,221],{"class":167},[14,180497,180498,180499,180502,180503,3035],{},"The 0.85 factor gives breathing room for cell padding. 
Adjust to taste; the important constraint is that ",[30,180500,180501],{},"cell_image_width"," never exceeds ",[30,180504,180505],{},"usable \u002F num_cols",[18,180507,180509],{"id":180508},"variant-image-with-no-dpi-metadata","Variant: Image with No DPI Metadata",[14,180511,180512,180513,180516],{},"Some images — especially those generated programmatically, captured with screen-recording tools, or exported from vector software — carry no DPI tag at all. ",[30,180514,180515],{},"img.info.get(\"dpi\", ...)"," returns the fallback tuple. python-docx falls back to 72 DPI internally, which is why screen captures almost always overflow.",[14,180518,180519],{},"You have two options:",[14,180521,180522,180525],{},[1974,180523,180524],{},"Option A — assume a sane DPI and scale accordingly."," If you know the image was a screen capture at 96 DPI, pass that assumption explicitly:",[23,180527,180529],{"className":126,"code":180528,"language":47,"meta":28,"style":28},"# pip install python-docx Pillow\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\nfrom PIL import Image\n\nASSUMED_DPI = 96  # override for images with no DPI metadata\nMAX_WIDTH_INCHES = 5.0\n\nimg_path = Path(\"assets\u002Fscreenshot.png\")\nOUTPUT = Path(\"output\u002Fscreenshot_report.docx\")\n\ndoc = Document()\ntry:\n    with Image.open(img_path) as img:\n        w, _ = img.size\n        native_inches = w \u002F ASSUMED_DPI\n        target_width = min(native_inches, MAX_WIDTH_INCHES)\n        target_emu = int(target_width * 914400)\n\n    doc.add_picture(str(img_path), width=target_emu)  # use computed EMU width\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept FileNotFoundError:\n    print(f\"Image not found: {img_path}\")\nexcept Exception as e:\n    print(f\"Error: 
{e}\")\n",[30,180530,180531,180535,180545,180555,180565,180575,180579,180592,180602,180606,180619,180632,180636,180644,180650,180660,180669,180683,180699,180717,180721,180740,180752,180768,180776,180796,180806],{"__ignoreMap":28},[33,180532,180533],{"class":35,"line":36},[33,180534,179944],{"class":39},[33,180536,180537,180539,180541,180543],{"class":35,"line":43},[33,180538,190],{"class":163},[33,180540,193],{"class":167},[33,180542,164],{"class":163},[33,180544,198],{"class":167},[33,180546,180547,180549,180551,180553],{"class":35,"line":61},[33,180548,190],{"class":163},[33,180550,18092],{"class":167},[33,180552,164],{"class":163},[33,180554,18097],{"class":167},[33,180556,180557,180559,180561,180563],{"class":35,"line":73},[33,180558,190],{"class":163},[33,180560,18104],{"class":167},[33,180562,164],{"class":163},[33,180564,157048],{"class":167},[33,180566,180567,180569,180571,180573],{"class":35,"line":88},[33,180568,190],{"class":163},[33,180570,46889],{"class":50},[33,180572,46892],{"class":163},[33,180574,47171],{"class":167},[33,180576,180577],{"class":35,"line":95},[33,180578,92],{"emptyLinePlaceholder":91},[33,180580,180581,180584,180586,180589],{"class":35,"line":101},[33,180582,180583],{"class":50},"ASSUMED_DPI",[33,180585,212],{"class":163},[33,180587,180588],{"class":50}," 96",[33,180590,180591],{"class":39},"  # override for images with no DPI metadata\n",[33,180593,180594,180597,180599],{"class":35,"line":171},[33,180595,180596],{"class":50},"MAX_WIDTH_INCHES",[33,180598,212],{"class":163},[33,180600,180601],{"class":50}," 
5.0\n",[33,180603,180604],{"class":35,"line":179},[33,180605,92],{"emptyLinePlaceholder":91},[33,180607,180608,180610,180612,180614,180617],{"class":35,"line":187},[33,180609,179116],{"class":167},[33,180611,242],{"class":163},[33,180613,215],{"class":167},[33,180615,180616],{"class":54},"\"assets\u002Fscreenshot.png\"",[33,180618,221],{"class":167},[33,180620,180621,180623,180625,180627,180630],{"class":35,"line":201},[33,180622,96935],{"class":50},[33,180624,212],{"class":163},[33,180626,215],{"class":167},[33,180628,180629],{"class":54},"\"output\u002Fscreenshot_report.docx\"",[33,180631,221],{"class":167},[33,180633,180634],{"class":35,"line":206},[33,180635,92],{"emptyLinePlaceholder":91},[33,180637,180638,180640,180642],{"class":35,"line":224},[33,180639,156566],{"class":167},[33,180641,242],{"class":163},[33,180643,18229],{"class":167},[33,180645,180646,180648],{"class":35,"line":229},[33,180647,35574],{"class":163},[33,180649,574],{"class":167},[33,180651,180652,180654,180656,180658],{"class":35,"line":235},[33,180653,1635],{"class":163},[33,180655,179138],{"class":167},[33,180657,495],{"class":163},[33,180659,179143],{"class":167},[33,180661,180662,180665,180667],{"class":35,"line":250},[33,180663,180664],{"class":167},"        w, _ ",[33,180666,242],{"class":163},[33,180668,179153],{"class":167},[33,180670,180671,180674,180676,180678,180680],{"class":35,"line":266},[33,180672,180673],{"class":167},"        native_inches ",[33,180675,242],{"class":163},[33,180677,43419],{"class":167},[33,180679,1351],{"class":163},[33,180681,180682],{"class":50}," ASSUMED_DPI\n",[33,180684,180685,180688,180690,180692,180695,180697],{"class":35,"line":290},[33,180686,180687],{"class":167},"        target_width ",[33,180689,242],{"class":163},[33,180691,73775],{"class":50},[33,180693,180694],{"class":167},"(native_inches, 
",[33,180696,180596],{"class":50},[33,180698,221],{"class":167},[33,180700,180701,180704,180706,180708,180711,180713,180715],{"class":35,"line":295},[33,180702,180703],{"class":167},"        target_emu ",[33,180705,242],{"class":163},[33,180707,3149],{"class":50},[33,180709,180710],{"class":167},"(target_width ",[33,180712,1769],{"class":163},[33,180714,179834],{"class":50},[33,180716,221],{"class":167},[33,180718,180719],{"class":35,"line":300},[33,180720,92],{"emptyLinePlaceholder":91},[33,180722,180723,180725,180727,180730,180732,180734,180737],{"class":35,"line":317},[33,180724,179492],{"class":167},[33,180726,1053],{"class":50},[33,180728,180729],{"class":167},"(img_path), ",[33,180731,56684],{"class":238},[33,180733,242],{"class":163},[33,180735,180736],{"class":167},"target_emu)  ",[33,180738,180739],{"class":39},"# use computed EMU width\n",[33,180741,180742,180744,180746,180748,180750],{"class":35,"line":332},[33,180743,85716],{"class":167},[33,180745,1053],{"class":50},[33,180747,602],{"class":167},[33,180749,96935],{"class":50},[33,180751,371],{"class":167},[33,180753,180754,180756,180758,180760,180762,180764,180766],{"class":35,"line":347},[33,180755,7268],{"class":50},[33,180757,602],{"class":167},[33,180759,4059],{"class":163},[33,180761,97737],{"class":54},[33,180763,97684],{"class":50},[33,180765,274],{"class":54},[33,180767,221],{"class":167},[33,180769,180770,180772,180774],{"class":35,"line":374},[33,180771,35726],{"class":163},[33,180773,2945],{"class":50},[33,180775,574],{"class":167},[33,180777,180778,180780,180782,180784,180786,180788,180790,180792,180794],{"class":35,"line":397},[33,180779,7268],{"class":50},[33,180781,602],{"class":167},[33,180783,4059],{"class":163},[33,180785,179562],{"class":54},[33,180787,1115],{"class":50},[33,180789,179318],{"class":167},[33,180791,1121],{"class":50},[33,180793,274],{"class":54},[33,180795,221],{"class":167},[33,180797,180798,180800,180802,180804],{"class":35,"line":653},[33,180799,35726],{"class":163}
,[33,180801,783],{"class":50},[33,180803,1852],{"class":163},[33,180805,7583],{"class":167},[33,180807,180808,180810,180812,180814,180816,180818,180820,180822,180824],{"class":35,"line":667},[33,180809,7268],{"class":50},[33,180811,602],{"class":167},[33,180813,4059],{"class":163},[33,180815,39108],{"class":54},[33,180817,1115],{"class":50},[33,180819,7602],{"class":167},[33,180821,1121],{"class":50},[33,180823,274],{"class":54},[33,180825,221],{"class":167},[14,180827,180828,180831],{},[1974,180829,180830],{},"Option B — ignore pixel count entirely and always pass a fixed display size."," This is the safest approach for batch pipelines where the image source is varied and you want a consistent thumbnail size regardless of file characteristics:",[23,180833,180835],{"className":126,"code":180834,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\n\n# Fixed display width regardless of source DPI or pixel dimensions\nTHUMBNAIL_WIDTH = Inches(2.5)\n\nimg_path = Path(\"assets\u002Fscreenshot.png\")\nOUTPUT = Path(\"output\u002Fthumbnail_report.docx\")\n\ndoc = Document()\ntry:\n    doc.add_picture(str(img_path), width=THUMBNAIL_WIDTH)  # fixed size, DPI irrelevant\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept FileNotFoundError:\n    print(f\"Image not found: {img_path}\")\nexcept Exception as e:\n    print(f\"Error: 
{e}\")\n",[30,180836,180837,180841,180851,180861,180871,180875,180880,180893,180897,180909,180922,180926,180934,180940,180959,180971,180987,180995,181015,181025],{"__ignoreMap":28},[33,180838,180839],{"class":35,"line":36},[33,180840,156213],{"class":39},[33,180842,180843,180845,180847,180849],{"class":35,"line":43},[33,180844,190],{"class":163},[33,180846,193],{"class":167},[33,180848,164],{"class":163},[33,180850,198],{"class":167},[33,180852,180853,180855,180857,180859],{"class":35,"line":61},[33,180854,190],{"class":163},[33,180856,18092],{"class":167},[33,180858,164],{"class":163},[33,180860,18097],{"class":167},[33,180862,180863,180865,180867,180869],{"class":35,"line":73},[33,180864,190],{"class":163},[33,180866,18104],{"class":167},[33,180868,164],{"class":163},[33,180870,157048],{"class":167},[33,180872,180873],{"class":35,"line":88},[33,180874,92],{"emptyLinePlaceholder":91},[33,180876,180877],{"class":35,"line":95},[33,180878,180879],{"class":39},"# Fixed display width regardless of source DPI or pixel 
dimensions\n",[33,180881,180882,180885,180887,180889,180891],{"class":35,"line":101},[33,180883,180884],{"class":50},"THUMBNAIL_WIDTH",[33,180886,212],{"class":163},[33,180888,156603],{"class":167},[33,180890,19760],{"class":50},[33,180892,221],{"class":167},[33,180894,180895],{"class":35,"line":171},[33,180896,92],{"emptyLinePlaceholder":91},[33,180898,180899,180901,180903,180905,180907],{"class":35,"line":179},[33,180900,179116],{"class":167},[33,180902,242],{"class":163},[33,180904,215],{"class":167},[33,180906,180616],{"class":54},[33,180908,221],{"class":167},[33,180910,180911,180913,180915,180917,180920],{"class":35,"line":187},[33,180912,96935],{"class":50},[33,180914,212],{"class":163},[33,180916,215],{"class":167},[33,180918,180919],{"class":54},"\"output\u002Fthumbnail_report.docx\"",[33,180921,221],{"class":167},[33,180923,180924],{"class":35,"line":201},[33,180925,92],{"emptyLinePlaceholder":91},[33,180927,180928,180930,180932],{"class":35,"line":206},[33,180929,156566],{"class":167},[33,180931,242],{"class":163},[33,180933,18229],{"class":167},[33,180935,180936,180938],{"class":35,"line":224},[33,180937,35574],{"class":163},[33,180939,574],{"class":167},[33,180941,180942,180944,180946,180948,180950,180952,180954,180956],{"class":35,"line":229},[33,180943,179492],{"class":167},[33,180945,1053],{"class":50},[33,180947,180729],{"class":167},[33,180949,56684],{"class":238},[33,180951,242],{"class":163},[33,180953,180884],{"class":50},[33,180955,10922],{"class":167},[33,180957,180958],{"class":39},"# fixed size, DPI 
irrelevant\n",[33,180960,180961,180963,180965,180967,180969],{"class":35,"line":235},[33,180962,85716],{"class":167},[33,180964,1053],{"class":50},[33,180966,602],{"class":167},[33,180968,96935],{"class":50},[33,180970,371],{"class":167},[33,180972,180973,180975,180977,180979,180981,180983,180985],{"class":35,"line":250},[33,180974,7268],{"class":50},[33,180976,602],{"class":167},[33,180978,4059],{"class":163},[33,180980,97737],{"class":54},[33,180982,97684],{"class":50},[33,180984,274],{"class":54},[33,180986,221],{"class":167},[33,180988,180989,180991,180993],{"class":35,"line":266},[33,180990,35726],{"class":163},[33,180992,2945],{"class":50},[33,180994,574],{"class":167},[33,180996,180997,180999,181001,181003,181005,181007,181009,181011,181013],{"class":35,"line":290},[33,180998,7268],{"class":50},[33,181000,602],{"class":167},[33,181002,4059],{"class":163},[33,181004,179562],{"class":54},[33,181006,1115],{"class":50},[33,181008,179318],{"class":167},[33,181010,1121],{"class":50},[33,181012,274],{"class":54},[33,181014,221],{"class":167},[33,181016,181017,181019,181021,181023],{"class":35,"line":295},[33,181018,35726],{"class":163},[33,181020,783],{"class":50},[33,181022,1852],{"class":163},[33,181024,7583],{"class":167},[33,181026,181027,181029,181031,181033,181035,181037,181039,181041,181043],{"class":35,"line":300},[33,181028,7268],{"class":50},[33,181030,602],{"class":167},[33,181032,4059],{"class":163},[33,181034,39108],{"class":54},[33,181036,1115],{"class":50},[33,181038,7602],{"class":167},[33,181040,1121],{"class":50},[33,181042,274],{"class":54},[33,181044,221],{"class":167},[14,181046,181047],{},"Option B is idiomatic for logo slots, watermarks, and icon-sized images where the display size is a design constraint rather than a function of the source file.",[18,181049,181051],{"id":181050},"understanding-the-emu-scale","Understanding the EMU Scale",[14,181053,181054,181055,181057],{},"The diagram below shows the relationship between pixel count, DPI, 
and the resulting physical size in inches, and how passing an explicit ",[30,181056,56684],{}," short-circuits the DPI dependency entirely.",[2540,181059,2547,181061,2547,181064,2547,181067,2547,2547,181081,2547,181083,2547,181086,2547,181089,2547,2547,181092,2547,2547,181095,2547,181097,2547,181100,2547,2547,181103,2547,181106,2547,181108,2547,181111,2547,181114,2547,2547,181117,2547,2547,181119,2547,181122,2547,181125,2547,2547,181128,2547,181130,2547,181132,2547,181135],{"viewBox":58288,"role":2543,"ariaLabel":181060,"xmlns":2545,"style":2546},"Diagram showing how pixel count divided by DPI gives native inches, and how an explicit width argument bypasses DPI",[2549,181062,181063],{},"EMU size calculation path",[2553,181065,181066],{},"Two paths: without explicit width, pixel count is divided by DPI to give native inches, which may overflow the page. With explicit width, DPI is ignored and the image is placed at the specified size.",[2557,181068,2559,181069,2559,181076,2547],{},[2561,181070,2564,181072,2564,181074,2559],{"id":181071,"x1":748,"y1":748,"x2":734,"y2":748},"fix-img-grad",[2566,181073],{"offset":748,"style":2568},[2566,181075],{"offset":734,"style":2571},[2573,181077,2564,181079,2559],{"id":181078,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"fix-img-arrow",[2580,181080],{"d":2582,"fill":2583},[2585,181082],{"x":2587,"y":2650,"width":2609,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,181084,181085],{"x":2630,"y":135067,"fill":2599,"style":38718},"Image file",[2000,181087,181088],{"x":2630,"y":11218,"fill":2583,"style":2605},"pixels + DPI tag",[2000,181090,181091],{"x":2611,"y":16991,"fill":2583,"style":2605},"\nno width 
arg\n",[35,181093],{"x1":2610,"y1":2629,"x2":2678,"y2":12900,"stroke":2583,"markerEnd":181094,"style":2594},"url(#fix-img-arrow)",[2585,181096],{"x":2678,"y":58333,"width":11115,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,181098,181099],{"x":58345,"y":89051,"fill":2599,"style":2600},"px ÷ DPI × 914400",[2000,181101,181102],{"x":58345,"y":2650,"fill":2583,"style":2605},"native EMU size",[35,181104],{"x1":135075,"y1":12900,"x2":181105,"y2":12900,"stroke":2583,"markerEnd":181094,"style":2594},"515",[2585,181107],{"x":181105,"y":58333,"width":2598,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,181109,181110],{"x":26418,"y":89051,"fill":2599,"style":2600},"May overflow",[2000,181112,181113],{"x":26418,"y":2650,"fill":2583,"style":2605},"27 in at 72 DPI",[2000,181115,181116],{"x":2611,"y":125455,"fill":2583,"style":2605},"\nwidth=Inches(n)\n",[35,181118],{"x1":2610,"y1":2648,"x2":2678,"y2":2664,"stroke":11166,"markerEnd":181094,"style":2594},[2585,181120],{"x":2678,"y":2598,"width":11115,"height":2590,"rx":2591,"fill":181121,"stroke":2593,"style":2594},"url(#fix-img-grad)",[2000,181123,181124],{"x":58345,"y":11202,"fill":2599,"style":2600},"explicit EMU",[2000,181126,181127],{"x":58345,"y":2611,"fill":2599,"style":2605},"DPI ignored",[35,181129],{"x1":135075,"y1":125455,"x2":181105,"y2":125455,"stroke":11166,"markerEnd":181094,"style":2594},[2585,181131],{"x":181105,"y":2598,"width":2598,"height":2590,"rx":2591,"fill":2592,"stroke":11166,"style":2594},[2000,181133,181134],{"x":26418,"y":11202,"fill":2599,"style":38718},"Correct size",[2000,181136,181137],{"x":26418,"y":2611,"fill":2583,"style":2605},"fits the page",[18,181139,42592],{"id":42591},[4273,181141,181142,181152],{},[4276,181143,181144],{},[4279,181145,181146,181148,181150],{},[4282,181147,4284],{},[4282,181149,101762],{},[4282,181151,4290],{},[4292,181153,181154,181173,181190,181203,181220],{},[4279,181155,181156,181159,181164],{},[4297,181157,181158],{},"Image 
pushes content off page",[4297,181160,124273,181161,181163],{},[30,181162,56684],{}," argument; native size > page width",[4297,181165,138773,181166,181169,181170],{},[30,181167,181168],{},"width=Inches(n)"," or computed ",[30,181171,181172],{},"usable_width",[4279,181174,181175,181178,181181],{},[4297,181176,181177],{},"Image is a tiny thumbnail",[4297,181179,181180],{},"High-DPI source (300 DPI); native size too small",[4297,181182,181183,181184,14391,181186,181189],{},"Pass larger explicit ",[30,181185,56684],{},[30,181187,181188],{},"safe_image_width()"," helper",[4279,181191,181192,181195,181198],{},[4297,181193,181194],{},"Image correct in body, huge in header",[4297,181196,181197],{},"Header paragraph uses a different context; no width passed to header run",[4297,181199,74566,181200],{},[30,181201,181202],{},"header_run.add_picture(str(img), width=Inches(n))",[4279,181204,181205,181208,181211],{},[4297,181206,181207],{},"Word opens file with \"repaired content\" warning",[4297,181209,181210],{},"Image XML is malformed; often from a BytesIO with wrong seek position",[4297,181212,74566,181213,181216,181217,42709],{},[30,181214,181215],{},"buf.seek(0)"," before each ",[30,181218,181219],{},"add_picture(buf, ...)",[4279,181221,181222,181225,181231],{},[4297,181223,181224],{},"Image in table cell overflows the cell border",[4297,181226,181227,181228,181230],{},"Cell width not accounted for; passed page-wide ",[30,181229,181172],{}," to cell",[4297,181232,181233,181234,181236],{},"Divide ",[30,181235,181172],{}," by column count and apply an 85% gutter factor",[18,181238,9247],{"id":9246},[14,181240,181241],{},"Confirm the inserted image is within page bounds by reading back the saved document:",[23,181243,181245],{"className":126,"code":181244,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\n\nOUTPUT = Path(\"output\u002Ffixed_image.docx\")\nMAX_INCHES = 8.5  # letter page width; adjust for A4 
(8.27 in)\n\ntry:\n    doc = Document(str(OUTPUT))\n    section = doc.sections[0]\n    page_inches = section.page_width \u002F 914400\n\n    for i, shape in enumerate(doc.inline_shapes):\n        w_in = shape.width \u002F 914400\n        h_in = shape.height \u002F 914400\n        status = \"OK\" if w_in \u003C= page_inches else \"OVERFLOW\"\n        print(f\"Shape {i}: {w_in:.2f} x {h_in:.2f} in — {status}\")\n\n    print(f\"Page width: {page_inches:.2f} in\")\nexcept FileNotFoundError:\n    print(f\"File not found: {OUTPUT}\")\nexcept Exception as e:\n    print(f\"Verification error: {e}\")\n",[30,181246,181247,181251,181261,181271,181275,181287,181300,181304,181310,181326,181338,181352,181356,181370,181384,181398,181422,181474,181478,181503,181511,181527,181537],{"__ignoreMap":28},[33,181248,181249],{"class":35,"line":36},[33,181250,156213],{"class":39},[33,181252,181253,181255,181257,181259],{"class":35,"line":43},[33,181254,190],{"class":163},[33,181256,193],{"class":167},[33,181258,164],{"class":163},[33,181260,198],{"class":167},[33,181262,181263,181265,181267,181269],{"class":35,"line":61},[33,181264,190],{"class":163},[33,181266,18092],{"class":167},[33,181268,164],{"class":163},[33,181270,18097],{"class":167},[33,181272,181273],{"class":35,"line":73},[33,181274,92],{"emptyLinePlaceholder":91},[33,181276,181277,181279,181281,181283,181285],{"class":35,"line":88},[33,181278,96935],{"class":50},[33,181280,212],{"class":163},[33,181282,215],{"class":167},[33,181284,179441],{"class":54},[33,181286,221],{"class":167},[33,181288,181289,181292,181294,181297],{"class":35,"line":95},[33,181290,181291],{"class":50},"MAX_INCHES",[33,181293,212],{"class":163},[33,181295,181296],{"class":50}," 8.5",[33,181298,181299],{"class":39},"  # letter page width; adjust for A4 (8.27 
in)\n",[33,181301,181302],{"class":35,"line":101},[33,181303,92],{"emptyLinePlaceholder":91},[33,181305,181306,181308],{"class":35,"line":171},[33,181307,35574],{"class":163},[33,181309,574],{"class":167},[33,181311,181312,181314,181316,181318,181320,181322,181324],{"class":35,"line":179},[33,181313,18224],{"class":167},[33,181315,242],{"class":163},[33,181317,156340],{"class":167},[33,181319,1053],{"class":50},[33,181321,602],{"class":167},[33,181323,96935],{"class":50},[33,181325,371],{"class":167},[33,181327,181328,181330,181332,181334,181336],{"class":35,"line":187},[33,181329,160360],{"class":167},[33,181331,242],{"class":163},[33,181333,156589],{"class":167},[33,181335,748],{"class":50},[33,181337,9202],{"class":167},[33,181339,181340,181343,181345,181347,181349],{"class":35,"line":201},[33,181341,181342],{"class":167},"    page_inches ",[33,181344,242],{"class":163},[33,181346,180248],{"class":167},[33,181348,1351],{"class":163},[33,181350,181351],{"class":50}," 914400\n",[33,181353,181354],{"class":35,"line":206},[33,181355,92],{"emptyLinePlaceholder":91},[33,181357,181358,181360,181363,181365,181367],{"class":35,"line":224},[33,181359,656],{"class":163},[33,181361,181362],{"class":167}," i, shape ",[33,181364,662],{"class":163},[33,181366,7403],{"class":50},[33,181368,181369],{"class":167},"(doc.inline_shapes):\n",[33,181371,181372,181375,181377,181380,181382],{"class":35,"line":229},[33,181373,181374],{"class":167},"        w_in ",[33,181376,242],{"class":163},[33,181378,181379],{"class":167}," shape.width ",[33,181381,1351],{"class":163},[33,181383,181351],{"class":50},[33,181385,181386,181389,181391,181394,181396],{"class":35,"line":235},[33,181387,181388],{"class":167},"        h_in ",[33,181390,242],{"class":163},[33,181392,181393],{"class":167}," shape.height 
",[33,181395,1351],{"class":163},[33,181397,181351],{"class":50},[33,181399,181400,181403,181405,181407,181409,181412,181414,181417,181419],{"class":35,"line":250},[33,181401,181402],{"class":167},"        status ",[33,181404,242],{"class":163},[33,181406,79178],{"class":54},[33,181408,9994],{"class":163},[33,181410,181411],{"class":167}," w_in ",[33,181413,44223],{"class":163},[33,181415,181416],{"class":167}," page_inches ",[33,181418,7489],{"class":163},[33,181420,181421],{"class":54}," \"OVERFLOW\"\n",[33,181423,181424,181426,181428,181430,181433,181435,181437,181439,181441,181443,181446,181448,181450,181452,181454,181457,181459,181461,181464,181466,181468,181470,181472],{"class":35,"line":266},[33,181425,9414],{"class":50},[33,181427,602],{"class":167},[33,181429,4059],{"class":163},[33,181431,181432],{"class":54},"\"Shape ",[33,181434,1115],{"class":50},[33,181436,7499],{"class":167},[33,181438,1121],{"class":50},[33,181440,2079],{"class":54},[33,181442,1115],{"class":50},[33,181444,181445],{"class":167},"w_in",[33,181447,55819],{"class":163},[33,181449,1121],{"class":50},[33,181451,179197],{"class":54},[33,181453,1115],{"class":50},[33,181455,181456],{"class":167},"h_in",[33,181458,55819],{"class":163},[33,181460,1121],{"class":50},[33,181462,181463],{"class":54}," in — ",[33,181465,1115],{"class":50},[33,181467,68351],{"class":167},[33,181469,1121],{"class":50},[33,181471,274],{"class":54},[33,181473,221],{"class":167},[33,181475,181476],{"class":35,"line":290},[33,181477,92],{"emptyLinePlaceholder":91},[33,181479,181480,181482,181484,181486,181489,181491,181494,181496,181498,181501],{"class":35,"line":295},[33,181481,7268],{"class":50},[33,181483,602],{"class":167},[33,181485,4059],{"class":163},[33,181487,181488],{"class":54},"\"Page width: ",[33,181490,1115],{"class":50},[33,181492,181493],{"class":167},"page_inches",[33,181495,55819],{"class":163},[33,181497,1121],{"class":50},[33,181499,181500],{"class":54}," 
in\"",[33,181502,221],{"class":167},[33,181504,181505,181507,181509],{"class":35,"line":300},[33,181506,35726],{"class":163},[33,181508,2945],{"class":50},[33,181510,574],{"class":167},[33,181512,181513,181515,181517,181519,181521,181523,181525],{"class":35,"line":317},[33,181514,7268],{"class":50},[33,181516,602],{"class":167},[33,181518,4059],{"class":163},[33,181520,15677],{"class":54},[33,181522,97684],{"class":50},[33,181524,274],{"class":54},[33,181526,221],{"class":167},[33,181528,181529,181531,181533,181535],{"class":35,"line":332},[33,181530,35726],{"class":163},[33,181532,783],{"class":50},[33,181534,1852],{"class":163},[33,181536,7583],{"class":167},[33,181538,181539,181541,181543,181545,181548,181550,181552,181554,181556],{"class":35,"line":347},[33,181540,7268],{"class":50},[33,181542,602],{"class":167},[33,181544,4059],{"class":163},[33,181546,181547],{"class":54},"\"Verification error: ",[33,181549,1115],{"class":50},[33,181551,7602],{"class":167},[33,181553,1121],{"class":50},[33,181555,274],{"class":54},[33,181557,221],{"class":167},[14,181559,181560,181561,181564,181565,181568,181569,181571,181572,181574],{},"All shapes should print ",[30,181562,181563],{},"OK",". 
Any shape where width exceeds the page width will show ",[30,181566,181567],{},"OVERFLOW"," — trace it back to the ",[30,181570,179373],{}," call and add an explicit ",[30,181573,56684],{}," argument.",[18,181576,6918],{"id":6917},[4211,181578,181579,181584,181589],{},[4214,181580,181581,181583],{},[940,181582,156178],{"href":156177}," — full guide covering headers, table cells, BytesIO, and batch insertion",[4214,181585,181586,181588],{},[940,181587,156152],{"href":26562}," — document generation pipeline context",[4214,181590,181591,181593],{},[940,181592,26185],{"href":18040}," — per-recipient image insertion via docxtpl",[14,181595,6947,181596,3035],{},[940,181597,156178],{"href":156177},[6953,181599,181600],{},"html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .s7eDp, html code.shiki 
.s7eDp{--shiki-default:#6F42C1}",{"title":28,"searchDepth":43,"depth":43,"links":181602},[181603,181604,181605,181606,181607,181608,181609,181610,181611,181612,181613],{"id":7020,"depth":43,"text":7021},{"id":54445,"depth":43,"text":99786},{"id":179361,"depth":43,"text":179362},{"id":179612,"depth":43,"text":179613},{"id":179921,"depth":43,"text":179922},{"id":180142,"depth":43,"text":180143},{"id":180508,"depth":43,"text":180509},{"id":181050,"depth":43,"text":181051},{"id":42591,"depth":43,"text":42592},{"id":9246,"depth":43,"text":9247},{"id":6917,"depth":43,"text":6918},"Fix Image Too Large","When add_picture inserts an oversized image in python-docx, the cause is missing width\u002Fheight args. Fix by passing Inches() or computing usable page width.",{},"\u002Fword-document-templating-batch-processing\u002Finserting-images-into-word-documents\u002Ffix-image-too-large-in-python-docx",{"title":179035,"description":181615},"Fix Image Too Large in python-docx (add_picture)","word-document-templating-batch-processing\u002Finserting-images-into-word-documents\u002Ffix-image-too-large-in-python-docx\u002Findex",[47,18041,170115,181622],"images","-7Rsf802yYO5mVXb3qv5Y5fmRz1tCRTSGrca7nRLpxo",{"id":181625,"title":156178,"body":181626,"breadcrumbTitle":186213,"canonical":6977,"date":6977,"description":107412,"draft":6980,"extension":6981,"image":6977,"meta":186214,"navigation":91,"path":186215,"robots":6977,"seo":186216,"seoTitle":186221,"stem":186222,"tags":6977,"updatedAt":6977,"__hash__":186223},"content\u002Fword-document-templating-batch-processing\u002Finserting-images-into-word-documents\u002Findex.md",{"type":7,"value":181627,"toc":186191},[181628,181631,181634,181643,181645,181661,181670,181764,181768,181771,182027,182042,182046,182057,182277,182289,182377,182381,182384,182677,182695,182699,182702,183151,183162,183166,183173,183551,183562,183566,183573,183579,183583,183593,183856,183867,183871,183881,184188,184196,184200,184203,184674,184676,184680,184689,184693,
184702,184706,184709,184711,184714,184990,185000,185002,185028,185030,185141,185143,186165,186167,186184,186188],[10,181629,156178],{"id":181630},"inserting-images-into-word-documents",[14,181632,181633],{},"Adding an image to a Word document sounds trivial until you hit the defaults: python-docx renders images at their native pixel size, which maps to a random physical size depending on the file's DPI metadata. A 300 DPI PNG intended for print comes out tiny; a 72 DPI screenshot fills three pages. This guide covers every insertion pattern you actually encounter — sizing, aspect ratio, placement in tables and headers\u002Ffooters, captions, BytesIO sources, and batch folder insertion — so you produce predictable output every time.",[14,181635,181636,181637,181639,181640,181642],{},"The techniques here build on ",[940,181638,156152],{"href":26562}," and are frequently combined with ",[940,181641,26185],{"href":18040}," when logos or signature images vary per recipient.",[18,181644,21],{"id":20},[23,181646,181648],{"className":25,"code":181647,"language":27,"meta":28,"style":28},"pip install python-docx Pillow\n",[30,181649,181650],{"__ignoreMap":28},[33,181651,181652,181654,181656,181658],{"class":35,"line":36},[33,181653,76],{"class":46},[33,181655,79],{"class":54},[33,181657,16192],{"class":54},[33,181659,181660],{"class":54}," Pillow\n",[14,181662,181663,181664,181666,181667,3035],{},"python-docx handles ",[30,181665,18051],{}," manipulation; Pillow is needed only for the aspect-ratio calculation helper and for loading images from bytes. 
A test image is assumed at ",[30,181668,181669],{},"assets\u002Flogo.png",[23,181671,181673],{"className":126,"code":181672,"language":47,"meta":28,"style":28},"# pip install python-docx Pillow\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches, Cm, Pt\nfrom docx.enum.text import WD_ALIGN_PARAGRAPH\n\nASSETS = Path(\"assets\")\nOUTPUT = Path(\"output\")\nOUTPUT.mkdir(exist_ok=True)\n",[30,181674,181675,181679,181689,181699,181710,181720,181724,181738,181750],{"__ignoreMap":28},[33,181676,181677],{"class":35,"line":36},[33,181678,179944],{"class":39},[33,181680,181681,181683,181685,181687],{"class":35,"line":43},[33,181682,190],{"class":163},[33,181684,193],{"class":167},[33,181686,164],{"class":163},[33,181688,198],{"class":167},[33,181690,181691,181693,181695,181697],{"class":35,"line":61},[33,181692,190],{"class":163},[33,181694,18092],{"class":167},[33,181696,164],{"class":163},[33,181698,18097],{"class":167},[33,181700,181701,181703,181705,181707],{"class":35,"line":73},[33,181702,190],{"class":163},[33,181704,18104],{"class":167},[33,181706,164],{"class":163},[33,181708,181709],{"class":167}," Inches, Cm, 
Pt\n",[33,181711,181712,181714,181716,181718],{"class":35,"line":88},[33,181713,190],{"class":163},[33,181715,18116],{"class":167},[33,181717,164],{"class":163},[33,181719,18121],{"class":50},[33,181721,181722],{"class":35,"line":95},[33,181723,92],{"emptyLinePlaceholder":91},[33,181725,181726,181729,181731,181733,181736],{"class":35,"line":101},[33,181727,181728],{"class":50},"ASSETS",[33,181730,212],{"class":163},[33,181732,215],{"class":167},[33,181734,181735],{"class":54},"\"assets\"",[33,181737,221],{"class":167},[33,181739,181740,181742,181744,181746,181748],{"class":35,"line":171},[33,181741,96935],{"class":50},[33,181743,212],{"class":163},[33,181745,215],{"class":167},[33,181747,41169],{"class":54},[33,181749,221],{"class":167},[33,181751,181752,181754,181756,181758,181760,181762],{"class":35,"line":179},[33,181753,96935],{"class":50},[33,181755,1078],{"class":167},[33,181757,878],{"class":238},[33,181759,242],{"class":163},[33,181761,855],{"class":50},[33,181763,221],{"class":167},[18,181765,181767],{"id":181766},"_1-inspect-the-image-before-inserting","1. Inspect the Image Before Inserting",[14,181769,181770],{},"Before writing any insertion code, confirm the image dimensions and DPI. 
This tells you whether you need to supply an explicit size (you almost always do).",[23,181772,181774],{"className":126,"code":181773,"language":47,"meta":28,"style":28},"# pip install Pillow\nfrom PIL import Image\nfrom pathlib import Path\n\nimg_path = Path(\"assets\u002Flogo.png\")\ntry:\n    with Image.open(img_path) as img:\n        width_px, height_px = img.size\n        dpi = img.info.get(\"dpi\", (72, 72))\n        width_in = width_px \u002F dpi[0]\n        height_in = height_px \u002F dpi[1]\n        print(f\"Size: {width_px}x{height_px}px @ {dpi[0]} DPI\")\n        print(f\"Native physical size: {width_in:.2f} x {height_in:.2f} inches\")\nexcept FileNotFoundError:\n    print(\"Image not found — check the path\")\nexcept Exception as e:\n    print(f\"Could not open image: {e}\")\n",[30,181775,181776,181780,181790,181800,181804,181816,181822,181832,181841,181861,181879,181897,181942,181977,181985,181996,182006],{"__ignoreMap":28},[33,181777,181778],{"class":35,"line":36},[33,181779,179087],{"class":39},[33,181781,181782,181784,181786,181788],{"class":35,"line":43},[33,181783,190],{"class":163},[33,181785,46889],{"class":50},[33,181787,46892],{"class":163},[33,181789,47171],{"class":167},[33,181791,181792,181794,181796,181798],{"class":35,"line":61},[33,181793,190],{"class":163},[33,181795,193],{"class":167},[33,181797,164],{"class":163},[33,181799,198],{"class":167},[33,181801,181802],{"class":35,"line":73},[33,181803,92],{"emptyLinePlaceholder":91},[33,181805,181806,181808,181810,181812,181814],{"class":35,"line":88},[33,181807,179116],{"class":167},[33,181809,242],{"class":163},[33,181811,215],{"class":167},[33,181813,179123],{"class":54},[33,181815,221],{"class":167},[33,181817,181818,181820],{"class":35,"line":95},[33,181819,35574],{"class":163},[33,181821,574],{"class":167},[33,181823,181824,181826,181828,181830],{"class":35,"line":101},[33,181825,1635],{"class":163},[33,181827,179138],{"class":167},[33,181829,495],{"class":163},[33,181831,179143],{"cla
ss":167},[33,181833,181834,181837,181839],{"class":35,"line":171},[33,181835,181836],{"class":167},"        width_px, height_px ",[33,181838,242],{"class":163},[33,181840,179153],{"class":167},[33,181842,181843,181845,181847,181849,181851,181853,181855,181857,181859],{"class":35,"line":179},[33,181844,179158],{"class":167},[33,181846,242],{"class":163},[33,181848,179163],{"class":167},[33,181850,179166],{"class":54},[33,181852,19953],{"class":167},[33,181854,49823],{"class":50},[33,181856,365],{"class":167},[33,181858,49823],{"class":50},[33,181860,371],{"class":167},[33,181862,181863,181866,181868,181871,181873,181875,181877],{"class":35,"line":187},[33,181864,181865],{"class":167},"        width_in ",[33,181867,242],{"class":163},[33,181869,181870],{"class":167}," width_px ",[33,181872,1351],{"class":163},[33,181874,180066],{"class":167},[33,181876,748],{"class":50},[33,181878,9202],{"class":167},[33,181880,181881,181884,181886,181889,181891,181893,181895],{"class":35,"line":201},[33,181882,181883],{"class":167},"        height_in ",[33,181885,242],{"class":163},[33,181887,181888],{"class":167}," height_px ",[33,181890,1351],{"class":163},[33,181892,180066],{"class":167},[33,181894,734],{"class":50},[33,181896,9202],{"class":167},[33,181898,181899,181901,181903,181905,181908,181910,181913,181915,181918,181920,181923,181925,181928,181930,181932,181934,181936,181938,181940],{"class":35,"line":206},[33,181900,9414],{"class":50},[33,181902,602],{"class":167},[33,181904,4059],{"class":163},[33,181906,181907],{"class":54},"\"Size: ",[33,181909,1115],{"class":50},[33,181911,181912],{"class":167},"width_px",[33,181914,1121],{"class":50},[33,181916,181917],{"class":54},"x",[33,181919,1115],{"class":50},[33,181921,181922],{"class":167},"height_px",[33,181924,1121],{"class":50},[33,181926,181927],{"class":54},"px @ 
",[33,181929,1115],{"class":50},[33,181931,179223],{"class":167},[33,181933,748],{"class":50},[33,181935,9546],{"class":167},[33,181937,1121],{"class":50},[33,181939,47052],{"class":54},[33,181941,221],{"class":167},[33,181943,181944,181946,181948,181950,181953,181955,181958,181960,181962,181964,181966,181969,181971,181973,181975],{"class":35,"line":224},[33,181945,9414],{"class":50},[33,181947,602],{"class":167},[33,181949,4059],{"class":163},[33,181951,181952],{"class":54},"\"Native physical size: ",[33,181954,1115],{"class":50},[33,181956,181957],{"class":167},"width_in",[33,181959,55819],{"class":163},[33,181961,1121],{"class":50},[33,181963,179197],{"class":54},[33,181965,1115],{"class":50},[33,181967,181968],{"class":167},"height_in",[33,181970,55819],{"class":163},[33,181972,1121],{"class":50},[33,181974,179293],{"class":54},[33,181976,221],{"class":167},[33,181978,181979,181981,181983],{"class":35,"line":229},[33,181980,35726],{"class":163},[33,181982,2945],{"class":50},[33,181984,574],{"class":167},[33,181986,181987,181989,181991,181994],{"class":35,"line":235},[33,181988,7268],{"class":50},[33,181990,602],{"class":167},[33,181992,181993],{"class":54},"\"Image not found — check the path\"",[33,181995,221],{"class":167},[33,181997,181998,182000,182002,182004],{"class":35,"line":250},[33,181999,35726],{"class":163},[33,182001,783],{"class":50},[33,182003,1852],{"class":163},[33,182005,7583],{"class":167},[33,182007,182008,182010,182012,182014,182017,182019,182021,182023,182025],{"class":35,"line":266},[33,182009,7268],{"class":50},[33,182011,602],{"class":167},[33,182013,4059],{"class":163},[33,182015,182016],{"class":54},"\"Could not open image: ",[33,182018,1115],{"class":50},[33,182020,7602],{"class":167},[33,182022,1121],{"class":50},[33,182024,274],{"class":54},[33,182026,221],{"class":167},[14,182028,182029,182030,2012,182032,182034,182035,107354,182038,182041],{},"If the native physical size is wildly wrong for your document layout, you need to pass 
an explicit ",[30,182031,56684],{},[30,182033,61972],{}," argument to ",[30,182036,182037],{},"add_picture()",[940,182039,179035],{"href":182040},"\u002Fword-document-templating-batch-processing\u002Finserting-images-into-word-documents\u002Ffix-image-too-large-in-python-docx\u002F"," for the full diagnosis.",[18,182043,182045],{"id":182044},"_2-basic-insertion-with-explicit-width","2. Basic Insertion with Explicit Width",[14,182047,182048,182051,182052,2012,182054,182056],{},[30,182049,182050],{},"Document.add_picture(image_path_or_stream, width, height)"," — supply exactly one of ",[30,182053,56684],{},[30,182055,61972],{}," and python-docx preserves the aspect ratio automatically.",[23,182058,182060],{"className":126,"code":182059,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\n\nIMAGE = Path(\"assets\u002Flogo.png\")\nOUTPUT = Path(\"output\u002Fbasic_image.docx\")\n\ndoc = Document()\ndoc.add_heading(\"Report Header\", level=1)\ndoc.add_paragraph(\"This report includes a company logo.\")\n\ntry:\n    doc.add_picture(str(IMAGE), width=Inches(2.0))\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept FileNotFoundError:\n    print(f\"Image not found: {IMAGE}\")\nexcept Exception as e:\n    print(f\"Insertion failed: 
{e}\")\n",[30,182061,182062,182066,182076,182086,182096,182100,182112,182125,182129,182137,182154,182163,182167,182173,182195,182207,182223,182231,182247,182257],{"__ignoreMap":28},[33,182063,182064],{"class":35,"line":36},[33,182065,156213],{"class":39},[33,182067,182068,182070,182072,182074],{"class":35,"line":43},[33,182069,190],{"class":163},[33,182071,193],{"class":167},[33,182073,164],{"class":163},[33,182075,198],{"class":167},[33,182077,182078,182080,182082,182084],{"class":35,"line":61},[33,182079,190],{"class":163},[33,182081,18092],{"class":167},[33,182083,164],{"class":163},[33,182085,18097],{"class":167},[33,182087,182088,182090,182092,182094],{"class":35,"line":73},[33,182089,190],{"class":163},[33,182091,18104],{"class":167},[33,182093,164],{"class":163},[33,182095,157048],{"class":167},[33,182097,182098],{"class":35,"line":88},[33,182099,92],{"emptyLinePlaceholder":91},[33,182101,182102,182104,182106,182108,182110],{"class":35,"line":95},[33,182103,179422],{"class":50},[33,182105,212],{"class":163},[33,182107,215],{"class":167},[33,182109,179123],{"class":54},[33,182111,221],{"class":167},[33,182113,182114,182116,182118,182120,182123],{"class":35,"line":101},[33,182115,96935],{"class":50},[33,182117,212],{"class":163},[33,182119,215],{"class":167},[33,182121,182122],{"class":54},"\"output\u002Fbasic_image.docx\"",[33,182124,221],{"class":167},[33,182126,182127],{"class":35,"line":171},[33,182128,92],{"emptyLinePlaceholder":91},[33,182130,182131,182133,182135],{"class":35,"line":179},[33,182132,156566],{"class":167},[33,182134,242],{"class":163},[33,182136,18229],{"class":167},[33,182138,182139,182141,182144,182146,182148,182150,182152],{"class":35,"line":187},[33,182140,156723],{"class":167},[33,182142,182143],{"class":54},"\"Report 
Header\"",[33,182145,365],{"class":167},[33,182147,18267],{"class":238},[33,182149,242],{"class":163},[33,182151,734],{"class":50},[33,182153,221],{"class":167},[33,182155,182156,182158,182161],{"class":35,"line":201},[33,182157,163149],{"class":167},[33,182159,182160],{"class":54},"\"This report includes a company logo.\"",[33,182162,221],{"class":167},[33,182164,182165],{"class":35,"line":206},[33,182166,92],{"emptyLinePlaceholder":91},[33,182168,182169,182171],{"class":35,"line":224},[33,182170,35574],{"class":163},[33,182172,574],{"class":167},[33,182174,182175,182177,182179,182181,182183,182185,182187,182189,182191,182193],{"class":35,"line":229},[33,182176,179492],{"class":167},[33,182178,1053],{"class":50},[33,182180,602],{"class":167},[33,182182,179422],{"class":50},[33,182184,18525],{"class":167},[33,182186,56684],{"class":238},[33,182188,242],{"class":163},[33,182190,179507],{"class":167},[33,182192,121926],{"class":50},[33,182194,371],{"class":167},[33,182196,182197,182199,182201,182203,182205],{"class":35,"line":235},[33,182198,85716],{"class":167},[33,182200,1053],{"class":50},[33,182202,602],{"class":167},[33,182204,96935],{"class":50},[33,182206,371],{"class":167},[33,182208,182209,182211,182213,182215,182217,182219,182221],{"class":35,"line":250},[33,182210,7268],{"class":50},[33,182212,602],{"class":167},[33,182214,4059],{"class":163},[33,182216,97737],{"class":54},[33,182218,97684],{"class":50},[33,182220,274],{"class":54},[33,182222,221],{"class":167},[33,182224,182225,182227,182229],{"class":35,"line":266},[33,182226,35726],{"class":163},[33,182228,2945],{"class":50},[33,182230,574],{"class":167},[33,182232,182233,182235,182237,182239,182241,182243,182245],{"class":35,"line":290},[33,182234,7268],{"class":50},[33,182236,602],{"class":167},[33,182238,4059],{"class":163},[33,182240,179562],{"class":54},[33,182242,179565],{"class":50},[33,182244,274],{"class":54},[33,182246,221],{"class":167},[33,182248,182249,182251,182253,182255],{"class":35,"line
":295},[33,182250,35726],{"class":163},[33,182252,783],{"class":50},[33,182254,1852],{"class":163},[33,182256,7583],{"class":167},[33,182258,182259,182261,182263,182265,182267,182269,182271,182273,182275],{"class":35,"line":300},[33,182260,7268],{"class":50},[33,182262,602],{"class":167},[33,182264,4059],{"class":163},[33,182266,179590],{"class":54},[33,182268,1115],{"class":50},[33,182270,7602],{"class":167},[33,182272,1121],{"class":50},[33,182274,274],{"class":54},[33,182276,221],{"class":167},[14,182278,182279,182280,182283,182284,10065,182286,182288],{},"Passing only ",[30,182281,182282],{},"width=Inches(2.0)"," scales the height proportionally. Passing both ",[30,182285,56684],{},[30,182287,61972],{}," overrides the aspect ratio — use that only when you deliberately want distortion (for example, a fixed-size icon cell in a table).",[2540,182290,2547,182292,2547,182295,2547,182298,2547,2547,182312,2547,182314,2547,182317,2547,182320,2547,2547,182323,2547,2547,182326,2547,182329,2547,182331,2547,182333,2547,182337,2547,182340,2547,2547,182343,2547,2547,182345,2547,182347,2547,182350,2547,182353,2547,182355,2547,182358,2547,182361,2547,182363,2547,182366,2547,2547,182369,2547,182371,2547,182373,2547,182375],{"viewBox":11071,"role":2543,"ariaLabel":182291,"xmlns":2545,"style":2546},"Flow from image file or bytes through add_picture with width parameter to positioned element in document",[2549,182293,182294],{},"Image insertion pipeline",[2553,182296,182297],{},"Shows the three-stage pipeline: image source (file or bytes), add_picture with width argument, and final placement in the document body, table cell, or 
header.",[2557,182299,2559,182300,2559,182307,2547],{},[2561,182301,2564,182303,2564,182305,2559],{"id":182302,"x1":748,"y1":748,"x2":734,"y2":748},"word-images-grad",[2566,182304],{"offset":748,"style":2568},[2566,182306],{"offset":734,"style":2571},[2573,182308,2564,182310,2559],{"id":182309,"viewBox":2576,"refX":2577,"refY":1153,"markerWidth":1179,"markerHeight":1179,"orient":2578},"word-images-arrow",[2580,182311],{"d":2582,"fill":2583},[2585,182313],{"x":2587,"y":2679,"width":2610,"height":2650,"rx":3545,"fill":2592,"stroke":2593,"style":2594},[2000,182315,182316],{"x":2650,"y":2635,"fill":2599,"style":38718},"Image Source",[2000,182318,182319],{"x":2650,"y":11115,"fill":2583,"style":2685},"Path \u002F BytesIO",[2000,182321,182322],{"x":2650,"y":125458,"fill":2583,"style":2605},"PNG · JPEG · BMP",[35,182324],{"x1":58337,"y1":2610,"x2":26446,"y2":2610,"stroke":2583,"markerEnd":182325,"style":2594},"url(#word-images-arrow)",[2585,182327],{"x":26446,"y":2630,"width":2611,"height":2609,"rx":3545,"fill":182328,"stroke":2593,"style":2594},"url(#word-images-grad)",[2000,182330,182037],{"x":152431,"y":26345,"fill":2599,"style":38718},[2000,182332,181168],{"x":152431,"y":114598,"fill":2599,"style":2685},[2000,182334,182336],{"x":152431,"y":182335,"fill":2583,"style":2605},"173","aspect ratio preserved",[2000,182338,182339],{"x":152431,"y":64936,"fill":2583,"style":2605},"EMU units internally",[2000,182341,182342],{"x":152431,"y":11183,"fill":2583,"style":2605},"returns InlineShape",[35,182344],{"x1":58352,"y1":2610,"x2":166793,"y2":2610,"stroke":2583,"markerEnd":182325,"style":2594},[2585,182346],{"x":166793,"y":2680,"width":2664,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,182348,182349],{"x":114673,"y":59963,"fill":2599,"style":2600},"Body paragraph",[2000,182351,182352],{"x":114673,"y":38741,"fill":2583,"style":2605},"inline 
flow",[2585,182354],{"x":166793,"y":2588,"width":2664,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,182356,182357],{"x":114673,"y":11132,"fill":2599,"style":2600},"Table cell",[2000,182359,182360],{"x":114673,"y":158104,"fill":2583,"style":2605},"constrain to cell width",[2585,182362],{"x":166793,"y":58401,"width":2664,"height":2590,"rx":2591,"fill":2592,"stroke":2593,"style":2594},[2000,182364,182365],{"x":114673,"y":64880,"fill":2599,"style":2600},"Header \u002F Footer",[2000,182367,182368],{"x":114673,"y":26415,"fill":2583,"style":2605},"via section paragraph",[35,182370],{"x1":166793,"y1":2597,"x2":110841,"y2":2597,"stroke":2593,"style":11105},[35,182372],{"x1":166793,"y1":2610,"x2":110841,"y2":2610,"stroke":2593,"style":11105},[35,182374],{"x1":166793,"y1":17008,"x2":110841,"y2":17008,"stroke":2593,"style":11105},[35,182376],{"x1":110841,"y1":2597,"x2":110841,"y2":17008,"stroke":2593,"style":11105},[18,182378,182380],{"id":182379},"_3-fitting-an-image-to-page-width","3. Fitting an Image to Page Width",[14,182382,182383],{},"For full-width images, compute the usable page width from the section margins rather than hardcoding inches. 
This adapts automatically if the template uses non-standard margins.",[23,182385,182387],{"className":126,"code":182386,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\n\nREPORT_IMAGE = Path(\"assets\u002Fchart.png\")\nOUTPUT = Path(\"output\u002Ffull_width_image.docx\")\n\ndoc = Document()\n\n# Read page geometry from the first section\nsection = doc.sections[0]\npage_width = section.page_width\nleft_margin = section.left_margin\nright_margin = section.right_margin\nusable_width = page_width - left_margin - right_margin\n\ndoc.add_heading(\"Monthly Sales Chart\", level=2)\ntry:\n    doc.add_picture(str(REPORT_IMAGE), width=usable_width)\n    doc.save(str(OUTPUT))\n    print(f\"Image width: {usable_width.inches:.2f} in — saved {OUTPUT}\")\nexcept FileNotFoundError:\n    print(f\"Image not found: {REPORT_IMAGE}\")\nexcept Exception as e:\n    print(f\"Could not insert image: {e}\")\n",[30,182388,182389,182393,182403,182413,182423,182427,182441,182454,182458,182466,182470,182475,182487,182497,182507,182516,182535,182539,182556,182562,182581,182593,182621,182629,182646,182656],{"__ignoreMap":28},[33,182390,182391],{"class":35,"line":36},[33,182392,156213],{"class":39},[33,182394,182395,182397,182399,182401],{"class":35,"line":43},[33,182396,190],{"class":163},[33,182398,193],{"class":167},[33,182400,164],{"class":163},[33,182402,198],{"class":167},[33,182404,182405,182407,182409,182411],{"class":35,"line":61},[33,182406,190],{"class":163},[33,182408,18092],{"class":167},[33,182410,164],{"class":163},[33,182412,18097],{"class":167},[33,182414,182415,182417,182419,182421],{"class":35,"line":73},[33,182416,190],{"class":163},[33,182418,18104],{"class":167},[33,182420,164],{"class":163},[33,182422,157048],{"class":167},[33,182424,182425],{"class":35,"line":88},[33,182426,92],{"emptyLinePlaceholder":91},[33,182428,182429,182432,182434,182436,182439],{"class":35,"line":95},[33
,182430,182431],{"class":50},"REPORT_IMAGE",[33,182433,212],{"class":163},[33,182435,215],{"class":167},[33,182437,182438],{"class":54},"\"assets\u002Fchart.png\"",[33,182440,221],{"class":167},[33,182442,182443,182445,182447,182449,182452],{"class":35,"line":101},[33,182444,96935],{"class":50},[33,182446,212],{"class":163},[33,182448,215],{"class":167},[33,182450,182451],{"class":54},"\"output\u002Ffull_width_image.docx\"",[33,182453,221],{"class":167},[33,182455,182456],{"class":35,"line":171},[33,182457,92],{"emptyLinePlaceholder":91},[33,182459,182460,182462,182464],{"class":35,"line":179},[33,182461,156566],{"class":167},[33,182463,242],{"class":163},[33,182465,18229],{"class":167},[33,182467,182468],{"class":35,"line":187},[33,182469,92],{"emptyLinePlaceholder":91},[33,182471,182472],{"class":35,"line":201},[33,182473,182474],{"class":39},"# Read page geometry from the first section\n",[33,182476,182477,182479,182481,182483,182485],{"class":35,"line":206},[33,182478,156584],{"class":167},[33,182480,242],{"class":163},[33,182482,156589],{"class":167},[33,182484,748],{"class":50},[33,182486,9202],{"class":167},[33,182488,182489,182492,182494],{"class":35,"line":224},[33,182490,182491],{"class":167},"page_width ",[33,182493,242],{"class":163},[33,182495,182496],{"class":167}," section.page_width\n",[33,182498,182499,182502,182504],{"class":35,"line":229},[33,182500,182501],{"class":167},"left_margin ",[33,182503,242],{"class":163},[33,182505,182506],{"class":167}," section.left_margin\n",[33,182508,182509,182512,182514],{"class":35,"line":235},[33,182510,182511],{"class":167},"right_margin ",[33,182513,242],{"class":163},[33,182515,180258],{"class":167},[33,182517,182518,182520,182522,182525,182527,182530,182532],{"class":35,"line":250},[33,182519,179716],{"class":167},[33,182521,242],{"class":163},[33,182523,182524],{"class":167}," page_width ",[33,182526,4126],{"class":163},[33,182528,182529],{"class":167}," left_margin 
",[33,182531,4126],{"class":163},[33,182533,182534],{"class":167}," right_margin\n",[33,182536,182537],{"class":35,"line":266},[33,182538,92],{"emptyLinePlaceholder":91},[33,182540,182541,182543,182546,182548,182550,182552,182554],{"class":35,"line":290},[33,182542,156723],{"class":167},[33,182544,182545],{"class":54},"\"Monthly Sales Chart\"",[33,182547,365],{"class":167},[33,182549,18267],{"class":238},[33,182551,242],{"class":163},[33,182553,1533],{"class":50},[33,182555,221],{"class":167},[33,182557,182558,182560],{"class":35,"line":295},[33,182559,35574],{"class":163},[33,182561,574],{"class":167},[33,182563,182564,182566,182568,182570,182572,182574,182576,182578],{"class":35,"line":300},[33,182565,179492],{"class":167},[33,182567,1053],{"class":50},[33,182569,602],{"class":167},[33,182571,182431],{"class":50},[33,182573,18525],{"class":167},[33,182575,56684],{"class":238},[33,182577,242],{"class":163},[33,182579,182580],{"class":167},"usable_width)\n",[33,182582,182583,182585,182587,182589,182591],{"class":35,"line":317},[33,182584,85716],{"class":167},[33,182586,1053],{"class":50},[33,182588,602],{"class":167},[33,182590,96935],{"class":50},[33,182592,371],{"class":167},[33,182594,182595,182597,182599,182601,182604,182606,182609,182611,182613,182615,182617,182619],{"class":35,"line":332},[33,182596,7268],{"class":50},[33,182598,602],{"class":167},[33,182600,4059],{"class":163},[33,182602,182603],{"class":54},"\"Image width: 
",[33,182605,1115],{"class":50},[33,182607,182608],{"class":167},"usable_width.inches",[33,182610,55819],{"class":163},[33,182612,1121],{"class":50},[33,182614,179841],{"class":54},[33,182616,97684],{"class":50},[33,182618,274],{"class":54},[33,182620,221],{"class":167},[33,182622,182623,182625,182627],{"class":35,"line":347},[33,182624,35726],{"class":163},[33,182626,2945],{"class":50},[33,182628,574],{"class":167},[33,182630,182631,182633,182635,182637,182639,182642,182644],{"class":35,"line":374},[33,182632,7268],{"class":50},[33,182634,602],{"class":167},[33,182636,4059],{"class":163},[33,182638,179562],{"class":54},[33,182640,182641],{"class":50},"{REPORT_IMAGE}",[33,182643,274],{"class":54},[33,182645,221],{"class":167},[33,182647,182648,182650,182652,182654],{"class":35,"line":397},[33,182649,35726],{"class":163},[33,182651,783],{"class":50},[33,182653,1852],{"class":163},[33,182655,7583],{"class":167},[33,182657,182658,182660,182662,182664,182667,182669,182671,182673,182675],{"class":35,"line":653},[33,182659,7268],{"class":50},[33,182661,602],{"class":167},[33,182663,4059],{"class":163},[33,182665,182666],{"class":54},"\"Could not insert image: ",[33,182668,1115],{"class":50},[33,182670,7602],{"class":167},[33,182672,1121],{"class":50},[33,182674,274],{"class":54},[33,182676,221],{"class":167},[14,182678,182679,365,182682,71132,182685,182688,182689,182691,182692,182694],{},[30,182680,182681],{},"section.page_width",[30,182683,182684],{},"section.left_margin",[30,182686,182687],{},"section.right_margin"," are all in EMU (English Metric Units). python-docx arithmetic works directly in EMU — passing ",[30,182690,181172],{}," as the ",[30,182693,56684],{}," argument is valid without unit conversion.",[18,182696,182698],{"id":182697},"_4-placing-images-in-tables","4. Placing Images in Tables",[14,182700,182701],{},"Table cells are the most reliable way to position images side-by-side or to create a fixed-size image grid. 
Constrain the image to something narrower than the cell to leave interior padding.",[23,182703,182705],{"className":126,"code":182704,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\nfrom docx.enum.text import WD_ALIGN_PARAGRAPH\n\nLOGO = Path(\"assets\u002Flogo.png\")\nCHART = Path(\"assets\u002Fchart.png\")\nOUTPUT = Path(\"output\u002Ftable_images.docx\")\n\ndoc = Document()\ndoc.add_heading(\"Product Overview\", level=1)\n\ntable = doc.add_table(rows=1, cols=2)\ntable.style = \"Table Grid\"\n\nleft_cell = table.cell(0, 0)\nright_cell = table.cell(0, 1)\n\n# Insert into left cell\ntry:\n    para_l = left_cell.paragraphs[0]\n    para_l.alignment = WD_ALIGN_PARAGRAPH.CENTER\n    run_l = para_l.add_run()\n    run_l.add_picture(str(LOGO), width=Inches(2.5))\nexcept FileNotFoundError:\n    left_cell.text = \"[logo missing]\"\n\n# Insert into right cell\ntry:\n    para_r = right_cell.paragraphs[0]\n    para_r.alignment = WD_ALIGN_PARAGRAPH.CENTER\n    run_r = para_r.add_run()\n    run_r.add_picture(str(CHART), width=Inches(2.5))\nexcept FileNotFoundError:\n    right_cell.text = \"[chart missing]\"\n\ntry:\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept Exception as e:\n    print(f\"Save failed: 
{e}\")\n",[30,182706,182707,182711,182721,182731,182741,182751,182755,182767,182780,182793,182797,182805,182822,182826,182850,182858,182862,182879,182896,182900,182905,182911,182925,182940,182950,182973,182981,182990,182994,182999,183005,183019,183032,183042,183065,183073,183083,183087,183093,183105,183121,183131],{"__ignoreMap":28},[33,182708,182709],{"class":35,"line":36},[33,182710,156213],{"class":39},[33,182712,182713,182715,182717,182719],{"class":35,"line":43},[33,182714,190],{"class":163},[33,182716,193],{"class":167},[33,182718,164],{"class":163},[33,182720,198],{"class":167},[33,182722,182723,182725,182727,182729],{"class":35,"line":61},[33,182724,190],{"class":163},[33,182726,18092],{"class":167},[33,182728,164],{"class":163},[33,182730,18097],{"class":167},[33,182732,182733,182735,182737,182739],{"class":35,"line":73},[33,182734,190],{"class":163},[33,182736,18104],{"class":167},[33,182738,164],{"class":163},[33,182740,157048],{"class":167},[33,182742,182743,182745,182747,182749],{"class":35,"line":88},[33,182744,190],{"class":163},[33,182746,18116],{"class":167},[33,182748,164],{"class":163},[33,182750,18121],{"class":50},[33,182752,182753],{"class":35,"line":95},[33,182754,92],{"emptyLinePlaceholder":91},[33,182756,182757,182759,182761,182763,182765],{"class":35,"line":101},[33,182758,91271],{"class":50},[33,182760,212],{"class":163},[33,182762,215],{"class":167},[33,182764,179123],{"class":54},[33,182766,221],{"class":167},[33,182768,182769,182772,182774,182776,182778],{"class":35,"line":171},[33,182770,182771],{"class":50},"CHART",[33,182773,212],{"class":163},[33,182775,215],{"class":167},[33,182777,182438],{"class":54},[33,182779,221],{"class":167},[33,182781,182782,182784,182786,182788,182791],{"class":35,"line":179},[33,182783,96935],{"class":50},[33,182785,212],{"class":163},[33,182787,215],{"class":167},[33,182789,182790],{"class":54},"\"output\u002Ftable_images.docx\"",[33,182792,221],{"class":167},[33,182794,182795],{"class":35,"line":187},[3
3,182796,92],{"emptyLinePlaceholder":91},[33,182798,182799,182801,182803],{"class":35,"line":201},[33,182800,156566],{"class":167},[33,182802,242],{"class":163},[33,182804,18229],{"class":167},[33,182806,182807,182809,182812,182814,182816,182818,182820],{"class":35,"line":206},[33,182808,156723],{"class":167},[33,182810,182811],{"class":54},"\"Product Overview\"",[33,182813,365],{"class":167},[33,182815,18267],{"class":238},[33,182817,242],{"class":163},[33,182819,734],{"class":50},[33,182821,221],{"class":167},[33,182823,182824],{"class":35,"line":224},[33,182825,92],{"emptyLinePlaceholder":91},[33,182827,182828,182830,182832,182834,182836,182838,182840,182842,182844,182846,182848],{"class":35,"line":229},[33,182829,157220],{"class":167},[33,182831,242],{"class":163},[33,182833,18626],{"class":167},[33,182835,18629],{"class":238},[33,182837,242],{"class":163},[33,182839,734],{"class":50},[33,182841,365],{"class":167},[33,182843,18638],{"class":238},[33,182845,242],{"class":163},[33,182847,1533],{"class":50},[33,182849,221],{"class":167},[33,182851,182852,182854,182856],{"class":35,"line":235},[33,182853,157246],{"class":167},[33,182855,242],{"class":163},[33,182857,18655],{"class":54},[33,182859,182860],{"class":35,"line":250},[33,182861,92],{"emptyLinePlaceholder":91},[33,182863,182864,182867,182869,182871,182873,182875,182877],{"class":35,"line":266},[33,182865,182866],{"class":167},"left_cell ",[33,182868,242],{"class":163},[33,182870,158907],{"class":167},[33,182872,748],{"class":50},[33,182874,365],{"class":167},[33,182876,748],{"class":50},[33,182878,221],{"class":167},[33,182880,182881,182884,182886,182888,182890,182892,182894],{"class":35,"line":290},[33,182882,182883],{"class":167},"right_cell 
",[33,182885,242],{"class":163},[33,182887,158907],{"class":167},[33,182889,748],{"class":50},[33,182891,365],{"class":167},[33,182893,734],{"class":50},[33,182895,221],{"class":167},[33,182897,182898],{"class":35,"line":295},[33,182899,92],{"emptyLinePlaceholder":91},[33,182901,182902],{"class":35,"line":300},[33,182903,182904],{"class":39},"# Insert into left cell\n",[33,182906,182907,182909],{"class":35,"line":317},[33,182908,35574],{"class":163},[33,182910,574],{"class":167},[33,182912,182913,182916,182918,182921,182923],{"class":35,"line":332},[33,182914,182915],{"class":167},"    para_l ",[33,182917,242],{"class":163},[33,182919,182920],{"class":167}," left_cell.paragraphs[",[33,182922,748],{"class":50},[33,182924,9202],{"class":167},[33,182926,182927,182930,182932,182935,182937],{"class":35,"line":347},[33,182928,182929],{"class":167},"    para_l.alignment ",[33,182931,242],{"class":163},[33,182933,182934],{"class":50}," WD_ALIGN_PARAGRAPH",[33,182936,3035],{"class":167},[33,182938,182939],{"class":50},"CENTER\n",[33,182941,182942,182945,182947],{"class":35,"line":374},[33,182943,182944],{"class":167},"    run_l ",[33,182946,242],{"class":163},[33,182948,182949],{"class":167}," para_l.add_run()\n",[33,182951,182952,182955,182957,182959,182961,182963,182965,182967,182969,182971],{"class":35,"line":397},[33,182953,182954],{"class":167},"    run_l.add_picture(",[33,182956,1053],{"class":50},[33,182958,602],{"class":167},[33,182960,91271],{"class":50},[33,182962,18525],{"class":167},[33,182964,56684],{"class":238},[33,182966,242],{"class":163},[33,182968,179507],{"class":167},[33,182970,19760],{"class":50},[33,182972,371],{"class":167},[33,182974,182975,182977,182979],{"class":35,"line":653},[33,182976,35726],{"class":163},[33,182978,2945],{"class":50},[33,182980,574],{"class":167},[33,182982,182983,182986,182988],{"class":35,"line":667},[33,182984,182985],{"class":167},"    left_cell.text 
",[33,182987,242],{"class":163},[33,182989,180427],{"class":54},[33,182991,182992],{"class":35,"line":675},[33,182993,92],{"emptyLinePlaceholder":91},[33,182995,182996],{"class":35,"line":689},[33,182997,182998],{"class":39},"# Insert into right cell\n",[33,183000,183001,183003],{"class":35,"line":703},[33,183002,35574],{"class":163},[33,183004,574],{"class":167},[33,183006,183007,183010,183012,183015,183017],{"class":35,"line":714},[33,183008,183009],{"class":167},"    para_r ",[33,183011,242],{"class":163},[33,183013,183014],{"class":167}," right_cell.paragraphs[",[33,183016,748],{"class":50},[33,183018,9202],{"class":167},[33,183020,183021,183024,183026,183028,183030],{"class":35,"line":723},[33,183022,183023],{"class":167},"    para_r.alignment ",[33,183025,242],{"class":163},[33,183027,182934],{"class":50},[33,183029,3035],{"class":167},[33,183031,182939],{"class":50},[33,183033,183034,183037,183039],{"class":35,"line":754},[33,183035,183036],{"class":167},"    run_r ",[33,183038,242],{"class":163},[33,183040,183041],{"class":167}," para_r.add_run()\n",[33,183043,183044,183047,183049,183051,183053,183055,183057,183059,183061,183063],{"class":35,"line":771},[33,183045,183046],{"class":167},"    run_r.add_picture(",[33,183048,1053],{"class":50},[33,183050,602],{"class":167},[33,183052,182771],{"class":50},[33,183054,18525],{"class":167},[33,183056,56684],{"class":238},[33,183058,242],{"class":163},[33,183060,179507],{"class":167},[33,183062,19760],{"class":50},[33,183064,371],{"class":167},[33,183066,183067,183069,183071],{"class":35,"line":777},[33,183068,35726],{"class":163},[33,183070,2945],{"class":50},[33,183072,574],{"class":167},[33,183074,183075,183078,183080],{"class":35,"line":788},[33,183076,183077],{"class":167},"    right_cell.text ",[33,183079,242],{"class":163},[33,183081,183082],{"class":54}," \"[chart 
missing]\"\n",[33,183084,183085],{"class":35,"line":804},[33,183086,92],{"emptyLinePlaceholder":91},[33,183088,183089,183091],{"class":35,"line":809},[33,183090,35574],{"class":163},[33,183092,574],{"class":167},[33,183094,183095,183097,183099,183101,183103],{"class":35,"line":819},[33,183096,85716],{"class":167},[33,183098,1053],{"class":50},[33,183100,602],{"class":167},[33,183102,96935],{"class":50},[33,183104,371],{"class":167},[33,183106,183107,183109,183111,183113,183115,183117,183119],{"class":35,"line":829},[33,183108,7268],{"class":50},[33,183110,602],{"class":167},[33,183112,4059],{"class":163},[33,183114,97737],{"class":54},[33,183116,97684],{"class":50},[33,183118,274],{"class":54},[33,183120,221],{"class":167},[33,183122,183123,183125,183127,183129],{"class":35,"line":834},[33,183124,35726],{"class":163},[33,183126,783],{"class":50},[33,183128,1852],{"class":163},[33,183130,7583],{"class":167},[33,183132,183133,183135,183137,183139,183141,183143,183145,183147,183149],{"class":35,"line":839},[33,183134,7268],{"class":50},[33,183136,602],{"class":167},[33,183138,4059],{"class":163},[33,183140,158012],{"class":54},[33,183142,1115],{"class":50},[33,183144,7602],{"class":167},[33,183146,1121],{"class":50},[33,183148,274],{"class":54},[33,183150,221],{"class":167},[14,183152,42581,183153,183155,183156,183158,183159,3035],{},[30,183154,179373],{}," inside a table cell is called on a ",[30,183157,161368],{},", not on the document directly. Access it via ",[30,183160,183161],{},"paragraph.add_run().add_picture(...)",[18,183163,183165],{"id":183164},"_5-inserting-images-into-headers-and-footers","5. Inserting Images into Headers and Footers",[14,183167,183168,183169,183172],{},"Headers and footers are separate ",[30,183170,183171],{},"_HeaderFooter"," objects. 
Access the paragraph inside them and insert via a run, exactly as you would in a table cell.",[23,183174,183176],{"className":126,"code":183175,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\nfrom docx.enum.text import WD_ALIGN_PARAGRAPH\n\nLOGO = Path(\"assets\u002Flogo.png\")\nOUTPUT = Path(\"output\u002Fheader_logo.docx\")\n\ndoc = Document()\nsection = doc.sections[0]\n\n# -- Header logo --\nheader = section.header\nheader_para = header.paragraphs[0]\nheader_para.alignment = WD_ALIGN_PARAGRAPH.RIGHT\nheader_run = header_para.add_run()\ntry:\n    header_run.add_picture(str(LOGO), width=Inches(1.2))\nexcept FileNotFoundError:\n    header_para.text = \"Logo missing\"\n\n# -- Footer page note --\nfooter = section.footer\nfooter_para = footer.paragraphs[0]\nfooter_para.text = \"Confidential — \"\nfooter_run = footer_para.add_run()\ntry:\n    footer_run.add_picture(str(LOGO), width=Inches(0.6))\nexcept FileNotFoundError:\n    footer_run.text = \"[logo]\"\n\ndoc.add_paragraph(\"Document body text here.\")\ntry:\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept Exception as e:\n    print(f\"Save failed: 
{e}\")\n",[30,183177,183178,183182,183192,183202,183212,183222,183226,183238,183251,183255,183263,183275,183279,183284,183292,183306,183320,183330,183336,183360,183368,183378,183382,183387,183396,183408,183418,183427,183433,183456,183464,183474,183478,183487,183493,183505,183521,183531],{"__ignoreMap":28},[33,183179,183180],{"class":35,"line":36},[33,183181,156213],{"class":39},[33,183183,183184,183186,183188,183190],{"class":35,"line":43},[33,183185,190],{"class":163},[33,183187,193],{"class":167},[33,183189,164],{"class":163},[33,183191,198],{"class":167},[33,183193,183194,183196,183198,183200],{"class":35,"line":61},[33,183195,190],{"class":163},[33,183197,18092],{"class":167},[33,183199,164],{"class":163},[33,183201,18097],{"class":167},[33,183203,183204,183206,183208,183210],{"class":35,"line":73},[33,183205,190],{"class":163},[33,183207,18104],{"class":167},[33,183209,164],{"class":163},[33,183211,157048],{"class":167},[33,183213,183214,183216,183218,183220],{"class":35,"line":88},[33,183215,190],{"class":163},[33,183217,18116],{"class":167},[33,183219,164],{"class":163},[33,183221,18121],{"class":50},[33,183223,183224],{"class":35,"line":95},[33,183225,92],{"emptyLinePlaceholder":91},[33,183227,183228,183230,183232,183234,183236],{"class":35,"line":101},[33,183229,91271],{"class":50},[33,183231,212],{"class":163},[33,183233,215],{"class":167},[33,183235,179123],{"class":54},[33,183237,221],{"class":167},[33,183239,183240,183242,183244,183246,183249],{"class":35,"line":171},[33,183241,96935],{"class":50},[33,183243,212],{"class":163},[33,183245,215],{"class":167},[33,183247,183248],{"class":54},"\"output\u002Fheader_logo.docx\"",[33,183250,221],{"class":167},[33,183252,183253],{"class":35,"line":179},[33,183254,92],{"emptyLinePlaceholder":91},[33,183256,183257,183259,183261],{"class":35,"line":187},[33,183258,156566],{"class":167},[33,183260,242],{"class":163},[33,183262,18229],{"class":167},[33,183264,183265,183267,183269,183271,183273],{"class":35,"line":201
},[33,183266,156584],{"class":167},[33,183268,242],{"class":163},[33,183270,156589],{"class":167},[33,183272,748],{"class":50},[33,183274,9202],{"class":167},[33,183276,183277],{"class":35,"line":206},[33,183278,92],{"emptyLinePlaceholder":91},[33,183280,183281],{"class":35,"line":224},[33,183282,183283],{"class":39},"# -- Header logo --\n",[33,183285,183286,183288,183290],{"class":35,"line":229},[33,183287,48309],{"class":167},[33,183289,242],{"class":163},[33,183291,157709],{"class":167},[33,183293,183294,183297,183299,183302,183304],{"class":35,"line":235},[33,183295,183296],{"class":167},"header_para ",[33,183298,242],{"class":163},[33,183300,183301],{"class":167}," header.paragraphs[",[33,183303,748],{"class":50},[33,183305,9202],{"class":167},[33,183307,183308,183311,183313,183315,183317],{"class":35,"line":250},[33,183309,183310],{"class":167},"header_para.alignment ",[33,183312,242],{"class":163},[33,183314,182934],{"class":50},[33,183316,3035],{"class":167},[33,183318,183319],{"class":50},"RIGHT\n",[33,183321,183322,183325,183327],{"class":35,"line":266},[33,183323,183324],{"class":167},"header_run ",[33,183326,242],{"class":163},[33,183328,183329],{"class":167}," header_para.add_run()\n",[33,183331,183332,183334],{"class":35,"line":290},[33,183333,35574],{"class":163},[33,183335,574],{"class":167},[33,183337,183338,183341,183343,183345,183347,183349,183351,183353,183355,183358],{"class":35,"line":295},[33,183339,183340],{"class":167},"    
header_run.add_picture(",[33,183342,1053],{"class":50},[33,183344,602],{"class":167},[33,183346,91271],{"class":50},[33,183348,18525],{"class":167},[33,183350,56684],{"class":238},[33,183352,242],{"class":163},[33,183354,179507],{"class":167},[33,183356,183357],{"class":50},"1.2",[33,183359,371],{"class":167},[33,183361,183362,183364,183366],{"class":35,"line":300},[33,183363,35726],{"class":163},[33,183365,2945],{"class":50},[33,183367,574],{"class":167},[33,183369,183370,183373,183375],{"class":35,"line":317},[33,183371,183372],{"class":167},"    header_para.text ",[33,183374,242],{"class":163},[33,183376,183377],{"class":54}," \"Logo missing\"\n",[33,183379,183380],{"class":35,"line":332},[33,183381,92],{"emptyLinePlaceholder":91},[33,183383,183384],{"class":35,"line":347},[33,183385,183386],{"class":39},"# -- Footer page note --\n",[33,183388,183389,183392,183394],{"class":35,"line":374},[33,183390,183391],{"class":167},"footer ",[33,183393,242],{"class":163},[33,183395,157719],{"class":167},[33,183397,183398,183400,183402,183404,183406],{"class":35,"line":397},[33,183399,157779],{"class":167},[33,183401,242],{"class":163},[33,183403,157784],{"class":167},[33,183405,748],{"class":50},[33,183407,9202],{"class":167},[33,183409,183410,183413,183415],{"class":35,"line":653},[33,183411,183412],{"class":167},"footer_para.text ",[33,183414,242],{"class":163},[33,183416,183417],{"class":54}," \"Confidential — \"\n",[33,183419,183420,183423,183425],{"class":35,"line":667},[33,183421,183422],{"class":167},"footer_run ",[33,183424,242],{"class":163},[33,183426,157912],{"class":167},[33,183428,183429,183431],{"class":35,"line":675},[33,183430,35574],{"class":163},[33,183432,574],{"class":167},[33,183434,183435,183438,183440,183442,183444,183446,183448,183450,183452,183454],{"class":35,"line":689},[33,183436,183437],{"class":167},"    
footer_run.add_picture(",[33,183439,1053],{"class":50},[33,183441,602],{"class":167},[33,183443,91271],{"class":50},[33,183445,18525],{"class":167},[33,183447,56684],{"class":238},[33,183449,242],{"class":163},[33,183451,179507],{"class":167},[33,183453,46090],{"class":50},[33,183455,371],{"class":167},[33,183457,183458,183460,183462],{"class":35,"line":703},[33,183459,35726],{"class":163},[33,183461,2945],{"class":50},[33,183463,574],{"class":167},[33,183465,183466,183469,183471],{"class":35,"line":714},[33,183467,183468],{"class":167},"    footer_run.text ",[33,183470,242],{"class":163},[33,183472,183473],{"class":54}," \"[logo]\"\n",[33,183475,183476],{"class":35,"line":723},[33,183477,92],{"emptyLinePlaceholder":91},[33,183479,183480,183482,183485],{"class":35,"line":754},[33,183481,163149],{"class":167},[33,183483,183484],{"class":54},"\"Document body text here.\"",[33,183486,221],{"class":167},[33,183488,183489,183491],{"class":35,"line":771},[33,183490,35574],{"class":163},[33,183492,574],{"class":167},[33,183494,183495,183497,183499,183501,183503],{"class":35,"line":777},[33,183496,85716],{"class":167},[33,183498,1053],{"class":50},[33,183500,602],{"class":167},[33,183502,96935],{"class":50},[33,183504,371],{"class":167},[33,183506,183507,183509,183511,183513,183515,183517,183519],{"class":35,"line":788},[33,183508,7268],{"class":50},[33,183510,602],{"class":167},[33,183512,4059],{"class":163},[33,183514,97737],{"class":54},[33,183516,97684],{"class":50},[33,183518,274],{"class":54},[33,183520,221],{"class":167},[33,183522,183523,183525,183527,183529],{"class":35,"line":804},[33,183524,35726],{"class":163},[33,183526,783],{"class":50},[33,183528,1852],{"class":163},[33,183530,7583],{"class":167},[33,183532,183533,183535,183537,183539,183541,183543,183545,183547,183549],{"class":35,"line":809},[33,183534,7268],{"class":50},[33,183536,602],{"class":167},[33,183538,4059],{"class":163},[33,183540,158012],{"class":54},[33,183542,1115],{"class":50},[33,183544,7602
],{"class":167},[33,183546,1121],{"class":50},[33,183548,274],{"class":54},[33,183550,221],{"class":167},[14,183552,183553,183554,183557,183558,183561],{},"If the document uses different first-page headers, set ",[30,183555,183556],{},"section.different_first_page_header_footer = True"," and populate ",[30,183559,183560],{},"section.first_page_header"," separately.",[18,183563,183565],{"id":183564},"_6-inline-vs-floating-images","6. Inline vs Floating Images",[14,183567,183568,183569,183572],{},"python-docx inserts images as ",[1974,183570,183571],{},"inline shapes"," only — they flow with the text. Floating images (text wrap around an image) require raw OOXML manipulation because the python-docx API does not expose them.",[14,183574,183575,183576,183578],{},"For the majority of automated reporting workflows, inline is preferable: it is deterministic, survives re-flowing, and requires no XML patching. If you need floating images (e.g., for magazine-style layouts), embed a pre-positioned placeholder in the template and replace it via ",[30,183577,18047],{}," rather than injecting via python-docx.",[18,183580,183582],{"id":183581},"_7-adding-captions","7. Adding Captions",[14,183584,183585,183586,183589,183590,3035],{},"python-docx does not have a dedicated ",[30,183587,183588],{},"add_caption()"," method. 
The standard approach is to add a paragraph immediately after the picture paragraph and style it as ",[30,183591,183592],{},"Caption",[23,183594,183596],{"className":126,"code":183595,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches, Pt\nfrom docx.enum.text import WD_ALIGN_PARAGRAPH\n\nIMAGE = Path(\"assets\u002Fchart.png\")\nOUTPUT = Path(\"output\u002Fcaptioned_image.docx\")\n\ndoc = Document()\ndoc.add_heading(\"Quarterly Results\", level=2)\n\ntry:\n    doc.add_picture(str(IMAGE), width=Inches(4.0))\n    # Caption immediately follows the picture paragraph\n    caption = doc.add_paragraph(\"Figure 1 — Q3 revenue by product line\", style=\"Caption\")\n    caption.alignment = WD_ALIGN_PARAGRAPH.CENTER\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept FileNotFoundError:\n    print(f\"Image not found: {IMAGE}\")\nexcept Exception as e:\n    print(f\"Error: {e}\")\n",[30,183597,183598,183602,183612,183622,183632,183642,183646,183658,183671,183675,183683,183700,183704,183710,183733,183738,183761,183774,183786,183802,183810,183826,183836],{"__ignoreMap":28},[33,183599,183600],{"class":35,"line":36},[33,183601,156213],{"class":39},[33,183603,183604,183606,183608,183610],{"class":35,"line":43},[33,183605,190],{"class":163},[33,183607,193],{"class":167},[33,183609,164],{"class":163},[33,183611,198],{"class":167},[33,183613,183614,183616,183618,183620],{"class":35,"line":61},[33,183615,190],{"class":163},[33,183617,18092],{"class":167},[33,183619,164],{"class":163},[33,183621,18097],{"class":167},[33,183623,183624,183626,183628,183630],{"class":35,"line":73},[33,183625,190],{"class":163},[33,183627,18104],{"class":167},[33,183629,164],{"class":163},[33,183631,156506],{"class":167},[33,183633,183634,183636,183638,183640],{"class":35,"line":88},[33,183635,190],{"class":163},[33,183637,18116],{"class":167},[33,183639,164],{"class":163},[33,183641,18121],{"class"
:50},[33,183643,183644],{"class":35,"line":95},[33,183645,92],{"emptyLinePlaceholder":91},[33,183647,183648,183650,183652,183654,183656],{"class":35,"line":101},[33,183649,179422],{"class":50},[33,183651,212],{"class":163},[33,183653,215],{"class":167},[33,183655,182438],{"class":54},[33,183657,221],{"class":167},[33,183659,183660,183662,183664,183666,183669],{"class":35,"line":171},[33,183661,96935],{"class":50},[33,183663,212],{"class":163},[33,183665,215],{"class":167},[33,183667,183668],{"class":54},"\"output\u002Fcaptioned_image.docx\"",[33,183670,221],{"class":167},[33,183672,183673],{"class":35,"line":179},[33,183674,92],{"emptyLinePlaceholder":91},[33,183676,183677,183679,183681],{"class":35,"line":187},[33,183678,156566],{"class":167},[33,183680,242],{"class":163},[33,183682,18229],{"class":167},[33,183684,183685,183687,183690,183692,183694,183696,183698],{"class":35,"line":201},[33,183686,156723],{"class":167},[33,183688,183689],{"class":54},"\"Quarterly Results\"",[33,183691,365],{"class":167},[33,183693,18267],{"class":238},[33,183695,242],{"class":163},[33,183697,1533],{"class":50},[33,183699,221],{"class":167},[33,183701,183702],{"class":35,"line":206},[33,183703,92],{"emptyLinePlaceholder":91},[33,183705,183706,183708],{"class":35,"line":224},[33,183707,35574],{"class":163},[33,183709,574],{"class":167},[33,183711,183712,183714,183716,183718,183720,183722,183724,183726,183728,183731],{"class":35,"line":229},[33,183713,179492],{"class":167},[33,183715,1053],{"class":50},[33,183717,602],{"class":167},[33,183719,179422],{"class":50},[33,183721,18525],{"class":167},[33,183723,56684],{"class":238},[33,183725,242],{"class":163},[33,183727,179507],{"class":167},[33,183729,183730],{"class":50},"4.0",[33,183732,371],{"class":167},[33,183734,183735],{"class":35,"line":235},[33,183736,183737],{"class":39},"    # Caption immediately follows the picture 
paragraph\n",[33,183739,183740,183743,183745,183747,183750,183752,183754,183756,183759],{"class":35,"line":250},[33,183741,183742],{"class":167},"    caption ",[33,183744,242],{"class":163},[33,183746,156861],{"class":167},[33,183748,183749],{"class":54},"\"Figure 1 — Q3 revenue by product line\"",[33,183751,365],{"class":167},[33,183753,6953],{"class":238},[33,183755,242],{"class":163},[33,183757,183758],{"class":54},"\"Caption\"",[33,183760,221],{"class":167},[33,183762,183763,183766,183768,183770,183772],{"class":35,"line":266},[33,183764,183765],{"class":167},"    caption.alignment ",[33,183767,242],{"class":163},[33,183769,182934],{"class":50},[33,183771,3035],{"class":167},[33,183773,182939],{"class":50},[33,183775,183776,183778,183780,183782,183784],{"class":35,"line":290},[33,183777,85716],{"class":167},[33,183779,1053],{"class":50},[33,183781,602],{"class":167},[33,183783,96935],{"class":50},[33,183785,371],{"class":167},[33,183787,183788,183790,183792,183794,183796,183798,183800],{"class":35,"line":295},[33,183789,7268],{"class":50},[33,183791,602],{"class":167},[33,183793,4059],{"class":163},[33,183795,97737],{"class":54},[33,183797,97684],{"class":50},[33,183799,274],{"class":54},[33,183801,221],{"class":167},[33,183803,183804,183806,183808],{"class":35,"line":300},[33,183805,35726],{"class":163},[33,183807,2945],{"class":50},[33,183809,574],{"class":167},[33,183811,183812,183814,183816,183818,183820,183822,183824],{"class":35,"line":317},[33,183813,7268],{"class":50},[33,183815,602],{"class":167},[33,183817,4059],{"class":163},[33,183819,179562],{"class":54},[33,183821,179565],{"class":50},[33,183823,274],{"class":54},[33,183825,221],{"class":167},[33,183827,183828,183830,183832,183834],{"class":35,"line":332},[33,183829,35726],{"class":163},[33,183831,783],{"class":50},[33,183833,1852],{"class":163},[33,183835,7583],{"class":167},[33,183837,183838,183840,183842,183844,183846,183848,183850,183852,183854],{"class":35,"line":347},[33,183839,7268],{"class"
:50},[33,183841,602],{"class":167},[33,183843,4059],{"class":163},[33,183845,39108],{"class":54},[33,183847,1115],{"class":50},[33,183849,7602],{"class":167},[33,183851,1121],{"class":50},[33,183853,274],{"class":54},[33,183855,221],{"class":167},[14,183857,39550,183858,183860,183861,183863,183864,36715],{},[30,183859,183592],{}," style must exist in the document's style set. It is present in all default ",[30,183862,18041],{}," documents. If you open a stripped template that lacks it, add ",[30,183865,183866],{},"doc.styles.add_style(\"Caption\", WD_STYLE_TYPE.PARAGRAPH)",[18,183868,183870],{"id":183869},"_8-inserting-images-from-bytes-bytesio","8. Inserting Images from Bytes \u002F BytesIO",[14,183872,183873,183874,183877,183878,183880],{},"When images come from a database, S3, or an in-memory generation pipeline (e.g., a ",[30,183875,183876],{},"matplotlib"," chart), avoid writing to disk — pass a ",[30,183879,61504],{}," object directly.",[23,183882,183884],{"className":126,"code":183883,"language":47,"meta":28,"style":28},"# pip install python-docx matplotlib\nimport io\nfrom pathlib import Path\nimport matplotlib.pyplot as plt\nfrom docx import Document\nfrom docx.shared import Inches\n\nOUTPUT = Path(\"output\u002Fchart_from_bytes.docx\")\n\n# Generate a chart in memory\nfig, ax = plt.subplots(figsize=(6, 3))\nax.bar([\"Jan\", \"Feb\", \"Mar\"], [120, 145, 98])\nax.set_title(\"Monthly Units\")\nbuf = io.BytesIO()\nfig.savefig(buf, format=\"png\", dpi=150, bbox_inches=\"tight\")\nplt.close(fig)\nbuf.seek(0)\n\ndoc = Document()\ndoc.add_heading(\"Sales Report\", level=1)\ntry:\n    doc.add_picture(buf, width=Inches(5.0))\n    doc.save(str(OUTPUT))\n    print(f\"Saved: {OUTPUT}\")\nexcept Exception as e:\n    print(f\"Failed to insert chart: 
{e}\")\n",[30,183885,183886,183891,183897,183907,183917,183927,183937,183941,183954,183958,183963,183986,184015,184025,184033,184064,184069,184078,184082,184090,184107,184113,184129,184141,184157,184167],{"__ignoreMap":28},[33,183887,183888],{"class":35,"line":36},[33,183889,183890],{"class":39},"# pip install python-docx matplotlib\n",[33,183892,183893,183895],{"class":35,"line":43},[33,183894,164],{"class":163},[33,183896,60058],{"class":167},[33,183898,183899,183901,183903,183905],{"class":35,"line":61},[33,183900,190],{"class":163},[33,183902,193],{"class":167},[33,183904,164],{"class":163},[33,183906,198],{"class":167},[33,183908,183909,183911,183913,183915],{"class":35,"line":73},[33,183910,164],{"class":163},[33,183912,61552],{"class":167},[33,183914,495],{"class":163},[33,183916,61557],{"class":167},[33,183918,183919,183921,183923,183925],{"class":35,"line":88},[33,183920,190],{"class":163},[33,183922,18092],{"class":167},[33,183924,164],{"class":163},[33,183926,18097],{"class":167},[33,183928,183929,183931,183933,183935],{"class":35,"line":95},[33,183930,190],{"class":163},[33,183932,18104],{"class":167},[33,183934,164],{"class":163},[33,183936,157048],{"class":167},[33,183938,183939],{"class":35,"line":101},[33,183940,92],{"emptyLinePlaceholder":91},[33,183942,183943,183945,183947,183949,183952],{"class":35,"line":171},[33,183944,96935],{"class":50},[33,183946,212],{"class":163},[33,183948,215],{"class":167},[33,183950,183951],{"class":54},"\"output\u002Fchart_from_bytes.docx\"",[33,183953,221],{"class":167},[33,183955,183956],{"class":35,"line":179},[33,183957,92],{"emptyLinePlaceholder":91},[33,183959,183960],{"class":35,"line":187},[33,183961,183962],{"class":39},"# Generate a chart in memory\n",[33,183964,183965,183968,183970,183972,183974,183976,183978,183980,183982,183984],{"class":35,"line":201},[33,183966,183967],{"class":167},"fig, ax 
",[33,183969,242],{"class":163},[33,183971,61687],{"class":167},[33,183973,61690],{"class":238},[33,183975,242],{"class":163},[33,183977,602],{"class":167},[33,183979,2681],{"class":50},[33,183981,365],{"class":167},[33,183983,10258],{"class":50},[33,183985,371],{"class":167},[33,183987,183988,183991,183993,183995,183997,183999,184001,184003,184005,184007,184009,184011,184013],{"class":35,"line":206},[33,183989,183990],{"class":167},"ax.bar([",[33,183992,11790],{"class":54},[33,183994,365],{"class":167},[33,183996,11795],{"class":54},[33,183998,365],{"class":167},[33,184000,11800],{"class":54},[33,184002,51701],{"class":167},[33,184004,2589],{"class":50},[33,184006,365],{"class":167},[33,184008,2648],{"class":50},[33,184010,365],{"class":167},[33,184012,82416],{"class":50},[33,184014,751],{"class":167},[33,184016,184017,184020,184023],{"class":35,"line":224},[33,184018,184019],{"class":167},"ax.set_title(",[33,184021,184022],{"class":54},"\"Monthly Units\"",[33,184024,221],{"class":167},[33,184026,184027,184029,184031],{"class":35,"line":229},[33,184028,174862],{"class":167},[33,184030,242],{"class":163},[33,184032,61918],{"class":167},[33,184034,184035,184038,184040,184042,184044,184046,184048,184050,184052,184054,184057,184059,184062],{"class":35,"line":235},[33,184036,184037],{"class":167},"fig.savefig(buf, 
",[33,184039,61926],{"class":238},[33,184041,242],{"class":163},[33,184043,61931],{"class":54},[33,184045,365],{"class":167},[33,184047,46966],{"class":238},[33,184049,242],{"class":163},[33,184051,2635],{"class":50},[33,184053,365],{"class":167},[33,184055,184056],{"class":238},"bbox_inches",[33,184058,242],{"class":163},[33,184060,184061],{"class":54},"\"tight\"",[33,184063,221],{"class":167},[33,184065,184066],{"class":35,"line":250},[33,184067,184068],{"class":167},"plt.close(fig)\n",[33,184070,184071,184074,184076],{"class":35,"line":266},[33,184072,184073],{"class":167},"buf.seek(",[33,184075,748],{"class":50},[33,184077,221],{"class":167},[33,184079,184080],{"class":35,"line":290},[33,184081,92],{"emptyLinePlaceholder":91},[33,184083,184084,184086,184088],{"class":35,"line":295},[33,184085,156566],{"class":167},[33,184087,242],{"class":163},[33,184089,18229],{"class":167},[33,184091,184092,184094,184097,184099,184101,184103,184105],{"class":35,"line":300},[33,184093,156723],{"class":167},[33,184095,184096],{"class":54},"\"Sales Report\"",[33,184098,365],{"class":167},[33,184100,18267],{"class":238},[33,184102,242],{"class":163},[33,184104,734],{"class":50},[33,184106,221],{"class":167},[33,184108,184109,184111],{"class":35,"line":317},[33,184110,35574],{"class":163},[33,184112,574],{"class":167},[33,184114,184115,184118,184120,184122,184124,184127],{"class":35,"line":332},[33,184116,184117],{"class":167},"    doc.add_picture(buf, 
",[33,184119,56684],{"class":238},[33,184121,242],{"class":163},[33,184123,179507],{"class":167},[33,184125,184126],{"class":50},"5.0",[33,184128,371],{"class":167},[33,184130,184131,184133,184135,184137,184139],{"class":35,"line":347},[33,184132,85716],{"class":167},[33,184134,1053],{"class":50},[33,184136,602],{"class":167},[33,184138,96935],{"class":50},[33,184140,371],{"class":167},[33,184142,184143,184145,184147,184149,184151,184153,184155],{"class":35,"line":374},[33,184144,7268],{"class":50},[33,184146,602],{"class":167},[33,184148,4059],{"class":163},[33,184150,97737],{"class":54},[33,184152,97684],{"class":50},[33,184154,274],{"class":54},[33,184156,221],{"class":167},[33,184158,184159,184161,184163,184165],{"class":35,"line":397},[33,184160,35726],{"class":163},[33,184162,783],{"class":50},[33,184164,1852],{"class":163},[33,184166,7583],{"class":167},[33,184168,184169,184171,184173,184175,184178,184180,184182,184184,184186],{"class":35,"line":653},[33,184170,7268],{"class":50},[33,184172,602],{"class":167},[33,184174,4059],{"class":163},[33,184176,184177],{"class":54},"\"Failed to insert chart: ",[33,184179,1115],{"class":50},[33,184181,7602],{"class":167},[33,184183,1121],{"class":50},[33,184185,274],{"class":54},[33,184187,221],{"class":167},[14,184189,184190,184192,184193,184195],{},[30,184191,181215],{}," is mandatory before passing to ",[30,184194,179373],{}," — otherwise the read position is at the end of the buffer and python-docx reads zero bytes.",[18,184197,184199],{"id":184198},"_9-batch-logoimage-insertion-from-a-folder","9. Batch Logo\u002FImage Insertion from a Folder",[14,184201,184202],{},"A common pattern: each row in a dataset maps to a product image file. 
Iterate the records, resolve the image path, fall back to a placeholder if the file is missing.",[23,184204,184206],{"className":126,"code":184205,"language":47,"meta":28,"style":28},"# pip install python-docx pandas\nimport pandas as pd\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\n\nIMAGES_DIR = Path(\"assets\u002Fproducts\")\nDATA_FILE = Path(\"data\u002Fproducts.csv\")\nOUTPUT_DIR = Path(\"output\u002Fproduct_sheets\")\nPLACEHOLDER = Path(\"assets\u002Fplaceholder.png\")\nOUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n\ntry:\n    df = pd.read_csv(DATA_FILE)\nexcept FileNotFoundError:\n    raise SystemExit(f\"Data file not found: {DATA_FILE}\")\n\nfor _, row in df.iterrows():\n    doc = Document()\n    doc.add_heading(str(row.get(\"product_name\", \"Product\")), level=1)\n    doc.add_paragraph(str(row.get(\"description\", \"\")))\n\n    img_path = IMAGES_DIR \u002F f\"{row['sku']}.png\"\n    if not img_path.exists():\n        img_path = PLACEHOLDER\n\n    try:\n        doc.add_picture(str(img_path), width=Inches(3.0))\n    except Exception as e:\n        doc.add_paragraph(f\"[Image unavailable: {e}]\")\n\n    out_file = OUTPUT_DIR \u002F f\"{row['sku']}.docx\"\n    try:\n        doc.save(str(out_file))\n    except Exception as e:\n        print(f\"Could not save {out_file}: {e}\")\n\nprint(f\"Generated {len(df)} product sheets in 
{OUTPUT_DIR}\")\n",[30,184207,184208,184212,184222,184232,184242,184252,184256,184270,184284,184297,184311,184333,184337,184343,184355,184363,184382,184386,184396,184404,184430,184446,184450,184480,184489,184499,184503,184509,184528,184538,184558,184562,184589,184595,184604,184614,184644,184648],{"__ignoreMap":28},[33,184209,184210],{"class":35,"line":36},[33,184211,160200],{"class":39},[33,184213,184214,184216,184218,184220],{"class":35,"line":43},[33,184215,164],{"class":163},[33,184217,492],{"class":167},[33,184219,495],{"class":163},[33,184221,498],{"class":167},[33,184223,184224,184226,184228,184230],{"class":35,"line":61},[33,184225,190],{"class":163},[33,184227,193],{"class":167},[33,184229,164],{"class":163},[33,184231,198],{"class":167},[33,184233,184234,184236,184238,184240],{"class":35,"line":73},[33,184235,190],{"class":163},[33,184237,18092],{"class":167},[33,184239,164],{"class":163},[33,184241,18097],{"class":167},[33,184243,184244,184246,184248,184250],{"class":35,"line":88},[33,184245,190],{"class":163},[33,184247,18104],{"class":167},[33,184249,164],{"class":163},[33,184251,157048],{"class":167},[33,184253,184254],{"class":35,"line":95},[33,184255,92],{"emptyLinePlaceholder":91},[33,184257,184258,184261,184263,184265,184268],{"class":35,"line":101},[33,184259,184260],{"class":50},"IMAGES_DIR",[33,184262,212],{"class":163},[33,184264,215],{"class":167},[33,184266,184267],{"class":54},"\"assets\u002Fproducts\"",[33,184269,221],{"class":167},[33,184271,184272,184275,184277,184279,184282],{"class":35,"line":171},[33,184273,184274],{"class":50},"DATA_FILE",[33,184276,212],{"class":163},[33,184278,215],{"class":167},[33,184280,184281],{"class":54},"\"data\u002Fproducts.csv\"",[33,184283,221],{"class":167},[33,184285,184286,184288,184290,184292,184295],{"class":35,"line":179},[33,184287,4615],{"class":50},[33,184289,212],{"class":163},[33,184291,215],{"class":167},[33,184293,184294],{"class":54},"\"output\u002Fproduct_sheets\"",[33,184296,221],{"class":16
7},[33,184298,184299,184302,184304,184306,184309],{"class":35,"line":187},[33,184300,184301],{"class":50},"PLACEHOLDER",[33,184303,212],{"class":163},[33,184305,215],{"class":167},[33,184307,184308],{"class":54},"\"assets\u002Fplaceholder.png\"",[33,184310,221],{"class":167},[33,184312,184313,184315,184317,184319,184321,184323,184325,184327,184329,184331],{"class":35,"line":201},[33,184314,4615],{"class":50},[33,184316,1078],{"class":167},[33,184318,869],{"class":238},[33,184320,242],{"class":163},[33,184322,855],{"class":50},[33,184324,365],{"class":167},[33,184326,878],{"class":238},[33,184328,242],{"class":163},[33,184330,855],{"class":50},[33,184332,221],{"class":167},[33,184334,184335],{"class":35,"line":206},[33,184336,92],{"emptyLinePlaceholder":91},[33,184338,184339,184341],{"class":35,"line":224},[33,184340,35574],{"class":163},[33,184342,574],{"class":167},[33,184344,184345,184347,184349,184351,184353],{"class":35,"line":229},[33,184346,4025],{"class":167},[33,184348,242],{"class":163},[33,184350,9481],{"class":167},[33,184352,184274],{"class":50},[33,184354,221],{"class":167},[33,184356,184357,184359,184361],{"class":35,"line":235},[33,184358,35726],{"class":163},[33,184360,2945],{"class":50},[33,184362,574],{"class":167},[33,184364,184365,184367,184369,184371,184373,184375,184378,184380],{"class":35,"line":250},[33,184366,35742],{"class":163},[33,184368,16617],{"class":50},[33,184370,602],{"class":167},[33,184372,4059],{"class":163},[33,184374,59825],{"class":54},[33,184376,184377],{"class":50},"{DATA_FILE}",[33,184379,274],{"class":54},[33,184381,221],{"class":167},[33,184383,184384],{"class":35,"line":266},[33,184385,92],{"emptyLinePlaceholder":91},[33,184387,184388,184390,184392,184394],{"class":35,"line":290},[33,184389,6124],{"class":163},[33,184391,8560],{"class":167},[33,184393,662],{"class":163},[33,184395,8565],{"class":167},[33,184397,184398,184400,184402],{"class":35,"line":295},[33,184399,18224],{"class":167},[33,184401,242],{"class":163},[33
,184403,18229],{"class":167},[33,184405,184406,184408,184410,184413,184416,184418,184420,184422,184424,184426,184428],{"class":35,"line":300},[33,184407,18591],{"class":167},[33,184409,1053],{"class":50},[33,184411,184412],{"class":167},"(row.get(",[33,184414,184415],{"class":54},"\"product_name\"",[33,184417,365],{"class":167},[33,184419,19580],{"class":54},[33,184421,77348],{"class":167},[33,184423,18267],{"class":238},[33,184425,242],{"class":163},[33,184427,734],{"class":50},[33,184429,221],{"class":167},[33,184431,184432,184434,184436,184438,184440,184442,184444],{"class":35,"line":317},[33,184433,28414],{"class":167},[33,184435,1053],{"class":50},[33,184437,184412],{"class":167},[33,184439,171073],{"class":54},[33,184441,365],{"class":167},[33,184443,3198],{"class":54},[33,184445,23269],{"class":167},[33,184447,184448],{"class":35,"line":332},[33,184449,92],{"emptyLinePlaceholder":91},[33,184451,184452,184455,184457,184460,184462,184464,184466,184468,184470,184473,184475,184477],{"class":35,"line":347},[33,184453,184454],{"class":167},"    img_path ",[33,184456,242],{"class":163},[33,184458,184459],{"class":50}," IMAGES_DIR",[33,184461,1107],{"class":163},[33,184463,1110],{"class":163},[33,184465,274],{"class":54},[33,184467,1115],{"class":50},[33,184469,18825],{"class":167},[33,184471,184472],{"class":54},"'sku'",[33,184474,9546],{"class":167},[33,184476,1121],{"class":50},[33,184478,184479],{"class":54},".png\"\n",[33,184481,184482,184484,184486],{"class":35,"line":374},[33,184483,617],{"class":163},[33,184485,620],{"class":163},[33,184487,184488],{"class":167}," img_path.exists():\n",[33,184490,184491,184494,184496],{"class":35,"line":397},[33,184492,184493],{"class":167},"        img_path ",[33,184495,242],{"class":163},[33,184497,184498],{"class":50}," 
PLACEHOLDER\n",[33,184500,184501],{"class":35,"line":653},[33,184502,92],{"emptyLinePlaceholder":91},[33,184504,184505,184507],{"class":35,"line":667},[33,184506,2424],{"class":163},[33,184508,574],{"class":167},[33,184510,184511,184514,184516,184518,184520,184522,184524,184526],{"class":35,"line":675},[33,184512,184513],{"class":167},"        doc.add_picture(",[33,184515,1053],{"class":50},[33,184517,180729],{"class":167},[33,184519,56684],{"class":238},[33,184521,242],{"class":163},[33,184523,179507],{"class":167},[33,184525,179510],{"class":50},[33,184527,371],{"class":167},[33,184529,184530,184532,184534,184536],{"class":35,"line":689},[33,184531,2449],{"class":163},[33,184533,783],{"class":50},[33,184535,1852],{"class":163},[33,184537,7583],{"class":167},[33,184539,184540,184542,184544,184547,184549,184551,184553,184556],{"class":35,"line":703},[33,184541,160511],{"class":167},[33,184543,4059],{"class":163},[33,184545,184546],{"class":54},"\"[Image unavailable: ",[33,184548,1115],{"class":50},[33,184550,7602],{"class":167},[33,184552,1121],{"class":50},[33,184554,184555],{"class":54},"]\"",[33,184557,221],{"class":167},[33,184559,184560],{"class":35,"line":714},[33,184561,92],{"emptyLinePlaceholder":91},[33,184563,184564,184567,184569,184571,184573,184575,184577,184579,184581,184583,184585,184587],{"class":35,"line":723},[33,184565,184566],{"class":167},"    out_file 
",[33,184568,242],{"class":163},[33,184570,50349],{"class":50},[33,184572,1107],{"class":163},[33,184574,1110],{"class":163},[33,184576,274],{"class":54},[33,184578,1115],{"class":50},[33,184580,18825],{"class":167},[33,184582,184472],{"class":54},[33,184584,9546],{"class":167},[33,184586,1121],{"class":50},[33,184588,18215],{"class":54},[33,184590,184591,184593],{"class":35,"line":754},[33,184592,2424],{"class":163},[33,184594,574],{"class":167},[33,184596,184597,184599,184601],{"class":35,"line":771},[33,184598,84067],{"class":167},[33,184600,1053],{"class":50},[33,184602,184603],{"class":167},"(out_file))\n",[33,184605,184606,184608,184610,184612],{"class":35,"line":777},[33,184607,2449],{"class":163},[33,184609,783],{"class":50},[33,184611,1852],{"class":163},[33,184613,7583],{"class":167},[33,184615,184616,184618,184620,184622,184625,184627,184630,184632,184634,184636,184638,184640,184642],{"class":35,"line":788},[33,184617,9414],{"class":50},[33,184619,602],{"class":167},[33,184621,4059],{"class":163},[33,184623,184624],{"class":54},"\"Could not save ",[33,184626,1115],{"class":50},[33,184628,184629],{"class":167},"out_file",[33,184631,1121],{"class":50},[33,184633,2079],{"class":54},[33,184635,1115],{"class":50},[33,184637,7602],{"class":167},[33,184639,1121],{"class":50},[33,184641,274],{"class":54},[33,184643,221],{"class":167},[33,184645,184646],{"class":35,"line":804},[33,184647,92],{"emptyLinePlaceholder":91},[33,184649,184650,184652,184654,184656,184658,184660,184662,184664,184667,184670,184672],{"class":35,"line":809},[33,184651,13474],{"class":50},[33,184653,602],{"class":167},[33,184655,4059],{"class":163},[33,184657,57132],{"class":54},[33,184659,4065],{"class":50},[33,184661,4068],{"class":167},[33,184663,1121],{"class":50},[33,184665,184666],{"class":54}," product sheets in 
",[33,184668,184669],{"class":50},"{OUTPUT_DIR}",[33,184671,274],{"class":54},[33,184673,221],{"class":167},[18,184675,12944],{"id":12943},[424,184677,184679],{"id":184678},"high-dpi-images-from-design-tools","High-DPI Images from Design Tools",[14,184681,184682,184683,184685,184686,184688],{},"Designers export assets at 300 DPI for print. At native size, a 1200 x 900 px \u002F 300 DPI image is 4 x 3 inches — fine for full-page layouts but too large for a logo slot. Always pass an explicit ",[30,184684,56684],{}," argument. See ",[940,184687,179035],{"href":182040}," for a helper that calculates the scaled size automatically.",[424,184690,184692],{"id":184691},"inserting-charts-generated-by-openpyxl","Inserting Charts Generated by openpyxl",[14,184694,184695,184696,184698,184699,184701],{},"When your pipeline uses ",[940,184697,22009],{"href":102073}," to generate Excel charts, export each chart as an image via ",[30,184700,183876],{}," or save the entire sheet view as PNG, then insert via BytesIO as shown in section 8.",[424,184703,184705],{"id":184704},"transparent-png-backgrounds","Transparent PNG Backgrounds",[14,184707,184708],{},"python-docx preserves PNG transparency. The image will display with a transparent background in Word's default white document view. 
On colored section backgrounds, this renders correctly — no pre-flattening required.",[18,184710,52030],{"id":52029},[14,184712,184713],{},"After generating, confirm the image dimensions are within expected ranges:",[23,184715,184717],{"className":126,"code":184716,"language":47,"meta":28,"style":28},"# pip install python-docx\nfrom pathlib import Path\nfrom docx import Document\nfrom docx.shared import Inches\n\nGENERATED = Path(\"output\u002Fbasic_image.docx\")\n\ntry:\n    doc = Document(str(GENERATED))\n    shapes = doc.inline_shapes\n    print(f\"Inline shapes found: {len(shapes)}\")\n    for i, shape in enumerate(shapes):\n        w_in = shape.width \u002F 914400  # EMU to inches\n        h_in = shape.height \u002F 914400\n        print(f\"  Shape {i}: {w_in:.2f} x {h_in:.2f} inches\")\n        assert w_in \u003C= 8.5, f\"Shape {i} wider than letter page\"\nexcept FileNotFoundError:\n    print(f\"Document not found: {GENERATED}\")\nexcept AssertionError as e:\n    print(f\"Validation failed: 
{e}\")\n",[30,184718,184719,184723,184733,184743,184753,184757,184770,184774,184780,184796,184806,184828,184841,184856,184868,184909,184934,184942,184960,184970],{"__ignoreMap":28},[33,184720,184721],{"class":35,"line":36},[33,184722,156213],{"class":39},[33,184724,184725,184727,184729,184731],{"class":35,"line":43},[33,184726,190],{"class":163},[33,184728,193],{"class":167},[33,184730,164],{"class":163},[33,184732,198],{"class":167},[33,184734,184735,184737,184739,184741],{"class":35,"line":61},[33,184736,190],{"class":163},[33,184738,18092],{"class":167},[33,184740,164],{"class":163},[33,184742,18097],{"class":167},[33,184744,184745,184747,184749,184751],{"class":35,"line":73},[33,184746,190],{"class":163},[33,184748,18104],{"class":167},[33,184750,164],{"class":163},[33,184752,157048],{"class":167},[33,184754,184755],{"class":35,"line":88},[33,184756,92],{"emptyLinePlaceholder":91},[33,184758,184759,184762,184764,184766,184768],{"class":35,"line":95},[33,184760,184761],{"class":50},"GENERATED",[33,184763,212],{"class":163},[33,184765,215],{"class":167},[33,184767,182122],{"class":54},[33,184769,221],{"class":167},[33,184771,184772],{"class":35,"line":101},[33,184773,92],{"emptyLinePlaceholder":91},[33,184775,184776,184778],{"class":35,"line":171},[33,184777,35574],{"class":163},[33,184779,574],{"class":167},[33,184781,184782,184784,184786,184788,184790,184792,184794],{"class":35,"line":179},[33,184783,18224],{"class":167},[33,184785,242],{"class":163},[33,184787,156340],{"class":167},[33,184789,1053],{"class":50},[33,184791,602],{"class":167},[33,184793,184761],{"class":50},[33,184795,371],{"class":167},[33,184797,184798,184801,184803],{"class":35,"line":187},[33,184799,184800],{"class":167},"    shapes ",[33,184802,242],{"class":163},[33,184804,184805],{"class":167}," 
doc.inline_shapes\n",[33,184807,184808,184810,184812,184814,184817,184819,184822,184824,184826],{"class":35,"line":201},[33,184809,7268],{"class":50},[33,184811,602],{"class":167},[33,184813,4059],{"class":163},[33,184815,184816],{"class":54},"\"Inline shapes found: ",[33,184818,4065],{"class":50},[33,184820,184821],{"class":167},"(shapes)",[33,184823,1121],{"class":50},[33,184825,274],{"class":54},[33,184827,221],{"class":167},[33,184829,184830,184832,184834,184836,184838],{"class":35,"line":206},[33,184831,656],{"class":163},[33,184833,181362],{"class":167},[33,184835,662],{"class":163},[33,184837,7403],{"class":50},[33,184839,184840],{"class":167},"(shapes):\n",[33,184842,184843,184845,184847,184849,184851,184853],{"class":35,"line":224},[33,184844,181374],{"class":167},[33,184846,242],{"class":163},[33,184848,181379],{"class":167},[33,184850,1351],{"class":163},[33,184852,179834],{"class":50},[33,184854,184855],{"class":39},"  # EMU to inches\n",[33,184857,184858,184860,184862,184864,184866],{"class":35,"line":229},[33,184859,181388],{"class":167},[33,184861,242],{"class":163},[33,184863,181393],{"class":167},[33,184865,1351],{"class":163},[33,184867,181351],{"class":50},[33,184869,184870,184872,184874,184876,184879,184881,184883,184885,184887,184889,184891,184893,184895,184897,184899,184901,184903,184905,184907],{"class":35,"line":235},[33,184871,9414],{"class":50},[33,184873,602],{"class":167},[33,184875,4059],{"class":163},[33,184877,184878],{"class":54},"\"  Shape 
",[33,184880,1115],{"class":50},[33,184882,7499],{"class":167},[33,184884,1121],{"class":50},[33,184886,2079],{"class":54},[33,184888,1115],{"class":50},[33,184890,181445],{"class":167},[33,184892,55819],{"class":163},[33,184894,1121],{"class":50},[33,184896,179197],{"class":54},[33,184898,1115],{"class":50},[33,184900,181456],{"class":167},[33,184902,55819],{"class":163},[33,184904,1121],{"class":50},[33,184906,179293],{"class":54},[33,184908,221],{"class":167},[33,184910,184911,184913,184915,184917,184919,184921,184923,184925,184927,184929,184931],{"class":35,"line":250},[33,184912,21485],{"class":163},[33,184914,181411],{"class":167},[33,184916,44223],{"class":163},[33,184918,181296],{"class":50},[33,184920,365],{"class":167},[33,184922,4059],{"class":163},[33,184924,181432],{"class":54},[33,184926,1115],{"class":50},[33,184928,7499],{"class":167},[33,184930,1121],{"class":50},[33,184932,184933],{"class":54}," wider than letter page\"\n",[33,184935,184936,184938,184940],{"class":35,"line":266},[33,184937,35726],{"class":163},[33,184939,2945],{"class":50},[33,184941,574],{"class":167},[33,184943,184944,184946,184948,184950,184953,184956,184958],{"class":35,"line":290},[33,184945,7268],{"class":50},[33,184947,602],{"class":167},[33,184949,4059],{"class":163},[33,184951,184952],{"class":54},"\"Document not found: 
",[33,184954,184955],{"class":50},"{GENERATED}",[33,184957,274],{"class":54},[33,184959,221],{"class":167},[33,184961,184962,184964,184966,184968],{"class":35,"line":295},[33,184963,35726],{"class":163},[33,184965,9445],{"class":50},[33,184967,1852],{"class":163},[33,184969,7583],{"class":167},[33,184971,184972,184974,184976,184978,184980,184982,184984,184986,184988],{"class":35,"line":300},[33,184973,7268],{"class":50},[33,184975,602],{"class":167},[33,184977,4059],{"class":163},[33,184979,124100],{"class":54},[33,184981,1115],{"class":50},[33,184983,7602],{"class":167},[33,184985,1121],{"class":50},[33,184987,274],{"class":54},[33,184989,221],{"class":167},[14,184991,184992,184993,10065,184996,184999],{},"One EMU = 1\u002F914400 of an inch. ",[30,184994,184995],{},"shape.width",[30,184997,184998],{},"shape.height"," are always in EMU.",[18,185001,175155],{"id":175154},[4211,185003,185004,185016,185022],{},[4214,185005,185006,185008,185009,185011,185012,185015],{},[30,185007,179373],{}," reads the entire image file each call. For batch runs inserting the same logo in thousands of documents, read the image bytes once into a ",[30,185010,61504],{},", then ",[30,185013,185014],{},"seek(0)"," before each insertion — this avoids repeated disk I\u002FO.",[4214,185017,185018,185019,185021],{},"python-docx holds the entire document XML in memory. For large batches, instantiate a new ",[30,185020,156261],{}," per output file rather than reusing a single instance; accumulated runs fragment the paragraph tree.",[4214,185023,185024,185025,185027],{},"Very large images (>10 MB) embedded in ",[30,185026,18051],{}," files bloat file sizes significantly. 
Resize to the intended display size using Pillow before insertion — aim for ~150 DPI at the intended display dimensions.",[18,185029,4271],{"id":4270},[4273,185031,185032,185042],{},[4276,185033,185034],{},[4279,185035,185036,185038,185040],{},[4282,185037,14317],{},[4282,185039,4287],{},[4282,185041,4290],{},[4292,185043,185044,185069,185084,185104,185123],{},[4279,185045,185046,185053,185056],{},[4297,185047,185048,42706,185051],{},[30,185049,185050],{},"PackageNotFoundError",[30,185052,179373],{},[4297,185054,185055],{},"Path string is wrong or file does not exist",[4297,185057,17059,185058,185061,185062,185065,185066,185068],{},[30,185059,185060],{},"Path.exists()"," before calling; pass ",[30,185063,185064],{},"str(path)"," not a ",[30,185067,35779],{}," object on older python-docx versions",[4279,185070,185071,185074,185077],{},[4297,185072,185073],{},"Image appears as a broken icon in Word",[4297,185075,185076],{},"BytesIO position not reset to 0",[4297,185078,74566,185079,185081,185082],{},[30,185080,181215],{}," immediately before ",[30,185083,181219],{},[4279,185085,185086,185089,185094],{},[4297,185087,185088],{},"Image fills entire page \u002F is huge",[4297,185090,124273,185091,185093],{},[30,185092,56684],{}," argument passed; python-docx used native EMU size",[4297,185095,138773,185096,2012,185098,10073,185101],{},[30,185097,181168],{},[30,185099,185100],{},"width=usable_width",[940,185102,185103],{"href":182040},"fix guide",[4279,185105,185106,185114,185117],{},[4297,185107,185108,42706,185111],{},[30,185109,185110],{},"KeyError: 'Caption'",[30,185112,185113],{},"add_paragraph(style=\"Caption\")",[4297,185115,185116],{},"Template lacks the Caption style",[4297,185118,185119,185120,185122],{},"Use a standard ",[30,185121,156261],{}," baseline, or add the style manually before use",[4279,185124,185125,185128,185134],{},[4297,185126,185127],{},"Image inserted in wrong location",[4297,185129,185130,185133],{},[30,185131,185132],{},"doc.add_picture()"," 
always appends to the document body",[4297,185135,185136,185137,185140],{},"To insert mid-document, add a paragraph at the target position and call ",[30,185138,185139],{},"run.add_picture()"," on that paragraph's run",[18,185142,4402],{"id":4401},[23,185144,185146],{"className":126,"code":185145,"language":47,"meta":28,"style":28},"#!\u002Fusr\u002Fbin\u002Fenv python3\n# pip install python-docx Pillow pandas\n\"\"\"\nbatch_image_report.py — insert per-row images from a CSV manifest\ninto individual .docx files, fitting each image to the usable page width.\n\nUsage: python batch_image_report.py --data products.csv --images assets\u002F --out output\u002F\n\"\"\"\nimport argparse\nimport io\nfrom pathlib import Path\n\nfrom docx import Document\nfrom docx.shared import Inches\nfrom docx.enum.text import WD_ALIGN_PARAGRAPH\nimport pandas as pd\nfrom PIL import Image as PILImage\n\n\ndef fit_width(img_path: Path, max_width_emu: int) -> int:\n    \"\"\"Return a width in EMU that fits within max_width_emu, preserving aspect ratio.\"\"\"\n    try:\n        with PILImage.open(img_path) as img:\n            w, h = img.size\n            dpi = img.info.get(\"dpi\", (96, 96))\n            native_emu = int(w \u002F dpi[0] * 914400)\n            return min(native_emu, max_width_emu)\n    except Exception:\n        return min(int(Inches(4)), max_width_emu)\n\n\ndef make_report(row: dict, images_dir: Path, output_path: Path, placeholder: Path) -> None:\n    doc = Document()\n    section = doc.sections[0]\n    usable = section.page_width - section.left_margin - section.right_margin\n\n    doc.add_heading(str(row.get(\"name\", \"Report\")), level=1)\n\n    for key in (\"description\", \"notes\"):\n        if row.get(key):\n            doc.add_paragraph(str(row[key]))\n\n    img_path = images_dir \u002F f\"{row.get('sku', 'unknown')}.png\"\n    if not img_path.exists():\n        img_path = placeholder\n\n    try:\n        target_width = fit_width(img_path, usable)\n        
doc.add_picture(str(img_path), width=target_width)\n        caption = doc.add_paragraph(f\"Figure — {row.get('name', '')}\", style=\"Caption\")\n        caption.alignment = WD_ALIGN_PARAGRAPH.CENTER\n    except Exception as e:\n        doc.add_paragraph(f\"[Image error: {e}]\")\n\n    doc.save(str(output_path))\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Batch image report generator\")\n    parser.add_argument(\"--data\", required=True, help=\"Path to CSV file\")\n    parser.add_argument(\"--images\", required=True, help=\"Directory containing SKU PNG files\")\n    parser.add_argument(\"--out\", required=True, help=\"Output directory\")\n    parser.add_argument(\"--placeholder\", default=\"assets\u002Fplaceholder.png\")\n    args = parser.parse_args()\n\n    data_path = Path(args.data)\n    images_dir = Path(args.images)\n    output_dir = Path(args.out)\n    placeholder = Path(args.placeholder)\n    output_dir.mkdir(parents=True, exist_ok=True)\n\n    try:\n        df = pd.read_csv(data_path)\n    except FileNotFoundError:\n        raise SystemExit(f\"Data file not found: {data_path}\")\n\n    ok, fail = 0, 0\n    for _, row in df.iterrows():\n        out_file = output_dir \u002F f\"{row.get('sku', f'row_{_}')}.docx\"\n        try:\n            make_report(row.to_dict(), images_dir, out_file, placeholder)\n            ok += 1\n        except Exception as e:\n            print(f\"FAIL {out_file.name}: {e}\")\n            fail += 1\n\n    print(f\"Done: {ok} generated, {fail} failed → {output_dir}\")\n\n\nif __name__ == \"__main__\":\n    
main()\n",[30,185147,185148,185152,185157,185161,185166,185171,185175,185180,185184,185190,185196,185206,185210,185220,185230,185240,185250,185266,185270,185274,185292,185297,185303,185314,185322,185342,185368,185377,185385,185403,185407,185411,185429,185437,185449,185466,185470,185494,185498,185517,185524,185534,185538,185570,185578,185587,185591,185597,185606,185621,185663,185676,185686,185705,185709,185717,185721,185725,185737,185754,185779,185805,185829,185846,185854,185858,185866,185876,185884,185894,185914,185918,185924,185932,185940,185963,185967,185980,185990,186032,186038,186043,186051,186061,186091,186099,186103,186141,186145,186149,186161],{"__ignoreMap":28},[33,185149,185150],{"class":35,"line":36},[33,185151,14447],{"class":39},[33,185153,185154],{"class":35,"line":43},[33,185155,185156],{"class":39},"# pip install python-docx Pillow pandas\n",[33,185158,185159],{"class":35,"line":61},[33,185160,139],{"class":54},[33,185162,185163],{"class":35,"line":73},[33,185164,185165],{"class":54},"batch_image_report.py — insert per-row images from a CSV manifest\n",[33,185167,185168],{"class":35,"line":88},[33,185169,185170],{"class":54},"into individual .docx files, fitting each image to the usable page width.\n",[33,185172,185173],{"class":35,"line":95},[33,185174,92],{"emptyLinePlaceholder":91},[33,185176,185177],{"class":35,"line":101},[33,185178,185179],{"class":54},"Usage: python batch_image_report.py --data products.csv --images assets\u002F --out 
output\u002F\n",[33,185181,185182],{"class":35,"line":171},[33,185183,139],{"class":54},[33,185185,185186,185188],{"class":35,"line":179},[33,185187,164],{"class":163},[33,185189,4461],{"class":167},[33,185191,185192,185194],{"class":35,"line":187},[33,185193,164],{"class":163},[33,185195,60058],{"class":167},[33,185197,185198,185200,185202,185204],{"class":35,"line":201},[33,185199,190],{"class":163},[33,185201,193],{"class":167},[33,185203,164],{"class":163},[33,185205,198],{"class":167},[33,185207,185208],{"class":35,"line":206},[33,185209,92],{"emptyLinePlaceholder":91},[33,185211,185212,185214,185216,185218],{"class":35,"line":224},[33,185213,190],{"class":163},[33,185215,18092],{"class":167},[33,185217,164],{"class":163},[33,185219,18097],{"class":167},[33,185221,185222,185224,185226,185228],{"class":35,"line":229},[33,185223,190],{"class":163},[33,185225,18104],{"class":167},[33,185227,164],{"class":163},[33,185229,157048],{"class":167},[33,185231,185232,185234,185236,185238],{"class":35,"line":235},[33,185233,190],{"class":163},[33,185235,18116],{"class":167},[33,185237,164],{"class":163},[33,185239,18121],{"class":50},[33,185241,185242,185244,185246,185248],{"class":35,"line":250},[33,185243,164],{"class":163},[33,185245,492],{"class":167},[33,185247,495],{"class":163},[33,185249,498],{"class":167},[33,185251,185252,185254,185256,185258,185261,185263],{"class":35,"line":266},[33,185253,190],{"class":163},[33,185255,46889],{"class":50},[33,185257,46892],{"class":163},[33,185259,185260],{"class":167}," Image ",[33,185262,495],{"class":163},[33,185264,185265],{"class":167}," PILImage\n",[33,185267,185268],{"class":35,"line":290},[33,185269,92],{"emptyLinePlaceholder":91},[33,185271,185272],{"class":35,"line":295},[33,185273,92],{"emptyLinePlaceholder":91},[33,185275,185276,185278,185281,185284,185286,185288,185290],{"class":35,"line":300},[33,185277,562],{"class":163},[33,185279,185280],{"class":46}," fit_width",[33,185282,185283],{"class":167},"(img_path: 
Path, max_width_emu: ",[33,185285,1059],{"class":50},[33,185287,1617],{"class":167},[33,185289,1059],{"class":50},[33,185291,574],{"class":167},[33,185293,185294],{"class":35,"line":317},[33,185295,185296],{"class":54},"    \"\"\"Return a width in EMU that fits within max_width_emu, preserving aspect ratio.\"\"\"\n",[33,185298,185299,185301],{"class":35,"line":332},[33,185300,2424],{"class":163},[33,185302,574],{"class":167},[33,185304,185305,185307,185310,185312],{"class":35,"line":347},[33,185306,2191],{"class":163},[33,185308,185309],{"class":167}," PILImage.open(img_path) ",[33,185311,495],{"class":163},[33,185313,179143],{"class":167},[33,185315,185316,185318,185320],{"class":35,"line":374},[33,185317,180027],{"class":167},[33,185319,242],{"class":163},[33,185321,179153],{"class":167},[33,185323,185324,185326,185328,185330,185332,185334,185336,185338,185340],{"class":35,"line":397},[33,185325,180036],{"class":167},[33,185327,242],{"class":163},[33,185329,179163],{"class":167},[33,185331,179166],{"class":54},[33,185333,19953],{"class":167},[33,185335,38741],{"class":50},[33,185337,365],{"class":167},[33,185339,38741],{"class":50},[33,185341,371],{"class":167},[33,185343,185344,185347,185349,185351,185354,185356,185358,185360,185362,185364,185366],{"class":35,"line":653},[33,185345,185346],{"class":167},"            native_emu ",[33,185348,242],{"class":163},[33,185350,3149],{"class":50},[33,185352,185353],{"class":167},"(w ",[33,185355,1351],{"class":163},[33,185357,180066],{"class":167},[33,185359,748],{"class":50},[33,185361,763],{"class":167},[33,185363,1769],{"class":163},[33,185365,179834],{"class":50},[33,185367,221],{"class":167},[33,185369,185370,185372,185374],{"class":35,"line":667},[33,185371,28782],{"class":163},[33,185373,73775],{"class":50},[33,185375,185376],{"class":167},"(native_emu, 
max_width_emu)\n",[33,185378,185379,185381,185383],{"class":35,"line":675},[33,185380,2449],{"class":163},[33,185382,783],{"class":50},[33,185384,574],{"class":167},[33,185386,185387,185389,185391,185393,185395,185398,185400],{"class":35,"line":689},[33,185388,1659],{"class":163},[33,185390,73775],{"class":50},[33,185392,602],{"class":167},[33,185394,1059],{"class":50},[33,185396,185397],{"class":167},"(Inches(",[33,185399,1503],{"class":50},[33,185401,185402],{"class":167},")), max_width_emu)\n",[33,185404,185405],{"class":35,"line":703},[33,185406,92],{"emptyLinePlaceholder":91},[33,185408,185409],{"class":35,"line":714},[33,185410,92],{"emptyLinePlaceholder":91},[33,185412,185413,185415,185418,185420,185422,185425,185427],{"class":35,"line":723},[33,185414,562],{"class":163},[33,185416,185417],{"class":46}," make_report",[33,185419,177319],{"class":167},[33,185421,37100],{"class":50},[33,185423,185424],{"class":167},", images_dir: Path, output_path: Path, placeholder: Path) -> ",[33,185426,571],{"class":50},[33,185428,574],{"class":167},[33,185430,185431,185433,185435],{"class":35,"line":754},[33,185432,18224],{"class":167},[33,185434,242],{"class":163},[33,185436,18229],{"class":167},[33,185438,185439,185441,185443,185445,185447],{"class":35,"line":771},[33,185440,160360],{"class":167},[33,185442,242],{"class":163},[33,185444,156589],{"class":167},[33,185446,748],{"class":50},[33,185448,9202],{"class":167},[33,185450,185451,185454,185456,185458,185460,185462,185464],{"class":35,"line":777},[33,185452,185453],{"class":167},"    usable 
",[33,185455,242],{"class":163},[33,185457,180248],{"class":167},[33,185459,4126],{"class":163},[33,185461,180253],{"class":167},[33,185463,4126],{"class":163},[33,185465,180258],{"class":167},[33,185467,185468],{"class":35,"line":788},[33,185469,92],{"emptyLinePlaceholder":91},[33,185471,185472,185474,185476,185478,185480,185482,185484,185486,185488,185490,185492],{"class":35,"line":804},[33,185473,18591],{"class":167},[33,185475,1053],{"class":50},[33,185477,184412],{"class":167},[33,185479,104775],{"class":54},[33,185481,365],{"class":167},[33,185483,160846],{"class":54},[33,185485,77348],{"class":167},[33,185487,18267],{"class":238},[33,185489,242],{"class":163},[33,185491,734],{"class":50},[33,185493,221],{"class":167},[33,185495,185496],{"class":35,"line":809},[33,185497,92],{"emptyLinePlaceholder":91},[33,185499,185500,185502,185505,185507,185509,185511,185513,185515],{"class":35,"line":819},[33,185501,656],{"class":163},[33,185503,185504],{"class":167}," key ",[33,185506,662],{"class":163},[33,185508,17583],{"class":167},[33,185510,171073],{"class":54},[33,185512,365],{"class":167},[33,185514,131398],{"class":54},[33,185516,1737],{"class":167},[33,185518,185519,185521],{"class":35,"line":829},[33,185520,8221],{"class":163},[33,185522,185523],{"class":167}," row.get(key):\n",[33,185525,185526,185529,185531],{"class":35,"line":834},[33,185527,185528],{"class":167},"            doc.add_paragraph(",[33,185530,1053],{"class":50},[33,185532,185533],{"class":167},"(row[key]))\n",[33,185535,185536],{"class":35,"line":839},[33,185537,92],{"emptyLinePlaceholder":91},[33,185539,185540,185542,185544,185547,185549,185551,185553,185555,185558,185560,185562,185564,185566,185568],{"class":35,"line":860},[33,185541,184454],{"class":167},[33,185543,242],{"class":163},[33,185545,185546],{"class":167}," images_dir 
",[33,185548,1351],{"class":163},[33,185550,1110],{"class":163},[33,185552,274],{"class":54},[33,185554,1115],{"class":50},[33,185556,185557],{"class":167},"row.get(",[33,185559,184472],{"class":54},[33,185561,365],{"class":167},[33,185563,93880],{"class":54},[33,185565,12027],{"class":167},[33,185567,1121],{"class":50},[33,185569,184479],{"class":54},[33,185571,185572,185574,185576],{"class":35,"line":887},[33,185573,617],{"class":163},[33,185575,620],{"class":163},[33,185577,184488],{"class":167},[33,185579,185580,185582,185584],{"class":35,"line":907},[33,185581,184493],{"class":167},[33,185583,242],{"class":163},[33,185585,185586],{"class":167}," placeholder\n",[33,185588,185589],{"class":35,"line":1826},[33,185590,92],{"emptyLinePlaceholder":91},[33,185592,185593,185595],{"class":35,"line":1844},[33,185594,2424],{"class":163},[33,185596,574],{"class":167},[33,185598,185599,185601,185603],{"class":35,"line":1858},[33,185600,180687],{"class":167},[33,185602,242],{"class":163},[33,185604,185605],{"class":167}," fit_width(img_path, usable)\n",[33,185607,185608,185610,185612,185614,185616,185618],{"class":35,"line":1871},[33,185609,184513],{"class":167},[33,185611,1053],{"class":50},[33,185613,180729],{"class":167},[33,185615,56684],{"class":238},[33,185617,242],{"class":163},[33,185619,185620],{"class":167},"target_width)\n",[33,185622,185623,185626,185628,185630,185632,185635,185637,185639,185642,185644,185647,185649,185651,185653,185655,185657,185659,185661],{"class":35,"line":1877},[33,185624,185625],{"class":167},"        caption ",[33,185627,242],{"class":163},[33,185629,156861],{"class":167},[33,185631,4059],{"class":163},[33,185633,185634],{"class":54},"\"Figure — 
",[33,185636,1115],{"class":50},[33,185638,185557],{"class":167},[33,185640,185641],{"class":54},"'name'",[33,185643,365],{"class":167},[33,185645,185646],{"class":54},"''",[33,185648,12027],{"class":167},[33,185650,1121],{"class":50},[33,185652,274],{"class":54},[33,185654,365],{"class":167},[33,185656,6953],{"class":238},[33,185658,242],{"class":163},[33,185660,183758],{"class":54},[33,185662,221],{"class":167},[33,185664,185665,185668,185670,185672,185674],{"class":35,"line":1883},[33,185666,185667],{"class":167},"        caption.alignment ",[33,185669,242],{"class":163},[33,185671,182934],{"class":50},[33,185673,3035],{"class":167},[33,185675,182939],{"class":50},[33,185677,185678,185680,185682,185684],{"class":35,"line":1915},[33,185679,2449],{"class":163},[33,185681,783],{"class":50},[33,185683,1852],{"class":163},[33,185685,7583],{"class":167},[33,185687,185688,185690,185692,185695,185697,185699,185701,185703],{"class":35,"line":1926},[33,185689,160511],{"class":167},[33,185691,4059],{"class":163},[33,185693,185694],{"class":54},"\"[Image error: 
",[33,185696,1115],{"class":50},[33,185698,7602],{"class":167},[33,185700,1121],{"class":50},[33,185702,184555],{"class":54},[33,185704,221],{"class":167},[33,185706,185707],{"class":35,"line":1932},[33,185708,92],{"emptyLinePlaceholder":91},[33,185710,185711,185713,185715],{"class":35,"line":1938},[33,185712,85716],{"class":167},[33,185714,1053],{"class":50},[33,185716,173383],{"class":167},[33,185718,185719],{"class":35,"line":1950},[33,185720,92],{"emptyLinePlaceholder":91},[33,185722,185723],{"class":35,"line":1958},[33,185724,92],{"emptyLinePlaceholder":91},[33,185726,185727,185729,185731,185733,185735],{"class":35,"line":4904},[33,185728,562],{"class":163},[33,185730,6636],{"class":46},[33,185732,568],{"class":167},[33,185734,571],{"class":50},[33,185736,574],{"class":167},[33,185738,185739,185741,185743,185745,185747,185749,185752],{"class":35,"line":4909},[33,185740,6648],{"class":167},[33,185742,242],{"class":163},[33,185744,6653],{"class":167},[33,185746,6656],{"class":238},[33,185748,242],{"class":163},[33,185750,185751],{"class":54},"\"Batch image report generator\"",[33,185753,221],{"class":167},[33,185755,185756,185758,185760,185762,185764,185766,185768,185770,185772,185774,185777],{"class":35,"line":4915},[33,185757,6669],{"class":167},[33,185759,64452],{"class":54},[33,185761,365],{"class":167},[33,185763,25448],{"class":238},[33,185765,242],{"class":163},[33,185767,855],{"class":50},[33,185769,365],{"class":167},[33,185771,25463],{"class":238},[33,185773,242],{"class":163},[33,185775,185776],{"class":54},"\"Path to CSV 
file\"",[33,185778,221],{"class":167},[33,185780,185781,185783,185786,185788,185790,185792,185794,185796,185798,185800,185803],{"class":35,"line":4925},[33,185782,6669],{"class":167},[33,185784,185785],{"class":54},"\"--images\"",[33,185787,365],{"class":167},[33,185789,25448],{"class":238},[33,185791,242],{"class":163},[33,185793,855],{"class":50},[33,185795,365],{"class":167},[33,185797,25463],{"class":238},[33,185799,242],{"class":163},[33,185801,185802],{"class":54},"\"Directory containing SKU PNG files\"",[33,185804,221],{"class":167},[33,185806,185807,185809,185811,185813,185815,185817,185819,185821,185823,185825,185827],{"class":35,"line":4935},[33,185808,6669],{"class":167},[33,185810,41152],{"class":54},[33,185812,365],{"class":167},[33,185814,25448],{"class":238},[33,185816,242],{"class":163},[33,185818,855],{"class":50},[33,185820,365],{"class":167},[33,185822,25463],{"class":238},[33,185824,242],{"class":163},[33,185826,25501],{"class":54},[33,185828,221],{"class":167},[33,185830,185831,185833,185836,185838,185840,185842,185844],{"class":35,"line":4941},[33,185832,6669],{"class":167},[33,185834,185835],{"class":54},"\"--placeholder\"",[33,185837,365],{"class":167},[33,185839,6685],{"class":238},[33,185841,242],{"class":163},[33,185843,184308],{"class":54},[33,185845,221],{"class":167},[33,185847,185848,185850,185852],{"class":35,"line":4950},[33,185849,6766],{"class":167},[33,185851,242],{"class":163},[33,185853,6771],{"class":167},[33,185855,185856],{"class":35,"line":4960},[33,185857,92],{"emptyLinePlaceholder":91},[33,185859,185860,185862,185864],{"class":35,"line":4965},[33,185861,64545],{"class":167},[33,185863,242],{"class":163},[33,185865,64550],{"class":167},[33,185867,185868,185871,185873],{"class":35,"line":4971},[33,185869,185870],{"class":167},"    images_dir ",[33,185872,242],{"class":163},[33,185874,185875],{"class":167}," 
Path(args.images)\n",[33,185877,185878,185880,185882],{"class":35,"line":4983},[33,185879,22180],{"class":167},[33,185881,242],{"class":163},[33,185883,64559],{"class":167},[33,185885,185886,185889,185891],{"class":35,"line":4988},[33,185887,185888],{"class":167},"    placeholder ",[33,185890,242],{"class":163},[33,185892,185893],{"class":167}," Path(args.placeholder)\n",[33,185895,185896,185898,185900,185902,185904,185906,185908,185910,185912],{"class":35,"line":4993},[33,185897,6346],{"class":167},[33,185899,869],{"class":238},[33,185901,242],{"class":163},[33,185903,855],{"class":50},[33,185905,365],{"class":167},[33,185907,878],{"class":238},[33,185909,242],{"class":163},[33,185911,855],{"class":50},[33,185913,221],{"class":167},[33,185915,185916],{"class":35,"line":5003},[33,185917,92],{"emptyLinePlaceholder":91},[33,185919,185920,185922],{"class":35,"line":5008},[33,185921,2424],{"class":163},[33,185923,574],{"class":167},[33,185925,185926,185928,185930],{"class":35,"line":5014},[33,185927,7930],{"class":167},[33,185929,242],{"class":163},[33,185931,175926],{"class":167},[33,185933,185934,185936,185938],{"class":35,"line":5019},[33,185935,2449],{"class":163},[33,185937,2945],{"class":50},[33,185939,574],{"class":167},[33,185941,185942,185944,185946,185948,185950,185952,185954,185957,185959,185961],{"class":35,"line":5032},[33,185943,4051],{"class":163},[33,185945,16617],{"class":50},[33,185947,602],{"class":167},[33,185949,4059],{"class":163},[33,185951,59825],{"class":54},[33,185953,1115],{"class":50},[33,185955,185956],{"class":167},"data_path",[33,185958,1121],{"class":50},[33,185960,274],{"class":54},[33,185962,221],{"class":167},[33,185964,185965],{"class":35,"line":5039},[33,185966,92],{"emptyLinePlaceholder":91},[33,185968,185969,185972,185974,185976,185978],{"class":35,"line":5068},[33,185970,185971],{"class":167},"    ok, fail 
",[33,185973,242],{"class":163},[33,185975,10791],{"class":50},[33,185977,365],{"class":167},[33,185979,87516],{"class":50},[33,185981,185982,185984,185986,185988],{"class":35,"line":5077},[33,185983,656],{"class":163},[33,185985,8560],{"class":167},[33,185987,662],{"class":163},[33,185989,8565],{"class":167},[33,185991,185992,185995,185997,185999,186001,186003,186005,186007,186009,186011,186013,186015,186018,186020,186022,186024,186026,186028,186030],{"class":35,"line":5082},[33,185993,185994],{"class":167},"        out_file ",[33,185996,242],{"class":163},[33,185998,6393],{"class":167},[33,186000,1351],{"class":163},[33,186002,1110],{"class":163},[33,186004,274],{"class":54},[33,186006,1115],{"class":50},[33,186008,185557],{"class":167},[33,186010,184472],{"class":54},[33,186012,365],{"class":167},[33,186014,4059],{"class":163},[33,186016,186017],{"class":54},"'row_",[33,186019,1115],{"class":50},[33,186021,78824],{"class":167},[33,186023,1121],{"class":50},[33,186025,155273],{"class":54},[33,186027,12027],{"class":167},[33,186029,1121],{"class":50},[33,186031,18215],{"class":54},[33,186033,186034,186036],{"class":35,"line":5089},[33,186035,670],{"class":163},[33,186037,574],{"class":167},[33,186039,186040],{"class":35,"line":5098},[33,186041,186042],{"class":167},"            make_report(row.to_dict(), images_dir, out_file, placeholder)\n",[33,186044,186045,186047,186049],{"class":35,"line":5105},[33,186046,87640],{"class":167},[33,186048,28976],{"class":163},[33,186050,17709],{"class":50},[33,186052,186053,186055,186057,186059],{"class":35,"line":5110},[33,186054,780],{"class":163},[33,186056,783],{"class":50},[33,186058,1852],{"class":163},[33,186060,7583],{"class":167},[33,186062,186063,186065,186067,186069,186072,186074,186077,186079,186081,186083,186085,186087,186089],{"class":35,"line":5115},[33,186064,9364],{"class":50},[33,186066,602],{"class":167},[33,186068,4059],{"class":163},[33,186070,186071],{"class":54},"\"FAIL 
",[33,186073,1115],{"class":50},[33,186075,186076],{"class":167},"out_file.name",[33,186078,1121],{"class":50},[33,186080,2079],{"class":54},[33,186082,1115],{"class":50},[33,186084,7602],{"class":167},[33,186086,1121],{"class":50},[33,186088,274],{"class":54},[33,186090,221],{"class":167},[33,186092,186093,186095,186097],{"class":35,"line":5128},[33,186094,169862],{"class":167},[33,186096,28976],{"class":163},[33,186098,17709],{"class":50},[33,186100,186101],{"class":35,"line":5135},[33,186102,92],{"emptyLinePlaceholder":91},[33,186104,186105,186107,186109,186111,186113,186115,186117,186119,186122,186124,186126,186128,186131,186133,186135,186137,186139],{"class":35,"line":5142},[33,186106,7268],{"class":50},[33,186108,602],{"class":167},[33,186110,4059],{"class":163},[33,186112,22340],{"class":54},[33,186114,1115],{"class":50},[33,186116,87737],{"class":167},[33,186118,1121],{"class":50},[33,186120,186121],{"class":54}," generated, ",[33,186123,1115],{"class":50},[33,186125,169898],{"class":167},[33,186127,1121],{"class":50},[33,186129,186130],{"class":54}," failed → ",[33,186132,1115],{"class":50},[33,186134,6822],{"class":167},[33,186136,1121],{"class":50},[33,186138,274],{"class":54},[33,186140,221],{"class":167},[33,186142,186143],{"class":35,"line":5151},[33,186144,92],{"emptyLinePlaceholder":91},[33,186146,186147],{"class":35,"line":5156},[33,186148,92],{"emptyLinePlaceholder":91},[33,186150,186151,186153,186155,186157,186159],{"class":35,"line":5161},[33,186152,2491],{"class":163},[33,186154,2494],{"class":50},[33,186156,2497],{"class":163},[33,186158,2500],{"class":54},[33,186160,574],{"class":167},[33,186162,186163],{"class":35,"line":5167},[33,186164,6914],{"class":167},[18,186166,6918],{"id":6917},[4211,186168,186169,186174,186179],{},[4214,186170,186171,186173],{},[940,186172,179035],{"href":182040}," — fix the default native-size overflow problem",[4214,186175,186176,186178],{},[940,186177,156152],{"href":26562}," — full document generation pipeline 
with styles and tables",[4214,186180,186181,186183],{},[940,186182,26185],{"href":18040}," — per-recipient templating where each output may carry a different logo or signature image",[14,186185,6947,186186,3035],{},[940,186187,26263],{"href":26262},[6953,186189,186190],{},"html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sqxcx, html code.shiki 
.sqxcx{--shiki-default:#E36209}",{"title":28,"searchDepth":43,"depth":43,"links":186192},[186193,186194,186195,186196,186197,186198,186199,186200,186201,186202,186203,186208,186209,186210,186211,186212],{"id":20,"depth":43,"text":21},{"id":181766,"depth":43,"text":181767},{"id":182044,"depth":43,"text":182045},{"id":182379,"depth":43,"text":182380},{"id":182697,"depth":43,"text":182698},{"id":183164,"depth":43,"text":183165},{"id":183564,"depth":43,"text":183565},{"id":183581,"depth":43,"text":183582},{"id":183869,"depth":43,"text":183870},{"id":184198,"depth":43,"text":184199},{"id":12943,"depth":43,"text":12944,"children":186204},[186205,186206,186207],{"id":184678,"depth":61,"text":184679},{"id":184691,"depth":61,"text":184692},{"id":184704,"depth":61,"text":184705},{"id":52029,"depth":43,"text":52030},{"id":175154,"depth":43,"text":175155},{"id":4270,"depth":43,"text":4271},{"id":4401,"depth":43,"text":4402},{"id":6917,"depth":43,"text":6918},"Inserting Images",{},"\u002Fword-document-templating-batch-processing\u002Finserting-images-into-word-documents",{"title":156178,"description":186217},{"Add images to ":186218,"date":6978,"updatedAt":6978,"tags":186220},{"docx files with python-docx":186219},"sizing, aspect ratio, table\u002Fheader\u002Ffooter placement, inline vs floating, captions, and batch logo insertion from a folder.",[47,18041,170115,181622],"Inserting Images into Word Documents with Python","word-document-templating-batch-processing\u002Finserting-images-into-word-documents\u002Findex","9GcTgXDOKwlRkIJhXd1rBVPdFzN_pL0NtPDvvJLJOoA",1781797435064]