Generating Reports from Pipeline Data
A pipeline's transform step produces a cleaned DataFrame. The mistake is writing a separate script for each output format. One transform, three deliverables — Excel for analysts, Word for managers, PDF for clients — is the practical pattern. This guide wires that fan-out so segment splits, output naming, and row-count validation all run in a single pass over your pipeline's data in Automating Document & Data Pipelines.
The tricky parts are not the individual libraries; they are keeping outputs synchronized (every segment that appears in Excel must appear in Word and PDF), handling template variables that don't exist in some segments, and keeping memory flat when the DataFrame has hundreds of thousands of rows.
Prerequisites
Install all three output libraries plus pandas before starting. If you are ingesting the DataFrame from a PDF source, see the Extracting PDF Data into pandas guide for that upstream step.
# System deps: none beyond Python 3.9+
pip install pandas openpyxl xlsxwriter python-docx docxtpl reportlab weasyprint
Create a minimal test fixture so every snippet below runs independently:
# pip install pandas
import pandas as pd
from pathlib import Path
FIXTURE_CSV = Path("/tmp/pipeline_sample.csv")
data = {
"region": ["North", "North", "South", "South", "East", "East"],
"product": ["Widget A", "Widget B", "Widget A", "Widget C", "Widget B", "Widget C"],
"units": [120, 85, 200, 60, 95, 110],
"revenue": [2400.0, 1700.0, 4000.0, 1200.0, 1900.0, 2200.0],
"cost": [1440.0, 1020.0, 2400.0, 720.0, 1140.0, 1320.0],
}
df = pd.DataFrame(data)
df["margin"] = df["revenue"] - df["cost"]
FIXTURE_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(FIXTURE_CSV, index=False)
print(df)
Diagnostic: Inspect the DataFrame Before Fanning Out
Before writing any output, validate shape, dtypes, and the grouping column. A missing column or the wrong dtype for a numeric field will corrupt all three outputs silently.
# pip install pandas
import pandas as pd
from pathlib import Path
INPUT_CSV = Path("/tmp/pipeline_sample.csv")
def diagnose(path: Path) -> pd.DataFrame:
try:
df = pd.read_csv(path)
except FileNotFoundError:
raise SystemExit(f"Input not found: {path}")
required_cols = {"region", "product", "units", "revenue", "cost", "margin"}
missing = required_cols - set(df.columns)
if missing:
raise ValueError(f"DataFrame missing columns: {missing}")
numeric_cols = ["units", "revenue", "cost", "margin"]
for col in numeric_cols:
if not pd.api.types.is_numeric_dtype(df[col]):
df[col] = pd.to_numeric(df[col], errors="coerce")
null_count = df[col].isna().sum()
if null_count:
raise ValueError(f"Column '{col}' has {null_count} non-numeric rows after coercion")
segments = df["region"].unique()
print(f"Shape: {df.shape} | Segments: {list(segments)} | Nulls: {df.isna().sum().sum()}")
return df
df = diagnose(INPUT_CSV)
Check the segment count matches your expectation. Three regions that should be four means a data issue upstream — not a report issue.
Core Implementation
The fan-out follows a fixed order: Excel first (pure data, easiest to validate), then Word (narrative summary over aggregates), then PDF (final deliverable). Each step reads the same in-memory DataFrame so there is no drift between outputs.
Step 1 — DataFrame to Formatted Excel Workbook
Use openpyxl for formatted Excel output when you need header styling, column widths, and a totals row. xlsxwriter is faster for write-only large files; swap the engine if rows exceed 200,000.
# pip install pandas openpyxl
import pandas as pd
from pathlib import Path
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Alignment, numbers
from openpyxl.utils import get_column_letter
OUTPUT_DIR = Path("/tmp/reports")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
HEADER_FILL = PatternFill("solid", fgColor="2563EB")
TOTAL_FILL = PatternFill("solid", fgColor="DBEAFE")
HEADER_FONT = Font(bold=True, color="FFFFFF", size=11)
def df_to_excel(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:
safe_seg = segment.replace(" ", "_").lower()
path = output_dir / f"pipeline-reports-{safe_seg}.xlsx"
# Write raw data first with pandas engine
try:
with pd.ExcelWriter(path, engine="openpyxl") as writer:
df.to_excel(writer, sheet_name="Data", index=False)
except PermissionError:
raise RuntimeError(f"File locked: {path}")
# Re-open with openpyxl to apply formatting
wb = load_workbook(path)
ws = wb["Data"]
# Header row styling
for cell in ws[1]:
cell.font = HEADER_FONT
cell.fill = HEADER_FILL
cell.alignment = Alignment(horizontal="center")
# Column widths + number formats
currency_cols = {col: ws.cell(1, col).value
for col in range(1, ws.max_column + 1)
if ws.cell(1, col).value in ("revenue", "cost", "margin")}
for col_idx, header in currency_cols.items():
letter = get_column_letter(col_idx)
ws.column_dimensions[letter].width = 14
for row in ws.iter_rows(min_row=2, min_col=col_idx, max_col=col_idx):
for cell in row:
cell.number_format = '"$"#,##0.00'
# Totals row
total_row = ws.max_row + 1
numeric_headers = ["units", "revenue", "cost", "margin"]
for col_idx in range(1, ws.max_column + 1):
header = ws.cell(1, col_idx).value
cell = ws.cell(total_row, col_idx)
if header in numeric_headers:
letter = get_column_letter(col_idx)
cell.value = f"=SUM({letter}2:{letter}{ws.max_row - 1})"
cell.font = Font(bold=True)
cell.fill = TOTAL_FILL
if header != "units":
cell.number_format = '"$"#,##0.00'
elif col_idx == 1:
cell.value = "TOTAL"
cell.font = Font(bold=True)
cell.fill = TOTAL_FILL
wb.save(path)
return path
df = pd.read_csv("/tmp/pipeline_sample.csv")
for region, group in df.groupby("region"):
out = df_to_excel(group.copy(), region, OUTPUT_DIR)
print(f"Excel: {out} ({len(group)} rows)")
Step 2 — DataFrame to Word Summary
python-docx writes structure programmatically; docxtpl is better when you have an existing .docx template with Jinja2 placeholders. The snippet below uses python-docx directly so there is no external template file dependency.
# pip install pandas python-docx
import pandas as pd
from pathlib import Path
from docx import Document
from docx.shared import Pt, RGBColor, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
OUTPUT_DIR = Path("/tmp/reports")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def df_to_word(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:
safe_seg = segment.replace(" ", "_").lower()
path = output_dir / f"pipeline-reports-{safe_seg}.docx"
doc = Document()
# Title
title = doc.add_heading(f"Pipeline Report — {segment}", level=1)
title.runs[0].font.color.rgb = RGBColor(0x0F, 0x17, 0x2A)
# Executive summary paragraph
total_revenue = df["revenue"].sum()
total_margin = df["margin"].sum()
margin_pct = (total_margin / total_revenue * 100) if total_revenue else 0.0
summary = (
f"Region {segment} generated ${total_revenue:,.0f} in revenue across "
f"{len(df)} product lines with a net margin of {margin_pct:.1f}%."
)
p = doc.add_paragraph(summary)
p.runs[0].font.size = Pt(11)
# Aggregate by product
agg = (
df.groupby("product")
.agg(units=("units", "sum"), revenue=("revenue", "sum"), margin=("margin", "sum"))
.reset_index()
.sort_values("revenue", ascending=False)
)
doc.add_heading("Product Breakdown", level=2)
cols = list(agg.columns)
table = doc.add_table(rows=1, cols=len(cols))
table.style = "Table Grid"
# Header row
hdr = table.rows[0].cells
for i, col in enumerate(cols):
hdr[i].text = col.capitalize()
hdr[i].paragraphs[0].runs[0].font.bold = True
# Data rows
for _, row in agg.iterrows():
cells = table.add_row().cells
cells[0].text = str(row["product"])
cells[1].text = f"{int(row['units']):,}"
cells[2].text = f"${row['revenue']:,.0f}"
cells[3].text = f"${row['margin']:,.0f}"
try:
doc.save(path)
except PermissionError:
raise RuntimeError(f"File locked: {path}")
return path
df = pd.read_csv("/tmp/pipeline_sample.csv")
for region, group in df.groupby("region"):
out = df_to_word(group.copy(), region, OUTPUT_DIR)
print(f"Word: {out} ({len(group)} rows)")
Step 3 — DataFrame to PDF Report
ReportLab gives fine-grained layout control. WeasyPrint is simpler when you already have an HTML template. The snippet below uses ReportLab's Platypus story API, which handles page breaks automatically across long segment tables.
# pip install pandas reportlab
import pandas as pd
from pathlib import Path
from reportlab.lib.pagesizes import A4
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.platypus import (
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
)
OUTPUT_DIR = Path("/tmp/reports")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
ACCENT = colors.HexColor("#2563EB")
ACCENT_SOFT = colors.HexColor("#DBEAFE")
BORDER = colors.HexColor("#E2E8F0")
def df_to_pdf(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:
safe_seg = segment.replace(" ", "_").lower()
path = output_dir / f"pipeline-reports-{safe_seg}.pdf"
styles = getSampleStyleSheet()
title_style = ParagraphStyle(
"ReportTitle", parent=styles["Heading1"],
fontSize=18, textColor=ACCENT, spaceAfter=6
)
body_style = ParagraphStyle(
"ReportBody", parent=styles["Normal"],
fontSize=10, leading=14, spaceAfter=8
)
total_revenue = df["revenue"].sum()
total_margin = df["margin"].sum()
margin_pct = (total_margin / total_revenue * 100) if total_revenue else 0.0
story = [
Paragraph(f"Pipeline Report — {segment}", title_style),
Paragraph(
f"Total revenue: <b>${total_revenue:,.0f}</b> | "
f"Net margin: <b>{margin_pct:.1f}%</b> | "
f"Rows: <b>{len(df)}</b>",
body_style,
),
Spacer(1, 0.4 * cm),
]
# Detail table
headers = ["Product", "Units", "Revenue", "Cost", "Margin"]
table_data = [headers] + [
[
row["product"],
f"{int(row['units']):,}",
f"${row['revenue']:,.0f}",
f"${row['cost']:,.0f}",
f"${row['margin']:,.0f}",
]
for _, row in df.iterrows()
]
col_widths = [5 * cm, 2.5 * cm, 3 * cm, 3 * cm, 3 * cm]
tbl = Table(table_data, colWidths=col_widths, repeatRows=1)
tbl.setStyle(TableStyle([
("BACKGROUND", (0, 0), (-1, 0), ACCENT),
("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
("FONTSIZE", (0, 0), (-1, -1), 9),
("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, ACCENT_SOFT]),
("GRID", (0, 0), (-1, -1), 0.5, BORDER),
("ALIGN", (1, 0), (-1, -1), "RIGHT"),
]))
story.append(tbl)
try:
doc = SimpleDocTemplate(
str(path), pagesize=A4,
leftMargin=2 * cm, rightMargin=2 * cm,
topMargin=2 * cm, bottomMargin=2 * cm,
)
doc.build(story)
except PermissionError:
raise RuntimeError(f"File locked: {path}")
return path
df = pd.read_csv("/tmp/pipeline_sample.csv")
for region, group in df.groupby("region"):
out = df_to_pdf(group.copy(), region, OUTPUT_DIR)
print(f"PDF: {out} ({len(group)} rows)")
Edge Cases and Variants
Variant 1 — Segment with a Single Row
A one-row segment breaks some table libraries (ReportLab raises LayoutError on a table with only a header and one data row if column widths exceed the page). Guard explicitly:
# pip install reportlab
from reportlab.platypus import Table
def safe_table(data, col_widths):
if len(data) < 2:
# Pad with a placeholder row so Platypus doesn't raise
data = data + [["—"] * len(data[0])]
return Table(data, colWidths=col_widths, repeatRows=1)
Variant 2 — docxtpl Mail Merge for Word Templates
When the Word file is an existing template maintained by a designer, use docxtpl instead of building structure in code. The template uses {{ variable }} Jinja2 syntax.
# pip install pandas docxtpl
import pandas as pd
from pathlib import Path
from docxtpl import DocxTemplate
TEMPLATE_PATH = Path("/tmp/report_template.docx") # existing .docx with {{ }} placeholders
OUTPUT_DIR = Path("/tmp/reports")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def merge_word_template(df: pd.DataFrame, segment: str, template: Path, output_dir: Path) -> Path:
if not template.exists():
raise FileNotFoundError(f"Template not found: {template}")
safe_seg = segment.replace(" ", "_").lower()
path = output_dir / f"pipeline-reports-{safe_seg}-merged.docx"
tpl = DocxTemplate(template)
context = {
"segment": segment,
"total_revenue": f"${df['revenue'].sum():,.0f}",
"total_margin": f"${df['margin'].sum():,.0f}",
"row_count": len(df),
"rows": [
{
"product": row["product"],
"units": int(row["units"]),
"revenue": f"${row['revenue']:,.0f}",
}
for _, row in df.iterrows()
],
}
try:
tpl.render(context)
tpl.save(path)
except Exception as exc:
raise RuntimeError(f"Template render failed for {segment}: {exc}") from exc
return path
If a template variable is missing from the context, docxtpl raises jinja2.exceptions.UndefinedError. Pass jinja_env=tpl.get_jinja_env(autoescape=False) and set undefined=jinja2.Undefined to silently skip missing keys during development.
Variant 3 — WeasyPrint for HTML-to-PDF
When your team writes Jinja2 HTML templates, WeasyPrint converts them to PDF without installing a headless browser. It requires system-level libpango and libcairo:
# Ubuntu/Debian
sudo apt-get install -y libpango-1.0-0 libcairo2 libpangocairo-1.0-0
pip install weasyprint jinja2
# pip install weasyprint jinja2
from pathlib import Path
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from weasyprint import HTML
TEMPLATE_DIR = Path("/tmp/templates")
OUTPUT_DIR = Path("/tmp/reports")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def html_to_pdf(df: pd.DataFrame, segment: str, template_dir: Path, output_dir: Path) -> Path:
safe_seg = segment.replace(" ", "_").lower()
path = output_dir / f"pipeline-reports-{safe_seg}-web.pdf"
env = Environment(loader=FileSystemLoader(str(template_dir)))
try:
tmpl = env.get_template("report.html")
except Exception as exc:
raise FileNotFoundError(f"Template 'report.html' not found in {template_dir}: {exc}") from exc
html_str = tmpl.render(segment=segment, rows=df.to_dict("records"))
try:
HTML(string=html_str).write_pdf(str(path))
except Exception as exc:
raise RuntimeError(f"WeasyPrint failed for {segment}: {exc}") from exc
return path
Validation — Assert Outputs Match Row Counts
After writing all three output types for a segment, assert that the row counts agree. Mismatches mean a silent filter crept into one of the writer functions.
# pip install pandas openpyxl
import pandas as pd
from pathlib import Path
OUTPUT_DIR = Path("/tmp/reports")
def validate_outputs(segment: str, expected_rows: int, output_dir: Path) -> None:
safe_seg = segment.replace(" ", "_").lower()
xlsx_path = output_dir / f"pipeline-reports-{safe_seg}.xlsx"
if xlsx_path.exists():
xl_df = pd.read_excel(xlsx_path, sheet_name="Data", engine="openpyxl")
# Subtract 1 for the TOTAL row appended during writing
actual_rows = len(xl_df) - 1
assert actual_rows == expected_rows, (
f"[{segment}] Excel row mismatch: expected {expected_rows}, got {actual_rows}"
)
print(f"[{segment}] Excel OK ({actual_rows} rows)")
pdf_path = output_dir / f"pipeline-reports-{safe_seg}.pdf"
if pdf_path.exists():
size = pdf_path.stat().st_size
assert size > 1024, f"[{segment}] PDF suspiciously small: {size} bytes"
print(f"[{segment}] PDF OK ({size:,} bytes)")
docx_path = output_dir / f"pipeline-reports-{safe_seg}.docx"
if docx_path.exists():
assert docx_path.stat().st_size > 512, f"[{segment}] DOCX suspiciously small"
print(f"[{segment}] DOCX OK")
df = pd.read_csv("/tmp/pipeline_sample.csv")
for region, group in df.groupby("region"):
validate_outputs(region, len(group), OUTPUT_DIR)
For the automating-monthly-sales-reports-in-excel pattern, add a checksum column to the DataFrame before writing and re-derive it after reading back to catch silent numeric rounding.
Performance and Scale
For DataFrames under 100,000 rows, the per-segment loop above runs in seconds. Beyond that:
Memory — groupby materialises each group as a copy. With 50 segments and 1 M rows, peak RAM is roughly (total_df_size) * 2. Switch to a generator that streams segments from disk:
# pip install pandas
import pandas as pd
from pathlib import Path
def iter_segments(csv_path: Path, segment_col: str, chunksize: int = 50_000):
"""Yield (segment_value, chunk_df) without loading the full file."""
# Read unique segments first (cheap header scan)
segments = pd.read_csv(csv_path, usecols=[segment_col])[segment_col].unique()
for seg in segments:
chunks = []
for chunk in pd.read_csv(csv_path, chunksize=chunksize):
subset = chunk[chunk[segment_col] == seg]
if not subset.empty:
chunks.append(subset)
if chunks:
yield seg, pd.concat(chunks, ignore_index=True)
Excel at scale — openpyxl in write-only mode (write_only=True) cuts memory by 60% but loses the re-open step for totals rows. Use xlsxwriter with constant_memory=True for write-only Excel above 200,000 rows; note that xlsxwriter cannot add a formula row after closing — pre-calculate totals with pandas instead.
PDF at scale — ReportLab's Platypus builds the full story in RAM. For very long tables, split the story into batches of 500 rows and use BaseDocTemplate with PageBreak() between batches. WeasyPrint is slower per page but can stream HTML from a file, which avoids loading the entire DOM.
Parallel segments — if segments are independent, use concurrent.futures.ProcessPoolExecutor:
# pip install pandas openpyxl python-docx reportlab
import pandas as pd
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
OUTPUT_DIR = Path("/tmp/reports")
def write_segment(args):
segment, rows_json, output_dir_str = args
import pandas as pd # re-import inside worker process
from pathlib import Path
# Import your actual writer functions here
group = pd.read_json(rows_json, orient="records")
output_dir = Path(output_dir_str)
# ... call df_to_excel, df_to_word, df_to_pdf
return segment, len(group)
df = pd.read_csv("/tmp/pipeline_sample.csv")
tasks = [
(region, group.to_json(orient="records"), str(OUTPUT_DIR))
for region, group in df.groupby("region")
]
with ProcessPoolExecutor(max_workers=4) as pool:
futures = {pool.submit(write_segment, t): t[0] for t in tasks}
for fut in as_completed(futures):
seg, n = fut.result()
print(f"Done: {seg} ({n} rows)")
Pass the group as JSON (not a raw DataFrame) to avoid pickling issues with some openpyxl internals.
Troubleshooting
| Error | Root cause | Fix |
|---|---|---|
openpyxl.utils.exceptions.InvalidFileException when re-opening | pd.ExcelWriter with engine="openpyxl" left the file handle open | Use with statement; flush before load_workbook |
reportlab.platypus.doctemplate.LayoutError: Flowable ... too large | A single table cell overflows the page width | Reduce colWidths; wrap long strings with Paragraph() inside cells |
jinja2.exceptions.UndefinedError in docxtpl | Template placeholder has no matching key in context | Audit context.keys() against template variables; set undefined=jinja2.Undefined to skip missing |
WeasyPrint OSError: no library called "libpango" | System C libraries not installed | sudo apt-get install libpango-1.0-0 libcairo2 libpangocairo-1.0-0 |
| PDF file is valid but all numeric columns show 0 | Numeric column dtype is object after reading CSV | Run pd.to_numeric(df[col], errors="coerce") on all numeric columns before writing |
Excel totals row shows #REF! | Row reference in SUM formula miscalculated after rows removed | Recompute ws.max_row after all data rows are written; use explicit row indices |
Complete Working Script
This script reads a CSV, validates shape, fans out to all three formats per segment, and validates outputs. Pass --input and --output-dir; --formats controls which outputs to generate.
# pip install pandas openpyxl python-docx reportlab
"""
pipeline_report_fanout.py — fan-out pipeline DataFrame to Excel, Word, and PDF.
Usage:
python pipeline_report_fanout.py \
--input /tmp/pipeline_sample.csv \
--output-dir /tmp/reports \
--segment-col region \
--formats excel word pdf
"""
import argparse
import sys
from pathlib import Path
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils import get_column_letter
from docx import Document
from docx.shared import Pt, RGBColor
from reportlab.lib.pagesizes import A4
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
# ── helpers ──────────────────────────────────────────────────────────────────
def safe_name(segment: str) -> str:
return segment.replace(" ", "_").lower()
def validate_df(df: pd.DataFrame, segment_col: str) -> pd.DataFrame:
if segment_col not in df.columns:
raise ValueError(f"Segment column '{segment_col}' not found in DataFrame")
numeric_cols = [c for c in df.columns if c != segment_col and
pd.api.types.is_numeric_dtype(df[c])]
for col in numeric_cols:
df[col] = pd.to_numeric(df[col], errors="coerce")
null_total = df.isna().sum().sum()
if null_total:
print(f"Warning: {null_total} null values found after coercion", file=sys.stderr)
return df
# ── Excel ─────────────────────────────────────────────────────────────────────
def write_excel(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:
path = output_dir / f"pipeline-reports-{safe_name(segment)}.xlsx"
try:
with pd.ExcelWriter(path, engine="openpyxl") as writer:
df.to_excel(writer, sheet_name="Data", index=False)
except PermissionError:
raise RuntimeError(f"Cannot write Excel (file locked?): {path}")
wb = load_workbook(path)
ws = wb["Data"]
header_fill = PatternFill("solid", fgColor="2563EB")
total_fill = PatternFill("solid", fgColor="DBEAFE")
for cell in ws[1]:
cell.font = Font(bold=True, color="FFFFFF", size=11)
cell.fill = header_fill
cell.alignment = Alignment(horizontal="center")
numeric_headers = [
ws.cell(1, c).value for c in range(1, ws.max_column + 1)
if ws.cell(1, c).value and pd.api.types.is_numeric_dtype(df.get(ws.cell(1, c).value, pd.Series(dtype=float)))
]
total_row = ws.max_row + 1
for col_idx in range(1, ws.max_column + 1):
header = ws.cell(1, col_idx).value
cell = ws.cell(total_row, col_idx)
letter = get_column_letter(col_idx)
if header in numeric_headers:
cell.value = f"=SUM({letter}2:{letter}{ws.max_row - 1})"
cell.font = Font(bold=True)
cell.fill = total_fill
elif col_idx == 1:
cell.value = "TOTAL"
cell.font = Font(bold=True)
cell.fill = total_fill
wb.save(path)
return path
# ── Word ──────────────────────────────────────────────────────────────────────
def write_word(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:
path = output_dir / f"pipeline-reports-{safe_name(segment)}.docx"
doc = Document()
heading = doc.add_heading(f"Pipeline Report — {segment}", level=1)
heading.runs[0].font.color.rgb = RGBColor(0x0F, 0x17, 0x2A)
numeric_df = df.select_dtypes(include="number")
totals = numeric_df.sum()
revenue = totals.get("revenue", 0)
margin = totals.get("margin", revenue - totals.get("cost", 0))
margin_pct = (margin / revenue * 100) if revenue else 0.0
p = doc.add_paragraph(
f"Segment {segment}: {len(df)} rows, revenue ${revenue:,.0f}, margin {margin_pct:.1f}%."
)
p.runs[0].font.size = Pt(11)
doc.add_heading("Detail", level=2)
cols = list(df.columns)
table = doc.add_table(rows=1, cols=len(cols))
table.style = "Table Grid"
hdr = table.rows[0].cells
for i, col in enumerate(cols):
hdr[i].text = col
hdr[i].paragraphs[0].runs[0].font.bold = True
for _, row in df.iterrows():
cells = table.add_row().cells
for i, col in enumerate(cols):
val = row[col]
cells[i].text = (
f"${val:,.0f}" if col in ("revenue", "cost", "margin")
else f"{int(val):,}" if col == "units"
else str(val)
)
try:
doc.save(path)
except PermissionError:
raise RuntimeError(f"Cannot write Word doc (file locked?): {path}")
return path
# ── PDF ───────────────────────────────────────────────────────────────────────
def write_pdf(df: pd.DataFrame, segment: str, output_dir: Path) -> Path:
path = output_dir / f"pipeline-reports-{safe_name(segment)}.pdf"
styles = getSampleStyleSheet()
accent = colors.HexColor("#2563EB")
soft = colors.HexColor("#DBEAFE")
border = colors.HexColor("#E2E8F0")
title_style = ParagraphStyle(
"Title2", parent=styles["Heading1"], fontSize=16, textColor=accent, spaceAfter=6
)
body_style = ParagraphStyle(
"Body2", parent=styles["Normal"], fontSize=10, leading=14, spaceAfter=8
)
numeric_df = df.select_dtypes(include="number")
totals = numeric_df.sum()
revenue = totals.get("revenue", 0)
margin = totals.get("margin", 0)
margin_pct = (margin / revenue * 100) if revenue else 0.0
story = [
Paragraph(f"Pipeline Report — {segment}", title_style),
Paragraph(
f"Revenue: <b>${revenue:,.0f}</b> | Margin: <b>{margin_pct:.1f}%</b> | Rows: <b>{len(df)}</b>",
body_style,
),
Spacer(1, 0.3 * cm),
]
cols = list(df.columns)
header_row = [c.capitalize() for c in cols]
data = [header_row]
for _, row in df.iterrows():
data.append([
f"${row[c]:,.0f}" if c in ("revenue", "cost", "margin")
else f"{int(row[c]):,}" if c == "units"
else str(row[c])
for c in cols
])
col_count = len(cols)
col_width = 16.0 / col_count * cm
tbl = Table(data, colWidths=[col_width] * col_count, repeatRows=1)
tbl.setStyle(TableStyle([
("BACKGROUND", (0, 0), (-1, 0), accent),
("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
("FONTSIZE", (0, 0), (-1, -1), 9),
("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, soft]),
("GRID", (0, 0), (-1, -1), 0.5, border),
("ALIGN", (1, 0), (-1, -1), "RIGHT"),
]))
story.append(tbl)
try:
doc = SimpleDocTemplate(
str(path), pagesize=A4,
leftMargin=2 * cm, rightMargin=2 * cm,
topMargin=2 * cm, bottomMargin=2 * cm,
)
doc.build(story)
except PermissionError:
raise RuntimeError(f"Cannot write PDF (file locked?): {path}")
return path
# ── validation ────────────────────────────────────────────────────────────────
def validate(segment: str, expected_rows: int, output_dir: Path, formats: list) -> None:
sn = safe_name(segment)
if "excel" in formats:
p = output_dir / f"pipeline-reports-{sn}.xlsx"
xl = pd.read_excel(p, sheet_name="Data", engine="openpyxl")
actual = len(xl) - 1 # subtract TOTAL row
assert actual == expected_rows, f"[{segment}] Excel: expected {expected_rows}, got {actual}"
print(f" PASS Excel {p.name} ({actual} rows)")
if "word" in formats:
p = output_dir / f"pipeline-reports-{sn}.docx"
assert p.stat().st_size > 512, f"[{segment}] DOCX too small"
print(f" PASS Word {p.name}")
if "pdf" in formats:
p = output_dir / f"pipeline-reports-{sn}.pdf"
assert p.stat().st_size > 1024, f"[{segment}] PDF too small"
print(f" PASS PDF {p.name}")
# ── main ──────────────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="Fan out pipeline DataFrame to multiple report formats.")
parser.add_argument("--input", required=True, type=Path, help="Input CSV file")
parser.add_argument("--output-dir", required=True, type=Path, help="Output directory")
parser.add_argument("--segment-col", default="region", help="Column to split on (default: region)")
parser.add_argument("--formats", nargs="+", default=["excel", "word", "pdf"],
choices=["excel", "word", "pdf"], help="Output formats to generate")
args = parser.parse_args()
if not args.input.exists():
sys.exit(f"Input file not found: {args.input}")
args.output_dir.mkdir(parents=True, exist_ok=True)
try:
df = pd.read_csv(args.input)
except Exception as exc:
sys.exit(f"Failed to read CSV: {exc}")
df = validate_df(df, args.segment_col)
print(f"Input shape: {df.shape} | Segments: {df[args.segment_col].nunique()}")
writers = {
"excel": write_excel,
"word": write_word,
"pdf": write_pdf,
}
for segment, group in df.groupby(args.segment_col):
print(f"\nSegment: {segment} ({len(group)} rows)")
for fmt in args.formats:
try:
out = writers[fmt](group.copy(), str(segment), args.output_dir)
print(f" Wrote {fmt}: {out}")
except Exception as exc:
print(f" ERROR {fmt} [{segment}]: {exc}", file=sys.stderr)
print(f" Validating…")
try:
validate(str(segment), len(group), args.output_dir, args.formats)
except AssertionError as exc:
print(f" VALIDATION FAILED: {exc}", file=sys.stderr)
print("\nDone.")
if __name__ == "__main__":
main()
Run it against the fixture:
python pipeline_report_fanout.py \
--input /tmp/pipeline_sample.csv \
--output-dir /tmp/reports \
--segment-col region \
--formats excel word pdf
Expected output: nine files under /tmp/reports/ — three formats per three regions — each passing the validation step.
Related
- Automating Excel Report Generation — openpyxl formatting, charts, and scheduled workbook creation
- Dynamic Mail Merge with Python — docxtpl Jinja2 templates and per-recipient Word output
- Generating PDF Reports Dynamically — ReportLab layout, fonts, and invoice-style PDF generation
- Extracting PDF Data into pandas — the upstream step that produces the DataFrame this guide consumes
Part of Automating Document & Data Pipelines.