Dynamic Mail Merge with Python
Word's built-in mail merge tops out fast: it requires a running Word instance, chokes on conditional logic beyond simple if/else, and produces no audit trail. Python with docxtpl replaces it with a repeatable script that renders hundreds of documents in a single pass — no GUI, no manual field mapping, no version drift between runs.
This guide covers the full workflow: authoring a .docx template with {{ placeholders }}, loading a CSV or Excel data source with pandas, rendering one document per data row, using conditional sections and in-template loops, and naming the output files deterministically.
Prerequisites
# pip install docxtpl pandas openpyxl
pip install docxtpl pandas openpyxl
docxtpl depends on python-docx internally — you do not install it separately. openpyxl is needed only when the data source is an .xlsx file.
Create the working tree before running any script:
mkdir -p project/{templates,data,output}
Expected files:
project/templates/letter_template.docx— your Jinja2-tagged Word templateproject/data/recipients.csv— one row per output documentproject/output/— rendered.docxfiles land here
Inspect the Data Source First
Before writing the template, confirm exact column names. Typos between a CSV header and a template placeholder are the leading cause of UndefinedError at render time.
# pip install pandas openpyxl
import pandas as pd
from pathlib import Path
DATA = Path("project/data/recipients.csv")
try:
df = pd.read_csv(DATA)
except FileNotFoundError as exc:
raise SystemExit(f"Data file not found: {exc}")
print("Columns:", df.columns.tolist())
print("Rows:", len(df))
print(df.head(3).to_string())
For an Excel source swap pd.read_csv for pd.read_excel(DATA, engine="openpyxl"). The rest of the pipeline is identical. See Reading Excel Files with Python for multi-sheet and engine selection details.
Sample output you want to verify before proceeding:
Columns: ['first_name', 'last_name', 'company', 'invoice_date', 'total_due', 'tier']
Rows: 42
first_name last_name company invoice_date total_due tier
0 Alice Smith Acme Corp Ltd 2026-06-01 1250.00 premium
1 Bob Jones Beta LLC 2026-06-03 480.50 standard
2 Charlie Brown Gamma Inc. 2026-06-05 3200.00 premium
Write down these exact column names — they become your template placeholder names.
Step 1 — Author the Word Template
Open a new .docx in Word (or LibreOffice Writer). Type your letter content and insert {{ column_name }} exactly where each data value belongs. The delimiters must appear as a single continuous text run; if Word splits them across formatting runs the placeholder will silently fail to render.
Tip: Type the placeholder, select it, and apply a character style. This forces Word to keep it in one XML run.
A minimal invoice letter template body:
Dear {{ first_name }} {{ last_name }},
Thank you for your business with {{ company }}.
{% if tier == 'premium' %}
As a Premium client you receive priority processing on all orders.
{% endif %}
Invoice date: {{ invoice_date }}
Amount due: {{ total_due }}
Items ordered:
{%tr for item in line_items %}
{{ item.description }} Qty: {{ item.qty }} Price: {{ item.unit_price }}
{%tr endfor %}
Regards,
Billing Team
Key syntax:
{{ variable }}— scalar substitution{% if condition %}...{% endif %}— conditional section (the entire paragraph is removed when false){%tr for item in items %}...{%tr endfor %}— table row loop; must live inside a Word table row, not a paragraph
Save as project/templates/letter_template.docx.
Step 2 — Build the Context Dictionary
Each call to template.render() takes a plain Python dict. Build one per data row, converting types that Jinja2 cannot format automatically.
# pip install pandas
import pandas as pd
def build_context(row: pd.Series) -> dict:
"""Convert a DataFrame row to a render-ready context dict."""
ctx: dict = row.to_dict()
# Format date — raw pandas Timestamp renders as ugly repr
try:
ctx["invoice_date"] = pd.to_datetime(ctx["invoice_date"]).strftime("%B %d, %Y")
except (ValueError, TypeError):
ctx["invoice_date"] = str(ctx.get("invoice_date", ""))
# Format currency
try:
ctx["total_due"] = f"${float(ctx['total_due']):,.2f}"
except (ValueError, TypeError):
ctx["total_due"] = str(ctx.get("total_due", "0.00"))
# Provide a default for every optional key the template references
ctx.setdefault("tier", "standard")
# Nested list for the table loop — empty list collapses the loop cleanly
ctx["line_items"] = ctx.get("line_items", [])
return ctx
Providing defaults with setdefault prevents UndefinedError when a CSV row has a missing value. For a full explanation of that error and all its variants see Fix docxtpl Jinja2 UndefinedError.
SVG — Mail Merge Data Flow
Step 3 — Render One Document per Row
# pip install docxtpl pandas openpyxl
import logging
from pathlib import Path
import pandas as pd
from docxtpl import DocxTemplate
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
handlers=[logging.FileHandler("merge.log"), logging.StreamHandler()],
)
TEMPLATE = Path("project/templates/letter_template.docx")
DATA = Path("project/data/recipients.csv")
OUT_DIR = Path("project/output")
def build_context(row: pd.Series) -> dict:
ctx: dict = row.to_dict()
try:
ctx["invoice_date"] = pd.to_datetime(ctx["invoice_date"]).strftime("%B %d, %Y")
except (ValueError, TypeError):
ctx["invoice_date"] = str(ctx.get("invoice_date", ""))
try:
ctx["total_due"] = f"${float(ctx['total_due']):,.2f}"
except (ValueError, TypeError):
ctx["total_due"] = "0.00"
ctx.setdefault("tier", "standard")
ctx["line_items"] = ctx.get("line_items", [])
return ctx
def render_row(template_path: Path, ctx: dict, output_path: Path) -> bool:
try:
tpl = DocxTemplate(str(template_path)) # fresh instance per row — prevents state bleed
tpl.render(ctx)
output_path.parent.mkdir(parents=True, exist_ok=True)
tpl.save(str(output_path))
return True
except Exception as exc:
logging.error("Render failed for %s: %s", output_path.name, exc)
return False
def run_batch() -> None:
if not TEMPLATE.exists():
raise FileNotFoundError(f"Template not found: {TEMPLATE}")
try:
df = pd.read_csv(DATA)
except FileNotFoundError as exc:
raise SystemExit(f"Data file missing: {exc}") from exc
logging.info("Loaded %d rows from %s", len(df), DATA)
ok = 0
for idx, row in df.iterrows():
ctx = build_context(row)
# Deterministic filename: sanitize the primary-key column
safe_id = str(ctx.get("client_id", idx)).replace(" ", "_").replace("/", "-")
out = OUT_DIR / f"invoice_{safe_id}.docx"
if render_row(TEMPLATE, ctx, out):
ok += 1
logging.info("OK %s", out.name)
logging.info("Done: %d/%d succeeded", ok, len(df))
if __name__ == "__main__":
run_batch()
The template is re-opened per iteration (DocxTemplate(...) inside the loop). This is intentional — reusing a single instance can carry rendered content from one row into the next.
Step 4 — Conditional Sections
A conditional section shows or hides a paragraph depending on a value in the context dict.
In the template (one paragraph per branch):
{% if tier == 'premium' %}
As a Premium client you receive priority SLA and a dedicated account manager.
{% endif %}
{% if total_overdue_days > 30 %}
IMPORTANT: Your account is {{ total_overdue_days }} days overdue. Please remit immediately.
{% endif %}
In build_context, map the raw column to a typed Python value:
# pip install pandas
import pandas as pd
def build_context(row: pd.Series) -> dict:
ctx: dict = row.to_dict()
ctx["tier"] = str(ctx.get("tier", "standard")).lower()
try:
ctx["total_overdue_days"] = int(ctx.get("total_overdue_days", 0))
except (ValueError, TypeError):
ctx["total_overdue_days"] = 0
return ctx
The {% if %} tag evaluates Python truthiness, so passing an integer 0 correctly suppresses the overdue block while 31 renders it.
Step 5 — Table Row Loops
When each recipient has a variable number of line items the template needs a {%tr %} loop. Place the loop tag in a Word table:
| {%tr for item in line_items %} | | |
| {{ item.description }} | {{ item.qty }} | {{ item.unit_price }} |
| {%tr endfor %} | | |
(In the actual .docx the tags occupy a full table row each, not a Markdown table.)
Build the line_items list from a second CSV or a JSON column:
# pip install pandas
import json
import pandas as pd
def build_context_with_items(row: pd.Series) -> dict:
ctx: dict = row.to_dict()
# Option A: items encoded as JSON string in a CSV column
raw_items = ctx.get("line_items_json", "[]")
try:
items = json.loads(raw_items) if isinstance(raw_items, str) else []
except json.JSONDecodeError:
items = []
# Option B: pull from a related DataFrame keyed on client_id
# items_df = all_items_df[all_items_df["client_id"] == row["client_id"]]
# items = items_df[["description","qty","unit_price"]].to_dict("records")
ctx["line_items"] = [
{
"description": str(i.get("description", "")),
"qty": str(i.get("qty", "")),
"unit_price": f"${float(i.get('unit_price', 0)):,.2f}",
}
for i in items
]
return ctx
Passing line_items=[] when there are no items collapses the loop without raising an error.
Edge Cases & Variants
Excel as Data Source
Swap one line in run_batch:
# pip install pandas openpyxl
import pandas as pd
from pathlib import Path
DATA = Path("project/data/recipients.xlsx")
df = pd.read_excel(DATA, engine="openpyxl", sheet_name="Sheet1")
# dtype normalization — Excel dates arrive as datetime objects, not strings
df["invoice_date"] = pd.to_datetime(df["invoice_date"], errors="coerce")
See Cleaning Messy CSV Data with Pandas for handling encoding issues and mixed-type columns in flat files.
Parallel Rendering for Large Batches
For datasets over 500 rows the GIL is not the bottleneck — disk I/O is. ThreadPoolExecutor gives a meaningful speedup:
# pip install docxtpl pandas
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import pandas as pd
from docxtpl import DocxTemplate
import logging
TEMPLATE = Path("project/templates/letter_template.docx")
OUT_DIR = Path("project/output")
def render_one(args: tuple) -> tuple[int, bool]:
idx, ctx = args
safe_id = str(ctx.get("client_id", idx)).replace(" ", "_")
out = OUT_DIR / f"invoice_{safe_id}.docx"
try:
tpl = DocxTemplate(str(TEMPLATE))
tpl.render(ctx)
out.parent.mkdir(parents=True, exist_ok=True)
tpl.save(str(out))
return idx, True
except Exception as exc:
logging.error("Row %d failed: %s", idx, exc)
return idx, False
def run_parallel(contexts: list[dict]) -> None:
with ThreadPoolExecutor(max_workers=8) as pool:
futures = {pool.submit(render_one, (i, ctx)): i for i, ctx in enumerate(contexts)}
ok = sum(1 for f in as_completed(futures) if f.result()[1])
logging.info("Parallel batch: %d/%d OK", ok, len(contexts))
Rendering a docx Template from an In-Memory Buffer
When the template is fetched from cloud storage (S3, Azure Blob) you can pass a BytesIO object directly:
# pip install docxtpl boto3
import io
import boto3
from docxtpl import DocxTemplate
from pathlib import Path
s3 = boto3.client("s3")
obj = s3.get_object(Bucket="my-bucket", Key="templates/letter_template.docx")
buf = io.BytesIO(obj["Body"].read())
tpl = DocxTemplate(buf)
tpl.render({"first_name": "Alice", "last_name": "Smith"})
tpl.save(str(Path("project/output/alice_smith.docx")))
Validation — Confirm Output Correctness
After the batch completes, spot-check that placeholders were resolved:
# pip install python-docx
from docx import Document
from pathlib import Path
import re
OUT_DIR = Path("project/output")
unrendered_pattern = re.compile(r"\{\{.*?\}\}")
for docx_file in sorted(OUT_DIR.glob("*.docx"))[:5]:
doc = Document(str(docx_file))
full_text = " ".join(p.text for p in doc.paragraphs)
matches = unrendered_pattern.findall(full_text)
if matches:
print(f"UNRENDERED in {docx_file.name}: {matches}")
else:
print(f"OK: {docx_file.name}")
Any {{ }} remaining in the output means a context key was missing at render time. Cross-reference column names printed in the diagnostic step.
Performance & Scale Notes
- Memory:
DocxTemplateloads the whole.docxXML into memory. A typical 50 KB template uses ~2 MB of working memory after parsing. 500 parallel renders sit comfortably inside 1 GB. - Disk: Each output file is roughly the size of the template plus text expansion. Budget ~70 KB per rendered invoice.
- Bottleneck: Disk write throughput dominates at scale, not CPU. Threads beat processes for I/O-bound work.
- Very large batches: Chunk the DataFrame with
pd.read_csv(DATA, chunksize=200)and process one chunk at a time to cap peak memory.
Troubleshooting
| Error | Root cause | Fix |
|---|---|---|
jinja2.exceptions.UndefinedError: 'xxx' is undefined | Context dict missing a key that the template references | Add ctx.setdefault('xxx', '') or align column names; see Fix docxtpl Jinja2 UndefinedError |
XMLSyntaxError on render | Placeholder split across XML runs in the template | Retype the {{ }} tag as one continuous run in Word |
| Output file is 0 bytes | tpl.save() path parent directory does not exist | Call output_path.parent.mkdir(parents=True, exist_ok=True) before saving |
| Table loop only renders one row | Loop tag is in a paragraph, not a table row | Move {%tr for %} / {%tr endfor %} into separate Word table rows |
Date column renders as 2026-06-01 00:00:00 | pandas Timestamp not formatted before render | Apply .strftime("%B %d, %Y") in build_context |
Complete Working Script
# pip install docxtpl pandas openpyxl
"""
mailmerge_batch.py — render one .docx per CSV/Excel row.
Usage: python mailmerge_batch.py --data recipients.csv --template letter_template.docx --out output/
"""
import argparse
import logging
from pathlib import Path
import pandas as pd
from docxtpl import DocxTemplate
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
handlers=[logging.FileHandler("merge.log"), logging.StreamHandler()],
)
def build_context(row: pd.Series) -> dict:
ctx: dict = row.to_dict()
try:
ctx["invoice_date"] = pd.to_datetime(ctx["invoice_date"]).strftime("%B %d, %Y")
except (ValueError, TypeError, KeyError):
ctx["invoice_date"] = str(ctx.get("invoice_date", ""))
try:
ctx["total_due"] = f"${float(ctx['total_due']):,.2f}"
except (ValueError, TypeError, KeyError):
ctx["total_due"] = "$0.00"
ctx.setdefault("tier", "standard")
ctx["line_items"] = ctx.get("line_items", [])
return ctx
def main() -> None:
parser = argparse.ArgumentParser(description="Batch mail merge with docxtpl")
parser.add_argument("--data", required=True, help="CSV or .xlsx data source")
parser.add_argument("--template", required=True, help="Path to .docx template")
parser.add_argument("--out", required=True, help="Output directory")
parser.add_argument("--key", default="client_id", help="Column for output filename")
args = parser.parse_args()
template_path = Path(args.template)
out_dir = Path(args.out)
data_path = Path(args.data)
out_dir.mkdir(parents=True, exist_ok=True)
if not template_path.exists():
raise FileNotFoundError(f"Template not found: {template_path}")
# Load data — supports both CSV and Excel
try:
if data_path.suffix.lower() in (".xlsx", ".xls"):
df = pd.read_excel(data_path, engine="openpyxl")
else:
df = pd.read_csv(data_path)
except Exception as exc:
raise SystemExit(f"Failed to load data: {exc}") from exc
logging.info("Loaded %d rows from %s", len(df), data_path)
ok = 0
for idx, row in df.iterrows():
ctx = build_context(row)
safe_key = str(ctx.get(args.key, idx)).replace(" ", "_").replace("/", "-")
out_path = out_dir / f"doc_{safe_key}.docx"
try:
tpl = DocxTemplate(str(template_path))
tpl.render(ctx)
tpl.save(str(out_path))
ok += 1
logging.info("OK %s", out_path.name)
except Exception as exc:
logging.error("FAIL row %d (%s): %s", idx, safe_key, exc)
logging.info("Batch complete: %d/%d rendered", ok, len(df))
if __name__ == "__main__":
main()
Run:
python mailmerge_batch.py \
--data project/data/recipients.csv \
--template project/templates/letter_template.docx \
--out project/output \
--key client_id
Related
- Fix docxtpl Jinja2 UndefinedError — diagnose and fix
UndefinedError: 'xxx' is undefinedduring render - Automating Word Document Creation — foundational python-docx patterns for building documents programmatically
- Reading Excel Files with Python — engine selection and multi-sheet loading for Excel data sources
- Cleaning Messy CSV Data with Pandas — fix encoding issues and mixed-type columns before feeding into the render loop