Merging and Splitting PDF Documents
Manually assembling multi-file reports or slicing a 300-page export into per-client packets does not scale. Scripts that call writer.add_page() in a loop drop bookmarks, break internal links, and exhaust memory on anything beyond a few dozen pages. This guide covers the correct primitives in pypdf — PdfReader, PdfWriter, and append() — plus page reordering, outline preservation, and streaming patterns for large batches.
The same assembly layer powers Generating PDF Reports Dynamically (injecting cover pages and appendices) and feeds the access-control step in Watermarking and Securing PDFs.
Prerequisites
# pip install pypdf pikepdf
pip install pypdf # pure-Python; covers 95 % of use cases
pip install pikepdf # C++ wrapper around QPDF; use for repair/large-scale work
Create test fixtures quickly:
# Generate three small test PDFs with ImageMagick (or use your own files).
# Each iteration renders a white 595x842 canvas labelled "Page set N" and
# writes it to test_input_N.pdf in the current directory.
for i in 1 2 3; do
  convert -size 595x842 xc:white -pointsize 48 \
    -annotate +220+420 "Page set $i" "test_input_$i.pdf"
done
Inspect Before You Process
Before merging or splitting, confirm page count, encryption status, and outline depth:
# pip install pypdf
from pypdf import PdfReader
from pathlib import Path
def inspect_pdf(path: Path) -> dict:
    """Return basic structural metadata for a PDF.

    Args:
        path: Location of the PDF to inspect.

    Returns:
        On success, a dict with ``pages``, ``encrypted``, ``outline_items``
        (the number of top-level outline entries) and ``title``.
        For a file that is still locked after opening, ``encrypted`` is True
        and ``pages`` / ``outline_items`` are None, so callers can tell
        "needs a password" apart from "broken".
        Any other failure yields ``{"error": message}``.
    """
    # Local import: this snippet only imports PdfReader at module level.
    from pypdf.errors import FileNotDecryptedError

    try:
        reader = PdfReader(path)
        encrypted = reader.is_encrypted
        try:
            page_count = len(reader.pages)
            outline_count = len(reader.outline)
            title = (reader.metadata or {}).get("/Title", "")
        except FileNotDecryptedError:
            # Structure cannot be read without the password; report the
            # encryption status instead of hiding it behind an error entry.
            return {
                "pages": None,
                "encrypted": True,
                "outline_items": None,
                "title": "",
            }
        return {
            "pages": page_count,
            "encrypted": encrypted,
            "outline_items": outline_count,
            "title": title,
        }
    except Exception as exc:
        # Best-effort inspection: unreadable files are reported, not raised.
        return {"error": str(exc)}
if __name__ == "__main__":
    # Print one summary line per PDF in ./input_docs, in sorted filename order.
    input_dir = Path("./input_docs")
    for candidate in sorted(input_dir.glob("*.pdf")):
        summary = inspect_pdf(candidate)
        print(candidate.name, summary)
Run this on every input directory before the first merge. Files where encrypted=True need reader.decrypt(password) before any page access; files with error keys should be skipped or repaired with pikepdf.
Core Workflow: Merging
Step 1 — Use append(), not add_page()
PdfWriter.append(reader) recursively imports page resources, form fields, annotations, and the document outline. add_page() does a shallow copy that silently drops all of those.
# pip install pypdf
from pypdf import PdfWriter, PdfReader
from pathlib import Path
import logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)


def merge_pdfs(input_dir: Path, output_path: Path) -> None:
    """Combine every ``*.pdf`` in *input_dir* into a single file.

    Sources are appended in sorted path order with their outlines imported.
    When the directory holds no PDFs a warning is logged and nothing is
    written. Any failure is logged and re-raised; the writer is closed in
    every case.
    """
    writer = PdfWriter()
    try:
        sources = sorted(input_dir.glob("*.pdf"))
        if len(sources) == 0:
            logger.warning("No PDF files found in %s", input_dir)
            return

        for source in sources:
            logger.info("Appending %s", source.name)
            # The handle only needs to live for the duration of append().
            with source.open("rb") as handle:
                # import_outline=True carries the source bookmarks across.
                writer.append(PdfReader(handle), import_outline=True)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open("wb") as sink:
            writer.write(sink)
        logger.info("Merged %d files → %s", len(sources), output_path)
    except Exception as exc:
        logger.error("Merge failed: %s", exc)
        raise
    finally:
        writer.close()
if __name__ == "__main__":
    # Example run: merge ./input_docs into ./output/merged.pdf.
    merge_pdfs(
        input_dir=Path("./input_docs"),
        output_path=Path("./output/merged.pdf"),
    )
Open the file inside with open() and pass the file object (not just the path) to PdfReader. This guarantees the OS releases the file descriptor after each iteration — critical on Windows where open handles block subsequent reads.
Step 2 — Preserve and inspect the outline
# pip install pypdf
from pypdf import PdfReader
from pathlib import Path
def print_outline(reader: PdfReader, items=None, depth: int = 0) -> None:
    """Recursively print the document outline (bookmarks).

    Args:
        reader: Open reader whose outline is printed.
        items: Outline level to print; defaults to ``reader.outline``.
            Nested lists are treated as child levels.
        depth: Current nesting level, used for indentation.

    Each entry is printed as ``[pN] title`` with a 1-based page number.
    Entries whose destination cannot be resolved to a page are printed as
    ``[p?] title`` instead of aborting the listing.
    """
    if items is None:
        items = reader.outline
    for item in items:
        if isinstance(item, list):
            print_outline(reader, item, depth + 1)
            continue
        page_index = reader.get_destination_page_number(item)
        # A bookmark may point at a page that is not in the document; the
        # lookup then yields None, which must not be used in arithmetic.
        label = "?" if page_index is None else str(page_index + 1)  # 1-based
        print(" " * depth + f"[p{label}] {item.title}")
if __name__ == "__main__":
    # Show the bookmarks of the merged result.
    merged_path = Path("./output/merged.pdf")
    print_outline(PdfReader(merged_path))
After merging, run this to confirm outlines from every source document are present. If a source had no outline, that is expected; if a source had one and it is missing, you passed import_outline=False (the default before pypdf 3.x — pin pypdf>=3.0).
Step 3 — Reorder pages before writing
Sometimes you need to rearrange the pages of an existing file. Read it once, then add its pages to a fresh PdfWriter in the order you want:
# pip install pypdf
from pypdf import PdfWriter, PdfReader
from pathlib import Path
def reorder_pages(input_path: Path, output_path: Path, new_order: list[int]) -> None:
    """
    Copy the pages of input_path to output_path in the sequence given by
    new_order, whose entries are 1-based page numbers.
    Example: new_order=[3,1,2] puts page 3 first.

    A page number outside the document raises ValueError. Every failure is
    printed and re-raised, and the writer is always closed.
    """
    source = PdfReader(input_path)
    writer = PdfWriter()
    page_count = len(source.pages)
    try:
        for wanted in new_order:
            if wanted < 1 or wanted > page_count:
                raise ValueError(f"Page {wanted} out of range (1–{page_count})")
            zero_based = wanted - 1
            writer.add_page(source.pages[zero_based])

        with output_path.open("wb") as sink:
            writer.write(sink)
        print(f"Reordered {len(new_order)} pages → {output_path}")
    except Exception as exc:
        print(f"Reorder failed: {exc}")
        raise
    finally:
        writer.close()
if __name__ == "__main__":
    # Example run: move page 3 to the front of a three-page selection.
    reorder_pages(
        input_path=Path("./source.pdf"),
        output_path=Path("./reordered.pdf"),
        new_order=[3, 1, 2],
    )
Core Workflow: Splitting
Range-based split (1-based UI, 0-based pypdf)
The single most common indexing mistake: PDF viewers show page 1, pypdf stores it at index 0. Always subtract 1 when converting user-visible page numbers to slice indices.
# pip install pypdf
from pypdf import PdfReader, PdfWriter
from pathlib import Path
def split_pdf_by_ranges(
    input_path: Path,
    output_dir: Path,
    ranges: list[tuple[int, int]],
) -> list[Path]:
    """
    Split a PDF by 1-based, inclusive page ranges.
    ranges=[(1,3),(4,8)] → two output files.

    All ranges are checked against the page count before anything is
    written, so an invalid range never leaves a partial set of output files.

    Returns:
        The created paths, named ``<stem>_partNN.pdf`` in range order.

    Raises:
        ValueError: If any range starts below 1, ends past the last page,
            or has start > end.
    """
    created: list[Path] = []
    try:
        with open(input_path, "rb") as fh:
            reader = PdfReader(fh)
            total = len(reader.pages)

            # Validate up front: failing midway would leave earlier parts on
            # disk with no indication that the set is incomplete.
            for start, end in ranges:
                if start < 1 or end > total or start > end:
                    raise ValueError(
                        f"Invalid range ({start}–{end}) for {total}-page document"
                    )

            output_dir.mkdir(parents=True, exist_ok=True)
            for idx, (start, end) in enumerate(ranges, start=1):
                out_path = output_dir / f"{input_path.stem}_part{idx:02d}.pdf"
                writer = PdfWriter()
                try:
                    for page_num in range(start - 1, end):  # 0-based
                        writer.add_page(reader.pages[page_num])
                    with open(out_path, "wb") as out:
                        writer.write(out)
                finally:
                    # Release the writer even when add_page/write fails.
                    writer.close()
                created.append(out_path)
                print(f"Created: {out_path} ({end - start + 1} pages)")
    except Exception as exc:
        print(f"Split failed: {exc}")
        raise
    return created
if __name__ == "__main__":
    # Example run: cut a report into three consecutive parts.
    example_ranges = [(1, 3), (4, 10), (11, 20)]
    split_pdf_by_ranges(
        input_path=Path("./annual_report.pdf"),
        output_dir=Path("./output/splits"),
        ranges=example_ranges,
    )
For a command-line interface with range-string parsing ("1-3,4-10"), see Split a PDF by Page Ranges with Python.
Edge Cases and Variants
Encrypted source files
# pip install pypdf
from pypdf import PdfReader, PdfWriter
from pypdf.errors import FileNotDecryptedError
from pathlib import Path
def merge_with_password(paths: list[Path], password: str, output: Path) -> None:
    """Merge PDFs that share one password into a single output file.

    Encrypted sources are decrypted with *password* first; a source for
    which decryption reports 0 is skipped with a ``[SKIP]`` message.
    Unencrypted sources are appended as they are. Outlines are imported.
    A FileNotDecryptedError is printed and re-raised; the writer is always
    closed.
    """
    writer = PdfWriter()
    try:
        for source in paths:
            with source.open("rb") as handle:
                reader = PdfReader(handle)
                # decrypt() is only attempted on encrypted files; 0 means
                # the password was rejected.
                if reader.is_encrypted and reader.decrypt(password) == 0:
                    print(f"[SKIP] Wrong password for {source.name}")
                    continue
                writer.append(reader, import_outline=True)

        with output.open("wb") as sink:
            writer.write(sink)
    except FileNotDecryptedError as exc:
        print(f"Decryption failed: {exc}")
        raise
    finally:
        writer.close()
After merging, the output PDF is unencrypted. Re-apply protection as described in Watermarking and Securing PDFs before distributing.
Mismatched page sizes
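When sources have different /MediaBox dimensions (e.g., mixing A4 and Letter), the merged PDF preserves each page's original size. If uniform sizing is required, copy the target /MediaBox onto each page object after appending. Note that this only changes the declared page box; it does not rescale the page content, so content that falls outside the new box is clipped: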
# pip install pypdf
from pypdf import PdfWriter, PdfReader
from pypdf.generic import RectangleObject
from pathlib import Path
# A4 media box, expressed in points.
A4 = RectangleObject((0, 0, 595.28, 841.89))


def merge_normalize_size(paths: list[Path], output: Path) -> None:
    """Merge *paths* into *output* and give every page an A4 media box.

    Only the page's media box is replaced; this function does not transform
    the page content, so content lying outside the A4 box is not resized.

    Args:
        paths: Source PDFs, appended in the given order.
        output: Destination file for the merged document.
    """
    writer = PdfWriter()
    try:
        for p in paths:
            with open(p, "rb") as fh:
                reader = PdfReader(fh)
                writer.append(reader)
        # Normalize all pages to A4 after appending.
        for page in writer.pages:
            # Give each page its own rectangle: assigning the module-level
            # constant would make every page share one mutable object.
            page.mediabox = RectangleObject(A4)
        with open(output, "wb") as out:
            writer.write(out)
    finally:
        writer.close()
Split every N pages
# pip install pypdf
from pypdf import PdfReader, PdfWriter
from pathlib import Path
import math
def split_every_n(input_path: Path, output_dir: Path, n: int) -> list[Path]:
    """Split a PDF into consecutive chunks of at most *n* pages each.

    Args:
        input_path: PDF to split.
        output_dir: Directory for the chunks; created if missing.
        n: Pages per chunk; must be at least 1. The last chunk may be shorter.

    Returns:
        The created paths, named ``<stem>_chunkNN.pdf`` in page order.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    # Reject bad sizes before touching the filesystem: n == 0 would divide
    # by zero and a negative n would silently produce no output at all.
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    output_dir.mkdir(parents=True, exist_ok=True)
    created: list[Path] = []
    with open(input_path, "rb") as fh:
        reader = PdfReader(fh)
        total = len(reader.pages)
        for chunk_no, start in enumerate(range(0, total, n), start=1):
            end = min(start + n, total)
            out_path = output_dir / f"{input_path.stem}_chunk{chunk_no:02d}.pdf"
            writer = PdfWriter()
            try:
                for i in range(start, end):
                    writer.add_page(reader.pages[i])
                with open(out_path, "wb") as out:
                    writer.write(out)
            finally:
                # Release the writer even when add_page/write fails.
                writer.close()
            created.append(out_path)
    return created
Validation
After every merge or split, assert structural integrity:
# pip install pypdf
from pypdf import PdfReader
from pathlib import Path
def validate_pdf_output(
    output_path: Path,
    expected_pages: int | None = None,
) -> bool:
    """Check that a PDF can be opened and, optionally, has a given page count.

    Prints an ``OK`` or ``FAIL`` line describing the outcome. Returns True
    when the file opens and either no expectation was given or the page
    count equals *expected_pages*; returns False otherwise, including when
    opening the file raises.
    """
    try:
        page_total = len(PdfReader(output_path).pages)
        count_matches = expected_pages is None or page_total == expected_pages
        if count_matches:
            print(f"OK: {output_path.name} ({page_total} pages)")
        else:
            print(f"FAIL: expected {expected_pages} pages, got {page_total}")
        return count_matches
    except Exception as exc:
        print(f"FAIL: {output_path.name} ({exc})")
        return False
if __name__ == "__main__":
    # Three merged inputs of 5, 7 and 8 pages should give 20 pages in total.
    merged_path = Path("./output/merged.pdf")
    validate_pdf_output(merged_path, expected_pages=20)
Also open a random output file in a PDF viewer after running automated tests — automated checks catch structural errors but not rendering artifacts from corrupt font streams.
Performance and Scale
Memory model: PdfWriter accumulates page references in memory but does not load pixel data. Peak memory is proportional to the largest single page's resource dictionary, not the sum of all pages. A merge of 500 one-page documents uses far less memory than a merge of 5 documents with embedded high-resolution images.
Streaming large batches: For batches over 200 files, avoid holding all PdfReader objects open simultaneously. The pattern in the merge snippet above — open, append, close inside with — is correct. Avoid readers = [PdfReader(p) for p in files] patterns.
Chunked intermediate merge: For 1 000+ files, merge in chunks of 100 to temporary files, then do a final merge of the temporaries. This caps peak RAM and isolates corruption to specific chunks.
# pip install pypdf
from pypdf import PdfWriter, PdfReader
from pathlib import Path
import tempfile, shutil
def _merge_files(sources: list[Path], destination: Path) -> None:
    """Append every PDF in *sources* to one writer and save it to *destination*."""
    writer = PdfWriter()
    try:
        for src in sources:
            with open(src, "rb") as fh:
                writer.append(PdfReader(fh))
        with open(destination, "wb") as out:
            writer.write(out)
    finally:
        # Release the writer even when a source fails to load.
        writer.close()


def chunked_merge(all_files: list[Path], output: Path, chunk_size: int = 100) -> None:
    """Merge a large list of PDFs in two passes.

    The inputs are merged *chunk_size* at a time into temporary files, and
    those temporaries are then merged into *output*. The temporary
    directory is removed afterwards, whether or not the merge succeeded.

    Args:
        all_files: Source PDFs, merged in the given order.
        output: Destination of the final merged document.
        chunk_size: Number of sources per intermediate file; at least 1.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    # A negative step would make the chunk loop run zero times and write an
    # empty document without any error, so reject it before doing any work.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    tmp_dir = Path(tempfile.mkdtemp())
    try:
        chunk_files: list[Path] = []
        for chunk_no, offset in enumerate(range(0, len(all_files), chunk_size)):
            chunk_out = tmp_dir / f"chunk_{chunk_no:04d}.pdf"
            _merge_files(all_files[offset:offset + chunk_size], chunk_out)
            chunk_files.append(chunk_out)
        # Final merge of chunk files
        _merge_files(chunk_files, output)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
pikepdf for repair: If source files have corrupted cross-reference tables (PdfReadError: EOF marker not found), open them first with pikepdf (pikepdf.open(path, allow_overwriting_input=False)), save a repaired copy, then process with pypdf.
Troubleshooting
| Error | Root cause | Fix |
|---|---|---|
| PdfReadError: EOF marker not found | Truncated or corrupted file | Open and re-save with pikepdf; wrap in try/except PdfReadError to skip in batch |
| FileNotDecryptedError | Accessing pages of encrypted PDF before decrypting | Call reader.decrypt(password) and check the return value (0 = wrong password) |
| PermissionError: [Errno 13] | File handle still open (Windows) | Always use with open(path, "rb") as fh: reader = PdfReader(fh) pattern |
| Bookmarks missing in merged output | import_outline=False (old default) or using add_page() instead of append() | Pass import_outline=True to append(); never use add_page() for full-document merges |
| Different page sizes in output | Source documents have mixed /MediaBox values | Normalize page.mediabox after appending, or accept per-page sizes |
| Output file is 0 bytes | writer.write() called before any pages appended | Check len(writer.pages) > 0 before writing |
Complete Script
#!/usr/bin/env python3
# pip install pypdf
"""
merge_split.py — merge all PDFs in a folder, then split the result by ranges.
Usage: python merge_split.py --input ./docs --output ./out --split 1-5,6-10
"""
import argparse
import re
from pathlib import Path
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError
def natural_key(p: Path) -> list:
    """Return a sort key that orders file names with numeric runs by value.

    The name is split on runs of digits; digit runs become ints and the text
    between them is lower-cased, so ``f2.pdf`` sorts before ``f10.pdf`` and
    case is ignored.
    """
    # With a capturing group, re.split alternates text / digit-run / text ...,
    # so the digit runs sit at the odd positions. Deciding by position (rather
    # than str.isdigit) avoids calling int() on characters such as "²" that
    # isdigit() accepts but the pattern did not capture.
    chunks = re.split(r"(\d+)", p.name)
    return [int(chunk) if pos % 2 else chunk.lower() for pos, chunk in enumerate(chunks)]
def merge(input_dir: Path, output: Path) -> int:
    """Merge every ``*.pdf`` in *input_dir* into *output*.

    Sources are taken in natural filename order with outlines imported.
    A source that raises PdfReadError is reported with a ``[SKIP]`` line and
    left out. If *output* itself lies inside *input_dir* (for example from
    an earlier run) it is not used as an input.

    Returns:
        Page count of the merged file.
    """
    target = output.resolve()
    files = sorted(
        (f for f in input_dir.glob("*.pdf") if f.resolve() != target),
        key=natural_key,
    )
    writer = PdfWriter()
    merged = 0
    try:
        for f in files:
            try:
                with open(f, "rb") as fh:
                    writer.append(PdfReader(fh), import_outline=True)
            except PdfReadError as exc:
                print(f"[SKIP] {f.name}: {exc}")
            else:
                # Count only sources that were actually appended, so the
                # summary below does not include skipped files.
                merged += 1
        output.parent.mkdir(parents=True, exist_ok=True)
        with open(output, "wb") as out:
            writer.write(out)
        count = len(writer.pages)
    finally:
        # Release the writer even when writing fails.
        writer.close()
    print(f"Merged {merged} files → {output} ({count} pages)")
    return count
def parse_ranges(spec: str) -> list[tuple[int, int]]:
    """Parse '1-5,6-10' → [(1,5),(6,10)].

    Each comma-separated item is either ``N`` (a single page, returned as
    ``(N, N)``) or ``N-M``. Whitespace around items and numbers is ignored,
    and empty items (a trailing comma, an empty spec) are skipped. Range
    order is not checked here.

    Raises:
        ValueError: If an item is not of the form ``N`` or ``N-M``.
    """
    result: list[tuple[int, int]] = []
    for raw in spec.split(","):
        part = raw.strip()
        if not part:
            # Tolerate "1-5," and "" instead of failing on int("").
            continue
        first, dash, last = part.partition("-")
        try:
            start = int(first)
            end = int(last) if dash else start
        except ValueError:
            raise ValueError(
                f"Invalid page range {part!r}; expected 'N' or 'N-M'"
            ) from None
        result.append((start, end))
    return result
def split(input_path: Path, output_dir: Path, ranges: list[tuple[int, int]]) -> None:
    """Write one PDF per 1-based, inclusive page range of *input_path*.

    Output files are named ``<stem>_partNN.pdf`` inside *output_dir*, which
    is created when missing. NN is the position of the range in *ranges*.
    A range that does not fit the document is reported with a ``[SKIP]``
    line and produces no file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    with input_path.open("rb") as source:
        reader = PdfReader(source)
        total = len(reader.pages)
        for part_no, (first, last) in enumerate(ranges, 1):
            out_of_bounds = first < 1 or last < first or last > total
            if out_of_bounds:
                print(f"[SKIP] Range ({first}-{last}) invalid for {total}-page doc")
                continue

            writer = PdfWriter()
            # Page numbers are 1-based for the caller, 0-based in the reader.
            for zero_based in range(first - 1, last):
                writer.add_page(reader.pages[zero_based])

            part_path = output_dir / f"{input_path.stem}_part{part_no:02d}.pdf"
            with part_path.open("wb") as sink:
                writer.write(sink)
            writer.close()
            print(f" part{part_no:02d}: pages {first}–{last} → {part_path.name}")
def main() -> None:
    """Command-line entry point: merge a folder of PDFs, then optionally split.

    The merged document is written to ``<output>/merged.pdf``. When
    ``--split`` is given, the merged file is split by those ranges into
    ``<output>/splits``.
    """
    parser = argparse.ArgumentParser(description="Merge folder of PDFs, optionally split result")
    parser.add_argument("--input", required=True, type=Path, help="Directory of source PDFs")
    parser.add_argument("--output", required=True, type=Path, help="Output directory")
    parser.add_argument("--split", default="", help="Page ranges to split, e.g. '1-5,6-10'")
    options = parser.parse_args()

    merged_pdf = options.output / "merged.pdf"
    merge(options.input, merged_pdf)

    # Without --split the merge is the whole job.
    if not options.split:
        return
    page_ranges = parse_ranges(options.split)
    split(merged_pdf, options.output / "splits", page_ranges)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
Guides in This Section
- Batch Merge PDFs with a Python Script — argparse CLI for merging all PDFs in a folder with natural sort and error recovery
- Split a PDF by Page Ranges with Python — parse a ranges string, off-by-one pitfalls, split every N pages, split on bookmarks
Related
- Generating PDF Reports Dynamically — use merge operations to assemble cover pages, body sections, and appendices into final reports
- Watermarking and Securing PDFs — apply password protection and watermarks after the merge/split step
- Extracting Tables from PDFs — coordinate-based parsing of the content inside the pages you are assembling