Merging and Splitting PDF Documents

Manually assembling multi-file reports or slicing a 300-page export into per-client packets does not scale. Scripts that call writer.add_page() in a loop drop bookmarks, break internal links, and exhaust memory on anything beyond a few dozen pages. This guide covers the correct primitives in pypdf — PdfReader, PdfWriter, and append() — plus page reordering, outline preservation, and streaming patterns for large batches.

The same assembly layer powers Generating PDF Reports Dynamically (injecting cover pages and appendices) and feeds the access-control step in Watermarking and Securing PDFs.

Prerequisites

# pip install pypdf pikepdf
pip install pypdf            # pure-Python; covers 95 % of use cases
pip install pikepdf          # C++ wrapper around QPDF; use for repair/large-scale work

Create test fixtures quickly:

# Generate three small test PDFs with ImageMagick (or use your own files)
for i in 1 2 3; do
  convert -size 595x842 xc:white -pointsize 48 \
    -annotate +220+420 "Page set $i" "test_input_$i.pdf"
done

Inspect Before You Process

Before merging or splitting, confirm page count, encryption status, and outline depth:

# pip install pypdf
from pypdf import PdfReader
from pathlib import Path

def inspect_pdf(path: Path) -> dict:
    """Return basic structural metadata for a PDF.

    Args:
        path: Location of the PDF to inspect.

    Returns:
        On success, a dict with:
          * ``pages`` - number of pages,
          * ``encrypted`` - whether the file is encrypted,
          * ``outline_items`` - length of the top-level outline list (a
            nested sub-list counts as a single entry),
          * ``title`` - the ``/Title`` metadata value, or ``""``.
        For an encrypted file that has not been decrypted, ``encrypted`` is
        True, ``pages`` and ``outline_items`` are None and ``title`` is
        ``""``.
        If the file cannot be opened or parsed, ``{"error": <message>}``.
    """
    try:
        # Imported here so the function only relies on the PdfReader import
        # that accompanies it.
        from pypdf.errors import FileNotDecryptedError

        reader = PdfReader(path)
        encrypted = reader.is_encrypted
        try:
            return {
                "pages": len(reader.pages),
                "encrypted": encrypted,
                "outline_items": len(reader.outline),
                "title": (reader.metadata or {}).get("/Title", ""),
            }
        except FileNotDecryptedError:
            # A locked file cannot expose pages, outline or metadata until
            # reader.decrypt(password) succeeds. Report it as encrypted
            # rather than as a broken file, so callers can tell the two apart.
            return {
                "pages": None,
                "encrypted": True,
                "outline_items": None,
                "title": "",
            }
    except Exception as exc:
        return {"error": str(exc)}

if __name__ == "__main__":
    # Report on every PDF in ./input_docs, in sorted order.
    input_dir = Path("./input_docs")
    for pdf_path in sorted(input_dir.glob("*.pdf")):
        report = inspect_pdf(pdf_path)
        print(pdf_path.name, report)

Run this on every input directory before the first merge. Files where encrypted=True need reader.decrypt(password) before any page access; files with error keys should be skipped or repaired with pikepdf.

Core Workflow: Merging

Merge and Split PDF workflow Diagram showing multiple source PDFs flowing into PdfWriter.append() to produce a merged PDF, then PdfWriter per page range producing split output files. doc_a.pdf doc_b.pdf doc_c.pdf PdfWriter .append(reader) merged.pdf ─── SPLIT ─── PdfWriter × range one writer per slice part1 part2 part3

Step 1 — Use append(), not add_page()

PdfWriter.append(reader) recursively imports page resources, form fields, annotations, and the document outline. add_page() does a shallow copy that silently drops all of those.

# pip install pypdf
from pypdf import PdfWriter, PdfReader
from pathlib import Path
import logging

# Emit INFO and above as "LEVEL: message"; the module logger is shared by
# the functions below.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def merge_pdfs(input_dir: Path, output_path: Path) -> None:
    """Combine every ``*.pdf`` in *input_dir* into one file at *output_path*.

    Sources are appended in sorted order with their outlines imported. If
    the directory holds no PDFs, a warning is logged and nothing is written.
    Any exception is logged and re-raised; the writer is closed either way.
    """
    merged = PdfWriter()
    try:
        sources = sorted(input_dir.glob("*.pdf"))
        if sources:
            for source in sources:
                logger.info("Appending %s", source.name)
                # Each source handle is closed as soon as it has been appended.
                with open(source, "rb") as stream:
                    merged.append(PdfReader(stream), import_outline=True)

            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, "wb") as sink:
                merged.write(sink)
            logger.info("Merged %d files → %s", len(sources), output_path)
        else:
            logger.warning("No PDF files found in %s", input_dir)
    except Exception as exc:
        logger.error("Merge failed: %s", exc)
        raise
    finally:
        merged.close()

if __name__ == "__main__":
    # Merge everything under ./input_docs into ./output/merged.pdf.
    merge_pdfs(
        input_dir=Path("./input_docs"),
        output_path=Path("./output/merged.pdf"),
    )

Open the file inside with open() and pass the file object (not just the path) to PdfReader. This guarantees the OS releases the file descriptor after each iteration — critical on Windows where open handles block subsequent reads.

Step 2 — Preserve and inspect the outline

# pip install pypdf
from pypdf import PdfReader
from pathlib import Path

def print_outline(reader: PdfReader, items=None, depth: int = 0) -> None:
    """Recursively print the document outline (bookmarks).

    Args:
        reader: Reader whose outline is printed and used to resolve pages.
        items: Outline (sub)list to print; defaults to ``reader.outline``.
        depth: Current nesting level; each level indents by two spaces.

    Each entry is printed as ``[pN] Title`` with a 1-based page number, or
    as ``[p?] Title`` when its destination cannot be resolved to a page.
    """
    if items is None:
        items = reader.outline
    for item in items:
        if isinstance(item, list):
            # A nested list holds the children of the preceding entry.
            print_outline(reader, item, depth + 1)
            continue
        page_index = reader.get_destination_page_number(item)
        # The lookup can come back as None when the bookmark does not point
        # at a page of this document; adding 1 to it would raise TypeError.
        # Negative values are treated the same way defensively.
        if page_index is None or page_index < 0:
            label = "p?"
        else:
            label = f"p{page_index + 1}"  # 1-based
        print("  " * depth + f"[{label}] {item.title}")

if __name__ == "__main__":
    # Show the bookmark tree of the merged document.
    merged_reader = PdfReader(Path("./output/merged.pdf"))
    print_outline(merged_reader)

After merging, run this to confirm outlines from every source document are present. If a source had no outline, that is expected; if a source had one and it is missing, you passed import_outline=False (the default before pypdf 3.x — pin pypdf>=3.0).

Step 3 — Reorder pages before writing

Sometimes you need to rearrange pages without re-reading every file. Read the source once, then add its pages to a fresh PdfWriter in the order you want:

# pip install pypdf
from pypdf import PdfWriter, PdfReader
from pathlib import Path

def reorder_pages(input_path: Path, output_path: Path, new_order: list[int]) -> None:
    """Write the pages of *input_path* to *output_path* in a new sequence.

    ``new_order`` holds 1-based page numbers; ``[3, 1, 2]`` emits page 3
    first, then pages 1 and 2. A number outside the document raises
    ValueError. Failures are printed and re-raised; the writer is always
    closed.
    """
    source = PdfReader(input_path)
    result = PdfWriter()
    page_count = len(source.pages)
    try:
        for wanted in new_order:
            if wanted < 1 or wanted > page_count:
                raise ValueError(f"Page {wanted} out of range (1–{page_count})")
            zero_based = wanted - 1
            result.add_page(source.pages[zero_based])
        with open(output_path, "wb") as sink:
            result.write(sink)
        print(f"Reordered {len(new_order)} pages → {output_path}")
    except Exception as exc:
        print(f"Reorder failed: {exc}")
        raise
    finally:
        result.close()

if __name__ == "__main__":
    # Example: move page 3 of ./source.pdf to the front.
    source_pdf = Path("./source.pdf")
    target_pdf = Path("./reordered.pdf")
    reorder_pages(source_pdf, target_pdf, [3, 1, 2])

Core Workflow: Splitting

Range-based split (1-based UI, 0-based pypdf)

The single most common indexing mistake: PDF viewers show page 1, pypdf stores it at index 0. Always subtract 1 when converting user-visible page numbers to slice indices.

# pip install pypdf
from pypdf import PdfReader, PdfWriter
from pathlib import Path

def split_pdf_by_ranges(
    input_path: Path,
    output_dir: Path,
    ranges: list[tuple[int, int]],
) -> list[Path]:
    """Split a PDF by 1-based, inclusive page ranges.

    ``ranges=[(1, 3), (4, 8)]`` produces two output files named
    ``<stem>_part01.pdf`` and ``<stem>_part02.pdf`` inside *output_dir*
    (created if missing).

    Args:
        input_path: PDF to split.
        output_dir: Directory that receives the parts.
        ranges: ``(start, end)`` pairs in user-visible page numbers.

    Returns:
        The paths of the files that were written, in range order.

    Raises:
        ValueError: If any range starts below 1, ends past the last page,
            or has ``start > end``. All ranges are checked before anything
            is written, so an invalid range never leaves partial output.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    ranges = list(ranges)  # iterated twice below
    created: list[Path] = []
    try:
        with open(input_path, "rb") as fh:
            reader = PdfReader(fh)
            total = len(reader.pages)

            # Validate everything first so one bad range cannot abort the
            # job after earlier parts have already been written.
            for start, end in ranges:
                if start < 1 or end > total or start > end:
                    raise ValueError(
                        f"Invalid range ({start}–{end}) for {total}-page document"
                    )

            for idx, (start, end) in enumerate(ranges, start=1):
                out_path = output_dir / f"{input_path.stem}_part{idx:02d}.pdf"
                writer = PdfWriter()
                try:
                    for page_num in range(start - 1, end):   # 0-based
                        writer.add_page(reader.pages[page_num])
                    with open(out_path, "wb") as out:
                        writer.write(out)
                finally:
                    # Close the writer even if adding or writing fails.
                    writer.close()
                created.append(out_path)
                print(f"Created: {out_path}  ({end - start + 1} pages)")

    except Exception as exc:
        print(f"Split failed: {exc}")
        raise
    return created

if __name__ == "__main__":
    # Example: cut the annual report into three parts.
    report_pdf = Path("./annual_report.pdf")
    parts_dir = Path("./output/splits")
    page_ranges = [(1, 3), (4, 10), (11, 20)]
    split_pdf_by_ranges(report_pdf, parts_dir, page_ranges)

For a command-line interface with range-string parsing ("1-3,4-10"), see Split a PDF by Page Ranges with Python.

Edge Cases and Variants

Encrypted source files

# pip install pypdf
from pypdf import PdfReader, PdfWriter
from pypdf.errors import FileNotDecryptedError
from pathlib import Path

def merge_with_password(paths: list[Path], password: str, output: Path) -> None:
    """Merge PDFs into *output*, unlocking encrypted ones with *password*.

    An encrypted source whose decrypt() result equals 0 is reported and
    skipped; every other source is appended with its outline. A
    FileNotDecryptedError is printed and re-raised. The writer is always
    closed.
    """
    merged = PdfWriter()
    try:
        for pdf_path in paths:
            with open(pdf_path, "rb") as stream:
                source = PdfReader(stream)
                # decrypt() is only attempted on encrypted sources.
                if source.is_encrypted and source.decrypt(password) == 0:
                    print(f"[SKIP] Wrong password for {pdf_path.name}")
                    continue
                merged.append(source, import_outline=True)
        with open(output, "wb") as sink:
            merged.write(sink)
    except FileNotDecryptedError as exc:
        print(f"Decryption failed: {exc}")
        raise
    finally:
        merged.close()

After merging, the output PDF is unencrypted. Re-apply protection as described in Watermarking and Securing PDFs before distributing.

Mismatched page sizes

When sources have different /MediaBox dimensions (e.g., mixing A4 and Letter), the merged PDF preserves each page's original size. If uniform sizing is required, copy the target /MediaBox onto each page object after appending:

# pip install pypdf
from pypdf import PdfWriter, PdfReader
from pypdf.generic import RectangleObject
from pathlib import Path

A4 = RectangleObject((0, 0, 595.28, 841.89))   # points

def merge_normalize_size(paths: list[Path], output: Path) -> None:
    """Merge *paths* into *output* and set every page's MediaBox to A4.

    Only the page's ``mediabox`` is replaced; this function does not scale
    or reposition page content.

    Args:
        paths: Source PDFs, appended in the given order.
        output: Destination file for the merged document.
    """
    writer = PdfWriter()
    try:
        for p in paths:
            with open(p, "rb") as fh:
                writer.append(PdfReader(fh))
        # Normalize all pages to A4 after appending
        for page in writer.pages:
            # Give each page its own rectangle. Assigning the module-level
            # A4 object directly would make every page (and the constant)
            # share one mutable array, so editing one box would change all.
            page.mediabox = RectangleObject(A4)
        with open(output, "wb") as out:
            writer.write(out)
    finally:
        writer.close()

Split every N pages

# pip install pypdf
from pypdf import PdfReader, PdfWriter
from pathlib import Path
import math

def split_every_n(input_path: Path, output_dir: Path, n: int) -> list[Path]:
    """Split a PDF into consecutive chunks of at most *n* pages each.

    Chunks are written to *output_dir* (created if missing) as
    ``<stem>_chunk01.pdf``, ``<stem>_chunk02.pdf``, ...; the last chunk
    holds the remaining pages.

    Args:
        input_path: PDF to split.
        output_dir: Directory that receives the chunk files.
        n: Maximum number of pages per chunk; must be at least 1.

    Returns:
        The paths of the chunk files, in page order.

    Raises:
        ValueError: If *n* is less than 1. Raised before any file or
            directory is touched.
    """
    # Reject bad sizes before any side effect: n == 0 would divide by zero
    # and a negative n would silently produce no output at all.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    output_dir.mkdir(parents=True, exist_ok=True)
    created: list[Path] = []
    with open(input_path, "rb") as fh:
        reader = PdfReader(fh)
        total = len(reader.pages)
        # Stepping by n yields the 0-based first page of each chunk.
        for chunk_no, start in enumerate(range(0, total, n), start=1):
            end = min(start + n, total)
            out_path = output_dir / f"{input_path.stem}_chunk{chunk_no:02d}.pdf"
            writer = PdfWriter()
            try:
                for i in range(start, end):
                    writer.add_page(reader.pages[i])
                with open(out_path, "wb") as out:
                    writer.write(out)
            finally:
                # Close the writer even if adding or writing fails.
                writer.close()
            created.append(out_path)
    return created

Validation

After every merge or split, assert structural integrity:

# pip install pypdf
from pypdf import PdfReader
from pathlib import Path

def validate_pdf_output(
    output_path: Path,
    expected_pages: int | None = None,
) -> bool:
    """Return True if the PDF opens cleanly and page count matches expectation."""
    try:
        reader = PdfReader(output_path)
        actual = len(reader.pages)
        if expected_pages is not None and actual != expected_pages:
            print(f"FAIL: expected {expected_pages} pages, got {actual}")
            return False
        print(f"OK: {output_path.name}  ({actual} pages)")
        return True
    except Exception as exc:
        print(f"FAIL: {output_path.name}  ({exc})")
        return False

if __name__ == "__main__":
    # After merging 3 files with 5+7+8 pages:
    merged_pdf = Path("./output/merged.pdf")
    validate_pdf_output(merged_pdf, expected_pages=20)

Also open a random output file in a PDF viewer after running automated tests — automated checks catch structural errors but not rendering artifacts from corrupt font streams.

Performance and Scale

Memory model: PdfWriter accumulates page references in memory but does not load pixel data. Peak memory is proportional to the largest single page's resource dictionary, not the sum of all pages. A merge of 500 one-page documents uses far less memory than a merge of 5 documents with embedded high-resolution images.

Streaming large batches: For batches over 200 files, avoid holding all PdfReader objects open simultaneously. The pattern in the merge snippet above — open, append, close inside with — is correct. Avoid readers = [PdfReader(p) for p in files] patterns.

Chunked intermediate merge: For 1 000+ files, merge in chunks of 100 to temporary files, then do a final merge of the temporaries. This caps peak RAM and isolates corruption to specific chunks.

# pip install pypdf
from pypdf import PdfWriter, PdfReader
from pathlib import Path
import tempfile, shutil

def chunked_merge(all_files: list[Path], output: Path, chunk_size: int = 100) -> None:
    """Merge *all_files* into *output* in two passes.

    The inputs are first merged in groups of *chunk_size* into temporary
    PDFs, and those intermediates are then merged into the final file. The
    temporary directory is removed afterwards, whether or not the merge
    succeeded.
    """
    scratch_dir = Path(tempfile.mkdtemp())
    try:
        partials: list[Path] = []
        offsets = range(0, len(all_files), chunk_size)
        for chunk_no, offset in enumerate(offsets):
            partial_path = scratch_dir / f"chunk_{chunk_no:04d}.pdf"
            batch_writer = PdfWriter()
            for source in all_files[offset:offset + chunk_size]:
                with open(source, "rb") as stream:
                    batch_writer.append(PdfReader(stream))
            with open(partial_path, "wb") as sink:
                batch_writer.write(sink)
            batch_writer.close()
            partials.append(partial_path)

        # Second pass: stitch the intermediate files together.
        combined = PdfWriter()
        for partial_path in partials:
            with open(partial_path, "rb") as stream:
                combined.append(PdfReader(stream))
        with open(output, "wb") as sink:
            combined.write(sink)
        combined.close()
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)

pikepdf for repair: If source files have corrupted cross-reference tables (PdfReadError: EOF marker not found), open them first with pikepdf (pikepdf.open(path, allow_overwriting_input=False)), save a repaired copy, then process with pypdf.

Troubleshooting

Error | Root cause | Fix
PdfReadError: EOF marker not found | Truncated or corrupted file | Open and re-save with pikepdf; wrap in try/except PdfReadError to skip in batch
FileNotDecryptedError | Accessing pages of encrypted PDF before decrypting | Call reader.decrypt(password) and check the return value (0 = wrong password)
PermissionError: [Errno 13] | File handle still open (Windows) | Always use with open(path, "rb") as fh: reader = PdfReader(fh) pattern
Bookmarks missing in merged output | import_outline=False (old default) or using add_page() instead of append() | Pass import_outline=True to append(); never use add_page() for full-document merges
Different page sizes in output | Source documents have mixed /MediaBox values | Normalize page.mediabox after appending, or accept per-page sizes
Output file is 0 bytes | writer.write() called before any pages appended | Check len(writer.pages) > 0 before writing

Complete Script

#!/usr/bin/env python3
# pip install pypdf
"""
merge_split.py — merge all PDFs in a folder, then split the result by ranges.
Usage: python merge_split.py --input ./docs --output ./out --split 1-5,6-10
"""
import argparse
import re
from pathlib import Path

from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError


def natural_key(p: Path) -> list:
    """Return a sort key that orders file names naturally (``2`` before ``10``).

    The file name is split into alternating text and digit runs; text runs
    are lower-cased and digit runs are converted to ``int``. The key always
    starts with a (possibly empty) text run, so keys of different files
    compare element by element without mixing ``str`` and ``int``.
    """
    tokens = re.split(r"(\d+)", p.name)
    # With a capturing group, re.split() puts the captured digit runs at the
    # odd indices. Using the position instead of str.isdigit() matters:
    # isdigit() is also true for characters such as "²" that \d does not
    # match and int() cannot parse.
    return [
        int(token) if index % 2 else token.lower()
        for index, token in enumerate(tokens)
    ]


def merge(input_dir: Path, output: Path) -> int:
    """Merge every ``*.pdf`` in *input_dir* into *output*.

    Files are appended in natural name order with their outlines imported.
    A file that raises PdfReadError is reported and skipped. If *output*
    itself lies in *input_dir* (left over from an earlier run), it is not
    used as an input.

    Args:
        input_dir: Directory containing the source PDFs.
        output: Destination file; its parent directory is created if needed.

    Returns:
        Page count of the merged file.
    """
    target = output.resolve()
    # Leave out a previous result so a re-run does not merge the old output
    # into the new one.
    files = [
        f
        for f in sorted(input_dir.glob("*.pdf"), key=natural_key)
        if f.resolve() != target
    ]
    writer = PdfWriter()
    merged = 0
    try:
        for f in files:
            try:
                with open(f, "rb") as fh:
                    writer.append(PdfReader(fh), import_outline=True)
            except PdfReadError as exc:
                print(f"[SKIP] {f.name}: {exc}")
            else:
                merged += 1
        output.parent.mkdir(parents=True, exist_ok=True)
        with open(output, "wb") as out:
            writer.write(out)
        count = len(writer.pages)
    finally:
        # Close the writer even when writing fails.
        writer.close()
    # Report only the files that actually went in, not the ones skipped.
    print(f"Merged {merged} files → {output}  ({count} pages)")
    return count


def parse_ranges(spec: str) -> list[tuple[int, int]]:
    """Parse '1-5,6-10' → [(1,5),(6,10)].

    Segments are comma-separated; each is either ``start-end`` or a single
    page number ``n`` (returned as ``(n, n)``). Surrounding whitespace is
    ignored, and empty segments (for example from a trailing comma or an
    empty *spec*) are skipped.

    Raises:
        ValueError: If a non-empty segment is not a number or a
            ``start-end`` pair of numbers.
    """
    result: list[tuple[int, int]] = []
    for raw in spec.split(","):
        part = raw.strip()
        if not part:
            # Tolerate "1-3," and "" instead of failing on int("").
            continue
        first, dash, last = part.partition("-")
        try:
            start = int(first)
            end = int(last) if dash else start
        except ValueError:
            raise ValueError(f"Invalid page range: {part!r}") from None
        result.append((start, end))
    return result


def split(input_path: Path, output_dir: Path, ranges: list[tuple[int, int]]) -> None:
    """Write one PDF per 1-based, inclusive page range of *input_path*.

    Parts are written to *output_dir* (created if missing) as
    ``<stem>_partNN.pdf``, where NN is the position of the range in
    *ranges*. A range that does not satisfy ``1 <= start <= end <= total``
    is reported and skipped; the remaining ranges are still processed.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(input_path, "rb") as fh:
        reader = PdfReader(fh)
        total = len(reader.pages)
        for idx, (start, end) in enumerate(ranges, 1):
            if not (1 <= start <= end <= total):
                print(f"[SKIP] Range ({start}-{end}) invalid for {total}-page doc")
                continue
            out_path = output_dir / f"{input_path.stem}_part{idx:02d}.pdf"
            writer = PdfWriter()
            try:
                for i in range(start - 1, end):  # 1-based inclusive → 0-based
                    writer.add_page(reader.pages[i])
                with open(out_path, "wb") as out:
                    writer.write(out)
            finally:
                # Close the writer even if adding or writing fails.
                writer.close()
            print(f"  part{idx:02d}: pages {start}–{end} → {out_path.name}")


def main() -> None:
    """Command-line entry point: merge a folder, then optionally split.

    Merges the PDFs in ``--input`` into ``<output>/merged.pdf``. When
    ``--split`` is given, the merged file is additionally split by those
    ranges into ``<output>/splits``.
    """
    parser = argparse.ArgumentParser(
        description="Merge folder of PDFs, optionally split result"
    )
    parser.add_argument(
        "--input", required=True, type=Path, help="Directory of source PDFs"
    )
    parser.add_argument(
        "--output", required=True, type=Path, help="Output directory"
    )
    parser.add_argument(
        "--split", default="", help="Page ranges to split, e.g. '1-5,6-10'"
    )
    options = parser.parse_args()

    out_dir = options.output
    merged_pdf = out_dir / "merged.pdf"
    merge(options.input, merged_pdf)

    # An empty --split value means "merge only".
    if not options.split:
        return
    page_ranges = parse_ranges(options.split)
    split(merged_pdf, out_dir / "splits", page_ranges)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()

Guides in This Section

Part of Automating PDF Extraction & Generation.

Explore next