Handle Multi-Page PDF Tables in pandas

A table that spans multiple PDF pages produces a repeated header row at the top of each new page. When you extract per-page DataFrames with pdfplumber or camelot and call pd.concat() without filtering, those header strings appear as data rows scattered through the combined DataFrame. Downstream operations — pd.to_numeric, groupby, any dtype coercion — fail or silently produce NaN wherever a header row landed.

The same symptom has two other causes: a page break splits a single row across two pages (the row appears fragmented in both frames), and inconsistent column counts across pages (some pages export N columns, others N+1 due to a merged-cell footnote column). Each variant needs a slightly different fix.

Root Cause

PDF table extractors treat each page independently. They do not know that pages 1 through 8 hold one logical table. Each page's extracted frame has its own header row — usually promoted from the first data row extracted on that page. When those frames are concatenated, the header strings ("Date", "Amount", etc.) become ordinary string values in whatever rows they occupy.

# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd

PDF_PATH = Path("data/report.pdf")


def naive_concat(path: Path) -> pd.DataFrame:
    """Demonstrates the duplicated-header problem — do not use as-is."""
    frames = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            raw = page.extract_tables()
            for tbl in (raw or []):
                if tbl and len(tbl) > 1:
                    # First raw row becomes columns; already promoted
                    frames.append(pd.DataFrame(tbl[1:], columns=tbl[0]))
    return pd.concat(frames, ignore_index=True)


if __name__ == "__main__":
    df = naive_concat(PDF_PATH)
    # Spot the header-as-data problem:
    print(df[df["Amount"] == "Amount"])   # should return empty; if not, headers leaked

Run the diagnostic above first. If the filter returns rows, repeated headers are present and need to be removed before any further processing.

Fix: Drop Repeated Header Rows, Standardize Columns, and Concat

# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd

PDF_PATH = Path("data/report.pdf")


def extract_frames(path: Path) -> list[pd.DataFrame]:
    """Extract one DataFrame per page table. First raw row → column names."""
    frames: list[pd.DataFrame] = []
    try:
        with pdfplumber.open(path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                for raw in (page.extract_tables() or []):
                    if not raw or len(raw) < 2:
                        continue
                    # Normalize header: strip whitespace, replace None
                    header = [
                        str(c).strip() if c else f"col_{i}"
                        for i, c in enumerate(raw[0])
                    ]
                    df = pd.DataFrame(raw[1:], columns=header)
                    df["_page"] = page_num   # audit column — drop before export
                    frames.append(df)
    except Exception as e:
        raise RuntimeError(f"Extraction failed: {e}") from e
    return frames


def drop_repeated_headers(
    frames: list[pd.DataFrame],
    canonical_cols: list[str] | None = None,
) -> pd.DataFrame:
    """
    Remove rows where every non-audit cell matches its column name.
    Standardize all frames to the canonical column set before concat.

    Args:
        frames:         List of per-page DataFrames from extract_frames().
        canonical_cols: Expected column names. Defaults to the first frame's columns.
    """
    if not frames:
        return pd.DataFrame()

    # Determine canonical columns from first frame if not provided
    non_audit = [c for c in frames[0].columns if not c.startswith("_")]
    if canonical_cols is None:
        canonical_cols = non_audit

    cleaned: list[pd.DataFrame] = []
    for df in frames:
        # Drop rows where every data cell equals its column name (repeated header)
        data_cols = [c for c in df.columns if not c.startswith("_")]
        header_mask = df[data_cols].apply(
            lambda row: all(             # True if every cell is the column name
                str(v).strip() == str(col).strip()
                for v, col in zip(row, data_cols)
            ),
            axis=1,
        )
        df = df[~header_mask].copy()    # remove header rows

        # Standardize to canonical column set; fill missing columns with NaN
        audit = [c for c in df.columns if c.startswith("_")]
        for col in canonical_cols:
            if col not in df.columns:
                df[col] = pd.NA         # add missing column as empty
        df = df[canonical_cols + audit] # reorder to canonical + audit
        cleaned.append(df)

    # ignore_index=True re-sequences the integer index across all pages
    combined = pd.concat(cleaned, ignore_index=True)

    # Forward-fill group-label columns (merged cells appear only on first data row)
    combined = combined.ffill()

    return combined


if __name__ == "__main__":
    frames = extract_frames(PDF_PATH)
    print(f"Extracted {len(frames)} page frame(s)")

    combined = drop_repeated_headers(frames)
    print(f"Combined shape: {combined.shape}")
    print(combined.dtypes)

    # Verify no header strings leaked
    for col in combined.columns:
        if not col.startswith("_"):
            leaks = combined[combined[col].astype(str).str.strip() == col]
            if not leaks.empty:
                print(f"Warning: header leak in column '{col}': {len(leaks)} row(s)")

    combined.drop(columns=["_page"], errors="ignore").to_csv(
        "output/combined.csv", index=False
    )

Key changes from the naive version:

  • header_mask compares every cell value to its column name — a row matches only if all cells are header strings, which avoids false positives on data rows that happen to contain a column's name in one cell.
  • ignore_index=True in pd.concat prevents index collisions; each page's index starts at 0, so without it the combined frame has duplicate index values.
  • The column standardization loop adds missing columns as pd.NA rather than raising a KeyError during concat when column counts differ across pages.

Variant Fix 1: Page-Break Row Splits (Fragmented Rows)

Some PDF generators cut a table row at a page break and continue it on the next page. The first page ends with a partial row (some cells empty), and the next page starts with the row's continuation — not a header.

# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd

PDF_PATH = Path("data/split_rows.pdf")
# Assume the last column on each page is "Notes" — often the column that gets split


def stitch_split_rows(frames: list[pd.DataFrame], key_col: str) -> pd.DataFrame:
    """
    Merge a row that was split across a page break.
    Identifies the fragment by a null value in key_col on the first row
    of a subsequent frame — that null signals continuation, not a new row.

    Args:
        frames:   Per-page DataFrames, already header-deduped.
        key_col:  A column that is always populated on a real new row (e.g., "ID").
    """
    merged: list[pd.DataFrame] = []
    carry: pd.Series | None = None   # partial row carried from previous page

    for df in frames:
        rows = []
        for _, row in df.iterrows():
            if carry is not None:
                # First row of this frame: if key_col is null, it's a continuation
                if pd.isna(row.get(key_col)):
                    # Merge: prefer non-null values from the continuation row
                    combined_row = carry.combine_first(row)  # carry wins for non-null
                    rows.append(combined_row)
                    carry = None
                    continue
                else:
                    rows.append(carry)   # carry was complete; flush it
                    carry = None

            # Check if this row itself is a split (last cell null → continues next page)
            if pd.isna(row.iloc[-1]) and not pd.isna(row.get(key_col, pd.NA)):
                carry = row.copy()      # hold it until next page confirms continuation
            else:
                rows.append(row)

        if rows:
            merged.append(pd.DataFrame(rows))

    if carry is not None:
        merged.append(pd.DataFrame([carry]))   # flush final carry

    return pd.concat(merged, ignore_index=True) if merged else pd.DataFrame()


if __name__ == "__main__":
    frames = extract_frames(PDF_PATH)       # reuse function from main fix above
    frames = [drop_repeated_headers([f]) for f in frames]
    result = stitch_split_rows(frames, key_col="ID")
    print(result)

This approach works when a consistent key column (e.g., a row ID or date) is always populated on a genuine new data row. If no such column exists, fall back to detecting the split by checking whether the last N cells of the last row on a page are empty.

Variant Fix 2: Inconsistent Column Counts Across Pages

A PDF where some pages add a "footnote" or "change" column causes per-page DataFrames with differing widths. pd.concat with sort=False pads missing columns with NaN, but column order becomes unpredictable.

# pip install pandas
import pandas as pd


def unify_columns(
    frames: list[pd.DataFrame],
    canonical_cols: list[str],
) -> pd.DataFrame:
    """
    Force all frames to exactly canonical_cols, dropping extras and
    inserting NaN columns for any that are absent.
    """
    unified: list[pd.DataFrame] = []
    for df in frames:
        # Drop any columns not in canonical set
        extra = [c for c in df.columns if c not in canonical_cols and not c.startswith("_")]
        df = df.drop(columns=extra, errors="ignore")
        # Add missing canonical columns
        for col in canonical_cols:
            if col not in df.columns:
                df[col] = pd.NA
        df = df[canonical_cols]   # enforce column order
        unified.append(df)
    return pd.concat(unified, ignore_index=True)


if __name__ == "__main__":
    # Example: pages 3 and 7 have an extra "Revision" column
    frames = extract_frames(Path("data/inconsistent_cols.pdf"))
    frames_cleaned = [drop_repeated_headers([f]) for f in frames]
    CANONICAL = ["Date", "Invoice", "Amount", "Currency", "Status"]
    result = unify_columns(frames_cleaned, CANONICAL)
    print(result.shape)
    assert list(result.columns) == CANONICAL, "Column order mismatch"

Verification

After any fix, confirm that no header strings appear in data columns and that the row count matches a manual count from the source PDF:

# pip install pandas
import pandas as pd


def verify_no_header_leaks(df: pd.DataFrame) -> None:
    """Assert that no cell in a data column exactly matches its column name."""
    leaks_found = False
    for col in df.columns:
        if col.startswith("_"):
            continue
        leaks = df[df[col].astype(str).str.strip() == str(col).strip()]
        if not leaks.empty:
            print(f"Header leak in '{col}': rows {leaks.index.tolist()}")
            leaks_found = True
    if not leaks_found:
        print(f"OK — no header leaks. Shape: {df.shape}")
    else:
        raise AssertionError("Header rows present in data — re-run drop_repeated_headers()")


if __name__ == "__main__":
    df = pd.read_csv("output/combined.csv")
    verify_no_header_leaks(df)
    # Compare against a manually counted total from the PDF
    EXPECTED_ROWS = 243   # set from your source PDF
    assert df.shape[0] == EXPECTED_ROWS, (
        f"Row count mismatch: got {df.shape[0]}, expected {EXPECTED_ROWS}"
    )
    print("Verification passed.")

Run verify_no_header_leaks after every pipeline run, not just during development. PDF structure changes silently when the upstream source system upgrades its PDF generator.

Part of Extracting PDF Data into pandas.