Extracting PDF Data into pandas

PDF files do not store tabular data the way spreadsheets do. A "table" in a PDF is usually a set of positioned text characters that happen to look like a grid — there is no schema, no delimiter, and no dtype. Pulling those characters into a pandas DataFrame requires choosing the right extractor for the PDF's internal structure, promoting headers, handling repeated header rows across pages, and coercing strings into proper numeric and datetime types before any analysis is possible.

This guide implements the complete workflow: classify the PDF structure, route to pdfplumber or camelot based on what you find, build per-page DataFrames, concatenate them into a single tidy frame, and normalize dtypes so the result is analysis-ready.

Prerequisites

Install Python dependencies and verify the system-level requirements camelot needs:

# System dependencies (Ubuntu/Debian — camelot lattice mode requires Ghostscript)
sudo apt-get install ghostscript libsm6 libxext6

# Python packages
pip install pdfplumber "camelot-py[cv]" pandas

# Verify camelot can see Ghostscript
python -c "import camelot; print(camelot.__version__)"

If the camelot import raises OSError: ghostscript not found, the Ghostscript binary is missing or not on PATH. Lattice mode will not work without it; stream mode and pdfplumber will. For a full fix, see Fix Camelot Import Error on Linux.

Place a representative PDF at data/report.pdf to follow along. A multi-page financial statement or invoice works well.

Step 1: Classify the PDF Before Extracting

Not every PDF responds to the same extractor. Running a quick diagnostic before extraction avoids silent failures — camelot lattice returns empty results on PDFs without vector lines, while pdfplumber's default table finder fails on scanned pages.

# pip install pdfplumber
from pathlib import Path
import pdfplumber

PDF_PATH = Path("data/report.pdf")


def classify_pdf(path: Path) -> str:
    """
    Return 'lattice', 'stream', or 'ocr' based on the first page's content.
    'lattice'  — vector lines found; use camelot lattice
    'stream'   — text layer present, no vector lines; use camelot stream or pdfplumber
    'ocr'      — no text and no lines; rasterized scan, needs OCR
    """
    try:
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            has_text = bool((page.extract_text() or "").strip())
            has_lines = bool(page.lines or page.rects)
    except Exception as e:
        raise RuntimeError(f"Cannot open {path}: {e}") from e

    if not has_text and not has_lines:
        return "ocr"
    return "lattice" if has_lines else "stream"


if __name__ == "__main__":
    mode = classify_pdf(PDF_PATH)
    print(f"Recommended extraction mode: {mode}")

The check on page.lines and page.rects distinguishes PDFs with drawn borders (camelot lattice) from those with whitespace-delimited columns (camelot stream or pdfplumber). For scanned documents the result is "ocr" — see How to Extract Tables from Scanned PDFs for that path.

Step 2: Extract Tables Per Page

Extract each page independently and collect the raw DataFrames in a list. Keeping pages separate at this stage makes header deduplication straightforward in the next step.

pdfplumber — whitespace and loosely bordered tables

# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd

PDF_PATH = Path("data/report.pdf")


def extract_pages_pdfplumber(path: Path) -> list[pd.DataFrame]:
    """Extract tables from every page using pdfplumber."""
    page_frames: list[pd.DataFrame] = []
    try:
        with pdfplumber.open(path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                raw_tables = page.extract_tables(table_settings={
                    "vertical_strategy": "lines_strict",
                    "horizontal_strategy": "lines_strict",
                    "snap_tolerance": 3,
                }) or []
                for raw in raw_tables:
                    if not raw or len(raw) < 2:
                        continue
                    # Promote first row as header; replace None with placeholder
                    header = [
                        str(c).strip() if c else f"col_{i}"
                        for i, c in enumerate(raw[0])
                    ]
                    df = pd.DataFrame(raw[1:], columns=header)
                    df["_source_page"] = page_num       # audit column
                    page_frames.append(df)
    except Exception as e:
        raise RuntimeError(f"pdfplumber extraction failed on {path}: {e}") from e
    return page_frames


if __name__ == "__main__":
    frames = extract_pages_pdfplumber(PDF_PATH)
    print(f"Extracted {len(frames)} table(s) across pages")
    for df in frames:
        print(df.shape, df.columns.tolist())

If columns are still misaligned after extraction — values appearing in the wrong column — that is coordinate drift. The extract_words approach in Fix PDF Text Extraction Alignment Issues resolves it by sorting text objects by their x/y positions directly.

camelot — bordered and whitespace-column tables

# pip install "camelot-py[cv]" pandas
from pathlib import Path
import camelot
import pandas as pd

PDF_PATH = Path("data/report.pdf")


def extract_pages_camelot(
    path: Path,
    pages: str = "1-end",
    flavor: str = "lattice",
) -> list[pd.DataFrame]:
    """Extract tables using camelot, one DataFrame per detected table."""
    try:
        table_list = camelot.read_pdf(
            str(path),
            pages=pages,
            flavor=flavor,
            process_background=(flavor == "lattice"),  # find lines on coloured fills
        )
    except Exception as e:
        raise RuntimeError(f"camelot ({flavor}) failed on {path}: {e}") from e

    if table_list.n == 0:
        return []

    frames: list[pd.DataFrame] = []
    for t in table_list:
        df = t.df.copy()
        # First row is always the header in camelot's raw output
        df.columns = df.iloc[0].str.strip()
        df = df.iloc[1:].reset_index(drop=True)
        df.replace("", pd.NA, inplace=True)
        df["_accuracy"] = t.parsing_report.get("accuracy", None)  # audit column
        frames.append(df)
    return frames


if __name__ == "__main__":
    frames = extract_pages_camelot(PDF_PATH, pages="1-5", flavor="lattice")
    for i, df in enumerate(frames):
        print(f"Table {i + 1}: {df.shape}, accuracy={df['_accuracy'].iloc[0]}")

camelot's parsing_report["accuracy"] score runs from 0 to 100. Scores below 80 on lattice mode indicate that line detection missed borders — try increasing line_scale from the default 15 to 40 or 60, or switch to flavor="stream".

PDF to pandas DataFrame Pipeline Five-stage flow: PDF pages are fed into an extractor (pdfplumber or camelot), which produces per-page DataFrames, which are concatenated and cleaned into a single tidy DataFrame ready for analysis or export. PDF Pages page 1 page 2 page N Extractor pdfplumber or camelot Per-page DataFrames df_page_1 df_page_2 df_page_N Concat + Clean dedup headers Tidy DataFrame correct dtypes clean strings analysis-ready Each page extracted independently; per-page frames concatenated then dtype-normalized

Step 3: Concatenate Per-Page Tables and Drop Repeated Headers

PDFs that paginate a single logical table repeat the header row at the top of each page. Concatenating naively pollutes the DataFrame with header strings as data rows.

# pip install pandas
from pathlib import Path
import pandas as pd


def concat_drop_headers(frames: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Concatenate per-page DataFrames and remove repeated header rows.
    Assumes all frames share the same column set (or a superset).
    """
    if not frames:
        return pd.DataFrame()

    canonical_cols = frames[0].columns.tolist()

    cleaned: list[pd.DataFrame] = []
    for df in frames:
        # A row is a repeated header if every non-null cell matches its column name
        non_audit = [c for c in df.columns if not c.startswith("_")]
        header_mask = df[non_audit].apply(
            lambda row: [
                str(v).strip() == str(col).strip()
                for v, col in zip(row, non_audit)
            ],
            axis=1,
        ).apply(all)
        df = df[~header_mask].copy()
        cleaned.append(df)

    combined = pd.concat(cleaned, ignore_index=True)
    return combined


if __name__ == "__main__":
    # Example using pdfplumber frames from Step 2
    from pathlib import Path
    frames = extract_pages_pdfplumber(Path("data/report.pdf"))
    combined = concat_drop_headers(frames)
    print(combined.shape)
    print(combined.head())

For tables where the header row is not an exact string match — for example, the PDF adds a page number inside the header cell — apply a looser match: row.str.contains(col_keyword). The variant with ignore_index=True in pd.concat re-sequences the integer index across pages.

Step 4: Forward-Fill Merged Cells

PDF tables frequently use merged cells to avoid repeating a group label (e.g., a date that applies to several rows). After extraction each merged cell's value appears only in the first row; the rest are empty or NaN.

# pip install pandas
import pandas as pd


def forward_fill_groups(df: pd.DataFrame, group_cols: list[str]) -> pd.DataFrame:
    """
    Forward-fill a list of columns that represent merged-cell group labels.
    Only fills within a run of NaN values; stops at the next explicit value.
    """
    df = df.copy()
    for col in group_cols:
        if col in df.columns:
            # Replace empty strings with NaN first
            df[col] = df[col].replace("", pd.NA)
            df[col] = df[col].ffill()
    return df


if __name__ == "__main__":
    sample = pd.DataFrame({
        "Region":   ["North", pd.NA, pd.NA, "South", pd.NA],
        "Month":    ["Jan", "Feb", "Mar", "Jan", "Feb"],
        "Revenue":  [1000, 1200, 900, 1500, 1100],
    })
    print(forward_fill_groups(sample, group_cols=["Region"]))

Apply forward_fill_groups immediately after concat_drop_headers, before any dtype coercion.

Step 5: Normalize Dtypes — Numbers, Dates, Currency

Extracted PDF text is always object dtype. Numeric columns contain comma-formatted strings ("1,250.00"), currency symbols ("$1,250.00"), and percentage suffixes ("12.5%"). Date columns contain free-form strings.

# pip install pandas
import re
import pandas as pd


def clean_currency(series: pd.Series) -> pd.Series:
    """Strip currency symbols, commas, and parentheses (negatives) → float."""
    s = series.astype(str).str.strip()
    s = s.str.replace(r"[$€£¥,\s]", "", regex=True)   # remove symbols and commas
    s = s.str.replace(r"\((.+)\)", r"-\1", regex=True)  # (1250.00) → -1250.00
    s = s.str.replace(r"%$", "", regex=True)             # strip trailing %
    return pd.to_numeric(s, errors="coerce")


def coerce_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attempt automatic dtype normalization on all columns:
    - Columns with >70 % parseable numeric values → float
    - Columns with >70 % parseable dates → datetime
    """
    df = df.copy()
    for col in df.columns:
        if col.startswith("_"):
            continue  # skip audit columns

        series = df[col].copy()

        # Try numeric (including currency-formatted strings)
        numeric_candidate = clean_currency(series)
        if numeric_candidate.notna().mean() > 0.70:
            df[col] = numeric_candidate
            continue

        # Try datetime
        try:
            date_candidate = pd.to_datetime(series, infer_datetime_format=True, errors="coerce")
            if date_candidate.notna().mean() > 0.70:
                df[col] = date_candidate
                continue
        except Exception:
            pass

        # Otherwise strip leading/trailing whitespace and keep as string
        df[col] = series.astype(str).str.strip().replace("nan", pd.NA)

    return df


if __name__ == "__main__":
    sample = pd.DataFrame({
        "Date":    ["2026-01-15", "2026-02-01", "bad"],
        "Revenue": ["$1,250.00", "$900.50", "n/a"],
        "Units":   ["120", "85", "62"],
    })
    cleaned = coerce_dtypes(sample)
    print(cleaned.dtypes)
    print(cleaned)

The 70 % threshold means a column is classified as numeric/date even if a minority of cells contain free-form notes or error markers. Adjust the threshold upward for cleaner sources.

For a broader treatment of pandas string-cleaning patterns, see Cleaning Messy CSV Data with pandas — the same encoding-fix and whitespace-strip techniques apply to text extracted from PDFs.

Step 6: Validate Before Export

Row-count and dtype checks catch extraction failures early — before downstream code silently processes a truncated or mistyped DataFrame.

# pip install pandas
import pandas as pd


def validate_dataframe(
    df: pd.DataFrame,
    expected_cols: list[str],
    min_rows: int = 1,
    max_null_ratio: float = 0.30,
) -> None:
    """Raise or warn if the DataFrame does not meet basic quality expectations."""
    missing = [c for c in expected_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing expected columns: {missing}")

    if df.shape[0] < min_rows:
        raise ValueError(f"Too few rows: got {df.shape[0]}, expected >= {min_rows}")

    null_ratio = df[expected_cols].isnull().mean().mean()
    if null_ratio > max_null_ratio:
        print(
            f"Warning: {null_ratio:.1%} of values are null in expected columns "
            f"(threshold {max_null_ratio:.0%}). Check forward-fill and header dedup."
        )

    print(f"OK — shape={df.shape}, null_ratio={null_ratio:.1%}")
    print(df.dtypes.to_string())


if __name__ == "__main__":
    df = pd.DataFrame({
        "Date":    pd.to_datetime(["2026-01-15", "2026-02-01"]),
        "Revenue": [1250.0, 900.5],
        "Units":   [120, 85],
    })
    validate_dataframe(df, expected_cols=["Date", "Revenue", "Units"], min_rows=1)

Edge Cases & Variants

Variant 1: pdfplumber with text-based extraction (no table grid detected)

When page.extract_tables() returns nothing — the PDF has no detectable grid at all — fall back to page.extract_text() and parse it line by line:

# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd

PDF_PATH = Path("data/report.pdf")


def extract_text_lines(path: Path, separator: str = r"\s{2,}") -> pd.DataFrame:
    """
    Parse a whitespace-aligned PDF as fixed-width text rows.
    Works when no table grid is detectable.
    """
    import re
    rows: list[list[str]] = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                text = page.extract_text() or ""
                for line in text.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    cells = re.split(separator, line)
                    rows.append(cells)
    except Exception as e:
        raise RuntimeError(f"Text extraction failed: {e}") from e

    if not rows:
        return pd.DataFrame()

    # Use the first non-empty row as header if it looks like a header
    header = rows[0]
    return pd.DataFrame(rows[1:], columns=header)


if __name__ == "__main__":
    df = extract_text_lines(PDF_PATH)
    print(df.head())

Variant 2: camelot stream with explicit column coordinates

When flavor="stream" merges adjacent columns, provide explicit x-coordinates measured from the PDF viewer. Open the PDF, note the x-positions of column separators (in points, 72 per inch), and pass them directly:

# pip install "camelot-py[cv]" pandas
import camelot
from pathlib import Path

PDF_PATH = Path("data/report.pdf")

tables = camelot.read_pdf(
    str(PDF_PATH),
    pages="2",
    flavor="stream",
    columns=["120,240,360,480,600"],   # x-coordinates of column separators in pt
    edge_tol=500,
    row_tol=2,
)
df = tables[0].df
print(df)

Variant 3: Mixed-structure PDF — some pages lattice, some stream

# pip install pdfplumber "camelot-py[cv]" pandas
from pathlib import Path
import pdfplumber
import camelot
import pandas as pd

PDF_PATH = Path("data/mixed.pdf")


def extract_mixed(path: Path) -> list[pd.DataFrame]:
    """Route each page to the best extractor based on its own structure."""
    frames: list[pd.DataFrame] = []
    try:
        with pdfplumber.open(path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                has_lines = bool(page.lines or page.rects)
                if has_lines:
                    tbls = camelot.read_pdf(
                        str(path), pages=str(page_num), flavor="lattice"
                    )
                    for t in tbls:
                        df = t.df.copy()
                        df.columns = df.iloc[0].str.strip()
                        df = df.iloc[1:].reset_index(drop=True)
                        frames.append(df)
                else:
                    raw_tables = page.extract_tables() or []
                    for raw in raw_tables:
                        if raw and len(raw) > 1:
                            df = pd.DataFrame(raw[1:], columns=raw[0])
                            frames.append(df)
    except Exception as e:
        raise RuntimeError(f"Mixed extraction failed: {e}") from e
    return frames

Performance & Scale

Memory: pdfplumber loads the entire file into memory when opened. For PDFs above 100 MB, slice pages directly: pdf.pages[0:10] — Python slice notation works on pdfplumber's page list.

Concurrency: PDF parsing is CPU-bound. Use concurrent.futures.ProcessPoolExecutor to process multiple PDF files simultaneously; do not use threads.

# pip install pdfplumber pandas
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
import pandas as pd


def process_pdf(path: Path) -> pd.DataFrame:
    """Import inside function so each worker process gets clean imports."""
    import pdfplumber

    frames = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for raw in (page.extract_tables() or []):
                if raw and len(raw) > 1:
                    frames.append(pd.DataFrame(raw[1:], columns=raw[0]))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


def batch_extract(pdf_dir: Path) -> dict[str, pd.DataFrame]:
    paths = list(pdf_dir.glob("*.pdf"))
    results: dict[str, pd.DataFrame] = {}
    with ProcessPoolExecutor() as pool:
        futures = {pool.submit(process_pdf, p): p for p in paths}
        for fut in as_completed(futures):
            p = futures[fut]
            try:
                results[p.name] = fut.result()
            except Exception as e:
                print(f"Failed {p.name}: {e}")
    return results


if __name__ == "__main__":
    all_frames = batch_extract(Path("data/pdfs/"))
    for name, df in all_frames.items():
        print(f"{name}: {df.shape}")

Out-of-core: for datasets that exceed RAM, write each page's DataFrame directly to Parquet in append mode using pyarrow and read it back with pd.read_parquet() at analysis time. Parquet preserves dtypes and compresses numeric data efficiently.

For choosing between pdfplumber, camelot, and tabula before building a pipeline, see pdfplumber vs camelot vs tabula — it benchmarks accuracy, speed, and dependency weight on a common set of PDF types.

Troubleshooting

ErrorRoot causeFix
camelot.TableList.n == 0 on latticeNo vector lines detected on the target pagesSwitch to flavor="stream" or pdfplumber; verify with classify_pdf()
Header row appears mid-DataFrameRepeated header rows not removed before concatApply concat_drop_headers() from Step 3
Numeric column stays object dtypeValues contain currency symbols or commasApply clean_currency() from Step 5 before pd.to_numeric
ValueError: shape of values on concatPer-page tables have differing column countsInspect each frame's .columns; use pd.concat(..., sort=False) with uniform columns
Date column stays objectInconsistent date formats across pagesPass format=None and infer_datetime_format=True or use dateutil.parser.parse
OSError: ghostscript not foundGhostscript binary not on PATHsudo apt-get install ghostscript; lattice mode requires it

Complete Script

#!/usr/bin/env python3
"""
pdf_to_pandas.py — Extract PDF tables into a clean pandas DataFrame.

Usage:
    python pdf_to_pandas.py report.pdf --pages 1-end --output output/result.csv
    python pdf_to_pandas.py report.pdf --flavor stream --group-cols "Region,Category"

pip install pdfplumber "camelot-py[cv]" pandas
"""
import argparse
import re
import sys
from pathlib import Path

import pandas as pd
import pdfplumber


def classify(path: Path) -> str:
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        has_text = bool((page.extract_text() or "").strip())
        has_lines = bool(page.lines or page.rects)
    if not has_text and not has_lines:
        return "ocr"
    return "lattice" if has_lines else "stream"


def extract_pdfplumber(path: Path) -> list[pd.DataFrame]:
    frames = []
    with pdfplumber.open(path) as pdf:
        for pnum, page in enumerate(pdf.pages, start=1):
            for raw in (page.extract_tables() or []):
                if raw and len(raw) > 1:
                    header = [str(c).strip() if c else f"col_{i}" for i, c in enumerate(raw[0])]
                    df = pd.DataFrame(raw[1:], columns=header)
                    df["_page"] = pnum
                    frames.append(df)
    return frames


def extract_camelot(path: Path, pages: str, flavor: str) -> list[pd.DataFrame]:
    import camelot as cam
    tbl = cam.read_pdf(str(path), pages=pages, flavor=flavor,
                       process_background=(flavor == "lattice"))
    frames = []
    for t in tbl:
        df = t.df.copy()
        df.columns = df.iloc[0].str.strip()
        df = df.iloc[1:].reset_index(drop=True)
        df.replace("", pd.NA, inplace=True)
        frames.append(df)
    return frames


def dedup_headers(frames: list[pd.DataFrame]) -> pd.DataFrame:
    if not frames:
        return pd.DataFrame()
    cleaned = []
    for df in frames:
        non_a = [c for c in df.columns if not c.startswith("_")]
        mask = df[non_a].apply(
            lambda r: all(str(v).strip() == str(c).strip() for v, c in zip(r, non_a)), axis=1
        )
        cleaned.append(df[~mask])
    return pd.concat(cleaned, ignore_index=True)


def clean_currency(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.str.replace(r"[$€£¥,\s]", "", regex=True)
    s = s.str.replace(r"\((.+)\)", r"-\1", regex=True)
    s = s.str.replace(r"%$", "", regex=True)
    return pd.to_numeric(s, errors="coerce")


def coerce_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in df.columns:
        if col.startswith("_"):
            continue
        num = clean_currency(df[col])
        if num.notna().mean() > 0.70:
            df[col] = num
            continue
        try:
            dt = pd.to_datetime(df[col], infer_datetime_format=True, errors="coerce")
            if dt.notna().mean() > 0.70:
                df[col] = dt
                continue
        except Exception:
            pass
        df[col] = df[col].astype(str).str.strip().replace("nan", pd.NA)
    return df


def main() -> None:
    ap = argparse.ArgumentParser(description="Extract PDF tables into pandas")
    ap.add_argument("pdf", type=Path)
    ap.add_argument("--pages", default="1-end")
    ap.add_argument("--flavor", choices=["auto", "lattice", "stream", "pdfplumber"],
                    default="auto")
    ap.add_argument("--group-cols", default="",
                    help="Comma-separated column names to forward-fill (merged cells)")
    ap.add_argument("--output", type=Path, default=Path("output/result.csv"))
    args = ap.parse_args()

    if not args.pdf.exists():
        sys.exit(f"File not found: {args.pdf}")

    flavor = classify(args.pdf) if args.flavor == "auto" else args.flavor
    print(f"Extraction mode: {flavor}")

    if flavor == "ocr":
        sys.exit("Scanned PDF — use the OCR pipeline (see how-to-extract-tables-from-scanned-pdfs/)")

    try:
        frames = (
            extract_pdfplumber(args.pdf)
            if flavor == "pdfplumber"
            else extract_camelot(args.pdf, args.pages, flavor)
        )
    except Exception as e:
        sys.exit(f"Extraction failed: {e}")

    if not frames:
        sys.exit("No tables found.")

    combined = dedup_headers(frames)

    group_cols = [c.strip() for c in args.group_cols.split(",") if c.strip()]
    for col in group_cols:
        if col in combined.columns:
            combined[col] = combined[col].replace("", pd.NA).ffill()

    combined = coerce_dtypes(combined)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    combined.to_csv(args.output, index=False)
    print(f"Saved {combined.shape[0]} rows × {combined.shape[1]} cols → {args.output}")
    print(combined.dtypes.to_string())


if __name__ == "__main__":
    main()

Part of Automating Document & Data Pipelines.

Explore next