Extracting Tables from PDFs

PDFs store table data in three fundamentally different ways — bordered grids with vector lines, whitespace-aligned columns, and rasterized images — and each requires a different extraction path. Generic text parsers collapse all three into a single misaligned blob. This guide implements a decision-driven pipeline: diagnose the PDF structure first, then route to pdfplumber for whitespace tables, camelot lattice for bordered grids, or the OCR pipeline for scanned documents.

Prerequisites

Install system and Python dependencies before any extraction attempt:

# System dependencies (Ubuntu/Debian)
sudo apt-get install ghostscript libsm6 libxext6

# Python packages
pip install pdfplumber "camelot-py[cv]" pandas pdf2image pytesseract

# Optional: verify camelot system deps are present
python -c "import camelot; print(camelot.__version__)"

If the camelot import raises OSError: ghostscript not found or a cv2 import error, see Fix Camelot Import Error on Linux before continuing.

Place a representative sample PDF at data/input.pdf to follow along. A one-page PDF with a visible-border invoice table works well for lattice mode; a financial statement with whitespace-separated columns suits stream mode.

Step 1: Diagnostic — Classify the PDF Before Choosing a Parser

Run this snippet on any unknown PDF before touching camelot or pdfplumber. It detects text layer presence and line density to determine which extraction path applies.

# pip install pdfplumber
from pathlib import Path
import pdfplumber

PDF_PATH = Path("data/input.pdf")

def classify_pdf(path: Path) -> dict:
    """Return page-level metadata to choose the right extraction strategy."""
    result = {"pages": [], "has_text": False, "has_lines": False, "likely_scanned": False}
    try:
        with pdfplumber.open(path) as pdf:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                lines = page.lines or []
                rects = page.rects or []
                page_info = {
                    "page": i + 1,
                    "text_chars": len(text.strip()),
                    "vector_lines": len(lines),
                    "vector_rects": len(rects),
                }
                result["pages"].append(page_info)
                if text.strip():
                    result["has_text"] = True
                if lines or rects:
                    result["has_lines"] = True
    except Exception as e:
        raise RuntimeError(f"Could not open {path}: {e}") from e

    # Heuristic: no text and no lines → rasterized scan
    result["likely_scanned"] = not result["has_text"] and not result["has_lines"]
    return result

if __name__ == "__main__":
    info = classify_pdf(PDF_PATH)
    for p in info["pages"]:
        print(p)
    print(f"Recommended path: {'OCR' if info['likely_scanned'] else 'lattice' if info['has_lines'] else 'stream'}")

Reading the output:

ConditionRecommended parser
has_lines=True, has_text=Truecamelot lattice
has_text=True, has_lines=Falsecamelot stream or pdfplumber
has_text=False, has_lines=FalseOCR pipeline (pdf2image + Tesseract)
PDF Table Extraction Decision Tree A decision tree showing how to route a PDF to the correct extraction method: bordered grid to camelot lattice, whitespace columns to stream mode, and scanned image to OCR. Input PDF run classify_pdf() Vector lines present? YES camelot lattice NO Text layer present? YES camelot stream NO pdf2image + Tesseract OCR Scanned pages → OCR pipeline; vector lines → lattice; whitespace columns → stream

Step 2: Extract Bordered Tables with camelot Lattice

Lattice mode traces the physical grid lines in the PDF. Use it whenever a table has visible borders — invoices, financial statements, structured reports.

# pip install "camelot-py[cv]" pandas
from pathlib import Path
import camelot
import pandas as pd

PDF_PATH = Path("data/financial_statement.pdf")
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

def extract_lattice(path: Path, pages: str = "1-end") -> list[pd.DataFrame]:
    """Extract bordered tables using camelot lattice mode."""
    try:
        tables = camelot.read_pdf(
            str(path),
            pages=pages,
            flavor="lattice",
            process_background=True,   # detect lines on coloured backgrounds
            line_scale=40,             # raise to 50-60 for thin/faint lines
        )
    except Exception as e:
        raise RuntimeError(f"camelot lattice failed on {path}: {e}") from e

    if tables.n == 0:
        raise ValueError(f"No tables found in {path} — check if borders are truly vector lines")

    dfs = []
    for t in tables:
        df = t.df.copy()
        df.replace("", pd.NA, inplace=True)
        df.dropna(how="all", inplace=True)
        df.dropna(axis=1, how="all", inplace=True)
        dfs.append(df)

    return dfs

if __name__ == "__main__":
    results = extract_lattice(PDF_PATH, pages="1-3")
    for i, df in enumerate(results):
        out = OUTPUT_DIR / f"table_{i+1}.csv"
        df.to_csv(out, index=False)
        print(f"Table {i+1}: {df.shape[0]} rows × {df.shape[1]} cols → {out}")

Accuracy score: camelot exposes t.parsing_report["accuracy"] (0–100). Scores below 80 indicate line detection problems — try adjusting line_scale or switching to stream mode.

Step 3: Extract Whitespace Tables with camelot Stream

Stream mode infers column boundaries from the whitespace gaps between text runs. Use it for financial PDFs and reports that use fixed-width fonts and spaced columns instead of drawn borders.

# pip install "camelot-py[cv]" pandas
from pathlib import Path
import camelot
import pandas as pd

PDF_PATH = Path("data/annual_report.pdf")

def extract_stream(path: Path, pages: str = "1", col_sep_width: int = 10) -> list[pd.DataFrame]:
    """Extract whitespace-delimited tables using camelot stream mode."""
    try:
        tables = camelot.read_pdf(
            str(path),
            pages=pages,
            flavor="stream",
            # edge_tol controls how close text must be to a column edge
            edge_tol=500,
            # row_tol: vertical tolerance for grouping text into the same row
            row_tol=2,
        )
    except Exception as e:
        raise RuntimeError(f"camelot stream failed: {e}") from e

    dfs = []
    for t in tables:
        df = t.df.copy()
        # First row is usually the header; promote it
        df.columns = df.iloc[0].str.strip()
        df = df.iloc[1:].reset_index(drop=True)
        df.replace("", pd.NA, inplace=True)
        dfs.append(df)
    return dfs

if __name__ == "__main__":
    results = extract_stream(PDF_PATH, pages="2-5")
    for i, df in enumerate(results):
        print(f"Table {i+1}: {df.shape}")
        print(df.head(3), "\n")

When stream fails: if columns merge, open the PDF in a viewer, note the x-coordinates of column separators, and pass them explicitly: camelot.read_pdf(..., columns=["72,144,288,432"]).

Step 4: pdfplumber Fallback for Sparse or Irregular Grids

pdfplumber works well when camelot finds no tables — particularly for loosely-bordered tables or when the PDF is generated by tools that draw border-like rectangles rather than actual PDF line objects.

# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd

PDF_PATH = Path("data/report.pdf")

def extract_with_pdfplumber(path: Path) -> list[pd.DataFrame]:
    """Extract tables page-by-page using pdfplumber's built-in table finder."""
    dfs = []
    try:
        with pdfplumber.open(path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                raw_tables = page.extract_tables(table_settings={
                    "vertical_strategy": "lines_strict",
                    "horizontal_strategy": "lines_strict",
                    "snap_tolerance": 3,
                })
                for raw in raw_tables:
                    if not raw or len(raw) < 2:
                        continue
                    # First row as header
                    header = [str(c).strip() if c else f"col_{i}" for i, c in enumerate(raw[0])]
                    rows = raw[1:]
                    df = pd.DataFrame(rows, columns=header)
                    df.replace("", pd.NA, inplace=True)
                    dfs.append(df)
    except Exception as e:
        raise RuntimeError(f"pdfplumber extraction failed: {e}") from e
    return dfs

if __name__ == "__main__":
    tables = extract_with_pdfplumber(PDF_PATH)
    print(f"Found {len(tables)} table(s)")
    for df in tables:
        print(df.head())

If columns are still misaligned after pdfplumber extraction, the root cause is usually coordinate drift in multi-column layouts — see Fix PDF Text Extraction Alignment Issues for the extract_words + coordinate-sorting approach.

Step 5: Multi-Page Header Deduplication

Paginated PDFs repeat the header row at each page break. Concatenating tables naively produces a DataFrame with hundreds of mid-data header rows.

# pip install pandas
from pathlib import Path
import pandas as pd

def dedup_and_concat(tables: list[pd.DataFrame]) -> pd.DataFrame:
    """Merge tables from multiple pages, removing repeated header rows."""
    if not tables:
        return pd.DataFrame()

    # Use first row of first table as the canonical header
    first = tables[0]
    if first.iloc[0].tolist() == first.columns.tolist():
        # DataFrame was built with header already promoted; skip first data row check
        canonical_header = first.columns.tolist()
    else:
        canonical_header = first.iloc[0].tolist()
        first = first.iloc[1:].copy()
        first.columns = canonical_header

    cleaned = [first]
    for df in tables[1:]:
        # Detect and drop repeated header rows by comparing to canonical
        if df.shape[1] == len(canonical_header):
            mask = df.apply(lambda row: row.tolist() == canonical_header, axis=1)
            df = df[~mask].copy()
        df.columns = canonical_header
        cleaned.append(df)

    combined = pd.concat(cleaned, ignore_index=True)

    # Forward-fill merged/spanned cells (common in PDF tables)
    combined = combined.ffill()

    # Coerce numeric columns
    for col in combined.columns:
        numeric = pd.to_numeric(combined[col], errors="coerce")
        if numeric.notna().mean() > 0.8:   # >80% parseable → treat as numeric
            combined[col] = numeric

    return combined

if __name__ == "__main__":
    sample = [
        pd.DataFrame([["ID", "Amount", "Date"], ["1", "500.00", "2026-01-01"]]),
        pd.DataFrame([["ID", "Amount", "Date"], ["2", "750.50", "2026-01-15"]]),
    ]
    result = dedup_and_concat(sample)
    print(result)
    result.to_csv("output/merged_tables.csv", index=False)

Edge Cases

Tables Spanning Two Pages with No Repeat Header

Some PDF generators split a table mid-row at a page boundary, producing a bottom fragment and a top fragment on successive pages.

# pip install camelot-py[cv] pandas
from pathlib import Path
import camelot, pandas as pd

def stitch_split_table(path: Path, pages: str = "3,4") -> pd.DataFrame:
    """Join tables split across adjacent pages when no header repeats."""
    tables = camelot.read_pdf(str(path), pages=pages, flavor="lattice")
    dfs = [t.df for t in tables]
    if not dfs:
        return pd.DataFrame()
    header = dfs[0].iloc[0].tolist()
    frames = []
    for df in dfs:
        if df.iloc[0].tolist() == header:
            df = df.iloc[1:]
        df.columns = header
        frames.append(df)
    return pd.concat(frames, ignore_index=True)

Rotated Tables (Landscape Pages)

pdfplumber respects page rotation automatically. For camelot, crop the page manually using camelot.read_pdf(..., layout_kwargs={"char_margin": 2.0}). If rotation is inconsistent, pre-rotate with PyMuPDF before extraction.

Tables with Background Colour Fills

camelot's process_background=True flag handles most cases. If lines are still missed, lower the threshold: add copy_text=["h", "v"] to camelot's settings to extract text from cells with coloured fills.

Validation

Always verify shape and dtypes before writing downstream outputs:

# pip install pandas
import pandas as pd

def validate_table(df: pd.DataFrame, expected_cols: int, min_rows: int = 1) -> None:
    """Assert basic structural integrity of an extracted table."""
    assert df.shape[1] == expected_cols, (
        f"Column count mismatch: expected {expected_cols}, got {df.shape[1]}. "
        "Check for header dedup or coordinate drift."
    )
    assert df.shape[0] >= min_rows, (
        f"Too few rows: expected >= {min_rows}, got {df.shape[0]}."
    )
    null_ratio = df.isnull().mean().mean()
    if null_ratio > 0.3:
        print(f"Warning: {null_ratio:.0%} of cells are null — check forward-fill or merged cells.")

    numeric_cols = df.select_dtypes(include="number").columns.tolist()
    print(f"Shape: {df.shape}, Numeric cols: {numeric_cols}, Null ratio: {null_ratio:.1%}")

if __name__ == "__main__":
    df = pd.read_csv("output/merged_tables.csv")
    validate_table(df, expected_cols=3, min_rows=5)

Run this after every extraction and compare against a manually counted row total from the source PDF.

Performance & Scale

Memory limits: pdfplumber loads the entire PDF into memory. For files above ~100 MB, open only specific page ranges: pdf.pages[start:end] to avoid OOM errors.

Batch processing: use concurrent.futures.ProcessPoolExecutor — PDF parsing is CPU-bound, so threading gains nothing.

from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
import pandas as pd

def process_one(path: Path) -> pd.DataFrame:
    # Import inside function so each worker gets its own import state
    import pdfplumber
    dfs = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            raw = page.extract_tables()
            for t in (raw or []):
                if t and len(t) > 1:
                    dfs.append(pd.DataFrame(t[1:], columns=t[0]))
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

def batch_extract(pdf_dir: Path) -> dict[Path, pd.DataFrame]:
    paths = list(pdf_dir.glob("*.pdf"))
    results = {}
    with ProcessPoolExecutor() as pool:
        futures = {pool.submit(process_one, p): p for p in paths}
        for fut in as_completed(futures):
            p = futures[fut]
            try:
                results[p] = fut.result()
            except Exception as e:
                print(f"Failed {p}: {e}")
    return results

For very large PDFs (500+ pages), consider splitting them first — see Merging and Splitting PDF Documents for pypdf-based splitting.

Out-of-core alternative: pipe camelot output directly to pandas.DataFrame.to_parquet() in append mode using pyarrow for datasets that exceed RAM.

Troubleshooting

ErrorRoot causeFix
OSError: ghostscript not foundcamelot's lattice mode requires the Ghostscript system binarysudo apt-get install ghostscript; see Fix Camelot Import Error on Linux
camelot.TableList returns 0 tablesNo vector lines detected on target pagesSwitch to flavor="stream" or use pdfplumber
Columns merge or shift mid-tableCoordinate drift across pages; y-tolerance mismatchUse camelot's copy_text parameter or pdfplumber with snap_tolerance=3
ValueError: could not convert string on numeric colsMerged cells contain concatenated values (e.g., "12,450.001,200.50")Apply extract_words-based reconstruction; see Fix PDF Text Extraction Alignment Issues
Empty DataFrame from scanned PDFNo text layer presentRoute to OCR pipeline via How to Extract Tables from Scanned PDFs

Complete Script

#!/usr/bin/env python3
"""
extract_tables.py — Extract tables from a PDF using the best available method.

Usage:
    python extract_tables.py input.pdf --pages 1-5 --flavor auto --output output/
    python extract_tables.py input.pdf --flavor stream --output output/

pip install pdfplumber "camelot-py[cv]" pandas pdf2image pytesseract
"""
import argparse
import sys
from pathlib import Path
import pdfplumber
import camelot
import pandas as pd


def classify(path: Path) -> str:
    """Return 'lattice', 'stream', or 'ocr' based on PDF content."""
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        has_text = bool((page.extract_text() or "").strip())
        has_lines = bool(page.lines or page.rects)
    if not has_text and not has_lines:
        return "ocr"
    return "lattice" if has_lines else "stream"


def extract(path: Path, pages: str, flavor: str) -> list[pd.DataFrame]:
    if flavor == "ocr":
        raise SystemExit(
            "OCR pipeline required — run: python -m ocr_extract "
            "(see how-to-extract-tables-from-scanned-pdfs/)"
        )
    if flavor in ("lattice", "stream"):
        tables = camelot.read_pdf(str(path), pages=pages, flavor=flavor,
                                  process_background=(flavor == "lattice"))
        return [t.df for t in tables]
    # pdfplumber fallback
    dfs = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for raw in (page.extract_tables() or []):
                if raw and len(raw) > 1:
                    dfs.append(pd.DataFrame(raw[1:], columns=raw[0]))
    return dfs


def main() -> None:
    parser = argparse.ArgumentParser(description="Extract tables from a PDF")
    parser.add_argument("pdf", type=Path, help="Path to input PDF")
    parser.add_argument("--pages", default="1", help="Pages to parse, e.g. 1-5 or 1,3,5")
    parser.add_argument(
        "--flavor", choices=["auto", "lattice", "stream", "pdfplumber"],
        default="auto", help="Extraction method"
    )
    parser.add_argument("--output", type=Path, default=Path("output"),
                        help="Output directory for CSV files")
    args = parser.parse_args()

    if not args.pdf.exists():
        sys.exit(f"File not found: {args.pdf}")

    args.output.mkdir(parents=True, exist_ok=True)

    flavor = classify(args.pdf) if args.flavor == "auto" else args.flavor
    print(f"Using flavor: {flavor}")

    try:
        tables = extract(args.pdf, args.pages, flavor)
    except Exception as e:
        sys.exit(f"Extraction failed: {e}")

    if not tables:
        sys.exit("No tables found. Try --flavor stream or --flavor pdfplumber.")

    for i, df in enumerate(tables, start=1):
        df.replace("", pd.NA, inplace=True)
        df.dropna(how="all", inplace=True)
        out = args.output / f"table_{i:03d}.csv"
        df.to_csv(out, index=False)
        print(f"  [{i}] {df.shape[0]} rows × {df.shape[1]} cols → {out}")

    print(f"Done. {len(tables)} table(s) exported to {args.output}/")


if __name__ == "__main__":
    main()

Part of Automating PDF Extraction & Generation.

Explore next

/html>