Extracting Tables from PDFs
PDFs store table data in three fundamentally different ways — bordered grids with vector lines, whitespace-aligned columns, and rasterized images — and each requires a different extraction path. Generic text parsers collapse all three into a single misaligned blob. This guide implements a decision-driven pipeline: diagnose the PDF structure first, then route to pdfplumber for whitespace tables, camelot lattice for bordered grids, or the OCR pipeline for scanned documents.
Prerequisites
Install system and Python dependencies before any extraction attempt:
# System dependencies (Ubuntu/Debian)
sudo apt-get install ghostscript libsm6 libxext6
# Python packages
pip install pdfplumber "camelot-py[cv]" pandas pdf2image pytesseract
# Optional: verify camelot system deps are present
python -c "import camelot; print(camelot.__version__)"
If the camelot import raises OSError: ghostscript not found or a cv2 import error, see Fix Camelot Import Error on Linux before continuing.
Place a representative sample PDF at data/input.pdf to follow along. A one-page PDF with a visible-border invoice table works well for lattice mode; a financial statement with whitespace-separated columns suits stream mode.
Step 1: Diagnostic — Classify the PDF Before Choosing a Parser
Run this snippet on any unknown PDF before touching camelot or pdfplumber. It detects text layer presence and line density to determine which extraction path applies.
# pip install pdfplumber
from pathlib import Path
import pdfplumber
PDF_PATH = Path("data/input.pdf")
def classify_pdf(path: Path) -> dict:
"""Return page-level metadata to choose the right extraction strategy."""
result = {"pages": [], "has_text": False, "has_lines": False, "likely_scanned": False}
try:
with pdfplumber.open(path) as pdf:
for i, page in enumerate(pdf.pages):
text = page.extract_text() or ""
lines = page.lines or []
rects = page.rects or []
page_info = {
"page": i + 1,
"text_chars": len(text.strip()),
"vector_lines": len(lines),
"vector_rects": len(rects),
}
result["pages"].append(page_info)
if text.strip():
result["has_text"] = True
if lines or rects:
result["has_lines"] = True
except Exception as e:
raise RuntimeError(f"Could not open {path}: {e}") from e
# Heuristic: no text and no lines → rasterized scan
result["likely_scanned"] = not result["has_text"] and not result["has_lines"]
return result
if __name__ == "__main__":
info = classify_pdf(PDF_PATH)
for p in info["pages"]:
print(p)
print(f"Recommended path: {'OCR' if info['likely_scanned'] else 'lattice' if info['has_lines'] else 'stream'}")
Reading the output:
| Condition | Recommended parser |
|---|---|
has_lines=True, has_text=True | camelot lattice |
has_text=True, has_lines=False | camelot stream or pdfplumber |
has_text=False, has_lines=False | OCR pipeline (pdf2image + Tesseract) |
Step 2: Extract Bordered Tables with camelot Lattice
Lattice mode traces the physical grid lines in the PDF. Use it whenever a table has visible borders — invoices, financial statements, structured reports.
# pip install "camelot-py[cv]" pandas
from pathlib import Path
import camelot
import pandas as pd
PDF_PATH = Path("data/financial_statement.pdf")
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)
def extract_lattice(path: Path, pages: str = "1-end") -> list[pd.DataFrame]:
"""Extract bordered tables using camelot lattice mode."""
try:
tables = camelot.read_pdf(
str(path),
pages=pages,
flavor="lattice",
process_background=True, # detect lines on coloured backgrounds
line_scale=40, # raise to 50-60 for thin/faint lines
)
except Exception as e:
raise RuntimeError(f"camelot lattice failed on {path}: {e}") from e
if tables.n == 0:
raise ValueError(f"No tables found in {path} — check if borders are truly vector lines")
dfs = []
for t in tables:
df = t.df.copy()
df.replace("", pd.NA, inplace=True)
df.dropna(how="all", inplace=True)
df.dropna(axis=1, how="all", inplace=True)
dfs.append(df)
return dfs
if __name__ == "__main__":
results = extract_lattice(PDF_PATH, pages="1-3")
for i, df in enumerate(results):
out = OUTPUT_DIR / f"table_{i+1}.csv"
df.to_csv(out, index=False)
print(f"Table {i+1}: {df.shape[0]} rows × {df.shape[1]} cols → {out}")
Accuracy score: camelot exposes t.parsing_report["accuracy"] (0–100). Scores below 80 indicate line detection problems — try adjusting line_scale or switching to stream mode.
Step 3: Extract Whitespace Tables with camelot Stream
Stream mode infers column boundaries from the whitespace gaps between text runs. Use it for financial PDFs and reports that use fixed-width fonts and spaced columns instead of drawn borders.
# pip install "camelot-py[cv]" pandas
from pathlib import Path
import camelot
import pandas as pd
PDF_PATH = Path("data/annual_report.pdf")
def extract_stream(path: Path, pages: str = "1", col_sep_width: int = 10) -> list[pd.DataFrame]:
"""Extract whitespace-delimited tables using camelot stream mode."""
try:
tables = camelot.read_pdf(
str(path),
pages=pages,
flavor="stream",
# edge_tol controls how close text must be to a column edge
edge_tol=500,
# row_tol: vertical tolerance for grouping text into the same row
row_tol=2,
)
except Exception as e:
raise RuntimeError(f"camelot stream failed: {e}") from e
dfs = []
for t in tables:
df = t.df.copy()
# First row is usually the header; promote it
df.columns = df.iloc[0].str.strip()
df = df.iloc[1:].reset_index(drop=True)
df.replace("", pd.NA, inplace=True)
dfs.append(df)
return dfs
if __name__ == "__main__":
results = extract_stream(PDF_PATH, pages="2-5")
for i, df in enumerate(results):
print(f"Table {i+1}: {df.shape}")
print(df.head(3), "\n")
When stream fails: if columns merge, open the PDF in a viewer, note the x-coordinates of column separators, and pass them explicitly: camelot.read_pdf(..., columns=["72,144,288,432"]).
Step 4: pdfplumber Fallback for Sparse or Irregular Grids
pdfplumber works well when camelot finds no tables — particularly for loosely-bordered tables or when the PDF is generated by tools that draw border-like rectangles rather than actual PDF line objects.
# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd
PDF_PATH = Path("data/report.pdf")
def extract_with_pdfplumber(path: Path) -> list[pd.DataFrame]:
"""Extract tables page-by-page using pdfplumber's built-in table finder."""
dfs = []
try:
with pdfplumber.open(path) as pdf:
for page_num, page in enumerate(pdf.pages, start=1):
raw_tables = page.extract_tables(table_settings={
"vertical_strategy": "lines_strict",
"horizontal_strategy": "lines_strict",
"snap_tolerance": 3,
})
for raw in raw_tables:
if not raw or len(raw) < 2:
continue
# First row as header
header = [str(c).strip() if c else f"col_{i}" for i, c in enumerate(raw[0])]
rows = raw[1:]
df = pd.DataFrame(rows, columns=header)
df.replace("", pd.NA, inplace=True)
dfs.append(df)
except Exception as e:
raise RuntimeError(f"pdfplumber extraction failed: {e}") from e
return dfs
if __name__ == "__main__":
tables = extract_with_pdfplumber(PDF_PATH)
print(f"Found {len(tables)} table(s)")
for df in tables:
print(df.head())
If columns are still misaligned after pdfplumber extraction, the root cause is usually coordinate drift in multi-column layouts — see Fix PDF Text Extraction Alignment Issues for the extract_words + coordinate-sorting approach.
Step 5: Multi-Page Header Deduplication
Paginated PDFs repeat the header row at each page break. Concatenating tables naively produces a DataFrame with hundreds of mid-data header rows.
# pip install pandas
from pathlib import Path
import pandas as pd
def dedup_and_concat(tables: list[pd.DataFrame]) -> pd.DataFrame:
"""Merge tables from multiple pages, removing repeated header rows."""
if not tables:
return pd.DataFrame()
# Use first row of first table as the canonical header
first = tables[0]
if first.iloc[0].tolist() == first.columns.tolist():
# DataFrame was built with header already promoted; skip first data row check
canonical_header = first.columns.tolist()
else:
canonical_header = first.iloc[0].tolist()
first = first.iloc[1:].copy()
first.columns = canonical_header
cleaned = [first]
for df in tables[1:]:
# Detect and drop repeated header rows by comparing to canonical
if df.shape[1] == len(canonical_header):
mask = df.apply(lambda row: row.tolist() == canonical_header, axis=1)
df = df[~mask].copy()
df.columns = canonical_header
cleaned.append(df)
combined = pd.concat(cleaned, ignore_index=True)
# Forward-fill merged/spanned cells (common in PDF tables)
combined = combined.ffill()
# Coerce numeric columns
for col in combined.columns:
numeric = pd.to_numeric(combined[col], errors="coerce")
if numeric.notna().mean() > 0.8: # >80% parseable → treat as numeric
combined[col] = numeric
return combined
if __name__ == "__main__":
sample = [
pd.DataFrame([["ID", "Amount", "Date"], ["1", "500.00", "2026-01-01"]]),
pd.DataFrame([["ID", "Amount", "Date"], ["2", "750.50", "2026-01-15"]]),
]
result = dedup_and_concat(sample)
print(result)
result.to_csv("output/merged_tables.csv", index=False)
Edge Cases
Tables Spanning Two Pages with No Repeat Header
Some PDF generators split a table mid-row at a page boundary, producing a bottom fragment and a top fragment on successive pages.
# pip install camelot-py[cv] pandas
from pathlib import Path
import camelot, pandas as pd
def stitch_split_table(path: Path, pages: str = "3,4") -> pd.DataFrame:
"""Join tables split across adjacent pages when no header repeats."""
tables = camelot.read_pdf(str(path), pages=pages, flavor="lattice")
dfs = [t.df for t in tables]
if not dfs:
return pd.DataFrame()
header = dfs[0].iloc[0].tolist()
frames = []
for df in dfs:
if df.iloc[0].tolist() == header:
df = df.iloc[1:]
df.columns = header
frames.append(df)
return pd.concat(frames, ignore_index=True)
Rotated Tables (Landscape Pages)
pdfplumber respects page rotation automatically. For camelot, crop the page manually using camelot.read_pdf(..., layout_kwargs={"char_margin": 2.0}). If rotation is inconsistent, pre-rotate with PyMuPDF before extraction.
Tables with Background Colour Fills
camelot's process_background=True flag handles most cases. If lines are still missed, lower the threshold: add copy_text=["h", "v"] to camelot's settings to extract text from cells with coloured fills.
Validation
Always verify shape and dtypes before writing downstream outputs:
# pip install pandas
import pandas as pd
def validate_table(df: pd.DataFrame, expected_cols: int, min_rows: int = 1) -> None:
"""Assert basic structural integrity of an extracted table."""
assert df.shape[1] == expected_cols, (
f"Column count mismatch: expected {expected_cols}, got {df.shape[1]}. "
"Check for header dedup or coordinate drift."
)
assert df.shape[0] >= min_rows, (
f"Too few rows: expected >= {min_rows}, got {df.shape[0]}."
)
null_ratio = df.isnull().mean().mean()
if null_ratio > 0.3:
print(f"Warning: {null_ratio:.0%} of cells are null — check forward-fill or merged cells.")
numeric_cols = df.select_dtypes(include="number").columns.tolist()
print(f"Shape: {df.shape}, Numeric cols: {numeric_cols}, Null ratio: {null_ratio:.1%}")
if __name__ == "__main__":
df = pd.read_csv("output/merged_tables.csv")
validate_table(df, expected_cols=3, min_rows=5)
Run this after every extraction and compare against a manually counted row total from the source PDF.
Performance & Scale
Memory limits: pdfplumber loads the entire PDF into memory. For files above ~100 MB, open only specific page ranges: pdf.pages[start:end] to avoid OOM errors.
Batch processing: use concurrent.futures.ProcessPoolExecutor — PDF parsing is CPU-bound, so threading gains nothing.
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
import pandas as pd
def process_one(path: Path) -> pd.DataFrame:
# Import inside function so each worker gets its own import state
import pdfplumber
dfs = []
with pdfplumber.open(path) as pdf:
for page in pdf.pages:
raw = page.extract_tables()
for t in (raw or []):
if t and len(t) > 1:
dfs.append(pd.DataFrame(t[1:], columns=t[0]))
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
def batch_extract(pdf_dir: Path) -> dict[Path, pd.DataFrame]:
paths = list(pdf_dir.glob("*.pdf"))
results = {}
with ProcessPoolExecutor() as pool:
futures = {pool.submit(process_one, p): p for p in paths}
for fut in as_completed(futures):
p = futures[fut]
try:
results[p] = fut.result()
except Exception as e:
print(f"Failed {p}: {e}")
return results
For very large PDFs (500+ pages), consider splitting them first — see Merging and Splitting PDF Documents for pypdf-based splitting.
Out-of-core alternative: pipe camelot output directly to pandas.DataFrame.to_parquet() in append mode using pyarrow for datasets that exceed RAM.
Troubleshooting
| Error | Root cause | Fix |
|---|---|---|
OSError: ghostscript not found | camelot's lattice mode requires the Ghostscript system binary | sudo apt-get install ghostscript; see Fix Camelot Import Error on Linux |
camelot.TableList returns 0 tables | No vector lines detected on target pages | Switch to flavor="stream" or use pdfplumber |
| Columns merge or shift mid-table | Coordinate drift across pages; y-tolerance mismatch | Use camelot's copy_text parameter or pdfplumber with snap_tolerance=3 |
ValueError: could not convert string on numeric cols | Merged cells contain concatenated values (e.g., "12,450.001,200.50") | Apply extract_words-based reconstruction; see Fix PDF Text Extraction Alignment Issues |
| Empty DataFrame from scanned PDF | No text layer present | Route to OCR pipeline via How to Extract Tables from Scanned PDFs |
Complete Script
#!/usr/bin/env python3
"""
extract_tables.py — Extract tables from a PDF using the best available method.
Usage:
python extract_tables.py input.pdf --pages 1-5 --flavor auto --output output/
python extract_tables.py input.pdf --flavor stream --output output/
pip install pdfplumber "camelot-py[cv]" pandas pdf2image pytesseract
"""
import argparse
import sys
from pathlib import Path
import pdfplumber
import camelot
import pandas as pd
def classify(path: Path) -> str:
"""Return 'lattice', 'stream', or 'ocr' based on PDF content."""
with pdfplumber.open(path) as pdf:
page = pdf.pages[0]
has_text = bool((page.extract_text() or "").strip())
has_lines = bool(page.lines or page.rects)
if not has_text and not has_lines:
return "ocr"
return "lattice" if has_lines else "stream"
def extract(path: Path, pages: str, flavor: str) -> list[pd.DataFrame]:
if flavor == "ocr":
raise SystemExit(
"OCR pipeline required — run: python -m ocr_extract "
"(see how-to-extract-tables-from-scanned-pdfs/)"
)
if flavor in ("lattice", "stream"):
tables = camelot.read_pdf(str(path), pages=pages, flavor=flavor,
process_background=(flavor == "lattice"))
return [t.df for t in tables]
# pdfplumber fallback
dfs = []
with pdfplumber.open(path) as pdf:
for page in pdf.pages:
for raw in (page.extract_tables() or []):
if raw and len(raw) > 1:
dfs.append(pd.DataFrame(raw[1:], columns=raw[0]))
return dfs
def main() -> None:
parser = argparse.ArgumentParser(description="Extract tables from a PDF")
parser.add_argument("pdf", type=Path, help="Path to input PDF")
parser.add_argument("--pages", default="1", help="Pages to parse, e.g. 1-5 or 1,3,5")
parser.add_argument(
"--flavor", choices=["auto", "lattice", "stream", "pdfplumber"],
default="auto", help="Extraction method"
)
parser.add_argument("--output", type=Path, default=Path("output"),
help="Output directory for CSV files")
args = parser.parse_args()
if not args.pdf.exists():
sys.exit(f"File not found: {args.pdf}")
args.output.mkdir(parents=True, exist_ok=True)
flavor = classify(args.pdf) if args.flavor == "auto" else args.flavor
print(f"Using flavor: {flavor}")
try:
tables = extract(args.pdf, args.pages, flavor)
except Exception as e:
sys.exit(f"Extraction failed: {e}")
if not tables:
sys.exit("No tables found. Try --flavor stream or --flavor pdfplumber.")
for i, df in enumerate(tables, start=1):
df.replace("", pd.NA, inplace=True)
df.dropna(how="all", inplace=True)
out = args.output / f"table_{i:03d}.csv"
df.to_csv(out, index=False)
print(f" [{i}] {df.shape[0]} rows × {df.shape[1]} cols → {out}")
print(f"Done. {len(tables)} table(s) exported to {args.output}/")
if __name__ == "__main__":
main()
Related
- Fix PDF Text Extraction Alignment Issues — resolve jumbled columns from coordinate drift
- How to Extract Tables from Scanned PDFs — OCR pipeline for rasterized documents
- Fix Camelot Import Error on Linux — resolve Ghostscript and cv2 import failures
- Scanning and OCR Processing with Python — broader OCR preprocessing techniques
- Cleaning Messy CSV Data with pandas — clean extracted tables after export