Fix PDF Text Extraction Alignment Issues
Calling page.extract_text() or splitting on newlines returns jumbled output when the PDF contains multi-column layouts. The symptom is merged numeric values like "12,450.001,200.50", column headers appearing mid-row, or a complete reversal of reading order. The cause is not a bug in the library — it is a fundamental property of the PDF format.
Root Cause
PDFs store each glyph as an independent object with an absolute x/y coordinate. There is no concept of "row" or "column" in the format. When extract_text() reassembles glyphs, it sorts by vertical position and then reads left-to-right — but two text blocks from different columns that share a similar y-coordinate land on the same output line, concatenated without a separator.
The fix: bypass extract_text() entirely and work with pdfplumber's extract_words(), which returns each word with its bounding box. Cluster words into rows using a y-tolerance, then sort each row by x-position.
Minimal Diagnostic
Confirm the root cause before investing in a full fix:
# pip install pdfplumber
from pathlib import Path
import pdfplumber
import statistics
PDF_PATH = Path("data/report.pdf")
def diagnose_alignment(path: Path, page_idx: int = 0) -> dict:
    """Report coordinate statistics that reveal column overlap on one PDF page.

    Words are grouped into visual rows by their ``top`` coordinate (tolerance:
    half the median word height). Within each row the words are ordered left to
    right, and every pair of *neighbouring* words separated by more than 20pt
    is counted as a likely column boundary. Only neighbours are compared:
    comparing all pairs would flag any ordinary line of text whose first and
    last words are more than 20pt apart.

    Args:
        path: PDF file to inspect.
        page_idx: Zero-based index of the page to analyse.

    Returns:
        ``{"status": "empty", "word_count": 0}`` when the page yields no words;
        otherwise a dict with the keys ``word_count``, ``median_line_height``,
        ``column_overlap_pairs`` and ``recommended_y_tolerance``.

    Raises:
        RuntimeError: If the file cannot be opened or the page cannot be read.
    """
    try:
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[page_idx]
            words = page.extract_words(x_tolerance=2)
    except Exception as e:
        raise RuntimeError(f"Could not open {path}: {e}") from e

    if not words:
        return {"status": "empty", "word_count": 0}

    median_h = statistics.median(w["bottom"] - w["top"] for w in words)

    # Group words into visual rows; a row is anchored on its first (topmost) word.
    row_tolerance = median_h * 0.5
    rows: list[list[dict]] = []
    for word in sorted(words, key=lambda w: w["top"]):
        if rows and abs(word["top"] - rows[-1][0]["top"]) < row_tolerance:
            rows[-1].append(word)
        else:
            rows.append([word])

    # Count wide gaps between horizontally adjacent words only.
    overlaps = 0
    for row in rows:
        row.sort(key=lambda w: w["x0"])
        overlaps += sum(
            1
            for left, right in zip(row, row[1:])
            if right["x0"] - left["x1"] > 20  # >20pt gap → likely different column
        )

    return {
        "word_count": len(words),
        "median_line_height": round(median_h, 2),
        "column_overlap_pairs": overlaps,
        "recommended_y_tolerance": round(median_h * 0.4, 2),
    }
if __name__ == "__main__":
    # A column_overlap_pairs value above zero means coordinate sorting is needed.
    print(diagnose_alignment(PDF_PATH))
If column_overlap_pairs is zero, the file does not have multi-column misalignment — check instead for encoding issues (see Fixing Encoding Errors in CSV Files if the garbling appears after a CSV export step).
Fix: Coordinate-Sorted Row Reconstruction
Replace extract_text() with a word-level reconstruction pipeline. Every line that differs from the naive approach is marked with a # changed: comment:
# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd
import statistics
PDF_PATH = Path("data/report.pdf")
OUTPUT_PATH = Path("output/aligned.csv")
def extract_aligned_rows(
path: Path,
page_idx: int = 0,
y_tolerance: float | None = None, # None → auto-detect from median line height
x_tolerance: int = 2, # merge kerned/hyphenated glyphs within 2pt
) -> list[list[str]]:
"""
Extract text rows from a PDF page using x/y coordinate sorting.
Returns a list of rows, where each row is a list of word strings.
"""
try:
with pdfplumber.open(path) as pdf:
page = pdf.pages[page_idx]
# extract_words returns dicts with x0, x1, top, bottom, text
words = page.extract_words(x_tolerance=x_tolerance) # changed: was extract_text()
except Exception as e:
raise RuntimeError(f"Failed to read {path}: {e}") from e
if not words:
return []
# Auto-detect y_tolerance from median glyph height
if y_tolerance is None:
heights = [w["bottom"] - w["top"] for w in words]
y_tolerance = statistics.median(heights) * 0.4 # changed: was hardcoded 3.0
# Sort words top-to-bottom
words.sort(key=lambda w: w["top"]) # changed: was not sorted
rows: list[list[dict]] = []
current_row: list[dict] = [words[0]]
current_top: float = words[0]["top"]
for word in words[1:]:
if abs(word["top"] - current_top) <= y_tolerance: # same visual row
current_row.append(word)
else:
current_row.sort(key=lambda w: w["x0"]) # changed: sort left-to-right
rows.append([w["text"] for w in current_row])
current_row = [word]
current_top = word["top"]
if current_row:
current_row.sort(key=lambda w: w["x0"])
rows.append([w["text"] for w in current_row])
return rows
if __name__ == "__main__":
    # parents=True also creates missing intermediate directories, so the
    # script keeps working if OUTPUT_PATH is moved to a nested folder.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    rows = extract_aligned_rows(PDF_PATH)
    df = pd.DataFrame(rows)
    df.to_csv(OUTPUT_PATH, index=False, header=False)
    print(f"Exported {len(rows)} rows to {OUTPUT_PATH}")
The four # changed: comments mark every line that differs from the naive extract_text() approach.
Variant Fix 1: Use layout=True for Simple Single-Column Misalignment
For documents where text flows in a single column but extract_text() still scrambles order, the layout=True parameter re-sorts glyphs spatially before assembling output:
# pip install pdfplumber
from pathlib import Path
import pdfplumber
PDF_PATH = Path("data/report.pdf")
def extract_with_layout(path: Path) -> str:
    """Return the text of every page, extracted with layout=True, joined by blank lines.

    Pages for which extraction returns nothing are skipped.
    """
    with pdfplumber.open(path) as pdf:
        # Request layout-aware extraction instead of the default reading order.
        rendered = (
            page.extract_text(layout=True)  # changed: added layout=True
            for page in pdf.pages
        )
        # Consume the generator while the PDF is still open.
        non_empty_pages = [text for text in rendered if text]
    return "\n\n".join(non_empty_pages)
if __name__ == "__main__":
    # Preview only the first 500 characters of the extracted text.
    print(extract_with_layout(PDF_PATH)[:500])
layout=True is simpler but less precise than the word-level approach — it works for single-column text but still merges columns in multi-column PDFs.
Variant Fix 2: Char-Level Clustering with chars
When even extract_words() splits tokens incorrectly (common with PDFs generated from LaTeX or some CAD tools), drop to character-level objects and cluster manually:
# pip install pdfplumber
from pathlib import Path
import pdfplumber
PDF_PATH = Path("data/cad_export.pdf")
def cluster_chars(path: Path, page_idx: int = 0, y_tol: float = 2.0, x_merge: float = 1.5) -> list[str]:
    """Reconstruct text lines from individual characters for heavily fragmented PDFs.

    Characters are first grouped into rows by their ``top`` coordinate, then
    each row is ordered by ``x0``. Ordering by a rounded ``top / y_tol`` bucket
    instead would split one visual row across two buckets whenever its tops
    straddle a rounding boundary, scrambling the left-to-right order (and it
    divides by zero for ``y_tol=0``).

    Args:
        path: PDF file to read.
        page_idx: Zero-based index of the page to process.
        y_tol: A character joins the current row when its ``top`` is less than
            this far below the ``top`` of the character that opened the row.
        x_merge: A single space is inserted between neighbouring characters
            whose horizontal gap is at least this wide.

    Returns:
        One string per reconstructed line, top to bottom; empty list if the
        page has no characters.
    """
    with pdfplumber.open(path) as pdf:
        chars = pdf.pages[page_idx].chars  # list of dicts with x0, x1, top, text, etc.

    if not chars:
        return []

    # Pass 1: cluster into rows, anchored on the topmost character of each row.
    rows: list[list[dict]] = []
    row_top = 0.0
    for ch in sorted(chars, key=lambda c: c["top"]):
        if rows and ch["top"] - row_top < y_tol:
            rows[-1].append(ch)
        else:
            rows.append([ch])
            row_top = ch["top"]

    # Pass 2: order each row left to right and insert spaces at wide gaps.
    lines: list[str] = []
    for row in rows:
        row.sort(key=lambda c: c["x0"])
        pieces = [row[0]["text"]]
        for prev, ch in zip(row, row[1:]):
            if ch["x0"] - prev["x1"] >= x_merge:
                pieces.append(" ")
            pieces.append(ch["text"])
        lines.append("".join(pieces))
    return lines
if __name__ == "__main__":
    # Show the first ten reconstructed lines; repr() makes stray whitespace visible.
    for preview in cluster_chars(PDF_PATH)[:10]:
        print(repr(preview))
Handling Multi-Page Documents
When extracting from a document with several pages, iterate and keep the first row of the first page as the canonical header. Do not concatenate raw word lists before promoting a header — the header will appear duplicated on every page if the PDF was generated with page-level table repeats.
# pip install pdfplumber pandas
from pathlib import Path
import pdfplumber
import pandas as pd
import statistics
PDF_PATH = Path("data/multi_page_report.pdf")
OUTPUT_PATH = Path("output/multi_page_aligned.csv")
def _rows_from_words(words: list[dict], y_tol: float) -> list[list[str]]:
    """Cluster word dicts into visual rows and return each row's text, left to right."""
    rows: list[list[dict]] = []
    row_top = 0.0
    for word in sorted(words, key=lambda w: w["top"]):
        if rows and abs(word["top"] - row_top) <= y_tol:
            rows[-1].append(word)
        else:
            rows.append([word])
            row_top = word["top"]
    return [[w["text"] for w in sorted(row, key=lambda w: w["x0"])] for row in rows]


def extract_all_pages(path: Path, x_tolerance: int = 2) -> pd.DataFrame:
    """
    Extract aligned rows from every page of a PDF and return a single DataFrame.

    The first row of the first non-empty page becomes the column header, and
    rows on any page that equal that header are dropped (page-break repeats).
    Rows are padded with missing values to a common width. If a data row has
    more cells than the header, the surplus columns are named ``extra_<index>``
    rather than failing on a header/data width mismatch.

    Args:
        path: PDF file to read.
        x_tolerance: Passed to ``extract_words`` on every page.

    Returns:
        A DataFrame of the extracted rows, or an empty DataFrame when no data
        rows were found.

    Raises:
        RuntimeError: If the PDF cannot be opened or a page cannot be read.
    """
    all_rows: list[list[str]] = []
    header: list[str] | None = None

    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                words = page.extract_words(x_tolerance=x_tolerance)
                if not words:
                    continue

                # Row tolerance is derived per page from the median word height.
                y_tol = statistics.median(w["bottom"] - w["top"] for w in words) * 0.4
                rows = _rows_from_words(words, y_tol)

                if header is None and rows:
                    header, rows = rows[0], rows[1:]

                # Drop rows that match the canonical header (page-break repeats)
                all_rows.extend(r for r in rows if r != header)
    except Exception as e:
        raise RuntimeError(f"Extraction failed on {path}: {e}") from e

    if not all_rows:
        return pd.DataFrame()

    width = max(len(r) for r in all_rows)
    if not header:
        columns: list = list(range(width))
    else:
        # The header and the widest data row may disagree; size the frame to
        # the larger of the two and name any columns the header does not cover.
        width = max(width, len(header))
        columns = list(header) + [f"extra_{i}" for i in range(len(header), width)]

    padded = [r + [""] * (width - len(r)) for r in all_rows]
    df = pd.DataFrame(padded, columns=columns)
    df.replace("", pd.NA, inplace=True)
    return df
if __name__ == "__main__":
    # parents=True also creates missing intermediate directories, so the
    # script keeps working if OUTPUT_PATH is moved to a nested folder.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df = extract_all_pages(PDF_PATH)
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"Exported {len(df)} rows × {df.shape[1]} cols")
Troubleshooting Common Symptoms
| Symptom | Root cause | Fix |
|---|---|---|
| Two columns merged into one ("12,450.001,200.50") | extract_text() reads same-y glyphs left-to-right without column awareness | Switch to extract_words() + coordinate sorting |
| Row count doubles on second and later pages | Header row not being detected and dropped | Compare each row against canonical_header; drop matches |
| Single words split across multiple output rows | y_tolerance too small — sub-pixel font variations cause word fragmentation | Increase y_tolerance to median_h * 0.5 or 0.6 |
| Entire page text in a single line | y_tolerance too large — all words collapse into one row | Reduce to median_h * 0.25; check DPI if it is a scanned image |
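| Latin-1 special characters garbled | Encoding mismatch between how the CSV is written and how it is read | Pass encoding explicitly to to_csv() (pandas writes UTF-8 by default; use encoding="utf-8-sig" if the file will be opened in Excel); check source PDF font encoding |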
Verification
Confirm the fix worked by checking four things:
- No merged numerics. Parse the output CSV with pd.to_numeric(..., errors="coerce") and assert that the NaN count is low:
import pandas as pd
from pathlib import Path
df = pd.read_csv(Path("output/aligned.csv"), header=None)
for col in df.columns:
    # Strip thousands separators before parsing: to_numeric cannot parse
    # "12,450.00", so a clean column of formatted amounts would otherwise be
    # reported as non-numeric. Merged values such as "12,450.001,200.50"
    # still fail to parse (two decimal points) and are counted as NaN.
    cleaned = df[col].astype(str).str.replace(",", "", regex=False)
    numeric = pd.to_numeric(cleaned, errors="coerce")
    nan_rate = numeric.isna().mean()
    if nan_rate < 0.2:
        print(f"Col {col}: {nan_rate:.0%} non-numeric — looks clean")
    else:
        print(f"Col {col}: {nan_rate:.0%} non-numeric — check alignment")
- Row count matches source. Open the PDF in a viewer, count rows in one table manually, and assert len(rows) == expected.
- Column count is consistent. pd.Series([len(r) for r in rows]).value_counts() should show a single dominant column width; multiple widths indicate rows that were split or merged incorrectly.
- Spot-check known values. If the PDF is a financial report, pick three cells with known values (e.g., a specific total) and assert they appear in the correct column after extraction.
Related
- Extracting Tables from PDFs — full camelot and pdfplumber pipeline including lattice vs stream selection
- How to Extract Tables from Scanned PDFs — when there is no text layer at all
- Cleaning Messy CSV Data with pandas — post-extraction data cleaning and type coercion
- Fixing Encoding Errors in CSV Files — separate issue when garbling comes from character encoding, not coordinates
Part of Extracting Tables from PDFs.