Merging Multiple Spreadsheets
Manual copy-paste consolidation breaks the moment a second person edits a file or a folder grows past a handful of workbooks. Columns drift, rows duplicate, and someone's monthly report quietly disappears. This guide replaces that process with a repeatable Python script that discovers files with glob, aligns mismatched schemas, applies the right pandas operation (concat, merge, or join), deduplicates, and handles multi-sheet workbooks — producing one clean table every time.
For reading individual files before consolidation, see Reading Excel Files with Python. For cleaning up encoding and header noise before merging, see Cleaning Messy CSV Data with pandas.
Prerequisites
# pip install pandas openpyxl
pip install pandas openpyxl
Test data layout assumed throughout:
reports/
jan_sales.xlsx
feb_sales.xlsx
mar_sales.xlsx
targets.xlsx # different schema — used for merge examples
Create throwaway fixtures to follow along:
# pip install pandas openpyxl
from pathlib import Path
import pandas as pd
Path("reports").mkdir(exist_ok=True)
for month, data in [
("jan", {"region": ["North", "South"], "revenue": [12000, 9500], "units": [120, 95]}),
("feb", {"region": ["North", "South"], "revenue": [13200, 10100], "units": [130, 101]}),
("mar", {"region": ["North", "South"], "revenue": [11800, 9800], "units": [115, 98]}),
]:
pd.DataFrame(data).to_excel(f"reports/{month}_sales.xlsx", index=False)
pd.DataFrame({"region": ["North", "South"], "target": [40000, 32000]}).to_excel(
"reports/targets.xlsx", index=False
)
Step 1 — Discover Files with glob
Hard-coded file lists break when a new month is added. Use pathlib.Path.glob (or rglob for nested folders) and filter out Office lock files that start with ~$.
# pip install pandas openpyxl
from pathlib import Path
REPORTS_DIR = Path("reports")
def discover_files(directory: Path, pattern: str = "*.xlsx") -> list[Path]:
"""Return sorted list of matching files, skipping Office temp files."""
return sorted(
p for p in directory.glob(pattern)
if not p.name.startswith(("~$", "."))
)
sales_files = [f for f in discover_files(REPORTS_DIR) if "sales" in f.stem]
print(sales_files)
# [PosixPath('reports/feb_sales.xlsx'), PosixPath('reports/jan_sales.xlsx'), ...]
Pass multiple patterns by calling discover_files twice and combining the lists, or switch to rglob("**/*.csv") for recursive discovery.
Step 2 — Load Each File into a DataFrame
# pip install pandas openpyxl
from pathlib import Path
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
def load_files(paths: list[Path]) -> list[pd.DataFrame]:
"""Load .xlsx and .csv files; tag each row with its source filename."""
frames: list[pd.DataFrame] = []
for p in paths:
try:
df = pd.read_excel(p, engine="openpyxl") if p.suffix == ".xlsx" else pd.read_csv(p)
df["source_file"] = p.name # traceability column
frames.append(df)
logging.info("Loaded %s (%d rows)", p.name, len(df))
except Exception as exc:
logging.warning("Skipping %s: %s", p.name, exc)
return frames
Always include source_file. Without it, duplicate-row debugging is painful after consolidation.
Step 3 — Align Mismatched Columns
Before stacking, normalize column names so Client_ID, ClientID, and client id all resolve to the same key. Then check which columns are present in every file vs. only some.
# pip install pandas openpyxl
import pandas as pd
import re
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
"""Lowercase, strip, replace spaces/hyphens with underscores."""
df.columns = [re.sub(r"[\s\-]+", "_", c.strip().lower()) for c in df.columns]
return df
def align_schemas(frames: list[pd.DataFrame]) -> list[pd.DataFrame]:
"""Report column coverage and return normalized frames."""
normalized = [normalize_columns(df.copy()) for df in frames]
all_cols: set[str] = set()
for df in normalized:
all_cols |= set(df.columns)
common = set.intersection(*(set(df.columns) for df in normalized))
extra = all_cols - common
if extra:
logging.info("Columns present in some files only: %s", extra)
return normalized
pd.concat with sort=False will create NaN-filled columns for rows that lack an optional column — that is usually fine for reporting. If a column must be present in every file, validate explicitly here and raise before the concatenation.
The concat vs merge vs join Decision
pd.concat — stacking same-schema files
Use pd.concat when every file represents the same measurement at a different time or from a different source (monthly sales, per-branch exports). It aligns on column names, not position, so column order mismatches are handled automatically.
# pip install pandas openpyxl
from pathlib import Path
import pandas as pd
REPORTS_DIR = Path("reports")
frames = []
for p in sorted(REPORTS_DIR.glob("*_sales.xlsx")):
if p.name.startswith("~$"):
continue
df = pd.read_excel(p, engine="openpyxl")
df["source_file"] = p.name
frames.append(df)
try:
combined = pd.concat(frames, ignore_index=True, sort=False)
except ValueError as exc:
raise RuntimeError("No valid DataFrames to concatenate") from exc
print(combined.shape) # (6, 4) — 3 months × 2 regions
print(combined.head())
Key parameters:
ignore_index=True— re-numbers the index from 0 instead of repeating 0, 1 from each source.sort=False— preserves column order from the first frame; avoids surprising reordering.join="inner"— keep only columns common to all frames (useful when optional columns are genuinely unwanted).
pd.merge — joining on a shared key
Use pd.merge when two DataFrames contain different information about the same entities. The on= parameter names the shared key column. If the key column has different names in each frame use left_on= / right_on=.
# pip install pandas openpyxl
from pathlib import Path
import pandas as pd
REPORTS_DIR = Path("reports")
try:
actuals = pd.concat(
[pd.read_excel(p, engine="openpyxl") for p in sorted(REPORTS_DIR.glob("*_sales.xlsx"))
if not p.name.startswith("~$")],
ignore_index=True
).groupby("region", as_index=False)[["revenue", "units"]].sum()
targets = pd.read_excel(REPORTS_DIR / "targets.xlsx", engine="openpyxl")
result = pd.merge(actuals, targets, on="region", how="left")
result["attainment_pct"] = (result["revenue"] / result["target"] * 100).round(1)
print(result)
except FileNotFoundError as exc:
print(f"Missing file: {exc}")
how="left" keeps every row from actuals even if targets has no matching region — missing targets become NaN rather than silently dropping rows.
join — merging on the index
DataFrame.join is shorthand for a merge on the index. It is useful when both frames are already indexed by the same key (e.g., a date-indexed time series).
# pip install pandas openpyxl
import pandas as pd
df_a = pd.DataFrame({"revenue": [12000, 13200]}, index=["North", "South"])
df_b = pd.DataFrame({"cost": [8000, 7500]}, index=["North", "South"])
combined = df_a.join(df_b) # index-aligned, no explicit key needed
print(combined)
Use join for index-keyed frames, merge for everything else.
Step 4 — Handle Multi-Sheet Workbooks
When one .xlsx file contains multiple sheets, pd.read_excel(sheet_name=None) returns an OrderedDict mapping sheet name to DataFrame.
# pip install pandas openpyxl
from pathlib import Path
import pandas as pd
def load_all_sheets(path: Path) -> pd.DataFrame:
"""Concatenate all sheets in a workbook into one DataFrame."""
try:
sheets: dict[str, pd.DataFrame] = pd.read_excel(
path, sheet_name=None, engine="openpyxl"
)
except Exception as exc:
raise RuntimeError(f"Cannot open {path}: {exc}") from exc
frames = []
for sheet_name, df in sheets.items():
df["sheet"] = sheet_name # track origin sheet
frames.append(df)
return pd.concat(frames, ignore_index=True, sort=False)
# Usage
wb_path = Path("reports") / "annual_summary.xlsx"
if wb_path.exists():
all_data = load_all_sheets(wb_path)
print(all_data["sheet"].value_counts())
This pattern is identical to the multi-file loop: tag the source, then concatenate.
Step 5 — Deduplicate
Concatenating data from overlapping sources (daily extracts that include the prior day, workbooks shared between teams) inevitably introduces duplicates.
# pip install pandas openpyxl
import pandas as pd
def deduplicate(df: pd.DataFrame, subset: list[str] | None = None, keep: str = "last") -> pd.DataFrame:
"""
Drop exact duplicate rows.
subset — columns that define identity; None means all columns.
keep — 'last' retains the most recently loaded copy (useful when later files are corrections).
"""
before = len(df)
df = df.drop_duplicates(subset=subset, keep=keep).reset_index(drop=True)
removed = before - len(df)
if removed:
print(f"Removed {removed} duplicate rows ({before} → {len(df)})")
return df
# deduplicate(combined, subset=["region", "source_file"])
For near-duplicate detection (same entity, slightly different values) use a hash of normalized key columns rather than relying on exact equality.
Step 6 — Validate Before Export
# pip install pandas openpyxl
import pandas as pd
def validate(df: pd.DataFrame, required_cols: list[str]) -> None:
missing = [c for c in required_cols if c not in df.columns]
if missing:
raise ValueError(f"Missing required columns: {missing}")
null_counts = df[required_cols].isnull().sum()
if null_counts.any():
print("Nulls in required columns:\n", null_counts[null_counts > 0])
assert df.index.is_unique, "Index is not unique — call reset_index(drop=True)"
print(f"Validation passed. Shape: {df.shape}")
# validate(combined, required_cols=["region", "revenue", "units"])
Edge Cases & Variants
Variant A — Mixed .xlsx and .csv in the same folder
# pip install pandas openpyxl
from pathlib import Path
import pandas as pd
DATA_DIR = Path("mixed_reports")
frames = []
for p in DATA_DIR.iterdir():
if p.suffix == ".xlsx" and not p.name.startswith("~$"):
frames.append(pd.read_excel(p, engine="openpyxl"))
elif p.suffix == ".csv":
frames.append(pd.read_csv(p))
combined = pd.concat(frames, ignore_index=True, sort=False)
Variant B — read_excel with explicit sheet and header row
Some workbooks have metadata rows above the actual header. Use header= to skip them.
# pip install pandas openpyxl
import pandas as pd
from pathlib import Path
df = pd.read_excel(
Path("reports") / "budget_2026.xlsx",
sheet_name="Q1",
header=3, # row index 3 (0-based) contains the real header
usecols="B:F", # only read columns B through F
engine="openpyxl",
)
Variant C — merging on differently-named keys with validation
If the key column has a different name in each DataFrame, use left_on / right_on. Add validate= to catch unexpected many-to-many joins early.
# pip install pandas openpyxl
import pandas as pd
sales = pd.DataFrame({"sale_region": ["North", "South"], "revenue": [37000, 29400]})
targets = pd.DataFrame({"region": ["North", "South"], "target": [40000, 32000]})
try:
result = pd.merge(
sales, targets,
left_on="sale_region", right_on="region",
how="left",
validate="many_to_one", # raises MergeError if targets has duplicate region keys
)
result = result.drop(columns=["region"]) # remove the redundant right-hand key column
print(result)
except pd.errors.MergeError as exc:
print(f"Join cardinality error: {exc}")
When the merge produces unexpected _x/_y suffix columns, see Fix pandas merge Overlapping Column Suffixes for root cause and fixes.
Performance & Scale
| Approach | When to use | Typical limit |
|---|---|---|
pd.concat in memory | < 500 MB total | Available RAM |
CSV chunking (chunksize) | Large CSVs, stream-friendly | Unlimited |
dask.dataframe | Parallel, out-of-core, multi-file | Disk size |
DuckDB read_excel / read_csv | SQL over flat files, joins at scale | Disk size |
For workloads that fit in memory, pd.concat is fastest. Beyond ~500 MB, chunked CSV reading or DuckDB avoids out-of-memory crashes without rewriting your logic.
Troubleshooting
| Error | Root cause | Fix |
|---|---|---|
ValueError: No objects to concatenate | frames list is empty — no matching files found | Check glob pattern and directory path; print frames before concat |
KeyError: 'region' after concat | Column name normalized differently in one file (Region vs region) | Apply normalize_columns before concatenating |
MemoryError | Too many large files loaded at once | Process in batches or use dask.dataframe |
Columns appear as col_x / col_y | Shared non-key column names in pd.merge | Set suffixes= or drop/rename before merging — see Fix pandas merge Overlapping Column Suffixes |
MergeError: Merge keys are not unique | validate= caught a cardinality mismatch | Inspect duplicate keys with df[df.duplicated(subset=["key"])] |
Complete Working Script
# pip install pandas openpyxl
"""merge_spreadsheets.py — consolidate a folder of .xlsx and .csv files into one output."""
import argparse
import logging
import re
import sys
from pathlib import Path
import pandas as pd
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
def discover(directory: Path, extensions: tuple[str, ...] = (".xlsx", ".csv")) -> list[Path]:
paths: list[Path] = []
for ext in extensions:
paths.extend(p for p in directory.rglob(f"*{ext}") if not p.name.startswith(("~$", ".")))
return sorted(paths)
def load(paths: list[Path]) -> list[pd.DataFrame]:
frames: list[pd.DataFrame] = []
for p in paths:
try:
df = pd.read_excel(p, engine="openpyxl") if p.suffix == ".xlsx" else pd.read_csv(p)
df["source_file"] = p.name
frames.append(df)
logging.info("Loaded %s (%d rows)", p.name, len(df))
except Exception as exc:
logging.warning("Skipping %s: %s", p.name, exc)
return frames
def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
df.columns = [re.sub(r"[\s\-]+", "_", c.strip().lower()) for c in df.columns]
return df
def main() -> None:
parser = argparse.ArgumentParser(description="Merge spreadsheets in a folder")
parser.add_argument("input_dir", type=Path, help="Folder containing source files")
parser.add_argument("output_file", type=Path, help="Destination .csv or .xlsx")
parser.add_argument("--dedup-cols", nargs="*", help="Columns that define a unique row")
args = parser.parse_args()
if not args.input_dir.is_dir():
sys.exit(f"Not a directory: {args.input_dir}")
paths = discover(args.input_dir)
if not paths:
sys.exit("No .xlsx or .csv files found")
frames = load(paths)
if not frames:
sys.exit("All files failed to load")
frames = [normalize_cols(df) for df in frames]
try:
combined = pd.concat(frames, ignore_index=True, sort=False)
except ValueError as exc:
sys.exit(f"Concatenation failed: {exc}")
if args.dedup_cols:
before = len(combined)
combined = combined.drop_duplicates(subset=args.dedup_cols, keep="last")
logging.info("Dedup removed %d rows", before - len(combined))
combined = combined.reset_index(drop=True)
try:
if args.output_file.suffix == ".xlsx":
combined.to_excel(args.output_file, index=False, engine="openpyxl")
else:
combined.to_csv(args.output_file, index=False)
logging.info("Wrote %d rows to %s", len(combined), args.output_file)
except Exception as exc:
sys.exit(f"Export failed: {exc}")
if __name__ == "__main__":
main()
Run it:
python merge_spreadsheets.py reports/ output/combined.csv --dedup-cols region source_file
Related
- Reading Excel Files with Python — engine options, sheet selection, and header row handling before you merge
- Cleaning Messy CSV Data with pandas — normalize headers and fix encoding before concatenating
- Converting Excel to JSON with Python — serialize the merged table for API or web consumption
- Fix pandas merge Overlapping Column Suffixes — resolve
_x/_ycolumns after a merge