Merging Multiple Spreadsheets

Manual copy-paste consolidation breaks the moment a second person edits a file or a folder grows past a handful of workbooks. Columns drift, rows duplicate, and someone's monthly report quietly disappears. This guide replaces that process with a repeatable Python script that discovers files with glob, aligns mismatched schemas, applies the right pandas operation (concat, merge, or join), deduplicates, and handles multi-sheet workbooks — producing one clean table every time.

For reading individual files before consolidation, see Reading Excel Files with Python. For cleaning up encoding and header noise before merging, see Cleaning Messy CSV Data with pandas.

Prerequisites

# pip install pandas openpyxl
pip install pandas openpyxl

Test data layout assumed throughout:

reports/
  jan_sales.xlsx
  feb_sales.xlsx
  mar_sales.xlsx
  targets.xlsx     # different schema — used for merge examples

Create throwaway fixtures to follow along:

# pip install pandas openpyxl
from pathlib import Path
import pandas as pd

Path("reports").mkdir(exist_ok=True)

for month, data in [
    ("jan", {"region": ["North", "South"], "revenue": [12000, 9500], "units": [120, 95]}),
    ("feb", {"region": ["North", "South"], "revenue": [13200, 10100], "units": [130, 101]}),
    ("mar", {"region": ["North", "South"], "revenue": [11800, 9800], "units": [115, 98]}),
]:
    pd.DataFrame(data).to_excel(f"reports/{month}_sales.xlsx", index=False)

pd.DataFrame({"region": ["North", "South"], "target": [40000, 32000]}).to_excel(
    "reports/targets.xlsx", index=False
)

Step 1 — Discover Files with glob

Hard-coded file lists break when a new month is added. Use pathlib.Path.glob (or rglob for nested folders) and filter out Office lock files that start with ~$.

# pip install pandas openpyxl
from pathlib import Path

REPORTS_DIR = Path("reports")

def discover_files(directory: Path, pattern: str = "*.xlsx") -> list[Path]:
    """Return sorted list of matching files, skipping Office temp files."""
    return sorted(
        p for p in directory.glob(pattern)
        if not p.name.startswith(("~$", "."))
    )

sales_files = [f for f in discover_files(REPORTS_DIR) if "sales" in f.stem]
print(sales_files)
# [PosixPath('reports/feb_sales.xlsx'), PosixPath('reports/jan_sales.xlsx'), ...]

Pass multiple patterns by calling discover_files twice and combining the lists, or switch to rglob("**/*.csv") for recursive discovery.

Step 2 — Load Each File into a DataFrame

# pip install pandas openpyxl
from pathlib import Path
import pandas as pd
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

def load_files(paths: list[Path]) -> list[pd.DataFrame]:
    """Load .xlsx and .csv files; tag each row with its source filename."""
    frames: list[pd.DataFrame] = []
    for p in paths:
        try:
            df = pd.read_excel(p, engine="openpyxl") if p.suffix == ".xlsx" else pd.read_csv(p)
            df["source_file"] = p.name          # traceability column
            frames.append(df)
            logging.info("Loaded %s  (%d rows)", p.name, len(df))
        except Exception as exc:
            logging.warning("Skipping %s: %s", p.name, exc)
    return frames

Always include source_file. Without it, duplicate-row debugging is painful after consolidation.

Step 3 — Align Mismatched Columns

Before stacking, normalize column names so Client_ID, ClientID, and client id all resolve to the same key. Then check which columns are present in every file vs. only some.

# pip install pandas openpyxl
import pandas as pd
import re

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Lowercase, strip, replace spaces/hyphens with underscores."""
    df.columns = [re.sub(r"[\s\-]+", "_", c.strip().lower()) for c in df.columns]
    return df

def align_schemas(frames: list[pd.DataFrame]) -> list[pd.DataFrame]:
    """Report column coverage and return normalized frames."""
    normalized = [normalize_columns(df.copy()) for df in frames]
    all_cols: set[str] = set()
    for df in normalized:
        all_cols |= set(df.columns)
    common = set.intersection(*(set(df.columns) for df in normalized))
    extra  = all_cols - common
    if extra:
        logging.info("Columns present in some files only: %s", extra)
    return normalized

pd.concat with sort=False will create NaN-filled columns for rows that lack an optional column — that is usually fine for reporting. If a column must be present in every file, validate explicitly here and raise before the concatenation.

The concat vs merge vs join Decision

Merging spreadsheets workflow Flow diagram showing: multiple source files go through glob discovery and column alignment, then branch to pd.concat for same-schema stacking or pd.merge for key-based joining, both producing one consolidated table. jan_sales.xlsx region · revenue · units feb_sales.xlsx region · revenue · units mar_sales.xlsx region · revenue · units targets.xlsx region · target glob + normalize column names different schema needs a key column same schema? Yes pd.concat stack rows No pd.merge join on key one table dedup · validate

pd.concat — stacking same-schema files

Use pd.concat when every file represents the same measurement at a different time or from a different source (monthly sales, per-branch exports). It aligns on column names, not position, so column order mismatches are handled automatically.

# pip install pandas openpyxl
from pathlib import Path
import pandas as pd

REPORTS_DIR = Path("reports")

frames = []
for p in sorted(REPORTS_DIR.glob("*_sales.xlsx")):
    if p.name.startswith("~$"):
        continue
    df = pd.read_excel(p, engine="openpyxl")
    df["source_file"] = p.name
    frames.append(df)

try:
    combined = pd.concat(frames, ignore_index=True, sort=False)
except ValueError as exc:
    raise RuntimeError("No valid DataFrames to concatenate") from exc

print(combined.shape)   # (6, 4) — 3 months × 2 regions
print(combined.head())

Key parameters:

  • ignore_index=True — re-numbers the index from 0 instead of repeating 0, 1 from each source.
  • sort=False — preserves column order from the first frame; avoids surprising reordering.
  • join="inner" — keep only columns common to all frames (useful when optional columns are genuinely unwanted).

pd.merge — joining on a shared key

Use pd.merge when two DataFrames contain different information about the same entities. The on= parameter names the shared key column. If the key column has different names in each frame use left_on= / right_on=.

# pip install pandas openpyxl
from pathlib import Path
import pandas as pd

REPORTS_DIR = Path("reports")

try:
    actuals  = pd.concat(
        [pd.read_excel(p, engine="openpyxl") for p in sorted(REPORTS_DIR.glob("*_sales.xlsx"))
         if not p.name.startswith("~$")],
        ignore_index=True
    ).groupby("region", as_index=False)[["revenue", "units"]].sum()

    targets = pd.read_excel(REPORTS_DIR / "targets.xlsx", engine="openpyxl")

    result = pd.merge(actuals, targets, on="region", how="left")
    result["attainment_pct"] = (result["revenue"] / result["target"] * 100).round(1)
    print(result)
except FileNotFoundError as exc:
    print(f"Missing file: {exc}")

how="left" keeps every row from actuals even if targets has no matching region — missing targets become NaN rather than silently dropping rows.

join — merging on the index

DataFrame.join is shorthand for a merge on the index. It is useful when both frames are already indexed by the same key (e.g., a date-indexed time series).

# pip install pandas openpyxl
import pandas as pd

df_a = pd.DataFrame({"revenue": [12000, 13200]}, index=["North", "South"])
df_b = pd.DataFrame({"cost": [8000, 7500]},    index=["North", "South"])

combined = df_a.join(df_b)   # index-aligned, no explicit key needed
print(combined)

Use join for index-keyed frames, merge for everything else.

Step 4 — Handle Multi-Sheet Workbooks

When one .xlsx file contains multiple sheets, pd.read_excel(sheet_name=None) returns an OrderedDict mapping sheet name to DataFrame.

# pip install pandas openpyxl
from pathlib import Path
import pandas as pd

def load_all_sheets(path: Path) -> pd.DataFrame:
    """Concatenate all sheets in a workbook into one DataFrame."""
    try:
        sheets: dict[str, pd.DataFrame] = pd.read_excel(
            path, sheet_name=None, engine="openpyxl"
        )
    except Exception as exc:
        raise RuntimeError(f"Cannot open {path}: {exc}") from exc

    frames = []
    for sheet_name, df in sheets.items():
        df["sheet"] = sheet_name      # track origin sheet
        frames.append(df)

    return pd.concat(frames, ignore_index=True, sort=False)

# Usage
wb_path = Path("reports") / "annual_summary.xlsx"
if wb_path.exists():
    all_data = load_all_sheets(wb_path)
    print(all_data["sheet"].value_counts())

This pattern is identical to the multi-file loop: tag the source, then concatenate.

Step 5 — Deduplicate

Concatenating data from overlapping sources (daily extracts that include the prior day, workbooks shared between teams) inevitably introduces duplicates.

# pip install pandas openpyxl
import pandas as pd

def deduplicate(df: pd.DataFrame, subset: list[str] | None = None, keep: str = "last") -> pd.DataFrame:
    """
    Drop exact duplicate rows.
    subset  — columns that define identity; None means all columns.
    keep    — 'last' retains the most recently loaded copy (useful when later files are corrections).
    """
    before = len(df)
    df = df.drop_duplicates(subset=subset, keep=keep).reset_index(drop=True)
    removed = before - len(df)
    if removed:
        print(f"Removed {removed} duplicate rows ({before}{len(df)})")
    return df

# deduplicate(combined, subset=["region", "source_file"])

For near-duplicate detection (same entity, slightly different values) use a hash of normalized key columns rather than relying on exact equality.

Step 6 — Validate Before Export

# pip install pandas openpyxl
import pandas as pd

def validate(df: pd.DataFrame, required_cols: list[str]) -> None:
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    null_counts = df[required_cols].isnull().sum()
    if null_counts.any():
        print("Nulls in required columns:\n", null_counts[null_counts > 0])

    assert df.index.is_unique, "Index is not unique — call reset_index(drop=True)"
    print(f"Validation passed. Shape: {df.shape}")

# validate(combined, required_cols=["region", "revenue", "units"])

Edge Cases & Variants

Variant A — Mixed .xlsx and .csv in the same folder

# pip install pandas openpyxl
from pathlib import Path
import pandas as pd

DATA_DIR = Path("mixed_reports")

frames = []
for p in DATA_DIR.iterdir():
    if p.suffix == ".xlsx" and not p.name.startswith("~$"):
        frames.append(pd.read_excel(p, engine="openpyxl"))
    elif p.suffix == ".csv":
        frames.append(pd.read_csv(p))

combined = pd.concat(frames, ignore_index=True, sort=False)

Variant B — read_excel with explicit sheet and header row

Some workbooks have metadata rows above the actual header. Use header= to skip them.

# pip install pandas openpyxl
import pandas as pd
from pathlib import Path

df = pd.read_excel(
    Path("reports") / "budget_2026.xlsx",
    sheet_name="Q1",
    header=3,           # row index 3 (0-based) contains the real header
    usecols="B:F",      # only read columns B through F
    engine="openpyxl",
)

Variant C — merging on differently-named keys with validation

If the key column has a different name in each DataFrame, use left_on / right_on. Add validate= to catch unexpected many-to-many joins early.

# pip install pandas openpyxl
import pandas as pd

sales   = pd.DataFrame({"sale_region": ["North", "South"], "revenue": [37000, 29400]})
targets = pd.DataFrame({"region":      ["North", "South"], "target":  [40000, 32000]})

try:
    result = pd.merge(
        sales, targets,
        left_on="sale_region", right_on="region",
        how="left",
        validate="many_to_one",   # raises MergeError if targets has duplicate region keys
    )
    result = result.drop(columns=["region"])   # remove the redundant right-hand key column
    print(result)
except pd.errors.MergeError as exc:
    print(f"Join cardinality error: {exc}")

When the merge produces unexpected _x/_y suffix columns, see Fix pandas merge Overlapping Column Suffixes for root cause and fixes.

Performance & Scale

ApproachWhen to useTypical limit
pd.concat in memory< 500 MB totalAvailable RAM
CSV chunking (chunksize)Large CSVs, stream-friendlyUnlimited
dask.dataframeParallel, out-of-core, multi-fileDisk size
DuckDB read_excel / read_csvSQL over flat files, joins at scaleDisk size

For workloads that fit in memory, pd.concat is fastest. Beyond ~500 MB, chunked CSV reading or DuckDB avoids out-of-memory crashes without rewriting your logic.

Troubleshooting

ErrorRoot causeFix
ValueError: No objects to concatenateframes list is empty — no matching files foundCheck glob pattern and directory path; print frames before concat
KeyError: 'region' after concatColumn name normalized differently in one file (Region vs region)Apply normalize_columns before concatenating
MemoryErrorToo many large files loaded at onceProcess in batches or use dask.dataframe
Columns appear as col_x / col_yShared non-key column names in pd.mergeSet suffixes= or drop/rename before merging — see Fix pandas merge Overlapping Column Suffixes
MergeError: Merge keys are not uniquevalidate= caught a cardinality mismatchInspect duplicate keys with df[df.duplicated(subset=["key"])]

Complete Working Script

# pip install pandas openpyxl
"""merge_spreadsheets.py — consolidate a folder of .xlsx and .csv files into one output."""
import argparse
import logging
import re
import sys
from pathlib import Path

import pandas as pd

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")


def discover(directory: Path, extensions: tuple[str, ...] = (".xlsx", ".csv")) -> list[Path]:
    paths: list[Path] = []
    for ext in extensions:
        paths.extend(p for p in directory.rglob(f"*{ext}") if not p.name.startswith(("~$", ".")))
    return sorted(paths)


def load(paths: list[Path]) -> list[pd.DataFrame]:
    frames: list[pd.DataFrame] = []
    for p in paths:
        try:
            df = pd.read_excel(p, engine="openpyxl") if p.suffix == ".xlsx" else pd.read_csv(p)
            df["source_file"] = p.name
            frames.append(df)
            logging.info("Loaded %s (%d rows)", p.name, len(df))
        except Exception as exc:
            logging.warning("Skipping %s: %s", p.name, exc)
    return frames


def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = [re.sub(r"[\s\-]+", "_", c.strip().lower()) for c in df.columns]
    return df


def main() -> None:
    parser = argparse.ArgumentParser(description="Merge spreadsheets in a folder")
    parser.add_argument("input_dir",  type=Path, help="Folder containing source files")
    parser.add_argument("output_file", type=Path, help="Destination .csv or .xlsx")
    parser.add_argument("--dedup-cols", nargs="*", help="Columns that define a unique row")
    args = parser.parse_args()

    if not args.input_dir.is_dir():
        sys.exit(f"Not a directory: {args.input_dir}")

    paths = discover(args.input_dir)
    if not paths:
        sys.exit("No .xlsx or .csv files found")

    frames = load(paths)
    if not frames:
        sys.exit("All files failed to load")

    frames = [normalize_cols(df) for df in frames]

    try:
        combined = pd.concat(frames, ignore_index=True, sort=False)
    except ValueError as exc:
        sys.exit(f"Concatenation failed: {exc}")

    if args.dedup_cols:
        before = len(combined)
        combined = combined.drop_duplicates(subset=args.dedup_cols, keep="last")
        logging.info("Dedup removed %d rows", before - len(combined))

    combined = combined.reset_index(drop=True)

    try:
        if args.output_file.suffix == ".xlsx":
            combined.to_excel(args.output_file, index=False, engine="openpyxl")
        else:
            combined.to_csv(args.output_file, index=False)
        logging.info("Wrote %d rows to %s", len(combined), args.output_file)
    except Exception as exc:
        sys.exit(f"Export failed: {exc}")


if __name__ == "__main__":
    main()

Run it:

python merge_spreadsheets.py reports/ output/combined.csv --dedup-cols region source_file

Part of Python for Excel & CSV Data Processing.

Explore next