Merging Multiple Spreadsheets

Automating the consolidation of fragmented workbooks eliminates manual copy-paste errors and scales effortlessly across departments. This guide details how to programmatically combine .xlsx and .csv files using Python, building upon core concepts from Python for Excel & CSV Data Processing to deliver a reliable, repeatable workflow. You will learn to identify optimal merge strategies, handle inconsistent headers, and validate output integrity before downstream use.

1. Environment Setup & File Discovery

Before writing consolidation logic, configure a stable Python environment and establish a robust file discovery mechanism. Relying on hardcoded paths or manual file selection introduces fragility into automated pipelines.

Dependencies: install the required libraries via your terminal (pyarrow is needed for the Parquet export in step 5):

pip install pandas openpyxl pyarrow

Use pathlib for cross-platform, recursive directory traversal. Filter explicitly by extension to bypass temporary lock files (e.g., ~$report.xlsx) or system metadata.

# setup_discovery.py
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

def discover_files(directory: str, extensions: tuple = (".xlsx", ".csv")) -> list[Path]:
    """Dynamically locate target files across a directory tree."""
    target_dir = Path(directory)
    if not target_dir.is_dir():
        raise FileNotFoundError(f"Directory not found: {target_dir.resolve()}")

    matched_files = []
    for ext in extensions:
        # Recursive glob to catch nested subdirectories
        matched_files.extend(target_dir.rglob(f"*{ext}"))

    # Filter out temporary/hidden files starting with ~ or .
    valid_files = [f for f in matched_files if not f.name.startswith((".", "~"))]
    logging.info(f"Discovered {len(valid_files)} valid files in {directory}")
    return valid_files

# Usage
# file_paths = discover_files("./monthly_reports")

2. Batch Ingestion & DataFrame Creation

Loading multiple source files into memory requires standardized parsing parameters and defensive error handling. Unlike single-file parsing techniques detailed in Reading Excel Files with Python, batch ingestion demands uniform column mapping and graceful degradation when encountering corrupted workbooks.

# batch_ingestion.py
import pandas as pd
from pathlib import Path
import logging

def load_workbooks(file_paths: list[Path]) -> list[pd.DataFrame]:
    """Load multiple files into DataFrames with consistent parsing rules."""
    dataframes = []

    for file_path in file_paths:
        try:
            # Choose the parser based on extension
            if file_path.suffix == ".xlsx":
                df = pd.read_excel(file_path, engine="openpyxl")
            else:
                df = pd.read_csv(file_path)

            # Strip leading/trailing whitespace from all string cells
            # (DataFrame.map requires pandas >= 2.1; use applymap on older versions)
            df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

            # Tag source for traceability
            df["source_file"] = file_path.name
            dataframes.append(df)
            logging.info(f"Successfully loaded: {file_path.name} ({len(df)} rows)")

        except Exception as e:
            logging.warning(f"Skipping {file_path.name} due to error: {e}")

    return dataframes

# Usage
# df_list = load_workbooks(file_paths)

3. Strategic Consolidation: Concat vs. Merge

Select the appropriate pandas operation based on whether your data shares identical schemas or relational keys. Pre-clean inputs first (see Cleaning Messy CSV Data with Pandas) so that cleanup overhead does not leak into the consolidation phase.

  • Vertical Stacking (pd.concat): Use when files share identical column structures (e.g., monthly sales reports).
  • Horizontal Joining (pd.merge): Use when combining disparate datasets on a shared identifier (e.g., sales records + employee targets).

# consolidation_logic.py
import pandas as pd
import logging

def consolidate_data(
    df_list: list[pd.DataFrame], strategy: str = "stack", key_col: str | None = None
) -> pd.DataFrame:
    """Apply strategic consolidation based on schema alignment."""
    if not df_list:
        raise ValueError("No valid DataFrames provided for consolidation.")

    try:
        if strategy == "stack":
            # Align columns by name, not position. ignore_index prevents overlapping row labels.
            consolidated = pd.concat(df_list, ignore_index=True, sort=False)
            logging.info(f"Stacked {len(df_list)} DataFrames. Total rows: {len(consolidated)}")

        elif strategy == "join" and key_col:
            # Sequential left-join on a shared key column
            base_df = df_list[0]
            for i, df in enumerate(df_list[1:], start=2):
                base_df = pd.merge(base_df, df, on=key_col, how="left", suffixes=("", f"_file{i}"))
            consolidated = base_df
            logging.info(f"Joined {len(df_list)} DataFrames on '{key_col}'.")

        else:
            raise ValueError("Specify 'stack' or 'join' with a valid key_col.")

        return consolidated

    except Exception as e:
        logging.error(f"Consolidation failed: {e}")
        raise

4. Data Validation & Index Management

Ensure row-level accuracy and prevent duplicate indexing artifacts before exporting. Pandas operations can silently introduce NaN proliferation or duplicate row labels if not explicitly managed.

# validation_checks.py
import pandas as pd
import logging

def validate_and_clean(df: pd.DataFrame, expected_cols: list[str] | None = None) -> pd.DataFrame:
    """Reset indexes, verify schema, and handle missing keys gracefully."""
    # 1. Reset index to guarantee unique row identifiers
    df = df.reset_index(drop=True)

    # 2. Schema validation against a known template
    if expected_cols:
        missing = set(expected_cols) - set(df.columns)
        extra = set(df.columns) - set(expected_cols)
        if missing:
            logging.warning(f"Missing expected columns: {missing}")
        if extra:
            logging.info(f"Extra columns detected: {extra}")

    # 3. Cross-check row integrity
    null_counts = df.isnull().sum()
    if null_counts.sum() > 0:
        logging.warning(f"Dataset contains {null_counts.sum()} missing values across columns.")

    # 4. Drop completely empty rows that often result from Excel formatting artifacts.
    # Exclude the source_file tag added during ingestion: it is never NaN, so
    # including it would prevent any row from ever being dropped.
    data_cols = [c for c in df.columns if c != "source_file"]
    df = df.dropna(how="all", subset=data_cols or None)

    logging.info(f"Validation complete. Final shape: {df.shape}")
    return df

5. Export & Downstream Integration

Serialize the consolidated dataset for reporting, archiving, or API consumption, and optimize its storage footprint. When feeding web applications, see Converting Excel to JSON with Python for formatting the output.

# export_pipeline.py
import pandas as pd
from pathlib import Path
import logging

def export_dataset(df: pd.DataFrame, output_dir: str = "./output") -> None:
    """Serialize validated data to optimized formats."""
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    try:
        # Compressed CSV for archival and lightweight sharing
        csv_path = Path(output_dir) / "consolidated_data.csv.gz"
        df.to_csv(csv_path, index=False, compression="gzip")
        logging.info(f"Exported compressed CSV to {csv_path}")

        # Parquet for high-performance downstream analytics (requires pyarrow)
        parquet_path = Path(output_dir) / "consolidated_data.parquet"
        df.to_parquet(parquet_path, index=False, engine="pyarrow")
        logging.info(f"Exported Parquet to {parquet_path}")

    except Exception as e:
        logging.error(f"Export failed: {e}")
        raise

# Usage
# export_dataset(validated_df)
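Putting the five stages together, here is a minimal end-to-end sketch. It inlines plain pandas calls rather than importing the helper modules above, and generates two throwaway CSVs purely for illustration:

```python
# pipeline_demo.py -- minimal sketch: discover, load, stack, validate, export.
import tempfile
from pathlib import Path

import pandas as pd

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    # Create two toy "monthly report" CSVs with identical schemas
    pd.DataFrame({"region": ["North", "South"], "sales": [100, 200]}).to_csv(
        root / "jan.csv", index=False
    )
    pd.DataFrame({"region": ["East"], "sales": [150]}).to_csv(
        root / "feb.csv", index=False
    )

    # Discover: recursive glob, skipping temp/hidden files
    files = sorted(
        f for f in root.rglob("*.csv") if not f.name.startswith((".", "~"))
    )

    # Load and tag provenance
    frames = []
    for f in files:
        df = pd.read_csv(f)
        df["source_file"] = f.name
        frames.append(df)

    # Stack, validate, export
    combined = pd.concat(frames, ignore_index=True, sort=False)
    combined = combined.dropna(how="all").reset_index(drop=True)
    combined.to_csv(root / "consolidated_data.csv.gz", index=False, compression="gzip")

    print(len(combined))                       # 3 rows total
    print(combined["source_file"].nunique())   # 2 distinct source files
```

In production, swap the inlined steps for the discover_files, load_workbooks, consolidate_data, validate_and_clean, and export_dataset functions defined above.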

Common Pitfalls & Mitigation

Each pitfall below lists its root cause and a mitigation strategy:

  • Silent column misalignment. Root cause: slightly different header names (Client_ID vs ClientID) cause pandas to create separate NaN-filled columns. Mitigation: standardize headers during ingestion using a mapping dictionary or regex normalization before stacking.
  • Memory exhaustion. Root cause: loading dozens of multi-megabyte workbooks simultaneously into RAM crashes the kernel. Mitigation: implement chunked processing (pd.read_csv(..., chunksize=...)) or migrate to dask.dataframe for out-of-core computation.
  • Duplicate row labels. Root cause: failing to use ignore_index=True or reset_index() breaks downstream groupby and merge operations. Mitigation: always reset indexes immediately after concatenation; verify uniqueness with df.index.is_unique.
  • Hidden sheets & merged cells. Root cause: Excel formatting artifacts read as empty rows inflate the final dataset. Mitigation: use skip_blank_lines=True and apply dropna(how="all") post-ingestion; specify sheet_name explicitly.
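The header-standardization mitigation can be sketched as a small helper; normalize_headers is an illustrative name, not a pandas API:

```python
# normalize_headers.py -- collapse header spelling variants before stacking.
# Names are lower-cased and stripped of non-alphanumeric characters, so
# "Client_ID", "clientID", and "Client ID" all map to the same column.
import re

import pandas as pd

def normalize_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with canonicalized column names."""
    df = df.copy()
    df.columns = [re.sub(r"[^a-z0-9]", "", str(c).lower()) for c in df.columns]
    return df

# Two frames whose headers differ only in spelling
a = pd.DataFrame({"Client_ID": [1], "Total Sales": [99.0]})
b = pd.DataFrame({"clientID": [2], "total_sales": [50.0]})

stacked = pd.concat([normalize_headers(a), normalize_headers(b)], ignore_index=True)
print(list(stacked.columns))  # ['clientid', 'totalsales']
```

Without the normalization step, the same concat would produce four columns, half of them NaN-filled.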

Frequently Asked Questions

How do I merge spreadsheets with different column orders? Use pd.concat() with ignore_index=True. Pandas aligns columns by name, not positional index, automatically handling reordering without data misalignment.
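A quick illustration of this name-based alignment, using two tiny frames with reversed column orders:

```python
import pandas as pd

jan = pd.DataFrame({"region": ["North"], "sales": [100]})
feb = pd.DataFrame({"sales": [200], "region": ["South"]})  # columns reversed

combined = pd.concat([jan, feb], ignore_index=True)
# Each value lands under the correct header despite the differing order
print(combined.loc[1, "sales"])  # 200
```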

Can I merge files larger than available RAM? Yes. Process files in chunks, utilize dask.dataframe for parallel out-of-core operations, or write intermediate results to disk (e.g., SQLite or Parquet partitions) before final aggregation.
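The chunked approach can be sketched as an incremental aggregation; the file below is a synthetic stand-in for a dataset too large to hold in RAM:

```python
# Chunked aggregation: sum sales per region without loading the whole file.
import tempfile
from pathlib import Path

import pandas as pd

with tempfile.TemporaryDirectory() as tmp:
    big = Path(tmp) / "big.csv"
    pd.DataFrame(
        {"region": ["North", "South"] * 500, "sales": [1] * 1000}
    ).to_csv(big, index=False)

    # Only `chunksize` rows are resident in memory at any one time
    totals = None
    for chunk in pd.read_csv(big, chunksize=100):
        partial = chunk.groupby("region")["sales"].sum()
        totals = partial if totals is None else totals.add(partial, fill_value=0)

print(int(totals["North"]), int(totals["South"]))  # 500 500
```

The same pattern scales to any reduction (sums, counts, min/max) that can be combined across partial results.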

What is the difference between concat, merge, and join in pandas? concat stacks DataFrames vertically or horizontally based on index/column alignment. merge combines DataFrames on specified key columns (SQL-style joins). join merges exclusively on index labels.

How do I track which source file each row came from? Inject a source_file column during the ingestion loop, or pass the keys parameter to pd.concat() to create a hierarchical MultiIndex that preserves origin metadata.
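The keys approach can be sketched as follows; jan.csv and feb.csv here are hypothetical file names standing in for real sources:

```python
import pandas as pd

jan = pd.DataFrame({"sales": [100, 120]})
feb = pd.DataFrame({"sales": [130]})

# keys= builds a MultiIndex whose outer level records each row's origin
combined = pd.concat([jan, feb], keys=["jan.csv", "feb.csv"], names=["source", "row"])
print(combined.loc["feb.csv", "sales"].tolist())  # [130]
```

Note that ignore_index=True would discard the keys, so choose one strategy or the other: a source_file column survives downstream merges more robustly, while the MultiIndex keeps the data columns untouched.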
