vreau-digital/services/seap-scraper/scripts/xlsx-to-csv.py

#!/usr/bin/env python3
"""XLSX/XLS → CSV converter for SEAP data.gov.ro yearly dumps.

Reads the first sheet, writes a UTF-8 CSV (comma + double-quote) so the
existing SEAP normalizer (import-seap-historical.py) can ingest it.

Auto-detects file format:
- XLSX (zip archive) → openpyxl
- XLS (BIFF8 OLE)    → xlrd 1.x

Usage: python3 xlsx-to-csv.py INPUT.{xlsx|xls} OUTPUT.csv
"""
import csv
import sys
from datetime import datetime
from pathlib import Path


def is_xlsx(path: Path) -> bool:
    """XLSX is a ZIP archive (PK header)."""
    with path.open("rb") as f:
        return f.read(2) == b"PK"


def main() -> None:
    if len(sys.argv) != 3:
        print(__doc__)
        sys.exit(2)
    src = Path(sys.argv[1])
    dst = Path(sys.argv[2])
    written = 0

    if is_xlsx(src):
        import openpyxl
        wb = openpyxl.load_workbook(src, read_only=True, data_only=True)
        ws = wb.active
        with dst.open("w", encoding="utf-8", newline="") as f:
            w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            for row in ws.iter_rows(values_only=True):
                out = []
                for v in row:
                    if v is None:
                        out.append("")
                    elif isinstance(v, datetime):
                        out.append(v.strftime("%m/%d/%Y %H:%M:%S"))
                    elif isinstance(v, float) and v.is_integer():
                        out.append(str(int(v)))
                    else:
                        out.append(str(v))
                w.writerow(out)
                written += 1
    else:
        # Legacy XLS via xlrd 1.x — concat ALL sheets (some big SEAP files use
        # multiple sheets due to the 65k row limit in old XLS format).
        import xlrd
        b = xlrd.open_workbook(str(src))
        wrote_header = False
        with dst.open("w", encoding="utf-8", newline="") as f:
            w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            for sidx, sname in enumerate(b.sheet_names()):
                sh = b.sheet_by_index(sidx)
                if sh.nrows == 0:
                    continue
                start = 0
                if wrote_header:
                    start = 1  # skip repeated header on subsequent sheets
                else:
                    wrote_header = True
                for ridx in range(start, sh.nrows):
                    row = sh.row(ridx)
                    out = []
                    for cell in row:
                        if cell.ctype == xlrd.XL_CELL_EMPTY or cell.ctype == xlrd.XL_CELL_BLANK:
                            out.append("")
                        elif cell.ctype == xlrd.XL_CELL_DATE:
                            try:
                                tup = xlrd.xldate_as_tuple(cell.value, b.datemode)
                                out.append(datetime(*tup).strftime("%m/%d/%Y %H:%M:%S"))
                            except Exception:
                                out.append(str(cell.value))
                        elif cell.ctype == xlrd.XL_CELL_NUMBER:
                            v = cell.value
                            if v == int(v):
                                out.append(str(int(v)))
                            else:
                                out.append(str(v))
                        else:
                            out.append(str(cell.value))
                    w.writerow(out)
                    written += 1
    print(f"[xlsx2csv] {src.name} → {dst.name}: {written} rows", file=sys.stderr)


if __name__ == "__main__":
    main()