#!/usr/bin/env python3 """XLSX/XLS → CSV converter for SEAP data.gov.ro yearly dumps. Reads the first sheet, writes a UTF-8 CSV (comma + double-quote) so the existing SEAP normalizer (import-seap-historical.py) can ingest it. Auto-detects file format: - XLSX (zip archive) → openpyxl - XLS (BIFF8 OLE) → xlrd 1.x Usage: python3 xlsx-to-csv.py INPUT.{xlsx|xls} OUTPUT.csv """ import csv import sys from datetime import datetime from pathlib import Path def is_xlsx(path: Path) -> bool: """XLSX is a ZIP archive (PK header).""" with path.open("rb") as f: return f.read(2) == b"PK" def main() -> None: if len(sys.argv) != 3: print(__doc__) sys.exit(2) src = Path(sys.argv[1]) dst = Path(sys.argv[2]) written = 0 if is_xlsx(src): import openpyxl wb = openpyxl.load_workbook(src, read_only=True, data_only=True) ws = wb.active with dst.open("w", encoding="utf-8", newline="") as f: w = csv.writer(f, quoting=csv.QUOTE_MINIMAL) for row in ws.iter_rows(values_only=True): out = [] for v in row: if v is None: out.append("") elif isinstance(v, datetime): out.append(v.strftime("%m/%d/%Y %H:%M:%S")) elif isinstance(v, float) and v.is_integer(): out.append(str(int(v))) else: out.append(str(v)) w.writerow(out) written += 1 else: # Legacy XLS via xlrd 1.x — concat ALL sheets (some big SEAP files use # multiple sheets due to the 65k row limit in old XLS format). import xlrd b = xlrd.open_workbook(str(src)) wrote_header = False with dst.open("w", encoding="utf-8", newline="") as f: w = csv.writer(f, quoting=csv.QUOTE_MINIMAL) for sidx, sname in enumerate(b.sheet_names()): sh = b.sheet_by_index(sidx) if sh.nrows == 0: continue start = 0 if wrote_header: start = 1 # skip repeated header on subsequent sheets else: wrote_header = True for ridx in range(start, sh.nrows): row = sh.row(ridx) out = [] for cell in row: if cell.ctype == xlrd.XL_CELL_EMPTY or cell.ctype == xlrd.XL_CELL_BLANK: out.append("") elif cell.ctype == xlrd.XL_CELL_DATE: try: tup = xlrd.xldate_as_tuple(cell.value, b.datemode) out.append(datetime(*tup).strftime("%m/%d/%Y %H:%M:%S")) except Exception: out.append(str(cell.value)) elif cell.ctype == xlrd.XL_CELL_NUMBER: v = cell.value if v == int(v): out.append(str(int(v))) else: out.append(str(v)) else: out.append(str(cell.value)) w.writerow(out) written += 1 print(f"[xlsx2csv] {src.name} → {dst.name}: {written} rows", file=sys.stderr) if __name__ == "__main__": main()