#!/usr/bin/env python3
"""
SEAP historical CSV importer for data.gov.ro yearly dumps.

Reads a SEAP CSV (any year/quarter/type) and emits a clean TSV that PostgreSQL
COPY can ingest into seap.announcements. Handles:
  - BOM stripping
  - Romanian decimal commas → dots
  - "MM/DD/YYYY HH:MM:SS" date parsing (with second column variants)
  - Row dedupe by (type, ref_number) — first-row-wins for multi-lot CANs
  - CUI normalization (strip "RO " prefix)

Usage:
  python3 import-seap-historical.py CSV_PATH OUTPUT_TSV TYPE SOURCE

  TYPE:    'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
  SOURCE:  e.g. 'datagov_2024_t1_contracte'

The output TSV columns are FIXED (15 columns matching the import SQL):
  type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
  contract_type, publication_date, contract_date, awarded_value,
  supplier_name, supplier_cui, procedure_type, legislation, source

Column mapping is inferred from CSV headers (case+diacritic-insensitive).
Falls back gracefully when columns are missing (older years had fewer cols).
"""

from __future__ import annotations

import csv
import math
import re
import sys
import unicodedata
from datetime import datetime
from pathlib import Path
from typing import Iterable, Iterator, TextIO

# Byte-order mark as it appears in decoded text. Written as an escape so the
# character is visible in the source and cannot be lost by an editor.
_BOM = "\ufeff"

# Delimiters considered by detect_dialect(), in tie-break order.
_CANDIDATE_DELIMITERS = (",", "|", "^", ";", "\t")

# Fixed output column order; must match the import SQL.
OUTPUT_COLUMNS = (
    "type",
    "ref_number",
    "authority_name",
    "authority_cui",
    "cpv_code",
    "cpv_name",
    "contract_type",
    "publication_date",
    "contract_date",
    "awarded_value",
    "supplier_name",
    "supplier_cui",
    "procedure_type",
    "legislation",
    "source",
)


def normalize_header(s: str) -> str:
    """Strip BOM, lowercase, strip diacritics, collapse whitespace.

    Question marks are removed as well, so a header with a "?" in it
    normalizes to the same key as the header without it.
    """
    s = s.replace(_BOM, "").strip().lower()
    # NFD splits "ă" into "a" + combining breve; dropping category Mn
    # (non-spacing marks) leaves the bare ASCII letter.
    s = "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )
    s = re.sub(r"\s+", " ", s)
    s = s.replace("?", "")
    return s.strip()


def detect_dialect(first_line: str) -> tuple[str, str | None]:
    """Detect delimiter and quote char from the first line of the file.

    SEAP historical CSVs vary wildly:
      - 2017/2018: ^ delim, no quote
      - 2022:      , delim, | quote (header looks like |FIELD|,|FIELD|)
      - 2023 T3:   | delim, " quote (header: FIELD|FIELD with row "txt"|"txt")
      - 2023 T4:   , delim, " quote (standard CSV with title-case headers)
      - 2024+:     , delim, " quote (standard CSV)

    The delimiter is the candidate character that occurs most often in the
    line; ties go to the earlier entry of ``_CANDIDATE_DELIMITERS`` and a
    line with no candidate at all defaults to a comma.

    Returns (delim, quotechar_or_None).
    """
    s = first_line
    if s.startswith(_BOM):
        s = s[len(_BOM):]
    s_strip = s.lstrip()

    # 2022 wire format: header LINE starts with `|` and uses `|FIELD|,|FIELD|`
    # → delim=',' quote='|'
    if s_strip.startswith("|") and "|," in s_strip:
        return (",", "|")

    counts = {c: s.count(c) for c in _CANDIDATE_DELIMITERS}
    delim = max(counts, key=lambda c: counts[c])
    if counts[delim] == 0:
        delim = ","

    # Only the legacy ^-delimited dumps are unquoted; every other variant
    # (including tab-delimited input) uses the double quote.
    if delim == "^":
        return ("^", None)
    return (delim, '"')


# Maps normalized header → output column name.
# Multiple headers may map to the same output (e.g. two "data publicare" cols).
# Schema variants seen across data.gov.ro yearly dumps:
#   - 2024 (CSV, comma): "Autoritate contractanta", "Numar anunt", "Cod CPV"
#   - 2022/2023 (CSV/pipe, |QUOTE|): "DENUMIRE_AC", "NUMAR_ANUNT_ATRIBUIRE", "COD_CPV"
#   - 2017/2018 (^-delim): "AutoritateContractanta", "NumarAnuntAtribuire", "CPVCode"
# Keys must already be in normalize_header() form (lowercase, no diacritics),
# and each key appears once, under the first schema it was seen in.
HEADER_MAP: dict[str, str] = {
    # 2024 standard CSV
    "autoritate contractanta": "authority_name",
    "cui": "authority_cui",
    "cui autoritate contractanta": "authority_cui",
    "cod cpv": "cpv_code",
    "denumire cpv": "cpv_name",
    "tip contract": "contract_type",
    "tip procedura": "procedure_type",
    "tip legislatie": "legislation",
    "tip incheiere contract": "award_type",
    "tip inchiere contract": "award_type",  # typo seen in 2023 T1 XLS
    "tip criteriu de atribuire": "criterion",
    "numar anunt atribuire": "ref_number",
    "numar anunt initiere": "ref_initiere",
    "numar anunt": "ref_number",
    "numar contract": "contract_number",
    "numar lot": "lot_number",
    "data contract": "contract_date",
    "data publicare": "publication_date",
    "data publicare anunt atribuire": "publication_date",  # 2023 T4 standard CSV
    "data anunt atribuire": "publication_date",  # 2023 T1 XLS, 2017 ^-delim
    "data anunt initiere": "ref_initiere_date",
    "data publicare anunt initiere": "ref_initiere_date",
    "data publicare anunt": "publication_date",  # 2023 T4 atribuire-fara
    "valoare atribuita (ron)": "awarded_value",
    "valoare estimata procedura": "estimated_value",
    "moneda valoare estimata procedura": "estimated_currency",
    "denumire procedura": "procedure_name",
    "tip activitate autoritate": "authority_activity",
    "criteriu de atribuire": "criterion",
    "denumire contract": "contract_title",
    "oras ofertant castigator": "supplier_city",
    "tara ofertant castigator": "supplier_country",
    "data publicare contract": "contract_date",
    "tip activitate": "authority_activity",
    "tip autoritate": "authority_type",
    "tip anunt": "announcement_type",
    "criterii de atribuire": "criterion",
    "licitatie electronica": "electronic_auction",
    "ofertant castigator": "supplier_name",
    "cui ofertant castigator": "supplier_cui",
    "oras ofertant": "supplier_city",
    "tara ofertant": "supplier_country",
    "incheiat prin": "award_type",
    "valoare contract (ron)": "awarded_value",
    "valoare contract": "awarded_value",
    "valoare estimata (ron)": "estimated_value",
    "valoare estimata": "estimated_value",
    "ofertant": "supplier_name",  # also used by the 2022/2023 schema
    "cui ofertant": "supplier_cui",
    "cui castigator": "supplier_cui",
    "castigator": "supplier_name",  # also 2017/2018 legacy and 2018-2019 XLS
    "oras": "supplier_city",
    "tara": "supplier_country",
    "modalitate de desfasurare": "modality",
    # 2022/2023 UPPER_SNAKE_CASE pipe-delim schema
    "denumire_ac": "authority_name",
    "cui_ac": "authority_cui",
    "cui_autoritate": "authority_cui",
    "autoritate_contractanta": "authority_name",
    "numar_anunt_atribuire": "ref_number",
    "numar_anunt": "ref_number",
    "data_anunt_atribuire": "publication_date",
    "data_publicare": "publication_date",
    "data_publicare_ai": "ref_initiere_date",
    "data_contract": "contract_date",
    "numar_contract": "contract_number",
    "denumire_contract": "contract_title",
    "cod_cpv": "cpv_code",
    "cod_cpv_procedura": "cpv_code",
    "cpv_code": "cpv_code",  # 2023 schema variant, also 2018-2019 XLS
    "denumire_cpv": "cpv_name",
    "denumire_cpv_procedura": "cpv_name",
    "tip_contract": "contract_type",
    "tip_procedura": "procedure_type",
    "tip_legislatie": "legislation",
    "tip_lesiglatie": "legislation",  # SEAP typo present in many 2023 files
    "tip_anunt": "announcement_type",
    "tip_incheiere_contract": "award_type",  # also 2018-2019 XLS
    "incheiat_prin": "award_type",
    "valoare_contract_ron": "awarded_value",
    "valoare_atribuita": "awarded_value",
    "valoare_estimata_procedura": "estimated_value",
    "cui_of": "supplier_cui",
    "nume_castigator": "supplier_name",
    "cui_castigator": "supplier_cui",
    "oras_castigator": "supplier_city",
    "tara_castigator": "supplier_country",
    "modalitate_desfasurare": "modality",
    "modalitate_atribuire": "modality",
    "tip_criterii_atribuire": "criterion",  # also 2018-2019 XLS
    "criteriu_de_atribuire": "criterion",
    "numar_anunt_ai": "ref_initiere",
    "numar_anunt_initiere": "ref_initiere",
    "data_anunt_initiere": "ref_initiere_date",
    "denumire_procedura": "procedure_name",
    # 2017/2018 ^-delim CamelCase legacy schema
    "castigatorcui": "supplier_cui",
    "castigatortara": "supplier_country",
    "castigatorlocalitate": "supplier_city",
    "castigatoradresa": "supplier_address",
    "tipcontract": "contract_type",
    "tipprocedura": "procedure_type",
    "autoritatecontractanta": "authority_name",
    "autoritatecontractantacui": "authority_cui",
    "tipac": "authority_type",
    "tipactivitateac": "authority_activity",
    "denumireac": "authority_name",
    "numaranuntatribuire": "ref_number",
    "numaranuntparticipare": "ref_initiere",
    "numaranunt": "ref_number",
    "dataanuntatribuire": "publication_date",
    "dataanuntparticipare": "ref_initiere_date",
    "datapublicare": "publication_date",
    "tipincheierecontract": "award_type",
    "tipcriteriiatribuire": "criterion",
    "culicitatieelectronica": "electronic_auction",
    "numarofertepre primite": "n_offers",
    # "NumarOfertePrimite" after normalize_header(); a mixed-case key could
    # never match because headers are lowercased before lookup.
    "numaroferteprimite": "n_offers",
    "subcontractat": "subcontracted",
    "numarcontract": "contract_number",
    "datacontract": "contract_date",
    "titlucontract": "contract_title",
    "valoare": "awarded_value_orig",  # may be in non-RON currency for 2017
    "moneda": "currency",
    "valoareron": "awarded_value",
    "valoareeur": "awarded_value_eur",
    "cpvcodeid": "cpv_code_id",  # internal SEAP id, not CPV
    "cpvcode": "cpv_code",  # actual CPV like 85150000-5
    "valoareestimataparticipare": "estimated_value",
    "monedavaloareestimataparticipare": "estimated_currency",
    "fonduricomunitare": "eu_funded",
    "tipfinantare": "funding_type",
    "tiplegislatieid": "legislation",
    "fondeuropean": "eu_fund",
    "contractperiodic": "periodic",
    "depozitegarantii": "deposits",
    "modalitatifinantare": "funding_modes",
    "tip": "announcement_subtype",  # 2017 contracte has bare "Tip"
    # 2018-2019 XLS schema (UPPER_SNAKE with explicit underscores)
    "castigator_cui": "supplier_cui",
    "castigator_tara": "supplier_country",
    "castigator_localitate": "supplier_city",
    "castigaor_localitate": "supplier_city",  # SEAP typo seen in 2018 T2 XLS
    "castigator_adresa": "supplier_address",
    "tip_ac": "authority_type",
    "tip_activitate_ac": "authority_activity",
    "autoritate_contractanta_cui": "authority_cui",
    "numar_anunt_participare": "ref_initiere",
    "data_anunt_participare": "ref_initiere_date",
    "cu_licitatie_electronica": "electronic_auction",
    "numar_oferte_primite": "n_offers",
    "titlu_contract": "contract_title",
    "valoare_ron": "awarded_value",
    "valoare_eur": "awarded_value_eur",
    "valoare_estimata_participare": "estimated_value",
    "moneda_valoare_estimata_participare": "estimated_currency",
    "fonduri_comunitare": "eu_funded",
    "tip_finantare": "funding_type",
    "tip_legislatie_id": "legislation",
    "fond_european": "eu_fund",
    "contract_periodic": "periodic",
    "depozite_garantii": "deposits",
    "modalitati_finantare": "funding_modes",
    "cpv_code_id": "cpv_code_id",
}


_US_DATE_RE = re.compile(r"^(\d{1,2})/(\d{1,2})/(\d{4})")
_DOTTED_DATE_RE = re.compile(r"^(\d{1,2})\.(\d{1,2})\.(\d{4})")
_ISO_DATE_RE = re.compile(r"^(\d{4})-(\d{2})-(\d{2})")


def _iso_date(year: int, month: int, day: int) -> str | None:
    """Return YYYY-MM-DD for a real calendar date, else None."""
    try:
        datetime(year, month, day)  # raises ValueError for impossible dates
    except ValueError:
        return None
    return f"{year:04d}-{month:02d}-{day:02d}"


def parse_date(s: str | None) -> str | None:
    """Parse MM/DD/YYYY [HH:MM:SS], DD.MM.YYYY or YYYY-MM-DD → ISO YYYY-MM-DD.

    Anything after the date part (e.g. a time) is ignored. Returns None for
    empty input, unrecognised formats and impossible calendar dates, so the
    caller emits a NULL instead of a value COPY would reject.
    """
    if not s:
        return None
    s = s.strip()
    if not s:
        return None

    # MM/DD/YYYY 01:35:39
    m = _US_DATE_RE.match(s)
    if m:
        return _iso_date(int(m[3]), int(m[1]), int(m[2]))

    # DD.MM.YYYY
    m = _DOTTED_DATE_RE.match(s)
    if m:
        return _iso_date(int(m[3]), int(m[2]), int(m[1]))

    # YYYY-MM-DD: already ISO, but still validated like the other formats
    m = _ISO_DATE_RE.match(s)
    if m:
        return _iso_date(int(m[1]), int(m[2]), int(m[3]))

    return None


_SIGNED_DIGITS_RE = re.compile(r"[+-]?\d+")


def _format_int(text: str) -> str | None:
    """Format an integer literal as 'N.00', or None if it is not an integer."""
    try:
        return f"{int(text)}.00"
    except ValueError:
        return None


def _format_float(text: str) -> str | None:
    """Format a float literal with two decimals; None if invalid or non-finite."""
    try:
        value = float(text)
    except ValueError:
        return None
    # float() accepts "nan" and "inf"; neither is a usable amount.
    if not math.isfinite(value):
        return None
    return f"{value:.2f}"


def parse_number(s: str | None) -> str | None:
    """Parse a Romanian-style number → fixed two-decimal string.

    SEAP CSV uses MIXED conventions:
      - "1.234.567,89" → period=thousand, comma=decimal → 1234567.89
      - "1,234,567.89" → comma=thousand, period=decimal → 1234567.89
      - "123,126"      → comma=THOUSAND (3 digits after) → 123126.00
      - "12345,67"     → comma=decimal (2 digits after)  → 12345.67
      - "1,234,567"    → all commas=thousand             → 1234567.00

    Heuristics:
      - both separators present → whichever appears LAST is the decimal mark;
      - a single comma followed by exactly 3 digits is a thousand separator,
        any other digit count makes it a decimal mark.

    Returns None for empty input, a bare "-", unparsable text and
    non-finite values (nan/inf).
    """
    if not s:
        return None
    s = s.strip().strip('"').replace("\xa0", "").replace(" ", "")
    if not s or s == "-":
        return None

    has_comma = "," in s

    # Both separators: the right-most one is the decimal mark.
    if has_comma and "." in s:
        if s.rfind(",") > s.rfind("."):
            normalized = s.replace(".", "").replace(",", ".")
        else:
            normalized = s.replace(",", "")
        return _format_float(normalized)

    # Multi-comma → all thousand separators
    if s.count(",") >= 2:
        return _format_int(s.replace(",", ""))

    # Single comma → decide by the number of digits after it
    if has_comma:
        whole, frac = s.split(",")
        if _SIGNED_DIGITS_RE.fullmatch(whole) and frac.isdigit():
            if len(frac) == 3:
                # Thousand separator (most common SEAP case)
                return _format_int(whole + frac)
            return _format_float(f"{whole}.{frac}")

    return _format_float(s)


_CUI_PREFIX_RE = re.compile(r"^RO\s*", re.IGNORECASE)


def normalize_cui(s: str | None) -> str | None:
    """Return the digits of a CUI, without quotes or a leading "RO" prefix.

    Returns None when the input is empty or what remains is not all digits.
    """
    if not s:
        return None
    s = s.strip().strip('"')
    s = _CUI_PREFIX_RE.sub("", s).strip()
    if not s or not s.isdigit():
        return None
    return s


# Characters that would break a COPY text-format row if left inside a value.
_COPY_WHITESPACE = str.maketrans({"\t": " ", "\n": " ", "\r": " "})


def _clean_cell(cell: str) -> str:
    """Strip whitespace and the literal `|` bookends left by pipe-quoted files."""
    return cell.strip().strip("|").strip()


def _copy_escape(value: str) -> str:
    """Escape a value for one column of a PostgreSQL COPY text-format row.

    Backslashes are doubled; tabs and line breaks (CRLF counts as one) each
    become a single space so the value stays on one line in one column.
    """
    return (
        value.replace("\\", "\\\\")
        .replace("\r\n", " ")
        .translate(_COPY_WHITESPACE)
    )


def _read_header(reader: Iterator[list[str]]) -> list[str] | None:
    """Return the cleaned header cells, or None if the file has no header.

    Some XLS exports begin with a single-cell title row (rest empty) before
    the real header; a first row with at most one non-empty cell is skipped.
    """
    header_raw = next(reader, None)
    if header_raw is None:
        return None
    non_empty = sum(1 for h in header_raw if _clean_cell(h))
    if non_empty <= 1:
        print("[import] skipping title row, advancing to next", file=sys.stderr)
        header_raw = next(reader, None)
        if header_raw is None:
            return None
    return [_clean_cell(h) for h in header_raw]


def _build_column_index(header_cells: Iterable[str]) -> dict[str, int]:
    """Map output column names to CSV column positions via HEADER_MAP.

    When several headers map to the same output column (2× "data publicare"),
    the LAST one wins.
    """
    col_idx: dict[str, int] = {}
    for i, cell in enumerate(header_cells):
        mapped = HEADER_MAP.get(normalize_header(cell))
        if mapped:
            col_idx[mapped] = i
    return col_idx


def _convert_rows(
    reader: Iterable[list[str]],
    out: TextIO,
    col_idx: dict[str, int],
    record_type: str,
    source: str,
) -> tuple[int, int, int, int]:
    """Write one TSV line per unique data row.

    Rows without a reference number are skipped, as are rows whose
    (record_type, ref) was already written (first row wins).

    Returns (total, written, skipped_dup, skipped_no_ref).
    """
    seen: set[tuple[str, str]] = set()
    total = written = skipped_dup = skipped_no_ref = 0

    def cell(row: list[str], col: str) -> str | None:
        # Short rows simply yield None for the columns they lack.
        idx = col_idx.get(col)
        if idx is None or idx >= len(row):
            return None
        return _clean_cell(row[idx]) or None

    for row in reader:
        total += 1

        ref = cell(row, "ref_number")
        # For initiere imports, files name the ref column "Numar anunt initiere"
        # which we map to ref_initiere. Fall through to that field.
        if not ref and record_type == "initiere":
            ref = cell(row, "ref_initiere")
        if not ref:
            skipped_no_ref += 1
            continue

        key = (record_type, ref)
        if key in seen:
            skipped_dup += 1
            continue
        seen.add(key)

        fields: dict[str, str | None] = {
            "type": record_type,
            "ref_number": ref,
            "authority_name": cell(row, "authority_name"),
            "authority_cui": normalize_cui(cell(row, "authority_cui")),
            "cpv_code": cell(row, "cpv_code"),
            "cpv_name": cell(row, "cpv_name"),
            "contract_type": cell(row, "contract_type"),
            "publication_date": parse_date(cell(row, "publication_date")),
            "contract_date": parse_date(cell(row, "contract_date")),
            "awarded_value": parse_number(cell(row, "awarded_value")),
            "supplier_name": cell(row, "supplier_name"),
            "supplier_cui": normalize_cui(cell(row, "supplier_cui")),
            "procedure_type": cell(row, "procedure_type"),
            "legislation": cell(row, "legislation"),
            "source": source,
        }
        # \N is the COPY text-format marker for NULL.
        line_parts = [
            "\\N" if fields[c] is None else _copy_escape(str(fields[c]))
            for c in OUTPUT_COLUMNS
        ]
        out.write("\t".join(line_parts) + "\n")
        written += 1

    return total, written, skipped_dup, skipped_no_ref


def main() -> None:
    """Command-line entry point: CSV_PATH OUTPUT_TSV TYPE SOURCE.

    Exit codes: 2 for wrong argument count (usage is printed), 1 when the
    input file is missing or has no header row. The dialect and warnings go
    to stderr, the final summary to stdout.
    """
    if len(sys.argv) != 5:
        print(__doc__)
        sys.exit(2)

    csv_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2])
    record_type = sys.argv[3]
    source = sys.argv[4]

    if not csv_path.exists():
        print(f"ERROR: {csv_path} does not exist", file=sys.stderr)
        sys.exit(1)

    # newline="" is what the csv module expects: it lets the reader handle
    # line breaks inside quoted fields itself.
    open_kwargs = {"encoding": "utf-8-sig", "errors": "replace", "newline": ""}

    # Sniff first line to detect delimiter/quotechar
    with csv_path.open("r", **open_kwargs) as f:
        first_line = f.readline()
    delim, quotechar = detect_dialect(first_line)
    print(f"[import] delim={delim!r} quote={quotechar!r}", file=sys.stderr)

    with csv_path.open("r", **open_kwargs) as f:
        if quotechar:
            reader = csv.reader(f, delimiter=delim, quotechar=quotechar)
        else:
            reader = csv.reader(f, delimiter=delim, quoting=csv.QUOTE_NONE)

        # Read the header before touching the output path, so an empty input
        # fails cleanly without leaving a truncated output file behind.
        header_cells = _read_header(reader)
        if header_cells is None:
            print(f"ERROR: {csv_path} has no header row", file=sys.stderr)
            sys.exit(1)

        col_idx = _build_column_index(header_cells)
        if "ref_number" not in col_idx and "ref_initiere" not in col_idx:
            print(
                "[import] WARNING: no reference-number column recognised; "
                "every row will be skipped",
                file=sys.stderr,
            )

        with out_path.open("w", encoding="utf-8", newline="\n") as out:
            # Header line naming the columns; data rows use \N for NULL.
            out.write("\t".join(OUTPUT_COLUMNS) + "\n")
            total, written, skipped_dup, skipped_no_ref = _convert_rows(
                reader, out, col_idx, record_type, source
            )

    print(f"[import] CSV={csv_path.name}")
    print(f"[import]   total rows: {total}")
    print(f"[import]   written:    {written}")
    print(f"[import]   dup-skip:   {skipped_dup}")
    print(f"[import]   no-ref:     {skipped_no_ref}")
    print(f"[import]   output:     {out_path}")


if __name__ == "__main__":
    main()