#!/usr/bin/env python3 """AFIR XLSX → pipe-delimited TSV normalizer. Source: AFIR yearly listaplati XLSX (FEADR or FEGA), as published at https://www.afir.ro/rapoarte/beneficiari-de-fonduri-europene/date-deschise/ The XLSX has 9 banner rows, then a 15-column header at row 10 (1-indexed), then ~470K-560K data rows. Schema (since 2023, identical for 2024): Numele beneficiarului Numele de familie al beneficiarului Denumirea societatii-mama si codul de inregistrare fiscala Localitate Codul masurii/tipului de interventie Obiectiv Data inceperii Data incheierii Cuantum Operatiune FEGA Cuantum Total FEGA Cuantum Operatiune FEADR Cuantum Total FEADR Cuantum aferent operatiunii Cuantum total cofinantare beneficiari Cuantum total UE Beneficiar Output: pipe-delimited TSV (no quoting), in the same column order, suitable for `\\copy fonduri.staging_afir FROM ... WITH (FORMAT text, DELIMITER '|')`. Usage: python3 import-afir-historical.py INPUT.xlsx OUTPUT.tsv Numeric columns are normalized: Romanian decimal "12.345,67" → "12345.67". Empty strings stay empty (NULL in COPY with NULL ''). """ import sys import re import openpyxl EXPECTED_HEADER = "Numele beneficiarului" def norm_num(v): if v is None: return "" if isinstance(v, (int, float)): # Already numeric (rare for AFIR XLSX — values arrive as strings). return f"{v:.2f}".replace("-0.00", "0.00") s = str(v).strip() if not s: return "" # Strip thousands "." and convert "," → "." # AFIR uses Romanian format: 12.345,67 or 12345,67 or 0,00 if "," in s: s = s.replace(".", "").replace(",", ".") # Strip leading/trailing whitespace, replace any embedded pipe to be safe return s.replace("|", "/") def norm_text(v): if v is None: return "" s = str(v).strip() if not s: return "" # COPY text format: tab and pipe collide with our delimiter; backslash needs escape. # We chose pipe as delimiter — replace embedded pipes with "/". # Newlines collapse to space. s = s.replace("|", "/").replace("\t", " ").replace("\r", " ").replace("\n", " ") s = re.sub(r"\s+", " ", s) # Backslash escape for Postgres COPY text format s = s.replace("\\", "\\\\") return s def main(): if len(sys.argv) != 3: print("usage: import-afir-historical.py INPUT.xlsx OUTPUT.tsv", file=sys.stderr) sys.exit(2) in_path, out_path = sys.argv[1], sys.argv[2] wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True) ws = wb.active rows = ws.iter_rows(values_only=True) header_idx = None for i, r in enumerate(rows): if r and r[0] and EXPECTED_HEADER in str(r[0]): header_idx = i break if i > 50: break if header_idx is None: print("[afir-import] ERROR: header row not found in first 50 rows", file=sys.stderr) sys.exit(1) n_data = 0 n_skipped = 0 with open(out_path, "w", encoding="utf-8") as f: for r in rows: # 16 columns observed (last is None padding) if r is None: continue cells = list(r) + [None] * (16 - len(r)) beneficiar = norm_text(cells[0]) if not beneficiar: # Trailing empty rows n_skipped += 1 continue out = [ beneficiar, norm_text(cells[1]), # last_name norm_text(cells[2]), # mama_cui norm_text(cells[3]), # localitate norm_text(cells[4]), # cod_masura norm_text(cells[5]), # obiectiv norm_text(cells[6]), # data_start norm_text(cells[7]), # data_end norm_num(cells[8]), # fega_op norm_num(cells[9]), # fega_total norm_num(cells[10]), # feadr_op norm_num(cells[11]), # feadr_total norm_num(cells[12]), # op_amount norm_num(cells[13]), # cofinantare norm_num(cells[14]), # ue_total ] f.write("|".join(out) + "\n") n_data += 1 if n_data % 50000 == 0: print(f"[afir-import] wrote {n_data} rows", file=sys.stderr) print(f"[afir-import] done — {n_data} rows, {n_skipped} skipped", file=sys.stderr) if __name__ == "__main__": main()