vreau-digital/services/seap-scraper/scripts/import-afir-historical.py

#!/usr/bin/env python3
"""AFIR XLSX → pipe-delimited TSV normalizer.

Source: AFIR yearly listaplati XLSX (FEADR or FEGA), as published at
https://www.afir.ro/rapoarte/beneficiari-de-fonduri-europene/date-deschise/

The XLSX has 9 banner rows, then a 15-column header at row 10 (1-indexed),
then ~470K-560K data rows. Schema (since 2023, identical for 2024):

  Numele beneficiarului
  Numele de familie al beneficiarului
  Denumirea societatii-mama si codul de inregistrare fiscala
  Localitate
  Codul masurii/tipului de interventie
  Obiectiv
  Data inceperii
  Data incheierii
  Cuantum Operatiune FEGA
  Cuantum Total FEGA
  Cuantum Operatiune FEADR
  Cuantum Total FEADR
  Cuantum aferent operatiunii
  Cuantum total cofinantare beneficiari
  Cuantum total UE Beneficiar

Output: pipe-delimited TSV (no quoting), in the same column order, suitable
for `\\copy fonduri.staging_afir FROM ... WITH (FORMAT text, DELIMITER '|')`.

Usage:
  python3 import-afir-historical.py INPUT.xlsx OUTPUT.tsv

Numeric columns are normalized: Romanian decimal "12.345,67" → "12345.67".
Empty strings stay empty (NULL in COPY with NULL '').
"""

import sys
import re

import openpyxl

EXPECTED_HEADER = "Numele beneficiarului"


def norm_num(v):
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        # Already numeric (rare for AFIR XLSX — values arrive as strings).
        return f"{v:.2f}".replace("-0.00", "0.00")
    s = str(v).strip()
    if not s:
        return ""
    # Strip thousands "." and convert "," → "."
    # AFIR uses Romanian format: 12.345,67  or  12345,67  or  0,00
    if "," in s:
        s = s.replace(".", "").replace(",", ".")
    # Strip leading/trailing whitespace, replace any embedded pipe to be safe
    return s.replace("|", "/")


def norm_text(v):
    if v is None:
        return ""
    s = str(v).strip()
    if not s:
        return ""
    # COPY text format: tab and pipe collide with our delimiter; backslash needs escape.
    # We chose pipe as delimiter — replace embedded pipes with "/".
    # Newlines collapse to space.
    s = s.replace("|", "/").replace("\t", " ").replace("\r", " ").replace("\n", " ")
    s = re.sub(r"\s+", " ", s)
    # Backslash escape for Postgres COPY text format
    s = s.replace("\\", "\\\\")
    return s


def main():
    if len(sys.argv) != 3:
        print("usage: import-afir-historical.py INPUT.xlsx OUTPUT.tsv", file=sys.stderr)
        sys.exit(2)
    in_path, out_path = sys.argv[1], sys.argv[2]

    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    ws = wb.active

    rows = ws.iter_rows(values_only=True)
    header_idx = None
    for i, r in enumerate(rows):
        if r and r[0] and EXPECTED_HEADER in str(r[0]):
            header_idx = i
            break
        if i > 50:
            break
    if header_idx is None:
        print("[afir-import] ERROR: header row not found in first 50 rows", file=sys.stderr)
        sys.exit(1)

    n_data = 0
    n_skipped = 0

    with open(out_path, "w", encoding="utf-8") as f:
        for r in rows:
            # 16 columns observed (last is None padding)
            if r is None:
                continue
            cells = list(r) + [None] * (16 - len(r))
            beneficiar = norm_text(cells[0])
            if not beneficiar:
                # Trailing empty rows
                n_skipped += 1
                continue

            out = [
                beneficiar,
                norm_text(cells[1]),  # last_name
                norm_text(cells[2]),  # mama_cui
                norm_text(cells[3]),  # localitate
                norm_text(cells[4]),  # cod_masura
                norm_text(cells[5]),  # obiectiv
                norm_text(cells[6]),  # data_start
                norm_text(cells[7]),  # data_end
                norm_num(cells[8]),   # fega_op
                norm_num(cells[9]),   # fega_total
                norm_num(cells[10]),  # feadr_op
                norm_num(cells[11]),  # feadr_total
                norm_num(cells[12]),  # op_amount
                norm_num(cells[13]),  # cofinantare
                norm_num(cells[14]),  # ue_total
            ]
            f.write("|".join(out) + "\n")
            n_data += 1
            if n_data % 50000 == 0:
                print(f"[afir-import] wrote {n_data} rows", file=sys.stderr)

    print(f"[afir-import] done — {n_data} rows, {n_skipped} skipped", file=sys.stderr)


if __name__ == "__main__":
    main()