vreau-digital/services/seap-scraper/scripts/import-apia-fermieri.py

#!/usr/bin/env python3
"""APIA "Lista fermieri" XLSX → pipe-delimited TSV normalizer.

Source: data.gov.ro CKAN package "lista-fermierilor-campania-apia-2024".
Currently a single resource (comuna Găgești, Vaslui, ~192 farmers), but the
package is supposed to grow as more UATs publish their lists. The XLSX
schema is set by APIA and identical across UATs:

  Row 0 (header): NR.CRT | NUME PRENUME | RESPONSABIL UAT 2024
                  | COMUNA/ORAS | SAT | DATE CONTACT | CENTRUL APIA
                  | SUPRAFATA 2023 | (~17 None columns)
  Rows 1..N (data): one row per farmer, NR.CRT 1-indexed.

Output: pipe-delimited text, conventionally saved as .tsv (no quoting;
embedded pipes in cell values are replaced with "/"). Columns, in order:

  campaign_year | name | comuna_oras | sat | centru_apia
                | responsabil_uat | suprafata_ha
                | source_dataset_id | source_resource_id | source_url

Empty strings stay empty (NULL in COPY with NULL '').

Usage:
  python3 import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv \\
      CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL
"""

import re
import sys

import openpyxl

# Substring that must appear in the upper-cased first cell of a row for
# main() to treat that row as the header row.
EXPECTED_HEADER_COL0 = "NR.CRT"
# NOTE(review): not referenced anywhere in the visible code — the name column
# is detected with a literal "NUME" check during header mapping; confirm
# whether this constant is still needed.
EXPECTED_HEADER_COL1 = "NUME"   # "NUME PRENUME" or "NUME SI PRENUME"


def norm_text(v):
    """Turn a cell value into a single-line, delimiter-safe string.

    ``None`` and whitespace-only values become ``""``. Otherwise the value is
    stringified and trimmed, embedded pipes are swapped for ``/`` (the pipe is
    the output delimiter), every run of whitespace is squeezed to one space,
    and backslashes are doubled.
    """
    text = "" if v is None else str(v).strip()
    if text == "":
        return ""
    # The pipe must never survive into a field; line breaks and tabs would
    # break the one-record-per-line layout.
    for unwanted, replacement in (("|", "/"), ("\t", " "), ("\r", " "), ("\n", " ")):
        text = text.replace(unwanted, replacement)
    squeezed = re.sub(r"\s+", " ", text)
    # Escape backslashes last so the escaping itself is not touched again.
    return squeezed.replace("\\", "\\\\")


def norm_num(v):
    """Render a numeric cell (e.g. SUPRAFATA) as a plain decimal string.

    Args:
        v: Cell value as delivered by the spreadsheet reader: ``None``, an
            ``int``/``float``, or text.

    Returns:
        ``""`` for ``None`` or blank text. Numbers are formatted with at most
        four decimals and trailing zeros removed (``1.0400`` -> ``"1.04"``,
        ``12`` -> ``"12"``). Text is converted to a dot-decimal form and made
        safe for the pipe-delimited output: pipes become ``/``, whitespace
        runs (including line breaks) collapse to one space, and backslashes
        are doubled.
    """
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        # Fixed-point first so floats never come out in exponent notation;
        # the decimal point protects integer zeros from the first rstrip.
        s = f"{v:.4f}".rstrip("0").rstrip(".")
        return s if s else "0"
    s = str(v).strip()
    if not s:
        return ""
    last_comma = s.rfind(",")
    if last_comma != -1:
        if s.rfind(".") > last_comma:
            # "1,234.56": the dot is the decimal point, commas only group
            # thousands — dropping the dots here would yield "1.23456".
            s = s.replace(",", "")
        else:
            # "1.234,56" / "12,45": comma is the decimal separator.
            s = s.replace(".", "").replace(",", ".")
    # A text cell can carry the same hazards as any other text field: keep
    # the record on one line and the delimiter/escape characters harmless.
    s = re.sub(r"\s+", " ", s.replace("|", "/"))
    return s.replace("\\", "\\\\")


def _map_header_columns(header_cells):
    """Return ``{field: column index}`` derived from the header row captions.

    Captions are compared upper-cased and trimmed. The first caption
    containing "NUME" wins for ``name``; for every other field the last
    matching column wins.
    """
    col_map = {}
    for idx, cell in enumerate(header_cells):
        h = str(cell).strip().upper() if cell is not None else ""
        if "NR.CRT" in h or "NRCRT" in h:
            col_map["nr"] = idx
        elif "NUME" in h:  # "NUME PRENUME" / "NUME SI PRENUME"
            col_map.setdefault("name", idx)
        elif "RESPONSABIL" in h:
            col_map["responsabil"] = idx
        elif "COMUNA" in h or "ORAS" in h:
            col_map["comuna"] = idx
        elif h == "SAT" or h.startswith("SAT "):
            col_map["sat"] = idx
        elif "CENTRU" in h:  # also covers "CENTRUL"
            col_map["centru"] = idx
        elif "SUPRAFA" in h:
            # Prefix match so SUPRAFATA, SUPRAFAȚA, SUPRAFAȚĂ and the cedilla
            # spelling SUPRAFAŢA are all recognised.
            col_map["suprafata"] = idx
    return col_map


def main():
    """Convert one APIA "Lista fermieri" XLSX into a pipe-delimited file.

    Reads six positional arguments from ``sys.argv`` (INPUT.xlsx, OUTPUT.tsv,
    CAMPAIGN_YEAR, DATASET_ID, RESOURCE_ID, SOURCE_URL), locates the header
    row in the workbook's active sheet, and writes one output line for every
    following row that has a non-empty name. Progress and errors are printed
    to stderr.

    Exits with status 2 on a wrong argument count and with status 1 when no
    usable header row is found.
    """
    if len(sys.argv) != 7:
        print(
            "usage: import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv "
            "CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL",
            file=sys.stderr,
        )
        sys.exit(2)

    (in_path, out_path, campaign_year,
     dataset_id, resource_id, source_url) = sys.argv[1:7]

    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    try:
        ws = wb.active

        # One shared iterator: the header scan consumes rows up to and
        # including the header, the data loop continues right after it.
        rows = ws.iter_rows(values_only=True)
        header_idx = None
        col_map = None
        for i, r in enumerate(rows):
            if not r:
                continue
            if r[0] and EXPECTED_HEADER_COL0 in str(r[0]).upper():
                # Build column index map from header for resilience.
                col_map = _map_header_columns(r)
                header_idx = i
                break
            if i > 50:
                # Header is expected near the top; stop scanning.
                break

        if header_idx is None:
            print(
                "[apia-import] ERROR: header row not found in first 50 rows",
                file=sys.stderr,
            )
            sys.exit(1)
        if "name" not in col_map:
            # Distinct from "no header at all": the row was found but cannot
            # be used because the mandatory name column is missing.
            print(
                f"[apia-import] ERROR: header at row {header_idx} has no NUME column",
                file=sys.stderr,
            )
            sys.exit(1)

        print(f"[apia-import] header at row {header_idx}, col_map={col_map}", file=sys.stderr)

        # Loop invariants: widest index we will read, and the optional
        # columns in output order paired with their normalizer.
        max_idx = max(col_map.values())
        name_idx = col_map["name"]
        optional = [
            (col_map.get("comuna"), norm_text),
            (col_map.get("sat"), norm_text),
            (col_map.get("centru"), norm_text),
            (col_map.get("responsabil"), norm_text),
            (col_map.get("suprafata"), norm_num),
        ]

        n_data = 0
        n_skipped = 0

        with open(out_path, "w", encoding="utf-8") as f:
            for r in rows:
                if r is None:
                    continue
                cells = list(r)
                # Pad short rows so every mapped index is addressable.
                if len(cells) <= max_idx:
                    cells.extend([None] * (max_idx + 1 - len(cells)))

                name = norm_text(cells[name_idx])
                if not name:
                    # Blank/filler rows carry no farmer; count and drop them.
                    n_skipped += 1
                    continue

                # Columns missing from the header are emitted as empty fields.
                values = [
                    normalize(cells[idx]) if idx is not None else ""
                    for idx, normalize in optional
                ]
                out = [campaign_year, name, *values, dataset_id, resource_id, source_url]
                f.write("|".join(out) + "\n")
                n_data += 1
    finally:
        # Read-only workbooks keep the source file open until closed.
        wb.close()

    print(f"[apia-import] done — {n_data} rows, {n_skipped} skipped", file=sys.stderr)


# Run the importer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()