#!/usr/bin/env python3
"""APIA "Lista fermieri" XLSX → pipe-delimited TSV normalizer.

Source: data.gov.ro CKAN package "lista-fermierilor-campania-apia-2024".
Currently a single resource (comuna Găgești, Vaslui, ~192 farmers), but the
package is supposed to grow as more UATs publish their lists.

The XLSX schema is set by APIA and identical across UATs:

  Row 0 (header):
    NR.CRT | NUME PRENUME | RESPONSABIL UAT 2024 | COMUNA/ORAS | SAT |
    DATE CONTACT | CENTRUL APIA | SUPRAFATA 2023 | (~17 None columns)
  Rows 1..N (data): one row per farmer, NR.CRT 1-indexed.

Output: pipe-delimited TSV (no quoting), columns in this order:

  campaign_year | name | comuna_oras | sat | centru_apia | responsabil_uat |
  suprafata_ha | source_dataset_id | source_resource_id | source_url

Empty strings stay empty (NULL in COPY with NULL ''). Cell values have
embedded pipes replaced by "/", whitespace runs collapsed to one space and
backslashes doubled; the four CLI-supplied metadata values are written as
given.

Usage:
    python3 import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv \\
        CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL
"""

import re
import sys

try:
    import openpyxl
except ImportError:
    # Reported by main(); keeps the pure helpers importable without openpyxl.
    openpyxl = None

EXPECTED_HEADER_COL0 = "NR.CRT"
EXPECTED_HEADER_COL1 = "NUME"  # "NUME PRENUME" or "NUME SI PRENUME"

# Rows with an index above this stop the header search (see _find_header).
_HEADER_SCAN_LIMIT = 50

_WHITESPACE_RUN = re.compile(r"\s+")


def norm_text(v):
    """Return cell value *v* as a single-line string safe for the output.

    ``None`` and blank values become ``""``. Otherwise the value is
    stringified and stripped, embedded pipes become "/", every run of
    whitespace (tabs and newlines included) collapses to one space, and
    backslashes are doubled.
    """
    if v is None:
        return ""
    s = str(v).strip()
    if not s:
        return ""
    # Pipe is our delimiter — replace embedded pipes; collapse newlines.
    s = s.replace("|", "/")
    s = _WHITESPACE_RUN.sub(" ", s)
    # Double backslashes last so the escapes themselves are never rewritten.
    return s.replace("\\", "\\\\")


def norm_num(v):
    """Return cell value *v* as a plain decimal string.

    ``None`` and blank values become ``""``. ``int``/``float`` values are
    rendered with at most four decimals and no trailing zeros. Any other
    value is sanitized like :func:`norm_text`; if it contains a comma, dots
    are dropped and the comma becomes the decimal point
    (``"1.234,56"`` -> ``"1234.56"``).
    """
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        # APIA SUPRAFATA arrives as float ("1.04", "12.45") — already English.
        # Trim trailing zeros after decimal.
        s = f"{v:.4f}".rstrip("0").rstrip(".")
        # Values that round to zero from below would otherwise print "-0".
        return "0" if s in ("", "-0") else s
    # Same sanitizing as text cells, so a stray tab/newline/backslash inside
    # a "numeric" string cannot break the output row.
    s = norm_text(v)
    if "," in s:
        # Comma present: treat "." as thousands separator, "," as decimal.
        s = s.replace(".", "").replace(",", ".")
    return s


def _build_col_map(header_row):
    """Map logical field names to column indexes found in *header_row*."""
    col_map = {}
    for idx, cell in enumerate(header_row):
        h = "" if cell is None else str(cell).strip().upper()
        if "NR.CRT" in h or "NRCRT" in h:
            col_map["nr"] = idx
        elif "NUME" in h:
            # "NUME PRENUME" / "NUME SI PRENUME" — first match wins.
            col_map.setdefault("name", idx)
        elif "RESPONSABIL" in h:
            col_map["responsabil"] = idx
        elif "COMUNA" in h or "ORAS" in h:
            col_map["comuna"] = idx
        elif h == "SAT" or h.startswith("SAT "):
            col_map["sat"] = idx
        elif "CENTRU" in h:  # also matches "CENTRUL"
            col_map["centru"] = idx
        elif "SUPRAFATA" in h or "SUPRAFAȚA" in h:
            col_map["suprafata"] = idx
    return col_map


def _find_header(rows):
    """Advance *rows* to the header row; return ``(index, col_map)``.

    The header is the first row whose first cell contains
    ``EXPECTED_HEADER_COL0``. Returns ``(None, None)`` when the search gives
    up. The iterator is left positioned just after the header so the caller
    can keep consuming data rows from it.
    """
    for i, r in enumerate(rows):
        if not r:
            continue
        if r[0] and EXPECTED_HEADER_COL0 in str(r[0]).upper():
            # Build column index map from header for resilience.
            return i, _build_col_map(r)
        if i > _HEADER_SCAN_LIMIT:
            break
    return None, None


def _field(cells, col_map, key):
    """Return the raw cell for *key*, or ``None`` if that column is absent."""
    idx = col_map.get(key)
    return None if idx is None else cells[idx]


def main():
    """Convert the XLSX named on the command line into the pipe-delimited file.

    Reads six positional arguments from ``sys.argv`` (see the module
    docstring). Exits with status 2 on a wrong argument count and with
    status 1 when openpyxl is unavailable or no usable header row is found.
    Progress and errors go to stderr.
    """
    if len(sys.argv) != 7:
        print(
            "usage: import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv "
            "CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL",
            file=sys.stderr,
        )
        sys.exit(2)

    (
        in_path,
        out_path,
        campaign_year,
        dataset_id,
        resource_id,
        source_url,
    ) = sys.argv[1:7]

    if openpyxl is None:
        print("[apia-import] ERROR: openpyxl is not installed", file=sys.stderr)
        sys.exit(1)

    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    try:
        ws = wb.active
        # One shared iterator: the data loop resumes right after the header.
        rows = ws.iter_rows(values_only=True)

        header_idx, col_map = _find_header(rows)
        if header_idx is None or not col_map or "name" not in col_map:
            print(
                "[apia-import] ERROR: header row not found in first 50 rows",
                file=sys.stderr,
            )
            sys.exit(1)

        print(
            f"[apia-import] header at row {header_idx}, col_map={col_map}",
            file=sys.stderr,
        )

        # Rows may come back shorter than the header; pad up to this width.
        width = max(col_map.values()) + 1

        n_data = 0
        n_skipped = 0
        with open(out_path, "w", encoding="utf-8") as f:
            for r in rows:
                if r is None:
                    continue
                cells = list(r)
                cells.extend([None] * (width - len(cells)))

                name = norm_text(cells[col_map["name"]])
                if not name:
                    # Blank/trailing rows carry no farmer name.
                    n_skipped += 1
                    continue

                out = [
                    campaign_year,
                    name,
                    norm_text(_field(cells, col_map, "comuna")),
                    norm_text(_field(cells, col_map, "sat")),
                    norm_text(_field(cells, col_map, "centru")),
                    norm_text(_field(cells, col_map, "responsabil")),
                    norm_num(_field(cells, col_map, "suprafata")),
                    dataset_id,
                    resource_id,
                    source_url,
                ]
                f.write("|".join(out) + "\n")
                n_data += 1
    finally:
        # Read-only workbooks keep the source file open until closed.
        wb.close()

    print(
        f"[apia-import] done — {n_data} rows, {n_skipped} skipped",
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()