Files
vreau-digital/services/seap-scraper/scripts/import-apia-fermieri.py
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

168 lines
5.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""APIA "Lista fermieri" XLSX → pipe-delimited TSV normalizer.
Source: data.gov.ro CKAN package "lista-fermierilor-campania-apia-2024".
Currently a single resource (comuna Găgești, Vaslui, ~192 farmers), but the
package is supposed to grow as more UATs publish their lists. The XLSX
schema is set by APIA and identical across UATs:
Row 0 (header): NR.CRT | NUME PRENUME | RESPONSABIL UAT 2024
| COMUNA/ORAS | SAT | DATE CONTACT | CENTRUL APIA
| SUPRAFATA 2023 | (~17 None columns)
Rows 1..N (data): one row per farmer, NR.CRT 1-indexed.
Output: pipe-delimited TSV (no quoting), columns in this order:
campaign_year | name | comuna_oras | sat | centru_apia
| responsabil_uat | suprafata_ha
| source_dataset_id | source_resource_id | source_url
Empty strings stay empty (NULL in COPY with NULL '').
Usage:
python3 import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv \\
CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL
"""
import re
import sys
import openpyxl
# Marker text expected in the first cell of the header row; main() searches
# for it (after upper-casing the cell) to find where the table starts.
EXPECTED_HEADER_COL0 = "NR.CRT"
# Expected leading text of the name column header.
# NOTE(review): not referenced by the code visible in this file — confirm
# whether it is still needed.
EXPECTED_HEADER_COL1 = "NUME" # "NUME PRENUME" or "NUME SI PRENUME"
def norm_text(v):
    """Render a cell value as a single sanitized text field.

    ``None`` and whitespace-only values yield ``""``. Any other value is
    stringified and trimmed; embedded pipes become ``/`` (the pipe is the
    output delimiter), tabs and line breaks become spaces, every run of
    whitespace collapses to one space, and backslashes are doubled.
    """
    text = "" if v is None else str(v).strip()
    if text == "":
        return ""
    # One translation pass: the delimiter turns into "/", and the characters
    # that would break the one-line-per-record layout turn into spaces.
    unsafe = str.maketrans({"|": "/", "\t": " ", "\r": " ", "\n": " "})
    text = re.sub(r"\s+", " ", text.translate(unsafe))
    # Doubled last, so the backslashes reach the consumer as literal
    # characters (presumably the COPY loader named in the module docstring).
    return text.replace("\\", "\\\\")
def norm_num(v):
    """Render a numeric cell value as a plain decimal string.

    Args:
        v: Cell value: ``None``, an ``int``/``float``, or any other object,
            which is stringified.

    Returns:
        ``""`` for ``None`` or blank input. Numbers are formatted with at
        most four decimals and no trailing zeros (``12.0`` -> ``"12"``);
        values that round to zero are always ``"0"``, never ``"-0"``.
        Strings containing a comma are read as "dot thousands separator,
        comma decimal mark" and converted to a dot decimal mark; other
        strings pass through. String results are sanitized the same way
        text fields are: whitespace runs (including tabs and line breaks)
        collapse to one space, pipes become ``/`` and backslashes are
        doubled, so a stray character in the cell cannot split or corrupt
        the output row.
    """
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        # APIA SUPRAFATA arrives as float ("1.04", "12.45") — already using a
        # dot decimal mark. Trim trailing zeros after the decimal point.
        s = f"{v:.4f}".rstrip("0").rstrip(".")
        # Tiny negatives (and -0.0) format as "-0"; normalize to plain "0".
        return "0" if s in ("", "-0") else s
    # split()/join trims the ends and flattens embedded tabs/newlines, which
    # would otherwise break the one-line-per-record output.
    s = " ".join(str(v).split())
    if not s:
        return ""
    if "," in s:
        # "1.234,56" -> "1234.56"
        s = s.replace(".", "").replace(",", ".")
    # Same delimiter and backslash handling as the text fields.
    return s.replace("|", "/").replace("\\", "\\\\")
def main():
    """Convert one APIA farmer-list XLSX into the pipe-delimited output file.

    Reads six positional command-line arguments (INPUT.xlsx, OUTPUT.tsv,
    CAMPAIGN_YEAR, DATASET_ID, RESOURCE_ID, SOURCE_URL), finds the header row
    on the workbook's active sheet, maps the known columns by header text,
    and writes one output line per following row that has a non-empty name.
    Progress and error messages go to stderr.

    Exits with status 2 when the argument count is wrong and with status 1
    when no usable header row (one with a name column) is found.
    """
    if len(sys.argv) != 7:
        print(
            "usage: import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv "
            "CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL",
            file=sys.stderr,
        )
        sys.exit(2)
    (
        in_path,
        out_path,
        campaign_year,
        dataset_id,
        resource_id,
        source_url,
    ) = sys.argv[1:7]

    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    # A read-only workbook keeps the source file open until close() is
    # called, so release it on every exit path (including sys.exit below).
    try:
        ws = wb.active
        # Single shared iterator: the header search consumes rows up to and
        # including the header, and the data loop continues right after it.
        rows = ws.iter_rows(values_only=True)

        header_idx = None
        col_map = None
        for i, r in enumerate(rows):
            if not r:
                continue
            if r[0] and EXPECTED_HEADER_COL0 in str(r[0]).upper():
                # Build column index map from header for resilience.
                header = [
                    str(c).strip().upper() if c is not None else "" for c in r
                ]
                col_map = {}
                for idx, h in enumerate(header):
                    if "NR.CRT" in h or "NRCRT" in h:
                        col_map["nr"] = idx
                    elif "NUME" in h:  # "NUME PRENUME" / "NUME SI PRENUME"
                        # First matching column wins.
                        col_map.setdefault("name", idx)
                    elif "RESPONSABIL" in h:
                        col_map["responsabil"] = idx
                    elif "COMUNA" in h or "ORAS" in h:
                        col_map["comuna"] = idx
                    elif h == "SAT" or h.startswith("SAT "):
                        col_map["sat"] = idx
                    elif "CENTRUL" in h or "CENTRU" in h:
                        col_map["centru"] = idx
                    elif "SUPRAFATA" in h or "SUPRAFAȚA" in h:
                        col_map["suprafata"] = idx
                header_idx = i
                break
            # Checked after the header test, so the row at this index has
            # already been inspected before the search gives up.
            if i > 50:
                break

        if header_idx is None or not col_map or "name" not in col_map:
            print(
                "[apia-import] ERROR: header row not found in first 50 rows",
                file=sys.stderr,
            )
            sys.exit(1)
        print(
            f"[apia-import] header at row {header_idx}, col_map={col_map}",
            file=sys.stderr,
        )

        # Loop-invariant: highest column index any mapped field reads from.
        # col_map is guaranteed non-empty here (it contains "name").
        max_idx = max(col_map.values())

        def field(cells, key, normalize=norm_text):
            """Return the normalized cell for *key*, or "" if unmapped."""
            idx = col_map.get(key)
            return "" if idx is None else normalize(cells[idx])

        n_data = 0
        n_skipped = 0
        with open(out_path, "w", encoding="utf-8") as f:
            for r in rows:
                if r is None:
                    continue
                cells = list(r)
                # Pad short rows so every mapped index is addressable.
                if len(cells) <= max_idx:
                    cells.extend([None] * (max_idx + 1 - len(cells)))
                name = field(cells, "name")
                if not name:
                    # Blank name: treated as a non-data row.
                    n_skipped += 1
                    continue
                out = [
                    campaign_year,
                    name,
                    field(cells, "comuna"),
                    field(cells, "sat"),
                    field(cells, "centru"),
                    field(cells, "responsabil"),
                    field(cells, "suprafata", norm_num),
                    dataset_id,
                    resource_id,
                    source_url,
                ]
                f.write("|".join(out) + "\n")
                n_data += 1
        print(
            f"[apia-import] done — {n_data} rows, {n_skipped} skipped",
            file=sys.stderr,
        )
    finally:
        wb.close()
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()