a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
168 lines
5.3 KiB
Python
Executable File
168 lines
5.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""APIA "Lista fermieri" XLSX → pipe-delimited TSV normalizer.
|
|
|
|
Source: data.gov.ro CKAN package "lista-fermierilor-campania-apia-2024".
|
|
Currently a single resource (comuna Găgești, Vaslui, ~192 farmers), but the
|
|
package is supposed to grow as more UATs publish their lists. The XLSX
|
|
schema is set by APIA and identical across UATs:
|
|
|
|
Row 0 (header): NR.CRT | NUME PRENUME | RESPONSABIL UAT 2024
|
|
| COMUNA/ORAS | SAT | DATE CONTACT | CENTRUL APIA
|
|
| SUPRAFATA 2023 | (~17 None columns)
|
|
Rows 1..N (data): one row per farmer, NR.CRT 1-indexed.
|
|
|
|
Output: pipe-delimited TSV (no quoting), columns in this order:
|
|
|
|
campaign_year | name | comuna_oras | sat | centru_apia
|
|
| responsabil_uat | suprafata_ha
|
|
| source_dataset_id | source_resource_id | source_url
|
|
|
|
Empty values are written as empty fields (loaded as NULL by COPY with NULL '').
|
|
|
|
Usage:
|
|
python3 import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv \\
|
|
CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
|
|
import openpyxl
|
|
|
|
# Text searched for (upper-cased) in the first cell of a row to recognise
# the header row.
EXPECTED_HEADER_COL0 = "NR.CRT"
# Marker for the farmer-name header: "NUME PRENUME" or "NUME SI PRENUME".
EXPECTED_HEADER_COL1 = "NUME"
|
|
|
|
|
|
def norm_text(v):
    """Normalize a cell value into a single-line, pipe-safe text field.

    Args:
        v: Raw cell value of any type, or None.

    Returns:
        "" when ``v`` is None or contains only whitespace. Otherwise the
        stringified value with surrounding whitespace trimmed, embedded
        pipes replaced by "/", every run of whitespace (tabs and line
        breaks included) collapsed to one space, and backslashes doubled.
    """
    if v is None:
        return ""
    s = str(v).strip()
    if not s:
        return ""
    # Pipe is the output delimiter, so it must never appear inside a field.
    s = s.replace("|", "/")
    # A single regex pass covers tabs, CR/LF and repeated spaces alike, so
    # no per-character replaces are needed beforehand.
    s = re.sub(r"\s+", " ", s)
    # Double backslashes last. Presumably the consumer (COPY, per the module
    # docstring) treats a backslash as an escape character — confirm.
    return s.replace("\\", "\\\\")
|
|
|
|
|
|
def norm_num(v):
    """Normalize a numeric cell value into a plain decimal string.

    Args:
        v: Raw cell value: a number, a string holding a number, or None.

    Returns:
        "" for None or blank strings.
        For int/float input: the value formatted with at most four decimal
        places, trailing zeros (and a trailing ".") removed; values that
        round to zero are returned as "0", never "-0".
        For any other input: the stringified value, trimmed. When it
        contains a comma it is treated as Romanian notation ("." thousands
        separator, "," decimal separator) and converted to English
        notation. Pipes become "/", whitespace runs collapse to one space
        and backslashes are doubled, so the result always fits in a single
        output field.
    """
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        # Fixed-point formatting avoids exponent notation; then drop the
        # zero padding ("12.4500" -> "12.45", "100.0000" -> "100").
        s = f"{v:.4f}".rstrip("0").rstrip(".")
        # Tiny negative values format as "-0.0000" and would otherwise be
        # emitted as "-0".
        if s in ("", "-0"):
            return "0"
        return s
    s = str(v).strip()
    if not s:
        return ""
    if "," in s:
        # "1.234,56" -> "1234.56"
        s = s.replace(".", "").replace(",", ".")
    # Same field-safety treatment as text fields: no delimiter, no line
    # breaks, no bare backslashes.
    s = s.replace("|", "/")
    s = " ".join(s.split())
    return s.replace("\\", "\\\\")
|
|
|
|
|
|
def _build_col_map(header_row):
    """Map logical field names to the column indexes found in a header row.

    Args:
        header_row: Sequence of header cell values (cells may be None).

    Returns:
        Dict with any of the keys "nr", "name", "responsabil", "comuna",
        "sat", "centru", "suprafata", each mapped to a 0-based column
        index. Only the first column containing "NUME" is kept for "name";
        for the other keys a later matching column overrides an earlier one.
    """
    col_map = {}
    for idx, cell in enumerate(header_row):
        h = str(cell).strip().upper() if cell is not None else ""
        # Order matters: each header is claimed by the first test it passes.
        if "NR.CRT" in h or "NRCRT" in h:
            col_map["nr"] = idx
        elif "NUME" in h:  # "NUME PRENUME" / "NUME SI PRENUME"
            col_map.setdefault("name", idx)
        elif "RESPONSABIL" in h:
            col_map["responsabil"] = idx
        elif "COMUNA" in h or "ORAS" in h:
            col_map["comuna"] = idx
        elif h == "SAT" or h.startswith("SAT "):
            col_map["sat"] = idx
        elif "CENTRU" in h:  # also matches "CENTRUL"
            col_map["centru"] = idx
        elif (
            "SUPRAFATA" in h
            or "SUPRAFAȚA" in h
            or "SUPRAFA\u0162A" in h  # legacy cedilla spelling of the T
        ):
            col_map["suprafata"] = idx
    return col_map


def main():
    """Convert one APIA farmers XLSX into a pipe-delimited text file.

    Reads six positional command-line arguments: INPUT.xlsx, OUTPUT.tsv,
    CAMPAIGN_YEAR, DATASET_ID, RESOURCE_ID and SOURCE_URL. The header row is
    located by looking for EXPECTED_HEADER_COL0 in the first cell of the
    leading rows of the active sheet; every following row with a non-empty
    name is written as one output line. Progress and errors go to stderr.

    Exits with status 2 on a wrong argument count and with status 1 when no
    usable header row is found.
    """
    if len(sys.argv) != 7:
        print(
            "usage: import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv "
            "CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL",
            file=sys.stderr,
        )
        sys.exit(2)

    (
        in_path,
        out_path,
        campaign_year,
        dataset_id,
        resource_id,
        source_url,
    ) = sys.argv[1:7]
    # NOTE(review): the four CLI-supplied values are written verbatim; a "|"
    # inside SOURCE_URL would shift columns — confirm callers never pass one.

    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    try:
        ws = wb.active

        # One shared iterator: the header scan consumes rows up to and
        # including the header, the data loop continues right after it.
        rows = ws.iter_rows(values_only=True)
        header_idx = None
        col_map = None
        for i, r in enumerate(rows):
            if not r:
                continue
            if r[0] and EXPECTED_HEADER_COL0 in str(r[0]).upper():
                # Build the column index map from the header for resilience
                # against reordered columns.
                col_map = _build_col_map(r)
                header_idx = i
                break
            if i > 50:
                break

        if header_idx is None:
            print(
                "[apia-import] ERROR: header row not found in first 50 rows",
                file=sys.stderr,
            )
            sys.exit(1)
        if not col_map or "name" not in col_map:
            # A header was found, so report the actual problem instead of
            # claiming the header is missing.
            print(
                f"[apia-import] ERROR: header at row {header_idx} "
                "has no NUME column",
                file=sys.stderr,
            )
            sys.exit(1)

        print(
            f"[apia-import] header at row {header_idx}, col_map={col_map}",
            file=sys.stderr,
        )

        # Loop-invariant: highest column index that will be read per row.
        max_idx = max(col_map.values())

        def field(cells, key, normalize=norm_text):
            """Return the normalized cell for ``key``, or "" if unmapped."""
            if key not in col_map:
                return ""
            return normalize(cells[col_map[key]])

        n_data = 0
        n_skipped = 0

        with open(out_path, "w", encoding="utf-8") as f:
            for r in rows:
                if r is None:
                    continue
                cells = list(r)
                # Pad short rows so every mapped index is addressable.
                cells.extend([None] * (max_idx + 1 - len(cells)))

                name = norm_text(cells[col_map["name"]])
                if not name:
                    # Rows without a farmer name are not data.
                    n_skipped += 1
                    continue

                out = [
                    campaign_year,
                    name,
                    field(cells, "comuna"),
                    field(cells, "sat"),
                    field(cells, "centru"),
                    field(cells, "responsabil"),
                    field(cells, "suprafata", norm_num),
                    dataset_id,
                    resource_id,
                    source_url,
                ]
                f.write("|".join(out) + "\n")
                n_data += 1

        print(
            f"[apia-import] done — {n_data} rows, {n_skipped} skipped",
            file=sys.stderr,
        )
    finally:
        # Read-only workbooks keep the source file open until closed.
        wb.close()
|
|
|
|
|
|
# Script entry point: run the conversion only when executed directly, not
# when the module is imported.
if __name__ == "__main__":
    main()
|