Files
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

211 lines
7.6 KiB
Bash
Executable File

#!/bin/bash
# APIA "Lista fermieri" importer wrapper.
#
# Discovers CKAN package "lista-fermierilor-campania-apia-{YEAR}" on
# data.gov.ro and ingests each XLSX resource into apia.fermieri. The
# package can grow over time as more UATs publish their lists; the importer
# is resource-id keyed so re-runs are idempotent (DELETE WHERE
# source_resource_id = X before re-INSERT).
#
# Pattern follows cron/import-afir-historical.sh but simpler — APIA XLSX is
# tiny (KB-MB, not 30 MB), so we don't need streaming COPY tricks; we
# stage on satra and load directly.
#
# Usage:
# ./import-apia-fermieri.sh # all years (currently 2024)
# ./import-apia-fermieri.sh 2024 # only the given year
# ./import-apia-fermieri.sh 2024 1 # smoke test: only first resource
#
# Requires `jq` and `python3-openpyxl` on satra (already installed).
set -euo pipefail
YEAR_FILTER="${1:-}" # empty = all years discoverable
RESOURCE_LIMIT="${2:-0}" # 0 = all resources within selected year(s)
WORK_LOCAL="/tmp/apia-import-$$"
trap "rm -rf $WORK_LOCAL" EXIT
mkdir -p "$WORK_LOCAL"
SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)/scripts"
NORMALIZER="$SCRIPT_DIR/import-apia-fermieri.py"
# 1. Discover candidate datasets via CKAN search.
echo "[apia-import] discovering CKAN datasets..."
curl -sSL --max-time 60 \
"https://data.gov.ro/api/3/action/package_search?q=lista+fermieri+APIA&rows=50" \
> "$WORK_LOCAL/search.json"
# Extract: dataset_name | resource_id | resource_url | resource_format | resource_name
# Filter to xlsx resources whose dataset name matches lista-fermier*-apia-*.
python3 - "$WORK_LOCAL/search.json" "$YEAR_FILTER" > "$WORK_LOCAL/resources.tsv" <<'PY'
import json, sys, re
path, year_filter = sys.argv[1], sys.argv[2]
with open(path) as f:
d = json.load(f)
results = d.get("result", {}).get("results", [])
out_lines = []
for pkg in results:
name = pkg.get("name", "")
if not re.search(r"lista[-_]ferm", name, re.I):
continue
# Year extraction from package name (e.g. "lista-fermierilor-campania-apia-2024")
m = re.search(r"(20\d{2})", name)
pkg_year = m.group(1) if m else ""
if year_filter and pkg_year != year_filter:
continue
for rs in pkg.get("resources", []):
fmt = (rs.get("format") or "").upper()
if fmt not in ("XLSX", "XLS"):
continue
rid = rs.get("id") or ""
rurl = rs.get("url") or ""
rname = (rs.get("name") or "").replace("\t", " ")
if not (rid and rurl and pkg_year):
continue
out_lines.append(f"{name}\t{pkg_year}\t{rid}\t{rurl}\t{rname}")
if not out_lines:
print("[apia-import] no matching xlsx resources found", file=sys.stderr)
print("\n".join(out_lines))
PY
N_RESOURCES=$(wc -l < "$WORK_LOCAL/resources.tsv" || echo 0)
echo "[apia-import] found $N_RESOURCES candidate XLSX resource(s)"
if [ "$N_RESOURCES" -eq 0 ]; then
exit 0
fi
# Optional smoke truncation (head N).
if [ "$RESOURCE_LIMIT" -gt 0 ] 2>/dev/null; then
head -n "$RESOURCE_LIMIT" "$WORK_LOCAL/resources.tsv" > "$WORK_LOCAL/resources.smoke.tsv"
mv "$WORK_LOCAL/resources.smoke.tsv" "$WORK_LOCAL/resources.tsv"
echo "[apia-import] smoke mode — truncated to first $RESOURCE_LIMIT resource(s)"
fi
# 2. Upload normalizer to satra (once).
echo "[apia-import] uploading normalizer..."
ssh satra "mkdir -p /tmp/apia-import"
scp -q "$NORMALIZER" satra:/tmp/apia-import/normalize.py
# 3. For each resource: download → normalize → stage → INSERT.
TOTAL_ROWS=0
TOTAL_INSERTED=0
TOTAL_RESOURCES=0
while IFS=$'\t' read -r DATASET_ID YEAR RESOURCE_ID SOURCE_URL RESOURCE_NAME; do
TOTAL_RESOURCES=$((TOTAL_RESOURCES + 1))
WORK_REMOTE="/tmp/apia-import/$RESOURCE_ID"
echo "[apia-import] === $DATASET_ID / $RESOURCE_ID ($RESOURCE_NAME) ==="
STARTED_AT=$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)
T0=$(date +%s%3N)
ssh satra "mkdir -p $WORK_REMOTE && curl -sLkf --max-time 120 -o $WORK_REMOTE/listaferm.xlsx '$SOURCE_URL' && ls -lh $WORK_REMOTE/listaferm.xlsx"
ssh satra "python3 /tmp/apia-import/normalize.py \
$WORK_REMOTE/listaferm.xlsx $WORK_REMOTE/data.tsv \
'$YEAR' '$DATASET_ID' '$RESOURCE_ID' '$SOURCE_URL' 2>&1 | tail -5"
N_TSV=$(ssh satra "wc -l < $WORK_REMOTE/data.tsv")
echo "[apia-import] normalized rows: $N_TSV"
# Idempotent: drop existing rows for this resource_id, then re-INSERT.
ssh satra "/tmp/baseline.sh <<SQL
\\set ON_ERROR_STOP on
TRUNCATE TABLE apia.staging_fermieri;
\\copy apia.staging_fermieri FROM '$WORK_REMOTE/data.tsv' WITH (FORMAT text, DELIMITER '|', NULL '')
SELECT 'staged' AS step, COUNT(*) AS rows FROM apia.staging_fermieri;
DELETE FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';
-- Dedupe within the staging set on the natural key (UAT XLSXes occasionally
-- list the same farmer twice for separate parcel categories). Pick the row
-- with max suprafata_ha so we don't lose the larger declaration.
INSERT INTO apia.fermieri (
campaign_year, name, comuna_oras, sat, centru_apia,
responsabil_uat, suprafata_ha,
source_dataset_id, source_resource_id, source_url
)
SELECT DISTINCT ON (campaign_year::smallint, name, NULLIF(comuna_oras,''), NULLIF(sat,''))
campaign_year::smallint,
name,
NULLIF(comuna_oras, ''),
NULLIF(sat, ''),
NULLIF(centru_apia, ''),
NULLIF(responsabil_uat, ''),
NULLIF(suprafata_ha, '')::numeric,
source_dataset_id,
source_resource_id,
source_url
FROM apia.staging_fermieri
ORDER BY campaign_year::smallint, name, NULLIF(comuna_oras,''), NULLIF(sat,''),
NULLIF(suprafata_ha,'')::numeric DESC NULLS LAST
ON CONFLICT (campaign_year, name, comuna_oras, sat) DO UPDATE
SET centru_apia = EXCLUDED.centru_apia,
responsabil_uat = EXCLUDED.responsabil_uat,
suprafata_ha = EXCLUDED.suprafata_ha,
source_dataset_id = EXCLUDED.source_dataset_id,
source_resource_id = EXCLUDED.source_resource_id,
source_url = EXCLUDED.source_url,
fetched_at = now();
SELECT 'inserted' AS step,
COUNT(*) AS rows_now
FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';
SQL"
N_NOW=$(ssh satra "/tmp/baseline.sh -t -A -c \"SELECT COUNT(*) FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';\" 2>/dev/null | tail -1")
echo "[apia-import] inserted rows for $RESOURCE_ID: $N_NOW"
T1=$(date +%s%3N)
DURATION=$((T1 - T0))
# Log the run
ssh satra "/tmp/baseline.sh -c \"
INSERT INTO apia.scrape_log (
source_dataset_id, source_resource_id, source_url, campaign_year,
rows_seen, rows_inserted, duration_ms, started_at
) VALUES (
'$DATASET_ID', '$RESOURCE_ID', '$SOURCE_URL', $YEAR,
$N_TSV, $N_NOW, $DURATION, '$STARTED_AT'
);\" 2>&1 | tail -2"
TOTAL_ROWS=$((TOTAL_ROWS + N_TSV))
TOTAL_INSERTED=$((TOTAL_INSERTED + N_NOW))
ssh satra "rm -rf $WORK_REMOTE"
done < "$WORK_LOCAL/resources.tsv"
# 4. CUI matcher
echo "[apia-import] matching CUI..."
ssh satra "/tmp/baseline.sh -c 'SELECT * FROM apia.match_cui();' 2>&1 | tail -10"
# 5. Refresh MV
echo "[apia-import] refreshing materialized view..."
ssh satra "/tmp/baseline.sh -c 'REFRESH MATERIALIZED VIEW apia.mv_per_cui;' 2>&1 | tail -5"
# 6. Final summary
echo "[apia-import] === SUMMARY ==="
ssh satra "/tmp/baseline.sh <<'SQL'
SELECT
'totals' AS metric,
COUNT(*) AS rows_total,
COUNT(DISTINCT source_resource_id) AS resources,
COUNT(DISTINCT comuna_oras) AS comune,
COUNT(DISTINCT centru_apia) AS centre_apia,
ROUND(SUM(suprafata_ha)::numeric, 2) AS total_ha,
COUNT(*) FILTER (WHERE cui IS NOT NULL) AS rows_with_cui,
COUNT(*) FILTER (WHERE is_legal_person) AS rows_pj
FROM apia.fermieri;
SQL"
echo "[apia-import] === done ($TOTAL_RESOURCES resource(s), $TOTAL_INSERTED rows) ==="