a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
211 lines
7.6 KiB
Bash
Executable File
211 lines
7.6 KiB
Bash
Executable File
#!/bin/bash
|
|
# APIA "Lista fermieri" importer wrapper.
|
|
#
|
|
# Discovers CKAN package "lista-fermierilor-campania-apia-{YEAR}" on
|
|
# data.gov.ro and ingests each XLSX resource into apia.fermieri. The
|
|
# package can grow over time as more UATs publish their lists; the importer
|
|
# is resource-id keyed so re-runs are idempotent (DELETE WHERE
|
|
# source_resource_id = X before re-INSERT).
|
|
#
|
|
# Pattern follows cron/import-afir-historical.sh but simpler — APIA XLSX is
|
|
# tiny (KB-MB, not 30 MB), so we don't need streaming COPY tricks; we
|
|
# stage on satra and load directly.
|
|
#
|
|
# Usage:
|
|
# ./import-apia-fermieri.sh # all years (currently 2024)
|
|
# ./import-apia-fermieri.sh 2024 # only the given year
|
|
# ./import-apia-fermieri.sh 2024 1 # smoke test: only first resource
|
|
#
|
|
# Requires `jq` and `python3-openpyxl` on satra (already installed).
|
|
|
|
set -euo pipefail
|
|
|
|
YEAR_FILTER="${1:-}" # empty = all years discoverable
|
|
RESOURCE_LIMIT="${2:-0}" # 0 = all resources within selected year(s)
|
|
|
|
WORK_LOCAL="/tmp/apia-import-$$"
|
|
trap "rm -rf $WORK_LOCAL" EXIT
|
|
mkdir -p "$WORK_LOCAL"
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)/scripts"
|
|
NORMALIZER="$SCRIPT_DIR/import-apia-fermieri.py"
|
|
|
|
# 1. Discover candidate datasets via CKAN search.
|
|
echo "[apia-import] discovering CKAN datasets..."
|
|
curl -sSL --max-time 60 \
|
|
"https://data.gov.ro/api/3/action/package_search?q=lista+fermieri+APIA&rows=50" \
|
|
> "$WORK_LOCAL/search.json"
|
|
|
|
# Extract: dataset_name | resource_id | resource_url | resource_format | resource_name
|
|
# Filter to xlsx resources whose dataset name matches lista-fermier*-apia-*.
|
|
python3 - "$WORK_LOCAL/search.json" "$YEAR_FILTER" > "$WORK_LOCAL/resources.tsv" <<'PY'
|
|
import json, sys, re
|
|
|
|
path, year_filter = sys.argv[1], sys.argv[2]
|
|
with open(path) as f:
|
|
d = json.load(f)
|
|
|
|
results = d.get("result", {}).get("results", [])
|
|
out_lines = []
|
|
for pkg in results:
|
|
name = pkg.get("name", "")
|
|
if not re.search(r"lista[-_]ferm", name, re.I):
|
|
continue
|
|
# Year extraction from package name (e.g. "lista-fermierilor-campania-apia-2024")
|
|
m = re.search(r"(20\d{2})", name)
|
|
pkg_year = m.group(1) if m else ""
|
|
if year_filter and pkg_year != year_filter:
|
|
continue
|
|
for rs in pkg.get("resources", []):
|
|
fmt = (rs.get("format") or "").upper()
|
|
if fmt not in ("XLSX", "XLS"):
|
|
continue
|
|
rid = rs.get("id") or ""
|
|
rurl = rs.get("url") or ""
|
|
rname = (rs.get("name") or "").replace("\t", " ")
|
|
if not (rid and rurl and pkg_year):
|
|
continue
|
|
out_lines.append(f"{name}\t{pkg_year}\t{rid}\t{rurl}\t{rname}")
|
|
|
|
if not out_lines:
|
|
print("[apia-import] no matching xlsx resources found", file=sys.stderr)
|
|
|
|
print("\n".join(out_lines))
|
|
PY
|
|
|
|
N_RESOURCES=$(wc -l < "$WORK_LOCAL/resources.tsv" || echo 0)
|
|
echo "[apia-import] found $N_RESOURCES candidate XLSX resource(s)"
|
|
|
|
if [ "$N_RESOURCES" -eq 0 ]; then
|
|
exit 0
|
|
fi
|
|
|
|
# Optional smoke truncation (head N).
|
|
if [ "$RESOURCE_LIMIT" -gt 0 ] 2>/dev/null; then
|
|
head -n "$RESOURCE_LIMIT" "$WORK_LOCAL/resources.tsv" > "$WORK_LOCAL/resources.smoke.tsv"
|
|
mv "$WORK_LOCAL/resources.smoke.tsv" "$WORK_LOCAL/resources.tsv"
|
|
echo "[apia-import] smoke mode — truncated to first $RESOURCE_LIMIT resource(s)"
|
|
fi
|
|
|
|
# 2. Upload normalizer to satra (once).
|
|
echo "[apia-import] uploading normalizer..."
|
|
ssh satra "mkdir -p /tmp/apia-import"
|
|
scp -q "$NORMALIZER" satra:/tmp/apia-import/normalize.py
|
|
|
|
# 3. For each resource: download → normalize → stage → INSERT.
|
|
TOTAL_ROWS=0
|
|
TOTAL_INSERTED=0
|
|
TOTAL_RESOURCES=0
|
|
|
|
while IFS=$'\t' read -r DATASET_ID YEAR RESOURCE_ID SOURCE_URL RESOURCE_NAME; do
|
|
TOTAL_RESOURCES=$((TOTAL_RESOURCES + 1))
|
|
WORK_REMOTE="/tmp/apia-import/$RESOURCE_ID"
|
|
echo "[apia-import] === $DATASET_ID / $RESOURCE_ID ($RESOURCE_NAME) ==="
|
|
|
|
STARTED_AT=$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)
|
|
T0=$(date +%s%3N)
|
|
|
|
ssh satra "mkdir -p $WORK_REMOTE && curl -sLkf --max-time 120 -o $WORK_REMOTE/listaferm.xlsx '$SOURCE_URL' && ls -lh $WORK_REMOTE/listaferm.xlsx"
|
|
|
|
ssh satra "python3 /tmp/apia-import/normalize.py \
|
|
$WORK_REMOTE/listaferm.xlsx $WORK_REMOTE/data.tsv \
|
|
'$YEAR' '$DATASET_ID' '$RESOURCE_ID' '$SOURCE_URL' 2>&1 | tail -5"
|
|
|
|
N_TSV=$(ssh satra "wc -l < $WORK_REMOTE/data.tsv")
|
|
echo "[apia-import] normalized rows: $N_TSV"
|
|
|
|
# Idempotent: drop existing rows for this resource_id, then re-INSERT.
|
|
ssh satra "/tmp/baseline.sh <<SQL
|
|
\\set ON_ERROR_STOP on
|
|
|
|
TRUNCATE TABLE apia.staging_fermieri;
|
|
|
|
\\copy apia.staging_fermieri FROM '$WORK_REMOTE/data.tsv' WITH (FORMAT text, DELIMITER '|', NULL '')
|
|
|
|
SELECT 'staged' AS step, COUNT(*) AS rows FROM apia.staging_fermieri;
|
|
|
|
DELETE FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';
|
|
|
|
-- Dedupe within the staging set on the natural key (UAT XLSXes occasionally
|
|
-- list the same farmer twice for separate parcel categories). Pick the row
|
|
-- with max suprafata_ha so we don't lose the larger declaration.
|
|
INSERT INTO apia.fermieri (
|
|
campaign_year, name, comuna_oras, sat, centru_apia,
|
|
responsabil_uat, suprafata_ha,
|
|
source_dataset_id, source_resource_id, source_url
|
|
)
|
|
SELECT DISTINCT ON (campaign_year::smallint, name, NULLIF(comuna_oras,''), NULLIF(sat,''))
|
|
campaign_year::smallint,
|
|
name,
|
|
NULLIF(comuna_oras, ''),
|
|
NULLIF(sat, ''),
|
|
NULLIF(centru_apia, ''),
|
|
NULLIF(responsabil_uat, ''),
|
|
NULLIF(suprafata_ha, '')::numeric,
|
|
source_dataset_id,
|
|
source_resource_id,
|
|
source_url
|
|
FROM apia.staging_fermieri
|
|
ORDER BY campaign_year::smallint, name, NULLIF(comuna_oras,''), NULLIF(sat,''),
|
|
NULLIF(suprafata_ha,'')::numeric DESC NULLS LAST
|
|
ON CONFLICT (campaign_year, name, comuna_oras, sat) DO UPDATE
|
|
SET centru_apia = EXCLUDED.centru_apia,
|
|
responsabil_uat = EXCLUDED.responsabil_uat,
|
|
suprafata_ha = EXCLUDED.suprafata_ha,
|
|
source_dataset_id = EXCLUDED.source_dataset_id,
|
|
source_resource_id = EXCLUDED.source_resource_id,
|
|
source_url = EXCLUDED.source_url,
|
|
fetched_at = now();
|
|
|
|
SELECT 'inserted' AS step,
|
|
COUNT(*) AS rows_now
|
|
FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';
|
|
SQL"
|
|
|
|
N_NOW=$(ssh satra "/tmp/baseline.sh -t -A -c \"SELECT COUNT(*) FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';\" 2>/dev/null | tail -1")
|
|
echo "[apia-import] inserted rows for $RESOURCE_ID: $N_NOW"
|
|
|
|
T1=$(date +%s%3N)
|
|
DURATION=$((T1 - T0))
|
|
|
|
# Log the run
|
|
ssh satra "/tmp/baseline.sh -c \"
|
|
INSERT INTO apia.scrape_log (
|
|
source_dataset_id, source_resource_id, source_url, campaign_year,
|
|
rows_seen, rows_inserted, duration_ms, started_at
|
|
) VALUES (
|
|
'$DATASET_ID', '$RESOURCE_ID', '$SOURCE_URL', $YEAR,
|
|
$N_TSV, $N_NOW, $DURATION, '$STARTED_AT'
|
|
);\" 2>&1 | tail -2"
|
|
|
|
TOTAL_ROWS=$((TOTAL_ROWS + N_TSV))
|
|
TOTAL_INSERTED=$((TOTAL_INSERTED + N_NOW))
|
|
|
|
ssh satra "rm -rf $WORK_REMOTE"
|
|
done < "$WORK_LOCAL/resources.tsv"
|
|
|
|
# 4. CUI matcher
|
|
echo "[apia-import] matching CUI..."
|
|
ssh satra "/tmp/baseline.sh -c 'SELECT * FROM apia.match_cui();' 2>&1 | tail -10"
|
|
|
|
# 5. Refresh MV
|
|
echo "[apia-import] refreshing materialized view..."
|
|
ssh satra "/tmp/baseline.sh -c 'REFRESH MATERIALIZED VIEW apia.mv_per_cui;' 2>&1 | tail -5"
|
|
|
|
# 6. Final summary
|
|
echo "[apia-import] === SUMMARY ==="
|
|
ssh satra "/tmp/baseline.sh <<'SQL'
|
|
SELECT
|
|
'totals' AS metric,
|
|
COUNT(*) AS rows_total,
|
|
COUNT(DISTINCT source_resource_id) AS resources,
|
|
COUNT(DISTINCT comuna_oras) AS comune,
|
|
COUNT(DISTINCT centru_apia) AS centre_apia,
|
|
ROUND(SUM(suprafata_ha)::numeric, 2) AS total_ha,
|
|
COUNT(*) FILTER (WHERE cui IS NOT NULL) AS rows_with_cui,
|
|
COUNT(*) FILTER (WHERE is_legal_person) AS rows_pj
|
|
FROM apia.fermieri;
|
|
SQL"
|
|
|
|
echo "[apia-import] === done ($TOTAL_RESOURCES resource(s), $TOTAL_INSERTED rows) ==="
|