a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
200 lines
7.3 KiB
Bash
Executable File
200 lines
7.3 KiB
Bash
Executable File
#!/bin/bash
|
|
# Download GeoNames RO postal codes and rebuild firms.postal_codes.
|
|
# Then geocode firms.entities by postal_code lookup, falling back to UAT
|
|
# centroid for firms without a valid postal code but with a siruta UAT.
|
|
#
|
|
# Coverage estimates (snapshot 2026-05-08):
|
|
# - postal-precision: ~2.07M / 3.97M firms (52%) — accuracy ~100m-2km
|
|
# - UAT-centroid fallback: +1.7M firms (44%) — accuracy 5-30km
|
|
# - combined: ~96% of all firms get lat/lng
|
|
#
|
|
# Run before geocode-photon.ts (which targets the remaining ~4% / refines the
|
|
# postal-level pins to housenumber level when available).
|
|
#
|
|
# Idempotent: safe to re-run weekly. Only rewrites firms.entities rows where
|
|
# the existing pin is null OR was set by an older/lower-precision source.
|
|
|
|
set -euo pipefail
|
|
|
|
DATA_DIR=/opt/vreaudigital/data/postal
|
|
LOG=/var/log/vreaudigital-postal-import.log
|
|
GEONAMES_URL=https://download.geonames.org/export/zip/RO.zip
|
|
|
|
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
|
|
mkdir -p "$DATA_DIR"
|
|
|
|
log "=== Postal-codes import started ==="
|
|
|
|
# ── Resolve DATABASE_URL via Infisical Machine Identity ──
|
|
source /opt/vreaudigital/.infisical-mi
|
|
TOKEN=$(infisical login --method=universal-auth \
|
|
--domain="$INFISICAL_API_URL" \
|
|
--client-id="$INFISICAL_CLIENT_ID" \
|
|
--client-secret="$INFISICAL_CLIENT_SECRET" \
|
|
--silent --plain)
|
|
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
|
|
--projectId="$INFISICAL_PROJECT_ID" \
|
|
--env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
|
|
--silent --token="$TOKEN" \
|
|
-- sh -c 'echo "$DATABASE_URL"')
|
|
DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
|
|
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
|
|
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
|
|
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
|
|
export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
|
|
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
|
|
unset DATABASE_URL TOKEN DB
|
|
|
|
# ── Download + unzip ──
|
|
log "Downloading $GEONAMES_URL..."
|
|
curl -fsSL --max-time 120 -o "$DATA_DIR/RO.zip" "$GEONAMES_URL"
|
|
log "Unzipping..."
|
|
cd "$DATA_DIR" && unzip -o RO.zip -d "$DATA_DIR" >/dev/null
|
|
[ -s "$DATA_DIR/RO.txt" ] || { log "FATAL: RO.txt missing or empty"; exit 1; }
|
|
|
|
# ── Apply schema (idempotent) ──
|
|
psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/014_firms_postal_codes.sql >/dev/null
|
|
|
|
# ── Stage + UPSERT into firms.postal_codes ──
|
|
log "TRUNCATE staging + COPY..."
|
|
psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_postal_codes;"
|
|
|
|
# GeoNames RO.txt is tab-separated, no header, US-ASCII safe (no quote escapes).
|
|
psql -v ON_ERROR_STOP=1 <<COPYEOF
|
|
\\copy firms.staging_postal_codes (country_code, postal_code, place_name, admin1_name, admin1_code, admin2_name, admin2_code, admin3_name, admin3_code, lat, lng, accuracy) FROM '$DATA_DIR/RO.txt' WITH (FORMAT csv, DELIMITER E'\t', NULL '', QUOTE E'\b', HEADER false);
|
|
COPYEOF
|
|
|
|
log "Rebuilding firms.postal_codes from staging..."
|
|
psql -v ON_ERROR_STOP=1 <<'SQL'
|
|
TRUNCATE TABLE firms.postal_codes;
|
|
INSERT INTO firms.postal_codes (postal_code, place_name, county, county_code, admin2_code, admin3_code, admin3_name, lat, lng, accuracy)
|
|
SELECT
|
|
s.postal_code,
|
|
s.place_name,
|
|
NULLIF(s.admin1_name, ''),
|
|
NULLIF(s.admin1_code, ''),
|
|
NULLIF(s.admin2_code, ''),
|
|
NULLIF(s.admin3_code, ''),
|
|
NULLIF(s.admin3_name, ''),
|
|
s.lat::numeric(9,6),
|
|
s.lng::numeric(9,6),
|
|
NULLIF(s.accuracy, '')::int
|
|
FROM firms.staging_postal_codes s
|
|
WHERE s.postal_code ~ '^[0-9]{6}$'
|
|
AND s.lat ~ '^-?[0-9.]+$'
|
|
AND s.lng ~ '^-?[0-9.]+$'
|
|
ON CONFLICT (postal_code, place_name) DO UPDATE
|
|
SET lat = EXCLUDED.lat, lng = EXCLUDED.lng, accuracy = EXCLUDED.accuracy;
|
|
SQL
|
|
|
|
log "Stats:"
|
|
psql -At -F"|" -c "
|
|
SELECT 'postal_codes_loaded', COUNT(*) FROM firms.postal_codes UNION ALL
|
|
SELECT 'distinct_postal_codes', COUNT(DISTINCT postal_code) FROM firms.postal_codes;
|
|
" 2>&1 | tee -a "$LOG"
|
|
|
|
# ── Geocode firms.entities (chunked, deadlock-retry) ──
|
|
# Two-pass: postal first (more precise), then UAT centroid as fallback.
|
|
# Each chunk is its own psql transaction so a deadlock against the
|
|
# concurrent ANAF enrichment script aborts only the current chunk
|
|
# (caught + retried), not the entire batch's progress.
|
|
run_chunked_update() {
|
|
local label="$1"
|
|
local sql="$2"
|
|
local chunk_total=0 chunk_n=0 retries=0
|
|
while :; do
|
|
# -X disables psqlrc, -e echoes the statement so we get "UPDATE N" tag
|
|
OUT=$(psql -v ON_ERROR_STOP=1 -X 2>&1 <<SQL
|
|
$sql
|
|
SQL
|
|
)
|
|
if echo "$OUT" | grep -q "deadlock detected"; then
|
|
retries=$((retries + 1))
|
|
if [ "$retries" -gt 8 ]; then
|
|
log "[$label] giving up after 8 deadlock retries"
|
|
echo "$OUT" | tail -5 | tee -a "$LOG"
|
|
return 1
|
|
fi
|
|
log "[$label] deadlock — retry #$retries in 2s"
|
|
sleep 2
|
|
continue
|
|
fi
|
|
if echo "$OUT" | grep -qE "^ERROR:"; then
|
|
echo "$OUT" | tail -10 | tee -a "$LOG"
|
|
return 1
|
|
fi
|
|
ROWS=$(echo "$OUT" | grep -oE '^UPDATE [0-9]+' | tail -1 | awk '{print $2}')
|
|
ROWS=${ROWS:-0}
|
|
chunk_n=$((chunk_n + 1))
|
|
chunk_total=$((chunk_total + ROWS))
|
|
if [ "$ROWS" = "0" ]; then
|
|
log "[$label] done — $chunk_n chunks, $chunk_total rows"
|
|
return 0
|
|
fi
|
|
log "[$label] chunk #$chunk_n: $ROWS rows (running total $chunk_total)"
|
|
done
|
|
}
|
|
|
|
log "Geocoding firms.entities by postal_code..."
|
|
run_chunked_update "postal" "
|
|
WITH cand AS (
|
|
SELECT e.cui FROM firms.entities e
|
|
WHERE e.adr_cod_postal ~ '^[0-9]{6}\$'
|
|
AND (e.geocode_source IS NULL OR e.geocode_source = 'uat_centroid')
|
|
AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
|
|
ORDER BY e.cui
|
|
LIMIT 50000
|
|
)
|
|
UPDATE firms.entities e
|
|
SET
|
|
lat = pc.lat::double precision,
|
|
lng = pc.lng::double precision,
|
|
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
|
|
geocode_source = 'geonames_postal',
|
|
geocode_score = 0.6,
|
|
geocoded_at = now(),
|
|
updated_at = now()
|
|
FROM firms.postal_codes_best pc, cand
|
|
WHERE e.cui = cand.cui
|
|
AND e.adr_cod_postal = pc.postal_code;
|
|
"
|
|
|
|
log "Geocoding firms.entities fallback to UAT centroid..."
|
|
# public.\"GisUat\".geom is in SRID 3844 (RO STEREO70 projected). Geography
|
|
# requires WGS84 lon/lat (4326), so ST_Transform before ::geography.
|
|
run_chunked_update "uat" "
|
|
WITH cand AS (
|
|
SELECT e.cui FROM firms.entities e
|
|
WHERE e.siruta IS NOT NULL
|
|
AND e.geocode_source IS NULL
|
|
AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta)
|
|
ORDER BY e.cui
|
|
LIMIT 50000
|
|
)
|
|
UPDATE firms.entities e
|
|
SET
|
|
lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
|
|
lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
|
|
geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
|
|
geocode_source = 'uat_centroid',
|
|
geocode_score = 0.3,
|
|
geocoded_at = now(),
|
|
updated_at = now()
|
|
FROM public.\"GisUat\" gu, cand
|
|
WHERE e.cui = cand.cui
|
|
AND e.siruta = gu.siruta;
|
|
"
|
|
|
|
log "Final stats:"
|
|
psql -At -F"|" -c "
|
|
SELECT
|
|
COUNT(*) AS total,
|
|
COUNT(*) FILTER (WHERE lat IS NOT NULL) AS cu_lat_lng,
|
|
COUNT(*) FILTER (WHERE geocode_source = 'geonames_postal') AS via_postal,
|
|
COUNT(*) FILTER (WHERE geocode_source = 'uat_centroid') AS via_uat,
|
|
COUNT(*) FILTER (WHERE geocode_source = 'photon') AS via_photon
|
|
FROM firms.entities;
|
|
" 2>&1 | tee -a "$LOG"
|
|
|
|
log "=== Postal-codes import done ==="
|