Files
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

200 lines
7.3 KiB
Bash
Executable File

#!/bin/bash
# Download GeoNames RO postal codes and rebuild firms.postal_codes.
# Then geocode firms.entities by postal_code lookup, falling back to UAT
# centroid for firms without a valid postal code but with a siruta UAT.
#
# Coverage estimates (snapshot 2026-05-08):
# - postal-precision: ~2.07M / 3.97M firms (52%) — accuracy ~100m-2km
# - UAT-centroid fallback: +1.7M firms (44%) — accuracy 5-30km
# - combined: ~96% of all firms get lat/lng
#
# Run before geocode-photon.ts (which targets the remaining ~4% / refines the
# postal-level pins to housenumber level when available).
#
# Idempotent: safe to re-run weekly. Only rewrites firms.entities rows where
# the existing pin is null OR was set by an older/lower-precision source.
set -euo pipefail
DATA_DIR=/opt/vreaudigital/data/postal
LOG=/var/log/vreaudigital-postal-import.log
GEONAMES_URL=https://download.geonames.org/export/zip/RO.zip
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
mkdir -p "$DATA_DIR"
log "=== Postal-codes import started ==="
# ── Resolve DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
--domain="$INFISICAL_API_URL" \
--client-id="$INFISICAL_CLIENT_ID" \
--client-secret="$INFISICAL_CLIENT_SECRET" \
--silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
--projectId="$INFISICAL_PROJECT_ID" \
--env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
--silent --token="$TOKEN" \
-- sh -c 'echo "$DATABASE_URL"')
DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
unset DATABASE_URL TOKEN DB
# ── Download + unzip ──
log "Downloading $GEONAMES_URL..."
curl -fsSL --max-time 120 -o "$DATA_DIR/RO.zip" "$GEONAMES_URL"
log "Unzipping..."
cd "$DATA_DIR" && unzip -o RO.zip -d "$DATA_DIR" >/dev/null
[ -s "$DATA_DIR/RO.txt" ] || { log "FATAL: RO.txt missing or empty"; exit 1; }
# ── Apply schema (idempotent) ──
psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/014_firms_postal_codes.sql >/dev/null
# ── Stage + UPSERT into firms.postal_codes ──
log "TRUNCATE staging + COPY..."
psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_postal_codes;"
# GeoNames RO.txt is tab-separated, no header, US-ASCII safe (no quote escapes).
psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_postal_codes (country_code, postal_code, place_name, admin1_name, admin1_code, admin2_name, admin2_code, admin3_name, admin3_code, lat, lng, accuracy) FROM '$DATA_DIR/RO.txt' WITH (FORMAT csv, DELIMITER E'\t', NULL '', QUOTE E'\b', HEADER false);
COPYEOF
log "Rebuilding firms.postal_codes from staging..."
psql -v ON_ERROR_STOP=1 <<'SQL'
TRUNCATE TABLE firms.postal_codes;
INSERT INTO firms.postal_codes (postal_code, place_name, county, county_code, admin2_code, admin3_code, admin3_name, lat, lng, accuracy)
SELECT
s.postal_code,
s.place_name,
NULLIF(s.admin1_name, ''),
NULLIF(s.admin1_code, ''),
NULLIF(s.admin2_code, ''),
NULLIF(s.admin3_code, ''),
NULLIF(s.admin3_name, ''),
s.lat::numeric(9,6),
s.lng::numeric(9,6),
NULLIF(s.accuracy, '')::int
FROM firms.staging_postal_codes s
WHERE s.postal_code ~ '^[0-9]{6}$'
AND s.lat ~ '^-?[0-9.]+$'
AND s.lng ~ '^-?[0-9.]+$'
ON CONFLICT (postal_code, place_name) DO UPDATE
SET lat = EXCLUDED.lat, lng = EXCLUDED.lng, accuracy = EXCLUDED.accuracy;
SQL
log "Stats:"
psql -At -F"|" -c "
SELECT 'postal_codes_loaded', COUNT(*) FROM firms.postal_codes UNION ALL
SELECT 'distinct_postal_codes', COUNT(DISTINCT postal_code) FROM firms.postal_codes;
" 2>&1 | tee -a "$LOG"
# ── Geocode firms.entities (chunked, deadlock-retry) ──
# Two-pass: postal first (more precise), then UAT centroid as fallback.
# Each chunk is its own psql transaction so a deadlock against the
# concurrent ANAF enrichment script aborts only the current chunk
# (caught + retried), not the entire batch's progress.
run_chunked_update() {
local label="$1"
local sql="$2"
local chunk_total=0 chunk_n=0 retries=0
while :; do
# -X disables psqlrc, -e echoes the statement so we get "UPDATE N" tag
OUT=$(psql -v ON_ERROR_STOP=1 -X 2>&1 <<SQL
$sql
SQL
)
if echo "$OUT" | grep -q "deadlock detected"; then
retries=$((retries + 1))
if [ "$retries" -gt 8 ]; then
log "[$label] giving up after 8 deadlock retries"
echo "$OUT" | tail -5 | tee -a "$LOG"
return 1
fi
log "[$label] deadlock — retry #$retries in 2s"
sleep 2
continue
fi
if echo "$OUT" | grep -qE "^ERROR:"; then
echo "$OUT" | tail -10 | tee -a "$LOG"
return 1
fi
ROWS=$(echo "$OUT" | grep -oE '^UPDATE [0-9]+' | tail -1 | awk '{print $2}')
ROWS=${ROWS:-0}
chunk_n=$((chunk_n + 1))
chunk_total=$((chunk_total + ROWS))
if [ "$ROWS" = "0" ]; then
log "[$label] done — $chunk_n chunks, $chunk_total rows"
return 0
fi
log "[$label] chunk #$chunk_n: $ROWS rows (running total $chunk_total)"
done
}
log "Geocoding firms.entities by postal_code..."
run_chunked_update "postal" "
WITH cand AS (
SELECT e.cui FROM firms.entities e
WHERE e.adr_cod_postal ~ '^[0-9]{6}\$'
AND (e.geocode_source IS NULL OR e.geocode_source = 'uat_centroid')
AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
ORDER BY e.cui
LIMIT 50000
)
UPDATE firms.entities e
SET
lat = pc.lat::double precision,
lng = pc.lng::double precision,
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
geocode_source = 'geonames_postal',
geocode_score = 0.6,
geocoded_at = now(),
updated_at = now()
FROM firms.postal_codes_best pc, cand
WHERE e.cui = cand.cui
AND e.adr_cod_postal = pc.postal_code;
"
log "Geocoding firms.entities fallback to UAT centroid..."
# public.\"GisUat\".geom is in SRID 3844 (RO STEREO70 projected). Geography
# requires WGS84 lon/lat (4326), so ST_Transform before ::geography.
run_chunked_update "uat" "
WITH cand AS (
SELECT e.cui FROM firms.entities e
WHERE e.siruta IS NOT NULL
AND e.geocode_source IS NULL
AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta)
ORDER BY e.cui
LIMIT 50000
)
UPDATE firms.entities e
SET
lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
geocode_source = 'uat_centroid',
geocode_score = 0.3,
geocoded_at = now(),
updated_at = now()
FROM public.\"GisUat\" gu, cand
WHERE e.cui = cand.cui
AND e.siruta = gu.siruta;
"
log "Final stats:"
psql -At -F"|" -c "
SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE lat IS NOT NULL) AS cu_lat_lng,
COUNT(*) FILTER (WHERE geocode_source = 'geonames_postal') AS via_postal,
COUNT(*) FILTER (WHERE geocode_source = 'uat_centroid') AS via_uat,
COUNT(*) FILTER (WHERE geocode_source = 'photon') AS via_photon
FROM firms.entities;
" 2>&1 | tee -a "$LOG"
log "=== Postal-codes import done ==="