initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
+199
@@ -0,0 +1,199 @@
|
||||
#!/bin/bash
|
||||
# Download GeoNames RO postal codes and rebuild firms.postal_codes.
|
||||
# Then geocode firms.entities by postal_code lookup, falling back to UAT
|
||||
# centroid for firms without a valid postal code but with a siruta UAT.
|
||||
#
|
||||
# Coverage estimates (snapshot 2026-05-08):
|
||||
# - postal-precision: ~2.07M / 3.97M firms (52%) — accuracy ~100m-2km
|
||||
# - UAT-centroid fallback: +1.7M firms (44%) — accuracy 5-30km
|
||||
# - combined: ~96% of all firms get lat/lng
|
||||
#
|
||||
# Run before geocode-photon.ts (which targets the remaining ~4% / refines the
|
||||
# postal-level pins to housenumber level when available).
|
||||
#
|
||||
# Idempotent: safe to re-run weekly. Only rewrites firms.entities rows where
|
||||
# the existing pin is null OR was set by an older/lower-precision source.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
DATA_DIR=/opt/vreaudigital/data/postal
|
||||
LOG=/var/log/vreaudigital-postal-import.log
|
||||
GEONAMES_URL=https://download.geonames.org/export/zip/RO.zip
|
||||
|
||||
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
|
||||
mkdir -p "$DATA_DIR"
|
||||
|
||||
log "=== Postal-codes import started ==="
|
||||
|
||||
# ── Resolve DATABASE_URL via Infisical Machine Identity ──
|
||||
source /opt/vreaudigital/.infisical-mi
|
||||
TOKEN=$(infisical login --method=universal-auth \
|
||||
--domain="$INFISICAL_API_URL" \
|
||||
--client-id="$INFISICAL_CLIENT_ID" \
|
||||
--client-secret="$INFISICAL_CLIENT_SECRET" \
|
||||
--silent --plain)
|
||||
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
|
||||
--projectId="$INFISICAL_PROJECT_ID" \
|
||||
--env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
|
||||
--silent --token="$TOKEN" \
|
||||
-- sh -c 'echo "$DATABASE_URL"')
|
||||
DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
|
||||
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
|
||||
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
|
||||
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
|
||||
export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
|
||||
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
|
||||
unset DATABASE_URL TOKEN DB
|
||||
|
||||
# ── Download + unzip ──
|
||||
log "Downloading $GEONAMES_URL..."
|
||||
curl -fsSL --max-time 120 -o "$DATA_DIR/RO.zip" "$GEONAMES_URL"
|
||||
log "Unzipping..."
|
||||
cd "$DATA_DIR" && unzip -o RO.zip -d "$DATA_DIR" >/dev/null
|
||||
[ -s "$DATA_DIR/RO.txt" ] || { log "FATAL: RO.txt missing or empty"; exit 1; }
|
||||
|
||||
# ── Apply schema (idempotent) ──
|
||||
psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/014_firms_postal_codes.sql >/dev/null
|
||||
|
||||
# ── Stage + UPSERT into firms.postal_codes ──
|
||||
log "TRUNCATE staging + COPY..."
|
||||
psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_postal_codes;"
|
||||
|
||||
# GeoNames RO.txt is tab-separated, no header, US-ASCII safe (no quote escapes).
|
||||
psql -v ON_ERROR_STOP=1 <<COPYEOF
|
||||
\\copy firms.staging_postal_codes (country_code, postal_code, place_name, admin1_name, admin1_code, admin2_name, admin2_code, admin3_name, admin3_code, lat, lng, accuracy) FROM '$DATA_DIR/RO.txt' WITH (FORMAT csv, DELIMITER E'\t', NULL '', QUOTE E'\b', HEADER false);
|
||||
COPYEOF
|
||||
|
||||
log "Rebuilding firms.postal_codes from staging..."
|
||||
psql -v ON_ERROR_STOP=1 <<'SQL'
|
||||
TRUNCATE TABLE firms.postal_codes;
|
||||
INSERT INTO firms.postal_codes (postal_code, place_name, county, county_code, admin2_code, admin3_code, admin3_name, lat, lng, accuracy)
|
||||
SELECT
|
||||
s.postal_code,
|
||||
s.place_name,
|
||||
NULLIF(s.admin1_name, ''),
|
||||
NULLIF(s.admin1_code, ''),
|
||||
NULLIF(s.admin2_code, ''),
|
||||
NULLIF(s.admin3_code, ''),
|
||||
NULLIF(s.admin3_name, ''),
|
||||
s.lat::numeric(9,6),
|
||||
s.lng::numeric(9,6),
|
||||
NULLIF(s.accuracy, '')::int
|
||||
FROM firms.staging_postal_codes s
|
||||
WHERE s.postal_code ~ '^[0-9]{6}$'
|
||||
AND s.lat ~ '^-?[0-9.]+$'
|
||||
AND s.lng ~ '^-?[0-9.]+$'
|
||||
ON CONFLICT (postal_code, place_name) DO UPDATE
|
||||
SET lat = EXCLUDED.lat, lng = EXCLUDED.lng, accuracy = EXCLUDED.accuracy;
|
||||
SQL
|
||||
|
||||
log "Stats:"
|
||||
psql -At -F"|" -c "
|
||||
SELECT 'postal_codes_loaded', COUNT(*) FROM firms.postal_codes UNION ALL
|
||||
SELECT 'distinct_postal_codes', COUNT(DISTINCT postal_code) FROM firms.postal_codes;
|
||||
" 2>&1 | tee -a "$LOG"
|
||||
|
||||
# ── Geocode firms.entities (chunked, deadlock-retry) ──
|
||||
# Two-pass: postal first (more precise), then UAT centroid as fallback.
|
||||
# Each chunk is its own psql transaction so a deadlock against the
|
||||
# concurrent ANAF enrichment script aborts only the current chunk
|
||||
# (caught + retried), not the entire batch's progress.
|
||||
run_chunked_update() {
|
||||
local label="$1"
|
||||
local sql="$2"
|
||||
local chunk_total=0 chunk_n=0 retries=0
|
||||
while :; do
|
||||
# -X disables psqlrc, -e echoes the statement so we get "UPDATE N" tag
|
||||
OUT=$(psql -v ON_ERROR_STOP=1 -X 2>&1 <<SQL
|
||||
$sql
|
||||
SQL
|
||||
)
|
||||
if echo "$OUT" | grep -q "deadlock detected"; then
|
||||
retries=$((retries + 1))
|
||||
if [ "$retries" -gt 8 ]; then
|
||||
log "[$label] giving up after 8 deadlock retries"
|
||||
echo "$OUT" | tail -5 | tee -a "$LOG"
|
||||
return 1
|
||||
fi
|
||||
log "[$label] deadlock — retry #$retries in 2s"
|
||||
sleep 2
|
||||
continue
|
||||
fi
|
||||
if echo "$OUT" | grep -qE "^ERROR:"; then
|
||||
echo "$OUT" | tail -10 | tee -a "$LOG"
|
||||
return 1
|
||||
fi
|
||||
ROWS=$(echo "$OUT" | grep -oE '^UPDATE [0-9]+' | tail -1 | awk '{print $2}')
|
||||
ROWS=${ROWS:-0}
|
||||
chunk_n=$((chunk_n + 1))
|
||||
chunk_total=$((chunk_total + ROWS))
|
||||
if [ "$ROWS" = "0" ]; then
|
||||
log "[$label] done — $chunk_n chunks, $chunk_total rows"
|
||||
return 0
|
||||
fi
|
||||
log "[$label] chunk #$chunk_n: $ROWS rows (running total $chunk_total)"
|
||||
done
|
||||
}
|
||||
|
||||
log "Geocoding firms.entities by postal_code..."
|
||||
run_chunked_update "postal" "
|
||||
WITH cand AS (
|
||||
SELECT e.cui FROM firms.entities e
|
||||
WHERE e.adr_cod_postal ~ '^[0-9]{6}\$'
|
||||
AND (e.geocode_source IS NULL OR e.geocode_source = 'uat_centroid')
|
||||
AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
|
||||
ORDER BY e.cui
|
||||
LIMIT 50000
|
||||
)
|
||||
UPDATE firms.entities e
|
||||
SET
|
||||
lat = pc.lat::double precision,
|
||||
lng = pc.lng::double precision,
|
||||
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
|
||||
geocode_source = 'geonames_postal',
|
||||
geocode_score = 0.6,
|
||||
geocoded_at = now(),
|
||||
updated_at = now()
|
||||
FROM firms.postal_codes_best pc, cand
|
||||
WHERE e.cui = cand.cui
|
||||
AND e.adr_cod_postal = pc.postal_code;
|
||||
"
|
||||
|
||||
log "Geocoding firms.entities fallback to UAT centroid..."
|
||||
# public.\"GisUat\".geom is in SRID 3844 (RO STEREO70 projected). Geography
|
||||
# requires WGS84 lon/lat (4326), so ST_Transform before ::geography.
|
||||
run_chunked_update "uat" "
|
||||
WITH cand AS (
|
||||
SELECT e.cui FROM firms.entities e
|
||||
WHERE e.siruta IS NOT NULL
|
||||
AND e.geocode_source IS NULL
|
||||
AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta)
|
||||
ORDER BY e.cui
|
||||
LIMIT 50000
|
||||
)
|
||||
UPDATE firms.entities e
|
||||
SET
|
||||
lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
|
||||
lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
|
||||
geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
|
||||
geocode_source = 'uat_centroid',
|
||||
geocode_score = 0.3,
|
||||
geocoded_at = now(),
|
||||
updated_at = now()
|
||||
FROM public.\"GisUat\" gu, cand
|
||||
WHERE e.cui = cand.cui
|
||||
AND e.siruta = gu.siruta;
|
||||
"
|
||||
|
||||
log "Final stats:"
|
||||
psql -At -F"|" -c "
|
||||
SELECT
|
||||
COUNT(*) AS total,
|
||||
COUNT(*) FILTER (WHERE lat IS NOT NULL) AS cu_lat_lng,
|
||||
COUNT(*) FILTER (WHERE geocode_source = 'geonames_postal') AS via_postal,
|
||||
COUNT(*) FILTER (WHERE geocode_source = 'uat_centroid') AS via_uat,
|
||||
COUNT(*) FILTER (WHERE geocode_source = 'photon') AS via_photon
|
||||
FROM firms.entities;
|
||||
" 2>&1 | tee -a "$LOG"
|
||||
|
||||
log "=== Postal-codes import done ==="
|
||||
Reference in New Issue
Block a user