#!/bin/bash # Download GeoNames RO postal codes and rebuild firms.postal_codes. # Then geocode firms.entities by postal_code lookup, falling back to UAT # centroid for firms without a valid postal code but with a siruta UAT. # # Coverage estimates (snapshot 2026-05-08): # - postal-precision: ~2.07M / 3.97M firms (52%) — accuracy ~100m-2km # - UAT-centroid fallback: +1.7M firms (44%) — accuracy 5-30km # - combined: ~96% of all firms get lat/lng # # Run before geocode-photon.ts (which targets the remaining ~4% / refines the # postal-level pins to housenumber level when available). # # Idempotent: safe to re-run weekly. Only rewrites firms.entities rows where # the existing pin is null OR was set by an older/lower-precision source. set -euo pipefail DATA_DIR=/opt/vreaudigital/data/postal LOG=/var/log/vreaudigital-postal-import.log GEONAMES_URL=https://download.geonames.org/export/zip/RO.zip log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; } mkdir -p "$DATA_DIR" log "=== Postal-codes import started ===" # ── Resolve DATABASE_URL via Infisical Machine Identity ── source /opt/vreaudigital/.infisical-mi TOKEN=$(infisical login --method=universal-auth \ --domain="$INFISICAL_API_URL" \ --client-id="$INFISICAL_CLIENT_ID" \ --client-secret="$INFISICAL_CLIENT_SECRET" \ --silent --plain) DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \ --projectId="$INFISICAL_PROJECT_ID" \ --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \ --silent --token="$TOKEN" \ -- sh -c 'echo "$DATABASE_URL"') DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//') export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|') export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|') export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|') export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|') export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|') unset DATABASE_URL TOKEN DB # ── Download + unzip ── log "Downloading $GEONAMES_URL..." curl -fsSL --max-time 120 -o "$DATA_DIR/RO.zip" "$GEONAMES_URL" log "Unzipping..." cd "$DATA_DIR" && unzip -o RO.zip -d "$DATA_DIR" >/dev/null [ -s "$DATA_DIR/RO.txt" ] || { log "FATAL: RO.txt missing or empty"; exit 1; } # ── Apply schema (idempotent) ── psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/014_firms_postal_codes.sql >/dev/null # ── Stage + UPSERT into firms.postal_codes ── log "TRUNCATE staging + COPY..." psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_postal_codes;" # GeoNames RO.txt is tab-separated, no header, US-ASCII safe (no quote escapes). psql -v ON_ERROR_STOP=1 <&1 | tee -a "$LOG" # ── Geocode firms.entities (chunked, deadlock-retry) ── # Two-pass: postal first (more precise), then UAT centroid as fallback. # Each chunk is its own psql transaction so a deadlock against the # concurrent ANAF enrichment script aborts only the current chunk # (caught + retried), not the entire batch's progress. run_chunked_update() { local label="$1" local sql="$2" local chunk_total=0 chunk_n=0 retries=0 while :; do # -X disables psqlrc, -e echoes the statement so we get "UPDATE N" tag OUT=$(psql -v ON_ERROR_STOP=1 -X 2>&1 <&1 | tee -a "$LOG" log "=== Postal-codes import done ==="