#!/bin/bash # Full geocoding fallback chain for firms.entities (WHERE lat IS NULL). # # Re-runnable / idempotent. Filters every stage on `lat IS NULL` so re-runs # are no-ops once coverage is full. Safe to call after any ONRC fresh import # (import-onrc-fresh.sh) which by itself does NOT geocode new rows. # # Stage chain (highest accuracy first): # 1. geonames_postal — exact 6-digit RO postal match against firms.postal_codes_best # 2. uat_centroid — by siruta → public."GisUat" polygon centroid # 3. photon — Komoot Photon OSM geocoder (local 127.0.0.1:2322), street-level # 3b/3c/3d. uat_centroid by postal_codes (locality+county median) — for rows w/o # adr_strada (Photon's filter requires it). Tries locality token, # then Comuna parent, then â/î normalization. # 4. judet_centroid — last resort, county median from firms.postal_codes # # Two rows in the entire dataset have literally zero address fields and stay NULL. # # Usage: # sudo /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh # sudo SKIP_PHOTON=1 /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh # # Env: # SKIP_PHOTON=1 — skip stage 3 (photon docker) — useful when Photon down # PHOTON_CONCURRENCY=40 # PHOTON_BATCH=200 set -euo pipefail LOG=/var/log/vreaudigital-geocode-firms.log SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SEAP_DIR="$(dirname "$SCRIPT_DIR")" SKIP_PHOTON="${SKIP_PHOTON:-0}" PHOTON_CONCURRENCY="${PHOTON_CONCURRENCY:-40}" PHOTON_BATCH="${PHOTON_BATCH:-200}" log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; } log "=== Geocode-firms fallback chain started ===" if [ ! -f /opt/vreaudigital/.infisical-mi ]; then log "FATAL: /opt/vreaudigital/.infisical-mi missing" exit 1 fi # shellcheck disable=SC1091 source /opt/vreaudigital/.infisical-mi TOKEN=$(infisical login --method=universal-auth \ --domain="$INFISICAL_API_URL" \ --client-id="$INFISICAL_CLIENT_ID" \ --client-secret="$INFISICAL_CLIENT_SECRET" \ --silent --plain) DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \ --projectId="$INFISICAL_PROJECT_ID" \ --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \ --silent --token="$TOKEN" \ -- sh -c 'echo "$DATABASE_URL"') DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//') export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|') export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|') export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|') export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|') export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|') initial_null=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;") log "Initial WHERE lat IS NULL count: $initial_null" if [ "$initial_null" = "0" ]; then log "Nothing to do — no firms with NULL lat." unset DATABASE_URL TOKEN DB PGPASSWORD exit 0 fi # ── Stage 1: geonames_postal ──────────────────────────────────────────────── log "[stage 1] geonames_postal (exact 6-digit postal match)..." n=$(psql -v ON_ERROR_STOP=1 -At -c " WITH cand AS ( SELECT e.cui FROM firms.entities e WHERE e.lat IS NULL AND e.adr_cod_postal ~ '^[0-9]{6}\$' AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal) ) UPDATE firms.entities e SET lat = pc.lat::double precision, lng = pc.lng::double precision, geom = ST_SetSRID(ST_MakePoint(pc.lng::double precision, pc.lat::double precision), 4326)::geography, geocode_source = 'geonames_postal', geocode_score = 0.6, geocoded_at = now(), updated_at = now() FROM firms.postal_codes_best pc, cand WHERE e.cui = cand.cui AND e.adr_cod_postal = pc.postal_code AND e.lat IS NULL RETURNING 1 " | wc -l) log "[stage 1] updated $n rows" # ── Stage 2: uat_centroid by siruta ───────────────────────────────────────── log "[stage 2] uat_centroid (via siruta → GisUat polygon centroid)..." n=$(psql -v ON_ERROR_STOP=1 -At -c " WITH cand AS ( SELECT e.cui FROM firms.entities e WHERE e.lat IS NULL AND e.siruta IS NOT NULL AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta) ) UPDATE firms.entities e SET lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision, lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision, geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography, geocode_source = 'uat_centroid', geocode_score = 0.3, geocoded_at = now(), updated_at = now() FROM public.\"GisUat\" gu, cand WHERE e.cui = cand.cui AND e.siruta = gu.siruta AND e.lat IS NULL RETURNING 1 " | wc -l) log "[stage 2] updated $n rows" # ── Stage 3: photon (docker) ──────────────────────────────────────────────── if [ "$SKIP_PHOTON" = "1" ]; then log "[stage 3] SKIP_PHOTON=1 — skipping photon stage" else remaining_photon=$(psql -At -c " SELECT count(*) FROM firms.entities WHERE geocode_source IS NULL AND adr_strada IS NOT NULL AND adr_judet IS NOT NULL ") if [ "$remaining_photon" = "0" ]; then log "[stage 3] no photon-eligible rows — skipping" else log "[stage 3] photon — $remaining_photon candidates..." if docker ps --filter name=vreaudigital-geocode --format '{{.Names}}' | grep -q '^vreaudigital-geocode$'; then log "WARN: vreaudigital-geocode already running — skipping stage 3" else docker rm -f vreaudigital-geocode 2>/dev/null || true umask 077 ENVF=$(mktemp /tmp/.vreaudigital-geocode-env.XXXXXX) printf 'DATABASE_URL=%s\nPHOTON_URL=http://127.0.0.1:2322\n' \ "$DATABASE_URL" > "$ENVF" cd "$SEAP_DIR" CID=$(docker run -d \ --name vreaudigital-geocode \ --network host \ --env-file "$ENVF" \ -v "$(pwd):/work" -w /work \ --user "$(id -u):$(id -g)" \ --restart no \ node:22-alpine \ sh -c "npx tsx src/geocode-photon.ts --concurrency=$PHOTON_CONCURRENCY --batch=$PHOTON_BATCH") log "container started: $CID" sleep 3 rm -f "$ENVF" docker wait vreaudigital-geocode >/dev/null EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-geocode 2>/dev/null || echo "?") docker logs vreaudigital-geocode 2>&1 | tail -10 | tee -a "$LOG" log "[stage 3] photon container exit=$EXIT_CODE" fi fi fi unset DATABASE_URL TOKEN DB # ── Stage 3b/3c/3d: uat_centroid by name (no siruta, no postal) ───────────── # For rows w/o adr_strada (skipped by photon) match postal_codes locality+county # median. Three normalization variants try locality token, comuna parent, and # Romanian â/î diacritic normalization. log "[stage 3b] uat_centroid by postal_codes locality+county median (locality token)..." n=$(psql -v ON_ERROR_STOP=1 -At -c " WITH cand AS ( SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL ), loc_clean AS ( SELECT cui, upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key, upper(unaccent(trim(regexp_replace( regexp_replace(adr_localitate, ',.*\$', ''), '^(Sat|Or[şs]\\.?|Mun\\.?|Loc\\.?|Cartier|Comuna)\\s+', '', 'i' )))) AS loc_key FROM cand ), pc_agg AS ( SELECT upper(unaccent(coalesce(county,''))) AS judet_key, upper(unaccent(place_name)) AS loc_key, percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat, percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng FROM firms.postal_codes WHERE place_name IS NOT NULL GROUP BY 1, 2 ) UPDATE firms.entities e SET lat = pc.lat, lng = pc.lng, geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography, geocode_source = 'uat_centroid', geocode_score = 0.3, geocoded_at = now(), updated_at = now() FROM loc_clean lc JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key WHERE e.cui = lc.cui AND e.lat IS NULL RETURNING 1 " | wc -l) log "[stage 3b] updated $n rows" log "[stage 3c] uat_centroid by Comuna parent..." n=$(psql -v ON_ERROR_STOP=1 -At -c " WITH cand AS ( SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL ), loc_clean AS ( SELECT cui, upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key, upper(unaccent(trim((regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1]))) AS loc_key FROM cand ), pc_agg AS ( SELECT upper(unaccent(coalesce(county,''))) AS judet_key, upper(unaccent(place_name)) AS loc_key, percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat, percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng FROM firms.postal_codes WHERE place_name IS NOT NULL GROUP BY 1, 2 ) UPDATE firms.entities e SET lat = pc.lat, lng = pc.lng, geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography, geocode_source = 'uat_centroid', geocode_score = 0.3, geocoded_at = now(), updated_at = now() FROM loc_clean lc JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key WHERE e.cui = lc.cui AND e.lat IS NULL AND lc.loc_key IS NOT NULL RETURNING 1 " | wc -l) log "[stage 3c] updated $n rows" log "[stage 3d] uat_centroid with â/î normalization (Oraş/Comuna/locality)..." n=$(psql -v ON_ERROR_STOP=1 -At -c " WITH cand AS ( SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL ), loc_norm AS ( SELECT cui, upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key, upper(unaccent(translate(trim(coalesce( (regexp_match(adr_localitate, 'Or[şs]\\.?\\s+([^,]+)', 'i'))[1], (regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1], regexp_replace(regexp_replace(adr_localitate, ',.*\$',''), '^(Sat|Loc\\.?)\\s+','','i') )), 'îÎ', 'âÂ'))) AS loc_key FROM cand ), pc_agg AS ( SELECT upper(unaccent(coalesce(county,''))) AS judet_key, upper(unaccent(translate(place_name, 'îÎ','âÂ'))) AS loc_key, percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat, percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng FROM firms.postal_codes WHERE place_name IS NOT NULL GROUP BY 1, 2 ) UPDATE firms.entities e SET lat = pc.lat, lng = pc.lng, geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography, geocode_source = 'uat_centroid', geocode_score = 0.3, geocoded_at = now(), updated_at = now() FROM loc_norm ln JOIN pc_agg pc ON pc.judet_key = ln.judet_key AND pc.loc_key = ln.loc_key WHERE e.cui = ln.cui AND e.lat IS NULL AND ln.loc_key IS NOT NULL RETURNING 1 " | wc -l) log "[stage 3d] updated $n rows" # ── Stage 4: judet_centroid fallback ──────────────────────────────────────── log "[stage 4] judet_centroid (county median, last resort)..." n=$(psql -v ON_ERROR_STOP=1 -At -c " WITH judet_agg AS ( SELECT upper(unaccent(coalesce(county,''))) AS judet_key, percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat, percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng FROM firms.postal_codes WHERE county IS NOT NULL GROUP BY 1 ) UPDATE firms.entities e SET lat = ja.lat, lng = ja.lng, geom = ST_SetSRID(ST_MakePoint(ja.lng, ja.lat), 4326)::geography, geocode_source = 'judet_centroid', geocode_score = 0.1, geocoded_at = now(), updated_at = now() FROM judet_agg ja WHERE upper(unaccent(regexp_replace(e.adr_judet,'^MUNICIPIUL ',''))) = ja.judet_key AND e.lat IS NULL RETURNING 1 " | wc -l) log "[stage 4] updated $n rows" # ── Final stats ───────────────────────────────────────────────────────────── log "Final stats:" psql -A -F"|" -c " SELECT geocode_source, count(*) AS rows FROM firms.entities GROUP BY geocode_source ORDER BY rows DESC; " 2>&1 | tee -a "$LOG" residual=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;") log "Residual WHERE lat IS NULL: $residual (out of reach — no address fields)" log "=== Geocode-firms fallback chain done ===" unset PGPASSWORD