Files
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

344 lines
13 KiB
Bash
Executable File

#!/bin/bash
# Full geocoding fallback chain for firms.entities (WHERE lat IS NULL).
#
# Re-runnable / idempotent. Filters every stage on `lat IS NULL` so re-runs
# are no-ops once coverage is full. Safe to call after any ONRC fresh import
# (import-onrc-fresh.sh) which by itself does NOT geocode new rows.
#
# Stage chain (highest accuracy first):
# 1. geonames_postal — exact 6-digit RO postal match against firms.postal_codes_best
# 2. uat_centroid — by siruta → public."GisUat" polygon centroid
# 3. photon — Komoot Photon OSM geocoder (local 127.0.0.1:2322), street-level
# 3b/3c/3d. uat_centroid by postal_codes (locality+county median) — for rows w/o
# adr_strada (Photon's filter requires it). Tries locality token,
# then Comuna parent, then â/î normalization.
# 4. judet_centroid — last resort, county median from firms.postal_codes
#
# Two rows in the entire dataset have literally zero address fields and stay NULL.
#
# Usage:
# sudo /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh
# sudo SKIP_PHOTON=1 /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh
#
# Env:
# SKIP_PHOTON=1 — skip stage 3 (photon docker) — useful when Photon down
# PHOTON_CONCURRENCY=40
# PHOTON_BATCH=200
set -euo pipefail
LOG=/var/log/vreaudigital-geocode-firms.log
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
SEAP_DIR="$(dirname "$SCRIPT_DIR")"
SKIP_PHOTON="${SKIP_PHOTON:-0}"
PHOTON_CONCURRENCY="${PHOTON_CONCURRENCY:-40}"
PHOTON_BATCH="${PHOTON_BATCH:-200}"
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
log "=== Geocode-firms fallback chain started ==="
if [ ! -f /opt/vreaudigital/.infisical-mi ]; then
log "FATAL: /opt/vreaudigital/.infisical-mi missing"
exit 1
fi
# shellcheck disable=SC1091
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
--domain="$INFISICAL_API_URL" \
--client-id="$INFISICAL_CLIENT_ID" \
--client-secret="$INFISICAL_CLIENT_SECRET" \
--silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
--projectId="$INFISICAL_PROJECT_ID" \
--env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
--silent --token="$TOKEN" \
-- sh -c 'echo "$DATABASE_URL"')
DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
initial_null=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;")
log "Initial WHERE lat IS NULL count: $initial_null"
if [ "$initial_null" = "0" ]; then
log "Nothing to do — no firms with NULL lat."
unset DATABASE_URL TOKEN DB PGPASSWORD
exit 0
fi
# ── Stage 1: geonames_postal ────────────────────────────────────────────────
log "[stage 1] geonames_postal (exact 6-digit postal match)..."
n=$(psql -v ON_ERROR_STOP=1 -At -c "
WITH cand AS (
SELECT e.cui FROM firms.entities e
WHERE e.lat IS NULL
AND e.adr_cod_postal ~ '^[0-9]{6}\$'
AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
)
UPDATE firms.entities e
SET
lat = pc.lat::double precision,
lng = pc.lng::double precision,
geom = ST_SetSRID(ST_MakePoint(pc.lng::double precision, pc.lat::double precision), 4326)::geography,
geocode_source = 'geonames_postal',
geocode_score = 0.6,
geocoded_at = now(),
updated_at = now()
FROM firms.postal_codes_best pc, cand
WHERE e.cui = cand.cui
AND e.adr_cod_postal = pc.postal_code
AND e.lat IS NULL
RETURNING 1
" | wc -l)
log "[stage 1] updated $n rows"
# ── Stage 2: uat_centroid by siruta ─────────────────────────────────────────
log "[stage 2] uat_centroid (via siruta → GisUat polygon centroid)..."
n=$(psql -v ON_ERROR_STOP=1 -At -c "
WITH cand AS (
SELECT e.cui FROM firms.entities e
WHERE e.lat IS NULL
AND e.siruta IS NOT NULL
AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta)
)
UPDATE firms.entities e
SET
lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
geocode_source = 'uat_centroid',
geocode_score = 0.3,
geocoded_at = now(),
updated_at = now()
FROM public.\"GisUat\" gu, cand
WHERE e.cui = cand.cui
AND e.siruta = gu.siruta
AND e.lat IS NULL
RETURNING 1
" | wc -l)
log "[stage 2] updated $n rows"
# ── Stage 3: photon (docker) ────────────────────────────────────────────────
if [ "$SKIP_PHOTON" = "1" ]; then
log "[stage 3] SKIP_PHOTON=1 — skipping photon stage"
else
remaining_photon=$(psql -At -c "
SELECT count(*) FROM firms.entities
WHERE geocode_source IS NULL
AND adr_strada IS NOT NULL
AND adr_judet IS NOT NULL
")
if [ "$remaining_photon" = "0" ]; then
log "[stage 3] no photon-eligible rows — skipping"
else
log "[stage 3] photon — $remaining_photon candidates..."
if docker ps --filter name=vreaudigital-geocode --format '{{.Names}}' | grep -q '^vreaudigital-geocode$'; then
log "WARN: vreaudigital-geocode already running — skipping stage 3"
else
docker rm -f vreaudigital-geocode 2>/dev/null || true
umask 077
ENVF=$(mktemp /tmp/.vreaudigital-geocode-env.XXXXXX)
printf 'DATABASE_URL=%s\nPHOTON_URL=http://127.0.0.1:2322\n' \
"$DATABASE_URL" > "$ENVF"
cd "$SEAP_DIR"
CID=$(docker run -d \
--name vreaudigital-geocode \
--network host \
--env-file "$ENVF" \
-v "$(pwd):/work" -w /work \
--user "$(id -u):$(id -g)" \
--restart no \
node:22-alpine \
sh -c "npx tsx src/geocode-photon.ts --concurrency=$PHOTON_CONCURRENCY --batch=$PHOTON_BATCH")
log "container started: $CID"
sleep 3
rm -f "$ENVF"
docker wait vreaudigital-geocode >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-geocode 2>/dev/null || echo "?")
docker logs vreaudigital-geocode 2>&1 | tail -10 | tee -a "$LOG"
log "[stage 3] photon container exit=$EXIT_CODE"
fi
fi
fi
unset DATABASE_URL TOKEN DB
# ── Stage 3b/3c/3d: uat_centroid by name (no siruta, no postal) ─────────────
# For rows w/o adr_strada (skipped by photon) match postal_codes locality+county
# median. Three normalization variants try locality token, comuna parent, and
# Romanian â/î diacritic normalization.
log "[stage 3b] uat_centroid by postal_codes locality+county median (locality token)..."
n=$(psql -v ON_ERROR_STOP=1 -At -c "
WITH cand AS (
SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
),
loc_clean AS (
SELECT
cui,
upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
upper(unaccent(trim(regexp_replace(
regexp_replace(adr_localitate, ',.*\$', ''),
'^(Sat|Or[şs]\\.?|Mun\\.?|Loc\\.?|Cartier|Comuna)\\s+', '', 'i'
)))) AS loc_key
FROM cand
),
pc_agg AS (
SELECT
upper(unaccent(coalesce(county,''))) AS judet_key,
upper(unaccent(place_name)) AS loc_key,
percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
FROM firms.postal_codes
WHERE place_name IS NOT NULL
GROUP BY 1, 2
)
UPDATE firms.entities e
SET
lat = pc.lat,
lng = pc.lng,
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
geocode_source = 'uat_centroid',
geocode_score = 0.3,
geocoded_at = now(),
updated_at = now()
FROM loc_clean lc
JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key
WHERE e.cui = lc.cui AND e.lat IS NULL
RETURNING 1
" | wc -l)
log "[stage 3b] updated $n rows"
log "[stage 3c] uat_centroid by Comuna parent..."
n=$(psql -v ON_ERROR_STOP=1 -At -c "
WITH cand AS (
SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
),
loc_clean AS (
SELECT
cui,
upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
upper(unaccent(trim((regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1]))) AS loc_key
FROM cand
),
pc_agg AS (
SELECT
upper(unaccent(coalesce(county,''))) AS judet_key,
upper(unaccent(place_name)) AS loc_key,
percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
FROM firms.postal_codes
WHERE place_name IS NOT NULL
GROUP BY 1, 2
)
UPDATE firms.entities e
SET
lat = pc.lat,
lng = pc.lng,
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
geocode_source = 'uat_centroid',
geocode_score = 0.3,
geocoded_at = now(),
updated_at = now()
FROM loc_clean lc
JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key
WHERE e.cui = lc.cui AND e.lat IS NULL AND lc.loc_key IS NOT NULL
RETURNING 1
" | wc -l)
log "[stage 3c] updated $n rows"
log "[stage 3d] uat_centroid with â/î normalization (Oraş/Comuna/locality)..."
n=$(psql -v ON_ERROR_STOP=1 -At -c "
WITH cand AS (
SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
),
loc_norm AS (
SELECT
cui,
upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
upper(unaccent(translate(trim(coalesce(
(regexp_match(adr_localitate, 'Or[şs]\\.?\\s+([^,]+)', 'i'))[1],
(regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1],
regexp_replace(regexp_replace(adr_localitate, ',.*\$',''), '^(Sat|Loc\\.?)\\s+','','i')
)), 'îÎ', 'âÂ'))) AS loc_key
FROM cand
),
pc_agg AS (
SELECT
upper(unaccent(coalesce(county,''))) AS judet_key,
upper(unaccent(translate(place_name, 'îÎ','âÂ'))) AS loc_key,
percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
FROM firms.postal_codes
WHERE place_name IS NOT NULL
GROUP BY 1, 2
)
UPDATE firms.entities e
SET
lat = pc.lat,
lng = pc.lng,
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
geocode_source = 'uat_centroid',
geocode_score = 0.3,
geocoded_at = now(),
updated_at = now()
FROM loc_norm ln
JOIN pc_agg pc ON pc.judet_key = ln.judet_key AND pc.loc_key = ln.loc_key
WHERE e.cui = ln.cui AND e.lat IS NULL AND ln.loc_key IS NOT NULL
RETURNING 1
" | wc -l)
log "[stage 3d] updated $n rows"
# ── Stage 4: judet_centroid fallback ────────────────────────────────────────
log "[stage 4] judet_centroid (county median, last resort)..."
n=$(psql -v ON_ERROR_STOP=1 -At -c "
WITH judet_agg AS (
SELECT
upper(unaccent(coalesce(county,''))) AS judet_key,
percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
FROM firms.postal_codes
WHERE county IS NOT NULL
GROUP BY 1
)
UPDATE firms.entities e
SET
lat = ja.lat,
lng = ja.lng,
geom = ST_SetSRID(ST_MakePoint(ja.lng, ja.lat), 4326)::geography,
geocode_source = 'judet_centroid',
geocode_score = 0.1,
geocoded_at = now(),
updated_at = now()
FROM judet_agg ja
WHERE upper(unaccent(regexp_replace(e.adr_judet,'^MUNICIPIUL ',''))) = ja.judet_key
AND e.lat IS NULL
RETURNING 1
" | wc -l)
log "[stage 4] updated $n rows"
# ── Final stats ─────────────────────────────────────────────────────────────
log "Final stats:"
psql -A -F"|" -c "
SELECT
geocode_source,
count(*) AS rows
FROM firms.entities
GROUP BY geocode_source
ORDER BY rows DESC;
" 2>&1 | tee -a "$LOG"
residual=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;")
log "Residual WHERE lat IS NULL: $residual (out of reach — no address fields)"
log "=== Geocode-firms fallback chain done ==="
unset PGPASSWORD