a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
344 lines
13 KiB
Bash
Executable File
344 lines
13 KiB
Bash
Executable File
#!/bin/bash
|
|
# Full geocoding fallback chain for firms.entities (WHERE lat IS NULL).
|
|
#
|
|
# Re-runnable / idempotent. Filters every stage on `lat IS NULL` so re-runs
|
|
# are no-ops once coverage is full. Safe to call after any ONRC fresh import
|
|
# (import-onrc-fresh.sh) which by itself does NOT geocode new rows.
|
|
#
|
|
# Stage chain (highest accuracy first):
|
|
# 1. geonames_postal — exact 6-digit RO postal match against firms.postal_codes_best
|
|
# 2. uat_centroid — by siruta → public."GisUat" polygon centroid
|
|
# 3. photon — Komoot Photon OSM geocoder (local 127.0.0.1:2322), street-level
|
|
# 3b/3c/3d. uat_centroid by postal_codes (locality+county median) — for rows w/o
|
|
# adr_strada (Photon's filter requires it). Tries locality token,
|
|
# then Comuna parent, then â/î normalization.
|
|
# 4. judet_centroid — last resort, county median from firms.postal_codes
|
|
#
|
|
# Two rows in the entire dataset have literally zero address fields and stay NULL.
|
|
#
|
|
# Usage:
|
|
# sudo /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh
|
|
# sudo SKIP_PHOTON=1 /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh
|
|
#
|
|
# Env:
|
|
# SKIP_PHOTON=1 — skip stage 3 (photon docker) — useful when Photon down
|
|
# PHOTON_CONCURRENCY=40
|
|
# PHOTON_BATCH=200
|
|
|
|
set -euo pipefail
|
|
|
|
LOG=/var/log/vreaudigital-geocode-firms.log
|
|
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
|
|
SEAP_DIR="$(dirname "$SCRIPT_DIR")"
|
|
|
|
SKIP_PHOTON="${SKIP_PHOTON:-0}"
|
|
PHOTON_CONCURRENCY="${PHOTON_CONCURRENCY:-40}"
|
|
PHOTON_BATCH="${PHOTON_BATCH:-200}"
|
|
|
|
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
|
|
|
|
log "=== Geocode-firms fallback chain started ==="
|
|
|
|
if [ ! -f /opt/vreaudigital/.infisical-mi ]; then
|
|
log "FATAL: /opt/vreaudigital/.infisical-mi missing"
|
|
exit 1
|
|
fi
|
|
|
|
# shellcheck disable=SC1091
|
|
source /opt/vreaudigital/.infisical-mi
|
|
TOKEN=$(infisical login --method=universal-auth \
|
|
--domain="$INFISICAL_API_URL" \
|
|
--client-id="$INFISICAL_CLIENT_ID" \
|
|
--client-secret="$INFISICAL_CLIENT_SECRET" \
|
|
--silent --plain)
|
|
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
|
|
--projectId="$INFISICAL_PROJECT_ID" \
|
|
--env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
|
|
--silent --token="$TOKEN" \
|
|
-- sh -c 'echo "$DATABASE_URL"')
|
|
DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
|
|
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
|
|
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
|
|
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
|
|
export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
|
|
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
|
|
|
|
initial_null=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;")
|
|
log "Initial WHERE lat IS NULL count: $initial_null"
|
|
|
|
if [ "$initial_null" = "0" ]; then
|
|
log "Nothing to do — no firms with NULL lat."
|
|
unset DATABASE_URL TOKEN DB PGPASSWORD
|
|
exit 0
|
|
fi
|
|
|
|
# ── Stage 1: geonames_postal ────────────────────────────────────────────────
|
|
log "[stage 1] geonames_postal (exact 6-digit postal match)..."
|
|
n=$(psql -v ON_ERROR_STOP=1 -At -c "
|
|
WITH cand AS (
|
|
SELECT e.cui FROM firms.entities e
|
|
WHERE e.lat IS NULL
|
|
AND e.adr_cod_postal ~ '^[0-9]{6}\$'
|
|
AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
|
|
)
|
|
UPDATE firms.entities e
|
|
SET
|
|
lat = pc.lat::double precision,
|
|
lng = pc.lng::double precision,
|
|
geom = ST_SetSRID(ST_MakePoint(pc.lng::double precision, pc.lat::double precision), 4326)::geography,
|
|
geocode_source = 'geonames_postal',
|
|
geocode_score = 0.6,
|
|
geocoded_at = now(),
|
|
updated_at = now()
|
|
FROM firms.postal_codes_best pc, cand
|
|
WHERE e.cui = cand.cui
|
|
AND e.adr_cod_postal = pc.postal_code
|
|
AND e.lat IS NULL
|
|
RETURNING 1
|
|
" | wc -l)
|
|
log "[stage 1] updated $n rows"
|
|
|
|
# ── Stage 2: uat_centroid by siruta ─────────────────────────────────────────
|
|
log "[stage 2] uat_centroid (via siruta → GisUat polygon centroid)..."
|
|
n=$(psql -v ON_ERROR_STOP=1 -At -c "
|
|
WITH cand AS (
|
|
SELECT e.cui FROM firms.entities e
|
|
WHERE e.lat IS NULL
|
|
AND e.siruta IS NOT NULL
|
|
AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta)
|
|
)
|
|
UPDATE firms.entities e
|
|
SET
|
|
lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
|
|
lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
|
|
geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
|
|
geocode_source = 'uat_centroid',
|
|
geocode_score = 0.3,
|
|
geocoded_at = now(),
|
|
updated_at = now()
|
|
FROM public.\"GisUat\" gu, cand
|
|
WHERE e.cui = cand.cui
|
|
AND e.siruta = gu.siruta
|
|
AND e.lat IS NULL
|
|
RETURNING 1
|
|
" | wc -l)
|
|
log "[stage 2] updated $n rows"
|
|
|
|
# ── Stage 3: photon (docker) ────────────────────────────────────────────────
|
|
if [ "$SKIP_PHOTON" = "1" ]; then
|
|
log "[stage 3] SKIP_PHOTON=1 — skipping photon stage"
|
|
else
|
|
remaining_photon=$(psql -At -c "
|
|
SELECT count(*) FROM firms.entities
|
|
WHERE geocode_source IS NULL
|
|
AND adr_strada IS NOT NULL
|
|
AND adr_judet IS NOT NULL
|
|
")
|
|
if [ "$remaining_photon" = "0" ]; then
|
|
log "[stage 3] no photon-eligible rows — skipping"
|
|
else
|
|
log "[stage 3] photon — $remaining_photon candidates..."
|
|
if docker ps --filter name=vreaudigital-geocode --format '{{.Names}}' | grep -q '^vreaudigital-geocode$'; then
|
|
log "WARN: vreaudigital-geocode already running — skipping stage 3"
|
|
else
|
|
docker rm -f vreaudigital-geocode 2>/dev/null || true
|
|
umask 077
|
|
ENVF=$(mktemp /tmp/.vreaudigital-geocode-env.XXXXXX)
|
|
printf 'DATABASE_URL=%s\nPHOTON_URL=http://127.0.0.1:2322\n' \
|
|
"$DATABASE_URL" > "$ENVF"
|
|
cd "$SEAP_DIR"
|
|
CID=$(docker run -d \
|
|
--name vreaudigital-geocode \
|
|
--network host \
|
|
--env-file "$ENVF" \
|
|
-v "$(pwd):/work" -w /work \
|
|
--user "$(id -u):$(id -g)" \
|
|
--restart no \
|
|
node:22-alpine \
|
|
sh -c "npx tsx src/geocode-photon.ts --concurrency=$PHOTON_CONCURRENCY --batch=$PHOTON_BATCH")
|
|
log "container started: $CID"
|
|
sleep 3
|
|
rm -f "$ENVF"
|
|
docker wait vreaudigital-geocode >/dev/null
|
|
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-geocode 2>/dev/null || echo "?")
|
|
docker logs vreaudigital-geocode 2>&1 | tail -10 | tee -a "$LOG"
|
|
log "[stage 3] photon container exit=$EXIT_CODE"
|
|
fi
|
|
fi
|
|
fi
|
|
|
|
unset DATABASE_URL TOKEN DB
|
|
|
|
# ── Stage 3b/3c/3d: uat_centroid by name (no siruta, no postal) ─────────────
|
|
# For rows w/o adr_strada (skipped by photon) match postal_codes locality+county
|
|
# median. Three normalization variants try locality token, comuna parent, and
|
|
# Romanian â/î diacritic normalization.
|
|
log "[stage 3b] uat_centroid by postal_codes locality+county median (locality token)..."
|
|
n=$(psql -v ON_ERROR_STOP=1 -At -c "
|
|
WITH cand AS (
|
|
SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
|
|
WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
|
|
),
|
|
loc_clean AS (
|
|
SELECT
|
|
cui,
|
|
upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
|
|
upper(unaccent(trim(regexp_replace(
|
|
regexp_replace(adr_localitate, ',.*\$', ''),
|
|
'^(Sat|Or[şs]\\.?|Mun\\.?|Loc\\.?|Cartier|Comuna)\\s+', '', 'i'
|
|
)))) AS loc_key
|
|
FROM cand
|
|
),
|
|
pc_agg AS (
|
|
SELECT
|
|
upper(unaccent(coalesce(county,''))) AS judet_key,
|
|
upper(unaccent(place_name)) AS loc_key,
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
|
|
FROM firms.postal_codes
|
|
WHERE place_name IS NOT NULL
|
|
GROUP BY 1, 2
|
|
)
|
|
UPDATE firms.entities e
|
|
SET
|
|
lat = pc.lat,
|
|
lng = pc.lng,
|
|
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
|
|
geocode_source = 'uat_centroid',
|
|
geocode_score = 0.3,
|
|
geocoded_at = now(),
|
|
updated_at = now()
|
|
FROM loc_clean lc
|
|
JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key
|
|
WHERE e.cui = lc.cui AND e.lat IS NULL
|
|
RETURNING 1
|
|
" | wc -l)
|
|
log "[stage 3b] updated $n rows"
|
|
|
|
log "[stage 3c] uat_centroid by Comuna parent..."
|
|
n=$(psql -v ON_ERROR_STOP=1 -At -c "
|
|
WITH cand AS (
|
|
SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
|
|
WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
|
|
),
|
|
loc_clean AS (
|
|
SELECT
|
|
cui,
|
|
upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
|
|
upper(unaccent(trim((regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1]))) AS loc_key
|
|
FROM cand
|
|
),
|
|
pc_agg AS (
|
|
SELECT
|
|
upper(unaccent(coalesce(county,''))) AS judet_key,
|
|
upper(unaccent(place_name)) AS loc_key,
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
|
|
FROM firms.postal_codes
|
|
WHERE place_name IS NOT NULL
|
|
GROUP BY 1, 2
|
|
)
|
|
UPDATE firms.entities e
|
|
SET
|
|
lat = pc.lat,
|
|
lng = pc.lng,
|
|
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
|
|
geocode_source = 'uat_centroid',
|
|
geocode_score = 0.3,
|
|
geocoded_at = now(),
|
|
updated_at = now()
|
|
FROM loc_clean lc
|
|
JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key
|
|
WHERE e.cui = lc.cui AND e.lat IS NULL AND lc.loc_key IS NOT NULL
|
|
RETURNING 1
|
|
" | wc -l)
|
|
log "[stage 3c] updated $n rows"
|
|
|
|
log "[stage 3d] uat_centroid with â/î normalization (Oraş/Comuna/locality)..."
|
|
n=$(psql -v ON_ERROR_STOP=1 -At -c "
|
|
WITH cand AS (
|
|
SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
|
|
WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
|
|
),
|
|
loc_norm AS (
|
|
SELECT
|
|
cui,
|
|
upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
|
|
upper(unaccent(translate(trim(coalesce(
|
|
(regexp_match(adr_localitate, 'Or[şs]\\.?\\s+([^,]+)', 'i'))[1],
|
|
(regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1],
|
|
regexp_replace(regexp_replace(adr_localitate, ',.*\$',''), '^(Sat|Loc\\.?)\\s+','','i')
|
|
)), 'îÎ', 'âÂ'))) AS loc_key
|
|
FROM cand
|
|
),
|
|
pc_agg AS (
|
|
SELECT
|
|
upper(unaccent(coalesce(county,''))) AS judet_key,
|
|
upper(unaccent(translate(place_name, 'îÎ','âÂ'))) AS loc_key,
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
|
|
FROM firms.postal_codes
|
|
WHERE place_name IS NOT NULL
|
|
GROUP BY 1, 2
|
|
)
|
|
UPDATE firms.entities e
|
|
SET
|
|
lat = pc.lat,
|
|
lng = pc.lng,
|
|
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
|
|
geocode_source = 'uat_centroid',
|
|
geocode_score = 0.3,
|
|
geocoded_at = now(),
|
|
updated_at = now()
|
|
FROM loc_norm ln
|
|
JOIN pc_agg pc ON pc.judet_key = ln.judet_key AND pc.loc_key = ln.loc_key
|
|
WHERE e.cui = ln.cui AND e.lat IS NULL AND ln.loc_key IS NOT NULL
|
|
RETURNING 1
|
|
" | wc -l)
|
|
log "[stage 3d] updated $n rows"
|
|
|
|
# ── Stage 4: judet_centroid fallback ────────────────────────────────────────
|
|
log "[stage 4] judet_centroid (county median, last resort)..."
|
|
n=$(psql -v ON_ERROR_STOP=1 -At -c "
|
|
WITH judet_agg AS (
|
|
SELECT
|
|
upper(unaccent(coalesce(county,''))) AS judet_key,
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
|
|
FROM firms.postal_codes
|
|
WHERE county IS NOT NULL
|
|
GROUP BY 1
|
|
)
|
|
UPDATE firms.entities e
|
|
SET
|
|
lat = ja.lat,
|
|
lng = ja.lng,
|
|
geom = ST_SetSRID(ST_MakePoint(ja.lng, ja.lat), 4326)::geography,
|
|
geocode_source = 'judet_centroid',
|
|
geocode_score = 0.1,
|
|
geocoded_at = now(),
|
|
updated_at = now()
|
|
FROM judet_agg ja
|
|
WHERE upper(unaccent(regexp_replace(e.adr_judet,'^MUNICIPIUL ',''))) = ja.judet_key
|
|
AND e.lat IS NULL
|
|
RETURNING 1
|
|
" | wc -l)
|
|
log "[stage 4] updated $n rows"
|
|
|
|
# ── Final stats ─────────────────────────────────────────────────────────────
|
|
log "Final stats:"
|
|
psql -A -F"|" -c "
|
|
SELECT
|
|
geocode_source,
|
|
count(*) AS rows
|
|
FROM firms.entities
|
|
GROUP BY geocode_source
|
|
ORDER BY rows DESC;
|
|
" 2>&1 | tee -a "$LOG"
|
|
|
|
residual=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;")
|
|
log "Residual WHERE lat IS NULL: $residual (out of reach — no address fields)"
|
|
log "=== Geocode-firms fallback chain done ==="
|
|
|
|
unset PGPASSWORD
|