a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
238 lines
8.9 KiB
Bash
Executable File
238 lines
8.9 KiB
Bash
Executable File
#!/bin/bash
|
||
# Run CUI-matching pass over external tables that have company names
|
||
# but no CUI yet. Idempotent — only touches rows where cui IS NULL.
|
||
#
|
||
# Currently matches:
|
||
# - fonduri.beneficiar_anunt (~41K names)
|
||
# - fonduri.afir_plati (~316K distinct names)
|
||
#
|
||
# Future: ANI shareholdings, license registries, etc. — all use the same
|
||
# firms.normalize_company_name() helper from sql/019_cui_matcher.sql.
|
||
|
||
set -uo pipefail
|
||
|
||
LOG=/var/log/vreaudigital-cui-match.log
|
||
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
|
||
|
||
# Resolve DATABASE_URL via Infisical Machine Identity
|
||
source /opt/vreaudigital/.infisical-mi
|
||
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
|
||
--client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain)
|
||
DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
|
||
--env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
|
||
-- sh -c 'echo "$DATABASE_URL"')
|
||
DB=$(echo "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
|
||
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
|
||
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
|
||
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
|
||
export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
|
||
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
|
||
unset DBURL TOKEN DB
|
||
|
||
log "=== CUI matcher started ==="
|
||
|
||
# Apply schema (idempotent — generates name_normalized column + indexes)
|
||
psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/019_cui_matcher.sql >/dev/null
|
||
|
||
run_matcher() {
|
||
local TABLE="$1"
|
||
local NAME_COL="$2"
|
||
local JUDET_COL="$3" # may be empty string if source has no judet
|
||
local PRINTABLE="$4"
|
||
local RUN_TRGM="${5:-true}" # set to "false" to skip Stages B+C
|
||
# (e.g. AFIR direct payments where unmatched
|
||
# rows are individual farmers, not companies)
|
||
|
||
log "[$PRINTABLE] before: $(psql -At -c "SELECT COUNT(*) FILTER (WHERE cui IS NULL), COUNT(*) FROM $TABLE;" | tr '|' '/')"
|
||
|
||
# Stage A: exact normalized match (unique). When multiple firms share the
|
||
# same normalized name (homonyms), we skip — Stage B + judet handles them.
|
||
log "[$PRINTABLE] Stage A: exact normalized match..."
|
||
psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
|
||
WITH cand AS (
|
||
SELECT t.ctid AS row_ctid,
|
||
firms.normalize_company_name(t.$NAME_COL) AS norm
|
||
FROM $TABLE t
|
||
WHERE t.cui IS NULL
|
||
AND t.$NAME_COL IS NOT NULL
|
||
),
|
||
matched AS (
|
||
SELECT c.row_ctid,
|
||
MIN(e.cui) AS cui,
|
||
COUNT(*) AS n
|
||
FROM cand c
|
||
JOIN firms.entities e ON e.name_normalized = c.norm
|
||
GROUP BY c.row_ctid
|
||
)
|
||
UPDATE $TABLE t
|
||
SET cui = m.cui,
|
||
cui_match_score = 1.0,
|
||
cui_match_method = 'exact_norm',
|
||
matched_at = now()
|
||
FROM matched m
|
||
WHERE t.ctid = m.row_ctid
|
||
AND t.cui IS NULL
|
||
AND m.n = 1;
|
||
SQL
|
||
log "[$PRINTABLE] Stage A done"
|
||
|
||
# Stage B: pg_trgm similarity. Picks top candidate if score ≥ 0.85 AND
|
||
# gap to second-best ≥ 0.10 (so we know it's unambiguously the best match).
|
||
#
|
||
# Performance: previously O(unmatched_rows × candidate_pool) at default
|
||
# threshold 0.3 — 30+ min on AFIR (493K rows). Three-step pipeline now:
|
||
# 1. Materialize unmatched rows (rowid + norm) into a temp table
|
||
# 2. DISTINCT norms → much smaller trgm input set (BEN 13K→2K, AFIR 493K→274K)
|
||
# 3. SET pg_trgm.similarity_threshold = 0.7 so the gin `%` operator returns
|
||
# only candidates above the post-filter floor (drops fan-out by ~10×)
|
||
# The 0.85/0.10 accept rule is unchanged and produces identical matches.
|
||
if [ "$RUN_TRGM" != "true" ]; then
|
||
log "[$PRINTABLE] Stage B/C skipped (RUN_TRGM=false) — unmatched rows in this source are individuals, not registered companies"
|
||
log "[$PRINTABLE] after: $(psql -At -c "
|
||
SELECT COUNT(*) FILTER (WHERE cui IS NULL),
|
||
COUNT(*),
|
||
ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%'
|
||
FROM $TABLE;" | tr '|' '/')"
|
||
return 0
|
||
fi
|
||
|
||
log "[$PRINTABLE] Stage B: pg_trgm fuzzy (score ≥ 0.85, gap ≥ 0.10)..."
|
||
psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
|
||
SET pg_trgm.similarity_threshold = 0.7;
|
||
|
||
CREATE TEMP TABLE _sb_rows AS
|
||
SELECT t.ctid AS rowid,
|
||
firms.normalize_company_name(t.$NAME_COL) AS norm
|
||
FROM $TABLE t
|
||
WHERE t.cui IS NULL
|
||
AND t.$NAME_COL IS NOT NULL
|
||
AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5;
|
||
CREATE INDEX ON _sb_rows (norm);
|
||
ANALYZE _sb_rows;
|
||
|
||
CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
|
||
ANALYZE _sb_norms;
|
||
|
||
CREATE TEMP TABLE _sb_resolved AS
|
||
WITH ranked AS (
|
||
SELECT c.norm,
|
||
e.cui,
|
||
similarity(e.name_normalized, c.norm) AS sim,
|
||
ROW_NUMBER() OVER (
|
||
PARTITION BY c.norm
|
||
ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
|
||
) AS rn
|
||
FROM _sb_norms c
|
||
JOIN firms.entities e ON e.name_normalized % c.norm
|
||
),
|
||
top2 AS (
|
||
SELECT norm,
|
||
MAX(sim) FILTER (WHERE rn = 1) AS s1,
|
||
MAX(sim) FILTER (WHERE rn = 2) AS s2,
|
||
MAX(cui) FILTER (WHERE rn = 1) AS cui1
|
||
FROM ranked WHERE rn <= 2
|
||
GROUP BY norm
|
||
)
|
||
SELECT norm, cui1, s1
|
||
FROM top2
|
||
WHERE s1 >= 0.85
|
||
AND (s2 IS NULL OR (s1 - s2) >= 0.10);
|
||
CREATE INDEX ON _sb_resolved (norm);
|
||
ANALYZE _sb_resolved;
|
||
|
||
UPDATE $TABLE t
|
||
SET cui = r.cui1,
|
||
cui_match_score = r.s1,
|
||
cui_match_method = 'trgm_unique',
|
||
matched_at = now()
|
||
FROM _sb_rows rw
|
||
JOIN _sb_resolved r ON rw.norm = r.norm
|
||
WHERE t.ctid = rw.rowid
|
||
AND t.cui IS NULL;
|
||
|
||
DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
|
||
SQL
|
||
log "[$PRINTABLE] Stage B done"
|
||
|
||
# Stage C: judet disambiguation when source has a judet column.
|
||
# Multiple candidates above 0.7 → prefer the one whose adr_judet matches.
|
||
# Same dedup-by-(norm,judet) + SET threshold pipeline as Stage B.
|
||
if [ -n "$JUDET_COL" ]; then
|
||
log "[$PRINTABLE] Stage C: judet disambiguation..."
|
||
psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
|
||
SET pg_trgm.similarity_threshold = 0.7;
|
||
|
||
CREATE TEMP TABLE _sc_rows AS
|
||
SELECT t.ctid AS rowid,
|
||
firms.normalize_company_name(t.$NAME_COL) AS norm,
|
||
firms.normalize_judet(t.$JUDET_COL) AS judet_norm
|
||
FROM $TABLE t
|
||
WHERE t.cui IS NULL
|
||
AND t.$NAME_COL IS NOT NULL
|
||
AND t.$JUDET_COL IS NOT NULL
|
||
AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5;
|
||
CREATE INDEX ON _sc_rows (norm, judet_norm);
|
||
ANALYZE _sc_rows;
|
||
|
||
CREATE TEMP TABLE _sc_keys AS
|
||
SELECT DISTINCT norm, judet_norm FROM _sc_rows;
|
||
ANALYZE _sc_keys;
|
||
|
||
CREATE TEMP TABLE _sc_resolved AS
|
||
WITH ranked AS (
|
||
SELECT c.norm,
|
||
c.judet_norm,
|
||
e.cui,
|
||
similarity(e.name_normalized, c.norm) AS sim,
|
||
(firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
|
||
FROM _sc_keys c
|
||
JOIN firms.entities e ON e.name_normalized % c.norm
|
||
),
|
||
pick AS (
|
||
SELECT DISTINCT ON (norm, judet_norm)
|
||
norm, judet_norm, cui, sim
|
||
FROM ranked
|
||
WHERE judet_match
|
||
ORDER BY norm, judet_norm, sim DESC, cui
|
||
)
|
||
SELECT * FROM pick WHERE sim >= 0.7;
|
||
CREATE INDEX ON _sc_resolved (norm, judet_norm);
|
||
ANALYZE _sc_resolved;
|
||
|
||
UPDATE $TABLE t
|
||
SET cui = r.cui,
|
||
cui_match_score = r.sim,
|
||
cui_match_method = 'trgm_judet',
|
||
matched_at = now()
|
||
FROM _sc_rows rw
|
||
JOIN _sc_resolved r
|
||
ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
|
||
WHERE t.ctid = rw.rowid
|
||
AND t.cui IS NULL;
|
||
|
||
DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
|
||
SQL
|
||
log "[$PRINTABLE] Stage C done"
|
||
fi
|
||
|
||
log "[$PRINTABLE] after: $(psql -At -c "
|
||
SELECT COUNT(*) FILTER (WHERE cui IS NULL),
|
||
COUNT(*),
|
||
ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%'
|
||
FROM $TABLE;" | tr '|' '/')"
|
||
log "[$PRINTABLE] by method:"
|
||
psql -At -F'|' -c "
|
||
SELECT cui_match_method, COUNT(*)
|
||
FROM $TABLE
|
||
GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"
|
||
}
|
||
|
||
run_matcher "fonduri.beneficiar_anunt" "beneficiar_name" "beneficiar_judet" "BEN_PRIVAT" true
|
||
# AFIR: skip trgm — unmatched rows are individual farmers (popa gheorghe,
|
||
# radu vasile, …) receiving FEADR direct payments. They have no CUI and
|
||
# never appear in firms.entities (private company registry). Running trgm
|
||
# on 274K distinct names against 4M entities would take 30+ hours for ~0 gain.
|
||
run_matcher "fonduri.afir_plati" "beneficiar_name" "localitate" "AFIR" false
|
||
|
||
log "=== CUI matcher done ==="
|