#!/bin/bash # Run CUI-matching pass over external tables that have company names # but no CUI yet. Idempotent — only touches rows where cui IS NULL. # # Currently matches: # - fonduri.beneficiar_anunt (~41K names) # - fonduri.afir_plati (~316K distinct names) # # Future: ANI shareholdings, license registries, etc. — all use the same # firms.normalize_company_name() helper from sql/019_cui_matcher.sql. set -uo pipefail LOG=/var/log/vreaudigital-cui-match.log log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; } # Resolve DATABASE_URL via Infisical Machine Identity source /opt/vreaudigital/.infisical-mi TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \ --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain) DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \ --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \ -- sh -c 'echo "$DATABASE_URL"') DB=$(echo "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//') export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|') export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|') export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|') export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|') export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|') unset DBURL TOKEN DB log "=== CUI matcher started ===" # Apply schema (idempotent — generates name_normalized column + indexes) psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/019_cui_matcher.sql >/dev/null run_matcher() { local TABLE="$1" local NAME_COL="$2" local JUDET_COL="$3" # may be empty string if source has no judet local PRINTABLE="$4" local RUN_TRGM="${5:-true}" # set to "false" to skip Stages B+C # (e.g. AFIR direct payments where unmatched # rows are individual farmers, not companies) log "[$PRINTABLE] before: $(psql -At -c "SELECT COUNT(*) FILTER (WHERE cui IS NULL), COUNT(*) FROM $TABLE;" | tr '|' '/')" # Stage A: exact normalized match (unique). When multiple firms share the # same normalized name (homonyms), we skip — Stage B + judet handles them. log "[$PRINTABLE] Stage A: exact normalized match..." psql -v ON_ERROR_STOP=1 <&1 | tee -a "$LOG" WITH cand AS ( SELECT t.ctid AS row_ctid, firms.normalize_company_name(t.$NAME_COL) AS norm FROM $TABLE t WHERE t.cui IS NULL AND t.$NAME_COL IS NOT NULL ), matched AS ( SELECT c.row_ctid, MIN(e.cui) AS cui, COUNT(*) AS n FROM cand c JOIN firms.entities e ON e.name_normalized = c.norm GROUP BY c.row_ctid ) UPDATE $TABLE t SET cui = m.cui, cui_match_score = 1.0, cui_match_method = 'exact_norm', matched_at = now() FROM matched m WHERE t.ctid = m.row_ctid AND t.cui IS NULL AND m.n = 1; SQL log "[$PRINTABLE] Stage A done" # Stage B: pg_trgm similarity. Picks top candidate if score ≥ 0.85 AND # gap to second-best ≥ 0.10 (so we know it's unambiguously the best match). # # Performance: previously O(unmatched_rows × candidate_pool) at default # threshold 0.3 — 30+ min on AFIR (493K rows). Three-step pipeline now: # 1. Materialize unmatched rows (rowid + norm) into a temp table # 2. DISTINCT norms → much smaller trgm input set (BEN 13K→2K, AFIR 493K→274K) # 3. SET pg_trgm.similarity_threshold = 0.7 so the gin `%` operator returns # only candidates above the post-filter floor (drops fan-out by ~10×) # The 0.85/0.10 accept rule is unchanged and produces identical matches. if [ "$RUN_TRGM" != "true" ]; then log "[$PRINTABLE] Stage B/C skipped (RUN_TRGM=false) — unmatched rows in this source are individuals, not registered companies" log "[$PRINTABLE] after: $(psql -At -c " SELECT COUNT(*) FILTER (WHERE cui IS NULL), COUNT(*), ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%' FROM $TABLE;" | tr '|' '/')" return 0 fi log "[$PRINTABLE] Stage B: pg_trgm fuzzy (score ≥ 0.85, gap ≥ 0.10)..." psql -v ON_ERROR_STOP=1 <&1 | tee -a "$LOG" SET pg_trgm.similarity_threshold = 0.7; CREATE TEMP TABLE _sb_rows AS SELECT t.ctid AS rowid, firms.normalize_company_name(t.$NAME_COL) AS norm FROM $TABLE t WHERE t.cui IS NULL AND t.$NAME_COL IS NOT NULL AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5; CREATE INDEX ON _sb_rows (norm); ANALYZE _sb_rows; CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows; ANALYZE _sb_norms; CREATE TEMP TABLE _sb_resolved AS WITH ranked AS ( SELECT c.norm, e.cui, similarity(e.name_normalized, c.norm) AS sim, ROW_NUMBER() OVER ( PARTITION BY c.norm ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui ) AS rn FROM _sb_norms c JOIN firms.entities e ON e.name_normalized % c.norm ), top2 AS ( SELECT norm, MAX(sim) FILTER (WHERE rn = 1) AS s1, MAX(sim) FILTER (WHERE rn = 2) AS s2, MAX(cui) FILTER (WHERE rn = 1) AS cui1 FROM ranked WHERE rn <= 2 GROUP BY norm ) SELECT norm, cui1, s1 FROM top2 WHERE s1 >= 0.85 AND (s2 IS NULL OR (s1 - s2) >= 0.10); CREATE INDEX ON _sb_resolved (norm); ANALYZE _sb_resolved; UPDATE $TABLE t SET cui = r.cui1, cui_match_score = r.s1, cui_match_method = 'trgm_unique', matched_at = now() FROM _sb_rows rw JOIN _sb_resolved r ON rw.norm = r.norm WHERE t.ctid = rw.rowid AND t.cui IS NULL; DROP TABLE _sb_rows, _sb_norms, _sb_resolved; SQL log "[$PRINTABLE] Stage B done" # Stage C: judet disambiguation when source has a judet column. # Multiple candidates above 0.7 → prefer the one whose adr_judet matches. # Same dedup-by-(norm,judet) + SET threshold pipeline as Stage B. if [ -n "$JUDET_COL" ]; then log "[$PRINTABLE] Stage C: judet disambiguation..." psql -v ON_ERROR_STOP=1 <&1 | tee -a "$LOG" SET pg_trgm.similarity_threshold = 0.7; CREATE TEMP TABLE _sc_rows AS SELECT t.ctid AS rowid, firms.normalize_company_name(t.$NAME_COL) AS norm, firms.normalize_judet(t.$JUDET_COL) AS judet_norm FROM $TABLE t WHERE t.cui IS NULL AND t.$NAME_COL IS NOT NULL AND t.$JUDET_COL IS NOT NULL AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5; CREATE INDEX ON _sc_rows (norm, judet_norm); ANALYZE _sc_rows; CREATE TEMP TABLE _sc_keys AS SELECT DISTINCT norm, judet_norm FROM _sc_rows; ANALYZE _sc_keys; CREATE TEMP TABLE _sc_resolved AS WITH ranked AS ( SELECT c.norm, c.judet_norm, e.cui, similarity(e.name_normalized, c.norm) AS sim, (firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match FROM _sc_keys c JOIN firms.entities e ON e.name_normalized % c.norm ), pick AS ( SELECT DISTINCT ON (norm, judet_norm) norm, judet_norm, cui, sim FROM ranked WHERE judet_match ORDER BY norm, judet_norm, sim DESC, cui ) SELECT * FROM pick WHERE sim >= 0.7; CREATE INDEX ON _sc_resolved (norm, judet_norm); ANALYZE _sc_resolved; UPDATE $TABLE t SET cui = r.cui, cui_match_score = r.sim, cui_match_method = 'trgm_judet', matched_at = now() FROM _sc_rows rw JOIN _sc_resolved r ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm WHERE t.ctid = rw.rowid AND t.cui IS NULL; DROP TABLE _sc_rows, _sc_keys, _sc_resolved; SQL log "[$PRINTABLE] Stage C done" fi log "[$PRINTABLE] after: $(psql -At -c " SELECT COUNT(*) FILTER (WHERE cui IS NULL), COUNT(*), ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%' FROM $TABLE;" | tr '|' '/')" log "[$PRINTABLE] by method:" psql -At -F'|' -c " SELECT cui_match_method, COUNT(*) FROM $TABLE GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG" } run_matcher "fonduri.beneficiar_anunt" "beneficiar_name" "beneficiar_judet" "BEN_PRIVAT" true # AFIR: skip trgm — unmatched rows are individual farmers (popa gheorghe, # radu vasile, …) receiving FEADR direct payments. They have no CUI and # never appear in firms.entities (private company registry). Running trgm # on 274K distinct names against 4M entities would take 30+ hours for ~0 gain. run_matcher "fonduri.afir_plati" "beneficiar_name" "localitate" "AFIR" false log "=== CUI matcher done ==="