#!/bin/bash
# Fuzzy-match anre.licente.titular_name → firms.entities.cui via the
# same Stage A (exact normalized) + Stage B (pg_trgm unique-pick) + Stage C
# (judet disambiguation) pipeline as cron/match-cui-external.sh.
#
# Idempotent — only touches rows where titular_cui IS NULL.
#
# anre.licente has its own column names (titular_cui not cui), so we have
# a dedicated wrapper here. Same SQL approach, different column names.

# `-e` is not enabled, so a failing command does not abort the script by
# itself; credential resolution below is therefore checked explicitly.
set -uo pipefail

LOG=/var/log/vreaudigital-cui-match-anre.log

# Print a timestamped message to stdout and append it to $LOG.
#   $1 - message text
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Log a fatal message and abort the run with exit status 1.
#   $1 - message text (must never contain secrets)
die() { log "FATAL: $1"; exit 1; }

# Resolve DATABASE_URL via Infisical Machine Identity
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi \
  || die "cannot source /opt/vreaudigital/.infisical-mi"

# Every INFISICAL_* value used below must be present; without this check a
# missing one would only fail inside a command substitution and leave the
# script running with empty credentials.
for required_var in INFISICAL_API_URL INFISICAL_CLIENT_ID INFISICAL_CLIENT_SECRET \
                    INFISICAL_PROJECT_ID INFISICAL_ENV INFISICAL_PATH; do
  [[ -n "${!required_var:-}" ]] || die "$required_var is not set"
done
unset required_var

# NOTE(review): the client secret is passed on argv and is visible in `ps`
# while the login runs — confirm whether the CLI can read it from the
# environment instead.
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain) \
  || die "infisical login failed"
[[ -n "$TOKEN" ]] || die "infisical login returned an empty token"

DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') \
  || die "infisical run failed while reading DATABASE_URL"
[[ -n "$DBURL" ]] || die "DATABASE_URL resolved to an empty value"

# Split postgresql://user:password@host[:port]/database[?params] into the
# PG* variables psql reads. A single anchored match replaces the per-field
# sed extractions, which echoed the whole URL (password included) into a
# field whenever their pattern did not match, e.g. PGPORT for a URL with no
# explicit port. The query string (such as ?schema=...) is ignored entirely,
# so it cannot leak into PGDATABASE.
# Groups: 2=user 3=password 4=host 6=port (optional) 7=database.
DBURL_RE='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/?]+)(:([0-9]+))?/([^?]+)'
[[ "$DBURL" =~ $DBURL_RE ]] \
  || die "DATABASE_URL is not of the form postgresql://user:password@host[:port]/database"

export PGUSER="${BASH_REMATCH[2]}"
export PGPASSWORD="${BASH_REMATCH[3]}"
export PGHOST="${BASH_REMATCH[4]}"
# 5432 is the conventional PostgreSQL port; used when the URL names none.
export PGPORT="${BASH_REMATCH[6]:-5432}"
export PGDATABASE="${BASH_REMATCH[7]}"
unset DBURL TOKEN DBURL_RE

log "=== ANRE CUI matcher started ==="

# Snapshot "<unmatched>/<total>" row counts before any stage runs.
BEFORE=$(psql -At -c "SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' || COUNT(*) FROM anre.licente;")
log "before: $BEFORE"

# Pre-step: populate titular_name_norm for all rows where it's NULL.
log "pre-step: populating titular_name_norm..."
# Pre-step body: fill titular_name_norm from titular_name using
# firms.normalize_company_name(), only for rows where the normalized column
# is still NULL and a name exists.
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
UPDATE anre.licente
   SET titular_name_norm = firms.normalize_company_name(titular_name)
 WHERE titular_name_norm IS NULL AND titular_name IS NOT NULL;
SQL
# PIPESTATUS[0] is psql's own exit status (tee's is irrelevant here). The
# script keeps going on failure, but the log must say so explicitly.
prestep_rc=${PIPESTATUS[0]}
if (( prestep_rc != 0 )); then
  log "pre-step FAILED (psql exit $prestep_rc); continuing"
fi

# Stage A: exact normalized match (unique only).
#   cand    - rows still lacking titular_cui that have a normalized name
#   matched - per row: MIN(cui) and the number of firms.entities rows whose
#             name_normalized equals the row's normalized name
# Only rows with exactly one matching entity (n = 1) are updated; they get
# score 1.0 and method 'exact_norm'.
log "Stage A: exact normalized match..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
WITH cand AS (
  SELECT t.id AS row_id, t.titular_name_norm AS norm
  FROM anre.licente t
  WHERE t.titular_cui IS NULL AND t.titular_name_norm IS NOT NULL
),
matched AS (
  SELECT c.row_id, MIN(e.cui) AS cui, COUNT(*) AS n
  FROM cand c
  JOIN firms.entities e ON e.name_normalized = c.norm
  GROUP BY c.row_id
)
UPDATE anre.licente t
   SET titular_cui = m.cui,
       cui_match_score = 1.0,
       cui_match_method = 'exact_norm',
       matched_at = now()
  FROM matched m
 WHERE t.id = m.row_id AND t.titular_cui IS NULL AND m.n = 1;
SQL
stage_a_rc=${PIPESTATUS[0]}
if (( stage_a_rc == 0 )); then
  log "Stage A done"
else
  # Previously "Stage A done" was logged even when psql had failed.
  log "Stage A FAILED (psql exit $stage_a_rc); continuing with next stage"
fi

# Stage B: pg_trgm fuzzy. Same SET threshold 0.7 + 0.85/0.10 accept rule
# as match-cui-external.sh.
log "Stage B: pg_trgm fuzzy (score >= 0.85, gap >= 0.10)..."
# Stage B body, run in one psql session so the temp tables are shared:
#   _sb_rows     - unmatched rows whose normalized name has length >= 5
#   _sb_norms    - the distinct normalized names among those rows
#   _sb_resolved - per name, the best trigram candidate (cui1, s1), kept only
#                  when s1 >= 0.85 and either there is no second candidate or
#                  the runner-up trails by at least 0.10
# Candidates come from the pg_trgm `%` operator under the 0.7 threshold set
# at the top; ties in similarity are ordered by cui. Matched rows get the
# similarity as score and method 'trgm_unique'.
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

CREATE TEMP TABLE _sb_rows AS
SELECT t.id AS rowid, t.titular_name_norm AS norm
FROM anre.licente t
WHERE t.titular_cui IS NULL
  AND t.titular_name_norm IS NOT NULL
  AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sb_rows (norm);
ANALYZE _sb_rows;

CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
ANALYZE _sb_norms;

CREATE TEMP TABLE _sb_resolved AS
WITH ranked AS (
  SELECT c.norm, e.cui,
         similarity(e.name_normalized, c.norm) AS sim,
         ROW_NUMBER() OVER (
           PARTITION BY c.norm
           ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
         ) AS rn
  FROM _sb_norms c
  JOIN firms.entities e ON e.name_normalized % c.norm
),
top2 AS (
  SELECT norm,
         MAX(sim) FILTER (WHERE rn = 1) AS s1,
         MAX(sim) FILTER (WHERE rn = 2) AS s2,
         MAX(cui) FILTER (WHERE rn = 1) AS cui1
  FROM ranked
  WHERE rn <= 2
  GROUP BY norm
)
SELECT norm, cui1, s1
FROM top2
WHERE s1 >= 0.85 AND (s2 IS NULL OR (s1 - s2) >= 0.10);
CREATE INDEX ON _sb_resolved (norm);
ANALYZE _sb_resolved;

UPDATE anre.licente t
   SET titular_cui = r.cui1,
       cui_match_score = r.s1,
       cui_match_method = 'trgm_unique',
       matched_at = now()
  FROM _sb_rows rw
  JOIN _sb_resolved r ON rw.norm = r.norm
 WHERE t.id = rw.rowid AND t.titular_cui IS NULL;

DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
SQL
# PIPESTATUS[0] is psql's own exit status (tee's is irrelevant here). The
# script keeps going on failure, but the log must say so explicitly.
stage_b_rc=${PIPESTATUS[0]}
if (( stage_b_rc == 0 )); then
  log "Stage B done"
else
  # Previously "Stage B done" was logged even when psql had failed.
  log "Stage B FAILED (psql exit $stage_b_rc); continuing with next stage"
fi

# Stage C: judet disambiguation when there are multiple trgm candidates.
log "Stage C: judet disambiguation..."
# Stage C body, run in one psql session so the temp tables are shared:
#   _sc_rows     - still-unmatched rows with a judet and a normalized name of
#                  length >= 5, plus firms.normalize_judet(judet)
#   _sc_keys     - the distinct (norm, judet_norm) pairs among those rows
#   _sc_resolved - per pair, the highest-similarity trigram candidate whose
#                  normalized adr_judet equals the row's judet (ties ordered
#                  by cui), kept when sim >= 0.7
# NOTE(review): unlike Stage B there is no runner-up gap check here; the top
# candidate inside the judet is accepted as-is — confirm this is intended.
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

CREATE TEMP TABLE _sc_rows AS
SELECT t.id AS rowid, t.titular_name_norm AS norm,
       firms.normalize_judet(t.judet) AS judet_norm
FROM anre.licente t
WHERE t.titular_cui IS NULL
  AND t.titular_name_norm IS NOT NULL
  AND t.judet IS NOT NULL
  AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sc_rows (norm, judet_norm);
ANALYZE _sc_rows;

CREATE TEMP TABLE _sc_keys AS SELECT DISTINCT norm, judet_norm FROM _sc_rows;
ANALYZE _sc_keys;

CREATE TEMP TABLE _sc_resolved AS
WITH ranked AS (
  SELECT c.norm, c.judet_norm, e.cui,
         similarity(e.name_normalized, c.norm) AS sim,
         (firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
  FROM _sc_keys c
  JOIN firms.entities e ON e.name_normalized % c.norm
),
pick AS (
  SELECT DISTINCT ON (norm, judet_norm) norm, judet_norm, cui, sim
  FROM ranked
  WHERE judet_match
  ORDER BY norm, judet_norm, sim DESC, cui
)
SELECT * FROM pick WHERE sim >= 0.7;
CREATE INDEX ON _sc_resolved (norm, judet_norm);
ANALYZE _sc_resolved;

UPDATE anre.licente t
   SET titular_cui = r.cui,
       cui_match_score = r.sim,
       cui_match_method = 'trgm_judet',
       matched_at = now()
  FROM _sc_rows rw
  JOIN _sc_resolved r ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
 WHERE t.id = rw.rowid AND t.titular_cui IS NULL;

DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
SQL
# PIPESTATUS[0] is psql's own exit status (tee's is irrelevant here). The
# script keeps going on failure, but the log must say so explicitly.
stage_c_rc=${PIPESTATUS[0]}
if (( stage_c_rc == 0 )); then
  log "Stage C done"
else
  # Previously "Stage C done" was logged even when psql had failed.
  log "Stage C FAILED (psql exit $stage_c_rc); continuing"
fi

# Summary "<unmatched>/<total> (matched N%)". NULLIF/COALESCE guard the
# percentage against an empty table, where dividing by COUNT(*) would raise
# a division-by-zero error and leave the summary blank.
AFTER=$(psql -At -c "
  SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' || COUNT(*) ||
         ' (matched ' ||
         COALESCE(ROUND(100.0*COUNT(*) FILTER (WHERE titular_cui IS NOT NULL) / NULLIF(COUNT(*), 0), 1), 0.0) ||
         '%)'
  FROM anre.licente;")
log "after: $AFTER"

# Row counts per cui_match_method, largest first, as "method|count" lines.
log "by method:"
psql -At -F'|' -c "
  SELECT cui_match_method, COUNT(*) FROM anre.licente GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"

# Refresh the per-CUI MV now that titular_cui is populated.
log "refreshing anre.mv_licente_per_cui..."
# First attempt: CONCURRENTLY refresh. Only its stderr is appended to $LOG;
# its stdout is left on the script's stdout.
# Fallback: if that attempt exits non-zero, run a plain refresh with both
# output streams sent through tee into $LOG.
# NOTE(review): per the PostgreSQL docs CONCURRENTLY needs a unique index on
# the view and an already-populated view — presumably why the fallback exists.
if ! psql -v ON_ERROR_STOP=1 \
     -c "REFRESH MATERIALIZED VIEW CONCURRENTLY anre.mv_licente_per_cui;" \
     2>>"$LOG"
then
  psql -v ON_ERROR_STOP=1 \
    -c "REFRESH MATERIALIZED VIEW anre.mv_licente_per_cui;" 2>&1 \
    | tee -a "$LOG"
fi

log "=== ANRE CUI matcher done ==="