Files
vreau-digital/services/seap-scraper/cron/match-cui-external.sh
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

238 lines
8.9 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Run CUI-matching pass over external tables that have company names
# but no CUI yet. Idempotent — only touches rows where cui IS NULL.
#
# Currently matches:
# - fonduri.beneficiar_anunt (~41K names)
# - fonduri.afir_plati (~316K distinct names)
#
# Future: ANI shareholdings, license registries, etc. — all use the same
# firms.normalize_company_name() helper from sql/019_cui_matcher.sql.
set -uo pipefail
LOG=/var/log/vreaudigital-cui-match.log
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
# Resolve DATABASE_URL via Infisical Machine Identity
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
--client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain)
DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
--env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
-- sh -c 'echo "$DATABASE_URL"')
DB=$(echo "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
unset DBURL TOKEN DB
log "=== CUI matcher started ==="
# Apply schema (idempotent — generates name_normalized column + indexes)
psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/019_cui_matcher.sql >/dev/null
run_matcher() {
local TABLE="$1"
local NAME_COL="$2"
local JUDET_COL="$3" # may be empty string if source has no judet
local PRINTABLE="$4"
local RUN_TRGM="${5:-true}" # set to "false" to skip Stages B+C
# (e.g. AFIR direct payments where unmatched
# rows are individual farmers, not companies)
log "[$PRINTABLE] before: $(psql -At -c "SELECT COUNT(*) FILTER (WHERE cui IS NULL), COUNT(*) FROM $TABLE;" | tr '|' '/')"
# Stage A: exact normalized match (unique). When multiple firms share the
# same normalized name (homonyms), we skip — Stage B + judet handles them.
log "[$PRINTABLE] Stage A: exact normalized match..."
psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
WITH cand AS (
SELECT t.ctid AS row_ctid,
firms.normalize_company_name(t.$NAME_COL) AS norm
FROM $TABLE t
WHERE t.cui IS NULL
AND t.$NAME_COL IS NOT NULL
),
matched AS (
SELECT c.row_ctid,
MIN(e.cui) AS cui,
COUNT(*) AS n
FROM cand c
JOIN firms.entities e ON e.name_normalized = c.norm
GROUP BY c.row_ctid
)
UPDATE $TABLE t
SET cui = m.cui,
cui_match_score = 1.0,
cui_match_method = 'exact_norm',
matched_at = now()
FROM matched m
WHERE t.ctid = m.row_ctid
AND t.cui IS NULL
AND m.n = 1;
SQL
log "[$PRINTABLE] Stage A done"
# Stage B: pg_trgm similarity. Picks top candidate if score ≥ 0.85 AND
# gap to second-best ≥ 0.10 (so we know it's unambiguously the best match).
#
# Performance: previously O(unmatched_rows × candidate_pool) at default
# threshold 0.3 — 30+ min on AFIR (493K rows). Three-step pipeline now:
# 1. Materialize unmatched rows (rowid + norm) into a temp table
# 2. DISTINCT norms → much smaller trgm input set (BEN 13K→2K, AFIR 493K→274K)
# 3. SET pg_trgm.similarity_threshold = 0.7 so the gin `%` operator returns
# only candidates above the post-filter floor (drops fan-out by ~10×)
# The 0.85/0.10 accept rule is unchanged and produces identical matches.
if [ "$RUN_TRGM" != "true" ]; then
log "[$PRINTABLE] Stage B/C skipped (RUN_TRGM=false) — unmatched rows in this source are individuals, not registered companies"
log "[$PRINTABLE] after: $(psql -At -c "
SELECT COUNT(*) FILTER (WHERE cui IS NULL),
COUNT(*),
ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%'
FROM $TABLE;" | tr '|' '/')"
return 0
fi
log "[$PRINTABLE] Stage B: pg_trgm fuzzy (score ≥ 0.85, gap ≥ 0.10)..."
psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;
CREATE TEMP TABLE _sb_rows AS
SELECT t.ctid AS rowid,
firms.normalize_company_name(t.$NAME_COL) AS norm
FROM $TABLE t
WHERE t.cui IS NULL
AND t.$NAME_COL IS NOT NULL
AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5;
CREATE INDEX ON _sb_rows (norm);
ANALYZE _sb_rows;
CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
ANALYZE _sb_norms;
CREATE TEMP TABLE _sb_resolved AS
WITH ranked AS (
SELECT c.norm,
e.cui,
similarity(e.name_normalized, c.norm) AS sim,
ROW_NUMBER() OVER (
PARTITION BY c.norm
ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
) AS rn
FROM _sb_norms c
JOIN firms.entities e ON e.name_normalized % c.norm
),
top2 AS (
SELECT norm,
MAX(sim) FILTER (WHERE rn = 1) AS s1,
MAX(sim) FILTER (WHERE rn = 2) AS s2,
MAX(cui) FILTER (WHERE rn = 1) AS cui1
FROM ranked WHERE rn <= 2
GROUP BY norm
)
SELECT norm, cui1, s1
FROM top2
WHERE s1 >= 0.85
AND (s2 IS NULL OR (s1 - s2) >= 0.10);
CREATE INDEX ON _sb_resolved (norm);
ANALYZE _sb_resolved;
UPDATE $TABLE t
SET cui = r.cui1,
cui_match_score = r.s1,
cui_match_method = 'trgm_unique',
matched_at = now()
FROM _sb_rows rw
JOIN _sb_resolved r ON rw.norm = r.norm
WHERE t.ctid = rw.rowid
AND t.cui IS NULL;
DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
SQL
log "[$PRINTABLE] Stage B done"
# Stage C: judet disambiguation when source has a judet column.
# Multiple candidates above 0.7 → prefer the one whose adr_judet matches.
# Same dedup-by-(norm,judet) + SET threshold pipeline as Stage B.
if [ -n "$JUDET_COL" ]; then
log "[$PRINTABLE] Stage C: judet disambiguation..."
psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;
CREATE TEMP TABLE _sc_rows AS
SELECT t.ctid AS rowid,
firms.normalize_company_name(t.$NAME_COL) AS norm,
firms.normalize_judet(t.$JUDET_COL) AS judet_norm
FROM $TABLE t
WHERE t.cui IS NULL
AND t.$NAME_COL IS NOT NULL
AND t.$JUDET_COL IS NOT NULL
AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5;
CREATE INDEX ON _sc_rows (norm, judet_norm);
ANALYZE _sc_rows;
CREATE TEMP TABLE _sc_keys AS
SELECT DISTINCT norm, judet_norm FROM _sc_rows;
ANALYZE _sc_keys;
CREATE TEMP TABLE _sc_resolved AS
WITH ranked AS (
SELECT c.norm,
c.judet_norm,
e.cui,
similarity(e.name_normalized, c.norm) AS sim,
(firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
FROM _sc_keys c
JOIN firms.entities e ON e.name_normalized % c.norm
),
pick AS (
SELECT DISTINCT ON (norm, judet_norm)
norm, judet_norm, cui, sim
FROM ranked
WHERE judet_match
ORDER BY norm, judet_norm, sim DESC, cui
)
SELECT * FROM pick WHERE sim >= 0.7;
CREATE INDEX ON _sc_resolved (norm, judet_norm);
ANALYZE _sc_resolved;
UPDATE $TABLE t
SET cui = r.cui,
cui_match_score = r.sim,
cui_match_method = 'trgm_judet',
matched_at = now()
FROM _sc_rows rw
JOIN _sc_resolved r
ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
WHERE t.ctid = rw.rowid
AND t.cui IS NULL;
DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
SQL
log "[$PRINTABLE] Stage C done"
fi
log "[$PRINTABLE] after: $(psql -At -c "
SELECT COUNT(*) FILTER (WHERE cui IS NULL),
COUNT(*),
ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%'
FROM $TABLE;" | tr '|' '/')"
log "[$PRINTABLE] by method:"
psql -At -F'|' -c "
SELECT cui_match_method, COUNT(*)
FROM $TABLE
GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"
}
run_matcher "fonduri.beneficiar_anunt" "beneficiar_name" "beneficiar_judet" "BEN_PRIVAT" true
# AFIR: skip trgm — unmatched rows are individual farmers (popa gheorghe,
# radu vasile, …) receiving FEADR direct payments. They have no CUI and
# never appear in firms.entities (private company registry). Running trgm
# on 274K distinct names against 4M entities would take 30+ hours for ~0 gain.
run_matcher "fonduri.afir_plati" "beneficiar_name" "localitate" "AFIR" false
log "=== CUI matcher done ==="