initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
+204
@@ -0,0 +1,204 @@
|
||||
#!/bin/bash
|
||||
# Fuzzy-match ancom.operatori.titular_name → firms.entities.cui via the
|
||||
# same Stage A (exact normalized) + Stage B (pg_trgm unique-pick) + Stage C
|
||||
# (judet disambiguation) pipeline as cron/match-cui-anre.sh.
|
||||
#
|
||||
# Most ANCOM rows have CUI directly from the detail page (cui_match_method='direct'),
|
||||
# so this is a fallback for whatever subset has titular_cui IS NULL.
|
||||
#
|
||||
# Idempotent — only touches rows where titular_cui IS NULL.
|
||||
|
||||
# Shell options: treat unset variables as errors and make a pipeline report the
# failure of any of its stages. -e is not enabled, so a failing command does
# not abort the script on its own.
set -o nounset -o pipefail

# Every progress message and all captured psql output is appended to this file.
LOG=/var/log/vreaudigital-cui-match-ancom.log

# log MESSAGE
# Print MESSAGE prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp to stdout and
# append the same line to $LOG.
log() {
    printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"
}
|
||||
|
||||
# Resolve DATABASE_URL via Infisical Machine Identity and expose it to every
# later psql call through the libpq PG* environment variables.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
    --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain)
DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
    --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
    -- sh -c 'echo "$DATABASE_URL"')

# The script runs without -e, so a failed login / secret lookup only leaves
# DBURL empty. Stop here instead of letting psql run with empty settings.
if [ -z "$DBURL" ]; then
    unset DBURL TOKEN
    log "ERROR: could not resolve DATABASE_URL via Infisical, aborting"
    exit 1
fi

# Strip a schema=... query parameter and a dangling '?' before splitting the URL.
DB=$(echo "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# sed -n ... p prints only when the pattern matched. Without it a non-matching
# URL (for example one with no explicit port) would be copied verbatim,
# password included, into the variable — and could then end up in the log
# through psql's error output.
PGUSER=$(echo "$DB" | sed -nE 's|^postgresql://([^:]+):.*|\1|p')
PGPASSWORD=$(echo "$DB" | sed -nE 's|^postgresql://[^:]+:([^@]+)@.*|\1|p')
PGHOST=$(echo "$DB" | sed -nE 's|^postgresql://[^@]+@([^:/]+).*|\1|p')
PGPORT=$(echo "$DB" | sed -nE 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|p')
PGDATABASE=$(echo "$DB" | sed -nE 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|p')
unset DBURL TOKEN DB

# User, host and database are mandatory; refuse to continue on a URL that
# could not be parsed rather than connecting with empty settings.
if [ -z "$PGUSER" ] || [ -z "$PGHOST" ] || [ -z "$PGDATABASE" ]; then
    log "ERROR: DATABASE_URL is not of the form postgresql://user:pass@host[:port]/db, aborting"
    exit 1
fi

# The port is optional in the URL; fall back to the PostgreSQL default.
PGPORT=${PGPORT:-5432}

export PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE
|
||||
|
||||
log "=== ANCOM CUI matcher started ==="

# Snapshot taken before any stage runs, formatted as
# "<rows whose titular_cui is NULL>/<total rows>".
BEFORE=$(psql -At -c "
SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL)
       || '/' ||
       COUNT(*)
  FROM ancom.operatori;")
log "before: $BEFORE"
|
||||
|
||||
# Pre-step: fill in titular_name_norm for every row that has a name but no
# normalized form yet. Rows that already carry a normalized name are skipped.
log "pre-step: populating titular_name_norm..."
psql -v ON_ERROR_STOP=1 <<'NORMALIZE_SQL' 2>&1 | tee -a "$LOG"
UPDATE ancom.operatori AS op
   SET titular_name_norm = firms.normalize_company_name(op.titular_name)
 WHERE op.titular_name IS NOT NULL
   AND op.titular_name_norm IS NULL;
NORMALIZE_SQL
|
||||
|
||||
# Stage A: exact match on the normalized name. A row is updated only when
# exactly one firms.entities row carries the same normalized name.
log "Stage A: exact normalized match..."
psql -v ON_ERROR_STOP=1 <<'STAGE_A_SQL' 2>&1 | tee -a "$LOG"
WITH unique_hit AS (
    -- One output row per still-unmatched operator whose normalized name is
    -- shared with exactly one entity row.
    SELECT op.ancom_id,
           MIN(ent.cui) AS cui
      FROM ancom.operatori AS op
     INNER JOIN firms.entities AS ent
             ON ent.name_normalized = op.titular_name_norm
     WHERE op.titular_cui IS NULL
       AND op.titular_name_norm IS NOT NULL
     GROUP BY op.ancom_id
    HAVING COUNT(*) = 1
)
UPDATE ancom.operatori AS tgt
   SET titular_cui      = hit.cui,
       cui_match_score  = 1.0,
       cui_match_method = 'exact_norm',
       matched_at       = now()
  FROM unique_hit AS hit
 WHERE tgt.ancom_id = hit.ancom_id
   AND tgt.titular_cui IS NULL;
STAGE_A_SQL
log "Stage A done"
|
||||
|
||||
# Stage B: pg_trgm fuzzy match. Same SET threshold 0.7 + 0.85/0.10 accept rule
# as match-cui-external.sh: the best candidate must score at least 0.85 and
# lead the runner-up (if there is one) by at least 0.10.
log "Stage B: pg_trgm fuzzy (score >= 0.85, gap >= 0.10)..."
psql -v ON_ERROR_STOP=1 <<'STAGE_B_SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

-- Rows still lacking a CUI whose normalized name has at least 5 characters.
CREATE TEMP TABLE _trgm_pending AS
SELECT op.ancom_id          AS row_key,
       op.titular_name_norm AS norm
  FROM ancom.operatori AS op
 WHERE op.titular_cui IS NULL
   AND op.titular_name_norm IS NOT NULL
   AND length(op.titular_name_norm) >= 5;
CREATE INDEX ON _trgm_pending (norm);
ANALYZE _trgm_pending;

-- Each distinct name is scored once, however many rows share it.
CREATE TEMP TABLE _trgm_names AS SELECT DISTINCT norm FROM _trgm_pending;
ANALYZE _trgm_names;

CREATE TEMP TABLE _trgm_accepted AS
WITH scored AS (
    -- Every entity whose name passes the % similarity operator for the name.
    SELECT nm.norm,
           ent.cui,
           similarity(ent.name_normalized, nm.norm) AS sim
      FROM _trgm_names AS nm
     INNER JOIN firms.entities AS ent
             ON ent.name_normalized % nm.norm
),
ordered AS (
    -- Best candidate first (ties broken by cui); runner_up_sim is the score
    -- of the second-best candidate, NULL when there is only one.
    SELECT norm,
           cui,
           sim,
           ROW_NUMBER() OVER best_first AS pos,
           LEAD(sim)    OVER best_first AS runner_up_sim
      FROM scored
    WINDOW best_first AS (PARTITION BY norm ORDER BY sim DESC, cui)
)
SELECT norm,
       cui AS cui1,
       sim AS s1
  FROM ordered
 WHERE pos = 1
   AND sim >= 0.85
   AND (runner_up_sim IS NULL OR (sim - runner_up_sim) >= 0.10);
CREATE INDEX ON _trgm_accepted (norm);
ANALYZE _trgm_accepted;

UPDATE ancom.operatori AS tgt
   SET titular_cui      = acc.cui1,
       cui_match_score  = acc.s1,
       cui_match_method = 'trgm_unique',
       matched_at       = now()
  FROM _trgm_pending AS pend
 INNER JOIN _trgm_accepted AS acc
         ON pend.norm = acc.norm
 WHERE tgt.ancom_id = pend.row_key
   AND tgt.titular_cui IS NULL;

DROP TABLE _trgm_pending, _trgm_names, _trgm_accepted;
STAGE_B_SQL
log "Stage B done"
|
||||
|
||||
# Stage C: judet disambiguation when there are multiple trgm candidates.
# Among the trigram candidates only those whose normalized judet equals the
# operator's normalized judet are considered; the highest-scoring one is taken.
log "Stage C: judet disambiguation..."
psql -v ON_ERROR_STOP=1 <<'STAGE_C_SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

-- Rows still lacking a CUI that have both a usable name and a judet.
CREATE TEMP TABLE _judet_pending AS
SELECT op.ancom_id                     AS row_key,
       op.titular_name_norm            AS norm,
       firms.normalize_judet(op.judet) AS judet_norm
  FROM ancom.operatori AS op
 WHERE op.titular_cui IS NULL
   AND op.titular_name_norm IS NOT NULL
   AND op.judet IS NOT NULL
   AND length(op.titular_name_norm) >= 5;
CREATE INDEX ON _judet_pending (norm, judet_norm);
ANALYZE _judet_pending;

-- Each distinct (name, judet) pair is scored once.
CREATE TEMP TABLE _judet_keys AS
SELECT DISTINCT norm, judet_norm FROM _judet_pending;
ANALYZE _judet_keys;

CREATE TEMP TABLE _judet_accepted AS
WITH same_judet AS (
    -- Trigram candidates restricted to entities located in the same judet.
    SELECT k.norm,
           k.judet_norm,
           ent.cui,
           similarity(ent.name_normalized, k.norm) AS sim
      FROM _judet_keys AS k
     INNER JOIN firms.entities AS ent
             ON ent.name_normalized % k.norm
     WHERE firms.normalize_judet(ent.adr_judet) = k.judet_norm
),
ordered AS (
    -- Highest similarity first within each (name, judet); ties broken by cui.
    SELECT norm,
           judet_norm,
           cui,
           sim,
           ROW_NUMBER() OVER (
               PARTITION BY norm, judet_norm
               ORDER BY sim DESC, cui
           ) AS pos
      FROM same_judet
)
SELECT norm, judet_norm, cui, sim
  FROM ordered
 WHERE pos = 1
   AND sim >= 0.7;
CREATE INDEX ON _judet_accepted (norm, judet_norm);
ANALYZE _judet_accepted;

UPDATE ancom.operatori AS tgt
   SET titular_cui      = acc.cui,
       cui_match_score  = acc.sim,
       cui_match_method = 'trgm_judet',
       matched_at       = now()
  FROM _judet_pending AS pend
 INNER JOIN _judet_accepted AS acc
         ON pend.norm = acc.norm
        AND pend.judet_norm = acc.judet_norm
 WHERE tgt.ancom_id = pend.row_key
   AND tgt.titular_cui IS NULL;

DROP TABLE _judet_pending, _judet_keys, _judet_accepted;
STAGE_C_SQL
log "Stage C done"
|
||||
|
||||
# Snapshot taken after all stages, formatted as
# "<rows still without a CUI>/<total rows> (matched <percent>%)".
# NULLIF guards the percentage against division by zero when ancom.operatori
# is empty; COALESCE then reports 0.0 instead of turning the whole
# concatenated string into NULL.
AFTER=$(psql -At -c "
SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' ||
       COUNT(*) || ' (matched ' ||
       COALESCE(
           ROUND(100.0 * COUNT(*) FILTER (WHERE titular_cui IS NOT NULL)
                 / NULLIF(COUNT(*), 0), 1),
           0.0
       ) || '%)'
  FROM ancom.operatori;")
log "after: $AFTER"
|
||||
|
||||
# Row count per cui_match_method, largest group first, printed as
# "method|count" lines and appended to the log.
log "by method:"
psql -At -F'|' -c "
SELECT cui_match_method,
       COUNT(*) AS row_count
  FROM ancom.operatori
 GROUP BY cui_match_method
 ORDER BY row_count DESC NULLS LAST;" 2>&1 | tee -a "$LOG"
|
||||
|
||||
# Refresh the per-CUI MV now that titular_cui is populated. The CONCURRENTLY
# variant is tried first (its stderr goes straight to the log); if that command
# fails, a plain refresh is run instead.
log "refreshing ancom.mv_operatori_per_cui..."
if ! psql -v ON_ERROR_STOP=1 \
        -c "REFRESH MATERIALIZED VIEW CONCURRENTLY ancom.mv_operatori_per_cui;" \
        2>>"$LOG"; then
    psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW ancom.mv_operatori_per_cui;" 2>&1 \
        | tee -a "$LOG"
fi

log "=== ANCOM CUI matcher done ==="
|
||||
Reference in New Issue
Block a user