-- 039_bugetar_uat_pattern_match.sql -- High-precision CUI match for bugetar.entitate UAT entries by stripping -- ONRC parenthetical suffix "X (PRIMARIA Y)" and comparing normalized names. -- -- Replaces the trgm-based 038 attempt which was too slow (90+ min, low yield) -- and hit false positives on the COMUNA-PRIMARIA naming gap. -- -- Insight: ONRC stores comune/orașe with parenthetical suffix: -- "Comuna Surduc (Primaria Comunei Surduc)" -- "COMUNA CIZER (PRIMARIA)" -- "Comuna Mesesenii de Jos (Primaria Mesesenii de Jos Jud. Salaj)" -- Bugetar entries are clean: "COMUNA SURDUC". Stripping ' (...)' from ONRC -- name and comparing normalized → exact match → high-confidence resolve. -- -- Idempotent: UPDATEs only WHERE cui IS NULL. Threshold-agnostic. -- Test runtime: ~1.7s per judet, 42 judete → ~70s total. \timing on SET pg_trgm.similarity_threshold = 0.78; -- not used here but reset for safety WITH judet_map AS ( SELECT * FROM (VALUES ('AB','ALBA'),('AG','ARGES'),('AR','ARAD'),('B','MUNICIPIUL BUCURESTI'), ('BC','BACAU'),('BH','BIHOR'),('BN','BISTRITA-NASAUD'),('BR','BRAILA'), ('BT','BOTOSANI'),('BV','BRASOV'),('BZ','BUZAU'),('CJ','CLUJ'), ('CL','CALARASI'),('CS','CARAS-SEVERIN'),('CT','CONSTANTA'),('CV','COVASNA'), ('DB','DAMBOVITA'),('DJ','DOLJ'),('GJ','GORJ'),('GL','GALATI'), ('GR','GIURGIU'),('HD','HUNEDOARA'),('HR','HARGHITA'),('IF','ILFOV'), ('IL','IALOMITA'),('IS','IASI'),('MH','MEHEDINTI'),('MM','MARAMURES'), ('MS','MURES'),('NT','NEAMT'),('OT','OLT'),('PH','PRAHOVA'), ('SB','SIBIU'),('SJ','SALAJ'),('SM','SATU MARE'),('SV','SUCEAVA'), ('TL','TULCEA'),('TM','TIMIS'),('TR','TELEORMAN'),('VL','VALCEA'), ('VN','VRANCEA'),('VS','VASLUI') ) AS m(code, name_ascii) ), candidates AS ( SELECT DISTINCT ON (b.id) b.id, e.cui, e.name AS firm_name FROM bugetar.entitate b JOIN judet_map jm ON jm.code = b.judet JOIN firms.entities e ON firms.normalize_company_name(regexp_replace(e.name, '\s*\(.*$', '')) = firms.normalize_company_name(b.entity_name) AND UPPER(translate(COALESCE(e.adr_judet,''), 'ŞȘŢȚăâîĂÂÎ', 'SSTTAAIAAI')) = jm.name_ascii WHERE b.cui IS NULL ORDER BY b.id, e.cui -- deterministic when multiple firms share normalized stripped name ) UPDATE bugetar.entitate b SET cui = c.cui, cui_match_score = 0.95::numeric(5,2), -- high-confidence stripped-exact match cui_match_method = 'uat_pattern', updated_at = now() FROM candidates c WHERE b.id = c.id AND b.cui IS NULL; -- Final stats SELECT cui_match_method, count(*), round(avg(cui_match_score)::numeric, 2) AS avg_score FROM bugetar.entitate WHERE cui IS NOT NULL GROUP BY 1 ORDER BY 2 DESC; SELECT 'unmatched' AS t, count(*) FROM bugetar.entitate WHERE cui IS NULL;