-- 040_curteacont_uat_pattern_match.sql
-- High-precision CUI match for curteacont.rapoarte using UAT-pattern + strip-parens.
--
-- Curtea de Conturi uses specific abbreviations for territorial units:
--   UATC X                       → COMUNA X
--   UATJ X                       → JUDETUL X
--   UATO X                       → ORASUL X / ORAS X
--   UATM X                       → MUNICIPIUL X
--   UAT SECTOR N (... BUCURESTI) → SECTOR N
--
-- ONRC stores these with parenthetical suffix indicating the operating body:
--   "JUDETUL MARAMURES (CONSILIUL JUDETEAN MARAMURES)"
--   "Comuna Surduc (Primaria Comunei Surduc)"
--   "SECTOR 3 (PRIMARIA SECTOR 3 BUCURESTI)"
--
-- Strip ONRC " (...)" suffix and compare normalized → exact match.
--
-- Precision rule: a report only receives a CUI when exactly ONE distinct CUI
-- matches its normalized name. Several entities can share a normalized name;
-- such ambiguous rows are left NULL rather than assigned an arbitrary CUI.
--
-- Idempotent: UPDATEs only WHERE audited_entity_cui IS NULL.

\timing on

SET pg_trgm.similarity_threshold = 0.78;  -- safety reset

-- ---------------------------------------------------------------------------
-- 1. Prefiltered firms table, built once (UATs only ~10K rows).
--    norm_stripped = normalized name with the " (...)" suffix removed and the
--    articulated prefixes ORASUL / SECTORUL folded to ORAS / SECTOR, so that
--    both ONRC spellings compare equal to the expected name built in step 2.
-- ---------------------------------------------------------------------------
DROP TABLE IF EXISTS tmp_firms_uat;

CREATE TEMP TABLE tmp_firms_uat AS
SELECT
    cui,
    name,
    firms.normalize_company_name(
        regexp_replace(
            regexp_replace(name, '\s*\(.*$', ''),       -- drop " (...)" suffix
            '^(ORAS|SECTOR)UL\s+', '\1 ', 'i'           -- ORASUL→ORAS, SECTORUL→SECTOR
        )
    ) AS norm_stripped
FROM firms.entities
WHERE name ~* '^(COMUNA |JUDETUL |ORAS |ORASUL |MUNICIPIUL |SECTOR(UL)? [1-6])';

CREATE INDEX ON tmp_firms_uat (norm_stripped);
ANALYZE tmp_firms_uat;

-- ---------------------------------------------------------------------------
-- 2. Expected ONRC-form name for each still-unmapped cdc audited entity.
--    expected_norm is NULL when the name passes the coarse prefix filter but
--    none of the specific patterns (e.g. "UAT SECTOR" without a digit 1-6).
-- ---------------------------------------------------------------------------
DROP TABLE IF EXISTS tmp_cdc_uat;

CREATE TEMP TABLE tmp_cdc_uat AS
SELECT
    slug_id,
    audited_entity_name,
    firms.normalize_company_name(
        CASE
            WHEN audited_entity_name ~* '^UATC '
                THEN 'COMUNA ' || regexp_replace(audited_entity_name, '^UATC\s+', '', 'i')
            WHEN audited_entity_name ~* '^UATJ '
                THEN 'JUDETUL ' || regexp_replace(audited_entity_name, '^UATJ\s+', '', 'i')
            WHEN audited_entity_name ~* '^UATO '
                THEN 'ORAS ' || regexp_replace(audited_entity_name, '^UATO\s+', '', 'i')
            WHEN audited_entity_name ~* '^UATM '
                THEN 'MUNICIPIUL ' || regexp_replace(audited_entity_name, '^UATM\s+', '', 'i')
            WHEN audited_entity_name ~* '^UAT SECTOR(UL)? [1-6]'
                -- (?i) keeps the extraction case-insensitive like the WHEN guard;
                -- the pattern must stay on a single line (one space before the digit).
                THEN 'SECTOR ' || substring(audited_entity_name FROM '(?i)^UAT SECTOR(?:UL)? ([1-6])')
            ELSE NULL
        END
    ) AS expected_norm
FROM curteacont.rapoarte
WHERE audited_entity_cui IS NULL
  AND audited_entity_name IS NOT NULL
  AND audited_entity_name ~* '^(UATC |UATJ |UATO |UATM |UAT SECTOR)';

-- Stats before update
SELECT count(*) AS unmapped_uat_rows
FROM tmp_cdc_uat
WHERE expected_norm IS NOT NULL;

-- ---------------------------------------------------------------------------
-- 3. Apply the UAT match — unambiguous candidates only.
-- ---------------------------------------------------------------------------
WITH candidates AS (
    SELECT
        c.slug_id,
        min(f.cui) AS cui          -- the single matching CUI (see HAVING)
    FROM tmp_cdc_uat AS c
    INNER JOIN tmp_firms_uat AS f
        ON f.norm_stripped = c.expected_norm
    GROUP BY c.slug_id
    HAVING count(DISTINCT f.cui) = 1
)
UPDATE curteacont.rapoarte AS r
SET
    audited_entity_cui = cand.cui,
    parsed_at = COALESCE(r.parsed_at, now())
FROM candidates AS cand
WHERE r.slug_id = cand.slug_id
  AND r.audited_entity_cui IS NULL;

-- ---------------------------------------------------------------------------
-- 4. Fallback exact-match path for non-UAT names (ministries etc.).
--    Match audited_entity_name directly to firms.entities.name with
--    strip-parens, again accepting only unambiguous matches.
--    NOTE(review): the join normalizes every firms.entities row on the fly;
--    presumably a full scan unless a matching expression index exists.
-- ---------------------------------------------------------------------------
WITH cdc_non_uat AS (
    SELECT
        slug_id,
        audited_entity_name,
        firms.normalize_company_name(audited_entity_name) AS norm
    FROM curteacont.rapoarte
    WHERE audited_entity_cui IS NULL
      AND audited_entity_name IS NOT NULL
      AND audited_entity_name !~* '^(UATC |UATJ |UATO |UATM |UAT SECTOR)'
),
candidates2 AS (
    SELECT
        c.slug_id,
        min(e.cui) AS cui          -- the single matching CUI (see HAVING)
    FROM cdc_non_uat AS c
    INNER JOIN firms.entities AS e
        ON firms.normalize_company_name(regexp_replace(e.name, '\s*\(.*$', '')) = c.norm
    -- An empty key would pair with every entity whose stripped name is also
    -- empty (e.g. a name that starts with "("); never match on it.
    WHERE c.norm <> ''
    GROUP BY c.slug_id
    HAVING count(DISTINCT e.cui) = 1
)
UPDATE curteacont.rapoarte AS r
SET
    audited_entity_cui = cand.cui,
    parsed_at = COALESCE(r.parsed_at, now())
FROM candidates2 AS cand
WHERE r.slug_id = cand.slug_id
  AND r.audited_entity_cui IS NULL;

-- Final stats (pct is NULL instead of a division-by-zero error on an empty table)
SELECT
    count(*) AS total,
    count(audited_entity_cui) AS with_cui,
    round(100.0 * count(audited_entity_cui) / NULLIF(count(*), 0), 1) AS pct
FROM curteacont.rapoarte;

-- Refresh the per-audited rollup if exists (no MV defined yet for curteacont but
-- the autoritate profile pulls live; no refresh needed)

DROP TABLE IF EXISTS tmp_firms_uat, tmp_cdc_uat;