-- 042_cnsc_authority_cui_match.sql -- Backfill authority_cuis array on cnsc.decizii using the strip-parens + -- UAT-pattern strategy proven by bugetar 039 + curteacont 040. -- -- Current state: 29,488 decizii, 12,527 (42%) have authority_cuis populated -- via the scraper's authority_cui_raw extraction. Remaining 16,961 (58%) -- have authority_name but no CUI. -- -- CNSC names use these patterns: -- COMUNA X / ORASUL X / MUNICIPIUL X / JUDETUL X → UAT direct -- PRIMARIA COMUNEI X / PRIMARIA X → strip PRIMARIA, try UAT -- CONSILIUL JUDETEAN X / CJ X → "CONSILIUL JUDETEAN X" / "JUDETUL X" -- SA / SRL → direct firm name match -- — strip-parens fallback -- -- This SQL UPDATEs authority_cuis = ARRAY[cui]::text[] when a match is found. -- The mv_per_authority_cui must be refreshed afterward. -- -- Idempotent: only updates rows where authority_cuis IS NULL or empty. \timing on -- Build the UAT firm cache once (reusable across passes) DROP TABLE IF EXISTS tmp_firms_uat; CREATE TEMP TABLE tmp_firms_uat AS SELECT cui, name, firms.normalize_company_name(regexp_replace(name, '\s*\(.*$', '')) AS norm_stripped FROM firms.entities WHERE name ~* '^(COMUNA |JUDETUL |ORAS |ORASUL |MUNICIPIUL |SECTOR(UL)? [1-6]|CONSILIUL JUDETEAN |PRIMARIA )'; CREATE INDEX ON tmp_firms_uat (norm_stripped); ANALYZE tmp_firms_uat; -- Unmatched authority rows with normalized expected forms DROP TABLE IF EXISTS tmp_cnsc_unmatched; CREATE TEMP TABLE tmp_cnsc_unmatched AS SELECT id, authority_name, -- Expected ONRC-form normalized name (try several patterns; pick best one): firms.normalize_company_name(authority_name) AS direct_norm, firms.normalize_company_name( regexp_replace(authority_name, '^PRIMARIA\s+(COMUNEI\s+|ORASULUI\s+|MUNICIPIULUI\s+|JUDETULUI\s+)?', '', 'i') ) AS primaria_stripped_norm, firms.normalize_company_name( CASE WHEN authority_name ~* '^CONSILIUL JUDETEAN ' THEN 'JUDETUL ' || regexp_replace(authority_name, '^CONSILIUL JUDETEAN\s+', '', 'i') ELSE NULL END ) AS cj_norm FROM cnsc.decizii WHERE (authority_cuis IS NULL OR array_length(authority_cuis, 1) IS NULL) AND authority_name IS NOT NULL; CREATE INDEX ON tmp_cnsc_unmatched (direct_norm); -- Pass 1: direct strip-parens match (firm name matches CNSC authority_name) WITH candidates AS ( SELECT DISTINCT ON (u.id) u.id, e.cui FROM tmp_cnsc_unmatched u JOIN firms.entities e ON firms.normalize_company_name(regexp_replace(e.name, '\s*\(.*$', '')) = u.direct_norm ORDER BY u.id, e.cui ) UPDATE cnsc.decizii d SET authority_cuis = ARRAY[c.cui]::text[] FROM candidates c WHERE d.id = c.id AND (d.authority_cuis IS NULL OR array_length(d.authority_cuis, 1) IS NULL); -- Pass 2: PRIMARIA-stripped match (PRIMARIA COMUNEI X → match COMUNA X / ORAS X etc.) WITH candidates AS ( SELECT DISTINCT ON (u.id) u.id, f.cui FROM tmp_cnsc_unmatched u JOIN tmp_firms_uat f ON f.norm_stripped = u.primaria_stripped_norm WHERE u.primaria_stripped_norm != u.direct_norm -- only when PRIMARIA-strip changed the name ORDER BY u.id, f.cui ) UPDATE cnsc.decizii d SET authority_cuis = ARRAY[c.cui]::text[] FROM candidates c WHERE d.id = c.id AND (d.authority_cuis IS NULL OR array_length(d.authority_cuis, 1) IS NULL); -- Pass 3: CONSILIUL JUDETEAN X → JUDETUL X WITH candidates AS ( SELECT DISTINCT ON (u.id) u.id, f.cui FROM tmp_cnsc_unmatched u JOIN tmp_firms_uat f ON f.norm_stripped = u.cj_norm WHERE u.cj_norm IS NOT NULL ORDER BY u.id, f.cui ) UPDATE cnsc.decizii d SET authority_cuis = ARRAY[c.cui]::text[] FROM candidates c WHERE d.id = c.id AND (d.authority_cuis IS NULL OR array_length(d.authority_cuis, 1) IS NULL); -- Refresh the per-authority MV REFRESH MATERIALIZED VIEW CONCURRENTLY cnsc.mv_per_authority_cui; -- Final stats SELECT count(*) AS total, count(*) FILTER (WHERE array_length(authority_cuis, 1) > 0) AS with_auth_cui, round(100.0 * count(*) FILTER (WHERE array_length(authority_cuis, 1) > 0) / count(*), 1) AS pct FROM cnsc.decizii; DROP TABLE tmp_firms_uat, tmp_cnsc_unmatched;