-- 041_curteacont_cleaned_name_match.sql -- Follow-up to 040 — handles the residual 131 rows whose audited_entity_name -- contains the prefix " ) privind raportul de audit al performantei nr.X, ENTITY" -- (scraper bug: parser kept the prefix instead of just the entity). -- -- Extract the entity name via split-on-last-comma, then retry both -- UAT-pattern and strip-parens match. -- -- Source bug should also be fixed in services/seap-scraper/src/scrape-curteacont.ts -- but that's a separate task; SQL repair lands the data improvement immediately. \timing on DROP TABLE IF EXISTS tmp_cdc_residue; CREATE TEMP TABLE tmp_cdc_residue AS SELECT slug_id, audited_entity_name, trim(reverse(split_part(reverse(audited_entity_name), ',', 1))) AS clean_name FROM curteacont.rapoarte WHERE audited_entity_cui IS NULL AND audited_entity_name IS NOT NULL AND audited_entity_name ~ '\) privind raportul'; -- Pass 1: UAT-pattern on cleaned names WITH cleaned AS ( SELECT slug_id, clean_name, firms.normalize_company_name( CASE WHEN clean_name ~* '^UATC ' THEN 'COMUNA ' || regexp_replace(clean_name, '^UATC\s+', '', 'i') WHEN clean_name ~* '^UATJ ' THEN 'JUDETUL ' || regexp_replace(clean_name, '^UATJ\s+', '', 'i') WHEN clean_name ~* '^UATO ' THEN 'ORAS ' || regexp_replace(clean_name, '^UATO\s+', '', 'i') WHEN clean_name ~* '^UATM ' THEN 'MUNICIPIUL ' || regexp_replace(clean_name, '^UATM\s+', '', 'i') WHEN clean_name ~* '^UAT SECTOR(UL)? [1-6]' THEN 'SECTOR ' || substring(clean_name FROM '^UAT SECTOR(?:UL)? ([1-6])') ELSE NULL END ) AS expected_norm FROM tmp_cdc_residue ), firms_uat AS ( SELECT cui, firms.normalize_company_name(regexp_replace(name, '\s*\(.*$', '')) AS norm_stripped FROM firms.entities WHERE name ~* '^(COMUNA |JUDETUL |ORAS |ORASUL |MUNICIPIUL |SECTOR(UL)? [1-6])' ), candidates AS ( SELECT DISTINCT ON (c.slug_id) c.slug_id, f.cui FROM cleaned c JOIN firms_uat f ON f.norm_stripped = c.expected_norm WHERE c.expected_norm IS NOT NULL ORDER BY c.slug_id, f.cui ) UPDATE curteacont.rapoarte r SET audited_entity_cui = c.cui, audited_entity_name = trim(reverse(split_part(reverse(r.audited_entity_name), ',', 1))), -- also fix the name field parsed_at = COALESCE(r.parsed_at, now()) FROM candidates c WHERE r.slug_id = c.slug_id AND r.audited_entity_cui IS NULL; -- Pass 2: strip-parens exact on cleaned name + ONRC stripped name WITH cleaned AS ( SELECT slug_id, trim(reverse(split_part(reverse(audited_entity_name), ',', 1))) AS clean_name FROM curteacont.rapoarte WHERE audited_entity_cui IS NULL AND audited_entity_name ~ '\) privind raportul' ), candidates2 AS ( SELECT DISTINCT ON (c.slug_id) c.slug_id, e.cui FROM cleaned c JOIN firms.entities e ON firms.normalize_company_name(regexp_replace(e.name, '\s*\(.*$', '')) = firms.normalize_company_name(regexp_replace(c.clean_name, '\s*\(.*$', '')) ORDER BY c.slug_id, e.cui ) UPDATE curteacont.rapoarte r SET audited_entity_cui = c.cui, audited_entity_name = trim(reverse(split_part(reverse(r.audited_entity_name), ',', 1))), parsed_at = COALESCE(r.parsed_at, now()) FROM candidates2 c WHERE r.slug_id = c.slug_id AND r.audited_entity_cui IS NULL; -- Final stats SELECT count(*) AS total, count(audited_entity_cui) AS with_cui, round(100.0 * count(audited_entity_cui) / count(*), 1) AS pct FROM curteacont.rapoarte; DROP TABLE tmp_cdc_residue;