-- 019_cui_matcher.sql -- Fuzzy CUI matcher for any external table that has a company name but no CUI -- (fonduri.beneficiar_anunt, fonduri.afir_plati, future ANI shareholdings, etc.) -- against firms.entities. -- -- Strategy: -- 1. Build a normalized form of each company name on both sides -- (lowercase + unaccent + strip legal suffixes + collapse whitespace). -- 2. Stage A (exact normalized match): expect ~40-50% hit rate when the -- ONRC-canonical legal name was used in the source. -- 3. Stage B (pg_trgm fuzzy): top candidate ≥ 0.85 AND uniquely best -- (gap to second-best ≥ 0.10) → auto-accept. -- 4. Stage C (judet disambiguation): when multiple candidates above -- threshold, prefer firm whose adr_judet matches source's judet. -- Idempotent. Re-runnable after each scrape. CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS unaccent; -- ── Normalization helper ──────────────────────────────────────────────── -- We can't mark unaccent as immutable in a function via CREATE FUNCTION -- because unaccent is by default STABLE. Wrap in IMMUTABLE so we can use -- in indexes. (See Postgres docs: a custom IMMUTABLE wrapper is the standard -- workaround.) CREATE OR REPLACE FUNCTION firms.normalize_company_name(input text) RETURNS text LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE AS $$ DECLARE s text; BEGIN IF input IS NULL THEN RETURN NULL; END IF; s := lower(public.unaccent(input)); -- Strip leading legal-form prefixes s := regexp_replace(s, '^(s\.?c\.?|sc)\s+', '', 'i'); -- Strip trailing legal-form suffixes (SRL, SRL-D, SA, PFA, II, IF, etc.) s := regexp_replace(s, '\s+(s\.?r\.?l\.?(\s*-?\s*d)?|s\.?a\.?|s\.?n\.?c\.?|s\.?c\.?s\.?|s\.?c\.?a\.?|p\.?f\.?a\.?|i\.?i\.?|i\.?f\.?)\s*\.?\s*$', '', 'i'); -- Collapse internal punctuation/whitespace s := regexp_replace(s, '[\.,\-\(\)/\\]+', ' ', 'g'); s := regexp_replace(s, '\s+', ' ', 'g'); RETURN trim(s); END; $$; -- Materialized helper column on firms.entities (no UPDATE — generated) ALTER TABLE firms.entities ADD COLUMN IF NOT EXISTS name_normalized text GENERATED ALWAYS AS (firms.normalize_company_name(name)) STORED; CREATE INDEX IF NOT EXISTS idx_entities_name_normalized ON firms.entities(name_normalized); CREATE INDEX IF NOT EXISTS idx_entities_name_norm_trgm ON firms.entities USING gin (name_normalized gin_trgm_ops); -- Optional: judet normalization to disambiguate CREATE OR REPLACE FUNCTION firms.normalize_judet(input text) RETURNS text LANGUAGE sql IMMUTABLE PARALLEL SAFE AS $$ SELECT lower(public.unaccent(coalesce(input, ''))) $$;