vreau-digital/services/seap-scraper/sql/019_cui_matcher.sql

-- 019_cui_matcher.sql
-- Fuzzy CUI matcher for any external table that has a company name but no CUI
-- (fonduri.beneficiar_anunt, fonduri.afir_plati, future ANI shareholdings, etc.)
-- against firms.entities.
--
-- Strategy:
--   1. Build a normalized form of each company name on both sides
--      (lowercase + unaccent + strip legal suffixes + collapse whitespace).
--   2. Stage A (exact normalized match): expect ~40-50% hit rate when the
--      ONRC-canonical legal name was used in the source.
--   3. Stage B (pg_trgm fuzzy): top candidate ≥ 0.85 AND uniquely best
--      (gap to second-best ≥ 0.10) → auto-accept.
--   4. Stage C (judet disambiguation): when multiple candidates above
--      threshold, prefer firm whose adr_judet matches source's judet.

-- Idempotent. Re-runnable after each scrape.

CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS unaccent;

-- ── Normalization helper ────────────────────────────────────────────────
-- We can't mark unaccent as immutable in a function via CREATE FUNCTION
-- because unaccent is by default STABLE. Wrap in IMMUTABLE so we can use
-- in indexes. (See Postgres docs: a custom IMMUTABLE wrapper is the standard
-- workaround.)
CREATE OR REPLACE FUNCTION firms.normalize_company_name(input text)
RETURNS text
LANGUAGE plpgsql
IMMUTABLE
PARALLEL SAFE
AS $$
DECLARE
  s text;
BEGIN
  IF input IS NULL THEN RETURN NULL; END IF;
  s := lower(public.unaccent(input));
  -- Strip leading legal-form prefixes
  s := regexp_replace(s, '^(s\.?c\.?|sc)\s+', '', 'i');
  -- Strip trailing legal-form suffixes (SRL, SRL-D, SA, PFA, II, IF, etc.)
  s := regexp_replace(s,
    '\s+(s\.?r\.?l\.?(\s*-?\s*d)?|s\.?a\.?|s\.?n\.?c\.?|s\.?c\.?s\.?|s\.?c\.?a\.?|p\.?f\.?a\.?|i\.?i\.?|i\.?f\.?)\s*\.?\s*$',
    '', 'i');
  -- Collapse internal punctuation/whitespace
  s := regexp_replace(s, '[\.,\-\(\)/\\]+', ' ', 'g');
  s := regexp_replace(s, '\s+', ' ', 'g');
  RETURN trim(s);
END;
$$;

-- Materialized helper column on firms.entities (no UPDATE — generated)
ALTER TABLE firms.entities
  ADD COLUMN IF NOT EXISTS name_normalized text
    GENERATED ALWAYS AS (firms.normalize_company_name(name)) STORED;

CREATE INDEX IF NOT EXISTS idx_entities_name_normalized ON firms.entities(name_normalized);
CREATE INDEX IF NOT EXISTS idx_entities_name_norm_trgm ON firms.entities USING gin (name_normalized gin_trgm_ops);

-- Optional: judet normalization to disambiguate
CREATE OR REPLACE FUNCTION firms.normalize_judet(input text)
RETURNS text
LANGUAGE sql
IMMUTABLE
PARALLEL SAFE
AS $$
  SELECT lower(public.unaccent(coalesce(input, '')))
$$;