initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
@@ -0,0 +1,66 @@
|
||||
-- 019_cui_matcher.sql
|
||||
-- Fuzzy CUI matcher for any external table that has a company name but no CUI
|
||||
-- (fonduri.beneficiar_anunt, fonduri.afir_plati, future ANI shareholdings, etc.)
|
||||
-- against firms.entities.
|
||||
--
|
||||
-- Strategy:
|
||||
-- 1. Build a normalized form of each company name on both sides
|
||||
-- (lowercase + unaccent + strip legal suffixes + collapse whitespace).
|
||||
-- 2. Stage A (exact normalized match): expect ~40-50% hit rate when the
|
||||
-- ONRC-canonical legal name was used in the source.
|
||||
-- 3. Stage B (pg_trgm fuzzy): top candidate ≥ 0.85 AND uniquely best
|
||||
-- (gap to second-best ≥ 0.10) → auto-accept.
|
||||
-- 4. Stage C (judet disambiguation): when multiple candidates above
|
||||
-- threshold, prefer firm whose adr_judet matches source's judet.
|
||||
|
||||
-- Idempotent. Re-runnable after each scrape.
|
||||
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
CREATE EXTENSION IF NOT EXISTS unaccent;
|
||||
|
||||
-- ── Normalization helper ────────────────────────────────────────────────
|
||||
-- We can't mark unaccent as immutable in a function via CREATE FUNCTION
|
||||
-- because unaccent is by default STABLE. Wrap in IMMUTABLE so we can use
|
||||
-- in indexes. (See Postgres docs: a custom IMMUTABLE wrapper is the standard
|
||||
-- workaround.)
|
||||
CREATE OR REPLACE FUNCTION firms.normalize_company_name(input text)
|
||||
RETURNS text
|
||||
LANGUAGE plpgsql
|
||||
IMMUTABLE
|
||||
PARALLEL SAFE
|
||||
AS $$
|
||||
DECLARE
|
||||
s text;
|
||||
BEGIN
|
||||
IF input IS NULL THEN RETURN NULL; END IF;
|
||||
s := lower(public.unaccent(input));
|
||||
-- Strip leading legal-form prefixes
|
||||
s := regexp_replace(s, '^(s\.?c\.?|sc)\s+', '', 'i');
|
||||
-- Strip trailing legal-form suffixes (SRL, SRL-D, SA, PFA, II, IF, etc.)
|
||||
s := regexp_replace(s,
|
||||
'\s+(s\.?r\.?l\.?(\s*-?\s*d)?|s\.?a\.?|s\.?n\.?c\.?|s\.?c\.?s\.?|s\.?c\.?a\.?|p\.?f\.?a\.?|i\.?i\.?|i\.?f\.?)\s*\.?\s*$',
|
||||
'', 'i');
|
||||
-- Collapse internal punctuation/whitespace
|
||||
s := regexp_replace(s, '[\.,\-\(\)/\\]+', ' ', 'g');
|
||||
s := regexp_replace(s, '\s+', ' ', 'g');
|
||||
RETURN trim(s);
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Materialized helper column on firms.entities (no UPDATE — generated)
|
||||
ALTER TABLE firms.entities
|
||||
ADD COLUMN IF NOT EXISTS name_normalized text
|
||||
GENERATED ALWAYS AS (firms.normalize_company_name(name)) STORED;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_entities_name_normalized ON firms.entities(name_normalized);
|
||||
CREATE INDEX IF NOT EXISTS idx_entities_name_norm_trgm ON firms.entities USING gin (name_normalized gin_trgm_ops);
|
||||
|
||||
-- Optional: judet normalization to disambiguate
|
||||
CREATE OR REPLACE FUNCTION firms.normalize_judet(input text)
|
||||
RETURNS text
|
||||
LANGUAGE sql
|
||||
IMMUTABLE
|
||||
PARALLEL SAFE
|
||||
AS $$
|
||||
SELECT lower(public.unaccent(coalesce(input, '')))
|
||||
$$;
|
||||
Reference in New Issue
Block a user