-- 031_cnas.sql -- CNAS — Casa Națională de Asigurări de Sănătate. -- Lista furnizorilor de servicii medicale aflați în relație contractuală cu CAS-urile județene. -- -- ─────────────────────────────────────────────────────────────────────────── -- Source landscape (investigated 2026-05-10): -- ─────────────────────────────────────────────────────────────────────────── -- The CNAS source ecosystem is in active migration. There are 3 layers: -- -- 1. cnas.ro/wp-content/uploads/ — central WordPress media library, 4.18K -- files total. ~70-90 are "furnizori" PDFs (national + per-județ snapshots). -- Discoverable via /wp-json/wp/v2/media REST API. 99% PDF, ~5 XLSX. -- → THIS IS THE PRIMARY SOURCE for v1. -- -- 2. cas.cnas.ro/casXX — new Angular SPA per județ (42 sub-instances). The -- backend admin/api is a Blazor app at /admin/api/* with X-Instance-Key -- header routing. As of 2026-05, most endpoints return [] or 500. The data -- hasn't been migrated to the new infra yet. -- → DEFERRED — track via watch script, ingest when populated. -- -- 3. www.cnas.ro/casXX/page/lista-furnizori-*.html — old pre-migration WP. -- 301-redirects to cnas.ro/casXX/ which is now a stub. Effectively dead -- for our purposes; some files survive in CKEditor uploads on the old -- cas.cnas.ro/casXX/theme/cnas/js/ckeditor/filemanager/userfiles/... -- → DEFERRED — best harvested via Wayback CDX. -- -- ─────────────────────────────────────────────────────────────────────────── -- File naming convention (cnas.ro/wp-content/uploads/): -- ─────────────────────────────────────────────────────────────────────────── -- Most files DON'T contain CUI codes. They contain provider names + sediu + -- contact data. CUI matching is post-ingest via firms.normalize_company_name -- + trgm fuzzy search (mirroring anre.licente pattern). -- -- Filename signals tip_serviciu: -- FURNIZORI-SPITALE-IN-CONTRACT-2024.pdf → tip='spital' -- FURNIZORI-IN-CONTRACT-MEDICINA-DE-FAMILIE.. → tip='medicina_familie' -- FURNIZORI-DE-SERVICII-FARMACEUTICE-.. → tip='farmacie' -- FURNIZORI-DISPOZITIVE-MEDICALE-.. → tip='dispozitive_medicale' -- FURNIZORI-MEDICINA-DENTARA-.. → tip='medicina_dentara' -- FURNIZORI-INGRIJIRI-MEDICALE-.. → tip='ingrijiri_medicale' -- FURNIZORI-INGRIJIRI-PALIATIVE-.. → tip='ingrijiri_paliative' -- FURNIZORI-RECUPERARE-MEDICALA-.. → tip='recuperare_medicala' -- FURNIZORI-PNS-.. → tip='pns' (programe nationale) -- FURNIZORI-IN-CONTRACT-AMBULATORIU-CLINIC-.. → tip='ambulatoriu_clinic' -- FURNIZORI-IN-CONTRACT-AMBULATORIU-PARACLIN.. → tip='paraclinic' -- FURNIZORI-..-URGENTA-PRESPITALICEASCA-.. → tip='urgenta_transport' -- Lista-furnizori-clinice-.. → tip='clinic' -- Lista-furnizori-RECA-.. → tip='recuperare_a' -- Lista-furnizori-radioterapie-.. → tip='radioterapie' -- Lista-furnizori-testare-genetica-.. → tip='testare_genetica' -- Lista-furnizori-tumori-solide-maligne-.. → tip='oncologie' -- -- Județ extraction is heuristic: from filename (e.g. CAS-GORJ, CAS-ARAD) OR -- from PDF page header ("CASA DE ASIGURARI A JUDETULUI GORJ"). When both fail, -- it's a national list (rare — most centrally-uploaded files are actually -- per-județ, as the PDFs are produced by individual CAS-uri). -- -- ─────────────────────────────────────────────────────────────────────────── -- Cross-source value: -- ─────────────────────────────────────────────────────────────────────────── -- 1. cnas.furnizori.cui (resolved post-ingest) × seap.announcements.supplier_cui -- @ CPV 33.* (medical equipment) / 85.* (medical services) = -- "medical providers winning state contracts directly + via insurance" -- 2. cnas.furnizori.cui × anaf_datornici.datornic.cui = "spitale & clinici cu -- datorii la stat" — red-flag pattern. -- 3. cnas.furnizori.cui × fonduri.proiect.beneficiar_cui (POIM-Sănătate) = -- EU-funded healthcare providers. CREATE SCHEMA IF NOT EXISTS cnas; -- ── 1. Documents (file metadata catalog) ─────────────────────────────────── -- Tracks every PDF/XLSX harvested from cnas.ro WP media. One row per file URL. -- Idempotent re-fetch: same URL → UPDATE fetched_at + parsed_at. CREATE TABLE IF NOT EXISTS cnas.documents ( id bigserial PRIMARY KEY, source_url text NOT NULL UNIQUE, source text NOT NULL, -- 'wp-media' | 'cas-cnas-spa' | 'wayback' | 'manual' wp_media_id bigint, -- /wp-json/wp/v2/media id (when source='wp-media') title text, filename text, mime_type text, file_size_bytes integer, file_sha1 char(40), -- of downloaded bytes (used to detect changes) published_at timestamptz, -- WP "date" (publication on CNAS site) -- Inferred classification (heuristic from filename + content): tip_serviciu text, -- 'spital' / 'farmacie' / 'medicina_familie' / etc. (see header above) judet text, -- "GORJ" / "BIHOR" / NULL when national or unknown perioada text, -- raw period from filename ("01.07.2024", "decembrie-2024") page_count integer, -- for PDFs, post-parse parse_status text NOT NULL DEFAULT 'pending',-- 'pending' / 'ok' / 'failed' / 'unsupported_format' / 'no_table' parse_error text, rows_extracted integer NOT NULL DEFAULT 0, fetched_at timestamptz NOT NULL DEFAULT now(), parsed_at timestamptz ); CREATE INDEX IF NOT EXISTS idx_cnas_documents_tip ON cnas.documents(tip_serviciu); CREATE INDEX IF NOT EXISTS idx_cnas_documents_judet ON cnas.documents(judet); CREATE INDEX IF NOT EXISTS idx_cnas_documents_published ON cnas.documents(published_at DESC); CREATE INDEX IF NOT EXISTS idx_cnas_documents_parse_status ON cnas.documents(parse_status); COMMENT ON TABLE cnas.documents IS 'Catalog of every CNAS provider-list document (PDF/XLSX) harvested. One row per source URL. Source: cnas.ro/wp-json/wp/v2/media + future SPA endpoints.'; COMMENT ON COLUMN cnas.documents.tip_serviciu IS 'Service category inferred from filename: spital, farmacie, medicina_familie, medicina_dentara, dispozitive_medicale, ambulatoriu_clinic, paraclinic, ingrijiri_medicale, ingrijiri_paliative, recuperare_medicala, urgenta_transport, pns, clinic, oncologie, testare_genetica, radioterapie, other.'; -- ── 2. Furnizori (extracted provider records) ────────────────────────────── -- One row per (document, NR_CRT). Provider name is the fundamental key — CUI -- is resolved POST-INSERT via firms.normalize_company_name fuzzy match (mirror -- of anre.licente pattern). -- -- We allow the same legal entity (same CUI) to appear MULTIPLE times across -- documents (e.g. same hospital listed in spitale + paraclinic + clinic lists). -- Dedup is via mv_cnas_per_cui rollup, not at insert time. CREATE TABLE IF NOT EXISTS cnas.furnizori ( id bigserial PRIMARY KEY, document_id bigint NOT NULL REFERENCES cnas.documents(id) ON DELETE CASCADE, -- Document context (denormalized for fast filtering): tip_serviciu text, -- inherited from document judet text, -- inherited from document (or row-level when available) perioada text, -- "la 01.07.2024" etc. -- Source row data: nr_crt integer, -- in-document index (1..N) cod_furnizor text, -- "BH01" / "CT12" — CAS-internal supplier code (when present) name text NOT NULL, -- raw from document ("Spitalul Clinic Județean Oradea", "DR.HEIM HERMINA") name_norm text, -- firms.normalize_company_name — populated post-insert reprezentant text, -- legal rep / cabinet doctor (for medicina familie, dentara) sediu text, -- adresa localitate text, telefon text, fax text, email text, specialitate text, -- pentru ambulatoriu, paraclinic, clinic -- CUI matching (resolved post-ingest): cui text, cui_match_score numeric(4,3), cui_match_method text, -- 'exact_norm' / 'trgm_unique' / 'trgm_judet' / 'manual' matched_at timestamptz, raw_text text, -- the raw text-row from PDF for debugging fetched_at timestamptz NOT NULL DEFAULT now(), UNIQUE (document_id, nr_crt, name) ); CREATE INDEX IF NOT EXISTS idx_cnas_furn_cui ON cnas.furnizori(cui) WHERE cui IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_cnas_furn_judet_tip ON cnas.furnizori(judet, tip_serviciu); CREATE INDEX IF NOT EXISTS idx_cnas_furn_name_trgm ON cnas.furnizori USING gin (name_norm gin_trgm_ops); CREATE INDEX IF NOT EXISTS idx_cnas_furn_document ON cnas.furnizori(document_id); CREATE INDEX IF NOT EXISTS idx_cnas_furn_localitate ON cnas.furnizori(localitate) WHERE localitate IS NOT NULL; COMMENT ON TABLE cnas.furnizori IS 'Furnizori de servicii medicale extrași din PDF-urile CNAS. One row per (document, nr_crt, name). CUI resolved post-insert via firms.normalize_company_name + trgm fuzzy.'; COMMENT ON COLUMN cnas.furnizori.cod_furnizor IS 'Internal CAS supplier code, e.g. "BH01" (Bihor seq 01), "MB427" (București-Ilfov seq 427). NOT a CUI.'; COMMENT ON COLUMN cnas.furnizori.cui IS 'Resolved CUI via firms.normalize_company_name + pg_trgm match. NULL = unmatched; cabinete medicale individuale (CMI) often have no CUI in firms registry.'; -- ── 3. Scrape log ───────────────────────────────────────────────────────── -- Mirrors anre.scrape_log convention. CREATE TABLE IF NOT EXISTS cnas.scrape_log ( id bigserial PRIMARY KEY, scraper text NOT NULL, -- 'wp-media-list' / 'parse-pdf' / 'match-cui' / 'cas-cnas-spa' source_url text, rows_seen integer NOT NULL DEFAULT 0, rows_inserted integer NOT NULL DEFAULT 0, rows_updated integer NOT NULL DEFAULT 0, rows_skipped integer NOT NULL DEFAULT 0, duration_ms integer NOT NULL DEFAULT 0, started_at timestamptz NOT NULL, finished_at timestamptz NOT NULL DEFAULT now(), error text ); CREATE INDEX IF NOT EXISTS idx_cnas_scrape_log_started ON cnas.scrape_log(started_at DESC); -- ── 4. Materialized view: per-CUI provider rollup ───────────────────────── -- Joinable with seap.announcements + anaf_datornici + fonduri.proiect for -- cross-source detection. Refreshed via cron after match-cui pass. CREATE MATERIALIZED VIEW IF NOT EXISTS cnas.mv_furnizori_per_cui AS SELECT cui, COUNT(*) AS nr_aparitii, -- across all lists COUNT(DISTINCT tip_serviciu) FILTER (WHERE tip_serviciu IS NOT NULL) AS nr_tipuri_serviciu, COUNT(DISTINCT judet) FILTER (WHERE judet IS NOT NULL) AS nr_judete, array_agg(DISTINCT tip_serviciu) FILTER (WHERE tip_serviciu IS NOT NULL) AS tipuri_serviciu, array_agg(DISTINCT judet) FILTER (WHERE judet IS NOT NULL) AS judete, -- One representative name (most common): (array_agg(name ORDER BY length(name)))[1] AS name_sample, MIN(fetched_at) AS first_seen, MAX(fetched_at) AS last_seen FROM cnas.furnizori WHERE cui IS NOT NULL GROUP BY cui; CREATE UNIQUE INDEX IF NOT EXISTS idx_cnas_mv_per_cui ON cnas.mv_furnizori_per_cui(cui); COMMENT ON MATERIALIZED VIEW cnas.mv_furnizori_per_cui IS 'Per-CUI rollup of CNAS provider appearances. Refresh: REFRESH MATERIALIZED VIEW CONCURRENTLY cnas.mv_furnizori_per_cui;';