Files
vreau-digital/services/seap-scraper/sql/031_cnas.sql
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

198 lines
13 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- 031_cnas.sql
-- CNAS — Casa Națională de Asigurări de Sănătate.
-- Lista furnizorilor de servicii medicale aflați în relație contractuală cu CAS-urile județene.
--
-- ───────────────────────────────────────────────────────────────────────────
-- Source landscape (investigated 2026-05-10):
-- ───────────────────────────────────────────────────────────────────────────
-- The CNAS source ecosystem is in active migration. There are 3 layers:
--
-- 1. cnas.ro/wp-content/uploads/ — central WordPress media library, 4.18K
-- files total. ~70-90 are "furnizori" PDFs (national + per-județ snapshots).
-- Discoverable via /wp-json/wp/v2/media REST API. 99% PDF, ~5 XLSX.
-- → THIS IS THE PRIMARY SOURCE for v1.
--
-- 2. cas.cnas.ro/casXX — new Angular SPA per județ (42 sub-instances). The
-- backend admin/api is a Blazor app at /admin/api/* with X-Instance-Key
-- header routing. As of 2026-05, most endpoints return [] or 500. The data
-- hasn't been migrated to the new infra yet.
-- → DEFERRED — track via watch script, ingest when populated.
--
-- 3. www.cnas.ro/casXX/page/lista-furnizori-*.html — old pre-migration WP.
-- 301-redirects to cnas.ro/casXX/ which is now a stub. Effectively dead
-- for our purposes; some files survive in CKEditor uploads on the old
-- cas.cnas.ro/casXX/theme/cnas/js/ckeditor/filemanager/userfiles/...
-- → DEFERRED — best harvested via Wayback CDX.
--
-- ───────────────────────────────────────────────────────────────────────────
-- File naming convention (cnas.ro/wp-content/uploads/):
-- ───────────────────────────────────────────────────────────────────────────
-- Most files DON'T contain CUI codes. They contain provider names + sediu +
-- contact data. CUI matching is post-ingest via firms.normalize_company_name
-- + trgm fuzzy search (mirroring anre.licente pattern).
--
-- Filename signals tip_serviciu:
-- FURNIZORI-SPITALE-IN-CONTRACT-2024.pdf → tip='spital'
-- FURNIZORI-IN-CONTRACT-MEDICINA-DE-FAMILIE.. → tip='medicina_familie'
-- FURNIZORI-DE-SERVICII-FARMACEUTICE-.. → tip='farmacie'
-- FURNIZORI-DISPOZITIVE-MEDICALE-.. → tip='dispozitive_medicale'
-- FURNIZORI-MEDICINA-DENTARA-.. → tip='medicina_dentara'
-- FURNIZORI-INGRIJIRI-MEDICALE-.. → tip='ingrijiri_medicale'
-- FURNIZORI-INGRIJIRI-PALIATIVE-.. → tip='ingrijiri_paliative'
-- FURNIZORI-RECUPERARE-MEDICALA-.. → tip='recuperare_medicala'
-- FURNIZORI-PNS-.. → tip='pns' (programe nationale)
-- FURNIZORI-IN-CONTRACT-AMBULATORIU-CLINIC-.. → tip='ambulatoriu_clinic'
-- FURNIZORI-IN-CONTRACT-AMBULATORIU-PARACLIN.. → tip='paraclinic'
-- FURNIZORI-..-URGENTA-PRESPITALICEASCA-.. → tip='urgenta_transport'
-- Lista-furnizori-clinice-.. → tip='clinic'
-- Lista-furnizori-RECA-.. → tip='recuperare_a'
-- Lista-furnizori-radioterapie-.. → tip='radioterapie'
-- Lista-furnizori-testare-genetica-.. → tip='testare_genetica'
-- Lista-furnizori-tumori-solide-maligne-.. → tip='oncologie'
--
-- Județ extraction is heuristic: from filename (e.g. CAS-GORJ, CAS-ARAD) OR
-- from PDF page header ("CASA DE ASIGURARI A JUDETULUI GORJ"). When both fail,
-- it's a national list (rare — most centrally-uploaded files are actually
-- per-județ, as the PDFs are produced by individual CAS-uri).
--
-- ───────────────────────────────────────────────────────────────────────────
-- Cross-source value:
-- ───────────────────────────────────────────────────────────────────────────
-- 1. cnas.furnizori.cui (resolved post-ingest) × seap.announcements.supplier_cui
-- @ CPV 33.* (medical equipment) / 85.* (medical services) =
-- "medical providers winning state contracts directly + via insurance"
-- 2. cnas.furnizori.cui × anaf_datornici.datornic.cui = "spitale & clinici cu
-- datorii la stat" — red-flag pattern.
-- 3. cnas.furnizori.cui × fonduri.proiect.beneficiar_cui (POIM-Sănătate) =
-- EU-funded healthcare providers.
CREATE SCHEMA IF NOT EXISTS cnas;
-- ── 1. Documents (file metadata catalog) ───────────────────────────────────
-- Tracks every PDF/XLSX harvested from cnas.ro WP media. One row per file URL.
-- Idempotent re-fetch: same URL → UPDATE fetched_at + parsed_at.
CREATE TABLE IF NOT EXISTS cnas.documents (
id bigserial PRIMARY KEY,
source_url text NOT NULL UNIQUE,
source text NOT NULL, -- 'wp-media' | 'cas-cnas-spa' | 'wayback' | 'manual'
wp_media_id bigint, -- /wp-json/wp/v2/media id (when source='wp-media')
title text,
filename text,
mime_type text,
file_size_bytes integer,
file_sha1 char(40), -- of downloaded bytes (used to detect changes)
published_at timestamptz, -- WP "date" (publication on CNAS site)
-- Inferred classification (heuristic from filename + content):
tip_serviciu text, -- 'spital' / 'farmacie' / 'medicina_familie' / etc. (see header above)
judet text, -- "GORJ" / "BIHOR" / NULL when national or unknown
perioada text, -- raw period from filename ("01.07.2024", "decembrie-2024")
page_count integer, -- for PDFs, post-parse
parse_status text NOT NULL DEFAULT 'pending',-- 'pending' / 'ok' / 'failed' / 'unsupported_format' / 'no_table'
parse_error text,
rows_extracted integer NOT NULL DEFAULT 0,
fetched_at timestamptz NOT NULL DEFAULT now(),
parsed_at timestamptz
);
CREATE INDEX IF NOT EXISTS idx_cnas_documents_tip ON cnas.documents(tip_serviciu);
CREATE INDEX IF NOT EXISTS idx_cnas_documents_judet ON cnas.documents(judet);
CREATE INDEX IF NOT EXISTS idx_cnas_documents_published ON cnas.documents(published_at DESC);
CREATE INDEX IF NOT EXISTS idx_cnas_documents_parse_status ON cnas.documents(parse_status);
COMMENT ON TABLE cnas.documents IS
'Catalog of every CNAS provider-list document (PDF/XLSX) harvested. One row per source URL. Source: cnas.ro/wp-json/wp/v2/media + future SPA endpoints.';
COMMENT ON COLUMN cnas.documents.tip_serviciu IS
'Service category inferred from filename: spital, farmacie, medicina_familie, medicina_dentara, dispozitive_medicale, ambulatoriu_clinic, paraclinic, ingrijiri_medicale, ingrijiri_paliative, recuperare_medicala, urgenta_transport, pns, clinic, oncologie, testare_genetica, radioterapie, other.';
-- ── 2. Furnizori (extracted provider records) ──────────────────────────────
-- One row per (document, NR_CRT). Provider name is the fundamental key — CUI
-- is resolved POST-INSERT via firms.normalize_company_name fuzzy match (mirror
-- of anre.licente pattern).
--
-- We allow the same legal entity (same CUI) to appear MULTIPLE times across
-- documents (e.g. same hospital listed in spitale + paraclinic + clinic lists).
-- Dedup is via mv_cnas_per_cui rollup, not at insert time.
CREATE TABLE IF NOT EXISTS cnas.furnizori (
id bigserial PRIMARY KEY,
document_id bigint NOT NULL REFERENCES cnas.documents(id) ON DELETE CASCADE,
-- Document context (denormalized for fast filtering):
tip_serviciu text, -- inherited from document
judet text, -- inherited from document (or row-level when available)
perioada text, -- "la 01.07.2024" etc.
-- Source row data:
nr_crt integer, -- in-document index (1..N)
cod_furnizor text, -- "BH01" / "CT12" — CAS-internal supplier code (when present)
name text NOT NULL, -- raw from document ("Spitalul Clinic Județean Oradea", "DR.HEIM HERMINA")
name_norm text, -- firms.normalize_company_name — populated post-insert
reprezentant text, -- legal rep / cabinet doctor (for medicina familie, dentara)
sediu text, -- adresa
localitate text,
telefon text,
fax text,
email text,
specialitate text, -- pentru ambulatoriu, paraclinic, clinic
-- CUI matching (resolved post-ingest):
cui text,
cui_match_score numeric(4,3),
cui_match_method text, -- 'exact_norm' / 'trgm_unique' / 'trgm_judet' / 'manual'
matched_at timestamptz,
raw_text text, -- the raw text-row from PDF for debugging
fetched_at timestamptz NOT NULL DEFAULT now(),
UNIQUE (document_id, nr_crt, name)
);
CREATE INDEX IF NOT EXISTS idx_cnas_furn_cui ON cnas.furnizori(cui) WHERE cui IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_cnas_furn_judet_tip ON cnas.furnizori(judet, tip_serviciu);
CREATE INDEX IF NOT EXISTS idx_cnas_furn_name_trgm ON cnas.furnizori USING gin (name_norm gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_cnas_furn_document ON cnas.furnizori(document_id);
CREATE INDEX IF NOT EXISTS idx_cnas_furn_localitate ON cnas.furnizori(localitate) WHERE localitate IS NOT NULL;
COMMENT ON TABLE cnas.furnizori IS
'Furnizori de servicii medicale extrași din PDF-urile CNAS. One row per (document, nr_crt, name). CUI resolved post-insert via firms.normalize_company_name + trgm fuzzy.';
COMMENT ON COLUMN cnas.furnizori.cod_furnizor IS
'Internal CAS supplier code, e.g. "BH01" (Bihor seq 01), "MB427" (București-Ilfov seq 427). NOT a CUI.';
COMMENT ON COLUMN cnas.furnizori.cui IS
'Resolved CUI via firms.normalize_company_name + pg_trgm match. NULL = unmatched; cabinete medicale individuale (CMI) often have no CUI in firms registry.';
-- ── 3. Scrape log ─────────────────────────────────────────────────────────
-- Mirrors anre.scrape_log convention.
CREATE TABLE IF NOT EXISTS cnas.scrape_log (
id bigserial PRIMARY KEY,
scraper text NOT NULL, -- 'wp-media-list' / 'parse-pdf' / 'match-cui' / 'cas-cnas-spa'
source_url text,
rows_seen integer NOT NULL DEFAULT 0,
rows_inserted integer NOT NULL DEFAULT 0,
rows_updated integer NOT NULL DEFAULT 0,
rows_skipped integer NOT NULL DEFAULT 0,
duration_ms integer NOT NULL DEFAULT 0,
started_at timestamptz NOT NULL,
finished_at timestamptz NOT NULL DEFAULT now(),
error text
);
CREATE INDEX IF NOT EXISTS idx_cnas_scrape_log_started ON cnas.scrape_log(started_at DESC);
-- ── 4. Materialized view: per-CUI provider rollup ─────────────────────────
-- Joinable with seap.announcements + anaf_datornici + fonduri.proiect for
-- cross-source detection. Refreshed via cron after match-cui pass.
CREATE MATERIALIZED VIEW IF NOT EXISTS cnas.mv_furnizori_per_cui AS
SELECT
cui,
COUNT(*) AS nr_aparitii, -- across all lists
COUNT(DISTINCT tip_serviciu) FILTER (WHERE tip_serviciu IS NOT NULL) AS nr_tipuri_serviciu,
COUNT(DISTINCT judet) FILTER (WHERE judet IS NOT NULL) AS nr_judete,
array_agg(DISTINCT tip_serviciu) FILTER (WHERE tip_serviciu IS NOT NULL) AS tipuri_serviciu,
array_agg(DISTINCT judet) FILTER (WHERE judet IS NOT NULL) AS judete,
-- One representative name (most common):
(array_agg(name ORDER BY length(name)))[1] AS name_sample,
MIN(fetched_at) AS first_seen,
MAX(fetched_at) AS last_seen
FROM cnas.furnizori
WHERE cui IS NOT NULL
GROUP BY cui;
CREATE UNIQUE INDEX IF NOT EXISTS idx_cnas_mv_per_cui ON cnas.mv_furnizori_per_cui(cui);
COMMENT ON MATERIALIZED VIEW cnas.mv_furnizori_per_cui IS
'Per-CUI rollup of CNAS provider appearances. Refresh: REFRESH MATERIALIZED VIEW CONCURRENTLY cnas.mv_furnizori_per_cui;';