-- 035_curteacont.sql
-- Curtea de Conturi a României — Rapoarte de audit financiar / conformitate /
-- performanță / control / follow-up.
-- (Romanian Court of Accounts — financial / compliance / performance /
-- control / follow-up audit reports.)
--
-- Source: https://www.curteadeconturi.ro/rapoarte-audit/{category}
-- Categories scraped:
--   - rapoarte-audit-financiar   (~1,890 reports, 127 listing pages × 15)
--   - rapoarte-conformitate      (~2,580 reports, 173 pages × 15)
--   - rapoarte-audit-performanta (~135 reports, 9 pages × 15)
-- Approximate total: ~4,600 reports, growing weekly with new audits.
--
-- Detail page exposes a single PDF download link of the form
-- `/rapoarte-audit/downloads/{integer_id}` (verified IDs: 4078, 7335, 7854,
-- 10653, 12418, 13832, 14183 — sequential, predictable).
--
-- Stage 1 (this file + scrape-curteacont.ts):
--   - Walks listing pages, harvests slug URLs + titles + publication dates +
--     audit_type + audited entity name (parsed from title).
--   - DOES NOT fetch detail pages or download PDFs (that is Stage 2 — see
--     CURTEACONT-PLAN.md for the 15-25h roadmap).
--
-- Stage 2 (next session):
--   - For each row with NULL pdf_url, fetch detail page → extract
--     /downloads/{id} numeric PDF ID + file size.
--   - Optionally download PDF to satra disk under /opt/vreaudigital/data/cdc/.
--   - Run pdfminer/pdftotext against first 3 pages → extract structured
--     summary, findings_count, key amounts.
--   - Fuzzy-match audited_entity_name against firms.entities.denumire (lib
--     curatat already exists at services/seap-scraper/src/matching/) → fill
--     audited_entity_cui.
--
-- PRIMARY KEY:
--   slug_id = sha1(category || '|' || slug). The numeric download ID is NULL
--   until Stage 2 resolves it from the detail page. We keep it nullable and
--   add a separate UNIQUE constraint once it is discovered.
--
-- Cross-source value (recipe drafts in CURTEACONT-PLAN.md):
--   1. "Autorități audited de N ori în 5 ani" — repeat-audit risk score.
--   2. "Spitale audited POST SEAP award" — paralelă cu CNAS cross-source.
--   3.
"Rapoarte follow-up" — semnal că auditul anterior n-a fost remediat. CREATE SCHEMA IF NOT EXISTS curteacont; -- ── Rapoarte de audit ─────────────────────────────────────────────────────── -- One row per audit report listed by Curtea de Conturi. Source of truth is -- the listing page slug; numeric download_id (PDF) is filled in Stage 2. CREATE TABLE IF NOT EXISTS curteacont.rapoarte ( slug_id char(40) PRIMARY KEY, -- sha1(category|slug) download_id integer, -- /downloads/{id}, filled in Stage 2 category text NOT NULL, -- 'rapoarte-audit-financiar' | 'rapoarte-conformitate' | 'rapoarte-audit-performanta' slug text NOT NULL, -- last URL segment, unique within category detail_url text NOT NULL, -- absolute URL to detail page title text NOT NULL, -- raw title from listing audit_type text, -- 'financiar' | 'conformitate' | 'performanta' | 'control' | 'follow-up' audit_year smallint, -- year the audit covers (e.g. 2024 in "pentru anul 2024") doc_number text, -- "nr.27500" → "27500" doc_date date, -- "07.04.2026" parsed audited_entity_name text, -- raw extracted from title after the last comma audited_entity_cui text, -- filled in Stage 2 via fuzzy match publication_date date, -- from