vreau-digital/services/seap-scraper/sql/035_curteacont.sql

-- 035_curteacont.sql
-- Curtea de Conturi a României — Rapoarte de audit financiar / conformitate /
-- performanță / control / follow-up.
--
-- Source: https://www.curteadeconturi.ro/rapoarte-audit/{category}
--   Categories scraped:
--     - rapoarte-audit-financiar     (~1,890 reports, 127 listing pages × 15)
--     - rapoarte-conformitate        (~2,580 reports, 173 pages × 15)
--     - rapoarte-audit-performanta   (~135 reports,    9 pages × 15)
--   Approximate total: ~4,600 reports, growing weekly with new audits.
--
-- Detail page exposes a single PDF download link of the form
-- `/rapoarte-audit/downloads/{integer_id}` (verified IDs: 4078, 7335, 7854,
-- 10653, 12418, 13832, 14183 — sequential, predictable).
--
-- Stage 1 (this file + scrape-curteacont.ts):
--   - Walks listing pages, harvests slug URLs + titles + publication dates +
--     audit_type + audited entity name (parsed from title).
--   - DOES NOT fetch detail pages or download PDFs (that is Stage 2 — see
--     CURTEACONT-PLAN.md for the 15-25h roadmap).
--
-- Stage 2 (next session):
--   - For each row with NULL pdf_url, fetch detail page → extract
--     /downloads/{id} numeric PDF ID + file size.
--   - Optionally download PDF to satra disk under /opt/vreaudigital/data/cdc/.
--   - Run pdfminer/pdftotext against first 3 pages → extract structured
--     summary, findings_count, key amounts.
--   - Fuzzy-match audited_entity_name against firms.entities.denumire (lib
--     curatat already exists at services/seap-scraper/src/matching/) → fill
--     audited_entity_cui.
--
-- PRIMARY KEY:
--   slug_id = sha1(category || '|' || slug). The numeric download ID is NULL
--   until Stage 2 resolves it from the detail page. We keep it nullable + add
--   a separate UNIQUE constraint when discovered.
--
-- Cross-source value (recipe drafts in CURTEACONT-PLAN.md):
--   1. "Autorități audited de N ori în 5 ani" — repeat-audit risk score.
--   2. "Spitale audited POST SEAP award" — paralelă cu CNAS cross-source.
--   3. "Rapoarte follow-up" — semnal că auditul anterior n-a fost remediat.

CREATE SCHEMA IF NOT EXISTS curteacont;

-- ── Rapoarte de audit ───────────────────────────────────────────────────────
-- One row per audit report listed by Curtea de Conturi. Source of truth is
-- the listing page slug; numeric download_id (PDF) is filled in Stage 2.
CREATE TABLE IF NOT EXISTS curteacont.rapoarte (
  slug_id          char(40) PRIMARY KEY,                    -- sha1(category|slug)
  download_id      integer,                                  -- /downloads/{id}, filled in Stage 2
  category         text NOT NULL,                            -- 'rapoarte-audit-financiar' | 'rapoarte-conformitate' | 'rapoarte-audit-performanta'
  slug             text NOT NULL,                            -- last URL segment, unique within category
  detail_url       text NOT NULL,                            -- absolute URL to detail page

  title            text NOT NULL,                            -- raw title from listing
  audit_type       text,                                     -- 'financiar' | 'conformitate' | 'performanta' | 'control' | 'follow-up'
  audit_year       smallint,                                 -- year the audit covers (e.g. 2024 in "pentru anul 2024")
  doc_number       text,                                     -- "nr.27500" → "27500"
  doc_date         date,                                     -- "07.04.2026" parsed
  audited_entity_name text,                                  -- raw extracted from title after the last comma
  audited_entity_cui  text,                                  -- filled in Stage 2 via fuzzy match

  publication_date date,                                     -- from <time datetime="..."> on listing card

  pdf_url          text,                                     -- /rapoarte-audit/downloads/{id} — Stage 2
  pdf_size_bytes   bigint,                                   -- parsed from "(X,YZ MB)" — Stage 2
  pdf_path         text,                                     -- if mirrored to satra disk — Stage 2 optional
  summary          text,                                     -- first-page abstract — Stage 2 PDF parse
  findings_count   integer,                                  -- count of "constatări" — Stage 2 PDF parse

  fetched_at       timestamptz NOT NULL DEFAULT now(),
  parsed_at        timestamptz                               -- set when Stage 2 PDF parse completes
);

CREATE UNIQUE INDEX IF NOT EXISTS rapoarte_category_slug_uniq
  ON curteacont.rapoarte (category, slug);

CREATE UNIQUE INDEX IF NOT EXISTS rapoarte_download_id_uniq
  ON curteacont.rapoarte (download_id) WHERE download_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS rapoarte_audit_year_idx
  ON curteacont.rapoarte (audit_year DESC NULLS LAST);

CREATE INDEX IF NOT EXISTS rapoarte_audit_type_idx
  ON curteacont.rapoarte (audit_type);

CREATE INDEX IF NOT EXISTS rapoarte_pub_date_idx
  ON curteacont.rapoarte (publication_date DESC NULLS LAST);

CREATE INDEX IF NOT EXISTS rapoarte_audited_cui_idx
  ON curteacont.rapoarte (audited_entity_cui) WHERE audited_entity_cui IS NOT NULL;

-- Trigram index for fuzzy entity-name matching (Stage 2 needs it for CUI resolve).
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE INDEX IF NOT EXISTS rapoarte_entity_trgm_idx
  ON curteacont.rapoarte USING gin (audited_entity_name gin_trgm_ops)
  WHERE audited_entity_name IS NOT NULL;

-- ── Scrape run log ──────────────────────────────────────────────────────────
-- One row per CLI invocation. Idempotent inserts on (started_at,category).
CREATE TABLE IF NOT EXISTS curteacont.scrape_runs (
  id            bigserial PRIMARY KEY,
  category      text NOT NULL,
  started_at    timestamptz NOT NULL DEFAULT now(),
  finished_at   timestamptz,
  pages_visited integer NOT NULL DEFAULT 0,
  rows_inserted integer NOT NULL DEFAULT 0,
  rows_updated  integer NOT NULL DEFAULT 0,
  rows_skipped  integer NOT NULL DEFAULT 0,
  last_error    text,
  notes         text
);

CREATE INDEX IF NOT EXISTS scrape_runs_started_idx
  ON curteacont.scrape_runs (started_at DESC);

COMMENT ON SCHEMA curteacont IS
  'Curtea de Conturi audit reports (https://www.curteadeconturi.ro/rapoarte-audit). Stage 1 = URL+metadata harvest from listing pages. Stage 2 = detail-page resolve + PDF parse + CUI fuzzy match.';

COMMENT ON TABLE curteacont.rapoarte IS
  'One row per audit report. PK is sha1(category|slug). Numeric download_id and PDF metadata filled in Stage 2 (see CURTEACONT-PLAN.md).';