Files
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

121 lines
6.6 KiB
SQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- 035_curteacont.sql
-- Curtea de Conturi a României — Rapoarte de audit financiar / conformitate /
-- performanță / control / follow-up.
--
-- Source: https://www.curteadeconturi.ro/rapoarte-audit/{category}
-- Categories scraped:
-- - rapoarte-audit-financiar (~1,890 reports, 127 listing pages × 15)
-- - rapoarte-conformitate (~2,580 reports, 173 pages × 15)
-- - rapoarte-audit-performanta (~135 reports, 9 pages × 15)
-- Approximate total: ~4,600 reports, growing weekly with new audits.
--
-- Detail page exposes a single PDF download link of the form
-- `/rapoarte-audit/downloads/{integer_id}` (verified IDs: 4078, 7335, 7854,
-- 10653, 12418, 13832, 14183 — sequential, predictable).
--
-- Stage 1 (this file + scrape-curteacont.ts):
-- - Walks listing pages, harvests slug URLs + titles + publication dates +
-- audit_type + audited entity name (parsed from title).
-- - DOES NOT fetch detail pages or download PDFs (that is Stage 2 — see
-- CURTEACONT-PLAN.md for the 15-25h roadmap).
--
-- Stage 2 (next session):
-- - For each row with NULL pdf_url, fetch detail page → extract
-- /downloads/{id} numeric PDF ID + file size.
-- - Optionally download PDF to satra disk under /opt/vreaudigital/data/cdc/.
-- - Run pdfminer/pdftotext against first 3 pages → extract structured
-- summary, findings_count, key amounts.
-- - Fuzzy-match audited_entity_name against firms.entities.denumire (lib
-- curatat already exists at services/seap-scraper/src/matching/) → fill
-- audited_entity_cui.
--
-- PRIMARY KEY:
-- slug_id = sha1(category || '|' || slug). The numeric download ID is NULL
-- until Stage 2 resolves it from the detail page. We keep it nullable + add
-- a separate UNIQUE constraint when discovered.
--
-- Cross-source value (recipe drafts in CURTEACONT-PLAN.md):
-- 1. "Autorități audited de N ori în 5 ani" — repeat-audit risk score.
-- 2. "Spitale audited POST SEAP award" — paralelă cu CNAS cross-source.
-- 3. "Rapoarte follow-up" — semnal că auditul anterior n-a fost remediat.
CREATE SCHEMA IF NOT EXISTS curteacont;
-- ── Rapoarte de audit ───────────────────────────────────────────────────────
-- One row per audit report listed by Curtea de Conturi. Source of truth is
-- the listing page slug; numeric download_id (PDF) is filled in Stage 2.
CREATE TABLE IF NOT EXISTS curteacont.rapoarte (
slug_id char(40) PRIMARY KEY, -- sha1(category|slug)
download_id integer, -- /downloads/{id}, filled in Stage 2
category text NOT NULL, -- 'rapoarte-audit-financiar' | 'rapoarte-conformitate' | 'rapoarte-audit-performanta'
slug text NOT NULL, -- last URL segment, unique within category
detail_url text NOT NULL, -- absolute URL to detail page
title text NOT NULL, -- raw title from listing
audit_type text, -- 'financiar' | 'conformitate' | 'performanta' | 'control' | 'follow-up'
audit_year smallint, -- year the audit covers (e.g. 2024 in "pentru anul 2024")
doc_number text, -- "nr.27500" → "27500"
doc_date date, -- "07.04.2026" parsed
audited_entity_name text, -- raw extracted from title after the last comma
audited_entity_cui text, -- filled in Stage 2 via fuzzy match
publication_date date, -- from <time datetime="..."> on listing card
pdf_url text, -- /rapoarte-audit/downloads/{id} — Stage 2
pdf_size_bytes bigint, -- parsed from "(X,YZ MB)" — Stage 2
pdf_path text, -- if mirrored to satra disk — Stage 2 optional
summary text, -- first-page abstract — Stage 2 PDF parse
findings_count integer, -- count of "constatări" — Stage 2 PDF parse
fetched_at timestamptz NOT NULL DEFAULT now(),
parsed_at timestamptz -- set when Stage 2 PDF parse completes
);
CREATE UNIQUE INDEX IF NOT EXISTS rapoarte_category_slug_uniq
ON curteacont.rapoarte (category, slug);
CREATE UNIQUE INDEX IF NOT EXISTS rapoarte_download_id_uniq
ON curteacont.rapoarte (download_id) WHERE download_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS rapoarte_audit_year_idx
ON curteacont.rapoarte (audit_year DESC NULLS LAST);
CREATE INDEX IF NOT EXISTS rapoarte_audit_type_idx
ON curteacont.rapoarte (audit_type);
CREATE INDEX IF NOT EXISTS rapoarte_pub_date_idx
ON curteacont.rapoarte (publication_date DESC NULLS LAST);
CREATE INDEX IF NOT EXISTS rapoarte_audited_cui_idx
ON curteacont.rapoarte (audited_entity_cui) WHERE audited_entity_cui IS NOT NULL;
-- Trigram index for fuzzy entity-name matching (Stage 2 needs it for CUI resolve).
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE INDEX IF NOT EXISTS rapoarte_entity_trgm_idx
ON curteacont.rapoarte USING gin (audited_entity_name gin_trgm_ops)
WHERE audited_entity_name IS NOT NULL;
-- ── Scrape run log ──────────────────────────────────────────────────────────
-- One row per CLI invocation. Idempotent inserts on (started_at,category).
CREATE TABLE IF NOT EXISTS curteacont.scrape_runs (
id bigserial PRIMARY KEY,
category text NOT NULL,
started_at timestamptz NOT NULL DEFAULT now(),
finished_at timestamptz,
pages_visited integer NOT NULL DEFAULT 0,
rows_inserted integer NOT NULL DEFAULT 0,
rows_updated integer NOT NULL DEFAULT 0,
rows_skipped integer NOT NULL DEFAULT 0,
last_error text,
notes text
);
CREATE INDEX IF NOT EXISTS scrape_runs_started_idx
ON curteacont.scrape_runs (started_at DESC);
COMMENT ON SCHEMA curteacont IS
'Curtea de Conturi audit reports (https://www.curteadeconturi.ro/rapoarte-audit). Stage 1 = URL+metadata harvest from listing pages. Stage 2 = detail-page resolve + PDF parse + CUI fuzzy match.';
COMMENT ON TABLE curteacont.rapoarte IS
'One row per audit report. PK is sha1(category|slug). Numeric download_id and PDF metadata filled in Stage 2 (see CURTEACONT-PLAN.md).';