a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
121 lines
6.6 KiB
SQL
121 lines
6.6 KiB
SQL
-- 035_curteacont.sql
|
||
-- Curtea de Conturi a României — Rapoarte de audit financiar / conformitate /
|
||
-- performanță / control / follow-up.
|
||
--
|
||
-- Source: https://www.curteadeconturi.ro/rapoarte-audit/{category}
|
||
-- Categories scraped:
|
||
-- - rapoarte-audit-financiar (~1,890 reports, 127 listing pages × 15)
|
||
-- - rapoarte-conformitate (~2,580 reports, 173 pages × 15)
|
||
-- - rapoarte-audit-performanta (~135 reports, 9 pages × 15)
|
||
-- Approximate total: ~4,600 reports, growing weekly with new audits.
|
||
--
|
||
-- Detail page exposes a single PDF download link of the form
|
||
-- `/rapoarte-audit/downloads/{integer_id}` (verified IDs: 4078, 7335, 7854,
|
||
-- 10653, 12418, 13832, 14183 — sequential, predictable).
|
||
--
|
||
-- Stage 1 (this file + scrape-curteacont.ts):
|
||
-- - Walks listing pages, harvests slug URLs + titles + publication dates +
|
||
-- audit_type + audited entity name (parsed from title).
|
||
-- - DOES NOT fetch detail pages or download PDFs (that is Stage 2 — see
|
||
-- CURTEACONT-PLAN.md for the 15-25h roadmap).
|
||
--
|
||
-- Stage 2 (next session):
|
||
-- - For each row with NULL pdf_url, fetch detail page → extract
|
||
-- /downloads/{id} numeric PDF ID + file size.
|
||
-- - Optionally download PDF to satra disk under /opt/vreaudigital/data/cdc/.
|
||
-- - Run pdfminer/pdftotext against first 3 pages → extract structured
|
||
-- summary, findings_count, key amounts.
|
||
-- - Fuzzy-match audited_entity_name against firms.entities.denumire (lib
|
||
-- curatat already exists at services/seap-scraper/src/matching/) → fill
|
||
-- audited_entity_cui.
|
||
--
|
||
-- PRIMARY KEY:
|
||
-- slug_id = sha1(category || '|' || slug). The numeric download ID is NULL
|
||
-- until Stage 2 resolves it from the detail page. We keep it nullable + add
|
||
-- a separate UNIQUE constraint when discovered.
|
||
--
|
||
-- Cross-source value (recipe drafts in CURTEACONT-PLAN.md):
|
||
-- 1. "Autorități audited de N ori în 5 ani" — repeat-audit risk score.
|
||
-- 2. "Spitale audited POST SEAP award" — paralelă cu CNAS cross-source.
|
||
-- 3. "Rapoarte follow-up" — semnal că auditul anterior n-a fost remediat.
|
||
|
||
CREATE SCHEMA IF NOT EXISTS curteacont;
|
||
|
||
-- ── Rapoarte de audit ───────────────────────────────────────────────────────
|
||
-- One row per audit report listed by Curtea de Conturi. Source of truth is
|
||
-- the listing page slug; numeric download_id (PDF) is filled in Stage 2.
|
||
CREATE TABLE IF NOT EXISTS curteacont.rapoarte (
|
||
slug_id char(40) PRIMARY KEY, -- sha1(category|slug)
|
||
download_id integer, -- /downloads/{id}, filled in Stage 2
|
||
category text NOT NULL, -- 'rapoarte-audit-financiar' | 'rapoarte-conformitate' | 'rapoarte-audit-performanta'
|
||
slug text NOT NULL, -- last URL segment, unique within category
|
||
detail_url text NOT NULL, -- absolute URL to detail page
|
||
|
||
title text NOT NULL, -- raw title from listing
|
||
audit_type text, -- 'financiar' | 'conformitate' | 'performanta' | 'control' | 'follow-up'
|
||
audit_year smallint, -- year the audit covers (e.g. 2024 in "pentru anul 2024")
|
||
doc_number text, -- "nr.27500" → "27500"
|
||
doc_date date, -- "07.04.2026" parsed
|
||
audited_entity_name text, -- raw extracted from title after the last comma
|
||
audited_entity_cui text, -- filled in Stage 2 via fuzzy match
|
||
|
||
publication_date date, -- from <time datetime="..."> on listing card
|
||
|
||
pdf_url text, -- /rapoarte-audit/downloads/{id} — Stage 2
|
||
pdf_size_bytes bigint, -- parsed from "(X,YZ MB)" — Stage 2
|
||
pdf_path text, -- if mirrored to satra disk — Stage 2 optional
|
||
summary text, -- first-page abstract — Stage 2 PDF parse
|
||
findings_count integer, -- count of "constatări" — Stage 2 PDF parse
|
||
|
||
fetched_at timestamptz NOT NULL DEFAULT now(),
|
||
parsed_at timestamptz -- set when Stage 2 PDF parse completes
|
||
);
|
||
|
||
CREATE UNIQUE INDEX IF NOT EXISTS rapoarte_category_slug_uniq
|
||
ON curteacont.rapoarte (category, slug);
|
||
|
||
CREATE UNIQUE INDEX IF NOT EXISTS rapoarte_download_id_uniq
|
||
ON curteacont.rapoarte (download_id) WHERE download_id IS NOT NULL;
|
||
|
||
CREATE INDEX IF NOT EXISTS rapoarte_audit_year_idx
|
||
ON curteacont.rapoarte (audit_year DESC NULLS LAST);
|
||
|
||
CREATE INDEX IF NOT EXISTS rapoarte_audit_type_idx
|
||
ON curteacont.rapoarte (audit_type);
|
||
|
||
CREATE INDEX IF NOT EXISTS rapoarte_pub_date_idx
|
||
ON curteacont.rapoarte (publication_date DESC NULLS LAST);
|
||
|
||
CREATE INDEX IF NOT EXISTS rapoarte_audited_cui_idx
|
||
ON curteacont.rapoarte (audited_entity_cui) WHERE audited_entity_cui IS NOT NULL;
|
||
|
||
-- Trigram index for fuzzy entity-name matching (Stage 2 needs it for CUI resolve).
|
||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||
CREATE INDEX IF NOT EXISTS rapoarte_entity_trgm_idx
|
||
ON curteacont.rapoarte USING gin (audited_entity_name gin_trgm_ops)
|
||
WHERE audited_entity_name IS NOT NULL;
|
||
|
||
-- ── Scrape run log ──────────────────────────────────────────────────────────
|
||
-- One row per CLI invocation. Idempotent inserts on (started_at,category).
|
||
CREATE TABLE IF NOT EXISTS curteacont.scrape_runs (
|
||
id bigserial PRIMARY KEY,
|
||
category text NOT NULL,
|
||
started_at timestamptz NOT NULL DEFAULT now(),
|
||
finished_at timestamptz,
|
||
pages_visited integer NOT NULL DEFAULT 0,
|
||
rows_inserted integer NOT NULL DEFAULT 0,
|
||
rows_updated integer NOT NULL DEFAULT 0,
|
||
rows_skipped integer NOT NULL DEFAULT 0,
|
||
last_error text,
|
||
notes text
|
||
);
|
||
|
||
CREATE INDEX IF NOT EXISTS scrape_runs_started_idx
|
||
ON curteacont.scrape_runs (started_at DESC);
|
||
|
||
COMMENT ON SCHEMA curteacont IS
|
||
'Curtea de Conturi audit reports (https://www.curteadeconturi.ro/rapoarte-audit). Stage 1 = URL+metadata harvest from listing pages. Stage 2 = detail-page resolve + PDF parse + CUI fuzzy match.';
|
||
|
||
COMMENT ON TABLE curteacont.rapoarte IS
|
||
'One row per audit report. PK is sha1(category|slug). Numeric download_id and PDF metadata filled in Stage 2 (see CURTEACONT-PLAN.md).';
|