initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Claude VM
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
+115
View File
@@ -0,0 +1,115 @@
-- 037_gnm.sql
-- GNM — Garda Națională de Mediu.
-- Public press releases of environmental enforcement actions scraped from
-- gnm.ro/feed/ (WordPress RSS, ~358 items / 36 pages).
--
-- Investigation summary (2026-05-10):
-- • The institution publishes only AGGREGATE statistics (per-month / per-judet
-- totals) in their monthly synthesis PDFs (sinteza_<luna>_<an>.pdf) and the
-- annual activity report (raport_activitate_<an>.pdf). NO per-firm registry
-- is published with CUIs and individual fine amounts.
-- • The only place where individual violators are named is in press releases
-- ("comunicate de presă"). Even there:
-- Most releases reference "doi operatori", "șapte operatori în patru
-- județe" without naming firms.
-- When firms are named (e.g. Petrobrazi, Vega, Lukoil refineries), the
-- individual amount is rarely broken down — they receive a collective
-- "€340,000 în ultimul an" figure.
-- CUIs are NEVER published; we must fuzzy-match on company name +
-- judet via cui_matcher (Stage B of the pipeline).
-- • data.gov.ro has 0 GNM datasets; ANPM publishes IPPC/SEVESO inventories
-- (which we ingest separately) but no fines.
--
-- Conclusion: this is a partial / sample-quality dataset. We capture every
-- press release as gnm.communicate, then run a regex extractor to surface
-- candidate (company, fine_lei, fapta) tuples into gnm.amenzi_extrase. The
-- coverage will be ~5-15% of total GNM enforcement activity (estimated 5K
-- fines/year, of which only ~50-200 firms are named publicly per year).
--
-- The cross-source value remains: any firm publicly shamed by GNM that ALSO
-- wins SEAP construction/industrial contracts is a 1st-page scandal pattern.
-- We accept that we miss the long tail; we capture the headlines.
CREATE SCHEMA IF NOT EXISTS gnm;
-- ── 1. Press releases (one row per gnm.ro article) ─────────────────────────
-- Captures the full enforcement-related communicate published by GNM. Used
-- both as raw archive (in case interpretation rules change) and as parent
-- for extracted violator rows.
CREATE TABLE IF NOT EXISTS gnm.comunicate (
id bigserial PRIMARY KEY,
guid text NOT NULL UNIQUE, -- WordPress GUID (stable post id)
url text NOT NULL,
titlu text NOT NULL,
publicat_la timestamptz, -- pubDate from RSS
autor text, -- dc:creator
categorii text[], -- e.g. {COMUNICATE DE PRESĂ, NOUTĂȚI}
continut_html text, -- raw content:encoded
continut_text text, -- HTML-stripped, line-collapsed
is_enforcement boolean NOT NULL DEFAULT false,
-- true if title/body matches
-- /amenz|sancțiun|sancțiun|sistare|confiscat/i
total_amenzi_lei numeric, -- sum mentioned in article (best-effort)
raw_hash text NOT NULL, -- sha1(continut_text) for change detection
fetched_at timestamptz NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_gnm_com_publicat ON gnm.comunicate(publicat_la DESC);
CREATE INDEX IF NOT EXISTS idx_gnm_com_enforcement ON gnm.comunicate(is_enforcement) WHERE is_enforcement;
CREATE INDEX IF NOT EXISTS idx_gnm_com_total_amenzi ON gnm.comunicate(total_amenzi_lei DESC NULLS LAST);
COMMENT ON TABLE gnm.comunicate IS
'GNM press releases (gnm.ro/feed/). Source-of-truth raw archive. Articles flagged is_enforcement when text mentions fines/sanctions; gnm.amenzi_extrase populated by NLP-light extractor.';
-- ── 2. Extracted violator records ──────────────────────────────────────────
-- One row per (article × candidate firm) tuple identified by the regex/NER
-- pass. Most enforcement articles have 0-3 firms named; some have none
-- (collective references like "operatori industriali din Prahova").
--
-- contravenient_cui is filled by Stage B fuzzy match against firms.cui_lookup
-- using contravenient_name + judet hint. Score ≥ 0.85 is acceptable.
CREATE TABLE IF NOT EXISTS gnm.amenzi_extrase (
id bigserial PRIMARY KEY,
comunicat_id bigint NOT NULL REFERENCES gnm.comunicate(id) ON DELETE CASCADE,
contravenient_name text NOT NULL, -- raw mention (e.g. "Rafinăria Petrobrazi")
contravenient_name_norm text, -- firms.normalize_company_name(); NULL until Stage B
contravenient_cui text, -- fuzzy-matched, NULL when unmatched
cui_match_method text, -- 'direct' | 'fuzzy_name' | 'fuzzy_name_judet' | NULL
cui_match_score numeric, -- 0..1
matched_at timestamptz,
judet text, -- inferred from article title/body
fapta text, -- short violation description (extracted snippet)
suma_lei numeric, -- per-firm amount when present, NULL when only aggregate
suma_eur numeric, -- when source quotes EUR (rare)
suma_aggregate boolean NOT NULL DEFAULT false,
-- true when amount applies to >1 firm collectively
context_snippet text NOT NULL, -- the sentence(s) that triggered extraction
fetched_at timestamptz NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_cui ON gnm.amenzi_extrase(contravenient_cui)
WHERE contravenient_cui IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_name_norm ON gnm.amenzi_extrase(contravenient_name_norm);
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_judet ON gnm.amenzi_extrase(judet);
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_suma ON gnm.amenzi_extrase(suma_lei DESC NULLS LAST);
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_comunicat ON gnm.amenzi_extrase(comunicat_id);
COMMENT ON TABLE gnm.amenzi_extrase IS
'Best-effort extracted violator tuples from gnm.comunicate. Coverage is partial — only firms named in press releases. Use gnm.comunicate.is_enforcement for full enforcement-article archive.';
-- ── 3. Scrape log (mirrors anre/ancom convention) ──────────────────────────
CREATE TABLE IF NOT EXISTS gnm.scrape_log (
id bigserial PRIMARY KEY,
scraper text NOT NULL, -- 'rss_feed' | 'extractor'
source_url text NOT NULL,
rows_seen integer NOT NULL DEFAULT 0,
rows_inserted integer NOT NULL DEFAULT 0,
rows_updated integer NOT NULL DEFAULT 0,
rows_skipped integer NOT NULL DEFAULT 0,
duration_ms integer NOT NULL DEFAULT 0,
started_at timestamptz NOT NULL,
finished_at timestamptz NOT NULL DEFAULT now(),
error text
);
CREATE INDEX IF NOT EXISTS idx_gnm_scrape_log_started ON gnm.scrape_log(started_at DESC);