initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
@@ -0,0 +1,115 @@
+-- 037_gnm.sql
+-- GNM — Garda Națională de Mediu.
+-- Public press releases of environmental enforcement actions scraped from
+-- gnm.ro/feed/ (WordPress RSS, ~358 items / 36 pages).
+--
+-- Investigation summary (2026-05-10):
+--   • The institution publishes only AGGREGATE statistics (per-month / per-judet
+--     totals) in their monthly synthesis PDFs (sinteza_<luna>_<an>.pdf) and the
+--     annual activity report (raport_activitate_<an>.pdf). NO per-firm registry
+--     is published with CUIs and individual fine amounts.
+--   • The only place where individual violators are named is in press releases
+--     ("comunicate de presă"). Even there:
+--       – Most releases reference "doi operatori", "șapte operatori în patru
+--         județe" without naming firms.
+--       – When firms are named (e.g. Petrobrazi, Vega, Lukoil refineries), the
+--         individual amount is rarely broken down — they receive a collective
+--         "€340,000 în ultimul an" figure.
+--       – CUIs are NEVER published; we must fuzzy-match on company name +
+--         judet via cui_matcher (Stage B of the pipeline).
+--   • data.gov.ro has 0 GNM datasets; ANPM publishes IPPC/SEVESO inventories
+--     (which we ingest separately) but no fines.
+--
+-- Conclusion: this is a partial / sample-quality dataset. We capture every
+-- press release as gnm.comunicate, then run a regex extractor to surface
+-- candidate (company, fine_lei, fapta) tuples into gnm.amenzi_extrase. The
+-- coverage will be ~5-15% of total GNM enforcement activity (estimated 5K
+-- fines/year, of which only ~50-200 firms are named publicly per year).
+--
+-- The cross-source value remains: any firm publicly shamed by GNM that ALSO
+-- wins SEAP construction/industrial contracts is a 1st-page scandal pattern.
+-- We accept that we miss the long tail; we capture the headlines.
+
+-- Namespace for every GNM object created by this migration. IF NOT EXISTS
+-- makes the statement safe to re-run.
+CREATE SCHEMA IF NOT EXISTS gnm;
+
+-- ── 1. Press releases (one row per gnm.ro article) ─────────────────────────
+-- Holds the press releases ("comunicate") published by GNM. The table serves
+-- two purposes: a raw archive that can be re-processed if the interpretation
+-- rules change, and the parent table for the extracted violator rows.
+CREATE TABLE IF NOT EXISTS gnm.comunicate (
+  id               bigserial PRIMARY KEY,
+  -- WordPress GUID (stable post id); unique per article.
+  guid             text NOT NULL UNIQUE,
+  url              text NOT NULL,
+  titlu            text NOT NULL,
+  -- pubDate from the RSS item.
+  publicat_la      timestamptz,
+  -- dc:creator from the RSS item.
+  autor            text,
+  -- Article categories, e.g. {COMUNICATE DE PRESĂ, NOUTĂȚI}.
+  categorii        text[],
+  -- Raw content:encoded payload.
+  continut_html    text,
+  -- HTML-stripped, line-collapsed plain text.
+  continut_text    text,
+  -- True when the title/body matches /amenz|sancțiun|sistare|confiscat/i.
+  -- NOTE(review): the pattern as originally documented listed `sancțiun`
+  -- twice; presumably one alternative was meant to be the legacy cedilla
+  -- spelling `sancţiun` — confirm against the scraper code.
+  is_enforcement   boolean NOT NULL DEFAULT false,
+  -- Sum of fines mentioned in the article (best-effort).
+  total_amenzi_lei numeric,
+  -- sha1(continut_text), used for change detection.
+  raw_hash         text NOT NULL,
+  fetched_at       timestamptz NOT NULL DEFAULT now()
+);
+
+-- Articles ordered by publication date, newest first.
+CREATE INDEX IF NOT EXISTS idx_gnm_com_publicat
+  ON gnm.comunicate (publicat_la DESC);
+
+-- Partial index: only rows flagged is_enforcement are indexed.
+-- NOTE(review): the key column is constant within this partial index; confirm
+-- whether a more selective key (e.g. publicat_la) was intended.
+CREATE INDEX IF NOT EXISTS idx_gnm_com_enforcement
+  ON gnm.comunicate (is_enforcement)
+  WHERE is_enforcement;
+
+-- Largest article-level fine totals first; rows without a total sort last.
+CREATE INDEX IF NOT EXISTS idx_gnm_com_total_amenzi
+  ON gnm.comunicate (total_amenzi_lei DESC NULLS LAST);
+
+-- Catalog description shown by psql \dt+ and schema browsers.
+COMMENT ON TABLE gnm.comunicate
+  IS 'GNM press releases (gnm.ro/feed/). Source-of-truth raw archive. Articles flagged is_enforcement when text mentions fines/sanctions; gnm.amenzi_extrase populated by NLP-light extractor.';
+
+-- ── 2. Extracted violator records ──────────────────────────────────────────
+-- Each row is one (article × candidate firm) pair produced by the regex/NER
+-- pass. Enforcement articles usually name 0-3 firms; some name none at all and
+-- only refer to violators collectively ("operatori industriali din Prahova").
+--
+-- Stage B fills contravenient_cui by fuzzy-matching contravenient_name plus the
+-- judet hint against firms.cui_lookup; a score ≥ 0.85 is considered acceptable.
+CREATE TABLE IF NOT EXISTS gnm.amenzi_extrase (
+  id                      bigserial PRIMARY KEY,
+  -- Parent press release; deleting it also deletes its extracted rows.
+  comunicat_id            bigint NOT NULL REFERENCES gnm.comunicate(id) ON DELETE CASCADE,
+  -- Raw mention as written in the article (e.g. "Rafinăria Petrobrazi").
+  contravenient_name      text NOT NULL,
+  -- Output of firms.normalize_company_name(); NULL until Stage B runs.
+  contravenient_name_norm text,
+  -- Fuzzy-matched CUI; NULL when no match was found.
+  contravenient_cui       text,
+  -- 'direct' | 'fuzzy_name' | 'fuzzy_name_judet' | NULL
+  cui_match_method        text,
+  -- Match score in the range 0..1.
+  cui_match_score         numeric,
+  matched_at              timestamptz,
+  -- Inferred from the article title/body.
+  judet                   text,
+  -- Short violation description (extracted snippet).
+  fapta                   text,
+  -- Per-firm amount when the article gives one; NULL when only an aggregate
+  -- figure is available.
+  suma_lei                numeric,
+  -- Populated when the source quotes the amount in EUR (rare).
+  suma_eur                numeric,
+  -- True when the amount applies to more than one firm collectively.
+  suma_aggregate          boolean NOT NULL DEFAULT false,
+  -- The sentence(s) that triggered the extraction.
+  context_snippet         text NOT NULL,
+  fetched_at              timestamptz NOT NULL DEFAULT now()
+);
+
+-- Lookup by matched CUI; unmatched rows (NULL CUI) are left out of the index.
+CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_cui
+  ON gnm.amenzi_extrase (contravenient_cui)
+  WHERE contravenient_cui IS NOT NULL;
+
+-- Lookup by normalized company name.
+CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_name_norm
+  ON gnm.amenzi_extrase (contravenient_name_norm);
+
+-- Lookup by county.
+CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_judet
+  ON gnm.amenzi_extrase (judet);
+
+-- Largest per-firm amounts first; rows without an amount sort last.
+CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_suma
+  ON gnm.amenzi_extrase (suma_lei DESC NULLS LAST);
+
+-- Index on the foreign key to gnm.comunicate.
+CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_comunicat
+  ON gnm.amenzi_extrase (comunicat_id);
+
+-- Catalog description shown by psql \dt+ and schema browsers.
+COMMENT ON TABLE gnm.amenzi_extrase
+  IS 'Best-effort extracted violator tuples from gnm.comunicate. Coverage is partial — only firms named in press releases. Use gnm.comunicate.is_enforcement for full enforcement-article archive.';
+
+-- ── 3. Scrape log (mirrors anre/ancom convention) ──────────────────────────
+-- Bookkeeping table for the GNM scrapers: which scraper ran against which
+-- source, how many rows it saw/inserted/updated/skipped, how long it took,
+-- and the error text if one was recorded.
+CREATE TABLE IF NOT EXISTS gnm.scrape_log (
+  id            bigserial PRIMARY KEY,
+  -- 'rss_feed' | 'extractor'
+  scraper       text NOT NULL,
+  source_url    text NOT NULL,
+  -- Row counters; each defaults to 0.
+  rows_seen     integer NOT NULL DEFAULT 0,
+  rows_inserted integer NOT NULL DEFAULT 0,
+  rows_updated  integer NOT NULL DEFAULT 0,
+  rows_skipped  integer NOT NULL DEFAULT 0,
+  -- Presumably wall-clock run time in milliseconds — confirm against the scraper.
+  duration_ms   integer NOT NULL DEFAULT 0,
+  -- No default: the writer must supply the start time explicitly.
+  started_at    timestamptz NOT NULL,
+  -- Defaults to now() when the writer does not supply a value.
+  finished_at   timestamptz NOT NULL DEFAULT now(),
+  -- Error text; nullable.
+  error         text
+);
+
+-- Log entries ordered by start time, newest first.
+CREATE INDEX IF NOT EXISTS idx_gnm_scrape_log_started ON gnm.scrape_log(started_at DESC);
+
+-- Added for consistency: the other two gnm tables carry a catalog comment.
+COMMENT ON TABLE gnm.scrape_log IS
+  'GNM scraper log: scraper name (rss_feed or extractor), source URL, row counters, timing and error text.';