initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
@@ -0,0 +1,525 @@
+"""
+ANAF datornici (persoane juridice) — live scraper.
+
+Source: https://www.anaf.ro/restante/ (JSF/PrimeFaces, JCaptcha image).
+NOT Cloudflare Turnstile (initial assumption was wrong, confirmed via probe).
+
+Mechanism (per probe 2026-05-12):
+  1. GET /restante/ → extract `javax.faces.ViewState` + session cookies
+  2. GET /restante/kaptcha.jpg (same session)
+  3. POST kaptcha image to 2captcha (~$0.0005) → get 5-char text token
+  4. POST /restante/index.xhtml with captcha + form fields → first page of data
+  5. AJAX PrimeFaces pagination POSTs for subsequent pages (no new captcha)
+  6. Parse <tr data-ri=N> rows, extract 24 cells per row, UPSERT to anaf.datornici
+
+Site shows CURRENT QUARTER ONLY (no historical access). Each quarterly run
+captures one snapshot. Historical pre-2026-Q1 is permanently lost — we keep
+the 2016-Q1 data.gov.ro snapshot already in DB.
+
+Env vars:
+  TWOCAPTCHA_KEY     — required (image solver)
+  DATABASE_URL       — postgres conn string (Prisma-style ?schema= stripped)
+  DRY_RUN=1          — parse plan, no captcha, no DB writes
+  ROWS_PER_PAGE=1000 — pagination chunk size (default 1000; reduce if PrimeFaces times out)
+  ANAF_DATORNICI_LOG — log path (default stderr)
+"""
+
+from __future__ import annotations
+
+import base64
+import html
+import io
+import logging
+import os
+import re
+import sys
+import time
+from dataclasses import dataclass, field
+from datetime import date
+from typing import Any
+
+import psycopg2
+import psycopg2.extras
+import requests
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Logging
+
+LOG_FILE = os.environ.get("ANAF_DATORNICI_LOG", "")
+_handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
+if LOG_FILE:
+    try:
+        _handlers.append(logging.FileHandler(LOG_FILE))
+    except OSError:
+        pass
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=_handlers,
+)
+log = logging.getLogger("anaf_datornici")
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Constants
+
+# ANAF endpoints, all derived from the same base path.
+BASE = "https://www.anaf.ro/restante"
+INDEX_PAGE = f"{BASE}/"
+INDEX_FORM = f"{BASE}/index.xhtml"
+KAPTCHA_URL = f"{BASE}/kaptcha.jpg"
+# Sent as the User-Agent header on every ANAF request.
+USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
+)
+# Passed as `timeout=` to every HTTP call in this module.
+TIMEOUT = 60
+
+# 2captcha endpoints: in.php accepts the image, res.php is polled for the answer
+# and also receives bad-solve reports.
+TWOCAPTCHA_IN = "https://2captcha.com/in.php"
+TWOCAPTCHA_RES = "https://2captcha.com/res.php"
+TWOCAPTCHA_POLL_INTERVAL = 5  # seconds
+TWOCAPTCHA_MAX_POLL = 36       # 36 * 5s = 180s
+TWOCAPTCHA_MAX_ATTEMPTS = 3    # captcha solve retries on wrong-text
+TWOCAPTCHA_REPORT = True       # report bad solves for credit refund
+
+# Read once at import time; a non-integer ROWS_PER_PAGE raises ValueError here.
+DEFAULT_ROWS_PER_PAGE = int(os.environ.get("ROWS_PER_PAGE", "1000"))
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Quarter math
+
+def parse_publication_date(html: str) -> tuple[date, str]:
+    """Extract 'Obligații fiscale restante la data de DD.MM.YYYY' from page."""
+    m = re.search(r"data\s+de\s+(\d{2})\.(\d{2})\.(\d{4})", html, re.IGNORECASE)
+    if not m:
+        raise RuntimeError("Cannot parse publication_date from page HTML")
+    d, mo, y = int(m.group(1)), int(m.group(2)), int(m.group(3))
+    pub_date = date(y, mo, d)
+    # Map publication_date → quarter label.
+    # Convention: pub at end-of-quarter (31 Mar = T1, 30 Jun = T2, 30 Sep = T3, 31 Dec = T4).
+    q = (mo - 1) // 3 + 1
+    period_label = f"T{q} {y}"
+    return pub_date, period_label
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 2captcha image solver
+
+def solve_kaptcha(api_key: str, image: bytes, *, attempt: int = 1) -> tuple[str, str]:
+    """Submit JPEG to 2captcha, poll for text. Returns (token, captcha_id)."""
+    b64 = base64.b64encode(image).decode()
+    r = requests.post(
+        TWOCAPTCHA_IN,
+        data={
+            "key": api_key,
+            "method": "base64",
+            "body": b64,
+            "json": "1",
+            "numeric": "0",      # any chars
+            "min_len": "4",
+            "max_len": "8",
+            "language": "2",     # any language
+            "regsense": "1",     # case-sensitive
+        },
+        timeout=TIMEOUT,
+    )
+    r.raise_for_status()
+    j = r.json()
+    if j.get("status") != 1:
+        raise RuntimeError(f"2captcha in.php error: {j}")
+    cid = j["request"]
+    log.info(f"2captcha attempt {attempt}: id={cid}, polling…")
+
+    for poll in range(TWOCAPTCHA_MAX_POLL):
+        time.sleep(TWOCAPTCHA_POLL_INTERVAL)
+        rr = requests.get(
+            TWOCAPTCHA_RES,
+            params={"key": api_key, "action": "get", "id": cid, "json": "1"},
+            timeout=TIMEOUT,
+        )
+        jj = rr.json()
+        if jj.get("status") == 1:
+            return jj["request"], cid
+        if jj.get("request") in ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY"):
+            continue
+        raise RuntimeError(f"2captcha res.php error: {jj}")
+    raise RuntimeError(f"2captcha timeout after {TWOCAPTCHA_MAX_POLL*TWOCAPTCHA_POLL_INTERVAL}s")
+
+
+def report_bad_solve(api_key: str, cid: str) -> None:
+    """Report wrong solve to 2captcha for credit refund."""
+    try:
+        requests.get(
+            TWOCAPTCHA_RES,
+            params={"key": api_key, "action": "reportbad", "id": cid},
+            timeout=TIMEOUT,
+        )
+    except Exception:
+        pass
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Session / pagination
+
+@dataclass
+class AnafSession:
+    api_key: str
+    s: requests.Session = field(default_factory=requests.Session)
+    viewstate: str = ""
+    publication_date: date | None = None
+    period_label: str = ""
+    total_records: int = 0
+
+    def __post_init__(self):
+        self.s.headers.update({"User-Agent": USER_AGENT})
+
+    def bootstrap(self) -> None:
+        """GET initial page, extract ViewState + session cookies."""
+        log.info(f"GET {INDEX_PAGE}")
+        r = self.s.get(INDEX_PAGE, timeout=TIMEOUT)
+        r.raise_for_status()
+        m = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', r.text)
+        if not m:
+            raise RuntimeError("No ViewState in initial page")
+        self.viewstate = m.group(1)
+        log.info(f"viewstate fetched ({len(self.viewstate)} chars)")
+
+    def get_kaptcha(self) -> bytes:
+        r = self.s.get(
+            KAPTCHA_URL, timeout=TIMEOUT, headers={"Referer": INDEX_PAGE}
+        )
+        r.raise_for_status()
+        if not r.content.startswith(b"\xff\xd8\xff"):
+            raise RuntimeError("kaptcha response not JPEG")
+        return r.content
+
+    def submit_initial(self, captcha_text: str, rows_per_page: int) -> str:
+        """POST form with captcha → first page of data (HTML)."""
+        log.info(f"POST {INDEX_FORM} (captcha={captcha_text!r}, rows={rows_per_page})")
+        r = self.s.post(
+            INDEX_FORM,
+            data={
+                "form": "form",
+                "form:inputc": captcha_text,
+                "form:searchdata": "",
+                "form:submit": "",
+                "form_SUBMIT": "1",
+                "javax.faces.ViewState": self.viewstate,
+            },
+            headers={
+                "Referer": INDEX_PAGE,
+                "Origin": "https://www.anaf.ro",
+                "User-Agent": USER_AGENT,
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "ro,en;q=0.9",
+                "Content-Type": "application/x-www-form-urlencoded",
+            },
+            timeout=TIMEOUT,
+        )
+        r.raise_for_status()
+        # Refresh ViewState (JSF rotates it on each interaction)
+        m = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', r.text)
+        if m:
+            self.viewstate = m.group(1)
+        # Detect captcha-error case
+        if "Cod de validare gresit" in r.text or "incorect" in r.text.lower()[:5000]:
+            raise CaptchaWrong(r.text)
+        # Extract publication date + total
+        try:
+            self.publication_date, self.period_label = parse_publication_date(r.text)
+            log.info(f"publication_date={self.publication_date} period={self.period_label}")
+        except RuntimeError:
+            log.warning("could not parse publication_date — using today's quarter")
+            today = date.today()
+            self.publication_date = today
+            self.period_label = f"T{(today.month - 1) // 3 + 1} {today.year}"
+        m = re.search(r"\((\d+)\s+of\s+(\d+)\)", r.text)
+        if m:
+            self.total_records = int(m.group(2)) * 16  # pages * rows-per-page-default
+            log.info(f"total_records estimate (from paginator): ~{self.total_records}")
+        return r.text
+
+    def fetch_page(self, first: int, rows_per_page: int) -> str:
+        """AJAX PrimeFaces pagination POST. Returns partial response XML."""
+        r = self.s.post(
+            INDEX_FORM,
+            data={
+                "javax.faces.partial.ajax": "true",
+                "javax.faces.source": "form:dataTable",
+                "javax.faces.partial.execute": "form:dataTable",
+                "javax.faces.partial.render": "form:dataTable",
+                "form:dataTable": "form:dataTable",
+                "form:dataTable_pagination": "true",
+                "form:dataTable_first": str(first),
+                "form:dataTable_rows": str(rows_per_page),
+                "form:dataTable_encodeFeature": "true",
+                "form": "form",
+                "form:inputc": "",
+                "form:searchdata": "",
+                "javax.faces.ViewState": self.viewstate,
+            },
+            headers={
+                "Referer": INDEX_FORM,
+                "Origin": "https://www.anaf.ro",
+                "User-Agent": USER_AGENT,
+                "Accept": "application/xml,text/xml,*/*;q=0.01",
+                "Accept-Language": "ro,en;q=0.9",
+                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+                "X-Requested-With": "XMLHttpRequest",
+                "Faces-Request": "partial/ajax",
+            },
+            timeout=TIMEOUT,
+        )
+        r.raise_for_status()
+        # Update ViewState from partial response
+        m = re.search(r'<update id="[^"]*javax\.faces\.ViewState[^"]*"><!\[CDATA\[([^\]]+)\]\]>', r.text)
+        if m:
+            self.viewstate = m.group(1)
+        return r.text
+
+
+class CaptchaWrong(Exception):
+    pass
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Row parsing
+
+def parse_rows(html_or_partial: str) -> list[dict[str, Any]]:
+    """Extract debtor rows from initial HTML or AJAX partial response.
+
+    Row layout (24 cells observed via probe 2026-05-12):
+       0: nr_crt
+       1: name (denumire debitor)
+       2: CIF (cui)
+       3: total bugetul de stat
+       4: total asigurări sociale
+       5: total șomaj
+       6: total sănătate
+       7-10: state {principal, accesorii, necontestate, contestate}
+       11-14: social {principal, accesorii, necontestate, contestate}
+       15-18: unemployment {principal, accesorii, necontestate, contestate}
+       19-22: health {principal, accesorii, necontestate, contestate}
+       23: observation/status (e.g. "Faliment")
+    """
+    rows: list[dict[str, Any]] = []
+    # Match each <tr ... data-ri="N">…</tr>
+    for tr_m in re.finditer(
+        r'<tr\b[^>]*data-ri="(\d+)"[^>]*>(.*?)</tr>',
+        html_or_partial, re.DOTALL,
+    ):
+        body = tr_m.group(2)
+        cells = re.findall(r"<td\b[^>]*>(.*?)</td>", body, re.DOTALL)
+        if len(cells) < 24:
+            continue
+        def _txt(s: str) -> str:
+            t = re.sub(r"<[^>]+>", "", s)
+            return re.sub(r"\s+", " ", t).strip()
+        def _num(s: str) -> float:
+            t = _txt(s).replace(".", "").replace(",", ".")
+            try:
+                return float(t)
+            except ValueError:
+                return 0.0
+        rows.append({
+            "nr_crt": _txt(cells[0]),
+            "name": _txt(cells[1]),
+            "cui": _txt(cells[2]),
+            "budget_state_total": _num(cells[3]),
+            "budget_social_total": _num(cells[4]),
+            "budget_unemployment_total": _num(cells[5]),
+            "budget_health_total": _num(cells[6]),
+            "state_principal":      _num(cells[7]),
+            "state_penalty":        _num(cells[8]),
+            "state_necontestate":   _num(cells[9]),
+            "state_contestate":     _num(cells[10]),
+            "social_principal":     _num(cells[11]),
+            "social_penalty":       _num(cells[12]),
+            "social_necontestate":  _num(cells[13]),
+            "social_contestate":    _num(cells[14]),
+            "unemp_principal":      _num(cells[15]),
+            "unemp_penalty":        _num(cells[16]),
+            "unemp_necontestate":   _num(cells[17]),
+            "unemp_contestate":     _num(cells[18]),
+            "health_principal":     _num(cells[19]),
+            "health_penalty":       _num(cells[20]),
+            "health_necontestate":  _num(cells[21]),
+            "health_contestate":    _num(cells[22]),
+            "observation":          _txt(cells[23]),
+        })
+    return rows
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# DB UPSERT
+
+def upsert_rows(
+    conn,
+    rows: list[dict[str, Any]],
+    publication_date: date,
+    period_label: str,
+    debtor_category: str = "persoane_juridice",
+) -> int:
+    if not rows:
+        return 0
+    source_url = INDEX_PAGE
+    # debt_total per row = sum of 4 category totals
+    payload = [(
+        r["cui"].replace(" ", "").upper().lstrip("RO"),
+        r["name"],
+        None,                  # judet not provided by ANAF /restante/
+        publication_date,
+        period_label,
+        debtor_category,
+        # debt_total = sum of 4 category totals
+        r["budget_state_total"] + r["budget_social_total"]
+            + r["budget_unemployment_total"] + r["budget_health_total"],
+        # principal across categories
+        r["state_principal"] + r["social_principal"]
+            + r["unemp_principal"] + r["health_principal"],
+        # penalty across categories
+        r["state_penalty"] + r["social_penalty"]
+            + r["unemp_penalty"] + r["health_penalty"],
+        # contestate across categories
+        r["state_contestate"] + r["social_contestate"]
+            + r["unemp_contestate"] + r["health_contestate"],
+        # per-budget detail (12 columns)
+        r["state_principal"], r["state_penalty"], r["state_contestate"],
+        r["social_principal"], r["social_penalty"], r["social_contestate"],
+        r["unemp_principal"], r["unemp_penalty"], r["unemp_contestate"],
+        r["health_principal"], r["health_penalty"], r["health_contestate"],
+        source_url,
+    ) for r in rows if r["cui"]]
+
+    sql = """
+        INSERT INTO anaf.datornici (
+            cui, name, judet, publication_date, period_label, debtor_category,
+            debt_total, debt_principal, debt_penalty, debt_contested,
+            budget_state_principal, budget_state_penalty, budget_state_contested,
+            budget_social_principal, budget_social_penalty, budget_social_contested,
+            budget_unemployment_principal, budget_unemployment_penalty, budget_unemployment_contested,
+            budget_health_principal, budget_health_penalty, budget_health_contested,
+            source_url
+        ) VALUES %s
+        ON CONFLICT (cui, publication_date)
+        DO UPDATE SET
+            name = EXCLUDED.name,
+            debt_total = EXCLUDED.debt_total,
+            debt_principal = EXCLUDED.debt_principal,
+            debt_penalty = EXCLUDED.debt_penalty,
+            debt_contested = EXCLUDED.debt_contested,
+            budget_state_principal = EXCLUDED.budget_state_principal,
+            budget_state_penalty = EXCLUDED.budget_state_penalty,
+            budget_state_contested = EXCLUDED.budget_state_contested,
+            budget_social_principal = EXCLUDED.budget_social_principal,
+            budget_social_penalty = EXCLUDED.budget_social_penalty,
+            budget_social_contested = EXCLUDED.budget_social_contested,
+            budget_unemployment_principal = EXCLUDED.budget_unemployment_principal,
+            budget_unemployment_penalty = EXCLUDED.budget_unemployment_penalty,
+            budget_unemployment_contested = EXCLUDED.budget_unemployment_contested,
+            budget_health_principal = EXCLUDED.budget_health_principal,
+            budget_health_penalty = EXCLUDED.budget_health_penalty,
+            budget_health_contested = EXCLUDED.budget_health_contested,
+            fetched_at = now()
+    """
+    with conn.cursor() as cur:
+        psycopg2.extras.execute_values(cur, sql, payload, page_size=500)
+    conn.commit()
+    return len(payload)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Orchestration
+
+def run(*, dry_run: bool, rows_per_page: int) -> dict[str, int]:
+    api_key = os.environ.get("TWOCAPTCHA_KEY", "")
+    if not api_key and not dry_run:
+        raise RuntimeError("Missing TWOCAPTCHA_KEY env var — see HANDOFF doc")
+
+    if dry_run:
+        log.info("DRY_RUN=1 — connecting only to validate config, no captcha solve")
+        sess = AnafSession(api_key="")
+        sess.bootstrap()
+        log.info(f"bootstrap OK, viewstate captured ({len(sess.viewstate)} chars)")
+        log.info(f"would solve 1 captcha (~$0.001 worst case) then paginate {rows_per_page} rows/page")
+        return {"datornici_inserted": 0, "errors": 0}
+
+    db_url = os.environ.get("DATABASE_URL", "")
+    if not db_url:
+        raise RuntimeError("Missing DATABASE_URL env var")
+    db_url = re.sub(r"[?&]schema=[^&]*", "", db_url)
+    db_url = re.sub(r"\?$", "", db_url)
+    conn = psycopg2.connect(db_url)
+    conn.autocommit = False
+
+    sess = AnafSession(api_key=api_key)
+
+    # Captcha solve with retries (wrong-text bounce)
+    last_cid: str | None = None
+    for attempt in range(1, TWOCAPTCHA_MAX_ATTEMPTS + 1):
+        sess.bootstrap()
+        image = sess.get_kaptcha()
+        token, cid = solve_kaptcha(api_key, image, attempt=attempt)
+        last_cid = cid
+        try:
+            initial_html = sess.submit_initial(token, rows_per_page)
+            log.info(f"captcha accepted on attempt {attempt}")
+            break
+        except CaptchaWrong:
+            log.warning(f"captcha rejected by ANAF on attempt {attempt}, retrying")
+            if TWOCAPTCHA_REPORT and last_cid:
+                report_bad_solve(api_key, last_cid)
+            if attempt == TWOCAPTCHA_MAX_ATTEMPTS:
+                raise RuntimeError("captcha solve failed after retries")
+
+    # Initial page rows
+    all_rows = parse_rows(initial_html)
+    log.info(f"page 1: {len(all_rows)} rows")
+
+    # Paginate
+    # Discover total via paginator markup. Default page count is 16/page;
+    # if we set rows_per_page>16, total_records estimate may be wrong.
+    # Just iterate until parse_rows returns empty.
+    first = len(all_rows)
+    page_num = 2
+    while True:
+        try:
+            partial = sess.fetch_page(first=first, rows_per_page=rows_per_page)
+            new_rows = parse_rows(partial)
+            if not new_rows:
+                log.info(f"pagination exhausted at first={first}")
+                break
+            all_rows.extend(new_rows)
+            log.info(f"page {page_num}: {len(new_rows)} rows (running total: {len(all_rows)})")
+            first += len(new_rows)
+            page_num += 1
+        except Exception as e:
+            log.error(f"pagination error at page {page_num}: {e}")
+            break
+
+    log.info(f"total rows collected: {len(all_rows)}")
+    if not sess.publication_date:
+        raise RuntimeError("No publication_date captured")
+
+    inserted = upsert_rows(
+        conn, all_rows,
+        publication_date=sess.publication_date,
+        period_label=sess.period_label,
+    )
+    log.info(f"upserted {inserted} rows into anaf.datornici for {sess.period_label}")
+
+    conn.close()
+    return {"datornici_inserted": inserted, "errors": 0}
+
+
+def main():
+    dry_run = os.environ.get("DRY_RUN", "0") == "1"
+    rows_per_page = int(os.environ.get("ROWS_PER_PAGE", str(DEFAULT_ROWS_PER_PAGE)))
+    log.info(f"=== ANAF datornici scrape: dry_run={dry_run} rows_per_page={rows_per_page} ===")
+    try:
+        result = run(dry_run=dry_run, rows_per_page=rows_per_page)
+    except Exception as e:
+        log.error(f"FATAL: {e}", exc_info=True)
+        sys.exit(1)
+    log.info(f"DONE {result}")
+
+
+# Run the scraper only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()