# vreau-digital/services/seap-scraper/scrapers/anaf_datornici/scraper.py

"""
ANAF datornici (persoane juridice) — live scraper.

Source: https://www.anaf.ro/restante/ (JSF/PrimeFaces, JCaptcha image).
NOT Cloudflare Turnstile (initial assumption was wrong, confirmed via probe).

Mechanism (per probe 2026-05-12):
  1. GET /restante/ → extract `javax.faces.ViewState` + session cookies
  2. GET /restante/kaptcha.jpg (same session)
  3. POST kaptcha image to 2captcha (~$0.0005) → get 5-char text token
  4. POST /restante/index.xhtml with captcha + form fields → first page of data
  5. AJAX PrimeFaces pagination POSTs for subsequent pages (no new captcha)
  6. Parse <tr data-ri=N> rows, extract 24 cells per row, UPSERT to anaf.datornici

Site shows CURRENT QUARTER ONLY (no historical access). Each quarterly run
captures one snapshot. Historical pre-2026-Q1 is permanently lost — we keep
the 2016-Q1 data.gov.ro snapshot already in DB.

Env vars:
  TWOCAPTCHA_KEY     — required (image solver)
  DATABASE_URL       — postgres conn string (Prisma-style ?schema= stripped)
  DRY_RUN=1          — parse plan, no captcha, no DB writes
  ROWS_PER_PAGE=1000 — pagination chunk size (default 1000; reduce if PrimeFaces times out)
  ANAF_DATORNICI_LOG — log path (default stderr)
"""

from __future__ import annotations

import base64
import html
import io
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import date
from typing import Any

import psycopg2
import psycopg2.extras
import requests

# ─────────────────────────────────────────────────────────────────────────────
# Logging

# Optional log file path; empty string means "stderr only".
LOG_FILE = os.environ.get("ANAF_DATORNICI_LOG", "")

# Always log to stderr; additionally mirror to LOG_FILE when it can be opened.
_handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
_log_file_error = None
if LOG_FILE:
    try:
        _handlers.append(logging.FileHandler(LOG_FILE))
    except OSError as exc:
        # Best-effort: an unusable log path must not stop the scrape, but the
        # failure is remembered so it can be reported once logging is set up.
        _log_file_error = exc
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_handlers,
)
log = logging.getLogger("anaf_datornici")
if _log_file_error is not None:
    log.warning(
        "cannot open log file %r (%s) — logging to stderr only",
        LOG_FILE,
        _log_file_error,
    )


# ─────────────────────────────────────────────────────────────────────────────
# Constants

# ANAF endpoints, all rooted at the /restante application.
BASE = "https://www.anaf.ro/restante"
INDEX_PAGE = BASE + "/"
INDEX_FORM = BASE + "/index.xhtml"
KAPTCHA_URL = BASE + "/kaptcha.jpg"

# Browser-like User-Agent sent with every request.
USER_AGENT = " ".join((
    "Mozilla/5.0 (X11; Linux x86_64)",
    "AppleWebKit/537.36",
    "(KHTML, like Gecko)",
    "Chrome/120.0",
    "Safari/537.36",
))

# Value passed as `timeout=` to every HTTP request (seconds).
TIMEOUT = 60

# 2captcha endpoints and solve/poll budget.
TWOCAPTCHA_IN = "https://2captcha.com/in.php"
TWOCAPTCHA_RES = "https://2captcha.com/res.php"
TWOCAPTCHA_POLL_INTERVAL = 5   # seconds between result polls
TWOCAPTCHA_MAX_POLL = 36       # 36 polls * 5s = 180s per solve
TWOCAPTCHA_MAX_ATTEMPTS = 3    # fresh captcha attempts when the text is rejected
TWOCAPTCHA_REPORT = True       # report rejected solves for credit refund

# Pagination chunk size, overridable through the ROWS_PER_PAGE env var.
DEFAULT_ROWS_PER_PAGE = int(os.getenv("ROWS_PER_PAGE", "1000"))


# ─────────────────────────────────────────────────────────────────────────────
# Quarter math

def parse_publication_date(html: str) -> tuple[date, str]:
    """Extract the publication date and its quarter label from the page.

    Looks for the first "data de DD.MM.YYYY" occurrence (case-insensitive),
    as in 'Obligații fiscale restante la data de DD.MM.YYYY'.

    Args:
        html: Page HTML returned by the ANAF form submission.

    Returns:
        ``(publication_date, period_label)`` where the label is
        ``"T<quarter> <year>"``, e.g. ``"T1 2026"``.

    Raises:
        RuntimeError: If no date is found, or the matched digits do not form
            a valid calendar date.
    """
    m = re.search(r"data\s+de\s+(\d{2})\.(\d{2})\.(\d{4})", html, re.IGNORECASE)
    if not m:
        raise RuntimeError("Cannot parse publication_date from page HTML")
    day, month, year = (int(part) for part in m.groups())
    try:
        pub_date = date(year, month, day)
    except ValueError as exc:
        # Surface impossible dates (e.g. 31.02) as the same error type as
        # "not found", so callers need to handle only RuntimeError.
        raise RuntimeError(
            f"Invalid publication_date {m.group(0)!r} in page HTML"
        ) from exc
    # Map publication_date → quarter label.
    # Convention: pub at end-of-quarter (31 Mar = T1, 30 Jun = T2, 30 Sep = T3, 31 Dec = T4).
    quarter = (month - 1) // 3 + 1
    return pub_date, f"T{quarter} {year}"


# ─────────────────────────────────────────────────────────────────────────────
# 2captcha image solver

def solve_kaptcha(api_key: str, image: bytes, *, attempt: int = 1) -> tuple[str, str]:
    """Submit a captcha image to 2captcha and poll until it is solved.

    Args:
        api_key: 2captcha account key.
        image: Raw image bytes; uploaded base64-encoded.
        attempt: 1-based attempt number, used only in log output.

    Returns:
        ``(token, captcha_id)`` — the solved text and the 2captcha request id
        (the id is what a later bad-solve report refers to).

    Raises:
        requests.RequestException: If the upload request itself fails.
        RuntimeError: If 2captcha rejects the upload, reports a solve error,
            or no answer arrives within the polling budget.
    """
    b64 = base64.b64encode(image).decode()
    r = requests.post(
        TWOCAPTCHA_IN,
        data={
            "key": api_key,
            "method": "base64",
            "body": b64,
            "json": "1",
            "numeric": "0",      # any chars
            "min_len": "4",
            "max_len": "8",
            "language": "2",     # any language
            "regsense": "1",     # case-sensitive
        },
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    j = r.json()
    if j.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {j}")
    cid = j["request"]
    log.info(f"2captcha attempt {attempt}: id={cid}, polling…")

    for _ in range(TWOCAPTCHA_MAX_POLL):
        time.sleep(TWOCAPTCHA_POLL_INTERVAL)
        try:
            rr = requests.get(
                TWOCAPTCHA_RES,
                params={"key": api_key, "action": "get", "id": cid, "json": "1"},
                timeout=TIMEOUT,
            )
            rr.raise_for_status()
            jj = rr.json()
        except (requests.RequestException, ValueError) as exc:
            # The solve is already submitted (and paid for); a single failed
            # or non-JSON poll should not throw it away. The attempt still
            # counts against the polling budget.
            log.warning(f"2captcha poll failed for id={cid}: {exc}; retrying")
            continue
        if jj.get("status") == 1:
            return jj["request"], cid
        # Both spellings are accepted for the "still working" answer.
        if jj.get("request") in ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY"):
            continue
        raise RuntimeError(f"2captcha res.php error: {jj}")
    raise RuntimeError(f"2captcha timeout after {TWOCAPTCHA_MAX_POLL*TWOCAPTCHA_POLL_INTERVAL}s")


def report_bad_solve(api_key: str, cid: str) -> None:
    """Report a wrong solve to 2captcha for credit refund.

    Best-effort: a failed report is logged and otherwise ignored, so it can
    never interrupt the scrape.

    Args:
        api_key: 2captcha account key.
        cid: 2captcha request id of the solve being reported.
    """
    try:
        requests.get(
            TWOCAPTCHA_RES,
            params={"key": api_key, "action": "reportbad", "id": cid},
            timeout=TIMEOUT,
        )
    except requests.RequestException as exc:
        log.warning(f"2captcha reportbad failed for id={cid}: {exc}")


# ─────────────────────────────────────────────────────────────────────────────
# Session / pagination

@dataclass
class AnafSession:
    """One scraping session against the ANAF /restante/ JSF application.

    Keeps the HTTP session (cookies) together with the current
    ``javax.faces.ViewState`` token, which is sent back on every POST and
    replaced whenever a response carries a new one.
    """

    api_key: str
    # HTTP session shared by all requests of this scrape.
    s: requests.Session = field(default_factory=requests.Session)
    # Most recent javax.faces.ViewState token seen in a response.
    viewstate: str = ""
    # Set by submit_initial().
    publication_date: date | None = None
    period_label: str = ""
    # Rough estimate from the paginator text; this class only logs it.
    total_records: int = 0

    def __post_init__(self):
        self.s.headers.update({"User-Agent": USER_AGENT})

    @staticmethod
    def _find_viewstate(page: str) -> str | None:
        """Return the ViewState hidden-input value from full-page HTML, if any."""
        m = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', page)
        return m.group(1) if m else None

    def bootstrap(self) -> None:
        """GET the index page and capture ViewState (cookies land in ``self.s``).

        Raises:
            requests.HTTPError: On an HTTP error status.
            RuntimeError: If the page contains no ViewState input.
        """
        log.info(f"GET {INDEX_PAGE}")
        r = self.s.get(INDEX_PAGE, timeout=TIMEOUT)
        r.raise_for_status()
        token = self._find_viewstate(r.text)
        if not token:
            raise RuntimeError("No ViewState in initial page")
        self.viewstate = token
        log.info(f"viewstate fetched ({len(self.viewstate)} chars)")

    def get_kaptcha(self) -> bytes:
        """Download the captcha image within the current session.

        Raises:
            requests.HTTPError: On an HTTP error status.
            RuntimeError: If the body does not start with the JPEG magic bytes.
        """
        r = self.s.get(
            KAPTCHA_URL, timeout=TIMEOUT, headers={"Referer": INDEX_PAGE}
        )
        r.raise_for_status()
        if not r.content.startswith(b"\xff\xd8\xff"):
            raise RuntimeError("kaptcha response not JPEG")
        return r.content

    def submit_initial(self, captcha_text: str, rows_per_page: int) -> str:
        """POST the search form with the captcha text and return the result HTML.

        Side effects: refreshes ``viewstate`` and sets ``publication_date``,
        ``period_label`` and (when the paginator text is present)
        ``total_records``.

        Args:
            captcha_text: Solved captcha text.
            rows_per_page: Only logged here; it is not part of the form data.

        Raises:
            requests.HTTPError: On an HTTP error status.
            CaptchaWrong: If the response looks like a captcha rejection.
        """
        log.info(f"POST {INDEX_FORM} (captcha={captcha_text!r}, rows={rows_per_page})")
        r = self.s.post(
            INDEX_FORM,
            data={
                "form": "form",
                "form:inputc": captcha_text,
                "form:searchdata": "",
                "form:submit": "",
                "form_SUBMIT": "1",
                "javax.faces.ViewState": self.viewstate,
            },
            headers={
                "Referer": INDEX_PAGE,
                "Origin": "https://www.anaf.ro",
                "User-Agent": USER_AGENT,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ro,en;q=0.9",
                "Content-Type": "application/x-www-form-urlencoded",
            },
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        # Bind the decoded body once instead of re-reading r.text per check.
        page = r.text
        # Refresh ViewState (JSF rotates it on each interaction)
        token = self._find_viewstate(page)
        if token:
            self.viewstate = token
        # Detect captcha-error case. Only the head of the page is searched for
        # the generic "incorect" marker; slice first so the whole (large)
        # result page is not lowercased.
        if "Cod de validare gresit" in page or "incorect" in page[:5000].lower():
            raise CaptchaWrong(page)
        # Extract publication date + total
        try:
            pub_date, label = parse_publication_date(page)
        except (RuntimeError, ValueError):
            # ValueError covers digits that match the pattern but are not a
            # real calendar date; treat it like "not found".
            log.warning("could not parse publication_date — using today's quarter")
            today = date.today()
            self.publication_date = today
            self.period_label = f"T{(today.month - 1) // 3 + 1} {today.year}"
        else:
            self.publication_date, self.period_label = pub_date, label
            log.info(f"publication_date={self.publication_date} period={self.period_label}")
        m = re.search(r"\((\d+)\s+of\s+(\d+)\)", page)
        if m:
            # Assumes the paginator reads "(page of pages)" at the default of
            # 16 rows per page — an estimate only.
            self.total_records = int(m.group(2)) * 16
            log.info(f"total_records estimate (from paginator): ~{self.total_records}")
        return page

    def fetch_page(self, first: int, rows_per_page: int) -> str:
        """AJAX PrimeFaces pagination POST; returns the partial-response XML.

        Args:
            first: Zero-based offset of the first row to fetch.
            rows_per_page: Number of rows requested.

        Raises:
            requests.HTTPError: On an HTTP error status.
        """
        r = self.s.post(
            INDEX_FORM,
            data={
                "javax.faces.partial.ajax": "true",
                "javax.faces.source": "form:dataTable",
                "javax.faces.partial.execute": "form:dataTable",
                "javax.faces.partial.render": "form:dataTable",
                "form:dataTable": "form:dataTable",
                "form:dataTable_pagination": "true",
                "form:dataTable_first": str(first),
                "form:dataTable_rows": str(rows_per_page),
                "form:dataTable_encodeFeature": "true",
                "form": "form",
                "form:inputc": "",
                "form:searchdata": "",
                "javax.faces.ViewState": self.viewstate,
            },
            headers={
                "Referer": INDEX_FORM,
                "Origin": "https://www.anaf.ro",
                "User-Agent": USER_AGENT,
                "Accept": "application/xml,text/xml,*/*;q=0.01",
                "Accept-Language": "ro,en;q=0.9",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "X-Requested-With": "XMLHttpRequest",
                "Faces-Request": "partial/ajax",
            },
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        partial = r.text
        # Update ViewState from partial response (it arrives as an <update>
        # element wrapping a CDATA section, not as a hidden input).
        m = re.search(r'<update id="[^"]*javax\.faces\.ViewState[^"]*"><!\[CDATA\[([^\]]+)\]\]>', partial)
        if m:
            self.viewstate = m.group(1)
        return partial


class CaptchaWrong(Exception):
    """Signals that the submitted captcha text was rejected by the site."""


# ─────────────────────────────────────────────────────────────────────────────
# Row parsing

# Patterns compiled once rather than per row/cell.
_ROW_RE = re.compile(r'<tr\b[^>]*data-ri="(\d+)"[^>]*>(.*?)</tr>', re.DOTALL)
_CELL_RE = re.compile(r"<td\b[^>]*>(.*?)</td>", re.DOTALL)
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")

# Keys for cells 3..22, in column order.
_NUMERIC_FIELDS = (
    "budget_state_total",
    "budget_social_total",
    "budget_unemployment_total",
    "budget_health_total",
    "state_principal", "state_penalty", "state_necontestate", "state_contestate",
    "social_principal", "social_penalty", "social_necontestate", "social_contestate",
    "unemp_principal", "unemp_penalty", "unemp_necontestate", "unemp_contestate",
    "health_principal", "health_penalty", "health_necontestate", "health_contestate",
)


def _cell_text(fragment: str) -> str:
    """Return a cell's visible text: tags removed, entities decoded, whitespace collapsed."""
    # Strip tags before decoding so an escaped "&lt;b&gt;" is kept as text.
    text = html.unescape(_TAG_RE.sub("", fragment))
    return _WS_RE.sub(" ", text).strip()


def _cell_number(fragment: str) -> float:
    """Parse a Romanian-formatted amount ("1.234,56"); unparseable or empty → 0.0."""
    text = _cell_text(fragment).replace(".", "").replace(",", ".")
    try:
        return float(text)
    except ValueError:
        return 0.0


def parse_rows(html_or_partial: str) -> list[dict[str, Any]]:
    """Extract debtor rows from initial HTML or AJAX partial response.

    Every ``<tr ... data-ri="N">`` with at least 24 ``<td>`` cells yields one
    dict; rows with fewer cells are skipped. Text cells have HTML entities
    decoded (``&amp;`` → ``&``).

    Row layout (24 cells observed via probe 2026-05-12):
       0: nr_crt
       1: name (debtor name)
       2: CIF (cui)
       3: total state budget
       4: total social insurance
       5: total unemployment
       6: total health
       7-10: state {principal, accesorii, necontestate, contestate}
       11-14: social {principal, accesorii, necontestate, contestate}
       15-18: unemployment {principal, accesorii, necontestate, contestate}
       19-22: health {principal, accesorii, necontestate, contestate}
       23: observation/status (e.g. "Faliment")

    Returns:
        One dict per row with keys ``nr_crt``, ``name``, ``cui``, the twenty
        numeric fields (floats) and ``observation``, in that order.
    """
    rows: list[dict[str, Any]] = []
    for row_match in _ROW_RE.finditer(html_or_partial):
        cells = _CELL_RE.findall(row_match.group(2))
        if len(cells) < 24:
            continue
        row: dict[str, Any] = {
            "nr_crt": _cell_text(cells[0]),
            "name": _cell_text(cells[1]),
            "cui": _cell_text(cells[2]),
        }
        row.update(zip(_NUMERIC_FIELDS, map(_cell_number, cells[3:23])))
        row["observation"] = _cell_text(cells[23])
        rows.append(row)
    return rows


# ─────────────────────────────────────────────────────────────────────────────
# DB UPSERT

def _normalize_cui(raw: str) -> str:
    """Return the CUI without spaces, uppercased, minus one leading "RO" prefix."""
    cui = raw.replace(" ", "").upper()
    # str.lstrip("RO") would strip any run of 'R'/'O' characters, not the
    # two-letter prefix; remove the prefix explicitly.
    if cui.startswith("RO"):
        cui = cui[2:]
    return cui


def upsert_rows(
    conn,
    rows: list[dict[str, Any]],
    publication_date: date,
    period_label: str,
    debtor_category: str = "persoane_juridice",
) -> int:
    """Insert or update parsed debtor rows in ``anaf.datornici``.

    Rows are keyed on ``(cui, publication_date)``. Rows whose CUI is empty
    after normalisation are skipped; when the same CUI occurs more than once
    the last occurrence wins.

    Args:
        conn: Open psycopg2 connection. The function commits on success and
            rolls back before re-raising on failure.
        rows: Dicts as produced by ``parse_rows``.
        publication_date: Snapshot date stored on every row.
        period_label: Quarter label stored on every row.
        debtor_category: Category label stored on every row.

    Returns:
        Number of rows sent to the database (after skipping/deduplication).
    """
    if not rows:
        return 0

    # One INSERT ... ON CONFLICT DO UPDATE statement cannot touch the same
    # target row twice, so collapse duplicate CUIs (e.g. overlapping pages)
    # before sending the batch.
    by_cui: dict[str, tuple] = {}
    for r in rows:
        cui = _normalize_cui(r["cui"])
        if not cui:
            continue
        by_cui[cui] = (
            cui,
            r["name"],
            None,                  # judet not provided by ANAF /restante/
            publication_date,
            period_label,
            debtor_category,
            # debt_total = sum of 4 category totals
            r["budget_state_total"] + r["budget_social_total"]
                + r["budget_unemployment_total"] + r["budget_health_total"],
            # principal across categories
            r["state_principal"] + r["social_principal"]
                + r["unemp_principal"] + r["health_principal"],
            # penalty across categories
            r["state_penalty"] + r["social_penalty"]
                + r["unemp_penalty"] + r["health_penalty"],
            # contestate across categories
            r["state_contestate"] + r["social_contestate"]
                + r["unemp_contestate"] + r["health_contestate"],
            # per-budget detail (12 columns)
            r["state_principal"], r["state_penalty"], r["state_contestate"],
            r["social_principal"], r["social_penalty"], r["social_contestate"],
            r["unemp_principal"], r["unemp_penalty"], r["unemp_contestate"],
            r["health_principal"], r["health_penalty"], r["health_contestate"],
            INDEX_PAGE,            # source_url
        )
    payload = list(by_cui.values())
    if not payload:
        return 0

    sql = """
        INSERT INTO anaf.datornici (
            cui, name, judet, publication_date, period_label, debtor_category,
            debt_total, debt_principal, debt_penalty, debt_contested,
            budget_state_principal, budget_state_penalty, budget_state_contested,
            budget_social_principal, budget_social_penalty, budget_social_contested,
            budget_unemployment_principal, budget_unemployment_penalty, budget_unemployment_contested,
            budget_health_principal, budget_health_penalty, budget_health_contested,
            source_url
        ) VALUES %s
        ON CONFLICT (cui, publication_date)
        DO UPDATE SET
            name = EXCLUDED.name,
            debt_total = EXCLUDED.debt_total,
            debt_principal = EXCLUDED.debt_principal,
            debt_penalty = EXCLUDED.debt_penalty,
            debt_contested = EXCLUDED.debt_contested,
            budget_state_principal = EXCLUDED.budget_state_principal,
            budget_state_penalty = EXCLUDED.budget_state_penalty,
            budget_state_contested = EXCLUDED.budget_state_contested,
            budget_social_principal = EXCLUDED.budget_social_principal,
            budget_social_penalty = EXCLUDED.budget_social_penalty,
            budget_social_contested = EXCLUDED.budget_social_contested,
            budget_unemployment_principal = EXCLUDED.budget_unemployment_principal,
            budget_unemployment_penalty = EXCLUDED.budget_unemployment_penalty,
            budget_unemployment_contested = EXCLUDED.budget_unemployment_contested,
            budget_health_principal = EXCLUDED.budget_health_principal,
            budget_health_penalty = EXCLUDED.budget_health_penalty,
            budget_health_contested = EXCLUDED.budget_health_contested,
            fetched_at = now()
    """
    try:
        with conn.cursor() as cur:
            psycopg2.extras.execute_values(cur, sql, payload, page_size=500)
        conn.commit()
    except Exception:
        # Leave the connection usable (no aborted transaction) for the caller.
        conn.rollback()
        raise
    return len(payload)


# ─────────────────────────────────────────────────────────────────────────────
# Orchestration

def _strip_prisma_schema(db_url: str) -> str:
    """Remove the Prisma-only ``schema=`` query parameter from a connection URL."""
    # Keep the separator that preceded `schema=` and swallow the one that
    # followed it, so "?schema=x&sslmode=y" becomes "?sslmode=y" (not "&sslmode=y").
    db_url = re.sub(r"([?&])schema=[^&]*&?", r"\1", db_url)
    # Drop a separator left dangling at the end ("...?" or "...&").
    return re.sub(r"[?&]$", "", db_url)


def run(*, dry_run: bool, rows_per_page: int) -> dict[str, int]:
    """Run one scrape: solve the captcha, page through all rows, upsert them.

    Args:
        dry_run: When true, only fetch the index page to validate
            connectivity; no captcha is solved and the database is not touched.
        rows_per_page: Page size requested for the AJAX pagination calls.

    Returns:
        ``{"datornici_inserted": <rows upserted>, "errors": <count>}``.
        ``errors`` is non-zero when pagination stopped early; rows collected
        up to that point are still upserted.

    Raises:
        RuntimeError: Missing TWOCAPTCHA_KEY / DATABASE_URL, captcha rejected
            on every attempt, or no publication date captured.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY", "")
    if not api_key and not dry_run:
        raise RuntimeError("Missing TWOCAPTCHA_KEY env var — see HANDOFF doc")

    if dry_run:
        log.info("DRY_RUN=1 — connecting only to validate config, no captcha solve")
        sess = AnafSession(api_key="")
        sess.bootstrap()
        log.info(f"bootstrap OK, viewstate captured ({len(sess.viewstate)} chars)")
        log.info(f"would solve 1 captcha (~$0.001 worst case) then paginate {rows_per_page} rows/page")
        return {"datornici_inserted": 0, "errors": 0}

    db_url = os.environ.get("DATABASE_URL", "")
    if not db_url:
        raise RuntimeError("Missing DATABASE_URL env var")
    # Connect before solving the captcha so a bad DB config fails for free.
    conn = psycopg2.connect(_strip_prisma_schema(db_url))
    try:
        conn.autocommit = False

        sess = AnafSession(api_key=api_key)

        # Captcha solve with retries (wrong-text bounce). Each attempt starts
        # from a fresh page load and a fresh captcha image.
        initial_html = ""
        for attempt in range(1, TWOCAPTCHA_MAX_ATTEMPTS + 1):
            sess.bootstrap()
            image = sess.get_kaptcha()
            token, cid = solve_kaptcha(api_key, image, attempt=attempt)
            try:
                initial_html = sess.submit_initial(token, rows_per_page)
            except CaptchaWrong:
                log.warning(f"captcha rejected by ANAF on attempt {attempt}, retrying")
                if TWOCAPTCHA_REPORT and cid:
                    report_bad_solve(api_key, cid)
                continue
            log.info(f"captcha accepted on attempt {attempt}")
            break
        else:
            raise RuntimeError("captcha solve failed after retries")

        # Initial page rows
        all_rows = parse_rows(initial_html)
        log.info(f"page 1: {len(all_rows)} rows")

        # Paginate until a page comes back empty. The paginator-based total is
        # only an estimate, so it is not used as a stop condition.
        errors = 0
        first = len(all_rows)
        page_num = 2
        prev_rows = all_rows
        while True:
            try:
                partial = sess.fetch_page(first=first, rows_per_page=rows_per_page)
                new_rows = parse_rows(partial)
            except Exception as e:
                # Best-effort: keep what was collected, but report the failure
                # through the returned error count.
                log.error(f"pagination error at page {page_num}: {e}")
                errors += 1
                break
            if not new_rows:
                log.info(f"pagination exhausted at first={first}")
                break
            if new_rows == prev_rows:
                # The server returned the previous page again (offset ignored);
                # without this guard the loop would never end.
                log.error(f"pagination stalled at page {page_num} (first={first}): page repeated")
                errors += 1
                break
            all_rows.extend(new_rows)
            log.info(f"page {page_num}: {len(new_rows)} rows (running total: {len(all_rows)})")
            first += len(new_rows)
            page_num += 1
            prev_rows = new_rows

        log.info(f"total rows collected: {len(all_rows)}")
        if not sess.publication_date:
            raise RuntimeError("No publication_date captured")

        inserted = upsert_rows(
            conn, all_rows,
            publication_date=sess.publication_date,
            period_label=sess.period_label,
        )
        log.info(f"upserted {inserted} rows into anaf.datornici for {sess.period_label}")
        return {"datornici_inserted": inserted, "errors": errors}
    finally:
        # Close the connection on every path, including failures.
        conn.close()


def main():
    """Script entry point: read DRY_RUN / ROWS_PER_PAGE, run the scrape, log the outcome.

    Exits the process with status 1 if the run raises.
    """
    env = os.environ
    is_dry = env.get("DRY_RUN", "0") == "1"
    page_size = int(env.get("ROWS_PER_PAGE", str(DEFAULT_ROWS_PER_PAGE)))
    log.info(f"=== ANAF datornici scrape: dry_run={is_dry} rows_per_page={page_size} ===")
    try:
        outcome = run(dry_run=is_dry, rows_per_page=page_size)
    except Exception as exc:
        # Top-level boundary: log with traceback, then signal failure.
        log.error(f"FATAL: {exc}", exc_info=True)
        sys.exit(1)
    else:
        log.info(f"DONE {outcome}")


if __name__ == "__main__":
    main()