initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
@@ -0,0 +1,212 @@
|
||||
"""
|
||||
ANAF /restante/ probe — discovers actual mechanism.
|
||||
|
||||
Steps:
|
||||
1. GET /restante/ → extract javax.faces.ViewState, session cookie
|
||||
2. GET kaptcha.jpg (same session)
|
||||
3. POST kaptcha image to 2captcha → get text solution
|
||||
4. POST /restante/index.xhtml with captcha + form fields → get response
|
||||
5. Print: response HTML structure, table shape, pagination markers, quarter
|
||||
selector evidence
|
||||
|
||||
Used ONCE to understand the page before committing to a full scraper rewrite.
|
||||
Spends ~$0.001 of 2captcha credit.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
BASE = "https://www.anaf.ro/restante"
|
||||
INDEX_URL = f"{BASE}/index.xhtml"
|
||||
KAPTCHA_URL = f"{BASE}/kaptcha.jpg"
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
|
||||
)
|
||||
TIMEOUT = 30
|
||||
|
||||
TWOCAPTCHA_IN = "https://2captcha.com/in.php"
|
||||
TWOCAPTCHA_RES = "https://2captcha.com/res.php"
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Emit one ``[probe]``-prefixed diagnostic line on stderr and flush it."""
    sys.stderr.write(f"[probe] {msg}\n")
    sys.stderr.flush()
|
||||
|
||||
|
||||
def get_initial(session: requests.Session) -> tuple[str, str]:
    """Load the /restante/ landing page and pull out its JSF ViewState.

    Returns ``(html, viewstate)``. Raises ``RuntimeError`` when the page has
    no hidden ``javax.faces.ViewState`` field; HTTP errors propagate from
    ``raise_for_status()``.
    """
    landing_url = f"{BASE}/"
    log(f"GET {landing_url}")
    response = session.get(landing_url, timeout=TIMEOUT)
    response.raise_for_status()
    page = response.text

    found = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', page)
    if found is None:
        raise RuntimeError("No ViewState found")
    token = found.group(1)

    # Only a short prefix of the token is logged.
    log(f"viewstate={token[:24]}…")
    log(f"cookies after GET: {list(session.cookies.keys())}")
    return page, token
|
||||
|
||||
|
||||
def get_kaptcha(session: requests.Session) -> bytes:
    """Download the captcha image through ``session`` and return its raw bytes.

    A payload without the JPEG magic bytes only produces a warning log line;
    the bytes are returned regardless.
    """
    log(f"GET {KAPTCHA_URL}")
    response = session.get(
        KAPTCHA_URL,
        headers={"Referer": f"{BASE}/"},
        timeout=TIMEOUT,
    )
    response.raise_for_status()
    payload = response.content

    looks_like_jpeg = payload.startswith(b"\xff\xd8\xff")
    if not looks_like_jpeg:
        log(f"WARN: kaptcha response not JPEG (first bytes: {payload[:10]!r})")
    log(f"kaptcha bytes: {len(payload)} (jpg)")
    return payload
|
||||
|
||||
|
||||
def solve_kaptcha(api_key: str, image: bytes) -> str:
    """Send the captcha image to 2captcha and poll until a text answer arrives.

    Returns the solved text. Raises ``RuntimeError`` when 2captcha rejects
    the submission, reports an error while polling, or gives no answer
    within 30 polls spaced 5 seconds apart.
    """
    submit_form = {
        "key": api_key,
        "method": "base64",
        "body": base64.b64encode(image).decode(),
        "json": "1",
        # Hints for the 2captcha workers: short alphanumeric text.
        "numeric": "0",   # 0 = any chars allowed
        "min_len": "4",
        "max_len": "8",
        "language": "2",  # 2 = any language (alphanumeric)
        "regsense": "1",  # case-sensitive ON
    }
    log("POST 2captcha in.php with image…")
    submission = requests.post(TWOCAPTCHA_IN, data=submit_form, timeout=TIMEOUT)
    submission.raise_for_status()
    ack = submission.json()
    if ack.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {ack}")
    job_id = ack["request"]
    log(f"2captcha job id={job_id}, polling…")

    poll_query = {"key": api_key, "action": "get", "id": job_id, "json": "1"}
    # Both spellings of the "not ready" marker are accepted.
    pending_markers = ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY")
    for poll_no in range(1, 31):  # 30 * 5s = 150s cap
        time.sleep(5)
        answer = requests.get(TWOCAPTCHA_RES, params=poll_query, timeout=TIMEOUT).json()
        if answer.get("status") == 1:
            solved = answer["request"]
            log(f"2captcha solved: {solved!r}")
            return solved
        if answer.get("request") not in pending_markers:
            raise RuntimeError(f"2captcha res.php error: {answer}")
        log(f" poll {poll_no}: not ready")
    raise RuntimeError("2captcha timeout 150s")
|
||||
|
||||
|
||||
def post_search(session: requests.Session, viewstate: str, captcha: str, search: str = "") -> requests.Response:
    """Submit the search form with the solved captcha and return the raw response.

    ``search`` defaults to an empty string. The HTTP status is logged but not
    checked here; the caller receives the response as-is.
    """
    log(f"POST {INDEX_URL} captcha={captcha!r} search={search!r}")

    form_fields = {
        "form": "form",
        "form:inputc": captcha,
        "form:searchdata": search,
        "form:submit": "",  # button submit
        "form_SUBMIT": "1",
        "javax.faces.ViewState": viewstate,
    }
    request_headers = {
        "Referer": f"{BASE}/",
        "Origin": "https://www.anaf.ro",
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "ro,en;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded",
    }
    reply = session.post(
        INDEX_URL,
        data=form_fields,
        headers=request_headers,
        timeout=TIMEOUT,
    )
    log(f"POST status={reply.status_code} bytes={len(reply.text)} content-type={reply.headers.get('content-type')}")
    return reply
|
||||
|
||||
|
||||
def analyze_response(html: str) -> None:
    """Log heuristic signals found in the search response HTML.

    Nothing is returned; every finding goes through ``log``. The checks are
    plain regex/substring scans: error wording, table and row counts,
    DataTable/paginator markers, 6-10 digit numbers, and total / date /
    export phrases, followed by a text preview of the ``<body>``.
    """
    lowered = html.lower()
    log("=" * 70)
    log("RESPONSE ANALYSIS:")

    # Captcha / validation error wording.
    if any(word in lowered for word in ("incorect", "invalid", "gresit")):
        for hit in re.finditer(r".{40}(?:incorect|invalid|gresit).{80}", html, re.IGNORECASE):
            log(f" ERR phrase: {hit.group(0)!r}")

    # Table and row tag counts.
    table_tags = re.findall(r"<table[^>]*>", html, re.IGNORECASE)
    log(f" <table> count: {len(table_tags)}")
    row_tags = re.findall(r"<tr[^>]*>", html, re.IGNORECASE)
    log(f" <tr> count: {len(row_tags)}")

    # PrimeFaces DataTable marker.
    if "ui-datatable" in html:
        log(" PrimeFaces DataTable detected")

    # Rows-per-page hint.
    rows_attr = re.search(r'rows="?(\d+)"?', html)
    if rows_attr:
        log(f" rows attr: {rows_attr.group(1)}")

    # Pagination evidence.
    if "ui-paginator" in html or "paginator" in lowered:
        log(" Pagination control present")

    # Numbers that could be CUI/CIF values.
    candidates = re.findall(r"\b\d{6,10}\b", html)
    log(f" numeric strings 6-10 digits: {len(candidates)} (possible CUIs)")
    if candidates:
        log(f" samples: {candidates[:10]}")

    # Total count, quarter/publication date, and export wording — scanned
    # in that order, each match logged under its label.
    phrase_scans = (
        ("total", r"(?:total|înregistrări|inregistrari|rezultate)[^<>]{0,60}"),
        ("date", r"(?:trim|trimestru|publicat)[^<>]{0,80}"),
        ("export", r"(?:export|descarc|csv|xls)[^<>]{0,40}"),
    )
    for label, pattern in phrase_scans:
        for hit in re.finditer(pattern, html, re.IGNORECASE):
            log(f" {label} phrase: {hit.group(0)!r}")

    # Tag-stripped, whitespace-collapsed preview of the body (first 500 chars).
    body_match = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL | re.IGNORECASE)
    if body_match:
        preview = re.sub(r"<[^>]+>", " ", body_match.group(1))
        preview = re.sub(r"\s+", " ", preview).strip()
        log(f" body text preview: {preview[:500]!r}")
|
||||
|
||||
|
||||
def main():
    """Run the one-shot probe: fetch page, solve captcha, submit, analyze.

    Requires the ``TWOCAPTCHA_KEY`` environment variable; exits with status 1
    when it is missing. The captcha image and the response HTML are saved
    under /tmp for manual inspection.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY")
    if not api_key:
        print("Missing TWOCAPTCHA_KEY env var", file=sys.stderr)
        sys.exit(1)

    s = requests.Session()
    s.headers.update({"User-Agent": USER_AGENT})

    # Only the ViewState is needed here; the landing-page HTML is discarded.
    _, viewstate = get_initial(s)
    image = get_kaptcha(s)

    # Save image locally for debugging
    with open("/tmp/probe_kaptcha.jpg", "wb") as f:
        f.write(image)
    log("kaptcha image saved /tmp/probe_kaptcha.jpg")

    captcha_text = solve_kaptcha(api_key, image)

    r = post_search(s, viewstate, captcha_text, search="")
    # Explicit UTF-8: the page contains Romanian diacritics, and the locale's
    # default encoding may be unable to represent them.
    with open("/tmp/probe_response.html", "w", encoding="utf-8") as f:
        f.write(r.text)
    log("response saved /tmp/probe_response.html")

    analyze_response(r.text)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,525 @@
|
||||
"""
|
||||
ANAF datornici (persoane juridice) — live scraper.
|
||||
|
||||
Source: https://www.anaf.ro/restante/ (JSF/PrimeFaces, JCaptcha image).
|
||||
NOT Cloudflare Turnstile (initial assumption was wrong, confirmed via probe).
|
||||
|
||||
Mechanism (per probe 2026-05-12):
|
||||
1. GET /restante/ → extract `javax.faces.ViewState` + session cookies
|
||||
2. GET /restante/kaptcha.jpg (same session)
|
||||
3. POST kaptcha image to 2captcha (~$0.0005) → get 5-char text token
|
||||
4. POST /restante/index.xhtml with captcha + form fields → first page of data
|
||||
5. AJAX PrimeFaces pagination POSTs for subsequent pages (no new captcha)
|
||||
6. Parse <tr data-ri=N> rows, extract 24 cells per row, UPSERT to anaf.datornici
|
||||
|
||||
Site shows CURRENT QUARTER ONLY (no historical access). Each quarterly run
|
||||
captures one snapshot. Historical pre-2026-Q1 is permanently lost — we keep
|
||||
the 2016-Q1 data.gov.ro snapshot already in DB.
|
||||
|
||||
Env vars:
|
||||
TWOCAPTCHA_KEY — required (image solver)
|
||||
DATABASE_URL — postgres conn string (Prisma-style ?schema= stripped)
|
||||
DRY_RUN=1 — parse plan, no captcha, no DB writes
|
||||
ROWS_PER_PAGE=1000 — pagination chunk size (default 1000; reduce if PrimeFaces times out)
|
||||
ANAF_DATORNICI_LOG — log path (default stderr)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date
|
||||
from typing import Any
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Logging
|
||||
|
||||
# Optional log file path; when empty, only stderr is used.
LOG_FILE = os.environ.get("ANAF_DATORNICI_LOG", "")
_handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
if LOG_FILE:
    try:
        # UTF-8 so non-ASCII characters in log messages are always writable.
        _handlers.append(logging.FileHandler(LOG_FILE, encoding="utf-8"))
    except OSError as exc:
        # Logging is not configured yet, so report directly on stderr and
        # carry on with stderr-only logging instead of failing silently.
        print(
            f"cannot open log file {LOG_FILE!r}: {exc}; logging to stderr only",
            file=sys.stderr,
        )
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_handlers,
)
log = logging.getLogger("anaf_datornici")
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Constants
|
||||
|
||||
# ANAF endpoints, all under the same /restante base path.
BASE = "https://www.anaf.ro/restante"
INDEX_PAGE = BASE + "/"
INDEX_FORM = BASE + "/index.xhtml"
KAPTCHA_URL = BASE + "/kaptcha.jpg"

# Browser-like User-Agent sent with every ANAF request.
USER_AGENT = " ".join((
    "Mozilla/5.0 (X11; Linux x86_64)",
    "AppleWebKit/537.36",
    "(KHTML, like Gecko)",
    "Chrome/120.0",
    "Safari/537.36",
))

# Per-request timeout in seconds.
TIMEOUT = 60

# 2captcha endpoints and polling policy.
TWOCAPTCHA_IN = "https://2captcha.com/in.php"
TWOCAPTCHA_RES = "https://2captcha.com/res.php"
TWOCAPTCHA_POLL_INTERVAL = 5   # seconds between polls
TWOCAPTCHA_MAX_POLL = 36       # 36 polls * 5s = 180s
TWOCAPTCHA_MAX_ATTEMPTS = 3    # captcha solve retries on wrong-text
TWOCAPTCHA_REPORT = True       # report bad solves for credit refund

# Pagination chunk size, overridable through the ROWS_PER_PAGE env var.
DEFAULT_ROWS_PER_PAGE = int(os.getenv("ROWS_PER_PAGE", "1000"))
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Quarter math
|
||||
|
||||
def parse_publication_date(html: str) -> tuple[date, str]:
    """Extract the publication date from the page and derive its quarter label.

    Looks for ``data de DD.MM.YYYY`` (case-insensitive) in ``html``.

    Returns:
        ``(publication_date, period_label)`` where the label is ``"T<q> <year>"``
        and ``q`` is the calendar quarter of the date's month.

    Raises:
        RuntimeError: when no date is found, or when the matched digits do not
            form a valid calendar date. Both failures use the same exception
            type, so a caller catching ``RuntimeError`` handles either.
    """
    m = re.search(r"data\s+de\s+(\d{2})\.(\d{2})\.(\d{4})", html, re.IGNORECASE)
    if not m:
        raise RuntimeError("Cannot parse publication_date from page HTML")
    day, month, year = (int(part) for part in m.groups())
    try:
        pub_date = date(year, month, day)
    except ValueError as exc:
        # e.g. "31.02.2026" or month 13 — matched the pattern but is not a date.
        raise RuntimeError(
            f"Invalid publication_date {m.group(0)!r} in page HTML"
        ) from exc
    # Map publication_date → quarter label (months 1-3 = T1, ..., 10-12 = T4).
    quarter = (month - 1) // 3 + 1
    return pub_date, f"T{quarter} {year}"
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 2captcha image solver
|
||||
|
||||
def solve_kaptcha(api_key: str, image: bytes, *, attempt: int = 1) -> tuple[str, str]:
    """Submit the captcha image to 2captcha and poll for the text answer.

    ``attempt`` is used only in the log line. Returns ``(token, captcha_id)``.
    Raises ``RuntimeError`` when the submission is rejected, when polling
    returns an error, or when no answer arrives within
    ``TWOCAPTCHA_MAX_POLL`` polls.
    """
    submit_form = {
        "key": api_key,
        "method": "base64",
        "body": base64.b64encode(image).decode(),
        "json": "1",
        "numeric": "0",   # any chars
        "min_len": "4",
        "max_len": "8",
        "language": "2",  # any language
        "regsense": "1",  # case-sensitive
    }
    submission = requests.post(TWOCAPTCHA_IN, data=submit_form, timeout=TIMEOUT)
    submission.raise_for_status()
    ack = submission.json()
    if ack.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {ack}")
    captcha_id = ack["request"]
    log.info(f"2captcha attempt {attempt}: id={captcha_id}, polling…")

    poll_query = {"key": api_key, "action": "get", "id": captcha_id, "json": "1"}
    # Both spellings of the "not ready" marker are accepted.
    pending_markers = ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY")
    polls_left = TWOCAPTCHA_MAX_POLL
    while polls_left > 0:
        polls_left -= 1
        time.sleep(TWOCAPTCHA_POLL_INTERVAL)
        answer = requests.get(TWOCAPTCHA_RES, params=poll_query, timeout=TIMEOUT).json()
        if answer.get("status") == 1:
            return answer["request"], captcha_id
        if answer.get("request") not in pending_markers:
            raise RuntimeError(f"2captcha res.php error: {answer}")
    raise RuntimeError(f"2captcha timeout after {TWOCAPTCHA_MAX_POLL*TWOCAPTCHA_POLL_INTERVAL}s")
|
||||
|
||||
|
||||
def report_bad_solve(api_key: str, cid: str) -> None:
    """Report a wrong solve to 2captcha for credit refund (best-effort).

    Network failures never propagate — a failed report must not abort the
    scrape — but they are logged so missed refunds stay visible.
    """
    try:
        requests.get(
            TWOCAPTCHA_RES,
            params={"key": api_key, "action": "reportbad", "id": cid},
            timeout=TIMEOUT,
        )
    except requests.RequestException as exc:
        log.warning(f"2captcha reportbad failed for id={cid}: {exc}")
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Session / pagination
|
||||
|
||||
@dataclass
class AnafSession:
    """HTTP session state for one scrape of the ANAF /restante/ JSF form.

    Holds the ``requests`` session, the current ``javax.faces.ViewState``
    token (refreshed whenever a response carries a new one), and the
    publication metadata parsed from the first results page.
    """

    api_key: str
    s: requests.Session = field(default_factory=requests.Session)
    viewstate: str = ""
    publication_date: date | None = None
    period_label: str = ""
    total_records: int = 0

    def __post_init__(self):
        # Every request of this session carries the browser-like User-Agent.
        self.s.headers["User-Agent"] = USER_AGENT

    def bootstrap(self) -> None:
        """Fetch the landing page and store its ViewState on the instance.

        Raises ``RuntimeError`` when the page has no ViewState field.
        """
        log.info(f"GET {INDEX_PAGE}")
        landing = self.s.get(INDEX_PAGE, timeout=TIMEOUT)
        landing.raise_for_status()
        found = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', landing.text)
        if found is None:
            raise RuntimeError("No ViewState in initial page")
        self.viewstate = found.group(1)
        log.info(f"viewstate fetched ({len(self.viewstate)} chars)")

    def get_kaptcha(self) -> bytes:
        """Download the captcha image in this session.

        Raises ``RuntimeError`` when the payload lacks the JPEG magic bytes.
        """
        response = self.s.get(
            KAPTCHA_URL,
            headers={"Referer": INDEX_PAGE},
            timeout=TIMEOUT,
        )
        response.raise_for_status()
        payload = response.content
        if not payload.startswith(b"\xff\xd8\xff"):
            raise RuntimeError("kaptcha response not JPEG")
        return payload

    def submit_initial(self, captcha_text: str, rows_per_page: int) -> str:
        """POST the search form with the captcha and return the response HTML.

        ``rows_per_page`` appears only in the log line; it is not part of the
        submitted form. Side effects: refreshes ``viewstate`` when the
        response carries one, and sets ``publication_date``, ``period_label``
        and (when a paginator label is present) ``total_records``.

        Raises ``CaptchaWrong`` when the response contains the captcha error
        wording.
        """
        log.info(f"POST {INDEX_FORM} (captcha={captcha_text!r}, rows={rows_per_page})")
        form_fields = {
            "form": "form",
            "form:inputc": captcha_text,
            "form:searchdata": "",
            "form:submit": "",
            "form_SUBMIT": "1",
            "javax.faces.ViewState": self.viewstate,
        }
        request_headers = {
            "Referer": INDEX_PAGE,
            "Origin": "https://www.anaf.ro",
            "User-Agent": USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "ro,en;q=0.9",
            "Content-Type": "application/x-www-form-urlencoded",
        }
        response = self.s.post(
            INDEX_FORM,
            data=form_fields,
            headers=request_headers,
            timeout=TIMEOUT,
        )
        response.raise_for_status()
        page = response.text

        # Keep the ViewState in sync with the server before anything can raise.
        fresh_state = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', page)
        if fresh_state:
            self.viewstate = fresh_state.group(1)

        # Captcha rejection: exact phrase anywhere, or "incorect" near the top.
        rejected = "Cod de validare gresit" in page or "incorect" in page.lower()[:5000]
        if rejected:
            raise CaptchaWrong(page)

        # Publication date, falling back to today's quarter when unparsable.
        try:
            self.publication_date, self.period_label = parse_publication_date(page)
            log.info(f"publication_date={self.publication_date} period={self.period_label}")
        except RuntimeError:
            log.warning("could not parse publication_date — using today's quarter")
            now = date.today()
            self.publication_date = now
            self.period_label = f"T{(now.month - 1) // 3 + 1} {now.year}"

        # "(X of Y)" paginator label: Y pages times 16 rows per page.
        paginator = re.search(r"\((\d+)\s+of\s+(\d+)\)", page)
        if paginator:
            self.total_records = int(paginator.group(2)) * 16  # pages * rows-per-page-default
            log.info(f"total_records estimate (from paginator): ~{self.total_records}")
        return page

    def fetch_page(self, first: int, rows_per_page: int) -> str:
        """Request one further page via the DataTable AJAX pagination POST.

        ``first`` is sent as ``form:dataTable_first`` and ``rows_per_page`` as
        ``form:dataTable_rows``. Returns the partial-response body and
        refreshes ``viewstate`` when the response carries an update for it.
        """
        ajax_fields = {
            "javax.faces.partial.ajax": "true",
            "javax.faces.source": "form:dataTable",
            "javax.faces.partial.execute": "form:dataTable",
            "javax.faces.partial.render": "form:dataTable",
            "form:dataTable": "form:dataTable",
            "form:dataTable_pagination": "true",
            "form:dataTable_first": str(first),
            "form:dataTable_rows": str(rows_per_page),
            "form:dataTable_encodeFeature": "true",
            "form": "form",
            "form:inputc": "",
            "form:searchdata": "",
            "javax.faces.ViewState": self.viewstate,
        }
        ajax_headers = {
            "Referer": INDEX_FORM,
            "Origin": "https://www.anaf.ro",
            "User-Agent": USER_AGENT,
            "Accept": "application/xml,text/xml,*/*;q=0.01",
            "Accept-Language": "ro,en;q=0.9",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            "Faces-Request": "partial/ajax",
        }
        response = self.s.post(
            INDEX_FORM,
            data=ajax_fields,
            headers=ajax_headers,
            timeout=TIMEOUT,
        )
        response.raise_for_status()
        partial = response.text

        # The partial response may ship a new ViewState inside an <update> CDATA.
        fresh_state = re.search(
            r'<update id="[^"]*javax\.faces\.ViewState[^"]*"><!\[CDATA\[([^\]]+)\]\]>',
            partial,
        )
        if fresh_state:
            self.viewstate = fresh_state.group(1)
        return partial
|
||||
|
||||
|
||||
class CaptchaWrong(Exception):
    """Signals that the submitted captcha text was rejected by the ANAF form."""
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Row parsing
|
||||
|
||||
def parse_rows(html_or_partial: str) -> list[dict[str, Any]]:
    """Extract debtor rows from initial HTML or an AJAX partial response.

    Every ``<tr ... data-ri="N">`` element with at least 24 ``<td>`` cells
    becomes one dict; rows with fewer cells are skipped.

    Row layout (24 cells):
        0: nr_crt
        1: name (denumire debitor)
        2: CIF (cui)
        3-6: totals {state, social, unemployment, health}
        7-10: state {principal, accesorii, necontestate, contestate}
        11-14: social {principal, accesorii, necontestate, contestate}
        15-18: unemployment {principal, accesorii, necontestate, contestate}
        19-22: health {principal, accesorii, necontestate, contestate}
        23: observation/status

    Text cells have tags stripped, HTML entities decoded (``&amp;`` → ``&``)
    and whitespace collapsed. Numeric cells are read with ``.`` as thousands
    separator and ``,`` as decimal separator; unparsable or empty cells
    yield ``0.0``.
    """
    # Function-scope import keeps this block self-contained.
    from html import unescape

    def _txt(cell: str) -> str:
        # Strip tags first, then decode entities, so an escaped "&lt;b&gt;"
        # survives as literal text instead of being removed as a tag.
        without_tags = re.sub(r"<[^>]+>", "", cell)
        return re.sub(r"\s+", " ", unescape(without_tags)).strip()

    def _num(cell: str) -> float:
        normalized = _txt(cell).replace(".", "").replace(",", ".")
        try:
            return float(normalized)
        except ValueError:
            return 0.0

    # Keys for cells 3..22, in cell order.
    numeric_keys = (
        "budget_state_total", "budget_social_total",
        "budget_unemployment_total", "budget_health_total",
        "state_principal", "state_penalty", "state_necontestate", "state_contestate",
        "social_principal", "social_penalty", "social_necontestate", "social_contestate",
        "unemp_principal", "unemp_penalty", "unemp_necontestate", "unemp_contestate",
        "health_principal", "health_penalty", "health_necontestate", "health_contestate",
    )

    rows: list[dict[str, Any]] = []
    for tr_m in re.finditer(
        r'<tr\b[^>]*data-ri="(\d+)"[^>]*>(.*?)</tr>',
        html_or_partial, re.DOTALL,
    ):
        cells = re.findall(r"<td\b[^>]*>(.*?)</td>", tr_m.group(2), re.DOTALL)
        if len(cells) < 24:
            continue
        row: dict[str, Any] = {
            "nr_crt": _txt(cells[0]),
            "name": _txt(cells[1]),
            "cui": _txt(cells[2]),
        }
        for key, cell in zip(numeric_keys, cells[3:23]):
            row[key] = _num(cell)
        row["observation"] = _txt(cells[23])
        rows.append(row)
    return rows
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# DB UPSERT
|
||||
|
||||
def upsert_rows(
    conn,
    rows: list[dict[str, Any]],
    publication_date: date,
    period_label: str,
    debtor_category: str = "persoane_juridice",
) -> int:
    """UPSERT parsed debtor rows into ``anaf.datornici`` and commit.

    Args:
        conn: open psycopg2 connection; committed on success, rolled back
            (and the error re-raised) on failure.
        rows: dicts with the keys this function reads (``cui``, ``name``, the
            four ``budget_*_total`` values and the per-budget principal /
            penalty / contestate values).
        publication_date: snapshot date; with ``cui`` it is the conflict key.
        period_label: quarter label stored with each row.
        debtor_category: category stored with each row.

    Returns:
        Number of distinct rows sent to the database. Rows whose CUI is empty
        after normalization are dropped. When the same CUI occurs more than
        once, the last occurrence wins.
    """
    if not rows:
        return 0
    source_url = INDEX_PAGE

    def _clean_cui(raw: str) -> str:
        cui = raw.replace(" ", "").upper()
        # Remove only the literal "RO" prefix; str.lstrip("RO") would strip
        # any leading run of the characters 'R' and 'O'.
        return cui[2:] if cui.startswith("RO") else cui

    # Keyed by CUI: PostgreSQL rejects an INSERT ... ON CONFLICT DO UPDATE
    # statement that would touch the same conflict key twice, so duplicates
    # must be collapsed before batching.
    by_cui: dict[str, tuple] = {}
    for r in rows:
        cui = _clean_cui(r["cui"])
        if not cui:
            continue
        by_cui[cui] = (
            cui,
            r["name"],
            None,  # judet not provided by ANAF /restante/
            publication_date,
            period_label,
            debtor_category,
            # debt_total = sum of 4 category totals
            r["budget_state_total"] + r["budget_social_total"]
            + r["budget_unemployment_total"] + r["budget_health_total"],
            # principal across categories
            r["state_principal"] + r["social_principal"]
            + r["unemp_principal"] + r["health_principal"],
            # penalty across categories
            r["state_penalty"] + r["social_penalty"]
            + r["unemp_penalty"] + r["health_penalty"],
            # contestate across categories
            r["state_contestate"] + r["social_contestate"]
            + r["unemp_contestate"] + r["health_contestate"],
            # per-budget detail (12 columns)
            r["state_principal"], r["state_penalty"], r["state_contestate"],
            r["social_principal"], r["social_penalty"], r["social_contestate"],
            r["unemp_principal"], r["unemp_penalty"], r["unemp_contestate"],
            r["health_principal"], r["health_penalty"], r["health_contestate"],
            source_url,
        )
    payload = list(by_cui.values())
    if not payload:
        return 0

    sql = """
        INSERT INTO anaf.datornici (
            cui, name, judet, publication_date, period_label, debtor_category,
            debt_total, debt_principal, debt_penalty, debt_contested,
            budget_state_principal, budget_state_penalty, budget_state_contested,
            budget_social_principal, budget_social_penalty, budget_social_contested,
            budget_unemployment_principal, budget_unemployment_penalty, budget_unemployment_contested,
            budget_health_principal, budget_health_penalty, budget_health_contested,
            source_url
        ) VALUES %s
        ON CONFLICT (cui, publication_date)
        DO UPDATE SET
            name = EXCLUDED.name,
            debt_total = EXCLUDED.debt_total,
            debt_principal = EXCLUDED.debt_principal,
            debt_penalty = EXCLUDED.debt_penalty,
            debt_contested = EXCLUDED.debt_contested,
            budget_state_principal = EXCLUDED.budget_state_principal,
            budget_state_penalty = EXCLUDED.budget_state_penalty,
            budget_state_contested = EXCLUDED.budget_state_contested,
            budget_social_principal = EXCLUDED.budget_social_principal,
            budget_social_penalty = EXCLUDED.budget_social_penalty,
            budget_social_contested = EXCLUDED.budget_social_contested,
            budget_unemployment_principal = EXCLUDED.budget_unemployment_principal,
            budget_unemployment_penalty = EXCLUDED.budget_unemployment_penalty,
            budget_unemployment_contested = EXCLUDED.budget_unemployment_contested,
            budget_health_principal = EXCLUDED.budget_health_principal,
            budget_health_penalty = EXCLUDED.budget_health_penalty,
            budget_health_contested = EXCLUDED.budget_health_contested,
            fetched_at = now()
    """
    try:
        with conn.cursor() as cur:
            psycopg2.extras.execute_values(cur, sql, payload, page_size=500)
        conn.commit()
    except Exception:
        # Leave the connection usable for the caller, then surface the error.
        conn.rollback()
        raise
    return len(payload)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Orchestration
|
||||
|
||||
def run(*, dry_run: bool, rows_per_page: int) -> dict[str, int]:
    """Run one scrape: solve the captcha, page through results, UPSERT.

    Args:
        dry_run: when true, only fetch the landing page to validate
            connectivity; no captcha is solved and no DB connection is opened.
        rows_per_page: page size requested for the AJAX pagination calls.

    Returns:
        ``{"datornici_inserted": <rows upserted>, "errors": <count>}``.
        ``errors`` is 1 when pagination aborted early; rows collected up to
        that point are still upserted.

    Raises:
        RuntimeError: on missing ``TWOCAPTCHA_KEY`` / ``DATABASE_URL``, when
            every captcha attempt is rejected, or when no publication date
            was captured.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY", "")
    if not api_key and not dry_run:
        raise RuntimeError("Missing TWOCAPTCHA_KEY env var — see HANDOFF doc")

    if dry_run:
        log.info("DRY_RUN=1 — connecting only to validate config, no captcha solve")
        sess = AnafSession(api_key="")
        sess.bootstrap()
        log.info(f"bootstrap OK, viewstate captured ({len(sess.viewstate)} chars)")
        log.info(f"would solve 1 captcha (~$0.001 worst case) then paginate {rows_per_page} rows/page")
        return {"datornici_inserted": 0, "errors": 0}

    db_url = os.environ.get("DATABASE_URL", "")
    if not db_url:
        raise RuntimeError("Missing DATABASE_URL env var")
    # Strip the Prisma-style schema= query parameter before connecting.
    db_url = re.sub(r"[?&]schema=[^&]*", "", db_url)
    db_url = re.sub(r"\?$", "", db_url)
    conn = psycopg2.connect(db_url)
    conn.autocommit = False

    errors = 0
    try:
        sess = AnafSession(api_key=api_key)

        # Captcha solve with retries (wrong-text bounce). Each attempt starts
        # from a fresh landing page and a fresh captcha image.
        for attempt in range(1, TWOCAPTCHA_MAX_ATTEMPTS + 1):
            sess.bootstrap()
            image = sess.get_kaptcha()
            token, cid = solve_kaptcha(api_key, image, attempt=attempt)
            try:
                initial_html = sess.submit_initial(token, rows_per_page)
            except CaptchaWrong:
                log.warning(f"captcha rejected by ANAF on attempt {attempt}, retrying")
                if TWOCAPTCHA_REPORT and cid:
                    report_bad_solve(api_key, cid)
                continue
            log.info(f"captcha accepted on attempt {attempt}")
            break
        else:
            # Loop finished without a break: every attempt was rejected.
            raise RuntimeError("captcha solve failed after retries")

        # Initial page rows
        all_rows = parse_rows(initial_html)
        log.info(f"page 1: {len(all_rows)} rows")

        # Paginate until a page yields no rows; the paginator-based total is
        # only an estimate, so it is not used as the stop condition.
        first = len(all_rows)
        page_num = 2
        while True:
            try:
                partial = sess.fetch_page(first=first, rows_per_page=rows_per_page)
                new_rows = parse_rows(partial)
            except Exception as e:
                # Keep what was collected so far, but record the failure so
                # the caller can tell a partial snapshot from a complete one.
                log.error(f"pagination error at page {page_num}: {e}")
                errors += 1
                break
            if not new_rows:
                log.info(f"pagination exhausted at first={first}")
                break
            all_rows.extend(new_rows)
            log.info(f"page {page_num}: {len(new_rows)} rows (running total: {len(all_rows)})")
            first += len(new_rows)
            page_num += 1

        log.info(f"total rows collected: {len(all_rows)}")
        if not sess.publication_date:
            raise RuntimeError("No publication_date captured")

        inserted = upsert_rows(
            conn, all_rows,
            publication_date=sess.publication_date,
            period_label=sess.period_label,
        )
        log.info(f"upserted {inserted} rows into anaf.datornici for {sess.period_label}")
    finally:
        # Close the connection on success and on every failure path.
        conn.close()

    return {"datornici_inserted": inserted, "errors": errors}
|
||||
|
||||
|
||||
def main():
    """Entry point: read DRY_RUN / ROWS_PER_PAGE from the environment and run.

    Any exception raised by ``run`` is logged with its traceback and turns
    into exit status 1.
    """
    env = os.environ
    dry_run = env.get("DRY_RUN", "0") == "1"
    page_size_raw = env.get("ROWS_PER_PAGE", str(DEFAULT_ROWS_PER_PAGE))
    rows_per_page = int(page_size_raw)
    log.info(f"=== ANAF datornici scrape: dry_run={dry_run} rows_per_page={rows_per_page} ===")

    try:
        outcome = run(dry_run=dry_run, rows_per_page=rows_per_page)
    except Exception as exc:
        log.error(f"FATAL: {exc}", exc_info=True)
        sys.exit(1)
    log.info(f"DONE {outcome}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user