""" ANAF /restante/ probe — discovers actual mechanism. Steps: 1. GET /restante/ → extract javax.faces.ViewState, session cookie 2. GET kaptcha.jpg (same session) 3. POST kaptcha image to 2captcha → get text solution 4. POST /restante/index.xhtml with captcha + form fields → get response 5. Print: response HTML structure, table shape, pagination markers, quarter selector evidence Used ONCE to understand the page before committing to a full scraper rewrite. Spends ~$0.001 of 2captcha credit. """ import base64 import os import re import sys import time import requests BASE = "https://www.anaf.ro/restante" INDEX_URL = f"{BASE}/index.xhtml" KAPTCHA_URL = f"{BASE}/kaptcha.jpg" USER_AGENT = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120.0 Safari/537.36" ) TIMEOUT = 30 TWOCAPTCHA_IN = "https://2captcha.com/in.php" TWOCAPTCHA_RES = "https://2captcha.com/res.php" def log(msg: str) -> None: print(f"[probe] {msg}", file=sys.stderr, flush=True) def get_initial(session: requests.Session) -> tuple[str, str]: """Fetch /restante/ page, return (html, viewstate).""" log(f"GET {BASE}/") r = session.get(f"{BASE}/", timeout=TIMEOUT) r.raise_for_status() html = r.text m = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', html) if not m: raise RuntimeError("No ViewState found") viewstate = m.group(1) log(f"viewstate={viewstate[:24]}…") log(f"cookies after GET: {list(session.cookies.keys())}") return html, viewstate def get_kaptcha(session: requests.Session) -> bytes: log(f"GET {KAPTCHA_URL}") r = session.get(KAPTCHA_URL, timeout=TIMEOUT, headers={"Referer": f"{BASE}/"}) r.raise_for_status() if not r.content.startswith(b"\xff\xd8\xff"): log(f"WARN: kaptcha response not JPEG (first bytes: {r.content[:10]!r})") log(f"kaptcha bytes: {len(r.content)} (jpg)") return r.content def solve_kaptcha(api_key: str, image: bytes) -> str: """Submit image to 2captcha, poll for solution.""" b64 = base64.b64encode(image).decode() log("POST 2captcha in.php with image…") r = requests.post( TWOCAPTCHA_IN, data={ "key": api_key, "method": "base64", "body": b64, "json": "1", # Hint to 2captcha workers: this is short alphanumeric (kaptcha # default is 5-6 chars, mixed letter+digit, anti-aliased). "numeric": "0", # 0 = any chars allowed "min_len": "4", "max_len": "8", "language": "2", # 2 = any language (alphanumeric) "regsense": "1", # case-sensitive ON }, timeout=TIMEOUT, ) r.raise_for_status() j = r.json() if j.get("status") != 1: raise RuntimeError(f"2captcha in.php error: {j}") cid = j["request"] log(f"2captcha job id={cid}, polling…") for attempt in range(30): # 30 * 5s = 150s cap time.sleep(5) r = requests.get( TWOCAPTCHA_RES, params={"key": api_key, "action": "get", "id": cid, "json": "1"}, timeout=TIMEOUT, ) j = r.json() if j.get("status") == 1: token = j["request"] log(f"2captcha solved: {token!r}") return token if j.get("request") in ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY"): log(f" poll {attempt+1}: not ready") continue raise RuntimeError(f"2captcha res.php error: {j}") raise RuntimeError("2captcha timeout 150s") def post_search(session: requests.Session, viewstate: str, captcha: str, search: str = "") -> requests.Response: """POST the form. Empty search = list all (best-case, hopefully bulk).""" log(f"POST {INDEX_URL} captcha={captcha!r} search={search!r}") r = session.post( INDEX_URL, data={ "form": "form", "form:inputc": captcha, "form:searchdata": search, "form:submit": "", # button submit "form_SUBMIT": "1", "javax.faces.ViewState": viewstate, }, headers={ "Referer": f"{BASE}/", "Origin": "https://www.anaf.ro", "User-Agent": USER_AGENT, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "ro,en;q=0.9", "Content-Type": "application/x-www-form-urlencoded", }, timeout=TIMEOUT, ) log(f"POST status={r.status_code} bytes={len(r.text)} content-type={r.headers.get('content-type')}") return r def analyze_response(html: str) -> None: """Look for key signals in the response.""" log("=" * 70) log("RESPONSE ANALYSIS:") # Captcha error? if "incorect" in html.lower() or "invalid" in html.lower() or "gresit" in html.lower(): for m in re.finditer(r".{40}(?:incorect|invalid|gresit).{80}", html, re.IGNORECASE): log(f" ERR phrase: {m.group(0)!r}") # Table presence? tbls = re.findall(r"]*>", html, re.IGNORECASE) log(f" count: {len(tbls)}") # Row count in datatable? trs = re.findall(r"]*>", html, re.IGNORECASE) log(f" count: {len(trs)}") # PrimeFaces datatable markers? if "ui-datatable" in html: log(" PrimeFaces DataTable detected") # rows per page hint? m = re.search(r'rows="?(\d+)"?', html) if m: log(f" rows attr: {m.group(1)}") # Pagination evidence? if "ui-paginator" in html or "paginator" in html.lower(): log(" Pagination control present") # CUI/CIF column? cuis = re.findall(r"\b\d{6,10}\b", html) log(f" numeric strings 6-10 digits: {len(cuis)} (possible CUIs)") if cuis: log(f" samples: {cuis[:10]}") # Total count somewhere? for m in re.finditer(r"(?:total|înregistrări|inregistrari|rezultate)[^<>]{0,60}", html, re.IGNORECASE): log(f" total phrase: {m.group(0)!r}") # Quarter / publication date references? for m in re.finditer(r"(?:trim|trimestru|publicat)[^<>]{0,80}", html, re.IGNORECASE): log(f" date phrase: {m.group(0)!r}") # Export buttons (CSV/XLSX)? for m in re.finditer(r"(?:export|descarc|csv|xls)[^<>]{0,40}", html, re.IGNORECASE): log(f" export phrase: {m.group(0)!r}") # First 200 chars of body body = re.search(r"]*>(.*?)", html, re.DOTALL | re.IGNORECASE) if body: text = re.sub(r"<[^>]+>", " ", body.group(1)) text = re.sub(r"\s+", " ", text).strip() log(f" body text preview: {text[:500]!r}") def main(): api_key = os.environ.get("TWOCAPTCHA_KEY") if not api_key: print("Missing TWOCAPTCHA_KEY env var", file=sys.stderr) sys.exit(1) s = requests.Session() s.headers.update({"User-Agent": USER_AGENT}) html_initial, viewstate = get_initial(s) image = get_kaptcha(s) # Save image locally for debugging with open("/tmp/probe_kaptcha.jpg", "wb") as f: f.write(image) log("kaptcha image saved /tmp/probe_kaptcha.jpg") captcha_text = solve_kaptcha(api_key, image) r = post_search(s, viewstate, captcha_text, search="") with open("/tmp/probe_response.html", "w") as f: f.write(r.text) log("response saved /tmp/probe_response.html") analyze_response(r.text) if __name__ == "__main__": main()