initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Claude VM
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
+19
View File
@@ -0,0 +1,19 @@
FROM python:3.12-slim
RUN apt-get update && apt-get install -y --no-install-recommends \
curl ca-certificates && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt
COPY wsp/ /app/wsp/
COPY credentials/50076FB3826FADA540ACFB19.p12 /app/credentials/
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
# Default: run incremental for all feeds. Override with `command` in compose.
CMD ["python", "-m", "wsp.runner", "incremental", "all"]
+102
View File
@@ -0,0 +1,102 @@
"""
In-memory mTLS certificate loader for SEAP WSP.
Decrypts the .p12 in memory using SEAP_CERT_KEY env var, builds an
ssl.SSLContext, and exposes it via a requests.HTTPAdapter. The private
key never touches disk.
"""
from __future__ import annotations
import os
import ssl
import tempfile
from pathlib import Path
import requests
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.serialization import pkcs12
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
def load_p12_to_ssl_context(p12_path: str | Path, password: str) -> ssl.SSLContext:
"""Load PKCS#12 in memory and return an SSLContext with client cert + key.
The PEMs are written to a temp file briefly because OpenSSL/SSLContext
requires a file path. We use mkstemp on a tmpfs-backed dir when
available (/dev/shm on Linux) and unlink immediately after load —
so the data lives only as an ephemeral inode held by SSLContext.
"""
p12_data = Path(p12_path).read_bytes()
private_key, cert, additional = pkcs12.load_key_and_certificates(
p12_data, password.encode()
)
if private_key is None or cert is None:
raise RuntimeError('p12 missing private key or certificate')
key_pem = private_key.private_bytes(
encoding=serialization.Encoding.PEM,
format=serialization.PrivateFormat.PKCS8,
encryption_algorithm=serialization.NoEncryption(),
)
cert_pem = cert.public_bytes(serialization.Encoding.PEM)
if additional:
for ca in additional:
cert_pem += ca.public_bytes(serialization.Encoding.PEM)
# Use /dev/shm if available (RAM-backed); fallback to /tmp
shm = Path('/dev/shm')
tmp_dir = str(shm) if shm.is_dir() and os.access(shm, os.W_OK) else None
fd, path = tempfile.mkstemp(prefix='wsp_chain_', suffix='.pem', dir=tmp_dir)
try:
os.write(fd, cert_pem + b'\n' + key_pem)
os.close(fd)
os.chmod(path, 0o600)
ctx = ssl.create_default_context()
ctx.load_cert_chain(certfile=path)
return ctx
finally:
# Unlink immediately — kernel keeps the file alive until SSLContext closes
try:
os.unlink(path)
except OSError:
pass
class _SSLContextAdapter(HTTPAdapter):
"""Requests adapter that uses a pre-built SSLContext (with client cert)."""
def __init__(self, ssl_context: ssl.SSLContext, **kwargs):
self._ssl_context = ssl_context
super().__init__(**kwargs)
def init_poolmanager(self, *args, **kwargs):
kwargs['ssl_context'] = self._ssl_context
return super().init_poolmanager(*args, **kwargs)
def proxy_manager_for(self, *args, **kwargs):
kwargs['ssl_context'] = self._ssl_context
return super().proxy_manager_for(*args, **kwargs)
def make_mtls_session(p12_path: str | Path, password: str) -> requests.Session:
"""Build a requests.Session with mTLS via in-memory cert."""
ctx = load_p12_to_ssl_context(p12_path, password)
session = requests.Session()
adapter = _SSLContextAdapter(ssl_context=ctx)
session.mount('https://', adapter)
return session
def make_mtls_session_from_env() -> requests.Session:
"""Convenience: read p12 path + password from env."""
p12 = os.environ.get(
'SEAP_P12_PATH',
str(Path(__file__).parent.parent / 'credentials' / '50076FB3826FADA540ACFB19.p12'),
)
pwd = os.environ.get('SEAP_CERT_KEY')
if not pwd:
raise RuntimeError('SEAP_CERT_KEY not in env')
return make_mtls_session(p12, pwd)
+224
View File
@@ -0,0 +1,224 @@
"""
WSP SOAP client — envelope construction + HTTPS+mTLS POST + response parsing.
Critical, non-negotiable rules (validated empirically against e-licitatie.ro:8883):
1. Endpoint URL is case-sensitive: /Pub (capital P).
2. Field children of <tem:request> MUST be in alphabetic order (WCF DataContract).
3. SeapUserCredentials xmlns="http://tempuri.org" (NO trailing slash).
4. SOAPAction header must be quoted: "http://tempuri.org/{contract}/{op}".
5. None values must be encoded as <sic:Field i:nil="true"/>, not omitted —
omitting an alphabetic-order intermediate field causes WCF to silently
null-out subsequent fields (validation cascades).
"""
from __future__ import annotations
import os
import time
from dataclasses import dataclass
from typing import Any, Optional
import requests
from lxml import etree
from .operations import WspOp, PROD_URL, NS_INTEG
NS_TEM = 'http://tempuri.org' # NOTE: no trailing slash for SeapUserCredentials
NS_TEM_BODY = 'http://tempuri.org/' # WITH slash for tem:Op body
NS_XSI = 'http://www.w3.org/2001/XMLSchema-instance'
NS_SOAP = 'http://schemas.xmlsoap.org/soap/envelope/'
# Response namespaces for status extraction
NS_INTEG_BRACED = '{%s}' % NS_INTEG
def _envelope(op: WspOp, fields: dict[str, Any], username: str, password: str) -> bytes:
"""Build a WCF-compatible SOAP envelope.
Children of <tem:request> are emitted in sorted (alphabetic) order.
None values become <sic:F i:nil="true"/>. Other values are str-converted.
"""
parts: list[str] = []
for name in sorted(fields.keys()):
val = fields[name]
if val is None:
parts.append(f' <sic:{name} i:nil="true"/>')
else:
parts.append(f' <sic:{name}>{_xml_escape(str(val))}</sic:{name}>')
body = f'''<soapenv:Envelope xmlns:soapenv="{NS_SOAP}" xmlns:tem="{NS_TEM_BODY}" xmlns:sic="{op.request_ns}" xmlns:i="{NS_XSI}">
<soapenv:Header>
<SeapUserCredentials xmlns="{NS_TEM}" xmlns:i="{NS_XSI}">
<Password xmlns="{NS_INTEG}">{_xml_escape(password)}</Password>
<Username xmlns="{NS_INTEG}">{_xml_escape(username)}</Username>
</SeapUserCredentials>
</soapenv:Header>
<soapenv:Body>
<tem:{op.name}>
<tem:request>
{chr(10).join(parts)}
</tem:request>
</tem:{op.name}>
</soapenv:Body>
</soapenv:Envelope>'''
return body.encode('utf-8')
def _xml_escape(s: str) -> str:
return (s.replace('&', '&amp;').replace('<', '&lt;')
.replace('>', '&gt;').replace('"', '&quot;')
.replace("'", '&apos;'))
@dataclass
class WspResult:
status: str # 'Success' | 'ValidationError' | 'SystemError' | 'AuthFailed' | etc
description: Optional[str]
page_index: int
page_total: int # total ITEM count (not page count). PageSize=100, so pages = ceil(page_total/100)
items_xml: bytes # raw <a:Items>...</a:Items> bytes for parser
raw_envelope: bytes # full response (for debugging / saving)
elapsed_ms: int
items: Optional[list] = None # populated by paginator after parser runs
@property
def success(self) -> bool:
return self.status == 'Success'
page_size: int = 50 # confirmed empirically
@property
def num_pages(self) -> int:
ps = self.page_size or 50
return (self.page_total + ps - 1) // ps if self.page_total > 0 else 0
class WspClient:
"""Thread-safe SOAP client for SEAP WSP. Reuses a single HTTPS session."""
def __init__(self, session: requests.Session, username: str, password: str,
endpoint: str = PROD_URL):
self.session = session
self.username = username
self.password = password
self.endpoint = endpoint
def call(self, op: WspOp, fields: dict[str, Any], timeout: int = 120,
max_retries: int = 3) -> WspResult:
"""Execute a SOAP call with retry on transient errors."""
body = _envelope(op, fields, self.username, self.password)
headers = {
'Content-Type': 'text/xml; charset=utf-8',
'SOAPAction': f'"{op.soap_action}"',
}
last_exc: Optional[Exception] = None
for attempt in range(max_retries):
try:
t0 = time.time()
resp = self.session.post(self.endpoint, data=body,
headers=headers, timeout=timeout)
elapsed_ms = int((time.time() - t0) * 1000)
if resp.status_code in (429, 500, 502, 503, 504):
# 500 includes WCF ActionNotSupported faults — but those
# are deterministic, so don't retry. Differentiate via body.
if b'<faultcode' in resp.content:
# SOAP fault — return as parsed error, no retry
return self._parse_fault(resp.content, elapsed_ms)
# transient — retry with backoff
time.sleep(2 ** attempt)
continue
resp.raise_for_status()
return self._parse_response(resp.content, elapsed_ms)
except (requests.Timeout, requests.ConnectionError) as e:
last_exc = e
time.sleep(2 ** attempt)
raise RuntimeError(f'Exhausted retries on {op.name}: {last_exc}')
def _parse_response(self, content: bytes, elapsed_ms: int) -> WspResult:
"""Extract Status, Description, PageIndex, PageTotal, Items.
SEAP returns MTOM/XOP multipart sometimes — strip wrapping if present.
"""
# Strip MTOM multipart wrapper if present
if content[:4] == b'\r\n--' or content[:2] == b'--':
# Multipart — find the XML part
idx = content.find(b'<s:Envelope')
if idx == -1:
idx = content.find(b'<?xml')
if idx != -1:
end = content.find(b'</s:Envelope>')
if end != -1:
content = content[idx:end + len(b'</s:Envelope>')]
else:
# Sometimes the multipart starts with --uuid... mid-stream
idx = content.find(b'<s:Envelope')
if idx > 0:
content = content[idx:]
end = content.find(b'</s:Envelope>')
if end != -1:
content = content[:end + len(b'</s:Envelope>')]
try:
root = etree.fromstring(content)
except etree.XMLSyntaxError as e:
return WspResult(
status='ParseError', description=str(e),
page_index=0, page_total=0,
items_xml=b'', raw_envelope=content, elapsed_ms=elapsed_ms,
)
status_el = root.find(f'.//{NS_INTEG_BRACED}Status')
desc_el = root.find(f'.//{NS_INTEG_BRACED}Description')
# PageIndex/PageTotal live in a:* namespace (varies per response type)
page_index = 0
page_total = 0
for el in root.iter():
tag = etree.QName(el.tag).localname
if tag == 'PageIndex' and el.text and el.text.isdigit():
page_index = int(el.text)
elif tag == 'PageTotal' and el.text and el.text.isdigit():
page_total = int(el.text)
# Find Items element — varies by namespace
items_el = None
for el in root.iter():
if etree.QName(el.tag).localname == 'Items':
items_el = el
break
items_xml = etree.tostring(items_el) if items_el is not None else b''
return WspResult(
status=status_el.text if status_el is not None else 'UNKNOWN',
description=desc_el.text if desc_el is not None else None,
page_index=page_index,
page_total=page_total,
items_xml=items_xml,
raw_envelope=content,
elapsed_ms=elapsed_ms,
)
def _parse_fault(self, content: bytes, elapsed_ms: int) -> WspResult:
try:
root = etree.fromstring(content)
faultcode = root.find('.//{*}faultcode')
faultstring = root.find('.//{*}faultstring')
status = f'Fault:{faultcode.text}' if faultcode is not None else 'Fault'
desc = faultstring.text if faultstring is not None else None
except Exception:
status = 'Fault:Parse'
desc = content[:200].decode('utf-8', errors='replace')
return WspResult(
status=status, description=desc,
page_index=0, page_total=0,
items_xml=b'', raw_envelope=content, elapsed_ms=elapsed_ms,
)
def make_client_from_env(endpoint: str = PROD_URL) -> WspClient:
"""Build a WspClient using SEAP_USER/SEAP_PASS/SEAP_CERT_KEY env vars."""
from .cert_loader import make_mtls_session_from_env
user = os.environ.get('SEAP_USER')
pwd = os.environ.get('SEAP_PASS')
if not user or not pwd:
raise RuntimeError('SEAP_USER / SEAP_PASS not in env')
session = make_mtls_session_from_env()
return WspClient(session, user, pwd, endpoint=endpoint)
+47
View File
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
#
# Daily WSP incremental sync — runs on satra via systemd timer or cron.
#
# Loads DATABASE_URL from /opt/architools/.env and SEAP secrets from
# Infisical. Runs incremental for all configured public + Beletage feeds.
# Logs go to /var/log/wsp-sync/ (rotated by logrotate).
set -euo pipefail
LOG_DIR=/var/log/wsp-sync
LOG_FILE="$LOG_DIR/wsp-incremental-$(date +%Y%m%d).log"
mkdir -p "$LOG_DIR"
cd /opt/vreaudigital/services/seap-scraper
# Load DB URL (architools_user — same DB as TED imports)
set -a
. /opt/architools/.env
set +a
# Strip ?schema= param (psycopg2 doesn't accept it)
export DATABASE_URL="${DATABASE_URL%%\?*}"
# Load SEAP secrets from Infisical (using the same loader as claude-start.sh)
if [ -f /opt/wsp/.env.seap ]; then
# Production: secrets pre-loaded into a static file readable by service account
set -a
. /opt/wsp/.env.seap
set +a
else
echo "ERROR: /opt/wsp/.env.seap not found — run wsp-secrets-fetch.sh first" >&2
exit 1
fi
# Run incremental for each feed in sequence (low-volume daily = ~5 min total)
for op in SU_CaNotices SU_CNotices SU_PiNotices SU_RfqNotices SU_RfqInvitations \
SU_DCNotices SU_PCNotices SU_RdcNotices SU_ENotices \
SuContracts SuInvoices SuDirectAcquisitions; do
echo "=== $(date -Iseconds)$op ===" | tee -a "$LOG_FILE"
./.venv/bin/python -m wsp.runner incremental "$op" --lookback-hours 36 \
>> "$LOG_FILE" 2>&1 || echo " (failed, continuing)" | tee -a "$LOG_FILE"
done
echo "=== $(date -Iseconds) — Done ===" | tee -a "$LOG_FILE"
# Show status summary at the end (also captured by journalctl if systemd-run)
./.venv/bin/python -m wsp.runner status 2>&1 | tail -30 | tee -a "$LOG_FILE"
+336
View File
@@ -0,0 +1,336 @@
"""DB sinks — idempotent UPSERT for each parsed row type."""
from __future__ import annotations
import json
import logging
import os
from contextlib import contextmanager
from datetime import datetime
from decimal import Decimal
from typing import Any, Iterable
import psycopg2
from psycopg2.extras import Json, execute_values
log = logging.getLogger(__name__)
def get_db_url() -> str:
url = os.environ.get('DATABASE_URL')
if not url:
raise RuntimeError('DATABASE_URL not set in env')
return url
def connect():
return psycopg2.connect(get_db_url())
@contextmanager
def connection():
conn = connect()
try:
yield conn
conn.commit()
except Exception:
conn.rollback()
raise
finally:
conn.close()
def _json_safe(obj):
"""Convert Decimal/datetime/date to JSON-friendly types recursively."""
if isinstance(obj, Decimal):
return str(obj)
if isinstance(obj, datetime):
return obj.isoformat()
if isinstance(obj, dict):
return {k: _json_safe(v) for k, v in obj.items()}
if isinstance(obj, list):
return [_json_safe(v) for v in obj]
return obj
# ── seap.announcements (public feeds) ──
ANNOUNCEMENTS_COLS = [
'type', 'ref_number',
'authority_name', 'authority_cui', 'authority_address',
'authority_email', 'authority_phone', 'authority_url',
'authority_type', 'authority_main_activity', 'authority_entity_id',
'title', 'cpv_code', 'contract_type',
'publication_date', 'estimated_value', 'awarded_value', 'currency',
'supplier_name', 'supplier_cui', 'supplier_address', 'supplier_is_sme',
'procedure_type', 'procedure_state', 'legislation',
'has_lots', 'contract_has_lots', 'lots_count',
'joue', 'county_code', 'notice_state', 'notice_state_id',
'framework_agreement', 'notice_id_internal',
'deadline_submission', 'opening_date', 'duration_months',
'documents', 'award_criteria', 'lots', 'details',
'seap_url', 'source',
]
_JSONB_COLS = {'documents', 'award_criteria', 'lots', 'details'}
def insert_announcements(cur, rows: Iterable[dict]) -> int:
"""Bulk upsert into seap.announcements. Returns count inserted/updated."""
rows = list(rows)
if not rows:
return 0
values = []
for r in rows:
values.append(tuple(
Json(_json_safe(r.get(c))) if c in _JSONB_COLS and r.get(c) is not None
else r.get(c)
for c in ANNOUNCEMENTS_COLS
))
cols_sql = ', '.join(ANNOUNCEMENTS_COLS)
update_cols = [c for c in ANNOUNCEMENTS_COLS if c not in ('type', 'ref_number')]
update_sql = ', '.join(
f'{c} = COALESCE(EXCLUDED.{c}, seap.announcements.{c})' if c not in _JSONB_COLS
else f'{c} = EXCLUDED.{c}'
for c in update_cols
)
sql = f"""
INSERT INTO seap.announcements ({cols_sql})
VALUES %s
ON CONFLICT (type, ref_number) DO UPDATE SET
{update_sql},
enriched_at = now()
"""
execute_values(cur, sql, values, page_size=100)
return len(rows)
# ── Beletage tables ──
BELETAGE_CONTRACTS_COLS = [
'contract_id', 'contract_no', 'contract_title', 'contract_type',
'contract_phase', 'contract_state',
'awarding_date', 'contract_date', 'publication_date',
'duration_months', 'contract_value', 'default_currency_value', 'currency',
'ca_notice_id', 'ca_notice_no', 'authority_name', 'authority_cui',
'is_current_version', 'is_rejected', 'version_no', 'version_date',
'justification', 'additional_information', 'details',
]
def insert_beletage_contracts(cur, rows):
rows = list(rows)
if not rows:
return 0
values = [
tuple(
Json(_json_safe(r.get(c))) if c == 'details' and r.get(c) is not None
else r.get(c)
for c in BELETAGE_CONTRACTS_COLS
)
for r in rows
]
cols_sql = ', '.join(BELETAGE_CONTRACTS_COLS)
update_cols = [c for c in BELETAGE_CONTRACTS_COLS if c != 'contract_id']
update_sql = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
sql = f"""
INSERT INTO seap.beletage_contracts ({cols_sql})
VALUES %s
ON CONFLICT (contract_id) DO UPDATE SET
{update_sql},
enriched_at = now()
"""
execute_values(cur, sql, values, page_size=100)
return len(rows)
BELETAGE_INVOICES_COLS = [
'invoice_id', 'invoice_no', 'invoice_date', 'due_date',
'contract_id', 'contract_no', 'authority_name', 'authority_cui',
'total_value', 'total_value_no_vat', 'vat_value', 'currency',
'state', 'paid_value', 'paid_at', 'details',
]
def insert_beletage_invoices(cur, rows):
rows = list(rows)
if not rows:
return 0
values = [
tuple(
Json(_json_safe(r.get(c))) if c == 'details' and r.get(c) is not None
else r.get(c)
for c in BELETAGE_INVOICES_COLS
)
for r in rows
]
cols_sql = ', '.join(BELETAGE_INVOICES_COLS)
update_cols = [c for c in BELETAGE_INVOICES_COLS if c != 'invoice_id']
update_sql = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
sql = f"""
INSERT INTO seap.beletage_invoices ({cols_sql})
VALUES %s
ON CONFLICT (invoice_id) DO UPDATE SET
{update_sql}
"""
execute_values(cur, sql, values, page_size=100)
return len(rows)
BELETAGE_DA_COLS = [
'da_id', 'da_name', 'unique_identification_code',
'cpv_code', 'cpv_name', 'contract_type',
'publication_date', 'finalization_date',
'estimated_value', 'closing_value', 'currency',
'da_state', 'authority_id', 'authority_name', 'authority_cui',
'details',
]
def insert_beletage_da(cur, rows):
rows = list(rows)
if not rows:
return 0
values = [
tuple(
Json(_json_safe(r.get(c))) if c == 'details' and r.get(c) is not None
else r.get(c)
for c in BELETAGE_DA_COLS
)
for r in rows
]
cols_sql = ', '.join(BELETAGE_DA_COLS)
update_cols = [c for c in BELETAGE_DA_COLS if c != 'da_id']
update_sql = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
sql = f"""
INSERT INTO seap.beletage_direct_acquisitions ({cols_sql})
VALUES %s
ON CONFLICT (da_id) DO UPDATE SET
{update_sql}
"""
execute_values(cur, sql, values, page_size=100)
return len(rows)
BELETAGE_CATALOG_COLS = ['item_code', 'item_name', 'cpv_code', 'unit_price',
'currency', 'last_updated', 'details']
def insert_beletage_catalog(cur, rows):
rows = list(rows)
if not rows:
return 0
values = [
tuple(
Json(_json_safe(r.get(c))) if c == 'details' and r.get(c) is not None
else r.get(c)
for c in BELETAGE_CATALOG_COLS
)
for r in rows
]
cols_sql = ', '.join(BELETAGE_CATALOG_COLS)
update_cols = [c for c in BELETAGE_CATALOG_COLS if c != 'item_code']
update_sql = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
sql = f"""
INSERT INTO seap.beletage_catalog ({cols_sql})
VALUES %s
ON CONFLICT (item_code) DO UPDATE SET {update_sql}
"""
execute_values(cur, sql, values, page_size=100)
return len(rows)
# Sink dispatch
SINK_FUNCS = {
'announcements': insert_announcements,
'beletage_contracts': insert_beletage_contracts,
'beletage_invoices': insert_beletage_invoices,
'beletage_da': insert_beletage_da,
'beletage_catalog': insert_beletage_catalog,
}
# ── Sync state ──
def get_cursor(cur, feed: str) -> dict | None:
cur.execute('SELECT last_cursor_date, items_imported_total, consecutive_errors '
'FROM seap.wsp_sync_state WHERE feed = %s', (feed,))
row = cur.fetchone()
if not row:
return None
return {
'last_cursor_date': row[0],
'items_imported_total': row[1],
'consecutive_errors': row[2],
}
def update_sync_state(cur, feed: str, *, cursor_date=None, window_start=None,
window_end=None, items_added=0, error: str | None = None):
cur.execute("""
INSERT INTO seap.wsp_sync_state (feed, last_run_at, last_cursor_date,
last_window_start, last_window_end, items_imported_total,
items_imported_24h, consecutive_errors, last_error, last_error_at)
VALUES (%s, now(), %s, %s, %s, %s, %s,
CASE WHEN %s IS NULL THEN 0 ELSE 1 END,
%s,
CASE WHEN %s IS NULL THEN NULL ELSE now() END)
ON CONFLICT (feed) DO UPDATE SET
last_run_at = now(),
last_cursor_date = COALESCE(EXCLUDED.last_cursor_date, seap.wsp_sync_state.last_cursor_date),
last_window_start = EXCLUDED.last_window_start,
last_window_end = EXCLUDED.last_window_end,
items_imported_total = seap.wsp_sync_state.items_imported_total + %s,
items_imported_24h = %s,
consecutive_errors = CASE WHEN %s IS NULL THEN 0
ELSE seap.wsp_sync_state.consecutive_errors + 1 END,
last_error = EXCLUDED.last_error,
last_error_at = CASE WHEN %s IS NULL THEN seap.wsp_sync_state.last_error_at
ELSE now() END
""", (feed, cursor_date, window_start, window_end, items_added, items_added,
error, error, error,
items_added, items_added, error, error))
def upsert_backfill_window(cur, feed: str, ws, we, county=None) -> int:
cur.execute("""
INSERT INTO seap.wsp_backfill_windows (feed, window_start, window_end, county_code)
VALUES (%s, %s, %s, %s)
ON CONFLICT (feed, window_start, window_end, county_code) DO NOTHING
RETURNING id
""", (feed, ws, we, county))
row = cur.fetchone()
return row[0] if row else 0
def claim_backfill_window(cur, feed: str) -> dict | None:
"""Atomically claim oldest pending window for processing."""
cur.execute("""
UPDATE seap.wsp_backfill_windows
SET state = 'in_progress', started_at = now(),
attempts = attempts + 1
WHERE id = (
SELECT id FROM seap.wsp_backfill_windows
WHERE feed = %s AND state IN ('pending', 'failed') AND attempts < 5
ORDER BY window_start
LIMIT 1
FOR UPDATE SKIP LOCKED
)
RETURNING id, window_start, window_end, county_code, attempts
""", (feed,))
row = cur.fetchone()
if not row:
return None
return {
'id': row[0], 'window_start': row[1], 'window_end': row[2],
'county_code': row[3], 'attempts': row[4],
}
def complete_backfill_window(cur, win_id: int, items: int, page_total: int,
error: str | None = None):
state = 'failed' if error else 'completed'
cur.execute("""
UPDATE seap.wsp_backfill_windows
SET state = %s, items_imported = %s, page_total = %s,
completed_at = now(), last_error = %s
WHERE id = %s
""", (state, items, page_total, error, win_id))
+107
View File
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""Load EU CPV nomenclature (RO names) into seap.cpv_codes + backfill announcements."""
import json
import os
import sys
import urllib.request
from pathlib import Path
import psycopg2
from psycopg2.extras import execute_values
CPV_URL = 'https://raw.githubusercontent.com/samhallskod/cpv-eu/main/data/cpv.json'
def main():
db = os.environ.get('DATABASE_URL')
if not db:
print('DATABASE_URL not set', file=sys.stderr)
sys.exit(1)
cache = Path('/tmp/cpv.json')
if not cache.exists():
print(f'Downloading {CPV_URL}...')
urllib.request.urlretrieve(CPV_URL, cache)
data = json.loads(cache.read_text())
print(f'Loaded {len(data)} CPV codes from JSON')
rows = []
for entry in data:
code = entry['code'] # 8-digit (no dash)
labels = entry.get('labels', {})
name_ro = labels.get('ro') or labels.get('en') or code
name_en = labels.get('en')
level = entry.get('level', 0)
emoji = entry.get('emoji')
# Division = first 2 digits + 6 zeros
division = code[:2] + '000000'
# Parent = level - 1 — easiest: trim trailing zero pairs based on level
# Level mapping: 1=XX000000, 2=XXX00000, 3=XXXX0000, 4=XXXXX000, 5=XXXXXX00, 6=XXXXXXX0, 7=XXXXXXXX (full)
# For our purposes, parent is one level up — replace last non-zero digit with 0
parent = None
if level > 1:
# find rightmost non-zero, zero it out
digits = list(code)
# Find position of last meaningful digit
for i in range(len(digits) - 1, -1, -1):
if digits[i] != '0':
digits[i] = '0'
break
parent_code = ''.join(digits)
if parent_code != code:
parent = parent_code
# Build full code with check digit (synthesize a placeholder dash + 0)
# The real check digit is per-code — we don't compute it; use the 8-digit form everywhere.
rows.append((code, code, name_ro, name_en, level, division, parent, emoji))
print(f'Inserting {len(rows)} rows into seap.cpv_codes...')
conn = psycopg2.connect(db)
cur = conn.cursor()
cur.execute('TRUNCATE seap.cpv_codes')
execute_values(cur, """
INSERT INTO seap.cpv_codes (code, code_full, name_ro, name_en, level,
division_code, parent_code, emoji)
VALUES %s
""", rows, page_size=500)
conn.commit()
cur.execute('SELECT count(*), count(*) FILTER (WHERE level=1) FROM seap.cpv_codes')
total, divisions = cur.fetchone()
print(f'{total} codes loaded, {divisions} top-level divisions')
# Backfill cpv_division + cpv_name_ro on announcements
print('Backfilling cpv_division + cpv_name_ro on announcements...')
cur.execute("""
UPDATE seap.announcements
SET cpv_division = seap.cpv_division(cpv_code),
cpv_name_ro = seap.cpv_name(cpv_code)
WHERE cpv_code IS NOT NULL
AND (cpv_division IS NULL OR cpv_name_ro IS NULL)
""")
affected = cur.rowcount
conn.commit()
print(f'{affected:,} announcement rows enriched')
# Show top divisions
cur.execute("""
SELECT a.cpv_division, c.name_ro, c.emoji,
count(*) as n,
sum(awarded_value)::numeric(15,0) as total_value
FROM seap.announcements a
LEFT JOIN seap.cpv_codes c ON c.code = a.cpv_division
WHERE a.cpv_division IS NOT NULL AND a.source LIKE 'wsp_%'
GROUP BY 1, 2, 3
ORDER BY 4 DESC
LIMIT 15
""")
print('\nTop 15 CPV divisions in WSP data:')
for r in cur.fetchall():
print(f' {r[2] or " "} {r[0]:<10} {(r[1] or "?")[:45]:<45} {r[3]:>5} contracte {r[4] or 0:>13,} RON')
conn.close()
if __name__ == '__main__':
main()
+202
View File
@@ -0,0 +1,202 @@
"""
WSP operation registry — single source of truth for SOAP call config.
Each operation declares:
- contract: SOAP contract (ISupplierWebService, IPubWebService, etc.)
- request_ns: XML namespace of the Request type
- date_fields: (start_field, end_field) — for window-based ops
- max_window_days: server-imposed max window
- extra_fields: static fields to inject (e.g. defaults)
- response_xpath: XPath to extract Items elements from the response
- parser: dotted module name of parser
- sink: 'announcements' (public) or 'beletage' (own)
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional
NS_INTEG = 'http://schemas.datacontract.org/2004/07/SICAP.Service.Integration'
NS_MODEL_BASE = 'http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model'
NS_MODEL_CONTRACTS = NS_MODEL_BASE + '.Contracts'
# Endpoints
PROD_URL = 'https://e-licitatie.ro:8883/Pub'
DEMO_URL = 'http://demo.e-licitatie.ro:8884/Pub'
@dataclass
class WspOp:
name: str # operation name (matches SOAPAction tail)
contract: str # SOAP contract — 'ISupplierWebService'
request_ns: str = NS_MODEL_BASE
date_start_field: Optional[str] = None
date_end_field: Optional[str] = None
max_window_days: int = 7 # server max
page_size_inferred: int = 50 # confirmed empirically: server returns 50 items/page
items_cap: int = 1000 # if PageTotal == 1000, window may be truncated; auto-split
parser_module: str = '' # 'wsp.parsers.ca_notice'
sink: str = 'announcements' # 'announcements' | 'beletage_contracts' | 'beletage_invoices' | 'beletage_da' | 'beletage_catalog'
source_tag: str = '' # value for seap.announcements.source column
item_xpath: str = '' # XPath under <a:Items> to find each item element
@property
def soap_action(self) -> str:
return f'http://tempuri.org/{self.contract}/{self.name}'
# Public-data feeds (SU_*) — return all-Romania notices, not scoped to Beletage
PUBLIC_OPS: dict[str, WspOp] = {
'SU_CaNotices': WspOp(
name='SU_CaNotices',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=1, # high volume — must split per day
parser_module='wsp.parsers.ca_notice',
source_tag='wsp_canotice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.CANotice}CaNoticeBase',
),
'SU_CNotices': WspOp(
name='SU_CNotices',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=7,
parser_module='wsp.parsers.c_notice',
source_tag='wsp_cnotice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.CNotice}CNoticeBase',
),
'SU_PiNotices': WspOp(
name='SU_PiNotices',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=30,
parser_module='wsp.parsers.pi_notice',
source_tag='wsp_pinotice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.PINotice}PiNoticeBase',
),
'SU_RfqNotices': WspOp(
name='SU_RfqNotices',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=7,
parser_module='wsp.parsers.rfq_notice',
source_tag='wsp_rfq_notice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.RfqNotice}RfqNoticeBase',
),
'SU_RfqInvitations': WspOp(
name='SU_RfqInvitations',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=7,
parser_module='wsp.parsers.rfq_invitation',
source_tag='wsp_rfq_invitation',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.RfqInvitation}RfqInvitationBase',
),
'SU_DCNotices': WspOp(
name='SU_DCNotices',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=30,
parser_module='wsp.parsers.dc_notice',
source_tag='wsp_dcnotice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.DCNotice}DCNoticeBase',
),
'SU_PCNotices': WspOp(
name='SU_PCNotices',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=30,
parser_module='wsp.parsers.pc_notice',
source_tag='wsp_pcnotice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.PCNotice}PCNoticeBase',
),
'SU_RdcNotices': WspOp(
name='SU_RdcNotices',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=30,
parser_module='wsp.parsers.rdc_notice',
source_tag='wsp_rdcnotice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.RDCNotice}RdcNoticeBase',
),
'SU_ENotices': WspOp(
name='SU_ENotices',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=30,
parser_module='wsp.parsers.e_notice',
source_tag='wsp_enotice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.ENotice}ENoticeBase',
),
'SU_EAProcedure': WspOp(
name='SU_EAProcedure',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=30,
parser_module='wsp.parsers.ea_procedure',
source_tag='wsp_eaprocedure',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.EAProcedure}EAProcedureBase',
),
}
# Beletage-scoped (own data)
OWN_OPS: dict[str, WspOp] = {
'SuContracts': WspOp(
name='SuContracts',
contract='ISupplierWebService',
request_ns=NS_MODEL_CONTRACTS,
date_start_field='ContractStartDate',
date_end_field='ContractEndDate',
max_window_days=10,
parser_module='wsp.parsers.su_contract',
sink='beletage_contracts',
source_tag='wsp_beletage_contract',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.Contracts}ContractItem',
),
'SuInvoices': WspOp(
name='SuInvoices',
contract='ISupplierWebService',
date_start_field='MinDate',
date_end_field='MaxDate',
max_window_days=10,
parser_module='wsp.parsers.su_invoice',
sink='beletage_invoices',
source_tag='wsp_beletage_invoice',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model}InvoiceItem',
),
'SuDirectAcquisitions': WspOp(
name='SuDirectAcquisitions',
contract='ISupplierWebService',
date_start_field='PublicationStartDate',
date_end_field='PublicationEndDate',
max_window_days=30,
parser_module='wsp.parsers.su_da',
sink='beletage_da',
source_tag='wsp_beletage_da',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model}SUDirectAcquisition',
),
'Catalog_ListItems': WspOp(
name='Catalog_ListItems',
contract='ISupplierWebService',
date_start_field='LastUpdateStart',
date_end_field='LastUpdateEnd',
max_window_days=365, # catalog is small, use wide window
parser_module='wsp.parsers.catalog',
sink='beletage_catalog',
source_tag='wsp_beletage_catalog',
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model}CatalogListItem',
),
}
ALL_OPS: dict[str, WspOp] = {**PUBLIC_OPS, **OWN_OPS}
+199
View File
@@ -0,0 +1,199 @@
"""
Window-aware pagination + auto-split.
Constraints discovered empirically:
- PageSize is server-fixed at 100 items per response.
- Server caps results at PageTotal == 1000 — when reached, the window
truncated and we lose data. Auto-split window in halves until PageTotal < 1000.
- Server silently returns PageTotal=0 for windows that are too wide
(e.g. >7 days for SU_CaNotices) — also triggers auto-split.
Yields parsed item dicts as they arrive; caller is responsible for batching
DB writes. Cursor advancement (last successful publication_date) is tracked
externally in seap.wsp_sync_state.
"""
from __future__ import annotations
import importlib
import logging
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any, Iterator
from lxml import etree
from .client import WspClient
from .operations import WspOp
log = logging.getLogger(__name__)
@dataclass
class WindowResult:
op_name: str
window_start: datetime
window_end: datetime
items_imported: int = 0
pages_fetched: int = 0
page_total_first: int = 0 # PageTotal returned on page 1
sub_windows: list['WindowResult'] = field(default_factory=list) # if split
errors: list[str] = field(default_factory=list)
skipped: bool = False # set if window was outside cursor range
@property
def all_items_imported(self) -> int:
return self.items_imported + sum(s.all_items_imported for s in self.sub_windows)
def _load_parser(op: WspOp):
"""Dynamically import the parser for an operation."""
if not op.parser_module:
return None
try:
mod = importlib.import_module(op.parser_module)
return mod
except ModuleNotFoundError:
log.warning('Parser module %s not found — items will be skipped', op.parser_module)
return None
def _iter_items_xml(items_xml: bytes, op: WspOp):
"""Iterate item elements out of <a:Items>."""
if not items_xml:
return
try:
items_el = etree.fromstring(items_xml)
except etree.XMLSyntaxError as e:
log.error('Cannot parse items XML for %s: %s', op.name, e)
return
for el in items_el.iter():
# Match the configured item tag (or fallback: any direct child)
if op.item_xpath:
qname = op.item_xpath.rsplit('}', 1)[-1]
if etree.QName(el.tag).localname == qname:
yield el
else:
# fallback: yield all children of Items
if el.getparent() is items_el:
yield el
def fetch_window(
client: WspClient,
op: WspOp,
window_start: datetime,
window_end: datetime,
extra_fields: dict | None = None,
on_item: callable = None,
) -> WindowResult:
"""Fetch all pages for a single (start, end) window.
Auto-splits if PageTotal == op.items_cap (server-side cap reached).
Calls `on_item(parsed_dict, raw_xml_element)` for each item parsed.
Returns WindowResult with stats; sub_windows populated if splits occurred.
"""
extra_fields = extra_fields or {}
parser = _load_parser(op)
result = WindowResult(
op_name=op.name,
window_start=window_start,
window_end=window_end,
)
# Build base fields
fields_base = dict(extra_fields)
if op.date_start_field:
fields_base[op.date_start_field] = window_start.strftime('%Y-%m-%dT%H:%M:%S')
if op.date_end_field:
fields_base[op.date_end_field] = window_end.strftime('%Y-%m-%dT%H:%M:%S')
# Page 1
fields = {**fields_base, 'PageIndex': 1}
r = client.call(op, fields)
result.pages_fetched = 1
result.page_total_first = r.page_total
if r.status != 'Success':
result.errors.append(f'page1: {r.status}{r.description}')
return result
# Detect server cap and split
if r.page_total >= op.items_cap and (window_end - window_start) > timedelta(hours=1):
log.info(' %s: PageTotal=%d hit cap (>= %d) on %s%s — splitting',
op.name, r.page_total, op.items_cap, window_start, window_end)
return _split_window(client, op, window_start, window_end, extra_fields, on_item)
# Process page 1 items
if r.items_xml and parser is not None:
for el in _iter_items_xml(r.items_xml, op):
try:
parsed = parser.parse(el)
if parsed and on_item:
on_item(parsed, el)
result.items_imported += 1
except Exception as e:
log.exception('Parse error in %s: %s', op.name, e)
result.errors.append(f'parse: {e}')
# Remaining pages
total_pages = r.num_pages
for page in range(2, total_pages + 1):
fields = {**fields_base, 'PageIndex': page}
r2 = client.call(op, fields)
result.pages_fetched += 1
if r2.status != 'Success':
result.errors.append(f'page{page}: {r2.status}{r2.description}')
break
if parser is None:
continue
for el in _iter_items_xml(r2.items_xml, op):
try:
parsed = parser.parse(el)
if parsed and on_item:
on_item(parsed, el)
result.items_imported += 1
except Exception as e:
log.exception('Parse error in %s page %d: %s', op.name, page, e)
result.errors.append(f'parse p{page}: {e}')
return result
def _split_window(client: WspClient, op: WspOp,
start: datetime, end: datetime,
extra_fields: dict, on_item) -> WindowResult:
"""Recursively split window in half until each fits under cap."""
parent = WindowResult(op_name=op.name, window_start=start, window_end=end)
span = end - start
if span <= timedelta(hours=1):
# Cannot split further — accept truncation
log.warning('Cannot split below 1h for %s [%s, %s] — accepting cap',
op.name, start, end)
# Just fetch with single window and tolerate cap
sub = fetch_window(client, op, start, end, extra_fields, on_item)
parent.sub_windows.append(sub)
return parent
mid = start + span / 2
sub1 = fetch_window(client, op, start, mid, extra_fields, on_item)
sub2 = fetch_window(client, op, mid, end, extra_fields, on_item)
parent.sub_windows.extend([sub1, sub2])
return parent
def split_into_windows(start: datetime, end: datetime,
max_window_days: int) -> list[tuple[datetime, datetime]]:
"""Slice [start, end) into chunks of max_window_days each.
Returns list of (window_start, window_end) tuples.
"""
if start >= end:
return []
span = timedelta(days=max_window_days)
windows = []
cur = start
while cur < end:
nxt = min(cur + span, end)
windows.append((cur, nxt))
cur = nxt
return windows
+214
View File
@@ -0,0 +1,214 @@
"""
Base notice parser — shared logic for CN, PI, RFQ, DC, PC, Rdc, EN notices.
CA notice has more complex structure (lots + winners) so it has its own parser.
"""
from __future__ import annotations
from ..xml_utils import (
find_child_local, find_local, find_path,
text_under, text_direct, int_under, decimal_under, bool_under,
datetime_under, sysitem_name, sysitem_id,
)
def parse_basic_notice(el, *, type_tag: str, source_tag: str,
notice_id_field: str = None) -> dict | None:
"""Generic notice parser — works for CN, PI, DC, PC, Rdc, EN, RFQ.
type_tag: short type identifier for the row (e.g. 'c_notice', 'pi_notice')
source_tag: value for source column (e.g. 'wsp_cnotice')
notice_id_field: local name of the ID field (CNoticeId, PiNoticeId, etc.)
If None, auto-detect by trying common names.
"""
notice_no = (text_under(el, 'NoticeNo') or
text_under(el, 'CNoticeNumber') or
text_under(el, 'PiNoticeNumber') or
text_under(el, 'NoticeNumber') or
text_under(el, 'RFQInvitationNumber') or
text_under(el, 'RFQNoticeNumber') or
text_under(el, 'DCNoticeNumber') or
text_under(el, 'PCNoticeNumber') or
text_under(el, 'RDCNoticeNumber') or
text_under(el, 'ENoticeNumber') or
text_under(el, 'EAProcedureNumber') or
text_under(el, 'DfNoticeNo')) # last-resort fallback
if not notice_no:
return None
notice_id = None
for field in (notice_id_field, 'CNoticeId', 'PiNoticeId', 'NoticeId',
'CaNoticeId', 'DCNoticeId', 'PCNoticeId',
'RFQNoticeId', 'RdcNoticeId', 'ENoticeId',
'EAProcedureId', 'RFQInvitationId'):
if field:
notice_id = int_under(el, field)
if notice_id is not None:
break
general = find_child_local(el, 'General')
section1 = find_child_local(el, 'Section1')
section2 = find_child_local(el, 'Section2')
section4 = find_child_local(el, 'Section4')
# Authority
auth_addresses = find_path(section1, 'Section1_1', 'CaAddresses')
if auth_addresses is None:
auth_addresses = find_path(section1, 'Section1_1')
auth_info = find_local(auth_addresses, 'EntityInformation') if auth_addresses is not None else None
authority_name = text_direct(auth_info, 'Name') if auth_info is not None else None
authority_cui = text_direct(auth_info, 'Cif') if auth_info is not None else None
authority_address = text_direct(auth_info, 'Address') if auth_info is not None else None
authority_email = text_direct(auth_info, 'Email') if auth_info is not None else None
authority_phone = text_direct(auth_info, 'Phone') if auth_info is not None else None
authority_url = text_direct(auth_info, 'Url') if auth_info is not None else None
county_code = sysitem_name(auth_info, 'NutsCode') if auth_info is not None else None
entity_id = int_under(general, 'EntityId') if general is not None else None
s1_4 = find_child_local(section1, 'Section1_4') if section1 is not None else None
authority_type = sysitem_name(s1_4, 'ContractingAuthorityType')
s1_5 = find_child_local(section1, 'Section1_5') if section1 is not None else None
main_activity = sysitem_name(s1_5, 'MainActivity')
# Section 2 — contract
s2_1 = find_child_local(section2, 'Section2_1') if section2 is not None else None
contract_title = (text_under(general, 'ContractTitle') or
text_under(s2_1, 'ContractName') or
text_under(s2_1, 'Title'))
short_desc = text_under(s2_1, 'ShortContractDescription')
main_cpv_code = sysitem_name(s2_1, 'MainCPV') or sysitem_name(s2_1, 'MainCPVCode')
main_cpv_id = sysitem_id(s2_1, 'MainCPV') or sysitem_id(s2_1, 'MainCPVCode')
contract_type = sysitem_name(s2_1, 'SysAcquisitionContractType')
currency = sysitem_name(s2_1, 'Currency')
estimated_value = decimal_under(s2_1, 'EstimatedValue') or decimal_under(s2_1, 'TotalValue')
has_lots = bool_under(s2_1, 'ContractHasLots')
reference_number = text_under(s2_1, 'ReferenceNumber')
# Lots — for CN/RFQ/etc., lots in Section2_2
lots = _extract_lots_simple(section2)
lots_count = len(lots) if lots else None
# Procedure
s4_1 = find_child_local(section4, 'Section4_1') if section4 is not None else None
procedure_type = sysitem_name(s4_1, 'SysProcedureType')
framework_agreement = bool_under(s4_1, 'FrameworkAgreement')
# Section 4_2 — deadlines
s4_2 = find_child_local(section4, 'Section4_2') if section4 is not None else None
deadline_submission = (datetime_under(s4_2, 'TenderAvailabilityDeadline') or
datetime_under(s4_2, 'ReceiptTimeLimit') or
datetime_under(s4_2, 'ReceiptDeadline'))
opening_date = datetime_under(s4_2, 'TenderOpeningDate')
# Dates + state
publication_date = datetime_under(general, 'PublishDate')
legislation = sysitem_name(general, 'SysLegislationType') or sysitem_name(general, 'LegislationType')
notice_state = sysitem_name(general, 'SysNoticeState')
notice_state_id = sysitem_id(general, 'SysNoticeState')
is_utility = bool_under(general, 'IsUtility')
notice_no_joue = text_under(general, 'NoticeNoJoue') or text_under(general, 'JOUEPublicationNumber')
# Documents
documents = _extract_documents(general)
return {
'type': type_tag,
'ref_number': f'WSP-{notice_no}',
'authority_name': authority_name,
'authority_cui': authority_cui,
'authority_address': authority_address,
'authority_email': authority_email,
'authority_phone': authority_phone,
'authority_url': authority_url,
'authority_type': authority_type,
'authority_main_activity': main_activity,
'authority_entity_id': entity_id,
'title': contract_title[:1000] if contract_title else None,
'cpv_code': main_cpv_code,
'contract_type': contract_type,
'publication_date': publication_date,
'estimated_value': estimated_value,
'awarded_value': None,
'currency': currency,
'supplier_name': None,
'supplier_cui': None,
'procedure_type': procedure_type,
'procedure_state': notice_state,
'legislation': legislation,
'has_lots': 'da' if has_lots else 'nu' if has_lots is False else None,
'contract_has_lots': has_lots,
'lots_count': lots_count,
'joue': notice_no_joue,
'county_code': county_code,
'notice_state': notice_state,
'notice_state_id': notice_state_id,
'framework_agreement': framework_agreement,
'notice_id_internal': notice_id,
'deadline_submission': deadline_submission,
'opening_date': opening_date,
'documents': documents or None,
'lots': lots or None,
'details': {
'short_description': short_desc,
'reference_number': reference_number,
'main_cpv_id': main_cpv_id,
'is_utility': is_utility,
},
'source': source_tag,
}
def _extract_lots_simple(section2) -> list[dict]:
"""Extract Lots list from Section2_2 → Lots → LotInfo."""
if section2 is None:
return []
lots_list = find_local(section2, 'Lots')
if lots_list is None:
return []
out = []
for lot in lots_list:
if etree.QName(lot.tag).localname != 'LotInfo':
continue
lot_data = {
'lot_id': int_under(lot, 'LotID'),
'lot_no': int_under(lot, 'LotNo'),
'title': text_under(lot, 'Title'),
'description': text_under(lot, 'DescriptionOfProcurement'),
'cpv_code': sysitem_name(lot, 'MainCPVCode'),
'estimated_value': _str_decimal(decimal_under(lot, 'EstimatedValue')),
'duration_months': int_under(lot, 'DurationInMonths'),
'duration_days': int_under(lot, 'DurationInDays'),
'currency': sysitem_name(lot, 'Currency'),
'place_of_performance': text_under(lot, 'MainSiteOrPlaceOfPerformance'),
'is_community_financed': bool_under(lot, 'IsCommunityFinanced'),
}
out.append({k: v for k, v in lot_data.items() if v is not None})
return out
def _extract_documents(general) -> list[dict]:
if general is None:
return []
out = []
for fld in ('NoticeFiles', 'CompanyFiles', 'DfNoticeFiles'):
container = find_local(general, fld)
if container is None:
continue
for kvp in container:
key = find_local(kvp, 'key')
if key is None:
key = find_local(kvp, 'Key')
val = find_local(kvp, 'value')
if val is None:
val = find_local(kvp, 'Value')
if key is not None and val is not None:
out.append({'type': fld, 'name': key.text, 'id': val.text})
return out
def _str_decimal(d):
return str(d) if d is not None else None
from lxml import etree # noqa: E402
@@ -0,0 +1,6 @@
"""SU_CNotices — Anunțuri de participare."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='c_notice', source_tag='wsp_cnotice',
notice_id_field='CNoticeId')
@@ -0,0 +1,287 @@
"""Parse SU_CaNotices items (Anunțuri de atribuire) → seap.announcements row dict."""
from __future__ import annotations
from ..xml_utils import (
find_child_local, find_local, find_path,
text_under, text_direct, int_under, decimal_under, bool_under,
datetime_under, sysitem_name, sysitem_id, to_jsonable,
)
SOURCE = 'wsp_canotice'
def parse(el) -> dict | None:
"""Extract a CaNoticeBase element (V1 or V2) into our normalized dict."""
notice_no = text_under(el, 'NoticeNo')
notice_id = int_under(el, 'CaNoticeId')
if not notice_no:
return None
general = find_child_local(el, 'General')
section1 = find_child_local(el, 'Section1')
section2 = find_child_local(el, 'Section2')
section4 = find_child_local(el, 'Section4')
section5 = find_child_local(el, 'Section5')
# Authority block: Section1_1 → CaAddresses → CaEntityInformation → EntityInformation
auth_entity = find_path(section1, 'Section1_1', 'CaAddresses')
auth_info = None
if auth_entity is not None:
auth_info = find_local(auth_entity, 'EntityInformation')
authority_name = text_direct(auth_info, 'Name') if auth_info is not None else None
authority_cui = text_direct(auth_info, 'Cif') if auth_info is not None else None
authority_address = text_direct(auth_info, 'Address') if auth_info is not None else None
authority_email = text_direct(auth_info, 'Email') if auth_info is not None else None
authority_phone = text_direct(auth_info, 'Phone') if auth_info is not None else None
authority_url = text_direct(auth_info, 'Url') if auth_info is not None else None
# NUTS code = county-ish (RO213 etc.)
nuts_el = find_local(auth_info, 'NutsCode') if auth_info is not None else None
county_code = sysitem_name(auth_info, 'NutsCode') if auth_info is not None else None
# Authority type + main activity
s1_4 = find_child_local(section1, 'Section1_4') if section1 is not None else None
authority_type = sysitem_name(s1_4, 'ContractingAuthorityType')
s1_5 = find_child_local(section1, 'Section1_5') if section1 is not None else None
main_activity = sysitem_name(s1_5, 'MainActivity')
# Section 2: contract details
s2_1 = find_child_local(section2, 'Section2_1') if section2 is not None else None
contract_title = text_under(general, 'ContractTitle') or text_under(s2_1, 'ContractName')
short_desc = text_under(s2_1, 'ShortContractDescription')
main_cpv_id = sysitem_id(s2_1, 'MainCPV')
main_cpv_code = sysitem_name(s2_1, 'MainCPV')
contract_type = sysitem_name(s2_1, 'SysAcquisitionContractType')
currency = sysitem_name(s2_1, 'Currency')
total_value = decimal_under(s2_1, 'TotalValue')
highest_offer = decimal_under(s2_1, 'HighestOffer')
lowest_offer = decimal_under(s2_1, 'LowestOffer')
has_lots = bool_under(s2_1, 'ContractHasLots')
reference_number = text_under(s2_1, 'ReferenceNumber')
# Lots — Section 5 (ContractLotList) for awarded notices, Section 2_2 otherwise
lots = _extract_lots(section5) or _extract_lots(section2)
lots_count = len(lots) if lots else None
# Award criteria — extracted from lot info or top-level
award_criteria = _extract_award_criteria(el)
# Section 4: procedure details
s4_1 = find_child_local(section4, 'Section4_1') if section4 is not None else None
procedure_type = sysitem_name(s4_1, 'SysProcedureType')
framework_agreement = bool_under(s4_1, 'FrameworkAgreement')
# Dates
publication_date = datetime_under(general, 'PublishDate')
legislation = sysitem_name(general, 'SysLegislationType')
notice_state = sysitem_name(general, 'SysNoticeState')
notice_state_id = sysitem_id(general, 'SysNoticeState')
is_utility = bool_under(general, 'IsUtility')
notice_no_joue = text_under(general, 'NoticeNoJoue')
entity_id = int_under(general, 'EntityId')
# Winners (suppliers) — extracted from lot info
winners = _extract_winners(section5)
# Pick first winner as primary; full list in lots JSONB
primary_winner = winners[0] if winners else {}
# Documents — DfNoticeFiles, CompanyFiles
documents = _extract_documents(general)
# Final award value
awarded_value = total_value
if not awarded_value and lots:
# sum lot values
try:
awarded_value = sum(
(lot.get('total_value') or 0) for lot in lots if lot.get('total_value')
) or None
except Exception:
pass
return {
# Standard columns
'type': 'ca_notice',
'ref_number': f'WSP-{notice_no}',
'authority_name': authority_name,
'authority_cui': authority_cui,
'authority_address': authority_address,
'authority_email': authority_email,
'authority_phone': authority_phone,
'authority_url': authority_url,
'authority_type': authority_type,
'authority_main_activity': main_activity,
'authority_entity_id': entity_id,
'title': contract_title[:1000] if contract_title else None,
'cpv_code': main_cpv_code,
'contract_type': contract_type,
'publication_date': publication_date,
'estimated_value': None,
'awarded_value': awarded_value,
'currency': currency,
'supplier_name': primary_winner.get('name'),
'supplier_cui': primary_winner.get('cif'),
'supplier_address': primary_winner.get('address'),
'supplier_is_sme': primary_winner.get('is_sme'),
'procedure_type': procedure_type,
'procedure_state': notice_state,
'legislation': legislation,
'lot_number': None,
'has_lots': 'da' if has_lots else 'nu' if has_lots is False else None,
'contract_has_lots': has_lots,
'lots_count': lots_count,
'joue': notice_no_joue,
'county_code': county_code,
'notice_state': notice_state,
'notice_state_id': notice_state_id,
'framework_agreement': framework_agreement,
'notice_id_internal': notice_id,
'seap_url': f'https://e-licitatie.ro/pub/notices/contract-award-notice/{notice_id}' if notice_id else None,
# Rich JSONB
'documents': documents or None,
'award_criteria': award_criteria or None,
'lots': lots or None,
'details': {
'short_description': short_desc,
'reference_number': reference_number,
'main_cpv_id': main_cpv_id,
'highest_offer': str(highest_offer) if highest_offer else None,
'lowest_offer': str(lowest_offer) if lowest_offer else None,
'is_utility': is_utility,
'all_winners': winners or None,
},
'source': SOURCE,
}
def _extract_lots(section) -> list[dict]:
"""Extract lot info — handles ContractLotList (Section5) or Lots (Section2_2)."""
if section is None:
return []
lot_list = find_local(section, 'ContractLotList')
if lot_list is None:
lot_list = find_local(section, 'Lots')
if lot_list is None:
return []
out = []
for lot in lot_list:
if etree.QName(lot.tag).localname not in ('ContractLotInfo', 'LotInfo'):
continue
lot_data = {
'lot_id': int_under(lot, 'LotID') or int_under(lot, 'ContractNo'),
'lot_no': int_under(lot, 'LotNo') or int_under(lot, 'ContractNo'),
'title': text_under(lot, 'Title') or text_under(lot, 'ContractObjectName'),
'description': text_under(lot, 'DescriptionOfProcurement') or text_under(lot, 'ContractObjectDescription'),
'cpv_code': sysitem_name(lot, 'MainCPVCode'),
'estimated_value': _decimal_or_none(text_under(lot, 'EstimatedValue')),
'total_value': _decimal_or_none(text_under(lot, 'TotalValue')),
'duration_months': int_under(lot, 'DurationInMonths'),
'duration_days': int_under(lot, 'DurationInDays'),
'currency': sysitem_name(lot, 'Currency'),
'place_of_performance': text_under(lot, 'MainSiteOrPlaceOfPerformance'),
'is_community_financed': bool_under(lot, 'IsCommunityFinanced'),
'has_options': bool_under(lot, 'HasOptions'),
'awarded_to_group': bool_under(lot, 'AwardedToGroupOfEcoOp'),
}
# Convert decimals to str for JSON safety
for k in ('estimated_value', 'total_value'):
if lot_data[k] is not None:
lot_data[k] = str(lot_data[k])
# only keep non-empty
out.append({k: v for k, v in lot_data.items() if v is not None})
return out
def _extract_winners(section5) -> list[dict]:
"""Extract winner info from ContractLotList → ContractorAddressList."""
if section5 is None:
return []
out = []
seen_cifs = set()
lot_list = find_local(section5, 'ContractLotList')
if lot_list is None:
return []
for lot in lot_list:
addr_list = find_local(lot, 'ContractorAddressList')
if addr_list is None:
continue
for sect in addr_list:
entity = find_local(sect, 'EntityInformation')
if entity is None:
continue
cif = text_direct(entity, 'Cif')
if cif in seen_cifs:
continue
if cif:
seen_cifs.add(cif)
out.append({
'name': text_direct(entity, 'Name'),
'cif': cif,
'address': text_direct(entity, 'Address'),
'email': text_direct(entity, 'Email'),
'phone': text_direct(entity, 'Phone'),
'url': text_direct(entity, 'Url'),
'nuts_code': sysitem_name(entity, 'NutsCode'),
'is_sme': bool_under(sect, 'IsSme'),
})
return out
def _extract_award_criteria(el) -> list[dict]:
"""Extract award criteria from lot info (MyPriceAwardCriterias / MyQualityAwardCriterias)."""
out = []
for crit_list_name in ('MyPriceAwardCriterias', 'MyQualityAwardCriterias'):
for crit_list in el.iter():
if etree.QName(crit_list.tag).localname != crit_list_name:
continue
for crit in crit_list:
name = text_under(crit, 'Name') or text_under(crit, 'Description')
weight = decimal_under(crit, 'Weight') or decimal_under(crit, 'Pondere')
if name or weight:
item = {'type': 'price' if 'Price' in crit_list_name else 'quality',
'name': name}
if weight is not None:
item['weight'] = str(weight)
out.append(item)
return out
def _extract_documents(general) -> list[dict]:
"""Extract document file references from DfNoticeFiles + CompanyFiles."""
if general is None:
return []
out = []
for fld in ('DfNoticeFiles', 'CompanyFiles', 'NoticeFiles'):
container = find_local(general, fld)
if container is None:
continue
for kvp in container:
key = find_local(kvp, 'key')
if key is None:
key = find_local(kvp, 'Key')
val = find_local(kvp, 'value')
if val is None:
val = find_local(kvp, 'Value')
if key is not None and val is not None:
out.append({
'type': fld,
'name': key.text,
'id': val.text,
})
return out
def _decimal_or_none(s):
if not s:
return None
try:
from decimal import Decimal
return Decimal(s)
except Exception:
return None
# late import to avoid circular
from lxml import etree # noqa: E402
@@ -0,0 +1,19 @@
"""Catalog_ListItems — Beletage product catalog."""
from __future__ import annotations
from ..xml_utils import text_under, decimal_under, datetime_under, sysitem_name, to_jsonable, int_under
def parse(el) -> dict | None:
item_code = text_under(el, 'ItemCode') or text_under(el, 'CatalogItemCode')
if not item_code:
return None
return {
'item_code': item_code,
'item_name': text_under(el, 'Name') or text_under(el, 'ItemName'),
'cpv_code': sysitem_name(el, 'CpvCode'),
'unit_price': decimal_under(el, 'UnitPrice') or decimal_under(el, 'Price'),
'currency': sysitem_name(el, 'Currency'),
'last_updated': datetime_under(el, 'LastUpdate'),
'details': to_jsonable(el),
}
@@ -0,0 +1,6 @@
"""SU_DCNotices — Concurs de soluții."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='dc_notice', source_tag='wsp_dcnotice',
notice_id_field='DCNoticeId')
@@ -0,0 +1,6 @@
"""SU_ENotices — Erate (corections to other notices)."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='e_notice', source_tag='wsp_enotice',
notice_id_field='ENoticeId')
@@ -0,0 +1,6 @@
"""SU_EAProcedure — Licitații electronice."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='ea_procedure', source_tag='wsp_eaprocedure',
notice_id_field='EAProcedureId')
@@ -0,0 +1,6 @@
"""SU_PCNotices — Anunțuri concesionari."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='pc_notice', source_tag='wsp_pcnotice',
notice_id_field='PCNoticeId')
@@ -0,0 +1,6 @@
"""SU_PiNotices — Anunțuri de intenție."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='pi_notice', source_tag='wsp_pinotice',
notice_id_field='PiNoticeId')
@@ -0,0 +1,6 @@
"""SU_RdcNotices — Rezultate desemnare castigator concurs (rare)."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='rdc_notice', source_tag='wsp_rdcnotice',
notice_id_field='RdcNoticeId')
@@ -0,0 +1,6 @@
"""SU_RfqInvitations — Cereri de oferta (RFQ invitations)."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='rfq_invitation', source_tag='wsp_rfq_invitation',
notice_id_field='RFQInvitationId')
@@ -0,0 +1,6 @@
"""SU_RfqNotices — Anunțuri de atribuire la cererea de oferta."""
from ._base import parse_basic_notice
def parse(el):
return parse_basic_notice(el, type_tag='rfq_notice', source_tag='wsp_rfq_notice',
notice_id_field='RFQNoticeId')
@@ -0,0 +1,83 @@
"""SuContracts — Beletage's own contracts (writes to seap.beletage_contracts)."""
from __future__ import annotations
from ..xml_utils import (
find_child_local, find_local, text_under, text_direct,
int_under, decimal_under, bool_under, datetime_under,
sysitem_name, sysitem_id, to_jsonable,
)
def parse(el) -> dict | None:
contract_id = int_under(el, 'ContractId')
if contract_id is None:
return None
# Top-level fields directly under ContractItem
contract_no = text_under(el, 'ContractNo')
contract_title = text_under(el, 'ContractTitle')
contract_value = decimal_under(el, 'ContractValue')
default_currency_value = decimal_under(el, 'DefaultCurrencyContractValue')
awarding_date = datetime_under(el, 'ContractAwardingDate')
contract_date = datetime_under(el, 'ContractDate')
publication_date = datetime_under(el, 'PublicationDate')
duration_months = int_under(el, 'MonthsContractDuration')
is_current = bool_under(el, 'IsCurrentVersion')
is_rejected = bool_under(el, 'IsRejected')
version_no = int_under(el, 'VersionNo')
version_date = datetime_under(el, 'VersionDate')
justification = text_under(el, 'Justification')
additional_info = text_under(el, 'AdditionalInformation')
contract_phase = sysitem_name(el, 'SysContractPhase')
contract_state = sysitem_name(el, 'SysContractState')
contract_type = sysitem_name(el, 'SysContractType')
currency = sysitem_name(el, 'ContractValueCurrency')
# CA Notice details: nested under <CANotice><General>...</General></CANotice>
ca_notice = find_child_local(el, 'CANotice')
ca_notice_id = None
ca_notice_no = None
authority_name = None
authority_cui = None
if ca_notice is not None:
general = find_child_local(ca_notice, 'General')
if general is not None:
ca_notice_id = int_under(general, 'CaNoticeId')
ca_notice_no = text_under(general, 'NoticeNo')
section1 = find_child_local(ca_notice, 'Section1')
if section1 is not None:
auth_addresses = find_local(section1, 'CaAddresses')
if auth_addresses is not None:
auth_info = find_local(auth_addresses, 'EntityInformation')
if auth_info is not None:
authority_name = text_direct(auth_info, 'Name')
authority_cui = text_direct(auth_info, 'Cif')
return {
'contract_id': contract_id,
'contract_no': contract_no,
'contract_title': contract_title[:1000] if contract_title else None,
'contract_type': contract_type,
'contract_phase': contract_phase,
'contract_state': contract_state,
'awarding_date': awarding_date.date() if awarding_date else None,
'contract_date': contract_date.date() if contract_date else None,
'publication_date': publication_date,
'duration_months': duration_months,
'contract_value': contract_value,
'default_currency_value': default_currency_value,
'currency': currency,
'ca_notice_id': ca_notice_id,
'ca_notice_no': ca_notice_no,
'authority_name': authority_name,
'authority_cui': authority_cui,
'is_current_version': is_current,
'is_rejected': is_rejected,
'version_no': version_no,
'version_date': version_date,
'justification': justification,
'additional_information': additional_info,
'details': to_jsonable(el),
}
@@ -0,0 +1,32 @@
"""SuDirectAcquisitions — Beletage's own direct acquisitions (won)."""
from __future__ import annotations
from ..xml_utils import (
find_child_local, find_local, text_under, text_direct,
int_under, decimal_under, datetime_under, sysitem_name, to_jsonable,
)
def parse(el) -> dict | None:
da_id = int_under(el, 'DirectAcquisitionId') or int_under(el, 'DirectAcquisitionID')
if da_id is None:
return None
cpv_code = sysitem_name(el, 'CpvCode')
return {
'da_id': da_id,
'da_name': text_under(el, 'DirectAcquisitionName'),
'unique_identification_code': text_under(el, 'UniqueIdentificationCode'),
'cpv_code': cpv_code,
'cpv_name': cpv_code, # name == code in this WSDL
'contract_type': sysitem_name(el, 'SysAcquisitionContractType'),
'publication_date': datetime_under(el, 'PublicationDate'),
'finalization_date': datetime_under(el, 'FinalizationDate'),
'estimated_value': decimal_under(el, 'EstimatedValue') or decimal_under(el, 'EstimatedValueRon'),
'closing_value': decimal_under(el, 'ClosingValue'),
'currency': 'RON',
'da_state': sysitem_name(el, 'SysDirectAcquisitionState'),
'authority_id': int_under(el, 'ContractingAuthorityID'),
'authority_name': text_under(el, 'ContractingAuthority'),
'details': to_jsonable(el),
}
@@ -0,0 +1,49 @@
"""SuInvoices — Beletage's own invoices (writes to seap.beletage_invoices)."""
from __future__ import annotations
from ..xml_utils import (
find_child_local, find_local, text_under, text_direct,
int_under, decimal_under, datetime_under, sysitem_name, to_jsonable,
)
def parse(el) -> dict | None:
invoice_id = int_under(el, 'InvoiceId')
if invoice_id is None:
return None
# Sums of payments
paid_value = None
paid_at = None
payments = find_local(el, 'Payments')
if payments is not None:
for pay in payments:
v = decimal_under(pay, 'Value')
d = datetime_under(pay, 'Date')
if v is not None:
paid_value = (paid_value or 0) + v
if d is not None and (paid_at is None or d > paid_at):
paid_at = d
return {
'invoice_id': invoice_id,
'invoice_no': text_under(el, 'InvoiceNumber') or text_under(el, 'InvoiceNo'),
'invoice_date': (datetime_under(el, 'InvoiceDate') or datetime_under(el, 'IssueDate')),
'due_date': datetime_under(el, 'DueDate'),
'contract_id': int_under(el, 'ContractId'),
'contract_no': text_under(el, 'ContractNo'),
'authority_name': text_under(el, 'BuyerName') or text_under(el, 'AuthorityName'),
'authority_cui': text_under(el, 'BuyerCif') or text_under(el, 'AuthorityCif'),
'total_value': decimal_under(el, 'TotalValue') or decimal_under(el, 'GrandTotal'),
'total_value_no_vat': decimal_under(el, 'TotalValueNoVat') or decimal_under(el, 'NetTotal'),
'vat_value': decimal_under(el, 'VatValue') or decimal_under(el, 'VAT'),
'currency': sysitem_name(el, 'Currency') or text_under(el, 'CurrencyCode'),
'state': sysitem_name(el, 'SysInvoiceState') or text_under(el, 'State'),
'paid_value': paid_value,
'paid_at': paid_at,
'details': to_jsonable(el),
}
def _date_only(dt):
return dt.date() if dt else None
+142
View File
@@ -0,0 +1,142 @@
"""
WSP CLI runner.
Usage:
python -m wsp.runner status
python -m wsp.runner incremental [op_name|all]
python -m wsp.runner backfill <op_name> --from YYYY-MM-DD --to YYYY-MM-DD [--workers 3]
python -m wsp.runner test
"""
from __future__ import annotations
import argparse
import logging
import sys
from datetime import datetime, timedelta
from . import db, sync
from .operations import ALL_OPS, PUBLIC_OPS, OWN_OPS
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s %(levelname)s [%(name)s] %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
)
log = logging.getLogger('wsp.runner')
def cmd_status(args):
with db.connection() as conn:
cur = conn.cursor()
cur.execute("""
SELECT feed, last_run_at, last_cursor_date,
items_imported_total, items_imported_24h,
consecutive_errors, last_error
FROM seap.wsp_sync_state ORDER BY feed
""")
rows = cur.fetchall()
print(f'\n{"Feed":<25} {"Last run":<22} {"Cursor":<22} {"Total":>10} {"Errors"}')
print('-' * 100)
for r in rows:
feed, last_run, cursor, total, _, errs, err_msg = r
err_str = err_msg[:40] + '...' if err_msg and len(err_msg) > 40 else (err_msg or '')
print(f'{feed:<25} {str(last_run)[:19]:<22} {str(cursor)[:19]:<22} '
f'{total or 0:>10,} {errs}/{err_str}')
cur.execute("""
SELECT feed, state, count(*) FROM seap.wsp_backfill_windows
GROUP BY feed, state ORDER BY feed, state
""")
rows = cur.fetchall()
if rows:
print(f'\n{"Backfill queue":<30}')
print('-' * 60)
for feed, state, count in rows:
print(f' {feed:<25} {state:<15} {count:>8}')
cur.execute("""
SELECT source, count(*) FROM seap.announcements
WHERE source LIKE 'wsp_%' GROUP BY source ORDER BY source
""")
rows = cur.fetchall()
if rows:
print(f'\n{"Source":<25} {"Rows in seap.announcements"}')
print('-' * 60)
for source, count in rows:
print(f' {source:<25} {count:>10,}')
def cmd_incremental(args):
target_ops = args.ops if args.ops != ['all'] else list(ALL_OPS)
for op_name in target_ops:
if op_name not in ALL_OPS:
log.error('Unknown op: %s', op_name)
continue
log.info('=== Running incremental: %s ===', op_name)
result = sync.run_incremental(op_name, lookback_hours=args.lookback_hours)
log.info('Result: %s', result)
def cmd_backfill(args):
if args.op not in ALL_OPS:
log.error('Unknown op: %s. Available: %s', args.op, list(ALL_OPS))
sys.exit(1)
start = datetime.fromisoformat(args.start)
end = datetime.fromisoformat(args.end)
log.info('=== Backfill %s [%s, %s) workers=%d ===',
args.op, start, end, args.workers)
result = sync.run_backfill(args.op, start, end, workers=args.workers)
log.info('Done: %s', result)
def cmd_test(args):
"""Run a 1-day end-to-end test on yesterday (all SU_* feeds)."""
yesterday = datetime.now().date() - timedelta(days=1)
start = datetime.combine(yesterday, datetime.min.time())
end = datetime.combine(yesterday, datetime.max.time())
feeds_to_test = ['SU_CaNotices', 'SU_CNotices', 'SU_PiNotices',
'SU_RfqNotices', 'SU_RfqInvitations',
'SU_DCNotices', 'SU_PCNotices', 'SU_RdcNotices', 'SU_ENotices']
log.info('=== Smoke test on %s ===', yesterday)
for op_name in feeds_to_test:
log.info('--- %s ---', op_name)
try:
r = sync.run_backfill(op_name, start, end, workers=1, enqueue=True)
log.info('%d items imported', r['items_imported'])
except Exception as e:
log.exception(' → ERROR: %s', e)
def main():
parser = argparse.ArgumentParser(prog='wsp.runner')
sub = parser.add_subparsers(dest='cmd', required=True)
sub.add_parser('status')
p_inc = sub.add_parser('incremental')
p_inc.add_argument('ops', nargs='*', default=['all'],
help='Operations to sync (default: all)')
p_inc.add_argument('--lookback-hours', type=int, default=36)
p_bf = sub.add_parser('backfill')
p_bf.add_argument('op')
p_bf.add_argument('--start', required=True, help='YYYY-MM-DD or full ISO')
p_bf.add_argument('--end', required=True)
p_bf.add_argument('--workers', type=int, default=3)
sub.add_parser('test')
args = parser.parse_args()
handlers = {
'status': cmd_status,
'incremental': cmd_incremental,
'backfill': cmd_backfill,
'test': cmd_test,
}
handlers[args.cmd](args)
if __name__ == '__main__':
main()
+193
View File
@@ -0,0 +1,193 @@
"""
Sync orchestrator — backfill (range, parallel) and incremental modes.
Backfill enqueues windows in seap.wsp_backfill_windows; workers claim
atomically (SKIP LOCKED), fetch, parse, write, mark complete. Resumable.
Incremental: reads cursor, fetches (cursor, now), updates cursor on success.
"""
from __future__ import annotations
import logging
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone
from typing import Optional
from .client import WspClient, make_client_from_env
from . import db
from .operations import WspOp, ALL_OPS
from .pagination import fetch_window, split_into_windows
log = logging.getLogger(__name__)
def enqueue_backfill_windows(op: WspOp, start: datetime, end: datetime) -> int:
"""Pre-populate seap.wsp_backfill_windows for [start, end) using op's max window."""
windows = split_into_windows(start, end, op.max_window_days)
queued = 0
with db.connection() as conn:
cur = conn.cursor()
for ws, we in windows:
new_id = db.upsert_backfill_window(cur, op.name, ws, we)
if new_id:
queued += 1
log.info('Queued %d windows for %s [%s, %s)', queued, op.name, start, end)
return queued
def process_one_window(client: WspClient, op: WspOp, ws: datetime, we: datetime,
win_id: int | None = None) -> dict:
"""Fetch + parse + write one window. Returns stats."""
rows: list[dict] = []
sink = db.SINK_FUNCS[op.sink]
def collect(parsed, _el):
if parsed:
rows.append(parsed)
result = fetch_window(client, op, ws, we, on_item=collect)
items_imported = 0
error_msg = None
try:
with db.connection() as conn:
cur = conn.cursor()
items_imported = sink(cur, rows)
except Exception as e:
error_msg = str(e)[:500]
log.exception('DB write failed for %s [%s, %s)', op.name, ws, we)
if win_id is not None:
try:
with db.connection() as conn:
cur = conn.cursor()
db.complete_backfill_window(
cur, win_id, items_imported, result.page_total_first,
error=error_msg,
)
except Exception:
log.exception('Failed to mark window %d complete', win_id)
return {
'window_start': ws,
'window_end': we,
'items_imported': items_imported,
'page_total': result.page_total_first,
'pages_fetched': result.pages_fetched,
'errors': result.errors,
'sub_windows': len(result.sub_windows),
'error': error_msg,
}
def run_backfill(op_name: str, start: datetime, end: datetime,
workers: int = 3, enqueue: bool = True,
endpoint: str | None = None) -> dict:
"""Backfill an operation across [start, end) with parallel workers.
Each worker holds its own WspClient (own SSL session) for thread safety.
"""
op = ALL_OPS[op_name]
if enqueue:
enqueue_backfill_windows(op, start, end)
log.info('Starting backfill: op=%s workers=%d range=[%s, %s)',
op_name, workers, start, end)
stats = {'op': op_name, 'windows_processed': 0, 'items_imported': 0,
'errors': []}
stop_event = threading.Event()
def worker_loop(worker_id: int):
client = make_client_from_env(endpoint or 'https://e-licitatie.ro:8883/Pub')
local_count = 0
while not stop_event.is_set():
with db.connection() as conn:
cur = conn.cursor()
claimed = db.claim_backfill_window(cur, op_name)
if not claimed:
log.info('Worker %d: no more pending windows, exiting', worker_id)
return local_count
ws, we, win_id = claimed['window_start'], claimed['window_end'], claimed['id']
log.info(' W%d: processing [%s%s) (attempt %d)',
worker_id, ws, we, claimed['attempts'])
try:
r = process_one_window(client, op, ws, we, win_id=win_id)
local_count += r['items_imported']
log.info(' W%d: done [%s%s) — items=%d page_total=%d',
worker_id, ws, we, r['items_imported'], r['page_total'])
except Exception as e:
log.exception('Worker %d failed on window %s-%s', worker_id, ws, we)
with db.connection() as conn:
cur = conn.cursor()
db.complete_backfill_window(cur, win_id, 0, 0, error=str(e)[:500])
return local_count
with ThreadPoolExecutor(max_workers=workers) as pool:
futures = [pool.submit(worker_loop, i) for i in range(workers)]
for f in as_completed(futures):
try:
count = f.result()
stats['items_imported'] += count
except Exception as e:
stats['errors'].append(str(e))
log.info('Backfill done: op=%s items=%d', op_name, stats['items_imported'])
return stats
def run_incremental(op_name: str, lookback_hours: int = 36,
endpoint: str | None = None) -> dict:
"""Incremental sync: from (cursor or now-lookback) to now.
Cursor advances to window_end on success.
"""
op = ALL_OPS[op_name]
client = make_client_from_env(endpoint or 'https://e-licitatie.ro:8883/Pub')
# Use tz-aware UTC so we can compare with cursor (TIMESTAMPTZ from Postgres).
end = datetime.now(timezone.utc)
with db.connection() as conn:
cur = conn.cursor()
state = db.get_cursor(cur, op_name) or {}
cursor = state.get('last_cursor_date')
if cursor is None:
start = end - timedelta(hours=lookback_hours)
else:
# overlap by 1 hour to catch late-publishing
# Ensure cursor is tz-aware (assume UTC for any naive value from DB).
if cursor.tzinfo is None:
cursor = cursor.replace(tzinfo=timezone.utc)
start = cursor - timedelta(hours=1)
windows = split_into_windows(start, end, op.max_window_days)
log.info('Incremental %s: %d windows from %s to %s',
op_name, len(windows), start, end)
total_imported = 0
last_error = None
last_window_end = cursor
for ws, we in windows:
try:
r = process_one_window(client, op, ws, we)
total_imported += r['items_imported']
if not r.get('error'):
last_window_end = we
except Exception as e:
last_error = str(e)[:500]
log.exception('Incremental window failed: %s [%s, %s)', op_name, ws, we)
with db.connection() as conn:
cur = conn.cursor()
db.update_sync_state(
cur, op_name,
cursor_date=last_window_end,
window_start=start,
window_end=end,
items_added=total_imported,
error=last_error,
)
return {'op': op_name, 'items_imported': total_imported,
'cursor_advanced_to': last_window_end, 'error': last_error}
+191
View File
@@ -0,0 +1,191 @@
"""XML extraction helpers for WSP parsers — namespace-agnostic via local-name match."""
from __future__ import annotations
from datetime import datetime
from decimal import Decimal, InvalidOperation
from typing import Any, Optional
from lxml import etree
def find_local(el, local_name: str):
"""Find first descendant whose local name matches (any namespace)."""
if el is None:
return None
for d in el.iter():
if etree.QName(d.tag).localname == local_name:
return d
return None
def find_child_local(el, local_name: str):
"""Find direct child by local name."""
if el is None:
return None
for c in el:
if etree.QName(c.tag).localname == local_name:
return c
return None
def find_path(el, *local_names: str):
"""Walk a path of local names: find_path(el, 'General', 'NoticeNo')."""
cur = el
for name in local_names:
cur = find_child_local(cur, name)
if cur is None:
return None
return cur
def text(el, *local_names) -> Optional[str]:
"""Get text under a (possibly nested) path of local names."""
if local_names:
node = find_path(el, *local_names)
else:
node = el
if node is None or node.text is None:
return None
txt = node.text.strip()
return txt if txt else None
def text_under(el, local_name: str) -> Optional[str]:
"""Find first descendant by local name, return its text."""
n = find_local(el, local_name)
if n is None or n.text is None:
return None
s = n.text.strip()
return s if s else None
def text_direct(el, local_name: str) -> Optional[str]:
"""Get text of a DIRECT child by local name only (not descendants).
Use this for EntityInformation.Name/Address/Email/Phone where SysItem-style
children (City, Country, NutsCode) also contain a <Name> that would shadow.
"""
n = find_child_local(el, local_name)
if n is None or n.text is None:
return None
s = n.text.strip()
return s if s else None
def int_under(el, local_name: str) -> Optional[int]:
s = text_under(el, local_name)
if s is None:
return None
try:
return int(s)
except ValueError:
return None
def decimal_under(el, local_name: str) -> Optional[Decimal]:
s = text_under(el, local_name)
if s is None:
return None
try:
return Decimal(s)
except (InvalidOperation, ValueError):
return None
def bool_under(el, local_name: str) -> Optional[bool]:
s = text_under(el, local_name)
if s is None:
return None
return s.lower() == 'true'
def datetime_under(el, local_name: str) -> Optional[datetime]:
"""Parse ISO datetime. Handles WCF <DateTime>...</DateTime> wrapper too."""
n = find_local(el, local_name)
if n is None:
return None
# Direct text?
if n.text:
try:
return _parse_iso(n.text.strip())
except (ValueError, AttributeError):
pass
# WCF wrapper: <Field><c:DateTime>...</c:DateTime></Field>
inner = find_local(n, 'DateTime')
if inner is not None and inner.text:
try:
return _parse_iso(inner.text.strip())
except ValueError:
return None
return None
def _parse_iso(s: str) -> datetime:
"""Parse various ISO 8601 forms returned by SEAP."""
s = s.replace('Z', '+00:00')
return datetime.fromisoformat(s)
def sysitem_under(el, local_name: str) -> Optional[dict]:
"""Extract a SysItem-style {Id, Name} structure."""
n = find_local(el, local_name)
if n is None:
return None
id_el = find_local(n, 'Id')
name_el = find_local(n, 'Name')
out = {}
if id_el is not None and id_el.text:
try:
out['id'] = int(id_el.text)
except ValueError:
out['id'] = id_el.text
if name_el is not None and name_el.text:
out['name'] = name_el.text.strip()
return out or None
def sysitem_name(el, local_name: str) -> Optional[str]:
item = sysitem_under(el, local_name)
return item.get('name') if item else None
def sysitem_id(el, local_name: str) -> Optional[int]:
item = sysitem_under(el, local_name)
if not item:
return None
val = item.get('id')
return val if isinstance(val, int) else None
def to_jsonable(el) -> dict:
"""Convert an element subtree into a nested dict for JSONB storage.
Strips namespaces, keeps local names. Drops nil elements.
Mixed content with multiple same-name children becomes a list.
"""
if el is None:
return {}
is_nil = el.get('{http://www.w3.org/2001/XMLSchema-instance}nil') == 'true'
if is_nil:
return None
children = list(el)
if not children:
# leaf
if el.text and el.text.strip():
return el.text.strip()
return None
out: dict = {}
for c in children:
key = etree.QName(c.tag).localname
val = to_jsonable(c)
if val is None:
continue
if key in out:
if not isinstance(out[key], list):
out[key] = [out[key]]
out[key].append(val)
else:
out[key] = val
return out