initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
FROM python:3.12-slim
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
curl ca-certificates && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt /app/
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY wsp/ /app/wsp/
|
||||
COPY credentials/50076FB3826FADA540ACFB19.p12 /app/credentials/
|
||||
|
||||
ENV PYTHONPATH=/app
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
# Default: run incremental for all feeds. Override with `command` in compose.
|
||||
CMD ["python", "-m", "wsp.runner", "incremental", "all"]
|
||||
@@ -0,0 +1,102 @@
|
||||
"""
|
||||
In-memory mTLS certificate loader for SEAP WSP.
|
||||
|
||||
Decrypts the .p12 in memory using SEAP_CERT_KEY env var, builds an
|
||||
ssl.SSLContext, and exposes it via a requests.HTTPAdapter. The private
|
||||
key never touches disk.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import ssl
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from cryptography.hazmat.primitives import serialization
|
||||
from cryptography.hazmat.primitives.serialization import pkcs12
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.poolmanager import PoolManager
|
||||
|
||||
|
||||
def load_p12_to_ssl_context(p12_path: str | Path, password: str) -> ssl.SSLContext:
|
||||
"""Load PKCS#12 in memory and return an SSLContext with client cert + key.
|
||||
|
||||
The PEMs are written to a temp file briefly because OpenSSL/SSLContext
|
||||
requires a file path. We use mkstemp on a tmpfs-backed dir when
|
||||
available (/dev/shm on Linux) and unlink immediately after load —
|
||||
so the data lives only as an ephemeral inode held by SSLContext.
|
||||
"""
|
||||
p12_data = Path(p12_path).read_bytes()
|
||||
private_key, cert, additional = pkcs12.load_key_and_certificates(
|
||||
p12_data, password.encode()
|
||||
)
|
||||
if private_key is None or cert is None:
|
||||
raise RuntimeError('p12 missing private key or certificate')
|
||||
|
||||
key_pem = private_key.private_bytes(
|
||||
encoding=serialization.Encoding.PEM,
|
||||
format=serialization.PrivateFormat.PKCS8,
|
||||
encryption_algorithm=serialization.NoEncryption(),
|
||||
)
|
||||
cert_pem = cert.public_bytes(serialization.Encoding.PEM)
|
||||
if additional:
|
||||
for ca in additional:
|
||||
cert_pem += ca.public_bytes(serialization.Encoding.PEM)
|
||||
|
||||
# Use /dev/shm if available (RAM-backed); fallback to /tmp
|
||||
shm = Path('/dev/shm')
|
||||
tmp_dir = str(shm) if shm.is_dir() and os.access(shm, os.W_OK) else None
|
||||
|
||||
fd, path = tempfile.mkstemp(prefix='wsp_chain_', suffix='.pem', dir=tmp_dir)
|
||||
try:
|
||||
os.write(fd, cert_pem + b'\n' + key_pem)
|
||||
os.close(fd)
|
||||
os.chmod(path, 0o600)
|
||||
|
||||
ctx = ssl.create_default_context()
|
||||
ctx.load_cert_chain(certfile=path)
|
||||
return ctx
|
||||
finally:
|
||||
# Unlink immediately — kernel keeps the file alive until SSLContext closes
|
||||
try:
|
||||
os.unlink(path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
class _SSLContextAdapter(HTTPAdapter):
|
||||
"""Requests adapter that uses a pre-built SSLContext (with client cert)."""
|
||||
|
||||
def __init__(self, ssl_context: ssl.SSLContext, **kwargs):
|
||||
self._ssl_context = ssl_context
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def init_poolmanager(self, *args, **kwargs):
|
||||
kwargs['ssl_context'] = self._ssl_context
|
||||
return super().init_poolmanager(*args, **kwargs)
|
||||
|
||||
def proxy_manager_for(self, *args, **kwargs):
|
||||
kwargs['ssl_context'] = self._ssl_context
|
||||
return super().proxy_manager_for(*args, **kwargs)
|
||||
|
||||
|
||||
def make_mtls_session(p12_path: str | Path, password: str) -> requests.Session:
|
||||
"""Build a requests.Session with mTLS via in-memory cert."""
|
||||
ctx = load_p12_to_ssl_context(p12_path, password)
|
||||
session = requests.Session()
|
||||
adapter = _SSLContextAdapter(ssl_context=ctx)
|
||||
session.mount('https://', adapter)
|
||||
return session
|
||||
|
||||
|
||||
def make_mtls_session_from_env() -> requests.Session:
|
||||
"""Convenience: read p12 path + password from env."""
|
||||
p12 = os.environ.get(
|
||||
'SEAP_P12_PATH',
|
||||
str(Path(__file__).parent.parent / 'credentials' / '50076FB3826FADA540ACFB19.p12'),
|
||||
)
|
||||
pwd = os.environ.get('SEAP_CERT_KEY')
|
||||
if not pwd:
|
||||
raise RuntimeError('SEAP_CERT_KEY not in env')
|
||||
return make_mtls_session(p12, pwd)
|
||||
@@ -0,0 +1,224 @@
|
||||
"""
|
||||
WSP SOAP client — envelope construction + HTTPS+mTLS POST + response parsing.
|
||||
|
||||
Critical, non-negotiable rules (validated empirically against e-licitatie.ro:8883):
|
||||
1. Endpoint URL is case-sensitive: /Pub (capital P).
|
||||
2. Field children of <tem:request> MUST be in alphabetic order (WCF DataContract).
|
||||
3. SeapUserCredentials xmlns="http://tempuri.org" (NO trailing slash).
|
||||
4. SOAPAction header must be quoted: "http://tempuri.org/{contract}/{op}".
|
||||
5. None values must be encoded as <sic:Field i:nil="true"/>, not omitted —
|
||||
omitting an alphabetic-order intermediate field causes WCF to silently
|
||||
null-out subsequent fields (validation cascades).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests
|
||||
from lxml import etree
|
||||
|
||||
from .operations import WspOp, PROD_URL, NS_INTEG
|
||||
|
||||
NS_TEM = 'http://tempuri.org' # NOTE: no trailing slash for SeapUserCredentials
|
||||
NS_TEM_BODY = 'http://tempuri.org/' # WITH slash for tem:Op body
|
||||
NS_XSI = 'http://www.w3.org/2001/XMLSchema-instance'
|
||||
NS_SOAP = 'http://schemas.xmlsoap.org/soap/envelope/'
|
||||
|
||||
# Response namespaces for status extraction
|
||||
NS_INTEG_BRACED = '{%s}' % NS_INTEG
|
||||
|
||||
|
||||
def _envelope(op: WspOp, fields: dict[str, Any], username: str, password: str) -> bytes:
|
||||
"""Build a WCF-compatible SOAP envelope.
|
||||
|
||||
Children of <tem:request> are emitted in sorted (alphabetic) order.
|
||||
None values become <sic:F i:nil="true"/>. Other values are str-converted.
|
||||
"""
|
||||
parts: list[str] = []
|
||||
for name in sorted(fields.keys()):
|
||||
val = fields[name]
|
||||
if val is None:
|
||||
parts.append(f' <sic:{name} i:nil="true"/>')
|
||||
else:
|
||||
parts.append(f' <sic:{name}>{_xml_escape(str(val))}</sic:{name}>')
|
||||
|
||||
body = f'''<soapenv:Envelope xmlns:soapenv="{NS_SOAP}" xmlns:tem="{NS_TEM_BODY}" xmlns:sic="{op.request_ns}" xmlns:i="{NS_XSI}">
|
||||
<soapenv:Header>
|
||||
<SeapUserCredentials xmlns="{NS_TEM}" xmlns:i="{NS_XSI}">
|
||||
<Password xmlns="{NS_INTEG}">{_xml_escape(password)}</Password>
|
||||
<Username xmlns="{NS_INTEG}">{_xml_escape(username)}</Username>
|
||||
</SeapUserCredentials>
|
||||
</soapenv:Header>
|
||||
<soapenv:Body>
|
||||
<tem:{op.name}>
|
||||
<tem:request>
|
||||
{chr(10).join(parts)}
|
||||
</tem:request>
|
||||
</tem:{op.name}>
|
||||
</soapenv:Body>
|
||||
</soapenv:Envelope>'''
|
||||
return body.encode('utf-8')
|
||||
|
||||
|
||||
def _xml_escape(s: str) -> str:
|
||||
return (s.replace('&', '&').replace('<', '<')
|
||||
.replace('>', '>').replace('"', '"')
|
||||
.replace("'", '''))
|
||||
|
||||
|
||||
@dataclass
|
||||
class WspResult:
|
||||
status: str # 'Success' | 'ValidationError' | 'SystemError' | 'AuthFailed' | etc
|
||||
description: Optional[str]
|
||||
page_index: int
|
||||
page_total: int # total ITEM count (not page count). PageSize=100, so pages = ceil(page_total/100)
|
||||
items_xml: bytes # raw <a:Items>...</a:Items> bytes for parser
|
||||
raw_envelope: bytes # full response (for debugging / saving)
|
||||
elapsed_ms: int
|
||||
items: Optional[list] = None # populated by paginator after parser runs
|
||||
|
||||
@property
|
||||
def success(self) -> bool:
|
||||
return self.status == 'Success'
|
||||
|
||||
page_size: int = 50 # confirmed empirically
|
||||
|
||||
@property
|
||||
def num_pages(self) -> int:
|
||||
ps = self.page_size or 50
|
||||
return (self.page_total + ps - 1) // ps if self.page_total > 0 else 0
|
||||
|
||||
|
||||
class WspClient:
|
||||
"""Thread-safe SOAP client for SEAP WSP. Reuses a single HTTPS session."""
|
||||
|
||||
def __init__(self, session: requests.Session, username: str, password: str,
|
||||
endpoint: str = PROD_URL):
|
||||
self.session = session
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.endpoint = endpoint
|
||||
|
||||
def call(self, op: WspOp, fields: dict[str, Any], timeout: int = 120,
|
||||
max_retries: int = 3) -> WspResult:
|
||||
"""Execute a SOAP call with retry on transient errors."""
|
||||
body = _envelope(op, fields, self.username, self.password)
|
||||
headers = {
|
||||
'Content-Type': 'text/xml; charset=utf-8',
|
||||
'SOAPAction': f'"{op.soap_action}"',
|
||||
}
|
||||
last_exc: Optional[Exception] = None
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
t0 = time.time()
|
||||
resp = self.session.post(self.endpoint, data=body,
|
||||
headers=headers, timeout=timeout)
|
||||
elapsed_ms = int((time.time() - t0) * 1000)
|
||||
if resp.status_code in (429, 500, 502, 503, 504):
|
||||
# 500 includes WCF ActionNotSupported faults — but those
|
||||
# are deterministic, so don't retry. Differentiate via body.
|
||||
if b'<faultcode' in resp.content:
|
||||
# SOAP fault — return as parsed error, no retry
|
||||
return self._parse_fault(resp.content, elapsed_ms)
|
||||
# transient — retry with backoff
|
||||
time.sleep(2 ** attempt)
|
||||
continue
|
||||
resp.raise_for_status()
|
||||
return self._parse_response(resp.content, elapsed_ms)
|
||||
except (requests.Timeout, requests.ConnectionError) as e:
|
||||
last_exc = e
|
||||
time.sleep(2 ** attempt)
|
||||
raise RuntimeError(f'Exhausted retries on {op.name}: {last_exc}')
|
||||
|
||||
def _parse_response(self, content: bytes, elapsed_ms: int) -> WspResult:
|
||||
"""Extract Status, Description, PageIndex, PageTotal, Items.
|
||||
|
||||
SEAP returns MTOM/XOP multipart sometimes — strip wrapping if present.
|
||||
"""
|
||||
# Strip MTOM multipart wrapper if present
|
||||
if content[:4] == b'\r\n--' or content[:2] == b'--':
|
||||
# Multipart — find the XML part
|
||||
idx = content.find(b'<s:Envelope')
|
||||
if idx == -1:
|
||||
idx = content.find(b'<?xml')
|
||||
if idx != -1:
|
||||
end = content.find(b'</s:Envelope>')
|
||||
if end != -1:
|
||||
content = content[idx:end + len(b'</s:Envelope>')]
|
||||
else:
|
||||
# Sometimes the multipart starts with --uuid... mid-stream
|
||||
idx = content.find(b'<s:Envelope')
|
||||
if idx > 0:
|
||||
content = content[idx:]
|
||||
end = content.find(b'</s:Envelope>')
|
||||
if end != -1:
|
||||
content = content[:end + len(b'</s:Envelope>')]
|
||||
|
||||
try:
|
||||
root = etree.fromstring(content)
|
||||
except etree.XMLSyntaxError as e:
|
||||
return WspResult(
|
||||
status='ParseError', description=str(e),
|
||||
page_index=0, page_total=0,
|
||||
items_xml=b'', raw_envelope=content, elapsed_ms=elapsed_ms,
|
||||
)
|
||||
|
||||
status_el = root.find(f'.//{NS_INTEG_BRACED}Status')
|
||||
desc_el = root.find(f'.//{NS_INTEG_BRACED}Description')
|
||||
# PageIndex/PageTotal live in a:* namespace (varies per response type)
|
||||
page_index = 0
|
||||
page_total = 0
|
||||
for el in root.iter():
|
||||
tag = etree.QName(el.tag).localname
|
||||
if tag == 'PageIndex' and el.text and el.text.isdigit():
|
||||
page_index = int(el.text)
|
||||
elif tag == 'PageTotal' and el.text and el.text.isdigit():
|
||||
page_total = int(el.text)
|
||||
|
||||
# Find Items element — varies by namespace
|
||||
items_el = None
|
||||
for el in root.iter():
|
||||
if etree.QName(el.tag).localname == 'Items':
|
||||
items_el = el
|
||||
break
|
||||
items_xml = etree.tostring(items_el) if items_el is not None else b''
|
||||
|
||||
return WspResult(
|
||||
status=status_el.text if status_el is not None else 'UNKNOWN',
|
||||
description=desc_el.text if desc_el is not None else None,
|
||||
page_index=page_index,
|
||||
page_total=page_total,
|
||||
items_xml=items_xml,
|
||||
raw_envelope=content,
|
||||
elapsed_ms=elapsed_ms,
|
||||
)
|
||||
|
||||
def _parse_fault(self, content: bytes, elapsed_ms: int) -> WspResult:
|
||||
try:
|
||||
root = etree.fromstring(content)
|
||||
faultcode = root.find('.//{*}faultcode')
|
||||
faultstring = root.find('.//{*}faultstring')
|
||||
status = f'Fault:{faultcode.text}' if faultcode is not None else 'Fault'
|
||||
desc = faultstring.text if faultstring is not None else None
|
||||
except Exception:
|
||||
status = 'Fault:Parse'
|
||||
desc = content[:200].decode('utf-8', errors='replace')
|
||||
return WspResult(
|
||||
status=status, description=desc,
|
||||
page_index=0, page_total=0,
|
||||
items_xml=b'', raw_envelope=content, elapsed_ms=elapsed_ms,
|
||||
)
|
||||
|
||||
|
||||
def make_client_from_env(endpoint: str = PROD_URL) -> WspClient:
|
||||
"""Build a WspClient using SEAP_USER/SEAP_PASS/SEAP_CERT_KEY env vars."""
|
||||
from .cert_loader import make_mtls_session_from_env
|
||||
user = os.environ.get('SEAP_USER')
|
||||
pwd = os.environ.get('SEAP_PASS')
|
||||
if not user or not pwd:
|
||||
raise RuntimeError('SEAP_USER / SEAP_PASS not in env')
|
||||
session = make_mtls_session_from_env()
|
||||
return WspClient(session, user, pwd, endpoint=endpoint)
|
||||
Executable
+47
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# Daily WSP incremental sync — runs on satra via systemd timer or cron.
|
||||
#
|
||||
# Loads DATABASE_URL from /opt/architools/.env and SEAP secrets from
|
||||
# Infisical. Runs incremental for all configured public + Beletage feeds.
|
||||
# Logs go to /var/log/wsp-sync/ (rotated by logrotate).
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
LOG_DIR=/var/log/wsp-sync
|
||||
LOG_FILE="$LOG_DIR/wsp-incremental-$(date +%Y%m%d).log"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
cd /opt/vreaudigital/services/seap-scraper
|
||||
|
||||
# Load DB URL (architools_user — same DB as TED imports)
|
||||
set -a
|
||||
. /opt/architools/.env
|
||||
set +a
|
||||
# Strip ?schema= param (psycopg2 doesn't accept it)
|
||||
export DATABASE_URL="${DATABASE_URL%%\?*}"
|
||||
|
||||
# Load SEAP secrets from Infisical (using the same loader as claude-start.sh)
|
||||
if [ -f /opt/wsp/.env.seap ]; then
|
||||
# Production: secrets pre-loaded into a static file readable by service account
|
||||
set -a
|
||||
. /opt/wsp/.env.seap
|
||||
set +a
|
||||
else
|
||||
echo "ERROR: /opt/wsp/.env.seap not found — run wsp-secrets-fetch.sh first" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Run incremental for each feed in sequence (low-volume daily = ~5 min total)
|
||||
for op in SU_CaNotices SU_CNotices SU_PiNotices SU_RfqNotices SU_RfqInvitations \
|
||||
SU_DCNotices SU_PCNotices SU_RdcNotices SU_ENotices \
|
||||
SuContracts SuInvoices SuDirectAcquisitions; do
|
||||
echo "=== $(date -Iseconds) — $op ===" | tee -a "$LOG_FILE"
|
||||
./.venv/bin/python -m wsp.runner incremental "$op" --lookback-hours 36 \
|
||||
>> "$LOG_FILE" 2>&1 || echo " (failed, continuing)" | tee -a "$LOG_FILE"
|
||||
done
|
||||
|
||||
echo "=== $(date -Iseconds) — Done ===" | tee -a "$LOG_FILE"
|
||||
|
||||
# Show status summary at the end (also captured by journalctl if systemd-run)
|
||||
./.venv/bin/python -m wsp.runner status 2>&1 | tail -30 | tee -a "$LOG_FILE"
|
||||
@@ -0,0 +1,336 @@
|
||||
"""DB sinks — idempotent UPSERT for each parsed row type."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any, Iterable
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import Json, execute_values
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_db_url() -> str:
|
||||
url = os.environ.get('DATABASE_URL')
|
||||
if not url:
|
||||
raise RuntimeError('DATABASE_URL not set in env')
|
||||
return url
|
||||
|
||||
|
||||
def connect():
|
||||
return psycopg2.connect(get_db_url())
|
||||
|
||||
|
||||
@contextmanager
|
||||
def connection():
|
||||
conn = connect()
|
||||
try:
|
||||
yield conn
|
||||
conn.commit()
|
||||
except Exception:
|
||||
conn.rollback()
|
||||
raise
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _json_safe(obj):
|
||||
"""Convert Decimal/datetime/date to JSON-friendly types recursively."""
|
||||
if isinstance(obj, Decimal):
|
||||
return str(obj)
|
||||
if isinstance(obj, datetime):
|
||||
return obj.isoformat()
|
||||
if isinstance(obj, dict):
|
||||
return {k: _json_safe(v) for k, v in obj.items()}
|
||||
if isinstance(obj, list):
|
||||
return [_json_safe(v) for v in obj]
|
||||
return obj
|
||||
|
||||
|
||||
# ── seap.announcements (public feeds) ──
|
||||
|
||||
ANNOUNCEMENTS_COLS = [
|
||||
'type', 'ref_number',
|
||||
'authority_name', 'authority_cui', 'authority_address',
|
||||
'authority_email', 'authority_phone', 'authority_url',
|
||||
'authority_type', 'authority_main_activity', 'authority_entity_id',
|
||||
'title', 'cpv_code', 'contract_type',
|
||||
'publication_date', 'estimated_value', 'awarded_value', 'currency',
|
||||
'supplier_name', 'supplier_cui', 'supplier_address', 'supplier_is_sme',
|
||||
'procedure_type', 'procedure_state', 'legislation',
|
||||
'has_lots', 'contract_has_lots', 'lots_count',
|
||||
'joue', 'county_code', 'notice_state', 'notice_state_id',
|
||||
'framework_agreement', 'notice_id_internal',
|
||||
'deadline_submission', 'opening_date', 'duration_months',
|
||||
'documents', 'award_criteria', 'lots', 'details',
|
||||
'seap_url', 'source',
|
||||
]
|
||||
_JSONB_COLS = {'documents', 'award_criteria', 'lots', 'details'}
|
||||
|
||||
|
||||
def insert_announcements(cur, rows: Iterable[dict]) -> int:
|
||||
"""Bulk upsert into seap.announcements. Returns count inserted/updated."""
|
||||
rows = list(rows)
|
||||
if not rows:
|
||||
return 0
|
||||
values = []
|
||||
for r in rows:
|
||||
values.append(tuple(
|
||||
Json(_json_safe(r.get(c))) if c in _JSONB_COLS and r.get(c) is not None
|
||||
else r.get(c)
|
||||
for c in ANNOUNCEMENTS_COLS
|
||||
))
|
||||
cols_sql = ', '.join(ANNOUNCEMENTS_COLS)
|
||||
update_cols = [c for c in ANNOUNCEMENTS_COLS if c not in ('type', 'ref_number')]
|
||||
update_sql = ', '.join(
|
||||
f'{c} = COALESCE(EXCLUDED.{c}, seap.announcements.{c})' if c not in _JSONB_COLS
|
||||
else f'{c} = EXCLUDED.{c}'
|
||||
for c in update_cols
|
||||
)
|
||||
sql = f"""
|
||||
INSERT INTO seap.announcements ({cols_sql})
|
||||
VALUES %s
|
||||
ON CONFLICT (type, ref_number) DO UPDATE SET
|
||||
{update_sql},
|
||||
enriched_at = now()
|
||||
"""
|
||||
execute_values(cur, sql, values, page_size=100)
|
||||
return len(rows)
|
||||
|
||||
|
||||
# ── Beletage tables ──
|
||||
|
||||
BELETAGE_CONTRACTS_COLS = [
|
||||
'contract_id', 'contract_no', 'contract_title', 'contract_type',
|
||||
'contract_phase', 'contract_state',
|
||||
'awarding_date', 'contract_date', 'publication_date',
|
||||
'duration_months', 'contract_value', 'default_currency_value', 'currency',
|
||||
'ca_notice_id', 'ca_notice_no', 'authority_name', 'authority_cui',
|
||||
'is_current_version', 'is_rejected', 'version_no', 'version_date',
|
||||
'justification', 'additional_information', 'details',
|
||||
]
|
||||
|
||||
|
||||
def insert_beletage_contracts(cur, rows):
|
||||
rows = list(rows)
|
||||
if not rows:
|
||||
return 0
|
||||
values = [
|
||||
tuple(
|
||||
Json(_json_safe(r.get(c))) if c == 'details' and r.get(c) is not None
|
||||
else r.get(c)
|
||||
for c in BELETAGE_CONTRACTS_COLS
|
||||
)
|
||||
for r in rows
|
||||
]
|
||||
cols_sql = ', '.join(BELETAGE_CONTRACTS_COLS)
|
||||
update_cols = [c for c in BELETAGE_CONTRACTS_COLS if c != 'contract_id']
|
||||
update_sql = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
|
||||
sql = f"""
|
||||
INSERT INTO seap.beletage_contracts ({cols_sql})
|
||||
VALUES %s
|
||||
ON CONFLICT (contract_id) DO UPDATE SET
|
||||
{update_sql},
|
||||
enriched_at = now()
|
||||
"""
|
||||
execute_values(cur, sql, values, page_size=100)
|
||||
return len(rows)
|
||||
|
||||
|
||||
BELETAGE_INVOICES_COLS = [
|
||||
'invoice_id', 'invoice_no', 'invoice_date', 'due_date',
|
||||
'contract_id', 'contract_no', 'authority_name', 'authority_cui',
|
||||
'total_value', 'total_value_no_vat', 'vat_value', 'currency',
|
||||
'state', 'paid_value', 'paid_at', 'details',
|
||||
]
|
||||
|
||||
|
||||
def insert_beletage_invoices(cur, rows):
|
||||
rows = list(rows)
|
||||
if not rows:
|
||||
return 0
|
||||
values = [
|
||||
tuple(
|
||||
Json(_json_safe(r.get(c))) if c == 'details' and r.get(c) is not None
|
||||
else r.get(c)
|
||||
for c in BELETAGE_INVOICES_COLS
|
||||
)
|
||||
for r in rows
|
||||
]
|
||||
cols_sql = ', '.join(BELETAGE_INVOICES_COLS)
|
||||
update_cols = [c for c in BELETAGE_INVOICES_COLS if c != 'invoice_id']
|
||||
update_sql = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
|
||||
sql = f"""
|
||||
INSERT INTO seap.beletage_invoices ({cols_sql})
|
||||
VALUES %s
|
||||
ON CONFLICT (invoice_id) DO UPDATE SET
|
||||
{update_sql}
|
||||
"""
|
||||
execute_values(cur, sql, values, page_size=100)
|
||||
return len(rows)
|
||||
|
||||
|
||||
BELETAGE_DA_COLS = [
|
||||
'da_id', 'da_name', 'unique_identification_code',
|
||||
'cpv_code', 'cpv_name', 'contract_type',
|
||||
'publication_date', 'finalization_date',
|
||||
'estimated_value', 'closing_value', 'currency',
|
||||
'da_state', 'authority_id', 'authority_name', 'authority_cui',
|
||||
'details',
|
||||
]
|
||||
|
||||
|
||||
def insert_beletage_da(cur, rows):
|
||||
rows = list(rows)
|
||||
if not rows:
|
||||
return 0
|
||||
values = [
|
||||
tuple(
|
||||
Json(_json_safe(r.get(c))) if c == 'details' and r.get(c) is not None
|
||||
else r.get(c)
|
||||
for c in BELETAGE_DA_COLS
|
||||
)
|
||||
for r in rows
|
||||
]
|
||||
cols_sql = ', '.join(BELETAGE_DA_COLS)
|
||||
update_cols = [c for c in BELETAGE_DA_COLS if c != 'da_id']
|
||||
update_sql = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
|
||||
sql = f"""
|
||||
INSERT INTO seap.beletage_direct_acquisitions ({cols_sql})
|
||||
VALUES %s
|
||||
ON CONFLICT (da_id) DO UPDATE SET
|
||||
{update_sql}
|
||||
"""
|
||||
execute_values(cur, sql, values, page_size=100)
|
||||
return len(rows)
|
||||
|
||||
|
||||
BELETAGE_CATALOG_COLS = ['item_code', 'item_name', 'cpv_code', 'unit_price',
|
||||
'currency', 'last_updated', 'details']
|
||||
|
||||
|
||||
def insert_beletage_catalog(cur, rows):
|
||||
rows = list(rows)
|
||||
if not rows:
|
||||
return 0
|
||||
values = [
|
||||
tuple(
|
||||
Json(_json_safe(r.get(c))) if c == 'details' and r.get(c) is not None
|
||||
else r.get(c)
|
||||
for c in BELETAGE_CATALOG_COLS
|
||||
)
|
||||
for r in rows
|
||||
]
|
||||
cols_sql = ', '.join(BELETAGE_CATALOG_COLS)
|
||||
update_cols = [c for c in BELETAGE_CATALOG_COLS if c != 'item_code']
|
||||
update_sql = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
|
||||
sql = f"""
|
||||
INSERT INTO seap.beletage_catalog ({cols_sql})
|
||||
VALUES %s
|
||||
ON CONFLICT (item_code) DO UPDATE SET {update_sql}
|
||||
"""
|
||||
execute_values(cur, sql, values, page_size=100)
|
||||
return len(rows)
|
||||
|
||||
|
||||
# Sink dispatch
|
||||
SINK_FUNCS = {
|
||||
'announcements': insert_announcements,
|
||||
'beletage_contracts': insert_beletage_contracts,
|
||||
'beletage_invoices': insert_beletage_invoices,
|
||||
'beletage_da': insert_beletage_da,
|
||||
'beletage_catalog': insert_beletage_catalog,
|
||||
}
|
||||
|
||||
|
||||
# ── Sync state ──
|
||||
|
||||
def get_cursor(cur, feed: str) -> dict | None:
|
||||
cur.execute('SELECT last_cursor_date, items_imported_total, consecutive_errors '
|
||||
'FROM seap.wsp_sync_state WHERE feed = %s', (feed,))
|
||||
row = cur.fetchone()
|
||||
if not row:
|
||||
return None
|
||||
return {
|
||||
'last_cursor_date': row[0],
|
||||
'items_imported_total': row[1],
|
||||
'consecutive_errors': row[2],
|
||||
}
|
||||
|
||||
|
||||
def update_sync_state(cur, feed: str, *, cursor_date=None, window_start=None,
|
||||
window_end=None, items_added=0, error: str | None = None):
|
||||
cur.execute("""
|
||||
INSERT INTO seap.wsp_sync_state (feed, last_run_at, last_cursor_date,
|
||||
last_window_start, last_window_end, items_imported_total,
|
||||
items_imported_24h, consecutive_errors, last_error, last_error_at)
|
||||
VALUES (%s, now(), %s, %s, %s, %s, %s,
|
||||
CASE WHEN %s IS NULL THEN 0 ELSE 1 END,
|
||||
%s,
|
||||
CASE WHEN %s IS NULL THEN NULL ELSE now() END)
|
||||
ON CONFLICT (feed) DO UPDATE SET
|
||||
last_run_at = now(),
|
||||
last_cursor_date = COALESCE(EXCLUDED.last_cursor_date, seap.wsp_sync_state.last_cursor_date),
|
||||
last_window_start = EXCLUDED.last_window_start,
|
||||
last_window_end = EXCLUDED.last_window_end,
|
||||
items_imported_total = seap.wsp_sync_state.items_imported_total + %s,
|
||||
items_imported_24h = %s,
|
||||
consecutive_errors = CASE WHEN %s IS NULL THEN 0
|
||||
ELSE seap.wsp_sync_state.consecutive_errors + 1 END,
|
||||
last_error = EXCLUDED.last_error,
|
||||
last_error_at = CASE WHEN %s IS NULL THEN seap.wsp_sync_state.last_error_at
|
||||
ELSE now() END
|
||||
""", (feed, cursor_date, window_start, window_end, items_added, items_added,
|
||||
error, error, error,
|
||||
items_added, items_added, error, error))
|
||||
|
||||
|
||||
def upsert_backfill_window(cur, feed: str, ws, we, county=None) -> int:
|
||||
cur.execute("""
|
||||
INSERT INTO seap.wsp_backfill_windows (feed, window_start, window_end, county_code)
|
||||
VALUES (%s, %s, %s, %s)
|
||||
ON CONFLICT (feed, window_start, window_end, county_code) DO NOTHING
|
||||
RETURNING id
|
||||
""", (feed, ws, we, county))
|
||||
row = cur.fetchone()
|
||||
return row[0] if row else 0
|
||||
|
||||
|
||||
def claim_backfill_window(cur, feed: str) -> dict | None:
|
||||
"""Atomically claim oldest pending window for processing."""
|
||||
cur.execute("""
|
||||
UPDATE seap.wsp_backfill_windows
|
||||
SET state = 'in_progress', started_at = now(),
|
||||
attempts = attempts + 1
|
||||
WHERE id = (
|
||||
SELECT id FROM seap.wsp_backfill_windows
|
||||
WHERE feed = %s AND state IN ('pending', 'failed') AND attempts < 5
|
||||
ORDER BY window_start
|
||||
LIMIT 1
|
||||
FOR UPDATE SKIP LOCKED
|
||||
)
|
||||
RETURNING id, window_start, window_end, county_code, attempts
|
||||
""", (feed,))
|
||||
row = cur.fetchone()
|
||||
if not row:
|
||||
return None
|
||||
return {
|
||||
'id': row[0], 'window_start': row[1], 'window_end': row[2],
|
||||
'county_code': row[3], 'attempts': row[4],
|
||||
}
|
||||
|
||||
|
||||
def complete_backfill_window(cur, win_id: int, items: int, page_total: int,
|
||||
error: str | None = None):
|
||||
state = 'failed' if error else 'completed'
|
||||
cur.execute("""
|
||||
UPDATE seap.wsp_backfill_windows
|
||||
SET state = %s, items_imported = %s, page_total = %s,
|
||||
completed_at = now(), last_error = %s
|
||||
WHERE id = %s
|
||||
""", (state, items, page_total, error, win_id))
|
||||
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Load EU CPV nomenclature (RO names) into seap.cpv_codes + backfill announcements."""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
CPV_URL = 'https://raw.githubusercontent.com/samhallskod/cpv-eu/main/data/cpv.json'
|
||||
|
||||
|
||||
def main():
|
||||
db = os.environ.get('DATABASE_URL')
|
||||
if not db:
|
||||
print('DATABASE_URL not set', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
cache = Path('/tmp/cpv.json')
|
||||
if not cache.exists():
|
||||
print(f'Downloading {CPV_URL}...')
|
||||
urllib.request.urlretrieve(CPV_URL, cache)
|
||||
data = json.loads(cache.read_text())
|
||||
print(f'Loaded {len(data)} CPV codes from JSON')
|
||||
|
||||
rows = []
|
||||
for entry in data:
|
||||
code = entry['code'] # 8-digit (no dash)
|
||||
labels = entry.get('labels', {})
|
||||
name_ro = labels.get('ro') or labels.get('en') or code
|
||||
name_en = labels.get('en')
|
||||
level = entry.get('level', 0)
|
||||
emoji = entry.get('emoji')
|
||||
|
||||
# Division = first 2 digits + 6 zeros
|
||||
division = code[:2] + '000000'
|
||||
|
||||
# Parent = level - 1 — easiest: trim trailing zero pairs based on level
|
||||
# Level mapping: 1=XX000000, 2=XXX00000, 3=XXXX0000, 4=XXXXX000, 5=XXXXXX00, 6=XXXXXXX0, 7=XXXXXXXX (full)
|
||||
# For our purposes, parent is one level up — replace last non-zero digit with 0
|
||||
parent = None
|
||||
if level > 1:
|
||||
# find rightmost non-zero, zero it out
|
||||
digits = list(code)
|
||||
# Find position of last meaningful digit
|
||||
for i in range(len(digits) - 1, -1, -1):
|
||||
if digits[i] != '0':
|
||||
digits[i] = '0'
|
||||
break
|
||||
parent_code = ''.join(digits)
|
||||
if parent_code != code:
|
||||
parent = parent_code
|
||||
|
||||
# Build full code with check digit (synthesize a placeholder dash + 0)
|
||||
# The real check digit is per-code — we don't compute it; use the 8-digit form everywhere.
|
||||
rows.append((code, code, name_ro, name_en, level, division, parent, emoji))
|
||||
|
||||
print(f'Inserting {len(rows)} rows into seap.cpv_codes...')
|
||||
conn = psycopg2.connect(db)
|
||||
cur = conn.cursor()
|
||||
cur.execute('TRUNCATE seap.cpv_codes')
|
||||
execute_values(cur, """
|
||||
INSERT INTO seap.cpv_codes (code, code_full, name_ro, name_en, level,
|
||||
division_code, parent_code, emoji)
|
||||
VALUES %s
|
||||
""", rows, page_size=500)
|
||||
conn.commit()
|
||||
cur.execute('SELECT count(*), count(*) FILTER (WHERE level=1) FROM seap.cpv_codes')
|
||||
total, divisions = cur.fetchone()
|
||||
print(f' → {total} codes loaded, {divisions} top-level divisions')
|
||||
|
||||
# Backfill cpv_division + cpv_name_ro on announcements
|
||||
print('Backfilling cpv_division + cpv_name_ro on announcements...')
|
||||
cur.execute("""
|
||||
UPDATE seap.announcements
|
||||
SET cpv_division = seap.cpv_division(cpv_code),
|
||||
cpv_name_ro = seap.cpv_name(cpv_code)
|
||||
WHERE cpv_code IS NOT NULL
|
||||
AND (cpv_division IS NULL OR cpv_name_ro IS NULL)
|
||||
""")
|
||||
affected = cur.rowcount
|
||||
conn.commit()
|
||||
print(f' → {affected:,} announcement rows enriched')
|
||||
|
||||
# Show top divisions
|
||||
cur.execute("""
|
||||
SELECT a.cpv_division, c.name_ro, c.emoji,
|
||||
count(*) as n,
|
||||
sum(awarded_value)::numeric(15,0) as total_value
|
||||
FROM seap.announcements a
|
||||
LEFT JOIN seap.cpv_codes c ON c.code = a.cpv_division
|
||||
WHERE a.cpv_division IS NOT NULL AND a.source LIKE 'wsp_%'
|
||||
GROUP BY 1, 2, 3
|
||||
ORDER BY 4 DESC
|
||||
LIMIT 15
|
||||
""")
|
||||
print('\nTop 15 CPV divisions in WSP data:')
|
||||
for r in cur.fetchall():
|
||||
print(f' {r[2] or " "} {r[0]:<10} {(r[1] or "?")[:45]:<45} {r[3]:>5} contracte {r[4] or 0:>13,} RON')
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,202 @@
|
||||
"""
|
||||
WSP operation registry — single source of truth for SOAP call config.
|
||||
|
||||
Each operation declares:
|
||||
- contract: SOAP contract (ISupplierWebService, IPubWebService, etc.)
|
||||
- request_ns: XML namespace of the Request type
|
||||
- date_fields: (start_field, end_field) — for window-based ops
|
||||
- max_window_days: server-imposed max window
|
||||
- extra_fields: static fields to inject (e.g. defaults)
|
||||
- response_xpath: XPath to extract Items elements from the response
|
||||
- parser: dotted module name of parser
|
||||
- sink: 'announcements' (public) or 'beletage' (own)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
|
||||
NS_INTEG = 'http://schemas.datacontract.org/2004/07/SICAP.Service.Integration'
|
||||
NS_MODEL_BASE = 'http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model'
|
||||
NS_MODEL_CONTRACTS = NS_MODEL_BASE + '.Contracts'
|
||||
|
||||
# Endpoints
|
||||
PROD_URL = 'https://e-licitatie.ro:8883/Pub'
|
||||
DEMO_URL = 'http://demo.e-licitatie.ro:8884/Pub'
|
||||
|
||||
|
||||
@dataclass
|
||||
class WspOp:
|
||||
name: str # operation name (matches SOAPAction tail)
|
||||
contract: str # SOAP contract — 'ISupplierWebService'
|
||||
request_ns: str = NS_MODEL_BASE
|
||||
date_start_field: Optional[str] = None
|
||||
date_end_field: Optional[str] = None
|
||||
max_window_days: int = 7 # server max
|
||||
page_size_inferred: int = 50 # confirmed empirically: server returns 50 items/page
|
||||
items_cap: int = 1000 # if PageTotal == 1000, window may be truncated; auto-split
|
||||
parser_module: str = '' # 'wsp.parsers.ca_notice'
|
||||
sink: str = 'announcements' # 'announcements' | 'beletage_contracts' | 'beletage_invoices' | 'beletage_da' | 'beletage_catalog'
|
||||
source_tag: str = '' # value for seap.announcements.source column
|
||||
item_xpath: str = '' # XPath under <a:Items> to find each item element
|
||||
|
||||
@property
|
||||
def soap_action(self) -> str:
|
||||
return f'http://tempuri.org/{self.contract}/{self.name}'
|
||||
|
||||
|
||||
# Public-data feeds (SU_*) — return all-Romania notices, not scoped to Beletage
|
||||
PUBLIC_OPS: dict[str, WspOp] = {
|
||||
'SU_CaNotices': WspOp(
|
||||
name='SU_CaNotices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=1, # high volume — must split per day
|
||||
parser_module='wsp.parsers.ca_notice',
|
||||
source_tag='wsp_canotice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.CANotice}CaNoticeBase',
|
||||
),
|
||||
'SU_CNotices': WspOp(
|
||||
name='SU_CNotices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=7,
|
||||
parser_module='wsp.parsers.c_notice',
|
||||
source_tag='wsp_cnotice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.CNotice}CNoticeBase',
|
||||
),
|
||||
'SU_PiNotices': WspOp(
|
||||
name='SU_PiNotices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=30,
|
||||
parser_module='wsp.parsers.pi_notice',
|
||||
source_tag='wsp_pinotice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.PINotice}PiNoticeBase',
|
||||
),
|
||||
'SU_RfqNotices': WspOp(
|
||||
name='SU_RfqNotices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=7,
|
||||
parser_module='wsp.parsers.rfq_notice',
|
||||
source_tag='wsp_rfq_notice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.RfqNotice}RfqNoticeBase',
|
||||
),
|
||||
'SU_RfqInvitations': WspOp(
|
||||
name='SU_RfqInvitations',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=7,
|
||||
parser_module='wsp.parsers.rfq_invitation',
|
||||
source_tag='wsp_rfq_invitation',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.RfqInvitation}RfqInvitationBase',
|
||||
),
|
||||
'SU_DCNotices': WspOp(
|
||||
name='SU_DCNotices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=30,
|
||||
parser_module='wsp.parsers.dc_notice',
|
||||
source_tag='wsp_dcnotice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.DCNotice}DCNoticeBase',
|
||||
),
|
||||
'SU_PCNotices': WspOp(
|
||||
name='SU_PCNotices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=30,
|
||||
parser_module='wsp.parsers.pc_notice',
|
||||
source_tag='wsp_pcnotice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.PCNotice}PCNoticeBase',
|
||||
),
|
||||
'SU_RdcNotices': WspOp(
|
||||
name='SU_RdcNotices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=30,
|
||||
parser_module='wsp.parsers.rdc_notice',
|
||||
source_tag='wsp_rdcnotice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.RDCNotice}RdcNoticeBase',
|
||||
),
|
||||
'SU_ENotices': WspOp(
|
||||
name='SU_ENotices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=30,
|
||||
parser_module='wsp.parsers.e_notice',
|
||||
source_tag='wsp_enotice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.ENotice}ENoticeBase',
|
||||
),
|
||||
'SU_EAProcedure': WspOp(
|
||||
name='SU_EAProcedure',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=30,
|
||||
parser_module='wsp.parsers.ea_procedure',
|
||||
source_tag='wsp_eaprocedure',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.EAProcedure}EAProcedureBase',
|
||||
),
|
||||
}
|
||||
|
||||
# Beletage-scoped (own data)
|
||||
OWN_OPS: dict[str, WspOp] = {
|
||||
'SuContracts': WspOp(
|
||||
name='SuContracts',
|
||||
contract='ISupplierWebService',
|
||||
request_ns=NS_MODEL_CONTRACTS,
|
||||
date_start_field='ContractStartDate',
|
||||
date_end_field='ContractEndDate',
|
||||
max_window_days=10,
|
||||
parser_module='wsp.parsers.su_contract',
|
||||
sink='beletage_contracts',
|
||||
source_tag='wsp_beletage_contract',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model.Contracts}ContractItem',
|
||||
),
|
||||
'SuInvoices': WspOp(
|
||||
name='SuInvoices',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='MinDate',
|
||||
date_end_field='MaxDate',
|
||||
max_window_days=10,
|
||||
parser_module='wsp.parsers.su_invoice',
|
||||
sink='beletage_invoices',
|
||||
source_tag='wsp_beletage_invoice',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model}InvoiceItem',
|
||||
),
|
||||
'SuDirectAcquisitions': WspOp(
|
||||
name='SuDirectAcquisitions',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='PublicationStartDate',
|
||||
date_end_field='PublicationEndDate',
|
||||
max_window_days=30,
|
||||
parser_module='wsp.parsers.su_da',
|
||||
sink='beletage_da',
|
||||
source_tag='wsp_beletage_da',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model}SUDirectAcquisition',
|
||||
),
|
||||
'Catalog_ListItems': WspOp(
|
||||
name='Catalog_ListItems',
|
||||
contract='ISupplierWebService',
|
||||
date_start_field='LastUpdateStart',
|
||||
date_end_field='LastUpdateEnd',
|
||||
max_window_days=365, # catalog is small, use wide window
|
||||
parser_module='wsp.parsers.catalog',
|
||||
sink='beletage_catalog',
|
||||
source_tag='wsp_beletage_catalog',
|
||||
item_xpath='.//{http://schemas.datacontract.org/2004/07/SICAP.Supplier.Interface.Model}CatalogListItem',
|
||||
),
|
||||
}
|
||||
|
||||
ALL_OPS: dict[str, WspOp] = {**PUBLIC_OPS, **OWN_OPS}
|
||||
@@ -0,0 +1,199 @@
|
||||
"""
|
||||
Window-aware pagination + auto-split.
|
||||
|
||||
Constraints discovered empirically:
|
||||
- PageSize is server-fixed at 100 items per response.
|
||||
- Server caps results at PageTotal == 1000 — when reached, the window
|
||||
truncated and we lose data. Auto-split window in halves until PageTotal < 1000.
|
||||
- Server silently returns PageTotal=0 for windows that are too wide
|
||||
(e.g. >7 days for SU_CaNotices) — also triggers auto-split.
|
||||
|
||||
Yields parsed item dicts as they arrive; caller is responsible for batching
|
||||
DB writes. Cursor advancement (last successful publication_date) is tracked
|
||||
externally in seap.wsp_sync_state.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Iterator
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from .client import WspClient
|
||||
from .operations import WspOp
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class WindowResult:
|
||||
op_name: str
|
||||
window_start: datetime
|
||||
window_end: datetime
|
||||
items_imported: int = 0
|
||||
pages_fetched: int = 0
|
||||
page_total_first: int = 0 # PageTotal returned on page 1
|
||||
sub_windows: list['WindowResult'] = field(default_factory=list) # if split
|
||||
errors: list[str] = field(default_factory=list)
|
||||
skipped: bool = False # set if window was outside cursor range
|
||||
|
||||
@property
|
||||
def all_items_imported(self) -> int:
|
||||
return self.items_imported + sum(s.all_items_imported for s in self.sub_windows)
|
||||
|
||||
|
||||
def _load_parser(op: WspOp):
|
||||
"""Dynamically import the parser for an operation."""
|
||||
if not op.parser_module:
|
||||
return None
|
||||
try:
|
||||
mod = importlib.import_module(op.parser_module)
|
||||
return mod
|
||||
except ModuleNotFoundError:
|
||||
log.warning('Parser module %s not found — items will be skipped', op.parser_module)
|
||||
return None
|
||||
|
||||
|
||||
def _iter_items_xml(items_xml: bytes, op: WspOp):
|
||||
"""Iterate item elements out of <a:Items>."""
|
||||
if not items_xml:
|
||||
return
|
||||
try:
|
||||
items_el = etree.fromstring(items_xml)
|
||||
except etree.XMLSyntaxError as e:
|
||||
log.error('Cannot parse items XML for %s: %s', op.name, e)
|
||||
return
|
||||
for el in items_el.iter():
|
||||
# Match the configured item tag (or fallback: any direct child)
|
||||
if op.item_xpath:
|
||||
qname = op.item_xpath.rsplit('}', 1)[-1]
|
||||
if etree.QName(el.tag).localname == qname:
|
||||
yield el
|
||||
else:
|
||||
# fallback: yield all children of Items
|
||||
if el.getparent() is items_el:
|
||||
yield el
|
||||
|
||||
|
||||
def fetch_window(
|
||||
client: WspClient,
|
||||
op: WspOp,
|
||||
window_start: datetime,
|
||||
window_end: datetime,
|
||||
extra_fields: dict | None = None,
|
||||
on_item: callable = None,
|
||||
) -> WindowResult:
|
||||
"""Fetch all pages for a single (start, end) window.
|
||||
|
||||
Auto-splits if PageTotal == op.items_cap (server-side cap reached).
|
||||
Calls `on_item(parsed_dict, raw_xml_element)` for each item parsed.
|
||||
Returns WindowResult with stats; sub_windows populated if splits occurred.
|
||||
"""
|
||||
extra_fields = extra_fields or {}
|
||||
parser = _load_parser(op)
|
||||
result = WindowResult(
|
||||
op_name=op.name,
|
||||
window_start=window_start,
|
||||
window_end=window_end,
|
||||
)
|
||||
|
||||
# Build base fields
|
||||
fields_base = dict(extra_fields)
|
||||
if op.date_start_field:
|
||||
fields_base[op.date_start_field] = window_start.strftime('%Y-%m-%dT%H:%M:%S')
|
||||
if op.date_end_field:
|
||||
fields_base[op.date_end_field] = window_end.strftime('%Y-%m-%dT%H:%M:%S')
|
||||
|
||||
# Page 1
|
||||
fields = {**fields_base, 'PageIndex': 1}
|
||||
r = client.call(op, fields)
|
||||
result.pages_fetched = 1
|
||||
result.page_total_first = r.page_total
|
||||
|
||||
if r.status != 'Success':
|
||||
result.errors.append(f'page1: {r.status} — {r.description}')
|
||||
return result
|
||||
|
||||
# Detect server cap and split
|
||||
if r.page_total >= op.items_cap and (window_end - window_start) > timedelta(hours=1):
|
||||
log.info(' %s: PageTotal=%d hit cap (>= %d) on %s → %s — splitting',
|
||||
op.name, r.page_total, op.items_cap, window_start, window_end)
|
||||
return _split_window(client, op, window_start, window_end, extra_fields, on_item)
|
||||
|
||||
# Process page 1 items
|
||||
if r.items_xml and parser is not None:
|
||||
for el in _iter_items_xml(r.items_xml, op):
|
||||
try:
|
||||
parsed = parser.parse(el)
|
||||
if parsed and on_item:
|
||||
on_item(parsed, el)
|
||||
result.items_imported += 1
|
||||
except Exception as e:
|
||||
log.exception('Parse error in %s: %s', op.name, e)
|
||||
result.errors.append(f'parse: {e}')
|
||||
|
||||
# Remaining pages
|
||||
total_pages = r.num_pages
|
||||
for page in range(2, total_pages + 1):
|
||||
fields = {**fields_base, 'PageIndex': page}
|
||||
r2 = client.call(op, fields)
|
||||
result.pages_fetched += 1
|
||||
if r2.status != 'Success':
|
||||
result.errors.append(f'page{page}: {r2.status} — {r2.description}')
|
||||
break
|
||||
if parser is None:
|
||||
continue
|
||||
for el in _iter_items_xml(r2.items_xml, op):
|
||||
try:
|
||||
parsed = parser.parse(el)
|
||||
if parsed and on_item:
|
||||
on_item(parsed, el)
|
||||
result.items_imported += 1
|
||||
except Exception as e:
|
||||
log.exception('Parse error in %s page %d: %s', op.name, page, e)
|
||||
result.errors.append(f'parse p{page}: {e}')
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _split_window(client: WspClient, op: WspOp,
|
||||
start: datetime, end: datetime,
|
||||
extra_fields: dict, on_item) -> WindowResult:
|
||||
"""Recursively split window in half until each fits under cap."""
|
||||
parent = WindowResult(op_name=op.name, window_start=start, window_end=end)
|
||||
span = end - start
|
||||
if span <= timedelta(hours=1):
|
||||
# Cannot split further — accept truncation
|
||||
log.warning('Cannot split below 1h for %s [%s, %s] — accepting cap',
|
||||
op.name, start, end)
|
||||
# Just fetch with single window and tolerate cap
|
||||
sub = fetch_window(client, op, start, end, extra_fields, on_item)
|
||||
parent.sub_windows.append(sub)
|
||||
return parent
|
||||
|
||||
mid = start + span / 2
|
||||
sub1 = fetch_window(client, op, start, mid, extra_fields, on_item)
|
||||
sub2 = fetch_window(client, op, mid, end, extra_fields, on_item)
|
||||
parent.sub_windows.extend([sub1, sub2])
|
||||
return parent
|
||||
|
||||
|
||||
def split_into_windows(start: datetime, end: datetime,
|
||||
max_window_days: int) -> list[tuple[datetime, datetime]]:
|
||||
"""Slice [start, end) into chunks of max_window_days each.
|
||||
|
||||
Returns list of (window_start, window_end) tuples.
|
||||
"""
|
||||
if start >= end:
|
||||
return []
|
||||
span = timedelta(days=max_window_days)
|
||||
windows = []
|
||||
cur = start
|
||||
while cur < end:
|
||||
nxt = min(cur + span, end)
|
||||
windows.append((cur, nxt))
|
||||
cur = nxt
|
||||
return windows
|
||||
@@ -0,0 +1,214 @@
|
||||
"""
|
||||
Base notice parser — shared logic for CN, PI, RFQ, DC, PC, Rdc, EN notices.
|
||||
|
||||
CA notice has more complex structure (lots + winners) so it has its own parser.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from ..xml_utils import (
|
||||
find_child_local, find_local, find_path,
|
||||
text_under, text_direct, int_under, decimal_under, bool_under,
|
||||
datetime_under, sysitem_name, sysitem_id,
|
||||
)
|
||||
|
||||
|
||||
def parse_basic_notice(el, *, type_tag: str, source_tag: str,
|
||||
notice_id_field: str = None) -> dict | None:
|
||||
"""Generic notice parser — works for CN, PI, DC, PC, Rdc, EN, RFQ.
|
||||
|
||||
type_tag: short type identifier for the row (e.g. 'c_notice', 'pi_notice')
|
||||
source_tag: value for source column (e.g. 'wsp_cnotice')
|
||||
notice_id_field: local name of the ID field (CNoticeId, PiNoticeId, etc.)
|
||||
If None, auto-detect by trying common names.
|
||||
"""
|
||||
notice_no = (text_under(el, 'NoticeNo') or
|
||||
text_under(el, 'CNoticeNumber') or
|
||||
text_under(el, 'PiNoticeNumber') or
|
||||
text_under(el, 'NoticeNumber') or
|
||||
text_under(el, 'RFQInvitationNumber') or
|
||||
text_under(el, 'RFQNoticeNumber') or
|
||||
text_under(el, 'DCNoticeNumber') or
|
||||
text_under(el, 'PCNoticeNumber') or
|
||||
text_under(el, 'RDCNoticeNumber') or
|
||||
text_under(el, 'ENoticeNumber') or
|
||||
text_under(el, 'EAProcedureNumber') or
|
||||
text_under(el, 'DfNoticeNo')) # last-resort fallback
|
||||
if not notice_no:
|
||||
return None
|
||||
|
||||
notice_id = None
|
||||
for field in (notice_id_field, 'CNoticeId', 'PiNoticeId', 'NoticeId',
|
||||
'CaNoticeId', 'DCNoticeId', 'PCNoticeId',
|
||||
'RFQNoticeId', 'RdcNoticeId', 'ENoticeId',
|
||||
'EAProcedureId', 'RFQInvitationId'):
|
||||
if field:
|
||||
notice_id = int_under(el, field)
|
||||
if notice_id is not None:
|
||||
break
|
||||
|
||||
general = find_child_local(el, 'General')
|
||||
section1 = find_child_local(el, 'Section1')
|
||||
section2 = find_child_local(el, 'Section2')
|
||||
section4 = find_child_local(el, 'Section4')
|
||||
|
||||
# Authority
|
||||
auth_addresses = find_path(section1, 'Section1_1', 'CaAddresses')
|
||||
if auth_addresses is None:
|
||||
auth_addresses = find_path(section1, 'Section1_1')
|
||||
auth_info = find_local(auth_addresses, 'EntityInformation') if auth_addresses is not None else None
|
||||
|
||||
authority_name = text_direct(auth_info, 'Name') if auth_info is not None else None
|
||||
authority_cui = text_direct(auth_info, 'Cif') if auth_info is not None else None
|
||||
authority_address = text_direct(auth_info, 'Address') if auth_info is not None else None
|
||||
authority_email = text_direct(auth_info, 'Email') if auth_info is not None else None
|
||||
authority_phone = text_direct(auth_info, 'Phone') if auth_info is not None else None
|
||||
authority_url = text_direct(auth_info, 'Url') if auth_info is not None else None
|
||||
county_code = sysitem_name(auth_info, 'NutsCode') if auth_info is not None else None
|
||||
entity_id = int_under(general, 'EntityId') if general is not None else None
|
||||
|
||||
s1_4 = find_child_local(section1, 'Section1_4') if section1 is not None else None
|
||||
authority_type = sysitem_name(s1_4, 'ContractingAuthorityType')
|
||||
s1_5 = find_child_local(section1, 'Section1_5') if section1 is not None else None
|
||||
main_activity = sysitem_name(s1_5, 'MainActivity')
|
||||
|
||||
# Section 2 — contract
|
||||
s2_1 = find_child_local(section2, 'Section2_1') if section2 is not None else None
|
||||
contract_title = (text_under(general, 'ContractTitle') or
|
||||
text_under(s2_1, 'ContractName') or
|
||||
text_under(s2_1, 'Title'))
|
||||
short_desc = text_under(s2_1, 'ShortContractDescription')
|
||||
main_cpv_code = sysitem_name(s2_1, 'MainCPV') or sysitem_name(s2_1, 'MainCPVCode')
|
||||
main_cpv_id = sysitem_id(s2_1, 'MainCPV') or sysitem_id(s2_1, 'MainCPVCode')
|
||||
contract_type = sysitem_name(s2_1, 'SysAcquisitionContractType')
|
||||
currency = sysitem_name(s2_1, 'Currency')
|
||||
estimated_value = decimal_under(s2_1, 'EstimatedValue') or decimal_under(s2_1, 'TotalValue')
|
||||
has_lots = bool_under(s2_1, 'ContractHasLots')
|
||||
reference_number = text_under(s2_1, 'ReferenceNumber')
|
||||
|
||||
# Lots — for CN/RFQ/etc., lots in Section2_2
|
||||
lots = _extract_lots_simple(section2)
|
||||
lots_count = len(lots) if lots else None
|
||||
|
||||
# Procedure
|
||||
s4_1 = find_child_local(section4, 'Section4_1') if section4 is not None else None
|
||||
procedure_type = sysitem_name(s4_1, 'SysProcedureType')
|
||||
framework_agreement = bool_under(s4_1, 'FrameworkAgreement')
|
||||
|
||||
# Section 4_2 — deadlines
|
||||
s4_2 = find_child_local(section4, 'Section4_2') if section4 is not None else None
|
||||
deadline_submission = (datetime_under(s4_2, 'TenderAvailabilityDeadline') or
|
||||
datetime_under(s4_2, 'ReceiptTimeLimit') or
|
||||
datetime_under(s4_2, 'ReceiptDeadline'))
|
||||
opening_date = datetime_under(s4_2, 'TenderOpeningDate')
|
||||
|
||||
# Dates + state
|
||||
publication_date = datetime_under(general, 'PublishDate')
|
||||
legislation = sysitem_name(general, 'SysLegislationType') or sysitem_name(general, 'LegislationType')
|
||||
notice_state = sysitem_name(general, 'SysNoticeState')
|
||||
notice_state_id = sysitem_id(general, 'SysNoticeState')
|
||||
is_utility = bool_under(general, 'IsUtility')
|
||||
notice_no_joue = text_under(general, 'NoticeNoJoue') or text_under(general, 'JOUEPublicationNumber')
|
||||
|
||||
# Documents
|
||||
documents = _extract_documents(general)
|
||||
|
||||
return {
|
||||
'type': type_tag,
|
||||
'ref_number': f'WSP-{notice_no}',
|
||||
'authority_name': authority_name,
|
||||
'authority_cui': authority_cui,
|
||||
'authority_address': authority_address,
|
||||
'authority_email': authority_email,
|
||||
'authority_phone': authority_phone,
|
||||
'authority_url': authority_url,
|
||||
'authority_type': authority_type,
|
||||
'authority_main_activity': main_activity,
|
||||
'authority_entity_id': entity_id,
|
||||
'title': contract_title[:1000] if contract_title else None,
|
||||
'cpv_code': main_cpv_code,
|
||||
'contract_type': contract_type,
|
||||
'publication_date': publication_date,
|
||||
'estimated_value': estimated_value,
|
||||
'awarded_value': None,
|
||||
'currency': currency,
|
||||
'supplier_name': None,
|
||||
'supplier_cui': None,
|
||||
'procedure_type': procedure_type,
|
||||
'procedure_state': notice_state,
|
||||
'legislation': legislation,
|
||||
'has_lots': 'da' if has_lots else 'nu' if has_lots is False else None,
|
||||
'contract_has_lots': has_lots,
|
||||
'lots_count': lots_count,
|
||||
'joue': notice_no_joue,
|
||||
'county_code': county_code,
|
||||
'notice_state': notice_state,
|
||||
'notice_state_id': notice_state_id,
|
||||
'framework_agreement': framework_agreement,
|
||||
'notice_id_internal': notice_id,
|
||||
'deadline_submission': deadline_submission,
|
||||
'opening_date': opening_date,
|
||||
'documents': documents or None,
|
||||
'lots': lots or None,
|
||||
'details': {
|
||||
'short_description': short_desc,
|
||||
'reference_number': reference_number,
|
||||
'main_cpv_id': main_cpv_id,
|
||||
'is_utility': is_utility,
|
||||
},
|
||||
'source': source_tag,
|
||||
}
|
||||
|
||||
|
||||
def _extract_lots_simple(section2) -> list[dict]:
|
||||
"""Extract Lots list from Section2_2 → Lots → LotInfo."""
|
||||
if section2 is None:
|
||||
return []
|
||||
lots_list = find_local(section2, 'Lots')
|
||||
if lots_list is None:
|
||||
return []
|
||||
out = []
|
||||
for lot in lots_list:
|
||||
if etree.QName(lot.tag).localname != 'LotInfo':
|
||||
continue
|
||||
lot_data = {
|
||||
'lot_id': int_under(lot, 'LotID'),
|
||||
'lot_no': int_under(lot, 'LotNo'),
|
||||
'title': text_under(lot, 'Title'),
|
||||
'description': text_under(lot, 'DescriptionOfProcurement'),
|
||||
'cpv_code': sysitem_name(lot, 'MainCPVCode'),
|
||||
'estimated_value': _str_decimal(decimal_under(lot, 'EstimatedValue')),
|
||||
'duration_months': int_under(lot, 'DurationInMonths'),
|
||||
'duration_days': int_under(lot, 'DurationInDays'),
|
||||
'currency': sysitem_name(lot, 'Currency'),
|
||||
'place_of_performance': text_under(lot, 'MainSiteOrPlaceOfPerformance'),
|
||||
'is_community_financed': bool_under(lot, 'IsCommunityFinanced'),
|
||||
}
|
||||
out.append({k: v for k, v in lot_data.items() if v is not None})
|
||||
return out
|
||||
|
||||
|
||||
def _extract_documents(general) -> list[dict]:
|
||||
if general is None:
|
||||
return []
|
||||
out = []
|
||||
for fld in ('NoticeFiles', 'CompanyFiles', 'DfNoticeFiles'):
|
||||
container = find_local(general, fld)
|
||||
if container is None:
|
||||
continue
|
||||
for kvp in container:
|
||||
key = find_local(kvp, 'key')
|
||||
if key is None:
|
||||
key = find_local(kvp, 'Key')
|
||||
val = find_local(kvp, 'value')
|
||||
if val is None:
|
||||
val = find_local(kvp, 'Value')
|
||||
if key is not None and val is not None:
|
||||
out.append({'type': fld, 'name': key.text, 'id': val.text})
|
||||
return out
|
||||
|
||||
|
||||
def _str_decimal(d):
|
||||
return str(d) if d is not None else None
|
||||
|
||||
|
||||
from lxml import etree # noqa: E402
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_CNotices — Anunțuri de participare."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='c_notice', source_tag='wsp_cnotice',
|
||||
notice_id_field='CNoticeId')
|
||||
@@ -0,0 +1,287 @@
|
||||
"""Parse SU_CaNotices items (Anunțuri de atribuire) → seap.announcements row dict."""
|
||||
from __future__ import annotations
|
||||
|
||||
from ..xml_utils import (
|
||||
find_child_local, find_local, find_path,
|
||||
text_under, text_direct, int_under, decimal_under, bool_under,
|
||||
datetime_under, sysitem_name, sysitem_id, to_jsonable,
|
||||
)
|
||||
|
||||
SOURCE = 'wsp_canotice'
|
||||
|
||||
|
||||
def parse(el) -> dict | None:
|
||||
"""Extract a CaNoticeBase element (V1 or V2) into our normalized dict."""
|
||||
notice_no = text_under(el, 'NoticeNo')
|
||||
notice_id = int_under(el, 'CaNoticeId')
|
||||
if not notice_no:
|
||||
return None
|
||||
|
||||
general = find_child_local(el, 'General')
|
||||
section1 = find_child_local(el, 'Section1')
|
||||
section2 = find_child_local(el, 'Section2')
|
||||
section4 = find_child_local(el, 'Section4')
|
||||
section5 = find_child_local(el, 'Section5')
|
||||
|
||||
# Authority block: Section1_1 → CaAddresses → CaEntityInformation → EntityInformation
|
||||
auth_entity = find_path(section1, 'Section1_1', 'CaAddresses')
|
||||
auth_info = None
|
||||
if auth_entity is not None:
|
||||
auth_info = find_local(auth_entity, 'EntityInformation')
|
||||
|
||||
authority_name = text_direct(auth_info, 'Name') if auth_info is not None else None
|
||||
authority_cui = text_direct(auth_info, 'Cif') if auth_info is not None else None
|
||||
authority_address = text_direct(auth_info, 'Address') if auth_info is not None else None
|
||||
authority_email = text_direct(auth_info, 'Email') if auth_info is not None else None
|
||||
authority_phone = text_direct(auth_info, 'Phone') if auth_info is not None else None
|
||||
authority_url = text_direct(auth_info, 'Url') if auth_info is not None else None
|
||||
|
||||
# NUTS code = county-ish (RO213 etc.)
|
||||
nuts_el = find_local(auth_info, 'NutsCode') if auth_info is not None else None
|
||||
county_code = sysitem_name(auth_info, 'NutsCode') if auth_info is not None else None
|
||||
|
||||
# Authority type + main activity
|
||||
s1_4 = find_child_local(section1, 'Section1_4') if section1 is not None else None
|
||||
authority_type = sysitem_name(s1_4, 'ContractingAuthorityType')
|
||||
s1_5 = find_child_local(section1, 'Section1_5') if section1 is not None else None
|
||||
main_activity = sysitem_name(s1_5, 'MainActivity')
|
||||
|
||||
# Section 2: contract details
|
||||
s2_1 = find_child_local(section2, 'Section2_1') if section2 is not None else None
|
||||
contract_title = text_under(general, 'ContractTitle') or text_under(s2_1, 'ContractName')
|
||||
short_desc = text_under(s2_1, 'ShortContractDescription')
|
||||
main_cpv_id = sysitem_id(s2_1, 'MainCPV')
|
||||
main_cpv_code = sysitem_name(s2_1, 'MainCPV')
|
||||
contract_type = sysitem_name(s2_1, 'SysAcquisitionContractType')
|
||||
currency = sysitem_name(s2_1, 'Currency')
|
||||
total_value = decimal_under(s2_1, 'TotalValue')
|
||||
highest_offer = decimal_under(s2_1, 'HighestOffer')
|
||||
lowest_offer = decimal_under(s2_1, 'LowestOffer')
|
||||
has_lots = bool_under(s2_1, 'ContractHasLots')
|
||||
reference_number = text_under(s2_1, 'ReferenceNumber')
|
||||
|
||||
# Lots — Section 5 (ContractLotList) for awarded notices, Section 2_2 otherwise
|
||||
lots = _extract_lots(section5) or _extract_lots(section2)
|
||||
lots_count = len(lots) if lots else None
|
||||
|
||||
# Award criteria — extracted from lot info or top-level
|
||||
award_criteria = _extract_award_criteria(el)
|
||||
|
||||
# Section 4: procedure details
|
||||
s4_1 = find_child_local(section4, 'Section4_1') if section4 is not None else None
|
||||
procedure_type = sysitem_name(s4_1, 'SysProcedureType')
|
||||
framework_agreement = bool_under(s4_1, 'FrameworkAgreement')
|
||||
|
||||
# Dates
|
||||
publication_date = datetime_under(general, 'PublishDate')
|
||||
legislation = sysitem_name(general, 'SysLegislationType')
|
||||
notice_state = sysitem_name(general, 'SysNoticeState')
|
||||
notice_state_id = sysitem_id(general, 'SysNoticeState')
|
||||
is_utility = bool_under(general, 'IsUtility')
|
||||
notice_no_joue = text_under(general, 'NoticeNoJoue')
|
||||
entity_id = int_under(general, 'EntityId')
|
||||
|
||||
# Winners (suppliers) — extracted from lot info
|
||||
winners = _extract_winners(section5)
|
||||
# Pick first winner as primary; full list in lots JSONB
|
||||
primary_winner = winners[0] if winners else {}
|
||||
|
||||
# Documents — DfNoticeFiles, CompanyFiles
|
||||
documents = _extract_documents(general)
|
||||
|
||||
# Final award value
|
||||
awarded_value = total_value
|
||||
if not awarded_value and lots:
|
||||
# sum lot values
|
||||
try:
|
||||
awarded_value = sum(
|
||||
(lot.get('total_value') or 0) for lot in lots if lot.get('total_value')
|
||||
) or None
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
# Standard columns
|
||||
'type': 'ca_notice',
|
||||
'ref_number': f'WSP-{notice_no}',
|
||||
'authority_name': authority_name,
|
||||
'authority_cui': authority_cui,
|
||||
'authority_address': authority_address,
|
||||
'authority_email': authority_email,
|
||||
'authority_phone': authority_phone,
|
||||
'authority_url': authority_url,
|
||||
'authority_type': authority_type,
|
||||
'authority_main_activity': main_activity,
|
||||
'authority_entity_id': entity_id,
|
||||
'title': contract_title[:1000] if contract_title else None,
|
||||
'cpv_code': main_cpv_code,
|
||||
'contract_type': contract_type,
|
||||
'publication_date': publication_date,
|
||||
'estimated_value': None,
|
||||
'awarded_value': awarded_value,
|
||||
'currency': currency,
|
||||
'supplier_name': primary_winner.get('name'),
|
||||
'supplier_cui': primary_winner.get('cif'),
|
||||
'supplier_address': primary_winner.get('address'),
|
||||
'supplier_is_sme': primary_winner.get('is_sme'),
|
||||
'procedure_type': procedure_type,
|
||||
'procedure_state': notice_state,
|
||||
'legislation': legislation,
|
||||
'lot_number': None,
|
||||
'has_lots': 'da' if has_lots else 'nu' if has_lots is False else None,
|
||||
'contract_has_lots': has_lots,
|
||||
'lots_count': lots_count,
|
||||
'joue': notice_no_joue,
|
||||
'county_code': county_code,
|
||||
'notice_state': notice_state,
|
||||
'notice_state_id': notice_state_id,
|
||||
'framework_agreement': framework_agreement,
|
||||
'notice_id_internal': notice_id,
|
||||
'seap_url': f'https://e-licitatie.ro/pub/notices/contract-award-notice/{notice_id}' if notice_id else None,
|
||||
|
||||
# Rich JSONB
|
||||
'documents': documents or None,
|
||||
'award_criteria': award_criteria or None,
|
||||
'lots': lots or None,
|
||||
'details': {
|
||||
'short_description': short_desc,
|
||||
'reference_number': reference_number,
|
||||
'main_cpv_id': main_cpv_id,
|
||||
'highest_offer': str(highest_offer) if highest_offer else None,
|
||||
'lowest_offer': str(lowest_offer) if lowest_offer else None,
|
||||
'is_utility': is_utility,
|
||||
'all_winners': winners or None,
|
||||
},
|
||||
'source': SOURCE,
|
||||
}
|
||||
|
||||
|
||||
def _extract_lots(section) -> list[dict]:
|
||||
"""Extract lot info — handles ContractLotList (Section5) or Lots (Section2_2)."""
|
||||
if section is None:
|
||||
return []
|
||||
lot_list = find_local(section, 'ContractLotList')
|
||||
if lot_list is None:
|
||||
lot_list = find_local(section, 'Lots')
|
||||
if lot_list is None:
|
||||
return []
|
||||
out = []
|
||||
for lot in lot_list:
|
||||
if etree.QName(lot.tag).localname not in ('ContractLotInfo', 'LotInfo'):
|
||||
continue
|
||||
lot_data = {
|
||||
'lot_id': int_under(lot, 'LotID') or int_under(lot, 'ContractNo'),
|
||||
'lot_no': int_under(lot, 'LotNo') or int_under(lot, 'ContractNo'),
|
||||
'title': text_under(lot, 'Title') or text_under(lot, 'ContractObjectName'),
|
||||
'description': text_under(lot, 'DescriptionOfProcurement') or text_under(lot, 'ContractObjectDescription'),
|
||||
'cpv_code': sysitem_name(lot, 'MainCPVCode'),
|
||||
'estimated_value': _decimal_or_none(text_under(lot, 'EstimatedValue')),
|
||||
'total_value': _decimal_or_none(text_under(lot, 'TotalValue')),
|
||||
'duration_months': int_under(lot, 'DurationInMonths'),
|
||||
'duration_days': int_under(lot, 'DurationInDays'),
|
||||
'currency': sysitem_name(lot, 'Currency'),
|
||||
'place_of_performance': text_under(lot, 'MainSiteOrPlaceOfPerformance'),
|
||||
'is_community_financed': bool_under(lot, 'IsCommunityFinanced'),
|
||||
'has_options': bool_under(lot, 'HasOptions'),
|
||||
'awarded_to_group': bool_under(lot, 'AwardedToGroupOfEcoOp'),
|
||||
}
|
||||
# Convert decimals to str for JSON safety
|
||||
for k in ('estimated_value', 'total_value'):
|
||||
if lot_data[k] is not None:
|
||||
lot_data[k] = str(lot_data[k])
|
||||
# only keep non-empty
|
||||
out.append({k: v for k, v in lot_data.items() if v is not None})
|
||||
return out
|
||||
|
||||
|
||||
def _extract_winners(section5) -> list[dict]:
|
||||
"""Extract winner info from ContractLotList → ContractorAddressList."""
|
||||
if section5 is None:
|
||||
return []
|
||||
out = []
|
||||
seen_cifs = set()
|
||||
lot_list = find_local(section5, 'ContractLotList')
|
||||
if lot_list is None:
|
||||
return []
|
||||
for lot in lot_list:
|
||||
addr_list = find_local(lot, 'ContractorAddressList')
|
||||
if addr_list is None:
|
||||
continue
|
||||
for sect in addr_list:
|
||||
entity = find_local(sect, 'EntityInformation')
|
||||
if entity is None:
|
||||
continue
|
||||
cif = text_direct(entity, 'Cif')
|
||||
if cif in seen_cifs:
|
||||
continue
|
||||
if cif:
|
||||
seen_cifs.add(cif)
|
||||
out.append({
|
||||
'name': text_direct(entity, 'Name'),
|
||||
'cif': cif,
|
||||
'address': text_direct(entity, 'Address'),
|
||||
'email': text_direct(entity, 'Email'),
|
||||
'phone': text_direct(entity, 'Phone'),
|
||||
'url': text_direct(entity, 'Url'),
|
||||
'nuts_code': sysitem_name(entity, 'NutsCode'),
|
||||
'is_sme': bool_under(sect, 'IsSme'),
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
def _extract_award_criteria(el) -> list[dict]:
|
||||
"""Extract award criteria from lot info (MyPriceAwardCriterias / MyQualityAwardCriterias)."""
|
||||
out = []
|
||||
for crit_list_name in ('MyPriceAwardCriterias', 'MyQualityAwardCriterias'):
|
||||
for crit_list in el.iter():
|
||||
if etree.QName(crit_list.tag).localname != crit_list_name:
|
||||
continue
|
||||
for crit in crit_list:
|
||||
name = text_under(crit, 'Name') or text_under(crit, 'Description')
|
||||
weight = decimal_under(crit, 'Weight') or decimal_under(crit, 'Pondere')
|
||||
if name or weight:
|
||||
item = {'type': 'price' if 'Price' in crit_list_name else 'quality',
|
||||
'name': name}
|
||||
if weight is not None:
|
||||
item['weight'] = str(weight)
|
||||
out.append(item)
|
||||
return out
|
||||
|
||||
|
||||
def _extract_documents(general) -> list[dict]:
|
||||
"""Extract document file references from DfNoticeFiles + CompanyFiles."""
|
||||
if general is None:
|
||||
return []
|
||||
out = []
|
||||
for fld in ('DfNoticeFiles', 'CompanyFiles', 'NoticeFiles'):
|
||||
container = find_local(general, fld)
|
||||
if container is None:
|
||||
continue
|
||||
for kvp in container:
|
||||
key = find_local(kvp, 'key')
|
||||
if key is None:
|
||||
key = find_local(kvp, 'Key')
|
||||
val = find_local(kvp, 'value')
|
||||
if val is None:
|
||||
val = find_local(kvp, 'Value')
|
||||
if key is not None and val is not None:
|
||||
out.append({
|
||||
'type': fld,
|
||||
'name': key.text,
|
||||
'id': val.text,
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
def _decimal_or_none(s):
|
||||
if not s:
|
||||
return None
|
||||
try:
|
||||
from decimal import Decimal
|
||||
return Decimal(s)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
# late import to avoid circular
|
||||
from lxml import etree # noqa: E402
|
||||
@@ -0,0 +1,19 @@
|
||||
"""Catalog_ListItems — Beletage product catalog."""
|
||||
from __future__ import annotations
|
||||
|
||||
from ..xml_utils import text_under, decimal_under, datetime_under, sysitem_name, to_jsonable, int_under
|
||||
|
||||
|
||||
def parse(el) -> dict | None:
|
||||
item_code = text_under(el, 'ItemCode') or text_under(el, 'CatalogItemCode')
|
||||
if not item_code:
|
||||
return None
|
||||
return {
|
||||
'item_code': item_code,
|
||||
'item_name': text_under(el, 'Name') or text_under(el, 'ItemName'),
|
||||
'cpv_code': sysitem_name(el, 'CpvCode'),
|
||||
'unit_price': decimal_under(el, 'UnitPrice') or decimal_under(el, 'Price'),
|
||||
'currency': sysitem_name(el, 'Currency'),
|
||||
'last_updated': datetime_under(el, 'LastUpdate'),
|
||||
'details': to_jsonable(el),
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_DCNotices — Concurs de soluții."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='dc_notice', source_tag='wsp_dcnotice',
|
||||
notice_id_field='DCNoticeId')
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_ENotices — Erate (corections to other notices)."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='e_notice', source_tag='wsp_enotice',
|
||||
notice_id_field='ENoticeId')
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_EAProcedure — Licitații electronice."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='ea_procedure', source_tag='wsp_eaprocedure',
|
||||
notice_id_field='EAProcedureId')
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_PCNotices — Anunțuri concesionari."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='pc_notice', source_tag='wsp_pcnotice',
|
||||
notice_id_field='PCNoticeId')
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_PiNotices — Anunțuri de intenție."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='pi_notice', source_tag='wsp_pinotice',
|
||||
notice_id_field='PiNoticeId')
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_RdcNotices — Rezultate desemnare castigator concurs (rare)."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='rdc_notice', source_tag='wsp_rdcnotice',
|
||||
notice_id_field='RdcNoticeId')
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_RfqInvitations — Cereri de oferta (RFQ invitations)."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='rfq_invitation', source_tag='wsp_rfq_invitation',
|
||||
notice_id_field='RFQInvitationId')
|
||||
@@ -0,0 +1,6 @@
|
||||
"""SU_RfqNotices — Anunțuri de atribuire la cererea de oferta."""
|
||||
from ._base import parse_basic_notice
|
||||
|
||||
def parse(el):
|
||||
return parse_basic_notice(el, type_tag='rfq_notice', source_tag='wsp_rfq_notice',
|
||||
notice_id_field='RFQNoticeId')
|
||||
@@ -0,0 +1,83 @@
|
||||
"""SuContracts — Beletage's own contracts (writes to seap.beletage_contracts)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from ..xml_utils import (
|
||||
find_child_local, find_local, text_under, text_direct,
|
||||
int_under, decimal_under, bool_under, datetime_under,
|
||||
sysitem_name, sysitem_id, to_jsonable,
|
||||
)
|
||||
|
||||
|
||||
def parse(el) -> dict | None:
|
||||
contract_id = int_under(el, 'ContractId')
|
||||
if contract_id is None:
|
||||
return None
|
||||
|
||||
# Top-level fields directly under ContractItem
|
||||
contract_no = text_under(el, 'ContractNo')
|
||||
contract_title = text_under(el, 'ContractTitle')
|
||||
contract_value = decimal_under(el, 'ContractValue')
|
||||
default_currency_value = decimal_under(el, 'DefaultCurrencyContractValue')
|
||||
awarding_date = datetime_under(el, 'ContractAwardingDate')
|
||||
contract_date = datetime_under(el, 'ContractDate')
|
||||
publication_date = datetime_under(el, 'PublicationDate')
|
||||
duration_months = int_under(el, 'MonthsContractDuration')
|
||||
is_current = bool_under(el, 'IsCurrentVersion')
|
||||
is_rejected = bool_under(el, 'IsRejected')
|
||||
version_no = int_under(el, 'VersionNo')
|
||||
version_date = datetime_under(el, 'VersionDate')
|
||||
justification = text_under(el, 'Justification')
|
||||
additional_info = text_under(el, 'AdditionalInformation')
|
||||
contract_phase = sysitem_name(el, 'SysContractPhase')
|
||||
contract_state = sysitem_name(el, 'SysContractState')
|
||||
contract_type = sysitem_name(el, 'SysContractType')
|
||||
currency = sysitem_name(el, 'ContractValueCurrency')
|
||||
|
||||
# CA Notice details: nested under <CANotice><General>...</General></CANotice>
|
||||
ca_notice = find_child_local(el, 'CANotice')
|
||||
ca_notice_id = None
|
||||
ca_notice_no = None
|
||||
authority_name = None
|
||||
authority_cui = None
|
||||
|
||||
if ca_notice is not None:
|
||||
general = find_child_local(ca_notice, 'General')
|
||||
if general is not None:
|
||||
ca_notice_id = int_under(general, 'CaNoticeId')
|
||||
ca_notice_no = text_under(general, 'NoticeNo')
|
||||
|
||||
section1 = find_child_local(ca_notice, 'Section1')
|
||||
if section1 is not None:
|
||||
auth_addresses = find_local(section1, 'CaAddresses')
|
||||
if auth_addresses is not None:
|
||||
auth_info = find_local(auth_addresses, 'EntityInformation')
|
||||
if auth_info is not None:
|
||||
authority_name = text_direct(auth_info, 'Name')
|
||||
authority_cui = text_direct(auth_info, 'Cif')
|
||||
|
||||
return {
|
||||
'contract_id': contract_id,
|
||||
'contract_no': contract_no,
|
||||
'contract_title': contract_title[:1000] if contract_title else None,
|
||||
'contract_type': contract_type,
|
||||
'contract_phase': contract_phase,
|
||||
'contract_state': contract_state,
|
||||
'awarding_date': awarding_date.date() if awarding_date else None,
|
||||
'contract_date': contract_date.date() if contract_date else None,
|
||||
'publication_date': publication_date,
|
||||
'duration_months': duration_months,
|
||||
'contract_value': contract_value,
|
||||
'default_currency_value': default_currency_value,
|
||||
'currency': currency,
|
||||
'ca_notice_id': ca_notice_id,
|
||||
'ca_notice_no': ca_notice_no,
|
||||
'authority_name': authority_name,
|
||||
'authority_cui': authority_cui,
|
||||
'is_current_version': is_current,
|
||||
'is_rejected': is_rejected,
|
||||
'version_no': version_no,
|
||||
'version_date': version_date,
|
||||
'justification': justification,
|
||||
'additional_information': additional_info,
|
||||
'details': to_jsonable(el),
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
"""SuDirectAcquisitions — Beletage's own direct acquisitions (won)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from ..xml_utils import (
|
||||
find_child_local, find_local, text_under, text_direct,
|
||||
int_under, decimal_under, datetime_under, sysitem_name, to_jsonable,
|
||||
)
|
||||
|
||||
|
||||
def parse(el) -> dict | None:
|
||||
da_id = int_under(el, 'DirectAcquisitionId') or int_under(el, 'DirectAcquisitionID')
|
||||
if da_id is None:
|
||||
return None
|
||||
|
||||
cpv_code = sysitem_name(el, 'CpvCode')
|
||||
return {
|
||||
'da_id': da_id,
|
||||
'da_name': text_under(el, 'DirectAcquisitionName'),
|
||||
'unique_identification_code': text_under(el, 'UniqueIdentificationCode'),
|
||||
'cpv_code': cpv_code,
|
||||
'cpv_name': cpv_code, # name == code in this WSDL
|
||||
'contract_type': sysitem_name(el, 'SysAcquisitionContractType'),
|
||||
'publication_date': datetime_under(el, 'PublicationDate'),
|
||||
'finalization_date': datetime_under(el, 'FinalizationDate'),
|
||||
'estimated_value': decimal_under(el, 'EstimatedValue') or decimal_under(el, 'EstimatedValueRon'),
|
||||
'closing_value': decimal_under(el, 'ClosingValue'),
|
||||
'currency': 'RON',
|
||||
'da_state': sysitem_name(el, 'SysDirectAcquisitionState'),
|
||||
'authority_id': int_under(el, 'ContractingAuthorityID'),
|
||||
'authority_name': text_under(el, 'ContractingAuthority'),
|
||||
'details': to_jsonable(el),
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
"""SuInvoices — Beletage's own invoices (writes to seap.beletage_invoices)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from ..xml_utils import (
|
||||
find_child_local, find_local, text_under, text_direct,
|
||||
int_under, decimal_under, datetime_under, sysitem_name, to_jsonable,
|
||||
)
|
||||
|
||||
|
||||
def parse(el) -> dict | None:
|
||||
invoice_id = int_under(el, 'InvoiceId')
|
||||
if invoice_id is None:
|
||||
return None
|
||||
|
||||
# Sums of payments
|
||||
paid_value = None
|
||||
paid_at = None
|
||||
payments = find_local(el, 'Payments')
|
||||
if payments is not None:
|
||||
for pay in payments:
|
||||
v = decimal_under(pay, 'Value')
|
||||
d = datetime_under(pay, 'Date')
|
||||
if v is not None:
|
||||
paid_value = (paid_value or 0) + v
|
||||
if d is not None and (paid_at is None or d > paid_at):
|
||||
paid_at = d
|
||||
|
||||
return {
|
||||
'invoice_id': invoice_id,
|
||||
'invoice_no': text_under(el, 'InvoiceNumber') or text_under(el, 'InvoiceNo'),
|
||||
'invoice_date': (datetime_under(el, 'InvoiceDate') or datetime_under(el, 'IssueDate')),
|
||||
'due_date': datetime_under(el, 'DueDate'),
|
||||
'contract_id': int_under(el, 'ContractId'),
|
||||
'contract_no': text_under(el, 'ContractNo'),
|
||||
'authority_name': text_under(el, 'BuyerName') or text_under(el, 'AuthorityName'),
|
||||
'authority_cui': text_under(el, 'BuyerCif') or text_under(el, 'AuthorityCif'),
|
||||
'total_value': decimal_under(el, 'TotalValue') or decimal_under(el, 'GrandTotal'),
|
||||
'total_value_no_vat': decimal_under(el, 'TotalValueNoVat') or decimal_under(el, 'NetTotal'),
|
||||
'vat_value': decimal_under(el, 'VatValue') or decimal_under(el, 'VAT'),
|
||||
'currency': sysitem_name(el, 'Currency') or text_under(el, 'CurrencyCode'),
|
||||
'state': sysitem_name(el, 'SysInvoiceState') or text_under(el, 'State'),
|
||||
'paid_value': paid_value,
|
||||
'paid_at': paid_at,
|
||||
'details': to_jsonable(el),
|
||||
}
|
||||
|
||||
|
||||
def _date_only(dt):
|
||||
return dt.date() if dt else None
|
||||
@@ -0,0 +1,142 @@
|
||||
"""
|
||||
WSP CLI runner.
|
||||
|
||||
Usage:
|
||||
python -m wsp.runner status
|
||||
python -m wsp.runner incremental [op_name|all]
|
||||
python -m wsp.runner backfill <op_name> --from YYYY-MM-DD --to YYYY-MM-DD [--workers 3]
|
||||
python -m wsp.runner test
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from . import db, sync
|
||||
from .operations import ALL_OPS, PUBLIC_OPS, OWN_OPS
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s %(levelname)s [%(name)s] %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S',
|
||||
)
|
||||
log = logging.getLogger('wsp.runner')
|
||||
|
||||
|
||||
def cmd_status(args):
|
||||
with db.connection() as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute("""
|
||||
SELECT feed, last_run_at, last_cursor_date,
|
||||
items_imported_total, items_imported_24h,
|
||||
consecutive_errors, last_error
|
||||
FROM seap.wsp_sync_state ORDER BY feed
|
||||
""")
|
||||
rows = cur.fetchall()
|
||||
print(f'\n{"Feed":<25} {"Last run":<22} {"Cursor":<22} {"Total":>10} {"Errors"}')
|
||||
print('-' * 100)
|
||||
for r in rows:
|
||||
feed, last_run, cursor, total, _, errs, err_msg = r
|
||||
err_str = err_msg[:40] + '...' if err_msg and len(err_msg) > 40 else (err_msg or '')
|
||||
print(f'{feed:<25} {str(last_run)[:19]:<22} {str(cursor)[:19]:<22} '
|
||||
f'{total or 0:>10,} {errs}/{err_str}')
|
||||
|
||||
cur.execute("""
|
||||
SELECT feed, state, count(*) FROM seap.wsp_backfill_windows
|
||||
GROUP BY feed, state ORDER BY feed, state
|
||||
""")
|
||||
rows = cur.fetchall()
|
||||
if rows:
|
||||
print(f'\n{"Backfill queue":<30}')
|
||||
print('-' * 60)
|
||||
for feed, state, count in rows:
|
||||
print(f' {feed:<25} {state:<15} {count:>8}')
|
||||
|
||||
cur.execute("""
|
||||
SELECT source, count(*) FROM seap.announcements
|
||||
WHERE source LIKE 'wsp_%' GROUP BY source ORDER BY source
|
||||
""")
|
||||
rows = cur.fetchall()
|
||||
if rows:
|
||||
print(f'\n{"Source":<25} {"Rows in seap.announcements"}')
|
||||
print('-' * 60)
|
||||
for source, count in rows:
|
||||
print(f' {source:<25} {count:>10,}')
|
||||
|
||||
|
||||
def cmd_incremental(args):
|
||||
target_ops = args.ops if args.ops != ['all'] else list(ALL_OPS)
|
||||
for op_name in target_ops:
|
||||
if op_name not in ALL_OPS:
|
||||
log.error('Unknown op: %s', op_name)
|
||||
continue
|
||||
log.info('=== Running incremental: %s ===', op_name)
|
||||
result = sync.run_incremental(op_name, lookback_hours=args.lookback_hours)
|
||||
log.info('Result: %s', result)
|
||||
|
||||
|
||||
def cmd_backfill(args):
|
||||
if args.op not in ALL_OPS:
|
||||
log.error('Unknown op: %s. Available: %s', args.op, list(ALL_OPS))
|
||||
sys.exit(1)
|
||||
start = datetime.fromisoformat(args.start)
|
||||
end = datetime.fromisoformat(args.end)
|
||||
log.info('=== Backfill %s [%s, %s) workers=%d ===',
|
||||
args.op, start, end, args.workers)
|
||||
result = sync.run_backfill(args.op, start, end, workers=args.workers)
|
||||
log.info('Done: %s', result)
|
||||
|
||||
|
||||
def cmd_test(args):
|
||||
"""Run a 1-day end-to-end test on yesterday (all SU_* feeds)."""
|
||||
yesterday = datetime.now().date() - timedelta(days=1)
|
||||
start = datetime.combine(yesterday, datetime.min.time())
|
||||
end = datetime.combine(yesterday, datetime.max.time())
|
||||
|
||||
feeds_to_test = ['SU_CaNotices', 'SU_CNotices', 'SU_PiNotices',
|
||||
'SU_RfqNotices', 'SU_RfqInvitations',
|
||||
'SU_DCNotices', 'SU_PCNotices', 'SU_RdcNotices', 'SU_ENotices']
|
||||
log.info('=== Smoke test on %s ===', yesterday)
|
||||
for op_name in feeds_to_test:
|
||||
log.info('--- %s ---', op_name)
|
||||
try:
|
||||
r = sync.run_backfill(op_name, start, end, workers=1, enqueue=True)
|
||||
log.info(' → %d items imported', r['items_imported'])
|
||||
except Exception as e:
|
||||
log.exception(' → ERROR: %s', e)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(prog='wsp.runner')
|
||||
sub = parser.add_subparsers(dest='cmd', required=True)
|
||||
|
||||
sub.add_parser('status')
|
||||
|
||||
p_inc = sub.add_parser('incremental')
|
||||
p_inc.add_argument('ops', nargs='*', default=['all'],
|
||||
help='Operations to sync (default: all)')
|
||||
p_inc.add_argument('--lookback-hours', type=int, default=36)
|
||||
|
||||
p_bf = sub.add_parser('backfill')
|
||||
p_bf.add_argument('op')
|
||||
p_bf.add_argument('--start', required=True, help='YYYY-MM-DD or full ISO')
|
||||
p_bf.add_argument('--end', required=True)
|
||||
p_bf.add_argument('--workers', type=int, default=3)
|
||||
|
||||
sub.add_parser('test')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
handlers = {
|
||||
'status': cmd_status,
|
||||
'incremental': cmd_incremental,
|
||||
'backfill': cmd_backfill,
|
||||
'test': cmd_test,
|
||||
}
|
||||
handlers[args.cmd](args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Sync orchestrator — backfill (range, parallel) and incremental modes.
|
||||
|
||||
Backfill enqueues windows in seap.wsp_backfill_windows; workers claim
|
||||
atomically (SKIP LOCKED), fetch, parse, write, mark complete. Resumable.
|
||||
|
||||
Incremental: reads cursor, fetches (cursor, now), updates cursor on success.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
from .client import WspClient, make_client_from_env
|
||||
from . import db
|
||||
from .operations import WspOp, ALL_OPS
|
||||
from .pagination import fetch_window, split_into_windows
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def enqueue_backfill_windows(op: WspOp, start: datetime, end: datetime) -> int:
|
||||
"""Pre-populate seap.wsp_backfill_windows for [start, end) using op's max window."""
|
||||
windows = split_into_windows(start, end, op.max_window_days)
|
||||
queued = 0
|
||||
with db.connection() as conn:
|
||||
cur = conn.cursor()
|
||||
for ws, we in windows:
|
||||
new_id = db.upsert_backfill_window(cur, op.name, ws, we)
|
||||
if new_id:
|
||||
queued += 1
|
||||
log.info('Queued %d windows for %s [%s, %s)', queued, op.name, start, end)
|
||||
return queued
|
||||
|
||||
|
||||
def process_one_window(client: WspClient, op: WspOp, ws: datetime, we: datetime,
|
||||
win_id: int | None = None) -> dict:
|
||||
"""Fetch + parse + write one window. Returns stats."""
|
||||
rows: list[dict] = []
|
||||
sink = db.SINK_FUNCS[op.sink]
|
||||
|
||||
def collect(parsed, _el):
|
||||
if parsed:
|
||||
rows.append(parsed)
|
||||
|
||||
result = fetch_window(client, op, ws, we, on_item=collect)
|
||||
|
||||
items_imported = 0
|
||||
error_msg = None
|
||||
try:
|
||||
with db.connection() as conn:
|
||||
cur = conn.cursor()
|
||||
items_imported = sink(cur, rows)
|
||||
except Exception as e:
|
||||
error_msg = str(e)[:500]
|
||||
log.exception('DB write failed for %s [%s, %s)', op.name, ws, we)
|
||||
|
||||
if win_id is not None:
|
||||
try:
|
||||
with db.connection() as conn:
|
||||
cur = conn.cursor()
|
||||
db.complete_backfill_window(
|
||||
cur, win_id, items_imported, result.page_total_first,
|
||||
error=error_msg,
|
||||
)
|
||||
except Exception:
|
||||
log.exception('Failed to mark window %d complete', win_id)
|
||||
|
||||
return {
|
||||
'window_start': ws,
|
||||
'window_end': we,
|
||||
'items_imported': items_imported,
|
||||
'page_total': result.page_total_first,
|
||||
'pages_fetched': result.pages_fetched,
|
||||
'errors': result.errors,
|
||||
'sub_windows': len(result.sub_windows),
|
||||
'error': error_msg,
|
||||
}
|
||||
|
||||
|
||||
def run_backfill(op_name: str, start: datetime, end: datetime,
|
||||
workers: int = 3, enqueue: bool = True,
|
||||
endpoint: str | None = None) -> dict:
|
||||
"""Backfill an operation across [start, end) with parallel workers.
|
||||
|
||||
Each worker holds its own WspClient (own SSL session) for thread safety.
|
||||
"""
|
||||
op = ALL_OPS[op_name]
|
||||
if enqueue:
|
||||
enqueue_backfill_windows(op, start, end)
|
||||
|
||||
log.info('Starting backfill: op=%s workers=%d range=[%s, %s)',
|
||||
op_name, workers, start, end)
|
||||
|
||||
stats = {'op': op_name, 'windows_processed': 0, 'items_imported': 0,
|
||||
'errors': []}
|
||||
stop_event = threading.Event()
|
||||
|
||||
def worker_loop(worker_id: int):
|
||||
client = make_client_from_env(endpoint or 'https://e-licitatie.ro:8883/Pub')
|
||||
local_count = 0
|
||||
while not stop_event.is_set():
|
||||
with db.connection() as conn:
|
||||
cur = conn.cursor()
|
||||
claimed = db.claim_backfill_window(cur, op_name)
|
||||
if not claimed:
|
||||
log.info('Worker %d: no more pending windows, exiting', worker_id)
|
||||
return local_count
|
||||
ws, we, win_id = claimed['window_start'], claimed['window_end'], claimed['id']
|
||||
log.info(' W%d: processing [%s → %s) (attempt %d)',
|
||||
worker_id, ws, we, claimed['attempts'])
|
||||
try:
|
||||
r = process_one_window(client, op, ws, we, win_id=win_id)
|
||||
local_count += r['items_imported']
|
||||
log.info(' W%d: done [%s → %s) — items=%d page_total=%d',
|
||||
worker_id, ws, we, r['items_imported'], r['page_total'])
|
||||
except Exception as e:
|
||||
log.exception('Worker %d failed on window %s-%s', worker_id, ws, we)
|
||||
with db.connection() as conn:
|
||||
cur = conn.cursor()
|
||||
db.complete_backfill_window(cur, win_id, 0, 0, error=str(e)[:500])
|
||||
return local_count
|
||||
|
||||
with ThreadPoolExecutor(max_workers=workers) as pool:
|
||||
futures = [pool.submit(worker_loop, i) for i in range(workers)]
|
||||
for f in as_completed(futures):
|
||||
try:
|
||||
count = f.result()
|
||||
stats['items_imported'] += count
|
||||
except Exception as e:
|
||||
stats['errors'].append(str(e))
|
||||
|
||||
log.info('Backfill done: op=%s items=%d', op_name, stats['items_imported'])
|
||||
return stats
|
||||
|
||||
|
||||
def run_incremental(op_name: str, lookback_hours: int = 36,
|
||||
endpoint: str | None = None) -> dict:
|
||||
"""Incremental sync: from (cursor or now-lookback) to now.
|
||||
|
||||
Cursor advances to window_end on success.
|
||||
"""
|
||||
op = ALL_OPS[op_name]
|
||||
client = make_client_from_env(endpoint or 'https://e-licitatie.ro:8883/Pub')
|
||||
|
||||
# Use tz-aware UTC so we can compare with cursor (TIMESTAMPTZ from Postgres).
|
||||
end = datetime.now(timezone.utc)
|
||||
with db.connection() as conn:
|
||||
cur = conn.cursor()
|
||||
state = db.get_cursor(cur, op_name) or {}
|
||||
cursor = state.get('last_cursor_date')
|
||||
if cursor is None:
|
||||
start = end - timedelta(hours=lookback_hours)
|
||||
else:
|
||||
# overlap by 1 hour to catch late-publishing
|
||||
# Ensure cursor is tz-aware (assume UTC for any naive value from DB).
|
||||
if cursor.tzinfo is None:
|
||||
cursor = cursor.replace(tzinfo=timezone.utc)
|
||||
start = cursor - timedelta(hours=1)
|
||||
|
||||
windows = split_into_windows(start, end, op.max_window_days)
|
||||
log.info('Incremental %s: %d windows from %s to %s',
|
||||
op_name, len(windows), start, end)
|
||||
|
||||
total_imported = 0
|
||||
last_error = None
|
||||
last_window_end = cursor
|
||||
for ws, we in windows:
|
||||
try:
|
||||
r = process_one_window(client, op, ws, we)
|
||||
total_imported += r['items_imported']
|
||||
if not r.get('error'):
|
||||
last_window_end = we
|
||||
except Exception as e:
|
||||
last_error = str(e)[:500]
|
||||
log.exception('Incremental window failed: %s [%s, %s)', op_name, ws, we)
|
||||
|
||||
with db.connection() as conn:
|
||||
cur = conn.cursor()
|
||||
db.update_sync_state(
|
||||
cur, op_name,
|
||||
cursor_date=last_window_end,
|
||||
window_start=start,
|
||||
window_end=end,
|
||||
items_added=total_imported,
|
||||
error=last_error,
|
||||
)
|
||||
|
||||
return {'op': op_name, 'items_imported': total_imported,
|
||||
'cursor_advanced_to': last_window_end, 'error': last_error}
|
||||
@@ -0,0 +1,191 @@
|
||||
"""XML extraction helpers for WSP parsers — namespace-agnostic via local-name match."""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import Any, Optional
|
||||
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def find_local(el, local_name: str):
|
||||
"""Find first descendant whose local name matches (any namespace)."""
|
||||
if el is None:
|
||||
return None
|
||||
for d in el.iter():
|
||||
if etree.QName(d.tag).localname == local_name:
|
||||
return d
|
||||
return None
|
||||
|
||||
|
||||
def find_child_local(el, local_name: str):
|
||||
"""Find direct child by local name."""
|
||||
if el is None:
|
||||
return None
|
||||
for c in el:
|
||||
if etree.QName(c.tag).localname == local_name:
|
||||
return c
|
||||
return None
|
||||
|
||||
|
||||
def find_path(el, *local_names: str):
|
||||
"""Walk a path of local names: find_path(el, 'General', 'NoticeNo')."""
|
||||
cur = el
|
||||
for name in local_names:
|
||||
cur = find_child_local(cur, name)
|
||||
if cur is None:
|
||||
return None
|
||||
return cur
|
||||
|
||||
|
||||
def text(el, *local_names) -> Optional[str]:
|
||||
"""Get text under a (possibly nested) path of local names."""
|
||||
if local_names:
|
||||
node = find_path(el, *local_names)
|
||||
else:
|
||||
node = el
|
||||
if node is None or node.text is None:
|
||||
return None
|
||||
txt = node.text.strip()
|
||||
return txt if txt else None
|
||||
|
||||
|
||||
def text_under(el, local_name: str) -> Optional[str]:
|
||||
"""Find first descendant by local name, return its text."""
|
||||
n = find_local(el, local_name)
|
||||
if n is None or n.text is None:
|
||||
return None
|
||||
s = n.text.strip()
|
||||
return s if s else None
|
||||
|
||||
|
||||
def text_direct(el, local_name: str) -> Optional[str]:
|
||||
"""Get text of a DIRECT child by local name only (not descendants).
|
||||
|
||||
Use this for EntityInformation.Name/Address/Email/Phone where SysItem-style
|
||||
children (City, Country, NutsCode) also contain a <Name> that would shadow.
|
||||
"""
|
||||
n = find_child_local(el, local_name)
|
||||
if n is None or n.text is None:
|
||||
return None
|
||||
s = n.text.strip()
|
||||
return s if s else None
|
||||
|
||||
|
||||
def int_under(el, local_name: str) -> Optional[int]:
|
||||
s = text_under(el, local_name)
|
||||
if s is None:
|
||||
return None
|
||||
try:
|
||||
return int(s)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def decimal_under(el, local_name: str) -> Optional[Decimal]:
|
||||
s = text_under(el, local_name)
|
||||
if s is None:
|
||||
return None
|
||||
try:
|
||||
return Decimal(s)
|
||||
except (InvalidOperation, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def bool_under(el, local_name: str) -> Optional[bool]:
|
||||
s = text_under(el, local_name)
|
||||
if s is None:
|
||||
return None
|
||||
return s.lower() == 'true'
|
||||
|
||||
|
||||
def datetime_under(el, local_name: str) -> Optional[datetime]:
|
||||
"""Parse ISO datetime. Handles WCF <DateTime>...</DateTime> wrapper too."""
|
||||
n = find_local(el, local_name)
|
||||
if n is None:
|
||||
return None
|
||||
# Direct text?
|
||||
if n.text:
|
||||
try:
|
||||
return _parse_iso(n.text.strip())
|
||||
except (ValueError, AttributeError):
|
||||
pass
|
||||
# WCF wrapper: <Field><c:DateTime>...</c:DateTime></Field>
|
||||
inner = find_local(n, 'DateTime')
|
||||
if inner is not None and inner.text:
|
||||
try:
|
||||
return _parse_iso(inner.text.strip())
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _parse_iso(s: str) -> datetime:
|
||||
"""Parse various ISO 8601 forms returned by SEAP."""
|
||||
s = s.replace('Z', '+00:00')
|
||||
return datetime.fromisoformat(s)
|
||||
|
||||
|
||||
def sysitem_under(el, local_name: str) -> Optional[dict]:
|
||||
"""Extract a SysItem-style {Id, Name} structure."""
|
||||
n = find_local(el, local_name)
|
||||
if n is None:
|
||||
return None
|
||||
id_el = find_local(n, 'Id')
|
||||
name_el = find_local(n, 'Name')
|
||||
out = {}
|
||||
if id_el is not None and id_el.text:
|
||||
try:
|
||||
out['id'] = int(id_el.text)
|
||||
except ValueError:
|
||||
out['id'] = id_el.text
|
||||
if name_el is not None and name_el.text:
|
||||
out['name'] = name_el.text.strip()
|
||||
return out or None
|
||||
|
||||
|
||||
def sysitem_name(el, local_name: str) -> Optional[str]:
|
||||
item = sysitem_under(el, local_name)
|
||||
return item.get('name') if item else None
|
||||
|
||||
|
||||
def sysitem_id(el, local_name: str) -> Optional[int]:
|
||||
item = sysitem_under(el, local_name)
|
||||
if not item:
|
||||
return None
|
||||
val = item.get('id')
|
||||
return val if isinstance(val, int) else None
|
||||
|
||||
|
||||
def to_jsonable(el) -> dict:
|
||||
"""Convert an element subtree into a nested dict for JSONB storage.
|
||||
|
||||
Strips namespaces, keeps local names. Drops nil elements.
|
||||
Mixed content with multiple same-name children becomes a list.
|
||||
"""
|
||||
if el is None:
|
||||
return {}
|
||||
is_nil = el.get('{http://www.w3.org/2001/XMLSchema-instance}nil') == 'true'
|
||||
if is_nil:
|
||||
return None
|
||||
|
||||
children = list(el)
|
||||
if not children:
|
||||
# leaf
|
||||
if el.text and el.text.strip():
|
||||
return el.text.strip()
|
||||
return None
|
||||
|
||||
out: dict = {}
|
||||
for c in children:
|
||||
key = etree.QName(c.tag).localname
|
||||
val = to_jsonable(c)
|
||||
if val is None:
|
||||
continue
|
||||
if key in out:
|
||||
if not isinstance(out[key], list):
|
||||
out[key] = [out[key]]
|
||||
out[key].append(val)
|
||||
else:
|
||||
out[key] = val
|
||||
return out
|
||||
Reference in New Issue
Block a user