feat(epay): three layers of download/poll resilience

After 327649 hit a transient ANCPI 500 on download (succeeded immediately on
manual retry), make the pipeline self-heal instead of marking the row failed:

1. downloadDocument retries transient failures (5xx, network/timeout, empty
   body, non-PDF error page): up to 4 attempts in total, with linear backoff
   between them (3 s, 6 s, 9 s); a 4xx is treated as permanent and stops
   immediately. The %PDF guard stays — a bad body is now retried rather than
   thrown on the first try.

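2. pollUntilComplete tolerates an error thrown by a single poll (intended for
   transient ANCPI 5xx/timeouts): it logs a warning, waits the usual poll
   interval and continues to the next cycle instead of throwing out of the
   whole batch (one ANCPI blip during polling no longer fails a paid order).
   A failed poll still counts toward the poll-attempt limit.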

3. finalizeOrder runs a final retry sweep: any row still failed after the
   parallel pass is re-attempted once more, one row at a time, after a 5 s
   pause (covers a longer ANCPI blip or a MinIO hiccup). No new charge — the
   order is already paid.

Same downloadDocument + pollUntilComplete hardening ported to eterra-live.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude VM
2026-06-05 16:42:23 +03:00
parent 1c8d7ea59c
commit c9f1219eaa
2 changed files with 98 additions and 29 deletions
+55 -14
View File
@@ -51,6 +51,9 @@ const POLL_MAX_ATTEMPTS = 40;
// ShowOrderDetails page size — large enough to fetch any realistic batch in
// one request (see getOrderStatus / QW4).
const ORDER_PAGE_SIZE = 50;
// Document download retry (transient ANCPI 5xx / timeout / error-page).
const DOWNLOAD_MAX_ATTEMPTS = 4;
const DOWNLOAD_RETRY_DELAY_MS = 3_000; // linear backoff: 3s, 6s, 9s
/* ------------------------------------------------------------------ */
/* Session cache */
@@ -779,11 +782,21 @@ export class EpayClient {
onProgress?: (attempt: number, status: string) => void,
): Promise<EpayOrderStatus> {
for (let attempt = 1; attempt <= POLL_MAX_ATTEMPTS; attempt++) {
try {
const status = await this.getOrderStatus(orderId);
if (onProgress) onProgress(attempt, status.status);
if (["Finalizata", "Anulata", "Plata refuzata"].includes(status.status)) {
return status;
}
} catch (err) {
// A transient ANCPI error (5xx, timeout) on ONE poll must not abort
// the whole batch — the order is paid and still being processed.
// Log and try again on the next cycle.
const msg = err instanceof Error ? err.message : String(err);
console.warn(
`[epay] poll ${attempt}/${POLL_MAX_ATTEMPTS} for order ${orderId} errored (${msg}); continuing`,
);
}
await sleep(POLL_INTERVAL_MS);
}
throw new Error(`ePay order ${orderId} timed out after ${POLL_MAX_ATTEMPTS} poll attempts`);
@@ -793,29 +806,57 @@ export class EpayClient {
async downloadDocument(idDocument: number, typeD = 4): Promise<Buffer> {
const url = `${BASE_URL}/DownloadFile.action?typeD=${typeD}&id=${idDocument}&source=&browser=chrome`;
// Angular sends Content-Type: application/pdf in the REQUEST
let lastErr = "unknown";
// ANCPI's DownloadFile occasionally returns a transient 5xx / times out /
// hands back an error page even when the order is finalized (2026-06-05:
// 327649 got one 500, then succeeded on the very next attempt). The
// download is idempotent, so retry transient failures with backoff before
// giving up. A 4xx is treated as permanent (stop immediately).
for (let attempt = 1; attempt <= DOWNLOAD_MAX_ATTEMPTS; attempt++) {
try {
const response = await this.client.post(url, null, {
headers: { "Content-Type": "application/pdf" },
timeout: DEFAULT_TIMEOUT_MS,
responseType: "arraybuffer",
validateStatus: () => true, // inspect status ourselves for retry
});
const data = response.data;
if (!data || data.length < 100) {
throw new Error(`ePay download empty (${data?.length ?? 0} bytes)`);
}
const buf = Buffer.from(data);
// R2: if the ePay session expired mid-batch, DownloadFile returns the
// login/error HTML page (200 OK) instead of the PDF. Storing that as a
// ".pdf" silently corrupts the extract. Assert the PDF magic bytes.
if (buf.subarray(0, 5).toString("latin1") !== "%PDF-") {
const head = buf.subarray(0, 64).toString("latin1");
throw new Error(
`ePay download not a PDF (idDocument=${idDocument}, ${buf.length} bytes, head="${head.replace(/\s+/g, " ").slice(0, 40)}") — session may have expired`,
);
if (response.status >= 400) {
lastErr = `HTTP ${response.status}`;
if (response.status < 500) break; // client error — won't fix on retry
} else {
const buf = Buffer.from(response.data ?? Buffer.alloc(0));
if (buf.length < 100) {
lastErr = `empty (${buf.length} bytes)`;
} else if (buf.subarray(0, 5).toString("latin1") !== "%PDF-") {
// Not a PDF — usually a transient ANCPI error page or an expired
// session. Retry; a fresh attempt often returns the real PDF.
const head = buf.subarray(0, 48).toString("latin1").replace(/\s+/g, " ");
lastErr = `not a PDF (head="${head.slice(0, 40)}")`;
} else {
if (attempt > 1) {
console.log(`[epay] download ${idDocument} recovered on attempt ${attempt}`);
}
console.log(`[epay] Downloaded document ${idDocument}: ${buf.length} bytes`);
return buf;
}
}
} catch (err) {
// Network error / timeout — retryable.
lastErr = err instanceof Error ? err.message : String(err);
}
if (attempt < DOWNLOAD_MAX_ATTEMPTS) {
console.warn(
`[epay] download ${idDocument} attempt ${attempt} failed (${lastErr}); retrying in ${DOWNLOAD_RETRY_DELAY_MS * attempt}ms`,
);
await sleep(DOWNLOAD_RETRY_DELAY_MS * attempt);
}
}
throw new Error(
`ePay download failed after ${DOWNLOAD_MAX_ATTEMPTS} attempts (idDocument=${idDocument}): ${lastErr}`,
);
}
}
+32 -4
View File
@@ -574,10 +574,10 @@ async function finalizeOrder(
plans.push({ item, doc, matchedByIndex, index: next });
}
// Step 6: download + store in parallel (bounded). Each task is fully
// self-contained so a failure on one row doesn't abort the others. The
// file index is pre-allocated above, so parallel stores never overwrite.
await runWithConcurrency(plans, DOWNLOAD_CONCURRENCY, async ({ item, doc, matchedByIndex, index: fileIndex }) => {
// One plan's download + store. Returns true on success. On failure it
// marks the row failed and returns false so the caller can retry it.
const downloadAndStore = async (plan: Plan): Promise<boolean> => {
const { item, doc, matchedByIndex, index: fileIndex } = plan;
try {
await updateStatus(item.extractId, "downloading", {
idDocument: doc.idDocument,
@@ -629,15 +629,43 @@ async function finalizeOrder(
console.log(
`[epay-queue] ${matchedByIndex ? "Review" : "Completed"}: ${item.input.nrCadastral}${path}`,
);
return true;
} catch (error) {
const message =
error instanceof Error ? error.message : "Eroare download/stocare";
await updateStatus(item.extractId, "failed", {
errorMessage: message,
});
return false;
}
};
// Step 6: download + store in parallel (bounded). Each task is fully
// self-contained so a failure on one row doesn't abort the others. The
// file index is pre-allocated above, so parallel stores never overwrite.
// downloadDocument already retries transient ANCPI errors per call; this
// adds a SECOND layer — a final sweep that re-attempts any row still
// failed (covers a longer ANCPI blip or a MinIO hiccup) with no new
// charge, since the order is already paid.
const failed: Plan[] = [];
await runWithConcurrency(plans, DOWNLOAD_CONCURRENCY, async (plan) => {
const ok = await downloadAndStore(plan);
if (!ok) failed.push(plan);
});
if (failed.length > 0) {
console.warn(
`[epay-queue] ${failed.length}/${plans.length} downloads failed for order ${orderId} — retry sweep in 5s...`,
);
await new Promise((r) => setTimeout(r, 5000));
for (const plan of failed) {
const ok = await downloadAndStore(plan);
console.log(
`[epay-queue] retry sweep ${plan.item.input.nrCadastral}: ${ok ? "recovered" : "still failed"}`,
);
}
}
// Update credits after successful order
const newCredits = await client.getCredits();
updateEpayCredits(newCredits);