feat(epay): three layers of download/poll resilience

After 327649 hit a transient ANCPI 500 on download (succeeded immediately on
manual retry), make the pipeline self-heal instead of marking the row failed:

1. downloadDocument retries transient failures (5xx, network/timeout, empty
   body, non-PDF error page) up to 4 attempts with linear backoff (3/6/9s);
   a 4xx is permanent and stops immediately. The %PDF guard stays — a bad
   body is now retried rather than thrown on the first try.

2. pollUntilComplete tolerates an error on any individual poll: it logs a
   warning and continues to the next cycle instead of throwing out of the
   whole batch (one ANCPI blip during polling no longer fails a paid order).
   Every poll error is treated as transient; the loop is still bounded by
   POLL_MAX_ATTEMPTS.

3. finalizeOrder runs a final retry sweep: any row still failed after the
   parallel pass is re-attempted once, sequentially, after a 5s pause (covers
   a longer ANCPI blip or a MinIO hiccup). No new charge — the order is
   already paid.

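Same downloadDocument + pollUntilComplete hardening ported to eterra-live
(note: the two files changed in this commit are the ePay client and the ePay
queue; the eterra-live port is not visible in this diff).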

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude VM
2026-06-05 16:42:23 +03:00
parent 1c8d7ea59c
commit c9f1219eaa
2 changed files with 98 additions and 29 deletions
+66 -25
View File
@@ -51,6 +51,9 @@ const POLL_MAX_ATTEMPTS = 40;
// ShowOrderDetails page size — large enough to fetch any realistic batch in // ShowOrderDetails page size — large enough to fetch any realistic batch in
// one request (see getOrderStatus / QW4). // one request (see getOrderStatus / QW4).
const ORDER_PAGE_SIZE = 50; const ORDER_PAGE_SIZE = 50;
// Document download retry (transient ANCPI 5xx / timeout / error-page).
const DOWNLOAD_MAX_ATTEMPTS = 4;
const DOWNLOAD_RETRY_DELAY_MS = 3_000; // linear backoff: 3s, 6s, 9s
/* ------------------------------------------------------------------ */ /* ------------------------------------------------------------------ */
/* Session cache */ /* Session cache */
@@ -779,10 +782,20 @@ export class EpayClient {
onProgress?: (attempt: number, status: string) => void, onProgress?: (attempt: number, status: string) => void,
): Promise<EpayOrderStatus> { ): Promise<EpayOrderStatus> {
for (let attempt = 1; attempt <= POLL_MAX_ATTEMPTS; attempt++) { for (let attempt = 1; attempt <= POLL_MAX_ATTEMPTS; attempt++) {
const status = await this.getOrderStatus(orderId); try {
if (onProgress) onProgress(attempt, status.status); const status = await this.getOrderStatus(orderId);
if (["Finalizata", "Anulata", "Plata refuzata"].includes(status.status)) { if (onProgress) onProgress(attempt, status.status);
return status; if (["Finalizata", "Anulata", "Plata refuzata"].includes(status.status)) {
return status;
}
} catch (err) {
// A transient ANCPI error (5xx, timeout) on ONE poll must not abort
// the whole batch — the order is paid and still being processed.
// Log and try again on the next cycle.
const msg = err instanceof Error ? err.message : String(err);
console.warn(
`[epay] poll ${attempt}/${POLL_MAX_ATTEMPTS} for order ${orderId} errored (${msg}); continuing`,
);
} }
await sleep(POLL_INTERVAL_MS); await sleep(POLL_INTERVAL_MS);
} }
@@ -793,29 +806,57 @@ export class EpayClient {
async downloadDocument(idDocument: number, typeD = 4): Promise<Buffer> { async downloadDocument(idDocument: number, typeD = 4): Promise<Buffer> {
const url = `${BASE_URL}/DownloadFile.action?typeD=${typeD}&id=${idDocument}&source=&browser=chrome`; const url = `${BASE_URL}/DownloadFile.action?typeD=${typeD}&id=${idDocument}&source=&browser=chrome`;
// Angular sends Content-Type: application/pdf in the REQUEST let lastErr = "unknown";
const response = await this.client.post(url, null, {
headers: { "Content-Type": "application/pdf" },
timeout: DEFAULT_TIMEOUT_MS,
responseType: "arraybuffer",
});
const data = response.data; // ANCPI's DownloadFile occasionally returns a transient 5xx / times out /
if (!data || data.length < 100) { // hands back an error page even when the order is finalized (2026-06-05:
throw new Error(`ePay download empty (${data?.length ?? 0} bytes)`); // 327649 got one 500, then succeeded on the very next attempt). The
// download is idempotent, so retry transient failures with backoff before
// giving up. A 4xx is treated as permanent (stop immediately).
for (let attempt = 1; attempt <= DOWNLOAD_MAX_ATTEMPTS; attempt++) {
try {
const response = await this.client.post(url, null, {
headers: { "Content-Type": "application/pdf" },
timeout: DEFAULT_TIMEOUT_MS,
responseType: "arraybuffer",
validateStatus: () => true, // inspect status ourselves for retry
});
if (response.status >= 400) {
lastErr = `HTTP ${response.status}`;
if (response.status < 500) break; // client error — won't fix on retry
} else {
const buf = Buffer.from(response.data ?? Buffer.alloc(0));
if (buf.length < 100) {
lastErr = `empty (${buf.length} bytes)`;
} else if (buf.subarray(0, 5).toString("latin1") !== "%PDF-") {
// Not a PDF — usually a transient ANCPI error page or an expired
// session. Retry; a fresh attempt often returns the real PDF.
const head = buf.subarray(0, 48).toString("latin1").replace(/\s+/g, " ");
lastErr = `not a PDF (head="${head.slice(0, 40)}")`;
} else {
if (attempt > 1) {
console.log(`[epay] download ${idDocument} recovered on attempt ${attempt}`);
}
console.log(`[epay] Downloaded document ${idDocument}: ${buf.length} bytes`);
return buf;
}
}
} catch (err) {
// Network error / timeout — retryable.
lastErr = err instanceof Error ? err.message : String(err);
}
if (attempt < DOWNLOAD_MAX_ATTEMPTS) {
console.warn(
`[epay] download ${idDocument} attempt ${attempt} failed (${lastErr}); retrying in ${DOWNLOAD_RETRY_DELAY_MS * attempt}ms`,
);
await sleep(DOWNLOAD_RETRY_DELAY_MS * attempt);
}
} }
const buf = Buffer.from(data); throw new Error(
// R2: if the ePay session expired mid-batch, DownloadFile returns the `ePay download failed after ${DOWNLOAD_MAX_ATTEMPTS} attempts (idDocument=${idDocument}): ${lastErr}`,
// login/error HTML page (200 OK) instead of the PDF. Storing that as a );
// ".pdf" silently corrupts the extract. Assert the PDF magic bytes.
if (buf.subarray(0, 5).toString("latin1") !== "%PDF-") {
const head = buf.subarray(0, 64).toString("latin1");
throw new Error(
`ePay download not a PDF (idDocument=${idDocument}, ${buf.length} bytes, head="${head.replace(/\s+/g, " ").slice(0, 40)}") — session may have expired`,
);
}
console.log(`[epay] Downloaded document ${idDocument}: ${buf.length} bytes`);
return buf;
} }
} }
+32 -4
View File
@@ -574,10 +574,10 @@ async function finalizeOrder(
plans.push({ item, doc, matchedByIndex, index: next }); plans.push({ item, doc, matchedByIndex, index: next });
} }
// Step 6: download + store in parallel (bounded). Each task is fully // One plan's download + store. Returns true on success. On failure it
// self-contained so a failure on one row doesn't abort the others. The // marks the row failed and returns false so the caller can retry it.
// file index is pre-allocated above, so parallel stores never overwrite. const downloadAndStore = async (plan: Plan): Promise<boolean> => {
await runWithConcurrency(plans, DOWNLOAD_CONCURRENCY, async ({ item, doc, matchedByIndex, index: fileIndex }) => { const { item, doc, matchedByIndex, index: fileIndex } = plan;
try { try {
await updateStatus(item.extractId, "downloading", { await updateStatus(item.extractId, "downloading", {
idDocument: doc.idDocument, idDocument: doc.idDocument,
@@ -629,15 +629,43 @@ async function finalizeOrder(
console.log( console.log(
`[epay-queue] ${matchedByIndex ? "Review" : "Completed"}: ${item.input.nrCadastral}${path}`, `[epay-queue] ${matchedByIndex ? "Review" : "Completed"}: ${item.input.nrCadastral}${path}`,
); );
return true;
} catch (error) { } catch (error) {
const message = const message =
error instanceof Error ? error.message : "Eroare download/stocare"; error instanceof Error ? error.message : "Eroare download/stocare";
await updateStatus(item.extractId, "failed", { await updateStatus(item.extractId, "failed", {
errorMessage: message, errorMessage: message,
}); });
return false;
} }
};
// Step 6: download + store in parallel (bounded). Each task is fully
// self-contained so a failure on one row doesn't abort the others. The
// file index is pre-allocated above, so parallel stores never overwrite.
// downloadDocument already retries transient ANCPI errors per call; this
// adds a SECOND layer — a final sweep that re-attempts any row still
// failed (covers a longer ANCPI blip or a MinIO hiccup) with no new
// charge, since the order is already paid.
const failed: Plan[] = [];
await runWithConcurrency(plans, DOWNLOAD_CONCURRENCY, async (plan) => {
const ok = await downloadAndStore(plan);
if (!ok) failed.push(plan);
}); });
if (failed.length > 0) {
console.warn(
`[epay-queue] ${failed.length}/${plans.length} downloads failed for order ${orderId} — retry sweep in 5s...`,
);
await new Promise((r) => setTimeout(r, 5000));
for (const plan of failed) {
const ok = await downloadAndStore(plan);
console.log(
`[epay-queue] retry sweep ${plan.item.input.nrCadastral}: ${ok ? "recovered" : "still failed"}`,
);
}
}
// Update credits after successful order // Update credits after successful order
const newCredits = await client.getCredits(); const newCredits = await client.getCredits();
updateEpayCredits(newCredits); updateEpayCredits(newCredits);