refactor(pdf-compress): replace Ghostscript with qpdf + iLovePDF API

Ghostscript -sDEVICE=pdfwrite fundamentally re-encodes fonts, causing
garbled text regardless of parameters. This cannot be fixed by tuning flags.

New approach:
- Local: qpdf-only lossless structural optimization (5-30% savings,
  zero corruption risk — fonts and images completely untouched)
- Cloud: iLovePDF API integration (auth → start → upload → process →
  download) with 3 levels (recommended/extreme/low), proper image
  recompression without font corruption

Frontend: 3 modes (cloud recommended, cloud extreme, local lossless).
Docker: ILOVEPDF_PUBLIC_KEY env var added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
AI Assistant
2026-03-13 17:50:46 +02:00
parent d75fcb1d1c
commit f5deccd8ea
4 changed files with 358 additions and 203 deletions
+34 -189
View File
@@ -8,81 +8,9 @@ import { tmpdir } from "os";
const execFileAsync = promisify(execFile);
/**
 * Build the Ghostscript argument list for PDF compression.
 *
 * CRITICAL: -dPDFSETTINGS=/screen must NOT be used — it overrides font
 * encoding and produces garbled text. Every parameter is therefore set
 * individually so that only raster IMAGES are recompressed while fonts
 * and text survive untouched.
 *
 * Strategy: re-encode all raster images as JPEG (quality roughly 35-70
 * depending on level), downsample to the level's DPI, deduplicate
 * images, and compress streams. Fonts are never re-encoded.
 */
function gsArgs(
  input: string,
  output: string,
  level: "extreme" | "high" | "balanced",
): string[] {
  // Per-level knobs — these affect images only, never fonts.
  const presets = {
    extreme: { dpi: 100, qfactor: 1.2 }, // ~JPEG quality 35, aggressive
    high: { dpi: 150, qfactor: 0.76 }, // ~JPEG quality 50, good balance
    balanced: { dpi: 200, qfactor: 0.4 }, // ~JPEG quality 70, minimal loss
  };
  const { dpi, qfactor } = presets[level];

  // PostScript distiller dictionary for one image class; the color
  // variants additionally carry /ColorTransform 1.
  const distillerDict = (name: string, withColorTransform: boolean): string => {
    const ct = withColorTransform ? "/ColorTransform 1 " : "";
    return `<< /${name} << /QFactor ${qfactor} /Blend 1 ${ct}/HSamples [2 1 1 2] /VSamples [2 1 1 2] >> >> setdistillerparams`;
  };

  const general = [
    "-sDEVICE=pdfwrite",
    "-dCompatibilityLevel=1.5",
    "-dNOPAUSE",
    "-dBATCH",
    `-sOutputFile=${output}`,
  ];

  // Image recompression — the main size reducer. Turning pass-through off
  // forces GS to re-encode existing JPEG/JPX streams instead of copying them.
  const recompression = [
    "-dPassThroughJPEGImages=false",
    "-dPassThroughJPXImages=false",
    "-dAutoFilterColorImages=false",
    "-dAutoFilterGrayImages=false",
    "-dColorImageFilter=/DCTEncode",
    "-dGrayImageFilter=/DCTEncode",
    "-dEncodeColorImages=true",
    "-dEncodeGrayImages=true",
  ];

  // Downsampling — mono images are kept at 200 DPI minimum to stay legible.
  const downsampling = [
    "-dDownsampleColorImages=true",
    "-dDownsampleGrayImages=true",
    "-dDownsampleMonoImages=true",
    `-dColorImageResolution=${dpi}`,
    `-dGrayImageResolution=${dpi}`,
    `-dMonoImageResolution=${Math.max(dpi, 200)}`,
    "-dColorImageDownsampleType=/Bicubic",
    "-dGrayImageDownsampleType=/Bicubic",
    "-dColorImageDownsampleThreshold=1.0",
    "-dGrayImageDownsampleThreshold=1.0",
    "-dMonoImageDownsampleThreshold=1.0",
  ];

  // Font handling — PRESERVE everything. Subsetting is safe: it keeps the
  // encoding intact while still shrinking size.
  const fonts = [
    "-dSubsetFonts=true",
    "-dEmbedAllFonts=true",
    "-dCompressFonts=true",
  ];

  // Structure/stream optimization; CMYK→RGB saves ~25% on CMYK images.
  const structure = [
    "-dCompressStreams=true",
    "-dDetectDuplicateImages=true",
    "-sColorConversionStrategy=RGB",
  ];

  return [
    ...general,
    ...recompression,
    ...downsampling,
    ...fonts,
    ...structure,
    // JPEG quality dictionaries, applied via inline PostScript.
    "-c",
    distillerDict("ColorACSImageDict", true),
    distillerDict("GrayACSImageDict", false),
    distillerDict("ColorImageDict", true),
    distillerDict("GrayImageDict", false),
    "-f",
    input,
  ];
}
// qpdf args for structure polish (5-15% additional saving)
// qpdf-only compression: lossless structural optimization.
// Does NOT re-encode fonts or images — zero risk of corruption.
// Typical savings: 5-30% depending on PDF structure.
function qpdfArgs(input: string, output: string): string[] {
return [
input,
@@ -115,73 +43,31 @@ function extractFileFromMultipart(
const partStart = raw.indexOf(boundaryBuf, searchFrom);
if (partStart === -1) break;
// Find end of boundary line
const lineEnd = raw.indexOf(crlf, partStart);
if (lineEnd === -1) break;
// Find blank line separating headers from body
const headerEnd = raw.indexOf(headerSep, lineEnd);
if (headerEnd === -1) break;
// Check if this part has a filename
const headers = raw.subarray(lineEnd + 2, headerEnd).toString("utf8");
if (headers.includes("filename=")) {
const fileStart = headerEnd + 4; // skip \r\n\r\n
const fileStart = headerEnd + 4;
// Find closing boundary — search from end to avoid false matches inside PDF
const closingMarker = Buffer.from(`\r\n--${boundary}`);
const fileEnd = raw.lastIndexOf(closingMarker);
if (fileEnd > fileStart) {
return raw.subarray(fileStart, fileEnd);
}
// Fallback: no closing boundary found, take everything after headers
return raw.subarray(fileStart);
}
// Skip past this part
searchFrom = headerEnd + 4;
}
return null;
}
/**
 * Extract a simple (non-file) text field value from a multipart/form-data body.
 *
 * Walks the parts by boundary marker, skipping file parts (headers carrying
 * `filename=`), and returns the body of the first part whose headers contain
 * `name="<fieldName>"`, read up to the next boundary and trimmed.
 *
 * @param raw       Full raw multipart request body.
 * @param boundary  Boundary token from the Content-Type header (without the leading "--").
 * @param fieldName Form field name to look up.
 * @returns Trimmed field value ("" for a present-but-empty field), or null if not found.
 */
function extractFieldFromMultipart(
  raw: Buffer,
  boundary: string,
  fieldName: string,
): string | null {
  const boundaryBuf = Buffer.from(`--${boundary}`);
  // Hoisted out of the loop — previously re-allocated on every iteration.
  const nextBoundaryBuf = Buffer.from(`\r\n--${boundary}`);
  const headerSep = Buffer.from("\r\n\r\n");
  const crlf = Buffer.from("\r\n");
  const namePattern = `name="${fieldName}"`;
  let searchFrom = 0;
  while (searchFrom < raw.length) {
    const partStart = raw.indexOf(boundaryBuf, searchFrom);
    if (partStart === -1) break;
    // End of the boundary line.
    const lineEnd = raw.indexOf(crlf, partStart);
    if (lineEnd === -1) break;
    // Blank line separating part headers from the part body.
    const headerEnd = raw.indexOf(headerSep, lineEnd);
    if (headerEnd === -1) break;
    const headers = raw.subarray(lineEnd + 2, headerEnd).toString("utf8");
    if (headers.includes(namePattern) && !headers.includes("filename=")) {
      const valueStart = headerEnd + 4; // skip \r\n\r\n
      const nextBoundary = raw.indexOf(nextBoundaryBuf, valueStart);
      // ">=" (not ">") so a present-but-empty field yields "" instead of null.
      if (nextBoundary >= valueStart) {
        return raw.subarray(valueStart, nextBoundary).toString("utf8").trim();
      }
    }
    searchFrom = headerEnd + 4;
  }
  return null;
}
async function cleanup(dir: string) {
try {
const { readdir } = await import("fs/promises");
@@ -197,16 +83,13 @@ async function cleanup(dir: string) {
}
export async function POST(req: NextRequest) {
const tmpDir = join(tmpdir(), `pdf-extreme-${randomUUID()}`);
const tmpDir = join(tmpdir(), `pdf-qpdf-${randomUUID()}`);
try {
await mkdir(tmpDir, { recursive: true });
const inputPath = join(tmpDir, "input.pdf");
const gsOutputPath = join(tmpDir, "gs-output.pdf");
const finalOutputPath = join(tmpDir, "final.pdf");
const outputPath = join(tmpDir, "output.pdf");
// Collect raw body via arrayBuffer() — more reliable than formData() for
// large files, and more reliable than Readable.fromWeb streaming to disk.
if (!req.body) {
return NextResponse.json(
{ error: "Lipsește fișierul PDF." },
@@ -216,7 +99,6 @@ export async function POST(req: NextRequest) {
const rawBuf = Buffer.from(await req.arrayBuffer());
// Extract PDF from multipart body
const contentType = req.headers.get("content-type") || "";
const boundaryMatch = contentType.match(
/boundary=(?:"([^"]+)"|([^\s;]+))/,
@@ -239,86 +121,50 @@ export async function POST(req: NextRequest) {
);
}
// Extract compression level from multipart (optional "level" field)
const levelParam = extractFieldFromMultipart(rawBuf, boundary, "level");
const level: "extreme" | "high" | "balanced" =
levelParam === "high" ? "high" :
levelParam === "balanced" ? "balanced" : "extreme";
await writeFile(inputPath, pdfBuffer);
const originalSize = pdfBuffer.length;
// Step 1: Ghostscript — image recompression + downsampling (fonts untouched)
// qpdf: lossless structural optimization — fonts and images untouched
try {
const { stderr } = await execFileAsync(
"gs",
gsArgs(inputPath, gsOutputPath, level),
{
timeout: 300_000, // 5 min for very large files
maxBuffer: 10 * 1024 * 1024, // 10MB stderr buffer
},
);
if (stderr && stderr.includes("Error")) {
console.error("[PDF extreme] GS stderr:", stderr.slice(0, 500));
}
} catch (gsErr) {
await execFileAsync("qpdf", qpdfArgs(inputPath, outputPath), {
timeout: 120_000,
maxBuffer: 10 * 1024 * 1024,
});
} catch (qpdfErr) {
const msg =
gsErr instanceof Error ? gsErr.message : "Ghostscript failed";
qpdfErr instanceof Error ? qpdfErr.message : "qpdf failed";
if (msg.includes("ENOENT") || msg.includes("not found")) {
return NextResponse.json(
{
error:
"Ghostscript nu este instalat pe server. Trebuie adăugat `ghostscript` în Dockerfile.",
},
{ error: "qpdf nu este instalat pe server." },
{ status: 501 },
);
}
// Include stderr in error for debugging
const stderr =
gsErr && typeof gsErr === "object" && "stderr" in gsErr
? String((gsErr as { stderr: unknown }).stderr).slice(0, 300)
: "";
return NextResponse.json(
{
error: `Ghostscript error: ${msg.slice(0, 200)}${stderr ? `${stderr}` : ""}`,
},
{ status: 500 },
);
// qpdf returns exit code 3 for warnings — output is still valid
const exitCode =
qpdfErr && typeof qpdfErr === "object" && "code" in qpdfErr
? (qpdfErr as { code: number }).code
: null;
if (exitCode !== 3) {
return NextResponse.json(
{ error: `qpdf error: ${msg.slice(0, 300)}` },
{ status: 500 },
);
}
}
// Verify GS output is a valid non-empty PDF
let gsSize = 0;
// Verify output exists
let outputSize = 0;
try {
const gsStat = await stat(gsOutputPath);
gsSize = gsStat.size;
const s = await stat(outputPath);
outputSize = s.size;
} catch {
return NextResponse.json(
{ error: "Ghostscript nu a produs fișier output." },
{ error: "qpdf nu a produs fișier output." },
{ status: 500 },
);
}
if (gsSize < 100) {
return NextResponse.json(
{
error: `Ghostscript a produs un fișier gol (${gsSize} bytes). PDF-ul poate conține elemente incompatibile.`,
},
{ status: 500 },
);
}
// Step 2: qpdf — structure optimization + linearization
let finalPath = gsOutputPath;
try {
await execFileAsync("qpdf", qpdfArgs(gsOutputPath, finalOutputPath), {
timeout: 60_000,
});
finalPath = finalOutputPath;
} catch {
// qpdf failed or not installed — GS output is still good
}
const resultBuffer = await readFile(finalPath);
const resultBuffer = await readFile(outputPath);
const compressedSize = resultBuffer.length;
// If compression made it bigger, return original
@@ -327,8 +173,7 @@ export async function POST(req: NextRequest) {
status: 200,
headers: {
"Content-Type": "application/pdf",
"Content-Disposition":
'attachment; filename="compressed-extreme.pdf"',
"Content-Disposition": 'attachment; filename="optimized.pdf"',
"X-Original-Size": String(originalSize),
"X-Compressed-Size": String(originalSize),
},
@@ -339,7 +184,7 @@ export async function POST(req: NextRequest) {
status: 200,
headers: {
"Content-Type": "application/pdf",
"Content-Disposition": 'attachment; filename="compressed-extreme.pdf"',
"Content-Disposition": 'attachment; filename="optimized.pdf"',
"X-Original-Size": String(originalSize),
"X-Compressed-Size": String(compressedSize),
},
@@ -347,7 +192,7 @@ export async function POST(req: NextRequest) {
} catch (err) {
const message = err instanceof Error ? err.message : "Unknown error";
return NextResponse.json(
{ error: `Eroare la compresia extremă: ${message}` },
{ error: `Eroare la optimizare: ${message}` },
{ status: 500 },
);
} finally {