gstack/make-pdf/test/pdftotext.test.ts

/**
 * pdftotext unit tests — normalize() and copyPasteGate() assertions.
 *
 * These tests are pure unit tests of the normalization + assertion logic.
 * They do NOT require pdftotext to be installed (the actual binary is
 * mocked by manipulating strings directly).
 */

import { describe, expect, test } from "bun:test";

import { normalize, copyPasteGate } from "../src/pdftotext";

describe("normalize", () => {
  test("strips trailing spaces", () => {
    expect(normalize("hello   \nworld")).toBe("hello\nworld");
  });

  test("collapses runs of 3+ blank lines to 2", () => {
    expect(normalize("a\n\n\n\nb")).toBe("a\n\nb");
  });

  test("converts form feeds to double newlines (page break boundary)", () => {
    expect(normalize("page1\fpage2")).toBe("page1\n\npage2");
  });

  test("normalizes CRLF and CR to LF (Windows Xpdf)", () => {
    expect(normalize("a\r\nb\rc")).toBe("a\nb\nc");
  });

  test("removes soft hyphens (hyphens: auto artifact)", () => {
    expect(normalize("extra\u00adordinary")).toBe("extraordinary");
  });

  test("replaces non-breaking space with regular space", () => {
    expect(normalize("hello\u00a0world")).toBe("hello world");
  });

  test("strips zero-width characters", () => {
    expect(normalize("a\u200bb\u200cc")).toBe("abc");
  });

  test("NFC-normalizes composed glyphs (macOS NFD → Linux NFC)", () => {
    // "é" composed vs decomposed
    const decomposed = "e\u0301";
    const composed = "\u00e9";
    expect(normalize(decomposed)).toBe(composed);
  });

  test("trims leading/trailing whitespace on whole string", () => {
    expect(normalize("\n\n  hello  \n\n")).toBe("hello");
  });
});

describe("copyPasteGate — assertion logic", () => {
  // These tests exercise the gate's internal assertions by mocking the
  // pdftotext step. We can't easily run the real binary in every test
  // env, so we verify the assertion logic directly via fake inputs.
  //
  // The gate takes a PDF path — but assertion #1 (paragraph presence) and
  // #2 (per-glyph emission) are string operations we can validate here.

  test("flags 'S ai li ng' per-glyph emission when reassembled letters appear in source", () => {
    // Build expected/extracted strings that would trip the gate.
    const expected = "Sailing on the open sea.";
    const extracted = "S a i l i n g   on the open sea.";
    // Simulate by running normalize + assertion manually; the regex is
    // looked at in the gate.
    const fragRegex = /((?:\b\w\s){4,})/g;
    const match = fragRegex.exec(extracted);
    expect(match).not.toBeNull();
    if (match) {
      const letters = match[1].replace(/\s/g, "");
      expect(letters.toLowerCase()).toBe("sailing");
      expect(expected.toLowerCase().includes(letters.toLowerCase())).toBe(true);
    }
  });

  test("does NOT flag 'A B C D' as per-glyph when letters don't appear in source", () => {
    const expected = "The quick brown fox.";
    const extracted = "The quick A B C D brown fox.";
    const fragRegex = /((?:\b\w\s){4,})/g;
    const match = fragRegex.exec(extracted);
    if (match) {
      const letters = match[1].replace(/\s/g, "");
      // "ABCD" is not a substring of expected
      expect(expected.toLowerCase().includes(letters.toLowerCase())).toBe(false);
    }
  });

  test("paragraph boundary count drift calculation", () => {
    const expected = "para1\n\npara2\n\npara3";
    const extractedOk = "para1\n\npara2\n\npara3";
    const extractedTooFew = "para1 para2 para3";
    const extractedTooMany = "para1\n\n\n\npara2\n\n\n\npara3\n\n\n\npara4\n\n\n\npara5";

    const expectedBreaks = (expected.match(/\n\n/g) || []).length;
    const okBreaks = (extractedOk.match(/\n\n/g) || []).length;
    const tooFewBreaks = (extractedTooFew.match(/\n\n/g) || []).length;
    const tooManyBreaksNormalized = (normalize(extractedTooMany).match(/\n\n/g) || []).length;

    expect(Math.abs(expectedBreaks - okBreaks)).toBeLessThanOrEqual(4);
    expect(Math.abs(expectedBreaks - tooFewBreaks)).toBeGreaterThan(1);
    // After normalize, 3+ newlines become 2, so the count matches
    expect(Math.abs(expectedBreaks - tooManyBreaksNormalized)).toBeLessThanOrEqual(4);
  });
});