#!/usr/bin/env python3
"""Generate a draft PDF preview from a collection of markdown files.

The input is either a markdown table-of-contents file (every ``[text](x.md)``
link is followed, in order) or a directory that is scanned recursively for
markdown files whose name and top-level path component start with two digits.
Each file is converted to HTML, its headings are numbered, and everything is
combined with a title page, a disclaimer and a table of contents before being
rendered to PDF.

Provenance: restored from patch 04355ff ("Add adjusted PDF generator from
AIMA project"), file PDFGenerator/generate_pdf_preview.py.

NOTE(review): the copy this was restored from had all HTML tags stripped.
The tag names, class names and the stylesheet below are reconstructions;
the visible text and the Python logic are taken from the patch. Confirm the
markup against the upstream file before relying on exact styling.
"""
import argparse
import re
import subprocess
from datetime import datetime, timezone
from html import escape
from pathlib import Path

VALIDATE_STRUCTURE = False

# <hN ...>text</hN>; group 1 = level, group 2 = raw attributes, group 3 = text.
_HEADING_RE = re.compile(r'<h([1-6])([^>]*)>(.*?)</h\1>')
# Splits an <img> tag around its src value so only the value is rewritten.
_IMG_SRC_RE = re.compile(r'(<img\b[^>]*?\ssrc=")([^"]+)(")')
# Sources that already carry a URL scheme (two or more characters, so Windows
# drive letters do not match) or are protocol-relative are not local files.
_REMOTE_SRC_RE = re.compile(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]+:|//)')
_ID_ATTR_RE = re.compile(r'\bid="([^"]*)"')

# (marker in the markdown source, label shown in the PDF, CSS class)
_CALLOUTS = (
    ("NOTE", "Note", "note"),
    ("COMMENT", "Comment", "comment"),
)

# NOTE(review): reconstructed stylesheet, see module docstring.
_STYLESHEET = """
@page {
    size: A4;
    margin: 2.5cm 2cm;
    @bottom-center { content: counter(page); font-size: 9pt; color: #666; }
}
body { font-family: sans-serif; font-size: 11pt; line-height: 1.4; }
img { max-width: 100%; }
.watermark {
    position: fixed; top: 40%; left: 10%;
    font-size: 120pt; color: rgba(200, 0, 0, 0.08);
    transform: rotate(-30deg); z-index: -1;
}
.title-page, .disclaimer, .toc { page-break-after: always; }
.title-page { text-align: center; padding-top: 30%; }
.toc ul { list-style: none; padding-left: 0; }
.toc a { color: inherit; text-decoration: none; }
.toc-level-2 { margin-left: 1.5em; }
.toc-level-3 { margin-left: 3em; }
.toc-level-4, .toc-level-5, .toc-level-6 { margin-left: 4.5em; }
.note, .comment { border-left: 4px solid #888; padding: 0.4em 0.8em; margin: 0.8em 0; }
.note { background: #eef5ff; }
.comment { background: #fff8e1; }
"""


def sanitize_heading(text):
    """Turn *text* into an anchor id: spaces become ``_``, the rest is
    reduced to ASCII letters, digits, ``_`` and ``-``."""
    return re.sub(r'[^a-zA-Z0-9_-]', '', text.replace(' ', '_'))


def get_git_info():
    """Return ``(branch, commit)`` of the git checkout in the working directory.

    Falls back to ``("unknown", "unknown")`` when git is not installed or the
    working directory is not a repository.
    """
    def run_git(*args):
        # stderr is silenced because failure is an expected, handled case.
        raw = subprocess.check_output(["git", *args], stderr=subprocess.DEVNULL)
        return raw.decode().strip()

    try:
        commit = run_git("rev-parse", "--short", "HEAD")
        branch = run_git("rev-parse", "--abbrev-ref", "HEAD")
    except (subprocess.CalledProcessError, OSError):
        return "unknown", "unknown"
    return branch, commit


def transform_special_blockquotes(md_text: str) -> str:
    """Replace ``> NOTE: ...`` and ``> COMMENT: ...`` lines with labelled
    HTML call-out boxes before the markdown conversion runs."""
    for marker, label, css_class in _CALLOUTS:
        md_text = re.sub(
            rf'(?m)^> {marker}:\s*(.*)',
            rf'<div class="{css_class}"><strong>{label}:</strong> \1</div>',
            md_text,
        )
    return md_text


def validate_markdown(file_path: Path, html: str):
    """Return the headings found in *html* as ``(level, text)`` string pairs.

    Raises:
        ValueError: if ``VALIDATE_STRUCTURE`` is enabled and *html* contains
            no level-1 heading.
    """
    headings = [(m.group(1), m.group(3)) for m in _HEADING_RE.finditer(html)]

    if VALIDATE_STRUCTURE and not any(level == '1' for level, _ in headings):
        raise ValueError(f"Validation failed: No <h1> heading found in {file_path}")

    return headings


def resolve_image_paths(html: str, base_path: Path) -> str:
    """Rewrite relative ``<img src>`` values to absolute ``file://`` URIs.

    Paths are resolved against *base_path* (the directory of the markdown
    file). Sources that already carry a URL scheme are left untouched, and
    all other attributes of the tag are preserved. Missing files only
    produce a warning.
    """
    def repl(match):
        src = match.group(2)
        if _REMOTE_SRC_RE.match(src):
            return match.group(0)
        abs_path = (base_path / src).resolve()
        if not abs_path.exists():
            print(f"Warning: Image file not found: {abs_path}")
        else:
            print(f"Embedding image: {src} → {abs_path}")
        return f'{match.group(1)}{abs_path.as_uri()}{match.group(3)}'

    return _IMG_SRC_RE.sub(repl, html)


def parse_toc_file(toc_path: Path) -> list[Path]:
    """Parse a markdown ToC file and extract file paths in order."""
    print(f"\nParsing ToC file: {toc_path}")
    with open(toc_path, encoding='utf-8') as f:
        toc_content = f.read()

    # Extract all markdown links: [text](path)
    link_pattern = r'\[([^\]]+)\]\(([^)]+\.md)\)'
    matches = re.findall(link_pattern, toc_content)

    file_paths = []
    toc_dir = toc_path.parent

    for _title, relative_path in matches:
        # Resolve relative to the ToC file; resolve() also collapses paths
        # like ../Document/../Document/content/file.md
        abs_path = (toc_dir / relative_path).resolve()

        if not abs_path.exists():
            print(f"  ⚠ Warning: File not found: {relative_path} → {abs_path}")
        else:
            file_paths.append(abs_path)
            print(f"  ✓ Found: {relative_path}")

    print(f"\nTotal files found: {len(file_paths)}")
    return file_paths


def scan_directory(input_dir: Path) -> list[Path]:
    """Scan directory for markdown files with numeric naming pattern."""
    print(f"\nScanning directory: {input_dir}")
    markdown_files = sorted(
        (
            p for p in input_dir.rglob('*.md')
            if re.match(r'^\d{2}', p.name)
            and re.match(r'^\d{2}', p.relative_to(input_dir).parts[0])
        ),
        key=str,
    )
    print(f"Total files found: {len(markdown_files)}")
    return markdown_files


def _number_headings(html, counters):
    """Number the headings of one HTML fragment in document order.

    *counters* is the running ``[h1, h2, h3]`` count; it is updated in place
    so numbering continues across files. Returns the rewritten HTML and the
    ToC ``<li>`` entries for this fragment.
    """
    toc_entries = []

    def repl(match):
        level, attrs, heading = match.groups()
        depth = int(level)
        if depth <= 3:
            counters[depth - 1] += 1
            # A new heading restarts the numbering of all deeper levels.
            for deeper in range(depth, 3):
                counters[deeper] = 0
            number = ".".join(str(c) for c in counters[:depth])
        else:
            number = ""

        numbered_heading = f"{number} {heading}".lstrip()
        existing_id = _ID_ATTR_RE.search(attrs)
        if existing_id:
            # Keep an id the author set explicitly so in-document links
            # to it keep working.
            anchor = existing_id.group(1)
        else:
            anchor = sanitize_heading(f"{number}_{heading}")
            attrs = f'{attrs} id="{anchor}"'

        toc_entries.append(
            f'<li class="toc-level-{level}">'
            f'<a href="#{anchor}">{numbered_heading}</a></li>'
        )
        return f"<h{level}{attrs}>{numbered_heading}</h{level}>"

    # One pass with a callback: each heading is rewritten exactly once, so
    # repeated heading texts still get their own number and anchor.
    return _HEADING_RE.sub(repl, html), toc_entries


def _build_document(toc_entries, content_blocks, current_date, branch, commit):
    """Assemble the complete HTML document that is handed to WeasyPrint."""
    return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<style>{_STYLESHEET}</style>
</head>
<body>
    <div class="watermark">DRAFT</div>
    <div class="title-page">
        <h1>AI Testing Guide</h1>
        <h2>PDF Preview</h2>
        <p>Generated on: {escape(current_date)}<br>
        From Branch: {escape(branch)}<br>
        From Commit: {escape(commit)}</p>
    </div>
    <div class="disclaimer">
        <h2>Draft Disclaimer</h2>
        <p>This PDF is an automatically generated draft intended for internal review and development purposes. It is not final, not publication-ready, and may contain formatting or structural inconsistencies.</p>
        <p>Content, structure, and layout are subject to change. For questions or feedback, please refer to the project repository or connect via Slack.</p>
    </div>
    <div class="toc">
        <h2>Table of Contents</h2>
        <ul>
            {''.join(toc_entries)}
        </ul>
    </div>
    {''.join(content_blocks)}
</body>
</html>
"""


def generate_pdf(input_path: Path, output_file: Path):
    """Render the markdown files selected by *input_path* into one PDF.

    Args:
        input_path: a ToC markdown file, or a directory to scan.
        output_file: destination PDF; parent directories are created.

    Raises:
        ValueError: if *input_path* does not exist or yields no markdown
            files (or a file fails validation, see ``validate_markdown``).
    """
    # Imported here so --help and the pure helpers work on machines where
    # WeasyPrint's native libraries are not installed.
    from markdown import markdown
    from weasyprint import HTML

    # Determine input mode: ToC file or directory
    if input_path.is_file():
        print("Mode: ToC file")
        markdown_files = parse_toc_file(input_path)
    elif input_path.is_dir():
        print("Mode: Directory scan")
        markdown_files = scan_directory(input_path)
    else:
        raise ValueError(f"Input path does not exist: {input_path}")

    if not markdown_files:
        raise ValueError("No markdown files found to process")

    toc_entries = []
    content_blocks = []
    counters = [0, 0, 0]  # running h1 / h2 / h3 numbers across all files

    print(f"\nProcessing {len(markdown_files)} files...")
    for file in markdown_files:
        print(f"  Processing: {file.name}")
        with open(file, encoding='utf-8') as f:
            raw_md = f.read()
        raw_md = transform_special_blockquotes(raw_md)
        html = markdown(raw_md, extensions=['extra', 'nl2br', 'sane_lists', 'attr_list'])
        html = resolve_image_paths(html, file.parent)
        validate_markdown(file, html)

        html, file_entries = _number_headings(html, counters)
        toc_entries.extend(file_entries)
        content_blocks.append(html)

    current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
    branch, commit = get_git_info()
    combined_html = _build_document(
        toc_entries, content_blocks, current_date, branch, commit
    )

    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nGenerating PDF: {output_file}")
    HTML(string=combined_html).write_pdf(str(output_file))
    print("✓ PDF generated successfully")


def main():
    """Parse the command line and generate the PDF."""
    parser = argparse.ArgumentParser(
        description="Generate PDF from markdown files. Supports both directory scanning and ToC file parsing."
    )
    parser.add_argument(
        "input_path",
        type=Path,
        help="Path to ToC markdown file OR root directory containing markdown files"
    )
    parser.add_argument("output_file", type=Path, help="Path to output PDF file")

    args = parser.parse_args()
    generate_pdf(args.input_path, args.output_file)


if __name__ == "__main__":
    main()