#!/usr/bin/env python3
"""Render a set of Markdown files into a single numbered PDF.

The input is either a Markdown table-of-contents file (every ``[text](x.md)``
link is followed, in order) or a directory that is scanned for files whose
names start with two digits.  Headings are numbered (1, 1.1, 1.1.1), a table
of contents is generated, and the result is rendered with WeasyPrint.

NOTE(review): the HTML markup inside the string literals of this file was
reconstructed; tag structure, class names and the stylesheet are a best
effort and should be compared against the project's intended layout.
"""
import argparse
import re
import subprocess
from datetime import datetime, timezone
from html import escape
from pathlib import Path

from markdown import markdown
from weasyprint import HTML

# When True, every input file must contain at least one <h1> heading.
VALIDATE_STRUCTURE = False

# <hN ...>text</hN>; groups: level digit, raw attribute text (may be None),
# inner HTML of the heading.
_HEADING_RE = re.compile(r'<h([1-6])(\s[^>]*)?>(.*?)</h\1>')

# <img ... src="..."> with the src value captured.
_IMG_SRC_RE = re.compile(r'<img\b[^>]*?\bsrc="([^"]+)"[^>]*>')

# URLs that already carry a scheme (http:, data:, file:, ...) or are
# protocol-relative.  The scheme must be at least two characters long so that
# Windows drive letters ("C:\...") are not mistaken for one.
_URL_SCHEME_RE = re.compile(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]+:|//)')

# Minimal print stylesheet (reconstructed, see module docstring).
_PAGE_CSS = """
@page { size: A4; margin: 2cm; }
body { font-family: sans-serif; line-height: 1.4; }
img { max-width: 100%; }
.cover { text-align: center; page-break-after: always; }
.draft-banner { color: #b00020; font-size: 2em; font-weight: bold; }
.disclaimer, .toc { page-break-after: always; }
.toc ul { list-style: none; padding-left: 0; }
.toc-level-2 { margin-left: 1.5em; }
.toc-level-3 { margin-left: 3em; }
.toc-level-4, .toc-level-5, .toc-level-6 { margin-left: 4.5em; }
.toc a { color: inherit; text-decoration: none; }
.note, .comment {
    border-left: 4px solid #888;
    background: #f5f5f5;
    margin: 1em 0;
    padding: 0.5em 1em;
}
"""


def sanitize_heading(text):
    """Turn heading text into an anchor id.

    Spaces become underscores; every character that is not an ASCII letter,
    digit, underscore or hyphen is dropped.
    """
    return re.sub(r'[^a-zA-Z0-9_-]', '', text.replace(' ', '_'))


def get_git_info():
    """Return ``(branch, short_commit)`` for the current working directory.

    Falls back to ``("unknown", "unknown")`` when git is not installed or the
    command fails (for example outside a repository).
    """
    try:
        commit = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"]
        ).decode().strip()
        branch = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"]
        ).decode().strip()
    except (subprocess.CalledProcessError, OSError):
        # Best effort only: the PDF is still useful without git metadata.
        return "unknown", "unknown"
    return branch, commit


def transform_special_blockquotes(md_text: str) -> str:
    """Convert ``> NOTE:`` and ``> COMMENT:`` lines into styled HTML blocks.

    Only the single line carrying the marker is converted; the rest of the
    line becomes the body of the block.
    """
    md_text = re.sub(
        r'(?m)^> NOTE:\s*(.*)',
        r'<div class="note"><strong>Note:</strong> \1</div>',
        md_text,
    )
    md_text = re.sub(
        r'(?m)^> COMMENT:\s*(.*)',
        r'<div class="comment"><strong>Comment:</strong> \1</div>',
        md_text,
    )
    return md_text


def validate_markdown(file_path: Path, html: str):
    """Return the headings found in *html* as ``(level, inner_html)`` pairs.

    *level* is the heading digit as a string (``'1'`` .. ``'6'``).

    Raises:
        ValueError: if ``VALIDATE_STRUCTURE`` is enabled and the document
            contains no level-1 heading.
    """
    headings = [
        (match.group(1), match.group(3))
        for match in _HEADING_RE.finditer(html)
    ]
    if VALIDATE_STRUCTURE and not any(level == '1' for level, _ in headings):
        raise ValueError(
            f"Validation failed: No <h1> heading found in {file_path}"
        )
    return headings


def resolve_image_paths(html: str, base_path: Path) -> str:
    """Rewrite relative ``<img src>`` values to absolute ``file://`` URIs.

    Relative paths are resolved against *base_path* (the directory of the
    Markdown file).  Sources that already carry a URL scheme are left alone.
    All other attributes of the tag are preserved.
    """
    def repl(match):
        src = match.group(1)
        if _URL_SCHEME_RE.match(src):
            # Remote or inline image: nothing to resolve on disk.
            return match.group(0)
        abs_path = (base_path / src).resolve()
        if not abs_path.exists():
            print(f"Warning: Image file not found: {abs_path}")
        else:
            print(f"Embedding image: {src} → {abs_path}")
        # Swap only the src value so alt/width/class attributes survive.
        tag = match.group(0)
        start = match.start(1) - match.start()
        end = match.end(1) - match.start()
        return f"{tag[:start]}{abs_path.as_uri()}{tag[end:]}"

    return _IMG_SRC_RE.sub(repl, html)


def parse_toc_file(toc_path: Path) -> list[Path]:
    """Parse a markdown ToC file and extract file paths in order."""
    print(f"\nParsing ToC file: {toc_path}")
    toc_content = toc_path.read_text(encoding='utf-8')

    # Extract all markdown links: [text](path)
    link_pattern = r'\[([^\]]+)\]\(([^)]+\.md)\)'
    matches = re.findall(link_pattern, toc_content)

    file_paths = []
    toc_dir = toc_path.parent
    for _title, relative_path in matches:
        # Resolve relative to the ToC file; resolve() also cleans up paths
        # like ../Document/../Document/content/file.md
        abs_path = (toc_dir / relative_path).resolve()
        if not abs_path.exists():
            print(f" ⚠ Warning: File not found: {relative_path} → {abs_path}")
        else:
            file_paths.append(abs_path)
            print(f" ✓ Found: {relative_path}")

    print(f"\nTotal files found: {len(file_paths)}")
    return file_paths


def scan_directory(input_dir: Path) -> list[Path]:
    """Scan directory for markdown files with numeric naming pattern.

    A file is kept when both its own name and the first path component below
    *input_dir* start with two digits.  Results are sorted by full path.
    """
    print(f"\nScanning directory: {input_dir}")
    markdown_files = sorted(
        (
            p for p in input_dir.rglob('*.md')
            if re.match(r'^\d{2}', p.name)
            and re.match(r'^\d{2}', p.relative_to(input_dir).parts[0])
        ),
        key=str,
    )
    print(f"Total files found: {len(markdown_files)}")
    return markdown_files


def _number_headings(html: str, counters: list[int], toc_entries: list[str]) -> str:
    """Number the headings of *html* and record a ToC entry for each.

    *counters* holds the running h1/h2/h3 counts and is updated in place so
    numbering continues across files.  Headings deeper than level 3 are not
    numbered.  Each heading is handled exactly once, in document order, so
    repeated heading texts receive distinct numbers.
    """
    def repl(match):
        level = match.group(1)
        attrs = match.group(2) or ""
        heading = match.group(3)

        depth = int(level)
        if depth <= 3:
            counters[depth - 1] += 1
            # Entering a new section resets all deeper counters.
            counters[depth:] = [0] * (3 - depth)
            number = ".".join(str(count) for count in counters[:depth])
        else:
            number = ""

        numbered_heading = f"{number} {heading}" if number else heading

        # Reuse an id set explicitly in the Markdown (attr_list extension)
        # instead of emitting a second, conflicting id attribute.
        existing_id = re.search(r'\bid="([^"]+)"', attrs)
        if existing_id:
            anchor = existing_id.group(1)
        else:
            anchor = sanitize_heading(f"{number}_{heading}")
            attrs = f' id="{anchor}"{attrs}'

        toc_entries.append(
            f'<li class="toc-level-{level}">'
            f'<a href="#{anchor}">{numbered_heading}</a></li>'
        )
        return f"<h{level}{attrs}>{numbered_heading}</h{level}>"

    return _HEADING_RE.sub(repl, html)


def generate_pdf(input_path: Path, output_file: Path):
    """Build one PDF from the Markdown files referenced by *input_path*.

    Args:
        input_path: a ToC markdown file, or a directory to scan.
        output_file: destination PDF; parent directories are created.

    Raises:
        ValueError: if *input_path* does not exist, no markdown files are
            found, or structure validation is enabled and fails.
    """
    # Determine input mode: ToC file or directory
    if input_path.is_file():
        print("Mode: ToC file")
        markdown_files = parse_toc_file(input_path)
    elif input_path.is_dir():
        print("Mode: Directory scan")
        markdown_files = scan_directory(input_path)
    else:
        raise ValueError(f"Input path does not exist: {input_path}")

    if not markdown_files:
        raise ValueError("No markdown files found to process")

    toc_entries = []
    content_blocks = []
    counters = [0, 0, 0]  # running h1 / h2 / h3 numbers across all files

    print(f"\nProcessing {len(markdown_files)} files...")
    for file in markdown_files:
        print(f" Processing: {file.name}")
        raw_md = transform_special_blockquotes(file.read_text(encoding='utf-8'))
        html = markdown(
            raw_md, extensions=['extra', 'nl2br', 'sane_lists', 'attr_list']
        )
        html = resolve_image_paths(html, file.parent)
        validate_markdown(file, html)
        content_blocks.append(_number_headings(html, counters, toc_entries))

    current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
    branch, commit = get_git_info()

    combined_html = f"""<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>AI Testing Guide</title>
  <style>{_PAGE_CSS}</style>
</head>
<body>
  <div class="cover">
    <p class="draft-banner">DRAFT</p>
    <h1>AI Testing Guide</h1>
    <h2>PDF Preview</h2>
    <p>
    Generated on: {current_date}<br>
    From Branch: {escape(branch)}<br>
    From Commit: {escape(commit)}
    </p>
  </div>
  <div class="disclaimer">
    <h2>Draft Disclaimer</h2>
    <p>This PDF is an automatically generated draft intended for internal review and development purposes. It is not final, not publication-ready, and may contain formatting or structural inconsistencies.</p>
    <p>Content, structure, and layout are subject to change. For questions or feedback, please refer to the project repository or connect via Slack.</p>
  </div>
  <div class="toc">
    <h2>Table of Contents</h2>
    <ul>
      {''.join(toc_entries)}
    </ul>
  </div>
    {''.join(content_blocks)}
</body>
</html>
"""

    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nGenerating PDF: {output_file}")
    HTML(string=combined_html).write_pdf(str(output_file))
    print("✓ PDF generated successfully")


def main() -> None:
    """Parse command-line arguments and generate the PDF."""
    parser = argparse.ArgumentParser(
        description="Generate PDF from markdown files. Supports both directory scanning and ToC file parsing."
    )
    parser.add_argument(
        "input_path",
        type=Path,
        help="Path to ToC markdown file OR root directory containing markdown files",
    )
    parser.add_argument("output_file", type=Path, help="Path to output PDF file")
    args = parser.parse_args()
    generate_pdf(args.input_path, args.output_file)


if __name__ == "__main__":
    main()