# When True, validate_markdown() raises if a rendered document has no <h1>.
VALIDATE_STRUCTURE = False


# ------------------ Configuration Management ------------------ #

def load_config(config_path: Path) -> dict:
    """Load generator settings from a KEY=VALUE text file.

    Blank lines and lines starting with ``#`` are skipped. Each remaining
    line is split on the first ``=``; key and value are stripped of
    surrounding whitespace. Only the keys present in the defaults below are
    accepted; anything else is reported and ignored.

    Args:
        config_path: Location of the configuration file.

    Returns:
        A dict with the keys PROJECT_NAME, VERSION, TOC_PATH, OUTPUT_FILE,
        COVER_IMAGE and HEADER_IMAGE. Defaults are returned unchanged when
        the file does not exist.
    """
    config = {
        'PROJECT_NAME': 'Document',
        'VERSION': 'Version 1.0',
        'TOC_PATH': 'ToC.md',
        'OUTPUT_FILE': 'output.pdf',
        'COVER_IMAGE': '',
        'HEADER_IMAGE': '',
    }

    try:
        # utf-8-sig decodes plain UTF-8 identically but drops a leading BOM,
        # which would otherwise be glued to the first key and stop it matching.
        handle = open(config_path, 'r', encoding='utf-8-sig')
    except FileNotFoundError:
        print(f"⚠ Config file not found: {config_path}")
        return config

    print(f"Loading configuration from: {config_path}")

    with handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            key, separator, value = line.partition('=')
            if not separator:
                print(f"  ⚠ Ignoring line without '=': {line}")
                continue

            key = key.strip()
            value = value.strip()
            if key not in config:
                # Surface typos instead of silently falling back to defaults.
                print(f"  ⚠ Ignoring unknown key: {key}")
                continue

            config[key] = value
            print(f"  {key} = {value}")

    return config


# ------------------ Utility functions ------------------ #

def sanitize_heading(text: str) -> str:
    """Create a safe HTML id from heading text.

    Spaces become underscores; every character outside ``[a-zA-Z0-9_-]`` is
    then removed. The result may be empty if nothing survives.
    """
    return re.sub(r'[^a-zA-Z0-9_-]', '', text.replace(' ', '_'))


def get_git_info():
    """Return ``(branch, short_commit)`` or ``('unknown', 'unknown')``.

    Git is invoked in the current working directory. The fallback is used
    when git is not installed or either command exits with an error.
    """
    def run_git(*args: str) -> str:
        # stderr is discarded so "not a git repository" noise stays off the console.
        output = subprocess.check_output(["git", *args], stderr=subprocess.DEVNULL)
        return output.decode(errors="replace").strip()

    try:
        commit = run_git("rev-parse", "--short", "HEAD")
        branch = run_git("rev-parse", "--abbrev-ref", "HEAD")
    except (subprocess.CalledProcessError, OSError):
        return "unknown", "unknown"
    return branch, commit
def transform_special_blockquotes(md_text: str) -> str:
    """Convert custom NOTE/COMMENT blockquotes in markdown to styled HTML blockquotes.

    A line of the form ``> NOTE: text`` or ``> COMMENT: text`` is replaced by
    a ``<blockquote>`` carrying the class ``note`` or ``comment``. Only the
    line holding the label is converted; every other line is left untouched.

    Args:
        md_text: Markdown source.

    Returns:
        The markdown with labelled blockquote lines replaced by HTML.
    """
    # label in the markdown -> (CSS class, visible caption)
    styles = {
        "NOTE": ("note", "Note"),
        "COMMENT": ("comment", "Comment"),
    }

    def render(match):
        css_class, caption = styles[match.group(1)]
        return (
            f'<blockquote class="{css_class}">'
            f'<strong>{caption}:</strong> {match.group(2)}'
            f'</blockquote>'
        )

    # [ \t]* rather than \s*: \s also matches newlines, which would let an
    # empty "> NOTE:" line swallow the following line into the note.
    return re.sub(r'(?m)^> (NOTE|COMMENT):[ \t]*(.*)', render, md_text)
def validate_markdown(file_path: Path, html: str, *, require_h1=None):
    """Collect the headings of a rendered document and optionally validate them.

    Args:
        file_path: Source file the HTML was rendered from; used only in the
            error message.
        html: Rendered HTML of one markdown document.
        require_h1: When true, raise if the document has no ``<h1>``. When
            left as ``None`` the module-level ``VALIDATE_STRUCTURE`` flag
            decides.

    Returns:
        A list of ``(level, inner_html)`` tuples in document order, where
        ``level`` is the heading digit as a string (``'1'``..``'6'``).

    Raises:
        ValueError: If validation is enabled and no ``<h1>`` is present.
    """
    # The optional attribute group keeps headings such as <h1 id="x"> visible:
    # the attr_list markdown extension can emit them, and missing them would
    # make the h1 check fail for a document that does have a title.
    headings = re.findall(r'<h([1-6])(?:\s[^>]*)?>(.*?)</h\1>', html)

    if require_h1 is None:
        require_h1 = VALIDATE_STRUCTURE

    if require_h1 and not any(level == '1' for level, _ in headings):
        raise ValueError(f"Validation failed: No <h1> heading found in {file_path}")

    return headings
def resolve_image_paths(html: str, base_path: Path, repo_root: Path = None) -> str:
    """Point local ``<img>`` sources at absolute ``file://`` URLs for WeasyPrint.

    Remote (``http://``, ``https://``), ``file://`` and ``data:`` sources are
    left untouched. Every other source is looked up in several locations (see
    ``candidate_paths``); the first existing file wins. Only the ``src`` value
    of the tag is rewritten, so ``alt`` and other attributes survive. A tag
    whose file cannot be found is replaced by a visible placeholder.

    Args:
        html: Rendered HTML of one markdown document.
        base_path: Directory of the markdown file the HTML came from.
        repo_root: Optional repository root used for ``/``-rooted sources.

    Returns:
        The HTML with local image sources rewritten.
    """
    # Function-scope imports: the file's top-level import block is not touched.
    from html import escape as escape_html
    from urllib.parse import unquote

    def candidate_paths(src: str) -> list:
        """List the locations to probe for *src*, in priority order."""
        candidates = []

        # 1: /Document/... is resolved from the repository root.
        if src.startswith('/Document/') and repo_root:
            candidates.append(repo_root / src.lstrip('/'))

        # 2-4: relative to the markdown file, then one and two levels up.
        candidates.append(base_path / src)
        candidates.append(base_path.parent / src)
        candidates.append(base_path.parent.parent / src)

        # 5: relative to the current working directory.
        if not src.startswith('/'):
            candidates.append(Path(src))

        # 6: with any leading dots/slashes removed, next to the markdown file.
        candidates.append(base_path / src.lstrip('./'))

        # 7: any /-rooted source, from the repository root.
        if src.startswith('/') and repo_root:
            candidates.append(repo_root / src.lstrip('/'))

        return candidates

    def find_file(src: str):
        """Return the first existing file for *src*, plus the number of paths tried."""
        variants = [src]
        decoded = unquote(src)
        if decoded != src:
            # Markdown links often carry %20 etc.; the file on disk does not.
            variants.append(decoded)

        tried = 0
        for variant in variants:
            for candidate in candidate_paths(variant):
                tried += 1
                try:
                    resolved = candidate.resolve()
                    # is_file(): a directory with a matching name is not an image.
                    if resolved.is_file():
                        return resolved, tried
                except (OSError, ValueError):
                    continue
        return None, tried

    def repl(match):
        full_tag = match.group(0)
        src = match.group('src')

        if src.startswith(('http://', 'https://', 'file://', 'data:')):
            return full_tag

        abs_path, tried = find_file(src)

        if abs_path is None:
            print(f"⚠ Warning: Image file not found: {src}")
            print(f"   Searched from: {base_path}")
            if repo_root:
                print(f"   Repository root: {repo_root}")
            print(f"   Tried {tried} possible paths")
            return f'<div class="missing-image">⚠ Missing image: {escape_html(src)}</div>'

        print(f"✓ Embedding image: {src} → {abs_path}")
        # as_uri() percent-encodes the path and is correct on Windows as well.
        start = match.start('src') - match.start()
        end = match.end('src') - match.start()
        return full_tag[:start] + abs_path.as_uri() + full_tag[end:]

    image_tag = r'<img\b[^>]*?(?<![\w-])src=(["\'])(?P<src>[^"\'>]+)\1[^>]*>'
    return re.sub(image_tag, repl, html)
# GitHub "blob" URLs with this prefix are mapped onto the local checkout.
_GITHUB_BLOB_PREFIX = "https://github.com/OWASP/www-project-ai-testing-guide/blob/main/"

# [title](target.md) markdown links.
_TOC_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+\.md)\)')


def parse_toc_file(toc_path: Path, repo_root: Path = None,
                   github_prefix: str = _GITHUB_BLOB_PREFIX):
    """Collect the markdown files linked from a table-of-contents file.

    Links whose target starts with *github_prefix* are mapped into
    *repo_root*; all other targets are resolved relative to the ToC file.
    Targets that do not exist locally are reported and skipped, and a target
    listed more than once is returned only once.

    Args:
        toc_path: The ToC markdown file.
        repo_root: Local checkout used for GitHub links. Defaults to the
            ``www-project-ai-testing-guide`` directory next to this script.
        github_prefix: URL prefix identifying links into that checkout.

    Returns:
        A list of ``(href, absolute_path)`` tuples in ToC order, where
        ``href`` is the link target exactly as written in the ToC.
    """
    from urllib.parse import unquote

    print(f"\nParsing ToC file: {toc_path}")
    toc_content = toc_path.read_text(encoding="utf-8")
    toc_dir = toc_path.parent

    if repo_root is None:
        # Local clone of the repository, expected next to the script.
        repo_root = (Path(__file__).parent / "www-project-ai-testing-guide").resolve()

    def locate(href: str) -> Path:
        if href.startswith(github_prefix):
            base, relative = repo_root, href[len(github_prefix):]
        else:
            base, relative = toc_dir, href
        path = (base / relative).resolve()
        if not path.exists() and unquote(relative) != relative:
            # Links may be percent-encoded ("My%20Doc.md") while the file is not.
            path = (base / unquote(relative)).resolve()
        return path

    file_refs = []
    seen = set()
    for _title, href in _TOC_LINK_PATTERN.findall(toc_content):
        if href in seen:
            # A repeated link would render the same document twice under one anchor.
            continue
        seen.add(href)

        abs_path = locate(href)
        if abs_path.exists():
            print(f"  ✓ Local file: {href} → {abs_path}")
            file_refs.append((href, abs_path))
        else:
            print(f"  ⚠ Missing file: {href} → {abs_path}")

    print(f"Total entries found: {len(file_refs)}")
    return file_refs


def scan_directory(input_dir: Path):
    """Fallback: recursively collect the ``.md`` files below a directory.

    Returns:
        A list of ``(str(path), path)`` tuples sorted by the path string.
    """
    print(f"\nScanning directory: {input_dir}")
    markdown_files = sorted(input_dir.rglob('*.md'), key=str)
    print(f"Total files found: {len(markdown_files)}")
    return [(str(path), path) for path in markdown_files]


def rewrite_links_to_anchors(html: str, link_map: dict[str, str]) -> str:
    """Replace ``href="URL"`` with ``href="#anchor"`` for URLs in *link_map*.

    A URL carrying a ``#fragment`` that is not itself in the map falls back
    to the entry for the URL without the fragment, so the link lands on the
    start of that document. All other links are left unchanged.
    """
    def repl(match):
        href = match.group(1)
        anchor = link_map.get(href)
        if anchor is None and '#' in href:
            anchor = link_map.get(href.split('#', 1)[0])
        if anchor is None:
            return match.group(0)
        return f'href="#{anchor}"'

    return re.sub(r'href="([^"]+)"', repl, html)
# Markdown extensions used for both the ToC and the documents.
_MARKDOWN_EXTENSIONS = ['extra', 'nl2br', 'sane_lists', 'attr_list']

# Headings without attributes; these are the ones that still need an id.
_PLAIN_HEADING_PATTERN = re.compile(r'<h([1-6])>(.*?)</h\1>')

# Stylesheet template; filled with str.format, hence the doubled braces.
_PDF_CSS_TEMPLATE = """
@page {{
    size: A4;
    margin: 2.5cm 2.5cm 2cm 2.5cm;

    @top-left {{
        content: "{project_name}";
        font-size: 10pt;
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
        color: #103595;
        font-weight: 500;
        vertical-align: bottom;
        padding: 0.7cm 0.0cm;  /* Padding for badge effect */
        border-radius: 0px;  /* Rounded corners */
    }}

    @top-center {{
        content: "";
    }}

    @top-right {{
        content: {header_content};
        vertical-align: bottom;
        padding-bottom: 0.3cm;
    }}

    @bottom-left {{
        content: "{version}";
        font-size: 8pt;
        color: #666;
        vertical-align: top;
        padding-top: 0.2cm;
    }}

    @bottom-right {{
        content: "Page " counter(page) " of " counter(pages);
        font-size: 8pt;
        color: #666;
        vertical-align: top;
        padding-top: 0.2cm;
    }}
}}

@page cover {{
    size: A4;
    margin: 0 0 -0.5cm 0;
    background: none;

    @top-left     {{ content: none; }}
    @top-center   {{ content: none; }}
    @top-right    {{ content: none; }}
    @bottom-left  {{ content: none; }}
    @bottom-center{{ content: none; }}
    @bottom-right {{ content: none; }}
}}

body {{
    font-family: 'Georgia', 'Times New Roman', serif;
    font-size: 11pt;
    line-height: 1.6;
    color: #333;
    margin: 0;
}}

.cover {{
    page: cover;
    width: 100%;
    height: 100vh;
    page-break-after: always;
    display: flex;
    align-items: center;
    justify-content: center;
}}

.cover-image {{
    width: 100%;
    height: 100%;
    object-fit: contain;
    object-position: center center;
    margin: 0;
    padding: 0;
}}

.main {{
    margin: 0 auto;
    max-width: 75ch;
}}

.toc {{
    page-break-after: always;
}}

/* Professional color scheme: Deep Blue for h1, Medium Blue for h2, Dark Gray for h3 */

h1, h2, h3, h4, h5, h6 {{
    font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
    font-weight: 600;
    line-height: 1.3;
    page-break-after: avoid;
}}

h1 {{
    font-size: 1.8em;  /* Reduced from 2.4em */
    color: #1e3a5f;  /* Deep Blue */
    margin-top: 2em;
    margin-bottom: 0.6em;
    padding-bottom: 0.3em;
    border-bottom: 2px solid #2c5aa0;
    page-break-before: always;
    page-break-after: avoid;
    letter-spacing: -0.02em;
}}

h2 {{
    font-size: 1.4em;  /* Reduced from 1.8em */
    color: #2c5aa0;  /* Medium Blue */
    margin-top: 1.5em;
    margin-bottom: 0.5em;
    page-break-after: avoid;
    letter-spacing: -0.01em;
}}

h3 {{
    font-size: 1.15em;  /* Reduced from 1.4em */
    color: #34495e;  /* Dark Gray */
    margin-top: 1.2em;
    margin-bottom: 0.4em;
    page-break-after: avoid;
    font-weight: 600;
}}

h4 {{
    font-size: 1.05em;
    color: #555;
    margin-top: 1em;
    margin-bottom: 0.3em;
    page-break-after: avoid;
    font-weight: 600;
}}

p {{
    text-align: justify;
    hyphens: auto;
    margin: 0.8em 0;
}}

p, li, table, blockquote {{
    orphans: 2;
    widows: 2;
}}

/* Image styling */
img {{
    display: block;
    margin: 1.5em auto;
    max-width: 100%;
    height: auto;
    border: 1px solid #e0e0e0;
    padding: 0.5em;
    background: #fafafa;
}}

.missing-image {{
    display: block;
    margin: 1.5em auto;
    padding: 1em;
    background: #fff3cd;
    border: 1px solid #ffc107;
    color: #856404;
    text-align: center;
    font-style: italic;
}}

/* Lists */
ul, ol {{
    margin: 0.8em 0;
    padding-left: 2em;
}}

li {{
    margin: 0.3em 0;
}}

/* Code blocks */
code {{
    font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
    font-size: 0.85em;
    background-color: #f5f5f5;
    padding: 0.2em 0.4em;
    border-radius: 3px;
    word-wrap: break-word;
}}

pre {{
    background-color: #f5f5f5;
    border: 1px solid #ddd;
    border-left: 3px solid #2c5aa0;
    padding: 0.8em;
    overflow-x: auto;
    overflow-wrap: break-word;
    word-wrap: break-word;
    white-space: pre-wrap;
    margin: 1em 0;
    page-break-inside: avoid;
    font-size: 0.85em;
    line-height: 1.4;
}}

pre code {{
    background: none;
    padding: 0;
    word-wrap: break-word;
    white-space: pre-wrap;
}}

/* Tables */
table {{
    width: 100%;
    border-collapse: collapse;
    margin: 1.5em 0;
    font-size: 10pt;
    table-layout: auto;
}}

th {{
    background-color: #2c5aa0;
    color: white;
    font-weight: 600;
    border: 1px solid #1e3a5f;
    padding: 8px 10px;
    text-align: left;
    vertical-align: top;
}}

td {{
    border: 1px solid #ddd;
    padding: 6px 10px;
    text-align: left;
    vertical-align: top;
    word-wrap: break-word;
    white-space: normal;
}}

tr:nth-child(even) {{
    background-color: #f9f9f9;
}}

/* Blockquotes */
blockquote {{
    background-color: #f5f5f5;
    border-left: 4px solid #2c5aa0;
    padding: 0.8em 1.2em;
    margin: 1em 0;
    font-size: 0.95em;
    font-style: italic;
}}

blockquote.note {{
    background-color: #fff8dc;
    border-left: 4px solid #f0ad4e;
}}

blockquote.comment {{
    background-color: #e7f3ff;
    border-left: 4px solid #2c5aa0;
}}

/* Links */
a {{
    color: #2c5aa0;
    text-decoration: none;
}}

a:hover {{
    text-decoration: underline;
}}

/* Horizontal rules */
hr {{
    border: none;
    border-top: 1px solid #ddd;
    margin: 2em 0;
}}
"""

# Page skeleton: cover page, ToC page, then the documents.
_PDF_HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<style>{css}</style>
</head>
<body>
<div class="cover">
  {cover_img}
</div>
<div class="main">
<div class="toc">
  {toc_html}
</div>
  {content}
</div>
</body>
</html>
"""


def _find_repo_root(input_path: Path):
    """Return the nearest ancestor containing a ``Document`` folder, else the bundled clone, else None."""
    start = input_path.parent if input_path.is_file() else input_path
    current = start.resolve()

    while current != current.parent:
        if (current / "Document").exists():
            print(f">>> Repository root detected: {current}")
            return current
        current = current.parent

    # Fallback: a clone of the repository sitting next to this script.
    bundled = Path(__file__).parent / "www-project-ai-testing-guide"
    if bundled.exists():
        bundled = bundled.resolve()
        print(f">>> Repository root (from script): {bundled}")
        return bundled
    return None


def _locate_asset(configured: str, default_names):
    """Return ``(path, found)`` for an image given by config or by default file names."""
    script_dir = Path(__file__).parent
    if configured:
        # As given (relative to the working directory), then relative to the script.
        candidates = [Path(configured).resolve(), (script_dir / configured).resolve()]
    else:
        candidates = [(script_dir / name).resolve() for name in default_names]

    for candidate in candidates:
        if candidate.exists():
            return candidate, True
    return candidates[-1], False


def _css_string(text: str) -> str:
    """Escape *text* for use inside a double-quoted CSS string."""
    escaped = text.replace("\\", "\\\\").replace('"', '\\"')
    escaped = escaped.replace("\r\n", "\n").replace("\r", "\n")
    return escaped.replace("\n", "\\a ")


def _add_heading_ids(html: str, used_ids: set) -> str:
    """Give every attribute-less heading an id that is unique within *used_ids*."""
    def repl(match):
        level, inner = match.group(1), match.group(2)
        base = sanitize_heading(inner) or "section"
        anchor = base
        suffix = 2
        while anchor in used_ids:
            anchor = f"{base}-{suffix}"
            suffix += 1
        used_ids.add(anchor)
        return f'<h{level} id="{anchor}">{inner}</h{level}>'

    return _PLAIN_HEADING_PATTERN.sub(repl, html)


def generate_pdf(input_path: Path, output_file: Path, project_name: str = "Document",
                 version: str = "Version 1.0", cover_image_path: str = "",
                 header_image_path: str = ""):
    """Render a set of markdown documents into a single PDF.

    Args:
        input_path: Either a ToC markdown file (its ``.md`` links define the
            documents and their order) or a directory scanned for ``.md`` files.
        output_file: Destination PDF; parent directories are created.
        project_name: Text shown in the top-left page margin.
        version: Text shown in the bottom-left page margin.
        cover_image_path: Cover image; looked up as given, then next to this
            script. When empty, ``Cover.png`` next to the script is used.
        header_image_path: Image for the top-right page margin; same lookup.
            When empty, ``header-bg.png`` then ``header-bg.jpg`` next to the
            script are tried.

    Raises:
        ValueError: If *input_path* is neither a file nor a directory, or no
            markdown documents were found.
    """
    print(">>> PDF Generator - Final Version with Config Support")
    print(f">>> Running from: {__file__}")
    print(f">>> Project: {project_name}")
    print(f">>> Version: {version}")

    repo_root = _find_repo_root(input_path)

    # 1) Read ToC or scan directory
    if input_path.is_file():
        print("Mode: ToC file")
        toc_md = transform_special_blockquotes(input_path.read_text(encoding="utf-8"))
        toc_html = markdown(toc_md, extensions=_MARKDOWN_EXTENSIONS)
        file_entries = parse_toc_file(input_path)
    elif input_path.is_dir():
        print("Mode: Directory scan")
        file_entries = scan_directory(input_path)
        toc_html = "<h1>Table of Contents</h1>"
    else:
        raise ValueError(f"Input path does not exist: {input_path}")

    if not file_entries:
        raise ValueError("No markdown files found to process")

    # A document listed twice would be rendered twice under the same id.
    file_entries = list(dict(file_entries).items())

    # 2) Build link map: href -> doc anchor id (doc1, doc2, ...)
    link_map: dict[str, str] = {
        href: f"doc{index}" for index, (href, _) in enumerate(file_entries, start=1)
    }
    toc_html = rewrite_links_to_anchors(toc_html, link_map)

    # Ids already taken in the combined page; heading ids must not collide
    # with the document anchors or with each other.
    used_ids = set(link_map.values())
    content_blocks: list[str] = []

    print(f"\nProcessing {len(file_entries)} files...")
    for href, ref in file_entries:
        doc_anchor = link_map[href]
        print(f"\n  Processing [{doc_anchor}]: {ref}")

        # ref is always a local Path
        raw_md = transform_special_blockquotes(ref.read_text(encoding="utf-8"))
        html = markdown(raw_md, extensions=_MARKDOWN_EXTENSIONS)

        # Local images -> file:// URLs
        html = resolve_image_paths(html, ref.parent, repo_root)
        html = rewrite_links_to_anchors(html, link_map)

        # Raises when structure validation is enabled and fails.
        validate_markdown(ref, html)
        html = _add_heading_ids(html, used_ids)

        # Wrap each document so ToC links have a target.
        content_blocks.append(f'<div id="{doc_anchor}">{html}</div>')

    # Cover image
    cover_path, cover_found = _locate_asset(cover_image_path, ["Cover.png"])
    if cover_found:
        # as_uri() yields a valid, percent-encoded file URL on every platform.
        cover_img_html = f'<img class="cover-image" src="{cover_path.as_uri()}" />'
        print(f"✓ Cover image found: {cover_path}")
    else:
        print(f"⚠ Cover image not found at {cover_path}, generating PDF without image cover.")
        cover_img_html = ""

    # Header logo
    header_bg_path, header_found = _locate_asset(
        header_image_path, ["header-bg.png", "header-bg.jpg"]
    )
    if header_found:
        header_content = f'url("{header_bg_path.as_uri()}")'
        print(f"✓ Header image found: {header_bg_path}")
    else:
        print(f"⚠ Header image not found at {header_bg_path}, generating header without logo.")
        header_content = '""'

    css = _PDF_CSS_TEMPLATE.format(
        header_content=header_content,
        # Escaped so a quote or backslash cannot break the stylesheet.
        project_name=_css_string(project_name),
        version=_css_string(version),
    )

    combined_html = _PDF_HTML_TEMPLATE.format(
        css=css,
        cover_img=cover_img_html,
        toc_html=toc_html or "",
        content="".join(content_blocks),
    )

    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n{'='*60}")
    print(f"Generating PDF: {output_file}")
    print(f"{'='*60}")

    HTML(string=combined_html, base_url=str(Path(__file__).parent)).write_pdf(str(output_file))

    print("\n✓ PDF generated successfully!")
    print(f"  Output: {output_file}")
    print(f"  Size: {output_file.stat().st_size / 1024:.1f} KB")
# ------------------ CLI entrypoint ------------------ #

def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser."""
    parser = argparse.ArgumentParser(
        description="Generate PDF from markdown files with optional configuration file support."
    )
    parser.add_argument(
        "--config",
        "-c",
        type=Path,
        help="Path to configuration file (config.txt)",
    )
    parser.add_argument(
        "input_path",
        type=Path,
        nargs='?',
        help="Path to ToC markdown file OR root directory containing markdown files (overrides config)",
    )
    parser.add_argument(
        "output_file",
        type=Path,
        nargs='?',
        help="Path to output PDF file (overrides config)",
    )
    return parser


def _path_from_config(value: str, config_file: Path) -> Path:
    """Resolve a path from the config: as given if it exists, else next to the config file."""
    candidate = Path(value)
    if candidate.is_absolute() or candidate.exists():
        return candidate
    beside_config = config_file.parent / candidate
    # Fall back to the raw value so later messages show what the user wrote.
    return beside_config if beside_config.exists() else candidate


def main(argv=None) -> None:
    """Parse the command line and generate the PDF.

    With ``--config`` the settings come from the config file and the two
    positional arguments, when given, override TOC_PATH and OUTPUT_FILE.
    Without it both positional arguments are required.

    Args:
        argv: Argument list; ``None`` means ``sys.argv[1:]``.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    if args.config:
        config = load_config(args.config)

        # Command-line arguments win over the config file.
        input_path = args.input_path or _path_from_config(config['TOC_PATH'], args.config)
        output_file = args.output_file or Path(config['OUTPUT_FILE'])
        project_name = config['PROJECT_NAME']
        version = config['VERSION']
        cover_image = config['COVER_IMAGE']
        header_image = config['HEADER_IMAGE']
        if cover_image:
            cover_image = str(_path_from_config(cover_image, args.config))
        if header_image:
            header_image = str(_path_from_config(header_image, args.config))
    else:
        # Traditional mode: require both arguments
        if not args.input_path or not args.output_file:
            parser.error("input_path and output_file are required when not using --config")

        input_path = args.input_path
        output_file = args.output_file
        project_name = "Document"
        version = "Version 1.0"
        cover_image = ""
        header_image = ""

    try:
        generate_pdf(input_path, output_file, project_name, version, cover_image, header_image)
    except ValueError as err:
        # Missing input / no documents: report it without a traceback.
        raise SystemExit(f"Error: {err}") from err


if __name__ == "__main__":
    main()