fuzzforge_ai/.github/workflows/benchmark.yml
tduhamel42 11b3e6db6a fix: Resolve CI failures for v0.7.0 release
Fix lint errors:
- Remove unused Optional import from gitleaks workflow
- Remove unused logging import from trufflehog activities

Fix documentation broken links:
- Update workspace-isolation links to use /docs/ prefix in resource-management.md
- Update workspace-isolation links to use /docs/ prefix in create-workflow.md

Fix benchmark dependency:
- Add fuzzforge-sdk installation to benchmark workflow
- SDK is required for bench_comparison.py import

All CI checks should now pass.
2025-10-16 12:55:20 +02:00

name: Benchmarks

on:
  # Run on schedule (nightly)
  schedule:
    - cron: '0 2 * * *'  # 2 AM UTC every day

  # Allow manual trigger
  workflow_dispatch:
    inputs:
      compare_with:
        description: 'Baseline commit to compare against (optional)'
        required: false
        default: ''

  # Run on PR when benchmarks are modified
  pull_request:
    paths:
      - 'backend/benchmarks/**'
      - 'backend/toolbox/modules/**'
      - '.github/workflows/benchmark.yml'
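
# Note: the compare_with input above is not referenced by any step below;
# baseline comparison is driven by base-branch artifacts instead.
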
jobs:
  benchmark:
    name: Run Benchmarks
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Fetch all history for comparison

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential
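          # build-essential provides a C toolchain in case any Python
          # dependencies have to compile native extensions from source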

      - name: Install Python dependencies
        working-directory: ./backend
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[dev]"
          pip install pytest pytest-asyncio "pytest-benchmark[histogram]"
          pip install -e ../sdk  # Install SDK for benchmarks
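
      # --benchmark-only collects and runs only benchmark tests; the JSON
      # output feeds the comparison step below, and an SVG histogram is
      # rendered (the exact output filename is assumed by the upload step).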
      - name: Run benchmarks
        working-directory: ./backend
        run: |
          pytest benchmarks/ \
            -v \
            --benchmark-only \
            --benchmark-json=benchmark-results.json \
            --benchmark-histogram=benchmark-histogram
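
      # Results are uploaded per run number; PR runs later fetch these from
      # the base branch to use as a comparison baseline.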
      - name: Store benchmark results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ github.run_number }}
          path: |
            backend/benchmark-results.json
            backend/benchmark-histogram.svg
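
      # continue-on-error keeps the job going when no baseline artifact
      # exists yet (e.g. the first run against a new base branch).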
      - name: Download baseline benchmarks
        if: github.event_name == 'pull_request'
        uses: dawidd6/action-download-artifact@v3
        continue-on-error: true
        with:
          workflow: benchmark.yml
          branch: ${{ github.base_ref }}
          name: benchmark-results-*
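          # NOTE (assumption): a wildcard here depends on the action's
          # name-matching semantics; dawidd6/action-download-artifact may
          # require its name_is_regexp option for non-literal names.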
          path: ./baseline
          search_artifacts: true
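
      # hashFiles() returns '' when the file is absent, so the comparison is
      # skipped cleanly if the baseline download found nothing.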
      - name: Compare with baseline
        if: github.event_name == 'pull_request' && hashFiles('baseline/benchmark-results.json') != ''
        run: |
          python -c "
          import json
          import sys

          with open('backend/benchmark-results.json') as f:
              current = json.load(f)
          with open('baseline/benchmark-results.json') as f:
              baseline = json.load(f)

          print('\\n## Benchmark Comparison\\n')
          print('| Benchmark | Current | Baseline | Change |')
          print('|-----------|---------|----------|--------|')

          regressions = []
          for bench in current['benchmarks']:
              name = bench['name']
              current_time = bench['stats']['mean']
              # Find matching baseline
              baseline_bench = next((b for b in baseline['benchmarks'] if b['name'] == name), None)
              if baseline_bench:
                  baseline_time = baseline_bench['stats']['mean']
                  change = ((current_time - baseline_time) / baseline_time) * 100
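                  # positive change means the current run is slower than baseline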
                  print(f'| {name} | {current_time:.4f}s | {baseline_time:.4f}s | {change:+.2f}% |')
                  # Flag regressions > 10%
                  if change > 10:
                      regressions.append((name, change))
              else:
                  print(f'| {name} | {current_time:.4f}s | N/A | NEW |')

          if regressions:
              print('\\n⚠ **Performance Regressions Detected:**')
              for name, change in regressions:
                  print(f'- {name}: +{change:.2f}%')
              sys.exit(1)
          else:
              print('\\n✅ No significant performance regressions detected')
          "
      - name: Comment PR with results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const results = JSON.parse(fs.readFileSync('backend/benchmark-results.json', 'utf8'));

            let body = '## Benchmark Results\n\n';
            body += '| Category | Benchmark | Mean Time | Std Dev |\n';
            body += '|----------|-----------|-----------|---------|\n';

            for (const bench of results.benchmarks) {
              const group = bench.group || 'ungrouped';
              const name = bench.name.split('::').pop();
              const mean = bench.stats.mean.toFixed(4);
              const stddev = bench.stats.stddev.toFixed(4);
              body += `| ${group} | ${name} | ${mean}s | ${stddev}s |\n`;
            }

            body += '\n📊 Full benchmark results available in artifacts.';

            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });
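
  # Aggregate gate: runs even if the benchmark job was skipped or cancelled
  # (if: always()) and fails unless it succeeded.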
  benchmark-summary:
    name: Benchmark Summary
    runs-on: ubuntu-latest
    needs: benchmark
    if: always()
    steps:
      - name: Check results
        run: |
          if [ "${{ needs.benchmark.result }}" != "success" ]; then
            echo "Benchmarks failed or detected regressions"
            exit 1
          fi
          echo "Benchmarks completed successfully!"