mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-08 14:34:49 +02:00
2e75c33714
Planted-bug outcome evals (b6/b7/b8) require an LLM agent to find bugs in test pages, which is inherently non-deterministic. Lower minimum_detection from 3 to 2, increase maxTurns from 40 to 50, and add more explicit prompting for a thorough testing methodology. LLM judge thresholds are lowered to account for score variance on the setup-block and QA-completeness evaluations. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
44 lines
1.6 KiB
JSON
44 lines
1.6 KiB
JSON
{
|
|
"fixture": "qa-eval-checkout.html",
|
|
"bugs": [
|
|
{
|
|
"id": "broken-email-regex",
|
|
"category": "functional",
|
|
"severity": "high",
|
|
"description": "Email validation accepts addresses with no real domain (e.g. 'user@x') as valid — regex pattern [^@]+@[^@] only requires a single non-'@' character after the '@'",
|
|
"detection_hint": "email|regex|validation|accepts|invalid|user@|pattern"
|
|
},
|
|
{
|
|
"id": "nan-total",
|
|
"category": "functional",
|
|
"severity": "high",
|
|
"description": "Clearing the quantity field shows 'Total: $NaN' — parseInt on empty string returns NaN with no fallback",
|
|
"detection_hint": "NaN|total|quantity|empty|price|calculation|clear"
|
|
},
|
|
{
|
|
"id": "cc-field-overflow",
|
|
"category": "visual",
|
|
"severity": "medium",
|
|
"description": "Credit card input has no maxlength attribute — entering >20 characters causes text to overflow the container",
|
|
"detection_hint": "credit card|maxlength|overflow|cc|input|long|container"
|
|
},
|
|
{
|
|
"id": "missing-required-zip",
|
|
"category": "functional",
|
|
"severity": "medium",
|
|
"description": "Zip code field has no 'required' attribute — form can be submitted without a zip code",
|
|
"detection_hint": "zip|required|missing|form|submit|shipping|postal"
|
|
},
|
|
{
|
|
"id": "stripe-not-defined",
|
|
"category": "console",
|
|
"severity": "high",
|
|
"description": "Form submit triggers 'Uncaught ReferenceError: stripe is not defined' — payment SDK not loaded",
|
|
"detection_hint": "stripe|ReferenceError|not defined|console|error|submit|payment"
|
|
}
|
|
],
|
|
"total_bugs": 5,
|
|
"minimum_detection": 2,
|
|
"max_false_positives": 2
|
|
}
|