mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
test(judge): fix two false-fail patterns surfaced by Opus 4.7 captures
COMPLETENESS_RE updated to match the option-prefixed form
`Completeness: A=10/10, B=7/10` documented in
scripts/resolvers/preamble/generate-ask-user-format.ts. The legacy regex
required a bare digit immediately after `Completeness: `, which Opus 4.7
correctly does not produce — the spec form names each option.
judgeRecommendation.commits no longer scans the entire recommendation body
for hedging keywords; it scans only the choice portion (text before the
"because" token). The because-clause is the reason and routinely contains
phrases like "the plan doesn't yet depend on Redis" — legitimate technical
language that the body-wide regex was flagging as hedging. Restricting the
check to the choice portion preserves the intent ("Either A or B because..."
is still flagged; "A because it depends on X" is accepted) without the false positives.
Verified by re-reading the captured AUQs from the failing periodic run:
both Coverage tests had spec-correct `Completeness: A=10/10, B=7/10`
strings; the Kind test had a substantive recommendation whose because-clause
mentioned "depend on Redis" as part of the reasoning, not the choice.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -236,16 +236,22 @@ export async function judgeRecommendation(askUserText: string): Promise<Recommen
|
||||
const present = !!recLine;
|
||||
const recBody = recLine?.[1]?.trim() ?? '';
|
||||
|
||||
// commits: reject obvious hedging language. The format-spec self-check
|
||||
// requires the recommendation to name ONE choice; "either", "depending on",
|
||||
// "if X then" all signal the model bailed on the commitment.
|
||||
const commits = present && !/\b(either|depends? on|depending|if .+ then|or maybe|whichever)\b/i.test(recBody);
|
||||
|
||||
// has_because: literal "because" token in the body, per the format spec.
|
||||
const becauseMatch = recBody.match(/\bbecause\s+(.+?)$/i);
|
||||
const has_because = !!becauseMatch;
|
||||
const reason_text = becauseMatch?.[1]?.trim() ?? '';
|
||||
|
||||
// commits: reject hedging language only in the CHOICE portion (before the
|
||||
// "because" token). The because-clause itself is the reason and routinely
|
||||
// contains technical phrases like "the plan doesn't yet depend on Redis"
|
||||
// that aren't hedging at all. Looking only at the choice keeps the check
|
||||
// focused: "Either A or B because..." → flagged; "A because depends on X" →
|
||||
// accepted.
|
||||
const choicePortion = becauseMatch
|
||||
? recBody.slice(0, recBody.toLowerCase().indexOf('because')).trim()
|
||||
: recBody;
|
||||
const commits = present && !/\b(either|depends? on|depending|if .+ then|or maybe|whichever)\b/i.test(choicePortion);
|
||||
|
||||
// If the because-clause is absent, the substance score is implicitly 1.
|
||||
// Skip the LLM call — there is nothing to grade.
|
||||
if (!present || !has_because || !reason_text) {
|
||||
|
||||
@@ -41,7 +41,11 @@ const evalCollector = createEvalCollector('e2e-plan-format');
|
||||
// — the canonical form per generate-ask-user-format.ts is just
|
||||
// `Recommendation: <choice> because <reason>`, where <choice> is the bare
|
||||
// option label. judgeRecommendation.present covers the canonical shape.
|
||||
const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/;
|
||||
// COMPLETENESS_RE matches both the legacy bare form (`Completeness: 10/10`) and
|
||||
// the canonical option-prefixed form (`Completeness: A=10/10, B=7/10`) per
|
||||
// scripts/resolvers/preamble/generate-ask-user-format.ts. The optional
|
||||
// `[A-Z]=` prefix tolerates either shape; both are acceptable spec output.
|
||||
const COMPLETENESS_RE = /Completeness:\s*(?:[A-Z]=)?\d{1,2}\/10/;
|
||||
const KIND_NOTE_RE = /options differ in kind/i;
|
||||
|
||||
// v1.7.0.0 Pros/Cons format tokens. Tests are additive: existing
|
||||
|
||||
Reference in New Issue
Block a user