mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 13:15:24 +02:00
merge: resolve package.json version conflict (accept main's 0.12.5.0)
This commit is contained in:
@@ -329,6 +329,9 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
||||
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
|
||||
|
||||
// Voice directive
|
||||
'voice directive tone': ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -778,6 +778,69 @@ describeIfSelected('Other skill evals', [
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Voice directive eval — tests that the voice section produces the right tone
|
||||
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
|
||||
testIfSelected('voice directive tone', async () => {
|
||||
const t0 = Date.now();
|
||||
// Read a tier 2+ skill to get the full voice directive in context
|
||||
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
|
||||
const voiceStart = content.indexOf('## Voice');
|
||||
if (voiceStart === -1) {
|
||||
throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
|
||||
}
|
||||
const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
|
||||
const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
|
||||
|
||||
const result = await callJudge<{
|
||||
directness: number;
|
||||
concreteness: number;
|
||||
avoids_corporate: number;
|
||||
avoids_ai_vocabulary: number;
|
||||
connects_user_outcomes: number;
|
||||
reasoning: string;
|
||||
}>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
|
||||
Score each dimension 1-5 where 5 is excellent:
|
||||
|
||||
1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
|
||||
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
|
||||
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
|
||||
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
|
||||
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?
|
||||
|
||||
Return JSON only:
|
||||
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}
|
||||
|
||||
THE VOICE DIRECTIVE:
|
||||
${voiceSection}`);
|
||||
|
||||
console.log('Voice directive scores:', JSON.stringify(result, null, 2));
|
||||
|
||||
evalCollector?.addTest({
|
||||
name: 'voice directive tone',
|
||||
suite: 'Voice directive eval',
|
||||
tier: 'llm-judge',
|
||||
passed: result.directness >= 4 && result.concreteness >= 4 && result.avoids_corporate >= 4
|
||||
&& result.avoids_ai_vocabulary >= 4 && result.connects_user_outcomes >= 4,
|
||||
duration_ms: Date.now() - t0,
|
||||
cost_usd: 0.02,
|
||||
judge_scores: {
|
||||
directness: result.directness,
|
||||
concreteness: result.concreteness,
|
||||
avoids_corporate: result.avoids_corporate,
|
||||
avoids_ai_vocabulary: result.avoids_ai_vocabulary,
|
||||
connects_user_outcomes: result.connects_user_outcomes,
|
||||
},
|
||||
judge_reasoning: result.reasoning,
|
||||
});
|
||||
|
||||
expect(result.directness).toBeGreaterThanOrEqual(4);
|
||||
expect(result.concreteness).toBeGreaterThanOrEqual(4);
|
||||
expect(result.avoids_corporate).toBeGreaterThanOrEqual(4);
|
||||
expect(result.avoids_ai_vocabulary).toBeGreaterThanOrEqual(4);
|
||||
expect(result.connects_user_outcomes).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
@@ -1325,7 +1325,7 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('fall back to the Claude adversarial subagent');
|
||||
// Review log uses new skill name
|
||||
expect(content).toContain('adversarial-review');
|
||||
expect(content).toContain('xhigh');
|
||||
expect(content).toContain('reasoning_effort="high"');
|
||||
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
|
||||
});
|
||||
|
||||
@@ -1335,7 +1335,7 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('< 50');
|
||||
expect(content).toContain('200+');
|
||||
expect(content).toContain('adversarial-review');
|
||||
expect(content).toContain('xhigh');
|
||||
expect(content).toContain('reasoning_effort="high"');
|
||||
expect(content).toContain('Investigate and fix');
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user