mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 13:45:35 +02:00
chore: upgrade eval judge to Sonnet 4.6, update changelog
Switch LLM-as-judge evals from Haiku to Sonnet 4.6 for more stable, nuanced scoring. Add changelog entry for all eval improvements. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,7 +7,7 @@
|
||||
* Requires: ANTHROPIC_API_KEY env var
|
||||
* Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
|
||||
*
|
||||
* Cost: ~$0.01-0.03 per run (haiku)
|
||||
* Cost: ~$0.05-0.15 per run (sonnet)
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
@@ -30,7 +30,7 @@ async function judge(section: string, prompt: string): Promise<JudgeScore> {
|
||||
const client = new Anthropic();
|
||||
|
||||
const response = await client.messages.create({
|
||||
model: 'claude-haiku-4-5-20251001',
|
||||
model: 'claude-sonnet-4-6',
|
||||
max_tokens: 1024,
|
||||
messages: [{
|
||||
role: 'user',
|
||||
@@ -158,7 +158,7 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||
|
||||
const client = new Anthropic();
|
||||
const response = await client.messages.create({
|
||||
model: 'claude-haiku-4-5-20251001',
|
||||
model: 'claude-sonnet-4-6',
|
||||
max_tokens: 1024,
|
||||
messages: [{
|
||||
role: 'user',
|
||||
|
||||
Reference in New Issue
Block a user