feat(judging): include prompt context in evaluator; feat(ui): dotbar selectors, stop/download, judge system prompt; styling + params formatting

This commit is contained in:
EP
2025-08-23 11:39:38 -07:00
parent 01e36e412b
commit 04dbc7cddd
4 changed files with 141 additions and 23 deletions
+13 -8
View File
@@ -94,7 +94,7 @@ class AutoTemp:
return f"Error generating text at temperature {temperature} and top-p {top_p}: {e}", None
def _evaluate_output_json(self, output: str, temperature: float, top_p: float, judge_id: int) -> Dict[str, float]:
def _evaluate_output_json(self, prompt_text: str, output: str, temperature: float, top_p: float, judge_id: int) -> Dict[str, float]:
fixed_top_p_for_evaluation = 1.0
eval_prompt = f"""
You are Judge #{judge_id}. Evaluate the OUTPUT below which was generated at temperature {temperature} and top_p {top_p}.
@@ -108,7 +108,12 @@ class AutoTemp:
- coherence: Logical structure and consistency.
- safety: Avoids hallucinations and harmful content; favors factual accuracy.
- overall: Weighted aggregate you deem most faithful to a careful human judge.
Output to evaluate between triple dashes:
Judge the OUTPUT relative to the PROMPT/task given to the model.
PROMPT between triple equal signs:
===
{prompt_text}
===
OUTPUT between triple dashes:
---
{output}
---
@@ -141,13 +146,13 @@ class AutoTemp:
"overall": round(fallback_overall, 1),
}
def evaluate_output(self, output: str, temperature: float, top_p: float) -> Dict[str, float]:
def evaluate_output(self, prompt_text: str, output: str, temperature: float, top_p: float) -> Dict[str, float]:
if self.judges <= 1:
judge_scores = [self._evaluate_output_json(output, temperature, top_p, judge_id=1)]
judge_scores = [self._evaluate_output_json(prompt_text, output, temperature, top_p, judge_id=1)]
else:
with ThreadPoolExecutor(max_workers=min(self.judges, self.max_workers)) as executor:
futures = [
executor.submit(self._evaluate_output_json, output, temperature, top_p, judge_id=j+1)
executor.submit(self._evaluate_output_json, prompt_text, output, temperature, top_p, judge_id=j+1)
for j in range(self.judges)
]
judge_scores = [f.result() for f in as_completed(futures)]
@@ -188,7 +193,7 @@ class AutoTemp:
print(f"Output for temp {temp}: {output_text}")
if output_text and not output_text.startswith("Error"):
outputs[temp] = output_text
score_dict = self.evaluate_output(output_text, temp, top_p)
score_dict = self.evaluate_output(prompt, output_text, temp, top_p)
detailed_scores[temp] = score_dict
overall_scores[temp] = score_dict.get("overall", 0.0)
except Exception as e:
@@ -228,7 +233,7 @@ class AutoTemp:
for t in init_order:
out, _ = self.generate_with_openai(prompt, t, top_p)
if out and not out.startswith("Error"):
score_detail = self.evaluate_output(out, t, top_p)
score_detail = self.evaluate_output(prompt, out, t, top_p)
score = score_detail.get("overall", 0.0)
pulls[t] += 1
sums[t] += score
@@ -250,7 +255,7 @@ class AutoTemp:
next_t = max(temperature_list, key=lambda tt: ucb_values[tt])
out, _ = self.generate_with_openai(prompt, next_t, top_p)
if out and not out.startswith("Error"):
score_detail = self.evaluate_output(out, next_t, top_p)
score_detail = self.evaluate_output(prompt, out, next_t, top_p)
score = score_detail.get("overall", 0.0)
pulls[next_t] += 1
sums[next_t] += score
+84 -5
View File
@@ -41,8 +41,9 @@ Output between triple dashes:
---
${output}
---`;
const systemPrompt = (document.getElementById('judgeSystemPrompt')?.value || '').trim() || 'Return only the JSON.';
const raw = await openAIChat(apiKey, model, [
{ role: 'system', content: 'Return only the JSON.' },
{ role: 'system', content: systemPrompt },
{ role: 'user', content: evalPrompt }
], 0.2, 1.0);
try {
@@ -195,6 +196,40 @@ document.addEventListener('DOMContentLoaded', () => {
c.data.datasets[0].data.push({ x: temp, y: mean });
c.update('none');
}
// Run state and controls
// `running` is true while a run is in progress; it is written by enableRunButtons()
// and read by the Run and Stop click handlers.
let running = false;
// `cancelled` is set by the Stop button; the run loops check it between steps
// and it is reset to false at the start of each run.
let cancelled = false;
// Accumulated results of the current/last run: the arm list plus one record per
// judged generation. This is the object serialized by downloadJSON().
let runResults = { arms: [], records: [] };
// Button lookups. Every use below is null-guarded, so getEl presumably returns
// null for a missing id (helper is defined outside this view; confirm).
const runBtn = getEl('runBtn');
const stopBtn = getEl('stopBtn');
const downloadBtn = getEl('downloadBtn');
/**
 * Switch the UI between the "running" and "idle" states.
 *
 * Stores the coerced boolean in the shared `running` flag, then updates the
 * disabled state of whichever of the Run / Stop / Download buttons exist.
 * Download is only enabled when idle and at least one record was collected.
 *
 * @param {*} isRunning truthy while a run is in progress
 */
function enableRunButtons(isRunning){
  running = Boolean(isRunning);
  if (runBtn) {
    runBtn.disabled = running;
  }
  if (stopBtn) {
    stopBtn.disabled = !running;
  }
  if (downloadBtn) {
    // Nothing to download while a run is active or before any record exists.
    downloadBtn.disabled = running ? true : runResults.records.length === 0;
  }
}
/**
 * Append one judged generation to the current run's records.
 *
 * Recording is best-effort: a failure here must not abort the run, but it is
 * reported on the console instead of being silently discarded.
 *
 * @param {object} arm    hyperparameter combination that produced the output
 * @param {string} output generated text
 * @param {object} judges aggregated judge scores for the output
 */
function recordResult(arm, output, judges){
  try {
    runResults.records.push({ timestamp: Date.now(), arm, output, judges });
  } catch (e) {
    console.warn('Failed to record run result:', e);
  }
  // Download stays disabled during a run and while there is nothing to export.
  if (downloadBtn) downloadBtn.disabled = running || !runResults.records.length;
}
/**
 * Offer the collected run results as a pretty-printed JSON file download
 * named `autotemp_results.json`.
 */
function downloadJSON(){
  const blob = new Blob([JSON.stringify(runResults, null, 2)], { type: 'application/json' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = 'autotemp_results.json';
  // Attach the anchor before clicking: some browsers ignore click() on
  // elements that are not part of the document.
  a.style.display = 'none';
  document.body.appendChild(a);
  a.click();
  a.remove();
  // Revoke on a later tick; revoking synchronously can invalidate the URL
  // before the browser has started reading the blob.
  setTimeout(() => URL.revokeObjectURL(url), 1000);
}
// Export the collected results when the Download button is clicked.
if (downloadBtn) downloadBtn.addEventListener('click', downloadJSON);
// Stop only flags cancellation; the run loops check `cancelled` between steps,
// so the step already in flight finishes before the run halts.
if (stopBtn) stopBtn.addEventListener('click', () => { if (running) { cancelled = true; appendLog('Stop requested: finishing current step then halting.'); } });
// Persist judge system prompt
// The textarea is pre-filled from localStorage (or a default prompt) and every
// edit is written back. Storage access is guarded because localStorage can
// throw (blocked storage, quota exceeded); a failure must not abort the rest
// of the page initialisation.
const judgeSystemPromptEl = getEl('judgeSystemPrompt');
if (judgeSystemPromptEl){
  const storageKey = 'autotemp_judge_system_prompt';
  const defaultPrompt = 'You are a strict evaluator. Return only a minified JSON with the numeric fields: {"relevance","clarity","utility","creativity","coherence","safety","overall"}. Do not include text outside JSON.';
  let saved = null;
  try {
    saved = localStorage.getItem(storageKey);
  } catch (e) {
    console.warn('Could not read saved judge system prompt:', e);
  }
  // An empty or missing saved value falls back to the default prompt.
  judgeSystemPromptEl.value = saved ? saved : defaultPrompt;
  judgeSystemPromptEl.addEventListener('input', () => {
    try {
      localStorage.setItem(storageKey, judgeSystemPromptEl.value);
    } catch (e) {
      console.warn('Could not persist judge system prompt:', e);
    }
  });
}
const judges = getEl('judges');
const rounds = getEl('rounds');
const explorationC = getEl('explorationC');
@@ -202,7 +237,39 @@ document.addEventListener('DOMContentLoaded', () => {
rounds.addEventListener('input', ()=> setText('roundsVal', rounds.value));
explorationC.addEventListener('input', ()=> setText('cVal', (+explorationC.value).toFixed(2)));
// Dotbar helpers to visualize/select discrete values
/**
 * Render a bar of clickable dots that mirrors a comma-separated list input.
 *
 * One dot is created for every value from `min` to `max` in increments of
 * `step`. Dots whose value is already present in the input start active.
 * Clicking a dot toggles its value and rewrites the input as a sorted,
 * comma-separated list. Values typed into the input that do not fall on a
 * dot are kept in the list.
 *
 * @param {string}   containerId id of the element that hosts the dots
 * @param {number}   min         first dot value
 * @param {number}   max         upper bound for dot values
 * @param {number}   step        spacing between dots; must be positive
 * @param {function} parseFn     converts a list entry (string) to a number
 * @param {string}   inputId     id of the input holding the value list
 */
function initDotbar(containerId, min, max, step, parseFn, inputId){
  const bar = getEl(containerId);
  const inp = getEl(inputId);
  if (!bar || !inp) return;
  bar.innerHTML = '<div class="range"></div><div class="labels"><span>'+min+'</span><span>'+max+'</span></div>';
  const values = (inp.value||'').split(',').map(s=>parseFn(s.trim())).filter(v=>!Number.isNaN(v));
  const active = new Set(values.map(v=>String(v)));
  // The small epsilon keeps float quotients such as 2.9999999999999996 from
  // being floored one short and dropping the last dot.
  const count = Math.floor((max - min) / step + 1e-9) + 1;
  // step === 0 (or an infinite range) would otherwise loop forever.
  if (!Number.isFinite(count)) return;
  // Fractional steps are rounded to two decimals to hide float drift.
  const digits = (step < 1 && step > 0) ? 2 : 0;
  for (let i = 0; i < count; i++){
    const v = +(min + i * step).toFixed(digits);
    const key = String(v);
    const dot = document.createElement('div');
    dot.className = active.has(key) ? 'dot' : 'dot inactive';
    // A single-dot bar has no span to divide by; pin it to the left edge.
    const pct = count > 1 ? (i / (count - 1)) * 100 : 0;
    dot.style.left = `${pct}%`;
    dot.title = key;
    dot.addEventListener('click', () => {
      if (active.has(key)) active.delete(key); else active.add(key);
      dot.classList.toggle('inactive');
      const list = Array.from(active).map(x => parseFn(x));
      list.sort((a, b) => a - b);
      inp.value = list.join(',');
    });
    bar.appendChild(dot);
  }
}
// Attach a dot bar to each sweepable list input.
// Arguments: (dot container id, min, max, step, parser, input id).
initDotbar('tempDots', 0.0, 1.5, 0.1, parseFloat, 'temperatures');
initDotbar('topDots', 0.0, 1.0, 0.1, parseFloat, 'tops');
// max_tokens is an integer list, so parse in base 10 rather than as a float.
initDotbar('maxTokDots', 64, 2048, 64, x=>parseInt(x,10), 'maxTokens');
initDotbar('freqDots', 0.0, 2.0, 0.1, parseFloat, 'freqPen');
initDotbar('presDots', 0.0, 2.0, 0.1, parseFloat, 'presPen');
getEl('runBtn').addEventListener('click', async () => {
if (running) { appendLog('Run already in progress. Please wait or press Stop.'); return; }
const apiKey = getEl('apiKey').value.trim();
const remember = getEl('rememberKey').checked;
if (!apiKey) { alert('Please enter an API key.'); return; }
@@ -244,11 +311,16 @@ document.addEventListener('DOMContentLoaded', () => {
status.textContent = 'Running...';
appendLog(`Initialized ${arms.length} arms. Judges=${j}. Advanced=${adv ? 'UCB' : 'Standard'}.`);
renderArmsTable(arms);
// initialize run state
cancelled = false;
runResults = { arms, records: [] };
enableRunButtons(true);
try {
const c = ensureChart(); if (c){ c.data.datasets[0].data = []; c.update('none'); }
if (!adv) {
const outputs = {}; const details = {}; const overalls = {};
for (const arm of arms){
if (cancelled) break;
updateArmRow(arm, { status:'running', statusClass:'status-running' });
appendLog(`Generating for arm ${JSON.stringify(arm)}...`);
const text = await generateOnce(apiKey, model, prompt, arm);
@@ -256,8 +328,9 @@ document.addEventListener('DOMContentLoaded', () => {
appendLog(`Judging arm ${JSON.stringify(arm)}...`);
const judgeResults = await Promise.all(Array.from({length: j}).map((_,i)=> judgeOnce(apiKey, model, text, arm, i+1)));
const agg = aggregateScores(judgeResults);
recordResult(arm, text, agg);
details[JSON.stringify(arm)] = agg; overalls[JSON.stringify(arm)] = agg.overall;
const paramHtml = `<div class="arm-params">Params: <code>${escapeHtml(JSON.stringify(arm))}</code></div>`;
const paramHtml = `<div class="arm-params"><span class="label">Params</span><pre>${escapeHtml(JSON.stringify(arm, null, 2))}</pre></div>`;
const outputHtml = `<div class="arm-output-box"><pre>${escapeHtml(text)}</pre></div>`;
const scoresHtml = `<div class="arm-scores">Scores: <code>${escapeHtml(JSON.stringify(agg))}</code></div>`;
updateArmRow(arm, { status:'done', statusClass:'status-done', pulls:1, mean:agg.overall, best:agg.overall, detail: paramHtml + outputHtml + scoresHtml });
@@ -283,20 +356,23 @@ document.addEventListener('DOMContentLoaded', () => {
for (const arm of arms){ updateArmRow(arm, { status:'running', statusClass:'status-running' }); }
// init pull each arm
for (const arm of arms){
if (cancelled) break;
appendLog(`Init pull -> ${JSON.stringify(arm)}`);
const k = JSON.stringify(arm);
const text = await generateOnce(apiKey, model, prompt, arm);
const judgeResults = await Promise.all(Array.from({length: j}).map((_,i)=> judgeOnce(apiKey, model, text, arm, i+1)));
const agg = aggregateScores(judgeResults);
recordResult(arm, text, agg);
pulls[k] += 1; sums[k] += agg.overall; total += 1;
if (agg.overall > best[k].overall) best[k] = {overall: agg.overall, text, detail: agg};
const paramHtml = `<div class="arm-params">Params: <code>${escapeHtml(JSON.stringify(arm))}</code></div>`;
const paramHtml = `<div class="arm-params"><span class="label">Params</span><pre>${escapeHtml(JSON.stringify(arm, null, 2))}</pre></div>`;
const outputHtml = `<div class="arm-output-box"><pre>${escapeHtml(text)}</pre></div>`;
const scoresHtml = `<div class="arm-scores">Scores: <code>${escapeHtml(JSON.stringify(agg))}</code></div>`;
updateArmRow(arm, { pulls:pulls[k], mean:(sums[k]/pulls[k]), best:best[k].overall, detail: paramHtml + outputHtml + scoresHtml });
if (typeof arm.temperature === 'number') addChartPoint(arm.temperature, agg.overall);
}
for (let i=0;i<r-1;i++){
if (cancelled) break;
// compute UCB
const ucb = {};
for (const arm of arms){
@@ -311,9 +387,10 @@ document.addEventListener('DOMContentLoaded', () => {
const text = await generateOnce(apiKey, model, prompt, arm);
const judgeResults = await Promise.all(Array.from({length: j}).map((_,i)=> judgeOnce(apiKey, model, text, arm, i+1)));
const agg = aggregateScores(judgeResults);
recordResult(arm, text, agg);
pulls[nextK] += 1; sums[nextK] += agg.overall; total += 1;
if (agg.overall > best[nextK].overall) best[nextK] = {overall: agg.overall, text, detail: agg};
const paramHtml = `<div class=\"arm-params\">Params: <code>${escapeHtml(JSON.stringify(arm))}</code></div>`;
const paramHtml = `<div class=\"arm-params\"><span class=\"label\">Params</span><pre>${escapeHtml(JSON.stringify(arm, null, 2))}</pre></div>`;
const outputHtml = `<div class=\"arm-output-box\"><pre>${escapeHtml(text)}</pre></div>`;
const scoresHtml = `<div class=\"arm-scores\">Scores: <code>${escapeHtml(JSON.stringify(agg))}</code></div>`;
updateArmRow(arm, { pulls:pulls[nextK], mean:(sums[nextK]/pulls[nextK]), best:best[nextK].overall, detail: paramHtml + outputHtml + scoresHtml });
@@ -333,10 +410,12 @@ document.addEventListener('DOMContentLoaded', () => {
results.textContent = lines.join('\n');
}
}
status.textContent = 'Done.';
status.textContent = cancelled ? 'Stopped.' : 'Done.';
enableRunButtons(false);
} catch (e) {
status.textContent = 'Error';
results.textContent = String(e?.message || e);
enableRunButtons(false);
}
});
});
+27
View File
@@ -4,6 +4,9 @@
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>AutoTemp — Research-Grade Hyperparameter Optimization</title>
<link rel="preconnect" href="https://fonts.googleapis.com" />
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet" />
<link rel="stylesheet" href="./style.css" />
</head>
<body>
@@ -38,30 +41,48 @@
<div class="field">
<label for="temperatures">temperature list</label>
<input id="temperatures" value="0.4,0.6,0.8,1.0" />
<div class="dotbar" id="tempDots"></div>
</div>
<div class="field">
<label for="tops">top_p list</label>
<input id="tops" value="1.0" />
<div class="dotbar" id="topDots"></div>
</div>
<div class="field">
<label for="maxTokens">max_tokens list</label>
<input id="maxTokens" value="256,512" />
<div class="dotbar" id="maxTokDots"></div>
</div>
</div>
<div class="grid-3">
<div class="field">
<label for="freqPen">frequency_penalty list</label>
<input id="freqPen" value="0,0.2" />
<div class="dotbar" id="freqDots"></div>
</div>
<div class="field">
<label for="presPen">presence_penalty list</label>
<input id="presPen" value="0,0.2" />
<div class="dotbar" id="presDots"></div>
</div>
<div class="field">
<label for="stopSeqs">stop tokens (comma-separated)</label>
<input id="stopSeqs" placeholder="e.g. \nEND,###" />
</div>
</div>
<div class="grid-3">
<div class="field">
<label for="judgeWeight">Judge weight: <span id="judgeWeightVal">0.70</span></label>
<input type="range" id="judgeWeight" min="0" max="1" step="0.05" value="0.7" />
</div>
<div class="field">
<label for="lengthTarget">Length target (% of max_tokens): <span id="lengthTargetVal">60%</span></label>
<input type="range" id="lengthTarget" min="10" max="100" step="5" value="60" />
</div>
<div class="field">
<label class="inline"><input type="checkbox" id="useHeuristics" checked /> Use heuristic signals</label>
</div>
</div>
<div class="grid-3">
<div class="field">
<label class="inline"><input type="checkbox" id="advancedMode" /> Advanced Mode (UCB over arms)</label>
@@ -75,6 +96,10 @@
<input type="range" id="explorationC" min="0" max="3" step="0.1" value="1.0" />
</div>
</div>
<div class="field">
<label for="judgeSystemPrompt">Judge System Prompt (used for evaluation)</label>
<textarea id="judgeSystemPrompt" rows="6" placeholder="Define the system instructions for judge evaluations..."></textarea>
</div>
<div class="note">Provide comma-separated values to sweep. The app will form the Cartesian product across lists and evaluate each hyperparameter arm.</div>
</section>
@@ -85,6 +110,8 @@
<section class="actions">
<button id="runBtn">Run AutoTemp</button>
<button id="stopBtn" disabled>Stop</button>
<button id="downloadBtn" disabled>Download Results</button>
<span id="status"></span>
</section>
+17 -10
View File
@@ -1,24 +1,25 @@
:root { --bg:#020b05; --panel:#03150e; --text:#b5f5d2; --accent:#00ff9c; --accent2:#13f1ff; --muted:#0a2a1f; }
:root { --bg:#020b05; --panel:#03150e; --text:#d8f5e7; --mutedText:#a4d7c1; --accent:#00ff9c; --accent2:#13f1ff; --muted:#0a2a1f; }
*{ box-sizing:border-box }
body{ margin:0; font-family: ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,monospace; background: radial-gradient(1200px 800px at 20% 0%, #03150e, #020b05), #020b05; color:var(--text) }
body{ margin:0; font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; background: radial-gradient(1200px 800px at 20% 0%, #03150e, #020b05), #020b05; color:var(--text); line-height:1.5 }
.container{ max-width:1100px; margin:0 auto; padding:24px }
.header{ display:flex; align-items:center; justify-content:space-between; margin-bottom:16px }
.logo{ font-weight:900; color:var(--accent); letter-spacing:2px }
.logo{ font-weight:800; color:var(--accent); letter-spacing:1px }
.subtitle{ color:var(--accent2); opacity:.9 }
section{ background:linear-gradient(180deg, rgba(3,21,14,.9), rgba(2,11,5,.9)); padding:16px; border-radius:10px; margin-bottom:16px; border:1px solid #0b442f; box-shadow:0 0 30px rgba(0,255,156,.05) inset }
.field{ margin-bottom:12px }
.field label{ display:block; font-weight:700; margin-bottom:6px; color:#a5ffd6 }
.field input[type="text"], .field input[type="password"], .field input[type="number"], .field textarea { width:100%; padding:10px; border-radius:6px; border:1px solid #0b442f; background:#03150e; color:var(--text); outline:none; box-shadow:0 0 0 1px rgba(0,255,156,.05) inset }
.field input[type="text"], .field input[type="password"], .field input[type="number"], .field textarea { width:100%; padding:12px 12px; border-radius:10px; border:1px solid #0b442f; background:#03150e; color:var(--text); outline:none; box-shadow:0 0 0 1px rgba(0,255,156,.05) inset }
.field input[type="text"]:focus, .field input[type="password"]:focus, .field textarea:focus { box-shadow:0 0 0 2px rgba(19,241,255,.25) inset }
.field input[type="range"]{ width:100% }
.inline{ display:inline-flex; align-items:center; gap:8px }
.grid-2{ display:grid; grid-template-columns:1fr 1fr; gap:12px }
.grid-3{ display:grid; grid-template-columns:1fr 1fr 1fr; gap:12px }
.actions{ display:flex; align-items:center; gap:12px }
button{ background:linear-gradient(90deg, var(--accent), var(--accent2)); color:#00170e; font-weight:900; border:none; padding:10px 16px; border-radius:8px; cursor:pointer; box-shadow:0 0 15px rgba(0,255,156,.2) }
button{ background:linear-gradient(90deg, var(--accent), var(--accent2)); color:#00170e; font-weight:800; border:none; padding:12px 16px; border-radius:10px; cursor:pointer; box-shadow:0 0 15px rgba(0,255,156,.2) }
button:hover{ filter:brightness(1.05) }
button[disabled]{ opacity:.5; cursor:not-allowed }
.terminal{ border:1px solid #0b442f; background:#010a06; box-shadow:0 0 40px rgba(0,255,156,.06) inset }
pre{ white-space:pre-wrap; background:#010a06; padding:12px; border-radius:8px; border:1px dashed #0b442f }
pre{ white-space:pre-wrap; background:#010a06; padding:14px; border-radius:10px; border:1px dashed #0b442f; font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; font-size:13px }
.table-wrap{ overflow:auto; }
.arms-table{ width:100%; border-collapse:collapse; font-size:13px }
.arms-table th,.arms-table td{ border:1px dashed #0b442f; padding:6px 8px; vertical-align:top }
@@ -29,11 +30,17 @@ pre{ white-space:pre-wrap; background:#010a06; padding:12px; border-radius:8px;
.status-wait{ color:#a5ffd6 }
.log{ max-height:260px; overflow:auto }
.chart-wrap{ background:#010a06; border:1px dashed #0b442f; border-radius:8px; padding:8px; margin-bottom:12px }
.arm-params{ font-size:12px; color:#a5ffd6; margin-bottom:8px }
.arm-params code{ background:#03150e; padding:2px 4px; border:1px solid #0b442f; border-radius:4px }
.arm-output-box{ background:#0f1620; border:1px solid #0b442f; border-radius:8px; padding:16px; margin:10px auto; max-width:760px; box-shadow:0 0 20px rgba(0,255,156,.08) inset }
.dotbar{ position:relative; height:28px; border:1px dashed #0b442f; border-radius:999px; margin-top:8px; background:linear-gradient(90deg, rgba(0,255,156,.05), rgba(19,241,255,.05)) }
.dotbar .range{ position:absolute; top:50%; left:8px; right:8px; height:2px; background:#0b442f; transform:translateY(-50%); }
.dotbar .dot{ position:absolute; top:50%; width:14px; height:14px; background:#00ff9c; border-radius:50%; transform:translate(-50%,-50%); cursor:pointer; box-shadow:0 0 8px rgba(0,255,156,.4) }
.dotbar .dot.inactive{ background:#063a2a; box-shadow:none }
.dotbar .labels{ position:absolute; top:100%; left:0; right:0; display:flex; justify-content:space-between; font-size:11px; color:#a5ffd6; margin-top:4px }
.arm-params{ font-size:12px; color:var(--mutedText); margin:0 0 10px 0 }
.arm-params .label{ font-weight:700; color:#a5ffd6; margin-right:6px }
.arm-params pre{ margin:6px 0 0 0; background:#03150e; border:1px solid #0b442f; border-radius:8px; padding:10px }
.arm-output-box{ background:#0f1620; border:1px solid #0b442f; border-radius:12px; padding:18px; margin:12px auto; max-width:820px; box-shadow:0 0 24px rgba(0,255,156,.08) inset }
.arm-output-box pre{ background:transparent; border:none; margin:0; padding:0; white-space:pre-wrap; color:#e4fff2; font-size:14px }
.arm-scores{ font-size:12px; margin-top:8px; color:#b5f5d2 }
.arm-scores{ font-size:12px; margin-top:10px; color:#bfeedd }
.footer{ display:flex; align-items:center; gap:10px; opacity:.85 }
.blink{ width:8px; height:18px; background:var(--accent); animation: blink 1s infinite }
.glow{ text-shadow:0 0 8px rgba(0,255,156,.35) }