From 04dbc7cddd2b5a5e729a64b6b17839954dacf279 Mon Sep 17 00:00:00 2001 From: EP Date: Sat, 23 Aug 2025 11:39:38 -0700 Subject: [PATCH] feat(judging): include prompt context in evaluator; feat(ui): dotbar selectors, stop/download, judge system prompt; styling + params formatting --- autotemp.py | 21 +++++++----- docs/app.js | 89 ++++++++++++++++++++++++++++++++++++++++++++++--- docs/index.html | 27 +++++++++++++++ docs/style.css | 27 +++++++++------ 4 files changed, 141 insertions(+), 23 deletions(-) diff --git a/autotemp.py b/autotemp.py index 018d3bb..eab726f 100644 --- a/autotemp.py +++ b/autotemp.py @@ -94,7 +94,7 @@ class AutoTemp: return f"Error generating text at temperature {temperature} and top-p {top_p}: {e}", None - def _evaluate_output_json(self, output: str, temperature: float, top_p: float, judge_id: int) -> Dict[str, float]: + def _evaluate_output_json(self, prompt_text: str, output: str, temperature: float, top_p: float, judge_id: int) -> Dict[str, float]: fixed_top_p_for_evaluation = 1.0 eval_prompt = f""" You are Judge #{judge_id}. Evaluate the OUTPUT below which was generated at temperature {temperature} and top_p {top_p}. @@ -108,7 +108,12 @@ class AutoTemp: - coherence: Logical structure and consistency. - safety: Avoids hallucinations and harmful content; favors factual accuracy. - overall: Weighted aggregate you deem most faithful to a careful human judge. - Output to evaluate between triple dashes: + Judge the OUTPUT relative to the PROMPT/task given to the model. + PROMPT between triple equal signs: + === + {prompt_text} + === + OUTPUT between triple dashes: --- {output} --- @@ -141,13 +146,13 @@ class AutoTemp: "overall": round(fallback_overall, 1), } - def evaluate_output(self, output: str, temperature: float, top_p: float) -> Dict[str, float]: + def evaluate_output(self, prompt_text: str, output: str, temperature: float, top_p: float) -> Dict[str, float]: if self.judges <= 1: - judge_scores = [self._evaluate_output_json(output, temperature, top_p, judge_id=1)] + judge_scores = [self._evaluate_output_json(prompt_text, output, temperature, top_p, judge_id=1)] else: with ThreadPoolExecutor(max_workers=min(self.judges, self.max_workers)) as executor: futures = [ - executor.submit(self._evaluate_output_json, output, temperature, top_p, judge_id=j+1) + executor.submit(self._evaluate_output_json, prompt_text, output, temperature, top_p, judge_id=j+1) for j in range(self.judges) ] judge_scores = [f.result() for f in as_completed(futures)] @@ -188,7 +193,7 @@ class AutoTemp: print(f"Output for temp {temp}: {output_text}") if output_text and not output_text.startswith("Error"): outputs[temp] = output_text - score_dict = self.evaluate_output(output_text, temp, top_p) + score_dict = self.evaluate_output(prompt, output_text, temp, top_p) detailed_scores[temp] = score_dict overall_scores[temp] = score_dict.get("overall", 0.0) except Exception as e: @@ -228,7 +233,7 @@ class AutoTemp: for t in init_order: out, _ = self.generate_with_openai(prompt, t, top_p) if out and not out.startswith("Error"): - score_detail = self.evaluate_output(out, t, top_p) + score_detail = self.evaluate_output(prompt, out, t, top_p) score = score_detail.get("overall", 0.0) pulls[t] += 1 sums[t] += score @@ -250,7 +255,7 @@ class AutoTemp: next_t = max(temperature_list, key=lambda tt: ucb_values[tt]) out, _ = self.generate_with_openai(prompt, next_t, top_p) if out and not out.startswith("Error"): - score_detail = self.evaluate_output(out, next_t, top_p) + score_detail = self.evaluate_output(prompt, out, next_t, top_p) score = score_detail.get("overall", 0.0) pulls[next_t] += 1 sums[next_t] += score diff --git a/docs/app.js b/docs/app.js index 02e7f14..77fb22c 100644 --- a/docs/app.js +++ b/docs/app.js @@ -41,8 +41,9 @@ Output between triple dashes: --- ${output} ---`; + const systemPrompt = (document.getElementById('judgeSystemPrompt')?.value || '').trim() || 'Return only the JSON.'; const raw = await openAIChat(apiKey, model, [ - { role: 'system', content: 'Return only the JSON.' }, + { role: 'system', content: systemPrompt }, { role: 'user', content: evalPrompt } ], 0.2, 1.0); try { @@ -195,6 +196,40 @@ document.addEventListener('DOMContentLoaded', () => { c.data.datasets[0].data.push({ x: temp, y: mean }); c.update('none'); } + // Run state and controls + let running = false; + let cancelled = false; + let runResults = { arms: [], records: [] }; + const runBtn = getEl('runBtn'); + const stopBtn = getEl('stopBtn'); + const downloadBtn = getEl('downloadBtn'); + function enableRunButtons(isRunning){ + running = !!isRunning; + if (runBtn) runBtn.disabled = running; + if (stopBtn) stopBtn.disabled = !running; + if (downloadBtn) downloadBtn.disabled = running || !runResults.records.length; + } + function recordResult(arm, output, judges){ + try { runResults.records.push({ timestamp: Date.now(), arm, output, judges }); } catch(e) {} + if (downloadBtn) downloadBtn.disabled = running ? true : !runResults.records.length; + } + function downloadJSON(){ + const blob = new Blob([JSON.stringify(runResults, null, 2)], { type: 'application/json' }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; a.download = 'autotemp_results.json'; a.click(); + URL.revokeObjectURL(url); + } + if (downloadBtn) downloadBtn.addEventListener('click', downloadJSON); + if (stopBtn) stopBtn.addEventListener('click', () => { if (running) { cancelled = true; appendLog('Stop requested: finishing current step then halting.'); } }); + // Persist judge system prompt + const judgeSystemPromptEl = getEl('judgeSystemPrompt'); + if (judgeSystemPromptEl){ + const saved = localStorage.getItem('autotemp_judge_system_prompt'); + if (saved) judgeSystemPromptEl.value = saved; + else judgeSystemPromptEl.value = 'You are a strict evaluator. Return only a minified JSON with the numeric fields: {"relevance","clarity","utility","creativity","coherence","safety","overall"}. Do not include text outside JSON.'; + judgeSystemPromptEl.addEventListener('input', ()=> localStorage.setItem('autotemp_judge_system_prompt', judgeSystemPromptEl.value)); + } const judges = getEl('judges'); const rounds = getEl('rounds'); const explorationC = getEl('explorationC'); @@ -202,7 +237,39 @@ document.addEventListener('DOMContentLoaded', () => { rounds.addEventListener('input', ()=> setText('roundsVal', rounds.value)); explorationC.addEventListener('input', ()=> setText('cVal', (+explorationC.value).toFixed(2))); + // Dotbar helpers to visualize/select discrete values + function initDotbar(containerId, min, max, step, parseFn, inputId){ + const bar = getEl(containerId); const inp = getEl(inputId); + if (!bar || !inp) return; + bar.innerHTML = '
'+min+''+max+'
'; + const values = (inp.value||'').split(',').map(s=>parseFn(s.trim())).filter(v=>!Number.isNaN(v)); + const active = new Set(values.map(v=>String(v))); + const count = Math.floor((max-min)/step)+1; + for (let i=0;i0) ? 2 : 0 ); + const dot = document.createElement('div'); dot.className='dot '+(active.has(String(v))?'':'inactive'); + dot.style.left = `${(i/(count-1))*100}%`; + dot.title = String(v); + dot.addEventListener('click', ()=>{ + const key = String(v); + if (active.has(key)) active.delete(key); else active.add(key); + dot.classList.toggle('inactive'); + const list = Array.from(active).map(x=>parseFn(x)); + list.sort((a,b)=>a-b); + inp.value = list.join(','); + }); + bar.appendChild(dot); + } + } + + initDotbar('tempDots', 0.0, 1.5, 0.1, parseFloat, 'temperatures'); + initDotbar('topDots', 0.0, 1.0, 0.1, parseFloat, 'tops'); + initDotbar('maxTokDots', 64, 2048, 64, x=>parseInt(x,10), 'maxTokens'); + initDotbar('freqDots', 0.0, 2.0, 0.1, parseFloat, 'freqPen'); + initDotbar('presDots', 0.0, 2.0, 0.1, parseFloat, 'presPen'); + getEl('runBtn').addEventListener('click', async () => { + if (running) { appendLog('Run already in progress. Please wait or press Stop.'); return; } const apiKey = getEl('apiKey').value.trim(); const remember = getEl('rememberKey').checked; if (!apiKey) { alert('Please enter an API key.'); return; } @@ -244,11 +311,16 @@ document.addEventListener('DOMContentLoaded', () => { status.textContent = 'Running...'; appendLog(`Initialized ${arms.length} arms. Judges=${j}. Advanced=${adv ? 'UCB' : 'Standard'}.`); renderArmsTable(arms); + // initialize run state + cancelled = false; + runResults = { arms, records: [] }; + enableRunButtons(true); try { const c = ensureChart(); if (c){ c.data.datasets[0].data = []; c.update('none'); } if (!adv) { const outputs = {}; const details = {}; const overalls = {}; for (const arm of arms){ + if (cancelled) break; updateArmRow(arm, { status:'running', statusClass:'status-running' }); appendLog(`Generating for arm ${JSON.stringify(arm)}...`); const text = await generateOnce(apiKey, model, prompt, arm); @@ -256,8 +328,9 @@ document.addEventListener('DOMContentLoaded', () => { appendLog(`Judging arm ${JSON.stringify(arm)}...`); const judgeResults = await Promise.all(Array.from({length: j}).map((_,i)=> judgeOnce(apiKey, model, text, arm, i+1))); const agg = aggregateScores(judgeResults); + recordResult(arm, text, agg); details[JSON.stringify(arm)] = agg; overalls[JSON.stringify(arm)] = agg.overall; - const paramHtml = `
Params: ${escapeHtml(JSON.stringify(arm))}
`; + const paramHtml = `
Params
${escapeHtml(JSON.stringify(arm, null, 2))}
`; const outputHtml = `
${escapeHtml(text)}
`; const scoresHtml = `
Scores: ${escapeHtml(JSON.stringify(agg))}
`; updateArmRow(arm, { status:'done', statusClass:'status-done', pulls:1, mean:agg.overall, best:agg.overall, detail: paramHtml + outputHtml + scoresHtml }); @@ -283,20 +356,23 @@ document.addEventListener('DOMContentLoaded', () => { for (const arm of arms){ updateArmRow(arm, { status:'running', statusClass:'status-running' }); } // init pull each arm for (const arm of arms){ + if (cancelled) break; appendLog(`Init pull -> ${JSON.stringify(arm)}`); const k = JSON.stringify(arm); const text = await generateOnce(apiKey, model, prompt, arm); const judgeResults = await Promise.all(Array.from({length: j}).map((_,i)=> judgeOnce(apiKey, model, text, arm, i+1))); const agg = aggregateScores(judgeResults); + recordResult(arm, text, agg); pulls[k] += 1; sums[k] += agg.overall; total += 1; if (agg.overall > best[k].overall) best[k] = {overall: agg.overall, text, detail: agg}; - const paramHtml = `
Params: ${escapeHtml(JSON.stringify(arm))}
`; + const paramHtml = `
Params
${escapeHtml(JSON.stringify(arm, null, 2))}
`; const outputHtml = `
${escapeHtml(text)}
`; const scoresHtml = `
Scores: ${escapeHtml(JSON.stringify(agg))}
`; updateArmRow(arm, { pulls:pulls[k], mean:(sums[k]/pulls[k]), best:best[k].overall, detail: paramHtml + outputHtml + scoresHtml }); if (typeof arm.temperature === 'number') addChartPoint(arm.temperature, agg.overall); } for (let i=0;i { const text = await generateOnce(apiKey, model, prompt, arm); const judgeResults = await Promise.all(Array.from({length: j}).map((_,i)=> judgeOnce(apiKey, model, text, arm, i+1))); const agg = aggregateScores(judgeResults); + recordResult(arm, text, agg); pulls[nextK] += 1; sums[nextK] += agg.overall; total += 1; if (agg.overall > best[nextK].overall) best[nextK] = {overall: agg.overall, text, detail: agg}; - const paramHtml = `
Params: ${escapeHtml(JSON.stringify(arm))}
`; + const paramHtml = `
Params
${escapeHtml(JSON.stringify(arm, null, 2))}
`; const outputHtml = `
${escapeHtml(text)}
`; const scoresHtml = `
Scores: ${escapeHtml(JSON.stringify(agg))}
`; updateArmRow(arm, { pulls:pulls[nextK], mean:(sums[nextK]/pulls[nextK]), best:best[nextK].overall, detail: paramHtml + outputHtml + scoresHtml }); @@ -333,10 +410,12 @@ document.addEventListener('DOMContentLoaded', () => { results.textContent = lines.join('\n'); } } - status.textContent = 'Done.'; + status.textContent = cancelled ? 'Stopped.' : 'Done.'; + enableRunButtons(false); } catch (e) { status.textContent = 'Error'; results.textContent = String(e?.message || e); + enableRunButtons(false); } }); }); diff --git a/docs/index.html b/docs/index.html index 4b4666a..a3ea548 100644 --- a/docs/index.html +++ b/docs/index.html @@ -4,6 +4,9 @@ AutoTemp — Research-Grade Hyperparameter Optimization + + + @@ -38,30 +41,48 @@
+
+
+
+
+
+
+
+ + +
+
+ + +
+
+ +
+
@@ -75,6 +96,10 @@
+
+ + +
Provide comma-separated values to sweep. The app will form the Cartesian product across lists and evaluate each hyperparameter arm.
@@ -85,6 +110,8 @@
+ +
diff --git a/docs/style.css b/docs/style.css index 63bc20c..bec4e71 100644 --- a/docs/style.css +++ b/docs/style.css @@ -1,24 +1,25 @@ -:root { --bg:#020b05; --panel:#03150e; --text:#b5f5d2; --accent:#00ff9c; --accent2:#13f1ff; --muted:#0a2a1f; } +:root { --bg:#020b05; --panel:#03150e; --text:#d8f5e7; --mutedText:#a4d7c1; --accent:#00ff9c; --accent2:#13f1ff; --muted:#0a2a1f; } *{ box-sizing:border-box } -body{ margin:0; font-family: ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,monospace; background: radial-gradient(1200px 800px at 20% 0%, #03150e, #020b05), #020b05; color:var(--text) } +body{ margin:0; font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; background: radial-gradient(1200px 800px at 20% 0%, #03150e, #020b05), #020b05; color:var(--text); line-height:1.5 } .container{ max-width:1100px; margin:0 auto; padding:24px } .header{ display:flex; align-items:center; justify-content:space-between; margin-bottom:16px } -.logo{ font-weight:900; color:var(--accent); letter-spacing:2px } +.logo{ font-weight:800; color:var(--accent); letter-spacing:1px } .subtitle{ color:var(--accent2); opacity:.9 } section{ background:linear-gradient(180deg, rgba(3,21,14,.9), rgba(2,11,5,.9)); padding:16px; border-radius:10px; margin-bottom:16px; border:1px solid #0b442f; box-shadow:0 0 30px rgba(0,255,156,.05) inset } .field{ margin-bottom:12px } .field label{ display:block; font-weight:700; margin-bottom:6px; color:#a5ffd6 } -.field input[type="text"], .field input[type="password"], .field input[type="number"], .field textarea { width:100%; padding:10px; border-radius:6px; border:1px solid #0b442f; background:#03150e; color:var(--text); outline:none; box-shadow:0 0 0 1px rgba(0,255,156,.05) inset } +.field input[type="text"], .field input[type="password"], .field input[type="number"], .field textarea { width:100%; padding:12px 12px; border-radius:10px; border:1px solid #0b442f; background:#03150e; color:var(--text); outline:none; box-shadow:0 0 0 1px rgba(0,255,156,.05) inset } .field input[type="text"]:focus, .field input[type="password"]:focus, .field textarea:focus { box-shadow:0 0 0 2px rgba(19,241,255,.25) inset } .field input[type="range"]{ width:100% } .inline{ display:inline-flex; align-items:center; gap:8px } .grid-2{ display:grid; grid-template-columns:1fr 1fr; gap:12px } .grid-3{ display:grid; grid-template-columns:1fr 1fr 1fr; gap:12px } .actions{ display:flex; align-items:center; gap:12px } -button{ background:linear-gradient(90deg, var(--accent), var(--accent2)); color:#00170e; font-weight:900; border:none; padding:10px 16px; border-radius:8px; cursor:pointer; box-shadow:0 0 15px rgba(0,255,156,.2) } +button{ background:linear-gradient(90deg, var(--accent), var(--accent2)); color:#00170e; font-weight:800; border:none; padding:12px 16px; border-radius:10px; cursor:pointer; box-shadow:0 0 15px rgba(0,255,156,.2) } button:hover{ filter:brightness(1.05) } +button[disabled]{ opacity:.5; cursor:not-allowed } .terminal{ border:1px solid #0b442f; background:#010a06; box-shadow:0 0 40px rgba(0,255,156,.06) inset } -pre{ white-space:pre-wrap; background:#010a06; padding:12px; border-radius:8px; border:1px dashed #0b442f } +pre{ white-space:pre-wrap; background:#010a06; padding:14px; border-radius:10px; border:1px dashed #0b442f; font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; font-size:13px } .table-wrap{ overflow:auto; } .arms-table{ width:100%; border-collapse:collapse; font-size:13px } .arms-table th,.arms-table td{ border:1px dashed #0b442f; padding:6px 8px; vertical-align:top } @@ -29,11 +30,17 @@ pre{ white-space:pre-wrap; background:#010a06; padding:12px; border-radius:8px; .status-wait{ color:#a5ffd6 } .log{ max-height:260px; overflow:auto } .chart-wrap{ background:#010a06; border:1px dashed #0b442f; border-radius:8px; padding:8px; margin-bottom:12px } -.arm-params{ font-size:12px; color:#a5ffd6; margin-bottom:8px } -.arm-params code{ background:#03150e; padding:2px 4px; border:1px solid #0b442f; border-radius:4px } -.arm-output-box{ background:#0f1620; border:1px solid #0b442f; border-radius:8px; padding:16px; margin:10px auto; max-width:760px; box-shadow:0 0 20px rgba(0,255,156,.08) inset } +.dotbar{ position:relative; height:28px; border:1px dashed #0b442f; border-radius:999px; margin-top:8px; background:linear-gradient(90deg, rgba(0,255,156,.05), rgba(19,241,255,.05)) } +.dotbar .range{ position:absolute; top:50%; left:8px; right:8px; height:2px; background:#0b442f; transform:translateY(-50%); } +.dotbar .dot{ position:absolute; top:50%; width:14px; height:14px; background:#00ff9c; border-radius:50%; transform:translate(-50%,-50%); cursor:pointer; box-shadow:0 0 8px rgba(0,255,156,.4) } +.dotbar .dot.inactive{ background:#063a2a; box-shadow:none } +.dotbar .labels{ position:absolute; top:100%; left:0; right:0; display:flex; justify-content:space-between; font-size:11px; color:#a5ffd6; margin-top:4px } +.arm-params{ font-size:12px; color:var(--mutedText); margin:0 0 10px 0 } +.arm-params .label{ font-weight:700; color:#a5ffd6; margin-right:6px } +.arm-params pre{ margin:6px 0 0 0; background:#03150e; border:1px solid #0b442f; border-radius:8px; padding:10px } +.arm-output-box{ background:#0f1620; border:1px solid #0b442f; border-radius:12px; padding:18px; margin:12px auto; max-width:820px; box-shadow:0 0 24px rgba(0,255,156,.08) inset } .arm-output-box pre{ background:transparent; border:none; margin:0; padding:0; white-space:pre-wrap; color:#e4fff2; font-size:14px } -.arm-scores{ font-size:12px; margin-top:8px; color:#b5f5d2 } +.arm-scores{ font-size:12px; margin-top:10px; color:#bfeedd } .footer{ display:flex; align-items:center; gap:10px; opacity:.85 } .blink{ width:8px; height:18px; background:var(--accent); animation: blink 1s infinite } .glow{ text-shadow:0 0 8px rgba(0,255,156,.35) }