mirror of
https://github.com/elder-plinius/P4RS3LT0NGV3.git
synced 2026-06-06 06:53:56 +02:00
Tokenizer tab: add OpenAI encodings via browser-safe gpt-tokenizer dynamic import; UI: fit all five Tokenade preset buttons on one line (tighter buttons)
This commit is contained in:
+3
-3
@@ -2023,12 +2023,12 @@ html {
|
||||
.tokenade-presets {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
gap: 6px;
|
||||
flex-wrap: nowrap;
|
||||
overflow-x: auto;
|
||||
overflow-x: visible;
|
||||
margin: 8px 0 12px 0;
|
||||
}
|
||||
.tokenade-presets .transform-button { flex: 0 0 auto; min-width: 160px; }
|
||||
.tokenade-presets .transform-button { flex: 0 0 auto; min-width: 136px; padding: 5px 8px; }
|
||||
|
||||
/* Quick picks panel */
|
||||
.carrier-quick-grid {
|
||||
|
||||
+4
-1
@@ -204,7 +204,10 @@
|
||||
<select v-model="tokenizerEngine" @change="runTokenizer">
|
||||
<option value="byte">UTF-8 bytes</option>
|
||||
<option value="word">Naive words</option>
|
||||
<option value="gpt3">Experimental: gpt-3-encoder (if available)</option>
|
||||
<option value="cl100k">OpenAI: cl100k_base (GPT‑3.5/4)</option>
|
||||
<option value="o200k">OpenAI: o200k_base (GPT‑4o)</option>
|
||||
<option value="p50k">OpenAI: p50k_base</option>
|
||||
<option value="r50k">OpenAI: r50k_base</option>
|
||||
</select>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
@@ -1909,7 +1909,7 @@ window.app = new Vue({
|
||||
}
|
||||
,
|
||||
// Tokenizer visualization
|
||||
runTokenizer() {
|
||||
async runTokenizer() {
|
||||
const text = this.tokenizerInput || '';
|
||||
const engine = this.tokenizerEngine;
|
||||
const tokens = [];
|
||||
@@ -1925,15 +1925,20 @@ window.app = new Vue({
|
||||
// Naive word split incl. punctuation
|
||||
const parts = text.split(/(\s+|[\.,!?:;()\[\]{}])/);
|
||||
for (const p of parts) { if (p) tokens.push({ text: p }); }
|
||||
} else if (engine === 'gpt3' && window.gpt3enc && window.gpt3enc.encode) {
|
||||
} else if (['cl100k','o200k','p50k','r50k'].includes(engine)) {
|
||||
try {
|
||||
const ids = window.gpt3enc.encode(text);
|
||||
if (!window.gptTok) {
|
||||
window.gptTok = await import('https://cdn.jsdelivr.net/npm/gpt-tokenizer@2/+esm');
|
||||
}
|
||||
const map = { cl100k: 'cl100k_base', o200k: 'o200k_base', p50k: 'p50k_base', r50k: 'r50k_base' };
|
||||
const enc = map[engine];
|
||||
const ids = window.gptTok.encode(text, enc);
|
||||
for (const id of ids) {
|
||||
const piece = window.gpt3enc.decode([id]);
|
||||
const piece = window.gptTok.decode([id], enc);
|
||||
tokens.push({ id, text: piece });
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('gpt-3-encoder not available', e);
|
||||
console.warn('Failed to load/use gpt-tokenizer; falling back to bytes', e);
|
||||
this.tokenizerEngine = 'byte';
|
||||
return this.runTokenizer();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user