P4RS3LT0NGV3/js/tools/TokenizerTool.js
Dustin Farley dc10a90851 refactor: migrate to modular tool-based architecture
- Implement tool registry system with individual tool modules
- Reorganize transformers into categorized source modules
- Remove emojiLibrary.js, consolidate into EmojiUtils and emojiData
- Fix mobile close button and tooltip functionality
- Add build system for transforms and emoji data
- Migrate from Python backend to pure JavaScript
- Add comprehensive documentation and testing
- Improve code organization and maintainability
- Ignore generated files (transforms-bundle.js, emojiData.js)
2025-12-02 20:26:32 -08:00
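
The commit message above mentions a tool registry that individual tool modules (like the TokenizerTool below) plug into. The registry itself is not shown on this page; the following is only a minimal sketch of how such a registry might collect and order tools. The registerTool name, and the assumption that the Tool base class copies config fields such as order onto the instance, are illustrative rather than code from this repo.

// Illustrative sketch only: a registry that collects tool instances and
// sorts them by their declared `order` (assumes the Tool base class copies
// its config fields onto the instance).
const toolRegistry = [];
function registerTool(tool) {
  toolRegistry.push(tool);
  toolRegistry.sort((a, b) => a.order - b.order);
}
// e.g. registerTool(new TokenizerTool());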


/**
 * Tokenizer Tool - visualizes how input text splits into tokens
 * (raw UTF-8 bytes, naive word splitting, or GPT BPE encodings).
 */
class TokenizerTool extends Tool {
  constructor() {
    super({
      id: 'tokenizer',
      name: 'Tokenizer',
      icon: 'fa-layer-group',
      title: 'Tokenizer visualization',
      order: 6
    });
  }

  getVueData() {
    return {
      tokenizerInput: '',
      tokenizerEngine: 'byte',
      tokenizerTokens: [],
      tokenizerCharCount: 0,
      tokenizerWordCount: 0
    };
  }
  getVueMethods() {
    return {
      runTokenizer: async function() {
        const text = this.tokenizerInput || '';
        const engine = this.tokenizerEngine;
        const tokens = [];
        // Nothing to tokenize: clear any previous results.
        if (!text) {
          this.tokenizerTokens = [];
          this.tokenizerCharCount = 0;
          this.tokenizerWordCount = 0;
          return;
        }
        if (engine === 'byte') {
          // UTF-8 byte tokens, rendered as hex.
          const encoder = new TextEncoder();
          const bytes = encoder.encode(text);
          for (let i = 0; i < bytes.length; i++) {
            tokens.push({ id: bytes[i], text: `0x${bytes[i].toString(16).padStart(2, '0')}` });
          }
        } else if (engine === 'word') {
          // Naive word split: whitespace and punctuation become their own tokens.
          const parts = text.split(/(\s+|[\.,!?:;()\[\]{}])/);
          for (const p of parts) {
            if (p) tokens.push({ text: p });
          }
        } else if (['cl100k', 'o200k', 'p50k', 'r50k'].includes(engine)) {
          // GPT BPE encodings via gpt-tokenizer, loaded on demand from the CDN
          // and cached per encoding on window.
          try {
            const map = { cl100k: 'cl100k_base', o200k: 'o200k_base', p50k: 'p50k_base', r50k: 'r50k_base' };
            const encoding = map[engine];
            window.gptTok = window.gptTok || {};
            if (!window.gptTok[encoding]) {
              // gpt-tokenizer exposes one module per encoding; the main entry's
              // encode() does not take an encoding name, so import the matching
              // sub-module instead of passing the name as a second argument.
              window.gptTok[encoding] = await import(`https://cdn.jsdelivr.net/npm/gpt-tokenizer@2/encoding/${encoding}/+esm`);
            }
            const tok = window.gptTok[encoding];
            const ids = tok.encode(text);
            for (const id of ids) {
              // Decode each id on its own so the UI can show the text piece it maps to.
              tokens.push({ id, text: tok.decode([id]) });
            }
          } catch (e) {
            console.warn('Failed to load/use gpt-tokenizer; falling back to bytes', e);
            this.tokenizerEngine = 'byte';
            return this.runTokenizer();
          }
        } else {
          // Unknown engine: fall back to byte tokens.
          const encoder = new TextEncoder();
          const bytes = encoder.encode(text);
          for (let i = 0; i < bytes.length; i++) {
            tokens.push({ id: bytes[i], text: `0x${bytes[i].toString(16).padStart(2, '0')}` });
          }
        }
        this.tokenizerTokens = tokens;
        // Count Unicode code points rather than UTF-16 code units.
        this.tokenizerCharCount = Array.from(text).length;
        const wordMatches = text.trim().match(/[^\s]+/g) || [];
        this.tokenizerWordCount = wordMatches.length;
      }
    };
  }
  getVueWatchers() {
    return {
      tokenizerInput() {
        this.runTokenizer();
      },
      tokenizerEngine() {
        this.runTokenizer();
      }
    };
  }

  onActivate(vueInstance) {
    vueInstance.$nextTick(() => vueInstance.runTokenizer());
  }
}
// Export
if (typeof module !== 'undefined' && module.exports) {
  module.exports = TokenizerTool;
} else {
  window.TokenizerTool = TokenizerTool;
}
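
How the host application consumes these hooks is not shown in this file. As a rough usage sketch, a Vue component might merge the tool's data, methods, and watchers into its own options and hand the component instance to onActivate; the componentOptions shape below is an assumption for illustration, not the repo's actual wiring.

// Rough usage sketch (assumed wiring, not from this repo): merge the tool's
// Vue hooks into a component's options object.
const tool = new TokenizerTool();
const componentOptions = {
  data() {
    return { ...tool.getVueData() };
  },
  methods: { ...tool.getVueMethods() },
  watch: { ...tool.getVueWatchers() },
  mounted() {
    // Give the tool a chance to run once the component instance exists.
    tool.onActivate(this);
  }
};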