Files
P4RS3LT0NGV3/temp_fetch_glitch.js
2026-03-20 22:57:14 -07:00

72 lines
2.8 KiB
JavaScript

const fs = require('fs');
// Snapshot of the JSON fetched from the repository, assembled section by
// section. The glitch_tokens section is intentionally left empty here.

// Library-level metadata: identity, catalog size, and the listed sources.
const libraryMetadata = {
  name: "AGGREGLITCH",
  version: "1.0.0",
  description: "The Complete Glitch Token Library - All Known LLM Vocabulary Anomalies",
  tagline: "GOTTA CATCH 'EM ALL",
  total_tokens_cataloged: 7895,
  last_updated: "2025-12-27",
  sources: [
    "SolidGoldMagikarp (LessWrong, 2023) - Rumbelow & Watkins",
    "SolidGoldMagikarp II & III Technical Details (LessWrong)",
    "Glitch Token Catalog - Full Clear (LessWrong, 2024)",
    "SmartyHeaderCode: Anomalous Tokens GPT3.5/GPT-4 (LessWrong)",
    "The petertodd/Leilan Phenomenon (LessWrong)",
    "Mapping the Semantic Void (LessWrong)",
    "BPE Subtoken Artifacts (LessWrong)",
    "Anomalous Tokens in DeepSeek-V3/r1 (Substack, 2025)",
    "Glitch Tokens in LLMs (ACM, 2024)",
    "GlitchMiner: Gradient-based Detection (arXiv, 2024)",
    "GPT-4o Chinese Token Pollution (MIT Tech Review, 2024)",
    "NVIDIA Garak LLM Vulnerability Scanner",
    "Dropbox Prompt Injection Research (2023)",
  ],
  usage: "Import this library to test LLMs for glitch token vulnerabilities",
};

// Behavior category label -> human-readable description.
const behaviorCategories = {
  UNSPEAKABLE: "Model CANNOT repeat these tokens - substitutes, evades, or produces garbage",
  POLYSEMANTIC: "Token interpreted as DIFFERENT words each time, even at temperature 0",
  GLITCHED_SPELLING: "Model CAN repeat but CANNOT spell correctly",
  CONTEXT_CORRUPTOR: "Token corrupts surrounding context when present",
  LOOP_INDUCER: "Causes infinite generation loops - DoS potential",
  IDENTITY_DISRUPTOR: "Causes model to lose sense of identity",
  FRAGMENT: "Orphaned BPE subtoken that glitches without parent",
  UNREACHABLE: "Exists in vocabulary but pre-tokenization prevents use",
};

// Tokenizer id -> display name, vocabulary size (where recorded), and models.
// The llama and deepseek entries carry no vocab_size field.
const tokenizerProfiles = {
  r50k_base: {
    name: "GPT-2/GPT-3 Tokenizer",
    vocab_size: 50257,
    models: ["GPT-2", "GPT-3", "GPT-J"],
  },
  cl100k_base: {
    name: "GPT-3.5/GPT-4 Tokenizer",
    vocab_size: 100256,
    models: ["GPT-3.5-turbo", "GPT-4", "GPT-4-turbo"],
  },
  o200k_base: {
    name: "GPT-4o Tokenizer",
    vocab_size: 200000,
    models: ["GPT-4o", "GPT-4o-mini"],
  },
  llama: {
    name: "LLaMA Tokenizer",
    models: ["Llama-2-7b", "Llama-2-13b", "Llama-3"],
  },
  deepseek: {
    name: "DeepSeek Tokenizer",
    models: ["DeepSeek-V3", "DeepSeek-r1"],
  },
};

// Top-level document; key order mirrors the fetched JSON.
const jsonData = {
  _metadata: libraryMetadata,
  behavior_categories: behaviorCategories,
  tokenizers: tokenizerProfiles,
  glitch_tokens: {},
};
// The complete glitch_tokens payload is too large to embed in this file;
// it still has to be fetched from the URL or copied over from the browser.
const placeholderNotice = 'This is a placeholder. The actual data needs to be fetched from the repository.';
console.log(placeholderNotice);

// Report the catalog size recorded in the metadata.
const { total_tokens_cataloged: expectedTokenTotal } = jsonData._metadata;
console.log('Total tokens should be:', expectedTokenTotal);