// Mirror of https://github.com/elder-plinius/P4RS3LT0NGV3.git
// Synced 2026-04-21 19:26:13 +02:00 (8ff45957c8)
// Made-with: Cursor
// 72 lines, 2.8 KiB, JavaScript
const fs = require('fs');

/**
 * Placeholder copy of the AGGREGLITCH glitch-token library.
 *
 * Only the metadata, behavior-category descriptions and tokenizer summaries
 * are embedded here. `glitch_tokens` is intentionally left empty, so
 * `_metadata.total_tokens_cataloged` is a declared figure and is NOT derived
 * from the contents of this object.
 */
// The JSON data fetched from the repository
const jsonData = {
  "_metadata": {
    "name": "AGGREGLITCH",
    "version": "1.0.0",
    "description": "The Complete Glitch Token Library - All Known LLM Vocabulary Anomalies",
    "tagline": "GOTTA CATCH 'EM ALL",
    "total_tokens_cataloged": 7895,
    "last_updated": "2025-12-27",
    "sources": [
      "SolidGoldMagikarp (LessWrong, 2023) - Rumbelow & Watkins",
      "SolidGoldMagikarp II & III Technical Details (LessWrong)",
      "Glitch Token Catalog - Full Clear (LessWrong, 2024)",
      "SmartyHeaderCode: Anomalous Tokens GPT3.5/GPT-4 (LessWrong)",
      "The petertodd/Leilan Phenomenon (LessWrong)",
      "Mapping the Semantic Void (LessWrong)",
      "BPE Subtoken Artifacts (LessWrong)",
      "Anomalous Tokens in DeepSeek-V3/r1 (Substack, 2025)",
      "Glitch Tokens in LLMs (ACM, 2024)",
      "GlitchMiner: Gradient-based Detection (arXiv, 2024)",
      "GPT-4o Chinese Token Pollution (MIT Tech Review, 2024)",
      "NVIDIA Garak LLM Vulnerability Scanner",
      "Dropbox Prompt Injection Research (2023)"
    ],
    "usage": "Import this library to test LLMs for glitch token vulnerabilities"
  },
  // Human-readable description for each behavior label.
  "behavior_categories": {
    "UNSPEAKABLE": "Model CANNOT repeat these tokens - substitutes, evades, or produces garbage",
    "POLYSEMANTIC": "Token interpreted as DIFFERENT words each time, even at temperature 0",
    "GLITCHED_SPELLING": "Model CAN repeat but CANNOT spell correctly",
    "CONTEXT_CORRUPTOR": "Token corrupts surrounding context when present",
    "LOOP_INDUCER": "Causes infinite generation loops - DoS potential",
    "IDENTITY_DISRUPTOR": "Causes model to lose sense of identity",
    "FRAGMENT": "Orphaned BPE subtoken that glitches without parent",
    "UNREACHABLE": "Exists in vocabulary but pre-tokenization prevents use"
  },
  // Tokenizer summaries; note that "llama" and "deepseek" carry no vocab_size.
  "tokenizers": {
    "r50k_base": {
      "name": "GPT-2/GPT-3 Tokenizer",
      "vocab_size": 50257,
      "models": ["GPT-2", "GPT-3", "GPT-J"]
    },
    "cl100k_base": {
      "name": "GPT-3.5/GPT-4 Tokenizer",
      "vocab_size": 100256,
      "models": ["GPT-3.5-turbo", "GPT-4", "GPT-4-turbo"]
    },
    "o200k_base": {
      "name": "GPT-4o Tokenizer",
      "vocab_size": 200000,
      "models": ["GPT-4o", "GPT-4o-mini"]
    },
    "llama": {
      "name": "LLaMA Tokenizer",
      "models": ["Llama-2-7b", "Llama-2-13b", "Llama-3"]
    },
    "deepseek": {
      "name": "DeepSeek Tokenizer",
      "models": ["DeepSeek-V3", "DeepSeek-r1"]
    }
  },
  // Empty on purpose: the real token table is not embedded in this script.
  "glitch_tokens": {}
};

// Note: The full glitch_tokens data is too large to include here
// We'll need to fetch it from the URL or copy it from the browser
console.log('This is a placeholder. The actual data needs to be fetched from the repository.');
// Reports the declared metadata count, not the size of jsonData.glitch_tokens.
console.log('Total tokens should be:', jsonData._metadata.total_tokens_cataloged);