// P4RS3LT0NGV3/temp_fetch_glitch.js

const fs = require('fs');

// Hand-copied snapshot of the AGGREGLITCH library's JSON document.
// Only the descriptive sections are filled in here: `_metadata`,
// `behavior_categories` and `tokenizers`. The `glitch_tokens` section is an
// empty object in this file.
const jsonData = {
  // Library identity, provenance and bookkeeping.
  _metadata: {
    name: 'AGGREGLITCH',
    version: '1.0.0',
    description: 'The Complete Glitch Token Library - All Known LLM Vocabulary Anomalies',
    tagline: "GOTTA CATCH 'EM ALL",
    total_tokens_cataloged: 7895,
    last_updated: '2025-12-27',
    sources: [
      'SolidGoldMagikarp (LessWrong, 2023) - Rumbelow & Watkins',
      'SolidGoldMagikarp II & III Technical Details (LessWrong)',
      'Glitch Token Catalog - Full Clear (LessWrong, 2024)',
      'SmartyHeaderCode: Anomalous Tokens GPT3.5/GPT-4 (LessWrong)',
      'The petertodd/Leilan Phenomenon (LessWrong)',
      'Mapping the Semantic Void (LessWrong)',
      'BPE Subtoken Artifacts (LessWrong)',
      'Anomalous Tokens in DeepSeek-V3/r1 (Substack, 2025)',
      'Glitch Tokens in LLMs (ACM, 2024)',
      'GlitchMiner: Gradient-based Detection (arXiv, 2024)',
      'GPT-4o Chinese Token Pollution (MIT Tech Review, 2024)',
      'NVIDIA Garak LLM Vulnerability Scanner',
      'Dropbox Prompt Injection Research (2023)',
    ],
    usage: 'Import this library to test LLMs for glitch token vulnerabilities',
  },

  // Category label -> human-readable description of the behavior.
  behavior_categories: {
    UNSPEAKABLE: 'Model CANNOT repeat these tokens - substitutes, evades, or produces garbage',
    POLYSEMANTIC: 'Token interpreted as DIFFERENT words each time, even at temperature 0',
    GLITCHED_SPELLING: 'Model CAN repeat but CANNOT spell correctly',
    CONTEXT_CORRUPTOR: 'Token corrupts surrounding context when present',
    LOOP_INDUCER: 'Causes infinite generation loops - DoS potential',
    IDENTITY_DISRUPTOR: 'Causes model to lose sense of identity',
    FRAGMENT: 'Orphaned BPE subtoken that glitches without parent',
    UNREACHABLE: 'Exists in vocabulary but pre-tokenization prevents use',
  },

  // Tokenizer id -> display name, model list and (for some entries) vocab_size.
  // The `llama` and `deepseek` entries carry no vocab_size in this snapshot.
  tokenizers: {
    r50k_base: {
      name: 'GPT-2/GPT-3 Tokenizer',
      vocab_size: 50257,
      models: [
        'GPT-2',
        'GPT-3',
        'GPT-J',
      ],
    },
    cl100k_base: {
      name: 'GPT-3.5/GPT-4 Tokenizer',
      vocab_size: 100256,
      models: [
        'GPT-3.5-turbo',
        'GPT-4',
        'GPT-4-turbo',
      ],
    },
    o200k_base: {
      name: 'GPT-4o Tokenizer',
      vocab_size: 200000,
      models: [
        'GPT-4o',
        'GPT-4o-mini',
      ],
    },
    llama: {
      name: 'LLaMA Tokenizer',
      models: [
        'Llama-2-7b',
        'Llama-2-13b',
        'Llama-3',
      ],
    },
    deepseek: {
      name: 'DeepSeek Tokenizer',
      models: [
        'DeepSeek-V3',
        'DeepSeek-r1',
      ],
    },
  },

  // Intentionally empty here; the token entries are not embedded in this file.
  glitch_tokens: {},
};

// The complete glitch_tokens payload is too big to embed in this file, so it
// is left out above. It still has to be retrieved from the source URL or
// pasted in from the browser. Until then, print a reminder together with the
// token count recorded in the metadata.
console.log('This is a placeholder. The actual data needs to be fetched from the repository.');
const { total_tokens_cataloged: expectedTokenTotal } = jsonData._metadata;
console.log('Total tokens should be:', expectedTokenTotal);