test: Add secret detection benchmark dataset and ground truth

Add comprehensive benchmark dataset with 32 documented secrets for testing
secret detection workflows (gitleaks, trufflehog, llm_secret_detection).

- Add test_projects/secret_detection_benchmark/ with 19 test files
- Add ground truth JSON with precise line-by-line secret mappings
- Update .gitignore with exceptions for benchmark files (not real secrets)

Dataset breakdown:
- 12 Easy secrets (standard patterns)
- 10 Medium secrets (obfuscated)
- 10 Hard secrets (well hidden)
This commit is contained in:
tduhamel42
2025-10-16 11:46:28 +02:00
parent 87e3262832
commit 3be4d34531
22 changed files with 773 additions and 0 deletions
@@ -0,0 +1,344 @@
{
"description": "Ground truth dataset for secret detection benchmarking - Exactly 32 secrets",
"version": "1.1.0",
"total_secrets": 32,
"secrets_by_difficulty": {
"easy": 12,
"medium": 10,
"hard": 10
},
"secrets": [
{
"id": 1,
"file": ".env",
"line": 3,
"difficulty": "easy",
"type": "aws_access_key",
"value": "AKIAIOSFODNN7EXAMPLE",
"severity": "critical"
},
{
"id": 2,
"file": ".env",
"line": 4,
"difficulty": "easy",
"type": "aws_secret_access_key",
"value": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
"severity": "critical"
},
{
"id": 3,
"file": "config/settings.py",
"line": 6,
"difficulty": "easy",
"type": "github_pat",
"value": "ghp_vR8jK2mN4pQ6tX9bC3wY7zA1eF5hI8kL",
"severity": "critical"
},
{
"id": 4,
"file": "config/settings.py",
"line": 9,
"difficulty": "easy",
"type": "stripe_api_key",
"value": "sk_live_51MabcdefghijklmnopqrstuvwxyzABCDEF123456789",
"severity": "critical"
},
{
"id": 5,
"file": "config/settings.py",
"line": 17,
"difficulty": "easy",
"type": "database_password",
"value": "ProdDB_P@ssw0rd_2024_Secure!",
"severity": "critical"
},
{
"id": 6,
"file": "src/app.py",
"line": 6,
"difficulty": "easy",
"type": "jwt_secret",
"value": "my-super-secret-jwt-key-do-not-share-2024",
"severity": "critical"
},
{
"id": 7,
"file": "config/database.yaml",
"line": 7,
"difficulty": "easy",
"type": "azure_storage_key",
"value": "DefaultEndpointsProtocol=https;AccountName=prodstore;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;EndpointSuffix=core.windows.net",
"severity": "critical"
},
{
"id": 8,
"file": "scripts/webhook.js",
"line": 4,
"difficulty": "easy",
"type": "slack_webhook",
"value": "https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXX",
"severity": "high"
},
{
"id": 9,
"file": "config/app.properties",
"line": 6,
"difficulty": "easy",
"type": "api_key",
"value": "sk_test_4eC39HqLyjWDarjtT1zdp7dc",
"severity": "high"
},
{
"id": 10,
"file": "id_rsa",
"line": 1,
"difficulty": "easy",
"type": "ssh_private_key",
"value": "-----BEGIN OPENSSH PRIVATE KEY-----",
"severity": "critical"
},
{
"id": 11,
"file": "config/oauth.json",
"line": 4,
"difficulty": "easy",
"type": "oauth_client_secret",
"value": "GOCSPX-Ab12Cd34Ef56Gh78Ij90Kl12",
"severity": "critical"
},
{
"id": 12,
"file": "src/Main.java",
"line": 5,
"difficulty": "easy",
"type": "google_oauth_secret",
"value": "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",
"severity": "critical"
},
{
"id": 13,
"file": "src/config.py",
"line": 7,
"difficulty": "medium",
"type": "aws_access_key_base64",
"value": "QUtJQUlPU0ZPRE5ON0VYQU1QTEU=",
"decoded": "AKIAIOSFODNN7EXAMPLE",
"severity": "critical"
},
{
"id": 14,
"file": "src/config.py",
"line": 10,
"difficulty": "medium",
"type": "api_token_hex",
"value": "6170695f746f6b656e5f616263313233787977373839",
"decoded": "api_token_abc123xyw789",
"severity": "high"
},
{
"id": 15,
"file": "src/config.py",
"line": 16,
"difficulty": "medium",
"type": "database_password_concatenated",
"value": "MySecurePassword2024!",
"note": "Built from DB_PASS_PART1 + DB_PASS_PART2 + DB_PASS_PART3",
"severity": "critical"
},
{
"id": 16,
"file": "scripts/deploy.sh",
"line": 5,
"difficulty": "medium",
"type": "api_key_export",
"value": "sk_prod_1234567890abcdefghijklmnopqrstuvwxyz",
"severity": "critical"
},
{
"id": 17,
"file": "scripts/deploy.sh",
"line": 11,
"difficulty": "medium",
"type": "database_password_url_encoded",
"value": "mysql://admin:MyP%40ssw0rd%21@db.example.com:3306/prod",
"decoded": "mysql://admin:MyP@ssw0rd!@db.example.com:3306/prod",
"note": "In comment",
"severity": "critical"
},
{
"id": 18,
"file": "config/keys.yaml",
"line": 6,
"difficulty": "medium",
"type": "rsa_private_key_multiline",
"value": "-----BEGIN RSA PRIVATE KEY-----",
"note": "Multi-line YAML literal block",
"severity": "critical"
},
{
"id": 19,
"file": "config/keys.yaml",
"line": 11,
"difficulty": "medium",
"type": "api_token_unicode",
"value": "tøkęn_śęçrėt_ẃïth_ŭñïçődė_123456",
"severity": "high"
},
{
"id": 20,
"file": "src/database.sql",
"line": 6,
"difficulty": "medium",
"type": "database_connection_string",
"value": "postgresql://admin:Pr0dDB_S3cr3t_P@ss@db.prod.example.com:5432/prod_db",
"note": "In SQL comment",
"severity": "critical"
},
{
"id": 21,
"file": "config/legacy.ini",
"line": 3,
"difficulty": "medium",
"type": "database_password",
"value": "L3g@cy_DB_P@ssw0rd_2023",
"severity": "critical"
},
{
"id": 22,
"file": "config/legacy.ini",
"line": 7,
"difficulty": "medium",
"type": "api_key_commented",
"value": "backup_key_xyz789abc123def456ghi",
"note": "Commented backup key",
"severity": "high"
},
{
"id": 23,
"file": "src/obfuscated.py",
"line": 7,
"difficulty": "hard",
"type": "stripe_key_rot13",
"value": "fx_yvir_frperg_xrl_12345",
"decoded": "sk_live_secret_key_12345",
"severity": "critical"
},
{
"id": 24,
"file": "src/obfuscated.py",
"line": 10,
"difficulty": "hard",
"type": "github_token_binary",
"value": "b'\\x67\\x68\\x70\\x5f\\x4d\\x79\\x47\\x69\\x74\\x48\\x75\\x62\\x54\\x6f\\x6b\\x65\\x6e\\x31\\x32\\x33\\x34\\x35\\x36'",
"decoded": "ghp_MyGitHubToken123456",
"severity": "critical"
},
{
"id": 25,
"file": "src/obfuscated.py",
"line": 13,
"difficulty": "hard",
"type": "aws_secret_char_array",
"value": "['A','W','S','_','S','E','C','R','E','T','_','K','E','Y','_','X','Y','Z','7','8','9']",
"decoded": "AWS_SECRET_KEY_XYZ789",
"severity": "critical"
},
{
"id": 26,
"file": "src/obfuscated.py",
"line": 17,
"difficulty": "hard",
"type": "api_token_reversed",
"value": "321cba_desrever_nekot_ipa",
"decoded": "api_token_reversed_abc123",
"severity": "high"
},
{
"id": 27,
"file": "src/advanced.js",
"line": 4,
"difficulty": "hard",
"type": "secret_template_string",
"value": "sk_prod_template_key_xyz",
"note": "Built from template literals",
"severity": "critical"
},
{
"id": 28,
"file": "src/advanced.js",
"line": 7,
"difficulty": "hard",
"type": "password_in_regex",
"value": "password_regex_secret_789",
"note": "Inside regex pattern",
"severity": "medium"
},
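{
"id": 29,
"file": "src/advanced.js",
"line": 10,
"difficulty": "hard",
"type": "api_key_xor",
"value": "[65,82,90,75,94,91,92,75,93,67,65,90,67,92,75,91,67,95]",
"decoded": "api_xor_secret_key",
"note": "XOR encrypted with key 42 ('decoded' is the intended plaintext; the listed byte array does not XOR-decode to it with key 42 - verify against src/advanced.js)",
"severity": "critical"
},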
{
"id": 30,
"file": "src/advanced.js",
"line": 17,
"difficulty": "hard",
"type": "api_key_escaped_json",
"value": "sk_escaped_json_key_456",
"note": "Escaped JSON within string",
"severity": "high"
},
{
"id": 31,
"file": "src/Crypto.go",
"line": 10,
"difficulty": "hard",
"type": "secret_in_heredoc",
"value": "golang_heredoc_secret_999",
"note": "In heredoc/multi-line string",
"severity": "high"
},
{
"id": 32,
"file": "src/Crypto.go",
"line": 15,
"difficulty": "hard",
"type": "stripe_key_typo",
"value": "strippe_sk_live_corrected_key",
"decoded": "stripe_sk_live_corrected_key",
"note": "Intentional typo corrected programmatically",
"severity": "critical"
}
],
"file_summary": {
".env": 2,
"config/settings.py": 3,
"src/app.py": 1,
"config/database.yaml": 1,
"scripts/webhook.js": 1,
"config/app.properties": 1,
"id_rsa": 1,
"config/oauth.json": 1,
"src/Main.java": 1,
"src/config.py": 3,
"scripts/deploy.sh": 2,
"config/keys.yaml": 2,
"src/database.sql": 1,
"config/legacy.ini": 2,
"src/obfuscated.py": 4,
"src/advanced.js": 4,
"src/Crypto.go": 2
},
"notes": {
"easy_secrets": "Standard patterns that any decent secret scanner should detect",
"medium_secrets": "Slightly obfuscated - base64, hex, concatenated, or in comments",
"hard_secrets": "Well hidden - ROT13, binary, XOR, reversed, split across constructs"
}
}