From c8fdfbdceb9fb48b7318d3f2522a1300121d5af8 Mon Sep 17 00:00:00 2001
From: BigBodyCobain <43977454+BigBodyCobain@users.noreply.github.com>
Date: Mon, 25 May 2026 03:16:01 -0600
Subject: [PATCH] fix(secret-scan): exempt SSH known_hosts entries from
 leaked-key detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #331 introduced github.com host fingerprints pinned in
.gitlab-ci.yml's mirror-to-github before_script. The scanner flagged
them as embedded secrets and blocked CI:

  BLOCKED: Embedded secrets/tokens found in:
    .gitlab-ci.yml
      133: github.com ssh-ed25519 AAAA...
      135: github.com ssh-rsa AAAA...

These are PUBLIC host keys — the whole point of pinning known_hosts is
to publish the fingerprint widely so a MITM is detectable. They are
documented at https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/githubs-ssh-key-fingerprints
and committing them is the correct, secure practice.

Fix: add a KNOWN_HOSTS_LINE regex to the content-scan block that
recognizes lines shaped like `<host-or-ip>[,<host-or-ip>...] [<token>]
<algo> AAAA...` (the plain-text, unhashed entry format used in
~/.ssh/known_hosts; hashed `|1|...` entries are not matched) and
filters them out before flagging the file. Bare `ssh-rsa AAAA...`
lines without a host prefix are still caught — only the host-key
shape is exempt.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backend/scripts/scan-secrets.sh | 37 +++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/backend/scripts/scan-secrets.sh b/backend/scripts/scan-secrets.sh
index dfd8d38..50910d1 100644
--- a/backend/scripts/scan-secrets.sh
+++ b/backend/scripts/scan-secrets.sh
@@ -92,18 +92,37 @@ SECRET_REGEX+='pypi-[0-9a-zA-Z-]{50,}'                        # PyPI token
 TEXT_FILES=$(grep -ivE '\.(png|jpg|jpeg|gif|ico|svg|woff2?|ttf|eot|pbf|zip|tar|gz|db|sqlite|xlsx|pdf|mp[34]|wav|ogg|webm|webp|avif)$' "$FILELIST" | grep -v 'scan-secrets\.sh$' || true)
 
 if [[ -n "$TEXT_FILES" ]]; then
+    # Known-public exclusions: SSH known_hosts entries shaped like
+    # `<host-or-ip> [token] <algo> AAAA...` carry a host's PUBLIC key, which
+    # is safe to commit (pinning known_hosts exists so MITM is detectable).
+    # The pattern is applied to `grep -n` output, so it must consume the
+    # leading `<lineno>:` itself; else an indented bare key looks exempt.
+    KNOWN_HOSTS_LINE='^[0-9]+:[[:space:]]*[a-zA-Z0-9._:,*-]+([[:space:]]+[a-zA-Z0-9._:,*-]+)?[[:space:]]+(ssh-rsa|ssh-ed25519|ssh-dss|ecdsa-sha2-nistp256|ecdsa-sha2-nistp384|ecdsa-sha2-nistp521)[[:space:]]+AAAA'
+
     # Use grep with file list, skip missing/binary, limit output
     CONTENT_HITS=$(echo "$TEXT_FILES" | xargs grep -lE "$SECRET_REGEX" 2>/dev/null || true)
     if [[ -n "$CONTENT_HITS" ]]; then
-        echo -e "\n${RED}BLOCKED: Embedded secrets/tokens found in:${NC}"
-        echo "$CONTENT_HITS" | while read -r f; do
-            echo -e "  ${RED}$f${NC}"
-            # Show first matching line for context
-            grep -nE "$SECRET_REGEX" "$f" 2>/dev/null | head -2 | while read -r line; do
-                echo -e "    ${YELLOW}$line${NC}"
-            done
-        done
-        FOUND=1
+        REAL_HITS=""
+        REAL_REPORT=""
+        while IFS= read -r f; do
+            [[ -z "$f" ]] && continue
+            # Re-grep this file, but filter out known_hosts-style lines.
+            FILE_HITS=$(grep -nE "$SECRET_REGEX" "$f" 2>/dev/null | grep -vE "$KNOWN_HOSTS_LINE" || true)
+            if [[ -n "$FILE_HITS" ]]; then
+                REAL_HITS+="$f"$'\n'
+                REAL_REPORT+="  ${RED}$f${NC}"$'\n'
+                # Show first 2 matching lines for context
+                while IFS= read -r line; do
+                    [[ -z "$line" ]] && continue
+                    REAL_REPORT+="    ${YELLOW}$line${NC}"$'\n'
+                done < <(echo "$FILE_HITS" | head -2)
+            fi
+        done <<< "$CONTENT_HITS"
+        if [[ -n "$REAL_HITS" ]]; then
+            echo -e "\n${RED}BLOCKED: Embedded secrets/tokens found in:${NC}"
+            echo -en "$REAL_REPORT"
+            FOUND=1
+        fi
     fi
 fi