From 7eee5e3c523ecfec5c619438c34a3c6b0b0240c6 Mon Sep 17 00:00:00 2001 From: tdurieux Date: Wed, 8 Aug 2018 11:08:17 +0200 Subject: [PATCH] fix: automatically detect binary files --- server.py | 35 ++++++++++++++++------------------- templates/repo.html | 4 ++-- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/server.py b/server.py index a72b478..0c60cf9 100644 --- a/server.py +++ b/server.py @@ -9,6 +9,7 @@ except ImportError: from urllib.parse import quote # Python 3+ import re import shutil +import string import base64 from datetime import datetime @@ -37,6 +38,20 @@ def clean_github_repository(repo): branch = split_repo[3] return username, repository, branch +TEXT_CHARACTERS = ''.join([chr(code) for code in range(32,127)] + list('\b\f\n\r\t')) +def istext(s, threshold=0.30): + # if s contains any null, it's not text: + if '\x00' in s: + return False + # an "empty" string is "text" (arbitrary but reasonable choice): + if not s: + return True + # Get the substring of s made up of non-text characters + translate_table = dict((ord(char), None) for char in TEXT_CHARACTERS) + binary_length = float(len(s.translate(None, TEXT_CHARACTERS))) + # s is 'text' if less than 30% of its characters are non-text ones: + return binary_length/len(s) <= threshold + class Anonymous_Github: def __init__(self, @@ -108,25 +123,7 @@ class Anonymous_Github: repository_configuration)) if ".jpg" in file.name or ".png" in file.name or ".png" in file.name or ".gif" in file.name: return Markup("%s" % (file.url, file.name)) - if ".txt" in file.name \ - or ".rtf" in file.name \ - or ".log" in file.name \ - or ".csv" in file.name \ - or ".xml" in file.name \ - or ".json" in file.name \ - or ".css" in file.name \ - or ".html" in file.name \ - or ".js" in file.name \ - or ".tex" in file.name \ - or ".java" in file.name \ - or ".php" in file.name \ - or ".c" in file.name \ - or ".h" in file.name \ - or ".lua" in file.name \ - or ".py" in file.name \ - or ".sh" in file.name \ - or ".gitignore" in file.name \ - or ".travis.yml" in file.name: + if istext(file.decoded_content): return Markup("
{}
")\ .format(Markup.escape(remove_terms(file.decoded_content.decode("utf-8"), repository_configuration))) return Markup("%s has an unknown extension, we are unable to anonymize it (known extensions md/txt/json/java/...)" % (file.name)) diff --git a/templates/repo.html b/templates/repo.html index d8a8c74..f7eacef 100644 --- a/templates/repo.html +++ b/templates/repo.html @@ -24,14 +24,14 @@ Root {% for item in path %} {% do current_path.append(item) %} - {{ item|remove_terms(repository) }} + {{ item|remove_terms(repository, False) }} {% endfor %}
{% for item in files %} {% endfor %}