fix: automatically detect binary files

This commit is contained in:
tdurieux
2018-08-08 11:08:17 +02:00
parent a176998c5a
commit 7eee5e3c52
2 changed files with 18 additions and 21 deletions

View File

@@ -9,6 +9,7 @@ except ImportError:
from urllib.parse import quote # Python 3+
import re
import shutil
import string
import base64
from datetime import datetime
@@ -37,6 +38,20 @@ def clean_github_repository(repo):
branch = split_repo[3]
return username, repository, branch
# Characters treated as "text": printable ASCII (codes 32..126) plus the
# backspace, form-feed, newline, carriage-return and tab control characters.
TEXT_CHARACTERS = ''.join([chr(code) for code in range(32, 127)] + list('\b\f\n\r\t'))
# Same characters as a byte string, usable as the ``delete`` argument of
# ``bytes.translate`` (Python 3) and ``str.translate`` (Python 2).
_TEXT_BYTES = TEXT_CHARACTERS.encode('ascii')
# Same characters as a set, for per-character membership tests on unicode text.
_TEXT_CHARACTER_SET = frozenset(TEXT_CHARACTERS)


def istext(s, threshold=0.30):
    """Guess whether ``s`` holds text rather than binary data.

    Works on byte strings (``bytes``/``bytearray``; ``str`` on Python 2) as
    well as on unicode strings, on both Python 2 and Python 3.

    :param s: the content to inspect.
    :param threshold: maximum accepted ratio of non-text characters
        (characters outside ``TEXT_CHARACTERS``) for ``s`` to count as text.
    :return: ``False`` if ``s`` contains a NUL character, ``True`` if ``s``
        is empty, otherwise whether the ratio of non-text characters is at
        most ``threshold``.
    """
    is_bytes = isinstance(s, (bytes, bytearray))
    # The NUL probe must have the same type as s: on Python 3, mixing str and
    # bytes in an ``in`` test raises TypeError.
    nul = b'\x00' if is_bytes else u'\x00'
    # if s contains any null, it's not text:
    if nul in s:
        return False
    # an "empty" string is "text" (arbitrary but reasonable choice):
    if not s:
        return True
    # Count the characters of s that are not text characters.
    if is_bytes:
        # translate(None, delete) drops every text byte, leaving the others.
        binary_length = len(s.translate(None, _TEXT_BYTES))
    else:
        binary_length = sum(1 for char in s if char not in _TEXT_CHARACTER_SET)
    # s is 'text' if at most `threshold` of its characters are non-text ones:
    return float(binary_length) / len(s) <= threshold
class Anonymous_Github:
def __init__(self,
@@ -108,25 +123,7 @@ class Anonymous_Github:
repository_configuration))
if ".jpg" in file.name or ".png" in file.name or ".png" in file.name or ".gif" in file.name:
return Markup("<img src='%s' alt='%s'>" % (file.url, file.name))
if ".txt" in file.name \
or ".rtf" in file.name \
or ".log" in file.name \
or ".csv" in file.name \
or ".xml" in file.name \
or ".json" in file.name \
or ".css" in file.name \
or ".html" in file.name \
or ".js" in file.name \
or ".tex" in file.name \
or ".java" in file.name \
or ".php" in file.name \
or ".c" in file.name \
or ".h" in file.name \
or ".lua" in file.name \
or ".py" in file.name \
or ".sh" in file.name \
or ".gitignore" in file.name \
or ".travis.yml" in file.name:
if istext(file.decoded_content):
return Markup("<pre><code>{}</code></pre>")\
.format(Markup.escape(remove_terms(file.decoded_content.decode("utf-8"), repository_configuration)))
return Markup("<b>%s has an unknown extension, we are unable to anonymize it (known extensions md/txt/json/java/...)</b>" % (file.name))

View File

@@ -24,14 +24,14 @@
<span class="path"><a href="{{ current_path|join("/") }}">Root</a></span>
{% for item in path %}
{% do current_path.append(item) %}
<span class="path"><a href="{{ current_path|join("/") }}">{{ item|remove_terms(repository) }}</a></span>
<span class="path"><a href="{{ current_path|join("/") }}">{{ item|remove_terms(repository, False) }}</a></span>
{% endfor %}
</div>
<div class="files">
{% for item in files %}
<div class="{{ item.type }} {% if item.path == current_file.name %}active{% endif %}">
<a href="/repository/{{ current_repository }}/{{ path_directory }}{% if path_directory|length > 0%}/{% endif %}{{ item.path }}{% if item.type == 'tree'%}/{% endif %}">
{{ item.path|remove_terms(repository) }}
{{ item.path|remove_terms(repository, False) }}
</a>
</div>
{% endfor %}