From b2f44417d3d7401b7c2d79c371545b62f38c6b37 Mon Sep 17 00:00:00 2001 From: Remi Rampin Date: Tue, 12 Mar 2019 12:22:38 -0400 Subject: [PATCH] Anonymize whole URLs (#19) --- server.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/server.py b/server.py index 2e77ce6..0de7979 100644 --- a/server.py +++ b/server.py @@ -93,7 +93,7 @@ class Anonymous_Github: application.jinja_env.add_extension('jinja2.ext.do') @application.template_filter('remove_terms', ) - def remove_terms(content, repository_configuration, word_boundaries=True): + def remove_terms(content, repository_configuration, word_boundaries=True, whole_urls=True): """ remove the blacklisted terms from the content :param content: the content to anonymize @@ -108,9 +108,20 @@ class Anonymous_Github: content = re.compile(repo, re.IGNORECASE).sub("%s/repository/%s" % (self.public_url, repository_configuration["id"]), content) for term in repository_configuration['terms']: if word_boundaries: - content = re.compile(r'\b%s\b' % term, re.IGNORECASE).sub("XXX", content) + regex = re.compile(r'\b%s\b' % term, re.IGNORECASE) else: - content = re.compile(term, re.IGNORECASE).sub("XXX", content) + regex = re.compile(term, re.IGNORECASE) + + if whole_urls: + def sub_url(m): + if regex.search(m.group(0)): + return 'XXX' + return m.group(0) + + url_regex = re.compile('\\b((https?|ftp|file)://)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\\b') + content = url_regex.sub(sub_url, content) + + content = regex.sub("XXX", content) return content @application.template_filter('file_render', )