diff options
author | Wojciech Polak <polak@gnu.org> | 2019-09-10 22:01:50 +0200 |
---|---|---|
committer | Wojciech Polak <polak@gnu.org> | 2019-09-10 22:01:50 +0200 |
commit | c5f91b184ebbb54e717f73b7ec25b37b05f56bf0 (patch) | |
tree | 5597f598a65e235bf29e87cdfd82357c30fa4154 | |
parent | 6ee2cdd2573866484089da12a892ed1c888b5dc6 (diff) | |
download | glifestream-c5f91b184ebbb54e717f73b7ec25b37b05f56bf0.tar.gz glifestream-c5f91b184ebbb54e717f73b7ec25b37b05f56bf0.tar.bz2 |
Fix urlizetrunc regression
-rw-r--r-- | glifestream/filters/twyntax.py | 5 | ||||
-rw-r--r-- | glifestream/stream/templatetags/gls_filters.py | 14 | ||||
-rw-r--r-- | glifestream/utils/html.py | 112 |
3 files changed, 127 insertions, 4 deletions
diff --git a/glifestream/filters/twyntax.py b/glifestream/filters/twyntax.py index e2bf0b7..34134c7 100644 --- a/glifestream/filters/twyntax.py +++ b/glifestream/filters/twyntax.py @@ -14,15 +14,14 @@ # with this program. If not, see <http://www.gnu.org/licenses/>. import re -from django.template.defaultfilters import urlizetrunc - def parse(s, type='twitter'): + from glifestream.stream.templatetags.gls_filters import gls_urlizetrunc if type == 'twitter': s = s.split(': ', 1)[1] s = hash_tag(s, type) s = at_reply(s, type) - s = urlizetrunc(s, 45) + s = gls_urlizetrunc(s, 45) return s diff --git a/glifestream/stream/templatetags/gls_filters.py b/glifestream/stream/templatetags/gls_filters.py index 4be8f7b..6d69672 100644 --- a/glifestream/stream/templatetags/gls_filters.py +++ b/glifestream/stream/templatetags/gls_filters.py @@ -28,6 +28,7 @@ from django.utils.translation import ugettext as _ from django.utils.translation import ungettext from django.template.defaultfilters import date as ddate from django.template.defaultfilters import urlencode, stringfilter +from glifestream.utils.html import urlize as _urlize from glifestream.utils.slugify import slugify from glifestream.stream import media from glifestream.apis import * @@ -178,3 +179,16 @@ def fix_ampersands(value): def fix_ampersands_filter(value): """Replaces ampersands with ``&amp;`` entities.""" return fix_ampersands(value) + + +@register.filter('gls_urlizetrunc', is_safe=True, needs_autoescape=True) +@stringfilter +def gls_urlizetrunc(value, limit, autoescape=None): + """ + Converts URLs into clickable links, truncating URLs to the given character + limit, and adding 'rel=nofollow' attribute to discourage spamming. + + Argument: Length to truncate URLs to. 
+ """ + return mark_safe(_urlize(value, trim_url_limit=int(limit), nofollow=True, + autoescape=autoescape)) diff --git a/glifestream/utils/html.py b/glifestream/utils/html.py index ed75356..9ff2658 100644 --- a/glifestream/utils/html.py +++ b/glifestream/utils/html.py @@ -15,8 +15,10 @@ import re from django.utils import six -from django.utils.encoding import force_text +from django.utils.six.moves.urllib.parse import quote, unquote, urlsplit, urlunsplit +from django.utils.encoding import force_str, force_text from django.utils.functional import allow_lazy +from django.utils.safestring import mark_safe, SafeData try: from bs4 import BeautifulSoup @@ -57,3 +59,111 @@ def strip_entities(value): strip_entities = allow_lazy(strip_entities, six.text_type) + +## +## Code taken from Django 1.7 +## + +TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)', '"', '\''] +WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;'), ('"', '"'), ('\'', '\'')] +word_split_re = re.compile(r'(\s+)') +simple_url_re = re.compile(r'^https?://\[?\w', re.IGNORECASE) +simple_url_2_re = re.compile(r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)$', re.IGNORECASE) +simple_email_re = re.compile(r'^\S+@\S+\.\S+$') + +def smart_urlquote(url): + "Quotes a URL if it isn't already quoted." + # Handle IDN before quoting. + try: + scheme, netloc, path, query, fragment = urlsplit(url) + try: + netloc = netloc.encode('idna').decode('ascii') # IDN -> ACE + except UnicodeError: # invalid domain part + pass + else: + url = urlunsplit((scheme, netloc, path, query, fragment)) + except ValueError: + # invalid IPv6 URL (normally square brackets in hostname part). + pass + + url = unquote(force_str(url)) + url = quote(url, safe=b'!*\'();:@&=+$,/?#[]~') + + return force_text(url) + +def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): + """ + Converts any URLs in text into clickable links. + + Works on http://, https://, www. 
links, and also on links ending in one of + the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org). + Links can have trailing punctuation (periods, commas, close-parens) and + leading punctuation (opening parens) and it'll still do the right thing. + + If trim_url_limit is not None, the URLs in the link text longer than this + limit will be truncated to trim_url_limit-3 characters and appended with + an ellipsis. + + If nofollow is True, the links will get a rel="nofollow" attribute. + + If autoescape is True, the link text and URLs will be autoescaped. + """ + def trim_url(x, limit=trim_url_limit): + if limit is None or len(x) <= limit: + return x + return '%s...' % x[:max(0, limit - 3)] + safe_input = isinstance(text, SafeData) + words = word_split_re.split(force_text(text)) + for i, word in enumerate(words): + if '.' in word or '@' in word or ':' in word: + # Deal with punctuation. + lead, middle, trail = '', word, '' + for punctuation in TRAILING_PUNCTUATION: + if middle.endswith(punctuation): + middle = middle[:-len(punctuation)] + trail = punctuation + trail + for opening, closing in WRAPPING_PUNCTUATION: + if middle.startswith(opening): + middle = middle[len(opening):] + lead = lead + opening + # Keep parentheses at the end only if they're balanced. + if (middle.endswith(closing) + and middle.count(closing) == middle.count(opening) + 1): + middle = middle[:-len(closing)] + trail = closing + trail + + # Make URL we want to point to. + url = None + nofollow_attr = ' rel="nofollow"' if nofollow else '' + if simple_url_re.match(middle): + url = smart_urlquote(middle) + elif simple_url_2_re.match(middle): + url = smart_urlquote('http://%s' % middle) + elif ':' not in middle and simple_email_re.match(middle): + local, domain = middle.rsplit('@', 1) + try: + domain = domain.encode('idna').decode('ascii') + except UnicodeError: + continue + url = 'mailto:%s@%s' % (local, domain) + nofollow_attr = '' + + # Make link. 
+ if url: + trimmed = trim_url(middle) + if autoescape and not safe_input: + lead, trail = escape(lead), escape(trail) + url, trimmed = escape(url), escape(trimmed) + middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr, trimmed) + words[i] = mark_safe('%s%s%s' % (lead, middle, trail)) + else: + if safe_input: + words[i] = mark_safe(word) + elif autoescape: + words[i] = escape(word) + elif safe_input: + words[i] = mark_safe(word) + elif autoescape: + words[i] = escape(word) + return ''.join(words) +urlize = allow_lazy(urlize, six.text_type) |