00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 """Escaping/unescaping methods for HTML, JSON, URLs, and others.
00018
00019 Also includes a few other miscellaneous string manipulation functions that
00020 have crept in over time.
00021 """
00022
00023 from __future__ import absolute_import, division, print_function, with_statement
00024
00025 import re
00026 import sys
00027
00028 from tornado.util import bytes_type, unicode_type, basestring_type, u
00029
00030 try:
00031 from urllib.parse import parse_qs as _parse_qs
00032 except ImportError:
00033 from urlparse import parse_qs as _parse_qs
00034
00035 try:
00036 import htmlentitydefs
00037 except ImportError:
00038 import html.entities as htmlentitydefs
00039
00040 try:
00041 import urllib.parse as urllib_parse
00042 except ImportError:
00043 import urllib as urllib_parse
00044
00045 import json
00046
# Python 3 removed the `unichr` builtin; alias it to `chr` (which handles
# full unicode on py3) so the rest of this module can use `unichr` on
# both Python 2 and Python 3.
try:
    unichr
except NameError:
    unichr = chr
00051
00052 _XHTML_ESCAPE_RE = re.compile('[&<>"\']')
00053 _XHTML_ESCAPE_DICT = {'&': '&', '<': '<', '>': '>', '"': '"',
00054 '\'': '''}
00055
00056
def xhtml_escape(value):
    """Escapes a string so it is valid within HTML or XML.

    Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
    When used in attribute values the escaped strings must be enclosed
    in quotes.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.
    """
    def _replace(match):
        # Look up the entity for the single matched character.
        return _XHTML_ESCAPE_DICT[match.group(0)]
    return _XHTML_ESCAPE_RE.sub(_replace, to_basestring(value))
00070
00071
def xhtml_unescape(value):
    """Un-escapes an XML-escaped string."""
    text = _unicode(value)
    # Matches both named (&amp;) and numeric (&#38;) entity references.
    return re.sub(r"&(#?)(\w+?);", _convert_entity, text)
00075
00076
00077
00078
00079
def json_encode(value):
    """JSON-encodes the given Python object."""
    # JSON permits but does not require forward slashes to be escaped.
    # Escaping "</" is useful when JSON is embedded inside a <script>
    # tag in HTML, since it prevents a "</script>" in the data from
    # prematurely terminating the script element.  Python's json module
    # does not do this by default, so it is done here.
    encoded = json.dumps(value)
    return encoded.replace("</", "<\\/")
00089
00090
def json_decode(value):
    """Returns Python objects for the given JSON string."""
    # Accept either bytes (decoded as utf-8) or a native/unicode string.
    text = to_basestring(value)
    return json.loads(text)
00094
00095
def squeeze(value):
    """Replace all sequences of whitespace chars with a single space."""
    # \x00-\x20 covers all ASCII control characters plus the space itself.
    collapsed = re.sub(r"[\x00-\x20]+", " ", value)
    return collapsed.strip()
00099
00100
def url_escape(value, plus=True):
    """Returns a URL-encoded version of the given value.

    If ``plus`` is true (the default), spaces will be represented
    as "+" instead of "%20".  This is appropriate for query strings
    but not for the path component of a URL.  Note that this default
    is the reverse of Python's urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    if plus:
        return urllib_parse.quote_plus(utf8(value))
    return urllib_parse.quote(utf8(value))
00114
00115
00116
00117
00118
if sys.version_info[0] < 3:
    def url_unescape(value, encoding='utf-8', plus=True):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.

        If ``plus`` is true (the default), plus signs will be interpreted
        as spaces (literal plus signs must be represented as "%2B").  This
        is appropriate for query strings and form-encoded values but not
        for the path component of a URL.  Note that this default is the
        reverse of Python's urllib module.

        .. versionadded:: 3.1
           The ``plus`` argument
        """
        unquote_fn = (urllib_parse.unquote_plus if plus
                      else urllib_parse.unquote)
        raw = unquote_fn(utf8(value))
        if encoding is None:
            return raw
        return unicode_type(raw, encoding)

    # On python 2 parse_qs already returns byte strings.
    parse_qs_bytes = _parse_qs
else:
    def url_unescape(value, encoding='utf-8', plus=True):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.

        If ``plus`` is true (the default), plus signs will be interpreted
        as spaces (literal plus signs must be represented as "%2B").  This
        is appropriate for query strings and form-encoded values but not
        for the path component of a URL.  Note that this default is the
        reverse of Python's urllib module.

        .. versionadded:: 3.1
           The ``plus`` argument
        """
        if encoding is None:
            text = to_basestring(value)
            if plus:
                # unquote_to_bytes doesn't have a _plus variant, and
                # internal quote characters must remain escaped until
                # unquoting, so translate '+' to space ourselves first.
                text = text.replace('+', ' ')
            return urllib_parse.unquote_to_bytes(text)
        unquote_fn = (urllib_parse.unquote_plus if plus
                      else urllib_parse.unquote)
        return unquote_fn(to_basestring(value), encoding=encoding)

    def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
        """Parses a query string like urlparse.parse_qs, but returns the
        values as byte strings.

        Keys still become type str (interpreted as latin1 in python3!)
        because it's too painful to keep them as byte strings in
        python3 and in practice they're nearly always ascii anyway.
        """
        # latin1 decoding round-trips every byte value, so re-encoding
        # the values below recovers the original bytes exactly.
        parsed = _parse_qs(qs, keep_blank_values, strict_parsing,
                           encoding='latin1', errors='strict')
        return dict((key, [item.encode('latin1') for item in values])
                    for key, values in parsed.items())
00188
00189
# Types that utf8() passes through unchanged.
_UTF8_TYPES = (bytes_type, type(None))


def utf8(value):
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    if isinstance(value, unicode_type):
        return value.encode("utf-8")
    raise TypeError(
        "Expected bytes, unicode, or None; got %r" % type(value)
    )
00206
# Types that to_unicode() passes through unchanged.
_TO_UNICODE_TYPES = (unicode_type, type(None))


def to_unicode(value):
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged.  Otherwise it must be a byte string and is decoded as utf8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if isinstance(value, bytes_type):
        return value.decode("utf-8")
    raise TypeError(
        "Expected bytes, unicode, or None; got %r" % type(value)
    )
00223
00224
00225
# Alias used internally by this module (e.g. xhtml_unescape, linkify).
_unicode = to_unicode


# native_str converts its argument to whichever type the platform's
# `str` is: unicode on python 3, bytes on python 2.
native_str = to_unicode if str is unicode_type else utf8
00234
# Types that to_basestring() passes through unchanged.
_BASESTRING_TYPES = (basestring_type, type(None))


def to_basestring(value):
    """Converts a string argument to a subclass of basestring.

    In python2, byte and unicode strings are mostly interchangeable,
    so functions that deal with a user-supplied argument in combination
    with ascii string constants can use either and should return the type
    the user supplied.  In python3, the two types are not interchangeable,
    so this method is needed to convert byte strings to unicode.
    """
    if isinstance(value, _BASESTRING_TYPES):
        return value
    if isinstance(value, bytes_type):
        return value.decode("utf-8")
    raise TypeError(
        "Expected bytes, unicode, or None; got %r" % type(value)
    )
00254
00255
def recursive_unicode(obj):
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries.
    """
    if isinstance(obj, bytes_type):
        return to_unicode(obj)
    if isinstance(obj, dict):
        return dict((recursive_unicode(k), recursive_unicode(v))
                    for (k, v) in obj.items())
    if isinstance(obj, list):
        return [recursive_unicode(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(recursive_unicode(item) for item in obj)
    # Any other type (int, None, unicode, ...) is returned untouched.
    return obj
00271
00272
00273
00274
00275
00276
00277
00278
# Matches URLs in text that has already been xhtml-escaped (linkify()
# escapes its input before applying this pattern), which is why the
# ampersand and double quote appear as the entities ``&amp;`` and
# ``&quot;`` in the alternations below.  A corrupted copy had these
# entity references collapsed to literal ``&`` and ``"``, which made the
# alternation branches redundant and broke matching against escaped text.
_URL_RE = re.compile(to_unicode(r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""))
00280
00281
def linkify(text, shorten=False, extra_params="",
            require_protocol=False, permitted_protocols=["http", "https"]):
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
        taking the link as an argument and returning the extra text
        e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
        or::

            def extra_params_cb(url):
                if url.startswith("http://example.com"):
                    return 'class="internal"'
                else:
                    return 'class="external" rel="nofollow"'
            linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
        this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
        linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
        "mailto"])``. It is very unsafe to include protocols such as
        ``javascript``.
    """
    # NOTE: the mutable default for permitted_protocols is safe here
    # because this function only reads it (membership test below).
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m):
        # Regex callback: m.group(1) is the whole URL, group(2) the
        # protocol (if any), group(3) the slashes after the colon.
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol present, leave text unchanged

        if proto and proto not in permitted_protocols:
            return url  # disallowed protocol, leave text unchanged

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no protocol specified, default to http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # Clip long urls for display; max_len is an approximation of the
        # target display length, not a hard limit.
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                # +1 accounts for the ':' between the protocol and the
                # slashes captured by group(3).
                proto_len = len(proto) + 1 + len(m.group(3) or "")
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path.
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = url[:proto_len] + parts[0] + "/" + \
                    parts[1][:8].split('?')[0].split('.')[0]

            if len(url) > max_len * 1.5:  # still too long, hard-truncate
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind('&')
                # Avoid splitting html char entities at the clip point.
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    # Shortening didn't help; show the original.
                    url = before_clip
                else:
                    # Full url is visible on mouse-over (for those who
                    # don't have a status bar, such as Safari by default).
                    params += ' title="%s"' % href

        return u('<a href="%s"%s>%s</a>') % (href, params, url)

    # First HTML-escape the text so the strings we emit are safe, then
    # linkify against the escaped form (which is what _URL_RE expects).
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
00376
00377
def _convert_entity(m):
    """Regex callback for xhtml_unescape: converts one entity match.

    Unknown entities are returned in their original escaped form.
    """
    body = m.group(2)
    if m.group(1) == "#":
        # Numeric character reference, e.g. &#65;
        try:
            return unichr(int(body))
        except ValueError:
            return "&#%s;" % body
    # Named entity, e.g. &amp;
    try:
        return _HTML_UNICODE_MAP[body]
    except KeyError:
        return "&%s;" % body
00388
00389
def _build_unicode_map():
    """Returns a dict mapping HTML entity names to unicode characters."""
    return dict((name, unichr(codepoint))
                for name, codepoint in htmlentitydefs.name2codepoint.items())

_HTML_UNICODE_MAP = _build_unicode_map()