00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 """Escaping/unescaping methods for HTML, JSON, URLs, and others.
00018
00019 Also includes a few other miscellaneous string manipulation functions that
00020 have crept in over time.
00021 """
00022
00023 from __future__ import absolute_import, division, with_statement
00024
00025 import htmlentitydefs
00026 import re
00027 import sys
00028 import urllib
00029
00030
# Compatibility shim: very old Pythons lack the ``bytes`` builtin; alias
# it to ``str`` there (on Python 2, str *is* the byte-string type).
try:
    bytes
except Exception:
    bytes = str
00035
# parse_qs lives in urlparse on newer Python 2 versions; fall back to the
# older cgi module location when that import is unavailable.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs
00040
00041
00042
# Locate a usable JSON implementation, in order of preference:
#   1. the standard-library json module (sanity-checked for loads/dumps,
#      since some environments ship a different module under that name),
#   2. the third-party simplejson package,
#   3. the copy of simplejson bundled with Django (presumably for
#      Google App Engine — TODO confirm),
# and otherwise install a stub that raises NotImplementedError on use.
try:
    import json
    assert hasattr(json, "loads") and hasattr(json, "dumps")
    _json_decode = json.loads
    _json_encode = json.dumps
except Exception:
    try:
        import simplejson
        # simplejson wants text input, so decode byte strings first.
        _json_decode = lambda s: simplejson.loads(_unicode(s))
        _json_encode = lambda v: simplejson.dumps(v)
    except ImportError:
        try:
            # e.g. available on Google App Engine
            from django.utils import simplejson
            _json_decode = lambda s: simplejson.loads(_unicode(s))
            _json_encode = lambda v: simplejson.dumps(v)
        except ImportError:
            def _json_decode(s):
                raise NotImplementedError(
                    "A JSON parser is required, e.g., simplejson at "
                    "http://pypi.python.org/pypi/simplejson/")
            # encoding is equally unsupported without a parser
            _json_encode = _json_decode
00065
00066
_XHTML_ESCAPE_RE = re.compile('[&<>"]')
# Map each special character to its XML character-entity reference.
# Bug fix: this dict previously mapped every character to itself
# (apparently an HTML-unescaping corruption), which made xhtml_escape
# a no-op and therefore an XSS hazard for any caller relying on it.
_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}


def xhtml_escape(value):
    """Escapes a string so it is valid within XML or XHTML.

    Replaces the characters ``&``, ``<``, ``>`` and ``"`` with their
    character-entity references; all other characters pass through.
    """
    return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
                                to_basestring(value))
00075
00076
def xhtml_unescape(value):
    """Un-escapes an XML-escaped string."""
    # Both named (&name;) and numeric (&#nn;) entities are handled by
    # the _convert_entity replacement callback.
    text = _unicode(value)
    return re.sub(r"&(#?)(\w+?);", _convert_entity, text)
00080
00081
def json_encode(value):
    """JSON-encodes the given Python object."""
    # Escape "</" in the output so the result can be embedded inside an
    # HTML <script> block without a literal "</script>" sequence ending
    # the script prematurely.  (JSON permits, but does not require,
    # forward slashes to be escaped.)
    encoded = _json_encode(recursive_unicode(value))
    return encoded.replace("</", "<\\/")
00091
00092
def json_decode(value):
    """Returns Python objects for the given JSON string."""
    # Accept either byte or unicode input; normalize before parsing.
    normalized = to_basestring(value)
    return _json_decode(normalized)
00096
00097
def squeeze(value):
    """Replace all sequences of whitespace chars with a single space."""
    # \x00-\x20 covers the ASCII control characters as well as the space
    # character itself.
    collapsed = re.sub(r"[\x00-\x20]+", " ", value)
    return collapsed.strip()
00101
00102
def url_escape(value):
    """Returns a valid URL-encoded version of the given value."""
    # Encode to utf-8 bytes first; quote_plus percent-escapes unsafe
    # characters and turns spaces into '+'.
    encoded = utf8(value)
    return urllib.quote_plus(encoded)
00106
00107
00108
00109
if sys.version_info[0] < 3:
    def url_unescape(value, encoding='utf-8'):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.
        """
        if encoding is None:
            return urllib.unquote_plus(utf8(value))
        else:
            return unicode(urllib.unquote_plus(utf8(value)), encoding)

    parse_qs_bytes = parse_qs
else:
    def url_unescape(value, encoding='utf-8'):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.
        """
        if encoding is None:
            # NOTE: unquote_to_bytes does not translate '+' to space.
            return urllib.parse.unquote_to_bytes(value)
        else:
            # Bug fix: on python 3 unquote_plus lives in urllib.parse;
            # the bare urllib module has no such attribute there.
            return urllib.parse.unquote_plus(to_basestring(value),
                                             encoding=encoding)

    def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
        """Parses a query string like urlparse.parse_qs, but returns the
        values as byte strings.

        Keys still become type str (interpreted as latin1 in python3!)
        because it's too painful to keep them as byte strings in
        python3 and in practice they're nearly always ascii anyway.
        """
        # Decode as latin1 ("the universal donor") so every byte survives
        # the str round-trip, then re-encode the values back to bytes.
        result = parse_qs(qs, keep_blank_values, strict_parsing,
                          encoding='latin1', errors='strict')
        encoded = {}
        # Bug fix: this branch only runs on python 3, where dicts have no
        # iteritems(); use items().
        for k, v in result.items():
            encoded[k] = [i.encode('latin1') for i in v]
        return encoded
00155
00156
_UTF8_TYPES = (bytes, type(None))


def utf8(value):
    """Converts a string argument to a byte string.

    Byte strings and None pass through unchanged; any other argument
    must be a unicode string and is encoded as utf8.
    """
    if not isinstance(value, _UTF8_TYPES):
        assert isinstance(value, unicode)
        return value.encode("utf-8")
    return value
00170
_TO_UNICODE_TYPES = (unicode, type(None))


def to_unicode(value):
    """Converts a string argument to a unicode string.

    Unicode strings and None pass through unchanged; any other argument
    must be a byte string and is decoded as utf8.
    """
    if not isinstance(value, _TO_UNICODE_TYPES):
        assert isinstance(value, bytes)
        return value.decode("utf-8")
    return value


# Historical internal alias for to_unicode, kept for existing callers.
_unicode = to_unicode
00188
00189
00190
# The "native" string type is unicode on python 3 (where str is unicode)
# and the byte string on python 2; pick the matching converter.
native_str = to_unicode if str is unicode else utf8
00195
_BASESTRING_TYPES = (basestring, type(None))


def to_basestring(value):
    """Converts a string argument to a subclass of basestring.

    In python2, byte and unicode strings are mostly interchangeable,
    so functions that deal with a user-supplied argument in combination
    with ascii string constants can use either and should return the type
    the user supplied.  In python3, the two types are not interchangeable,
    so this method is needed to convert byte strings to unicode.
    """
    if not isinstance(value, _BASESTRING_TYPES):
        assert isinstance(value, bytes)
        return value.decode("utf-8")
    return value
00212
00213
def recursive_unicode(obj):
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries; any other value is
    returned unchanged.
    """
    if isinstance(obj, bytes):
        return to_unicode(obj)
    if isinstance(obj, dict):
        return dict((recursive_unicode(key), recursive_unicode(val))
                    for (key, val) in obj.iteritems())
    if isinstance(obj, list):
        return [recursive_unicode(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(recursive_unicode(item) for item in obj)
    return obj
00229
00230
00231
00232
00233
00234
00235 _URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&|")*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&|")*\)))+)""")
00236
00237
def linkify(text, shorten=False, extra_params="",
            require_protocol=False, permitted_protocols=["http", "https"]):
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    shorten: Long urls will be shortened for display.

    extra_params: Extra text to include in the link tag, or a callable
        taking the link as an argument and returning the extra text
        e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
        or::

            def extra_params_cb(url):
                if url.startswith("http://example.com"):
                    return 'class="internal"'
                else:
                    return 'class="external" rel="nofollow"'
            linkify(text, extra_params=extra_params_cb)

    require_protocol: Only linkify urls which include a protocol. If this is
        False, urls such as www.facebook.com will also be linkified.

    permitted_protocols: List (or set) of protocols which should be linkified,
        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
        It is very unsafe to include protocols such as "javascript".
    """
    # NOTE(review): permitted_protocols uses a mutable default list; it is
    # only read here, never mutated, so this is safe — but a tuple default
    # would be more defensive.
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m):
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol present: leave the text unlinked

        if proto and proto not in permitted_protocols:
            return url  # disallowed protocol: leave the text unlinked

        href = m.group(1)
        if not proto:
            href = "http://" + href  # bare www.* match: default to http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # Clip long urls for display only (href keeps the full url);
        # max_len is just an approximation.
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                # length of "proto:" plus the captured slashes, if any
                proto_len = len(proto) + 1 + len(m.group(3) or "")
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Keep the whole host part plus the first bit of the path.
                # The shortened path mostly just signals that clipping
                # happened; stop it at the first '?' or '.'.
                url = url[:proto_len] + parts[0] + "/" + \
                    parts[1][:8].split('?')[0].split('.')[0]

            if len(url) > max_len * 1.5:  # still too long: hard clip
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind('&')
                # avoid splitting an html character entity mid-way
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                # If the ellipsis made it no shorter, show the original.
                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # Full url stays visible on mouse-over for browsers
                    # without a status bar.
                    params += ' title="%s"' % href

        return u'<a href="%s"%s>%s</a>' % (href, params, url)

    # HTML-escape first so every string we emit is safe; _URL_RE then
    # runs over the escaped text.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
00331
00332
def _convert_entity(m):
    """re.sub callback: converts one matched HTML entity to its character.

    Unrecognized entities are re-emitted in their original escaped form.
    """
    entity = m.group(2)
    if m.group(1) == "#":
        # numeric entity, e.g. &#65;
        try:
            return unichr(int(entity))
        except ValueError:
            return "&#%s;" % entity
    # named entity, e.g. &amp;
    try:
        return _HTML_UNICODE_MAP[entity]
    except KeyError:
        return "&%s;" % entity
00343
00344
def _build_unicode_map():
    """Builds the named-entity -> unicode character lookup table."""
    return dict((name, unichr(codepoint))
                for name, codepoint
                in htmlentitydefs.name2codepoint.iteritems())

_HTML_UNICODE_MAP = _build_unicode_map()