rosbridge_server: escape.py Source File

Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Copyright 2009 Facebook
00004 #
00005 # Licensed under the Apache License, Version 2.0 (the "License"); you may
00006 # not use this file except in compliance with the License. You may obtain
00007 # a copy of the License at
00008 #
00009 #     http://www.apache.org/licenses/LICENSE-2.0
00010 #
00011 # Unless required by applicable law or agreed to in writing, software
00012 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
00013 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
00014 # License for the specific language governing permissions and limitations
00015 # under the License.
00016 
00017 """Escaping/unescaping methods for HTML, JSON, URLs, and others.
00018 
00019 Also includes a few other miscellaneous string manipulation functions that
00020 have crept in over time.
00021 """
00022 
00023 from __future__ import absolute_import, division, print_function, with_statement
00024 
00025 import re
00026 import sys
00027 
00028 from tornado.util import bytes_type, unicode_type, basestring_type, u
00029 
00030 try:
00031     from urllib.parse import parse_qs as _parse_qs  # py3
00032 except ImportError:
00033     from urlparse import parse_qs as _parse_qs  # Python 2.6+
00034 
00035 try:
00036     import htmlentitydefs  # py2
00037 except ImportError:
00038     import html.entities as htmlentitydefs  # py3
00039 
00040 try:
00041     import urllib.parse as urllib_parse  # py3
00042 except ImportError:
00043     import urllib as urllib_parse  # py2
00044 
00045 import json
00046 
00047 try:
00048     unichr
00049 except NameError:
00050     unichr = chr
00051 
00052 _XHTML_ESCAPE_RE = re.compile('[&<>"\']')
00053 _XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;',
00054                       '\'': '&#39;'}
00055 
00056 
00057 def xhtml_escape(value):
00058     """Escapes a string so it is valid within HTML or XML.
00059 
00060     Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
00061     When used in attribute values the escaped strings must be enclosed
00062     in quotes.
00063 
00064     .. versionchanged:: 3.2
00065 
00066        Added the single quote to the list of escaped characters.
00067     """
00068     return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
00069                                 to_basestring(value))
00070 
00071 
00072 def xhtml_unescape(value):
00073     """Un-escapes an XML-escaped string."""
00074     return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
00075 
00076 
00077 # The fact that json_encode wraps json.dumps is an implementation detail.
00078 # Please see https://github.com/tornadoweb/tornado/pull/706
00079 # before sending a pull request that adds **kwargs to this function.
00080 def json_encode(value):
00081     """JSON-encodes the given Python object."""
00082     # JSON permits but does not require forward slashes to be escaped.
00083     # This is useful when json data is emitted in a <script> tag
00084     # in HTML, as it prevents </script> tags from prematurely terminating
00085     # the javscript.  Some json libraries do this escaping by default,
00086     # although python's standard library does not, so we do it here.
00087     # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
00088     return json.dumps(value).replace("</", "<\\/")
00089 
00090 
00091 def json_decode(value):
00092     """Returns Python objects for the given JSON string."""
00093     return json.loads(to_basestring(value))
00094 
00095 
00096 def squeeze(value):
00097     """Replace all sequences of whitespace chars with a single space."""
00098     return re.sub(r"[\x00-\x20]+", " ", value).strip()
00099 
00100 
00101 def url_escape(value, plus=True):
00102     """Returns a URL-encoded version of the given value.
00103 
00104     If ``plus`` is true (the default), spaces will be represented
00105     as "+" instead of "%20".  This is appropriate for query strings
00106     but not for the path component of a URL.  Note that this default
00107     is the reverse of Python's urllib module.
00108 
00109     .. versionadded:: 3.1
00110         The ``plus`` argument
00111     """
00112     quote = urllib_parse.quote_plus if plus else urllib_parse.quote
00113     return quote(utf8(value))
00114 
00115 
00116 # python 3 changed things around enough that we need two separate
00117 # implementations of url_unescape.  We also need our own implementation
00118 # of parse_qs since python 3's version insists on decoding everything.
00119 if sys.version_info[0] < 3:
00120     def url_unescape(value, encoding='utf-8', plus=True):
00121         """Decodes the given value from a URL.
00122 
00123         The argument may be either a byte or unicode string.
00124 
00125         If encoding is None, the result will be a byte string.  Otherwise,
00126         the result is a unicode string in the specified encoding.
00127 
00128         If ``plus`` is true (the default), plus signs will be interpreted
00129         as spaces (literal plus signs must be represented as "%2B").  This
00130         is appropriate for query strings and form-encoded values but not
00131         for the path component of a URL.  Note that this default is the
00132         reverse of Python's urllib module.
00133 
00134         .. versionadded:: 3.1
00135            The ``plus`` argument
00136         """
00137         unquote = (urllib_parse.unquote_plus if plus else urllib_parse.unquote)
00138         if encoding is None:
00139             return unquote(utf8(value))
00140         else:
00141             return unicode_type(unquote(utf8(value)), encoding)
00142 
00143     parse_qs_bytes = _parse_qs
00144 else:
00145     def url_unescape(value, encoding='utf-8', plus=True):
00146         """Decodes the given value from a URL.
00147 
00148         The argument may be either a byte or unicode string.
00149 
00150         If encoding is None, the result will be a byte string.  Otherwise,
00151         the result is a unicode string in the specified encoding.
00152 
00153         If ``plus`` is true (the default), plus signs will be interpreted
00154         as spaces (literal plus signs must be represented as "%2B").  This
00155         is appropriate for query strings and form-encoded values but not
00156         for the path component of a URL.  Note that this default is the
00157         reverse of Python's urllib module.
00158 
00159         .. versionadded:: 3.1
00160            The ``plus`` argument
00161         """
00162         if encoding is None:
00163             if plus:
00164                 # unquote_to_bytes doesn't have a _plus variant
00165                 value = to_basestring(value).replace('+', ' ')
00166             return urllib_parse.unquote_to_bytes(value)
00167         else:
00168             unquote = (urllib_parse.unquote_plus if plus
00169                        else urllib_parse.unquote)
00170             return unquote(to_basestring(value), encoding=encoding)
00171 
00172     def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
00173         """Parses a query string like urlparse.parse_qs, but returns the
00174         values as byte strings.
00175 
00176         Keys still become type str (interpreted as latin1 in python3!)
00177         because it's too painful to keep them as byte strings in
00178         python3 and in practice they're nearly always ascii anyway.
00179         """
00180         # This is gross, but python3 doesn't give us another way.
00181         # Latin1 is the universal donor of character encodings.
00182         result = _parse_qs(qs, keep_blank_values, strict_parsing,
00183                            encoding='latin1', errors='strict')
00184         encoded = {}
00185         for k, v in result.items():
00186             encoded[k] = [i.encode('latin1') for i in v]
00187         return encoded
00188 
00189 
00190 _UTF8_TYPES = (bytes_type, type(None))
00191 
00192 
00193 def utf8(value):
00194     """Converts a string argument to a byte string.
00195 
00196     If the argument is already a byte string or None, it is returned unchanged.
00197     Otherwise it must be a unicode string and is encoded as utf8.
00198     """
00199     if isinstance(value, _UTF8_TYPES):
00200         return value
00201     if not isinstance(value, unicode_type):
00202         raise TypeError(
00203             "Expected bytes, unicode, or None; got %r" % type(value)
00204         )
00205     return value.encode("utf-8")
00206 
00207 _TO_UNICODE_TYPES = (unicode_type, type(None))
00208 
00209 
00210 def to_unicode(value):
00211     """Converts a string argument to a unicode string.
00212 
00213     If the argument is already a unicode string or None, it is returned
00214     unchanged.  Otherwise it must be a byte string and is decoded as utf8.
00215     """
00216     if isinstance(value, _TO_UNICODE_TYPES):
00217         return value
00218     if not isinstance(value, bytes_type):
00219         raise TypeError(
00220             "Expected bytes, unicode, or None; got %r" % type(value)
00221         )
00222     return value.decode("utf-8")
00223 
00224 # to_unicode was previously named _unicode not because it was private,
00225 # but to avoid conflicts with the built-in unicode() function/type
00226 _unicode = to_unicode
00227 
00228 # When dealing with the standard library across python 2 and 3 it is
00229 # sometimes useful to have a direct conversion to the native string type
00230 if str is unicode_type:
00231     native_str = to_unicode
00232 else:
00233     native_str = utf8
00234 
00235 _BASESTRING_TYPES = (basestring_type, type(None))
00236 
00237 
00238 def to_basestring(value):
00239     """Converts a string argument to a subclass of basestring.
00240 
00241     In python2, byte and unicode strings are mostly interchangeable,
00242     so functions that deal with a user-supplied argument in combination
00243     with ascii string constants can use either and should return the type
00244     the user supplied.  In python3, the two types are not interchangeable,
00245     so this method is needed to convert byte strings to unicode.
00246     """
00247     if isinstance(value, _BASESTRING_TYPES):
00248         return value
00249     if not isinstance(value, bytes_type):
00250         raise TypeError(
00251             "Expected bytes, unicode, or None; got %r" % type(value)
00252         )
00253     return value.decode("utf-8")
00254 
00255 
00256 def recursive_unicode(obj):
00257     """Walks a simple data structure, converting byte strings to unicode.
00258 
00259     Supports lists, tuples, and dictionaries.
00260     """
00261     if isinstance(obj, dict):
00262         return dict((recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items())
00263     elif isinstance(obj, list):
00264         return list(recursive_unicode(i) for i in obj)
00265     elif isinstance(obj, tuple):
00266         return tuple(recursive_unicode(i) for i in obj)
00267     elif isinstance(obj, bytes_type):
00268         return to_unicode(obj)
00269     else:
00270         return obj
00271 
00272 # I originally used the regex from
00273 # http://daringfireball.net/2010/07/improved_regex_for_matching_urls
00274 # but it gets all exponential on certain patterns (such as too many trailing
00275 # dots), causing the regex matcher to never return.
00276 # This regex should avoid those problems.
00277 # Use to_unicode instead of tornado.util.u - we don't want backslashes getting
00278 # processed as escapes.
00279 _URL_RE = re.compile(to_unicode(r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""))
00280 
00281 
00282 def linkify(text, shorten=False, extra_params="",
00283             require_protocol=False, permitted_protocols=["http", "https"]):
00284     """Converts plain text into HTML with links.
00285 
00286     For example: ``linkify("Hello http://tornadoweb.org!")`` would return
00287     ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``
00288 
00289     Parameters:
00290 
00291     * ``shorten``: Long urls will be shortened for display.
00292 
00293     * ``extra_params``: Extra text to include in the link tag, or a callable
00294         taking the link as an argument and returning the extra text
00295         e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
00296         or::
00297 
00298             def extra_params_cb(url):
00299                 if url.startswith("http://example.com"):
00300                     return 'class="internal"'
00301                 else:
00302                     return 'class="external" rel="nofollow"'
00303             linkify(text, extra_params=extra_params_cb)
00304 
00305     * ``require_protocol``: Only linkify urls which include a protocol. If
00306         this is False, urls such as www.facebook.com will also be linkified.
00307 
00308     * ``permitted_protocols``: List (or set) of protocols which should be
00309         linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
00310         "mailto"])``. It is very unsafe to include protocols such as
00311         ``javascript``.
00312     """
00313     if extra_params and not callable(extra_params):
00314         extra_params = " " + extra_params.strip()
00315 
00316     def make_link(m):
00317         url = m.group(1)
00318         proto = m.group(2)
00319         if require_protocol and not proto:
00320             return url  # not protocol, no linkify
00321 
00322         if proto and proto not in permitted_protocols:
00323             return url  # bad protocol, no linkify
00324 
00325         href = m.group(1)
00326         if not proto:
00327             href = "http://" + href   # no proto specified, use http
00328 
00329         if callable(extra_params):
00330             params = " " + extra_params(href).strip()
00331         else:
00332             params = extra_params
00333 
00334         # clip long urls. max_len is just an approximation
00335         max_len = 30
00336         if shorten and len(url) > max_len:
00337             before_clip = url
00338             if proto:
00339                 proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
00340             else:
00341                 proto_len = 0
00342 
00343             parts = url[proto_len:].split("/")
00344             if len(parts) > 1:
00345                 # Grab the whole host part plus the first bit of the path
00346                 # The path is usually not that interesting once shortened
00347                 # (no more slug, etc), so it really just provides a little
00348                 # extra indication of shortening.
00349                 url = url[:proto_len] + parts[0] + "/" + \
00350                     parts[1][:8].split('?')[0].split('.')[0]
00351 
00352             if len(url) > max_len * 1.5:  # still too long
00353                 url = url[:max_len]
00354 
00355             if url != before_clip:
00356                 amp = url.rfind('&')
00357                 # avoid splitting html char entities
00358                 if amp > max_len - 5:
00359                     url = url[:amp]
00360                 url += "..."
00361 
00362                 if len(url) >= len(before_clip):
00363                     url = before_clip
00364                 else:
00365                     # full url is visible on mouse-over (for those who don't
00366                     # have a status bar, such as Safari by default)
00367                     params += ' title="%s"' % href
00368 
00369         return u('<a href="%s"%s>%s</a>') % (href, params, url)
00370 
00371     # First HTML-escape so that our strings are all safe.
00372     # The regex is modified to avoid character entites other than &amp; so
00373     # that we won't pick up &quot;, etc.
00374     text = _unicode(xhtml_escape(text))
00375     return _URL_RE.sub(make_link, text)
00376 
00377 
00378 def _convert_entity(m):
00379     if m.group(1) == "#":
00380         try:
00381             return unichr(int(m.group(2)))
00382         except ValueError:
00383             return "&#%s;" % m.group(2)
00384     try:
00385         return _HTML_UNICODE_MAP[m.group(2)]
00386     except KeyError:
00387         return "&%s;" % m.group(2)
00388 
00389 
00390 def _build_unicode_map():
00391     unicode_map = {}
00392     for name, value in htmlentitydefs.name2codepoint.items():
00393         unicode_map[name] = unichr(value)
00394     return unicode_map
00395 
00396 _HTML_UNICODE_MAP = _build_unicode_map()