00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 """Escaping/unescaping methods for HTML, JSON, URLs, and others.
00018
00019 Also includes a few other miscellaneous string manipulation functions that
00020 have crept in over time.
00021 """
00022
00023 from __future__ import absolute_import, division, with_statement
00024
00025 import htmlentitydefs
00026 import re
00027 import sys
00028 import urllib
00029
00030
# Compatibility shim: very old Pythons lack the ``bytes`` builtin; alias
# it to ``str`` there (on Python 2, str *is* the byte-string type).
try:
    bytes
except Exception:
    bytes = str
00035
# parse_qs lives in urlparse on newer Python 2 versions; fall back to the
# older cgi module location when that import is unavailable.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs
00040
00041
00042
# Locate a usable JSON implementation, in order of preference:
#   1. the standard-library json module (sanity-checked for loads/dumps,
#      since some environments ship a different module under that name),
#   2. the third-party simplejson package,
#   3. the copy of simplejson bundled with Django (presumably for
#      Google App Engine — TODO confirm),
# and otherwise install a stub that raises NotImplementedError on use.
try:
    import json
    assert hasattr(json, "loads") and hasattr(json, "dumps")
    _json_decode = json.loads
    _json_encode = json.dumps
except Exception:
    try:
        import simplejson
        # simplejson wants text input, so decode byte strings first.
        _json_decode = lambda s: simplejson.loads(_unicode(s))
        _json_encode = lambda v: simplejson.dumps(v)
    except ImportError:
        try:
            # e.g. available on Google App Engine
            from django.utils import simplejson
            _json_decode = lambda s: simplejson.loads(_unicode(s))
            _json_encode = lambda v: simplejson.dumps(v)
        except ImportError:
            def _json_decode(s):
                raise NotImplementedError(
                    "A JSON parser is required, e.g., simplejson at "
                    "http://pypi.python.org/pypi/simplejson/")
            # encoding is equally unsupported without a parser
            _json_encode = _json_decode
00065
00066
_XHTML_ESCAPE_RE = re.compile('[&<>"]')
# Map each special character to its XML character-entity reference.
# Bug fix: this dict previously mapped every character to itself
# (apparently an HTML-unescaping corruption), which made xhtml_escape
# a no-op and therefore an XSS hazard for any caller relying on it.
_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}


def xhtml_escape(value):
    """Escapes a string so it is valid within XML or XHTML.

    Replaces the characters ``&``, ``<``, ``>`` and ``"`` with their
    character-entity references; all other characters pass through.
    """
    return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
                                to_basestring(value))
00075
00076
def xhtml_unescape(value):
    """Un-escapes an XML-escaped string."""
    # Both named (&name;) and numeric (&#nn;) entities are handled by
    # the _convert_entity replacement callback.
    text = _unicode(value)
    return re.sub(r"&(#?)(\w+?);", _convert_entity, text)
00080
00081
def json_encode(value):
    """JSON-encodes the given Python object."""
    # Escape "</" in the output so the result can be embedded inside an
    # HTML <script> block without a literal "</script>" sequence ending
    # the script prematurely.  (JSON permits, but does not require,
    # forward slashes to be escaped.)
    encoded = _json_encode(recursive_unicode(value))
    return encoded.replace("</", "<\\/")
00091
00092
def json_decode(value):
    """Returns Python objects for the given JSON string."""
    # Accept either byte or unicode input; normalize before parsing.
    normalized = to_basestring(value)
    return _json_decode(normalized)
00096
00097
def squeeze(value):
    """Replace all sequences of whitespace chars with a single space."""
    # \x00-\x20 covers the ASCII control characters as well as the space
    # character itself.
    collapsed = re.sub(r"[\x00-\x20]+", " ", value)
    return collapsed.strip()
00101
00102
def url_escape(value):
    """Returns a valid URL-encoded version of the given value."""
    # Encode to utf-8 bytes first; quote_plus percent-escapes unsafe
    # characters and turns spaces into '+'.
    encoded = utf8(value)
    return urllib.quote_plus(encoded)
00106
00107
00108
00109
if sys.version_info[0] < 3:
    def url_unescape(value, encoding='utf-8'):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.
        """
        if encoding is None:
            return urllib.unquote_plus(utf8(value))
        else:
            return unicode(urllib.unquote_plus(utf8(value)), encoding)

    parse_qs_bytes = parse_qs
else:
    def url_unescape(value, encoding='utf-8'):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.
        """
        if encoding is None:
            # NOTE: unquote_to_bytes does not translate '+' to space.
            return urllib.parse.unquote_to_bytes(value)
        else:
            # Bug fix: on python 3 unquote_plus lives in urllib.parse;
            # the bare urllib module has no such attribute there.
            return urllib.parse.unquote_plus(to_basestring(value),
                                             encoding=encoding)

    def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
        """Parses a query string like urlparse.parse_qs, but returns the
        values as byte strings.

        Keys still become type str (interpreted as latin1 in python3!)
        because it's too painful to keep them as byte strings in
        python3 and in practice they're nearly always ascii anyway.
        """
        # Decode as latin1 ("the universal donor") so every byte survives
        # the str round-trip, then re-encode the values back to bytes.
        result = parse_qs(qs, keep_blank_values, strict_parsing,
                          encoding='latin1', errors='strict')
        encoded = {}
        # Bug fix: this branch only runs on python 3, where dicts have no
        # iteritems(); use items().
        for k, v in result.items():
            encoded[k] = [i.encode('latin1') for i in v]
        return encoded
00155
00156
_UTF8_TYPES = (bytes, type(None))


def utf8(value):
    """Converts a string argument to a byte string.

    Byte strings and None pass through unchanged; any other argument
    must be a unicode string and is encoded as utf8.
    """
    if not isinstance(value, _UTF8_TYPES):
        assert isinstance(value, unicode)
        return value.encode("utf-8")
    return value
00170
_TO_UNICODE_TYPES = (unicode, type(None))


def to_unicode(value):
    """Converts a string argument to a unicode string.

    Unicode strings and None pass through unchanged; any other argument
    must be a byte string and is decoded as utf8.
    """
    if not isinstance(value, _TO_UNICODE_TYPES):
        assert isinstance(value, bytes)
        return value.decode("utf-8")
    return value


# Historical internal alias for to_unicode, kept for existing callers.
_unicode = to_unicode
00188
00189
00190
# The "native" string type is unicode on python 3 (where str is unicode)
# and the byte string on python 2; pick the matching converter.
native_str = to_unicode if str is unicode else utf8
00195
_BASESTRING_TYPES = (basestring, type(None))


def to_basestring(value):
    """Converts a string argument to a subclass of basestring.

    In python2, byte and unicode strings are mostly interchangeable,
    so functions that deal with a user-supplied argument in combination
    with ascii string constants can use either and should return the type
    the user supplied.  In python3, the two types are not interchangeable,
    so this method is needed to convert byte strings to unicode.
    """
    if not isinstance(value, _BASESTRING_TYPES):
        assert isinstance(value, bytes)
        return value.decode("utf-8")
    return value
00212
00213
def recursive_unicode(obj):
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries; any other value is
    returned unchanged.
    """
    if isinstance(obj, bytes):
        return to_unicode(obj)
    if isinstance(obj, dict):
        return dict((recursive_unicode(key), recursive_unicode(val))
                    for (key, val) in obj.iteritems())
    if isinstance(obj, list):
        return [recursive_unicode(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(recursive_unicode(item) for item in obj)
    return obj
00229
00230
00231
00232
00233
00234
00235 _URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&|")*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&|")*\)))+)""")
00236
00237
def linkify(text, shorten=False, extra_params="",
            require_protocol=False, permitted_protocols=["http", "https"]):
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    shorten: Long urls will be shortened for display.

    extra_params: Extra text to include in the link tag, or a callable
        taking the link as an argument and returning the extra text
        e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
        or::

            def extra_params_cb(url):
                if url.startswith("http://example.com"):
                    return 'class="internal"'
                else:
                    return 'class="external" rel="nofollow"'
            linkify(text, extra_params=extra_params_cb)

    require_protocol: Only linkify urls which include a protocol. If this is
        False, urls such as www.facebook.com will also be linkified.

    permitted_protocols: List (or set) of protocols which should be linkified,
        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
        It is very unsafe to include protocols such as "javascript".
    """
    # NOTE(review): permitted_protocols uses a mutable default list; it is
    # only read here, never mutated, so this is safe — but a tuple default
    # would be more defensive.
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m):
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol present: leave the text unlinked

        if proto and proto not in permitted_protocols:
            return url  # disallowed protocol: leave the text unlinked

        href = m.group(1)
        if not proto:
            href = "http://" + href  # bare www.* match: default to http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # Clip long urls for display only (href keeps the full url);
        # max_len is just an approximation.
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                # length of "proto:" plus the captured slashes, if any
                proto_len = len(proto) + 1 + len(m.group(3) or "")
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Keep the whole host part plus the first bit of the path.
                # The shortened path mostly just signals that clipping
                # happened; stop it at the first '?' or '.'.
                url = url[:proto_len] + parts[0] + "/" + \
                    parts[1][:8].split('?')[0].split('.')[0]

            if len(url) > max_len * 1.5:  # still too long: hard clip
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind('&')
                # avoid splitting an html character entity mid-way
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                # If the ellipsis made it no shorter, show the original.
                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # Full url stays visible on mouse-over for browsers
                    # without a status bar.
                    params += ' title="%s"' % href

        return u'<a href="%s"%s>%s</a>' % (href, params, url)

    # HTML-escape first so every string we emit is safe; _URL_RE then
    # runs over the escaped text.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
00331
00332
def _convert_entity(m):
    """re.sub callback: converts one matched HTML entity to its character.

    Unrecognized entities are re-emitted in their original escaped form.
    """
    entity = m.group(2)
    if m.group(1) == "#":
        # numeric entity, e.g. &#65;
        try:
            return unichr(int(entity))
        except ValueError:
            return "&#%s;" % entity
    # named entity, e.g. &amp;
    try:
        return _HTML_UNICODE_MAP[entity]
    except KeyError:
        return "&%s;" % entity
00343
00344
def _build_unicode_map():
    """Builds the named-entity -> unicode character lookup table."""
    return dict((name, unichr(codepoint))
                for name, codepoint
                in htmlentitydefs.name2codepoint.iteritems())

_HTML_UNICODE_MAP = _build_unicode_map()