roswww: httputil.py Source File

Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Copyright 2009 Facebook
00004 #
00005 # Licensed under the Apache License, Version 2.0 (the "License"); you may
00006 # not use this file except in compliance with the License. You may obtain
00007 # a copy of the License at
00008 #
00009 #     http://www.apache.org/licenses/LICENSE-2.0
00010 #
00011 # Unless required by applicable law or agreed to in writing, software
00012 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
00013 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
00014 # License for the specific language governing permissions and limitations
00015 # under the License.
00016 
00017 """HTTP utility code shared by clients and servers."""
00018 
00019 from __future__ import absolute_import, division, with_statement
00020 
00021 import logging
00022 import urllib
00023 import re
00024 
00025 from tornado.util import b, ObjectDict
00026 
00027 
class HTTPHeaders(dict):
    """A dictionary that maintains Http-Header-Case for all keys.

    Supports multiple values per key via a pair of new methods,
    add() and get_list().  The regular dictionary interface returns a single
    value per key, with multiple values joined by a comma.

    Two parallel stores are kept in sync: the dict itself (name -> joined
    string) and ``_as_list`` (name -> list of individual values).

    >>> h = HTTPHeaders({"content-type": "text/html"})
    >>> list(h.keys())
    ['Content-Type']
    >>> h["Content-Type"]
    'text/html'

    >>> h.add("Set-Cookie", "A=B")
    >>> h.add("Set-Cookie", "C=D")
    >>> h["set-cookie"]
    'A=B,C=D'
    >>> h.get_list("set-cookie")
    ['A=B', 'C=D']

    >>> for (k, v) in sorted(h.get_all()):
    ...    print('%s: %s' % (k, v))
    ...
    Content-Type: text/html
    Set-Cookie: A=B
    Set-Cookie: C=D
    """
    def __init__(self, *args, **kwargs):
        # Don't pass args or kwargs to dict.__init__, as it will bypass
        # our __setitem__
        dict.__init__(self)
        # normalized name -> list of the individual values for that header
        self._as_list = {}
        # normalized name most recently passed to add(); parse_line() appends
        # continuation lines to this header
        self._last_key = None
        if (len(args) == 1 and len(kwargs) == 0 and
            isinstance(args[0], HTTPHeaders)):
            # Copy constructor: go through get_all() so that repeated
            # headers stay separate values instead of one joined string.
            for k, v in args[0].get_all():
                self.add(k, v)
        else:
            # Dict-style initialization
            self.update(*args, **kwargs)

    # new public methods

    def add(self, name, value):
        """Adds a new value for the given key.

        If the header already exists the value is appended: the dict entry
        becomes the old string plus ``','`` plus ``value``, and the value
        list gains one element.
        """
        norm_name = HTTPHeaders._normalize_name(name)
        self._last_key = norm_name
        if norm_name in self:
            # bypass our override of __setitem__ since it modifies _as_list
            dict.__setitem__(self, norm_name, self[norm_name] + ',' + value)
            self._as_list[norm_name].append(value)
        else:
            self[norm_name] = value

    def get_list(self, name):
        """Returns all values for the given header as a list.

        An empty list is returned for a header that is not present.
        """
        norm_name = HTTPHeaders._normalize_name(name)
        return self._as_list.get(norm_name, [])

    def get_all(self):
        """Returns an iterable of all (name, value) pairs.

        If a header has multiple values, multiple pairs will be
        returned with the same name.
        """
        # items() rather than iteritems() so this runs on Python 2 and 3;
        # ``values`` avoids shadowing the ``list`` builtin.
        for name, values in self._as_list.items():
            for value in values:
                yield (name, value)

    def parse_line(self, line):
        """Updates the dictionary with a single header line.

        A line starting with whitespace is a continuation: its stripped
        text, preceded by one space, is appended to the last value of the
        header most recently passed to add().  Any other line is split at
        the first ``':'`` into name and value.

        Raises IndexError for an empty line, KeyError for a continuation
        line when no header has been added yet, and ValueError for a
        non-continuation line without a colon.

        >>> h = HTTPHeaders()
        >>> h.parse_line("Content-Type: text/html")
        >>> h.get('content-type')
        'text/html'
        """
        if line[0].isspace():
            # continuation of a multi-line header
            new_part = ' ' + line.lstrip()
            self._as_list[self._last_key][-1] += new_part
            dict.__setitem__(self, self._last_key,
                             self[self._last_key] + new_part)
        else:
            name, value = line.split(":", 1)
            self.add(name, value.strip())

    @classmethod
    def parse(cls, headers):
        """Returns a dictionary from HTTP header text.

        Empty lines are skipped; every other line goes to parse_line().

        >>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
        >>> sorted(h.items())
        [('Content-Length', '42'), ('Content-Type', 'text/html')]
        """
        h = cls()
        for line in headers.splitlines():
            if line:
                h.parse_line(line)
        return h

    # dict implementation overrides

    def __setitem__(self, name, value):
        # Assignment replaces every existing value of the header.
        norm_name = HTTPHeaders._normalize_name(name)
        dict.__setitem__(self, norm_name, value)
        self._as_list[norm_name] = [value]

    def __getitem__(self, name):
        return dict.__getitem__(self, HTTPHeaders._normalize_name(name))

    def __delitem__(self, name):
        norm_name = HTTPHeaders._normalize_name(name)
        dict.__delitem__(self, norm_name)
        del self._as_list[norm_name]

    def __contains__(self, name):
        norm_name = HTTPHeaders._normalize_name(name)
        return dict.__contains__(self, norm_name)

    def get(self, name, default=None):
        return dict.get(self, HTTPHeaders._normalize_name(name), default)

    def update(self, *args, **kwargs):
        # dict.update bypasses our __setitem__
        for k, v in dict(*args, **kwargs).items():
            self[k] = v

    def copy(self):
        # default implementation returns dict(self), not the subclass
        return HTTPHeaders(self)

    # Matches names that are already in Http-Header-Case.
    _NORMALIZED_HEADER_RE = re.compile(r'^[A-Z0-9][a-z0-9]*(-[A-Z0-9][a-z0-9]*)*$')
    # Class-wide cache: name as given -> normalized name.
    _normalized_headers = {}
    # Upper bound on the number of cached spellings (see _normalize_name).
    _MAX_NORMALIZED_HEADERS = 1000

    @staticmethod
    def _normalize_name(name):
        """Converts a name to Http-Header-Case.

        Results are memoized in the class-level ``_normalized_headers``
        dict.  The cache is emptied whenever it reaches
        ``_MAX_NORMALIZED_HEADERS`` entries, so it cannot grow without limit.

        >>> HTTPHeaders._normalize_name("coNtent-TYPE")
        'Content-Type'
        """
        cache = HTTPHeaders._normalized_headers
        try:
            return cache[name]
        except KeyError:
            if HTTPHeaders._NORMALIZED_HEADER_RE.match(name):
                normalized = name
            else:
                normalized = "-".join([w.capitalize() for w in name.split("-")])
            # Names reach this method from parsed header lines, so the
            # number of distinct spellings is controlled by the input.
            # Dropping the cache when full keeps memory bounded; the
            # entries are cheap to recompute.
            if len(cache) >= HTTPHeaders._MAX_NORMALIZED_HEADERS:
                cache.clear()
            cache[name] = normalized
            return normalized
00180 
00181 
def url_concat(url, args):
    """Concatenate url and argument dictionary regardless of whether
    url has existing query parameters.

    ``args`` is passed to ``urlencode``.  When it is empty or None the url
    is returned unchanged.  Otherwise the encoded arguments are appended,
    preceded by ``'?'`` or ``'&'`` unless the url already ends with one of
    those characters.  An empty url yields just the query string.

    >>> url_concat("http://example.com/foo?a=b", dict(c="d"))
    'http://example.com/foo?a=b&c=d'
    >>> url_concat("", dict(c="d"))
    '?c=d'
    """
    # Resolved here so the function works whether urlencode lives in
    # ``urllib`` (Python 2) or in ``urllib.parse`` (Python 3).
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode
    if not args:
        return url
    # endswith() with a tuple is safe on an empty url, unlike url[-1].
    if not url.endswith(('?', '&')):
        url += '&' if ('?' in url) else '?'
    return url + urlencode(args)
00194 
00195 
class HTTPFile(ObjectDict):
    """A file uploaded as part of an HTTP request.

    Every instance attribute can also be read as a dictionary key; the
    dictionary form is kept for backwards compatibility.

    :ivar filename:
    :ivar body:
    :ivar content_type: Taken from the HTTP header supplied with the file.
        It can be easily forged, so it should not be trusted outright.
    """
00206 
00207 
def parse_multipart_form_data(boundary, data, arguments, files):
    """Decode a multipart/form-data body into form fields and uploads.

    ``boundary`` and ``data`` are byte strings.  Nothing is returned; the
    ``arguments`` and ``files`` dictionaries are updated in place.  Each
    maps a field name to a list: raw values in ``arguments``, HTTPFile
    objects in ``files`` (used when the part carries a filename).

    A body without a final boundary is rejected as a whole with a logged
    warning.  A malformed individual part is logged and skipped.
    """
    # The standard allows for the boundary to be quoted in the header,
    # although it's rare (it happens at least for google app engine
    # xmpp).  Backslash-escapes inside the quotes are not handled.
    quote = b('"')
    if boundary.startswith(quote) and boundary.endswith(quote):
        boundary = boundary[1:-1]

    dashes = b("--")
    crlf = b("\r\n")

    closing_at = data.rfind(dashes + boundary + dashes)
    if closing_at == -1:
        logging.warning("Invalid multipart/form-data: no final boundary")
        return

    # Everything before the closing delimiter, cut at each part delimiter.
    for chunk in data[:closing_at].split(dashes + boundary + crlf):
        if not chunk:
            continue

        # A blank line separates the part's headers from its content.
        header_end = chunk.find(b("\r\n\r\n"))
        if header_end == -1:
            logging.warning("multipart/form-data missing headers")
            continue

        part_headers = HTTPHeaders.parse(chunk[:header_end].decode("utf-8"))
        disposition, disp_params = _parse_header(
            part_headers.get("Content-Disposition", ""))
        if not (disposition == "form-data" and chunk.endswith(crlf)):
            logging.warning("Invalid multipart/form-data")
            continue

        field_name = disp_params.get("name")
        if not field_name:
            logging.warning("multipart/form-data value missing name")
            continue

        # Skip the 4-byte blank-line separator and drop the trailing CRLF.
        payload = chunk[header_end + 4:-2]

        upload_name = disp_params.get("filename")
        if not upload_name:
            arguments.setdefault(field_name, []).append(payload)
            continue

        ctype = part_headers.get("Content-Type", "application/unknown")
        files.setdefault(field_name, []).append(HTTPFile(
            filename=upload_name, body=payload,
            content_type=ctype))
00252 
00253 
00254 # _parseparam and _parse_header are copied and modified from python2.7's cgi.py
00255 # The original 2.7 version of this code did not correctly support some
00256 # combinations of semicolons and double quotes.
def _parseparam(s):
    """Yield the stripped fields of a ``;``-prefixed parameter string.

    ``s`` is consumed as a sequence of ``;field`` items.  A semicolon only
    ends a field when the number of double quotes before it, not counting
    backslash-escaped ones, is even; otherwise it is treated as part of a
    quoted value.  Iteration stops as soon as the remaining text does not
    start with ``';'``.
    """
    while s[:1] == ';':
        s = s[1:]
        cut = s.find(';')
        while cut > 0:
            open_quotes = s.count('"', 0, cut) - s.count('\\"', 0, cut)
            if open_quotes % 2 == 0:
                # Balanced quotes: this semicolon really ends the field.
                break
            cut = s.find(';', cut + 1)
        if cut < 0:
            # No terminating semicolon: the field runs to the end.
            cut = len(s)
        yield s[:cut].strip()
        s = s[cut:]
00268 
00269 
def _parse_header(line):
    """Parse a Content-type like header.

    Return the main content-type and a dictionary of options.

    Option names are stripped and lower-cased.  An option value wrapped in
    double quotes has the quotes removed and its ``\\\\`` and ``\\"``
    escapes undone.  Fields without an ``=`` are ignored, and a repeated
    option name keeps its last value.
    """
    # The leading ';' makes _parseparam yield the main value as its first
    # field, so the list always has at least one element.  Indexing a list
    # avoids the Python-2-only generator method ``.next()``.
    fields = list(_parseparam(';' + line))
    key = fields[0]
    pdict = {}
    for p in fields[1:]:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i + 1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
                value = value.replace('\\\\', '\\').replace('\\"', '"')
            pdict[name] = value
    return key, pdict
00289 
00290 
def doctests():
    """Return the doctest suite built by ``doctest.DocTestSuite()``."""
    from doctest import DocTestSuite
    # Called with no arguments and directly from this function, exactly as
    # before, so doctest resolves the same module.
    return DocTestSuite()