httpclient.py
Go to the documentation of this file.
00001 """Blocking and non-blocking HTTP client interfaces.
00002 
00003 This module defines a common interface shared by two implementations,
00004 `simple_httpclient` and `curl_httpclient`.  Applications may either
00005 instantiate their chosen implementation class directly or use the
00006 `AsyncHTTPClient` class from this module, which selects an implementation
00007 that can be overridden with the `AsyncHTTPClient.configure` method.
00008 
00009 The default implementation is `simple_httpclient`, and this is expected
00010 to be suitable for most users' needs.  However, some applications may wish
00011 to switch to `curl_httpclient` for reasons such as the following:
00012 
00013 * `curl_httpclient` has some features not found in `simple_httpclient`,
00014   including support for HTTP proxies and the ability to use a specified
00015   network interface.
00016 
00017 * `curl_httpclient` is more likely to be compatible with sites that are
00018   not-quite-compliant with the HTTP spec, or sites that use little-exercised
00019   features of HTTP.
00020 
00021 * `simple_httpclient` only supports SSL on Python 2.6 and above.
00022 
00023 * `curl_httpclient` is faster
00024 
00025 * `curl_httpclient` was the default prior to Tornado 2.0.
00026 
00027 Note that if you are using `curl_httpclient`, it is highly recommended that
00028 you use a recent version of ``libcurl`` and ``pycurl``.  Currently the minimum
00029 supported version is 7.18.2, and the recommended version is 7.21.1 or newer.
00030 """
00031 
00032 from __future__ import absolute_import, division, with_statement
00033 
00034 import calendar
00035 import email.utils
00036 import httplib
00037 import time
00038 import weakref
00039 
00040 from tornado.escape import utf8
00041 from tornado import httputil
00042 from tornado.ioloop import IOLoop
00043 from tornado.util import import_object, bytes_type
00044 
00045 
class HTTPClient(object):
    """A blocking HTTP client.

    This interface is provided for convenience and testing; most applications
    that are running an IOLoop will want to use `AsyncHTTPClient` instead.
    Typical usage looks like this::

        http_client = httpclient.HTTPClient()
        try:
            response = http_client.fetch("http://www.google.com/")
            print response.body
        except httpclient.HTTPError, e:
            print "Error:", e

    Each instance creates its own `IOLoop` and an asynchronous client bound
    to that loop; `fetch` runs the loop until the response callback stops it.
    """
    # Class-level default: an instance whose __init__ did not run to
    # completion counts as already closed.  Without this, __del__ -> close()
    # would raise AttributeError on ``self._closed`` whenever the constructor
    # failed part-way (for example when the async client could not be built).
    _closed = True

    def __init__(self, async_client_class=None, **kwargs):
        """Creates the private IOLoop and the async client that runs on it.

        :arg async_client_class: class used to build the underlying
           asynchronous client; defaults to `AsyncHTTPClient`.  It is called
           as ``async_client_class(io_loop, **kwargs)``.
        :arg kwargs: passed through unchanged to ``async_client_class``.
        """
        self._io_loop = IOLoop()
        if async_client_class is None:
            async_client_class = AsyncHTTPClient
        self._async_client = async_client_class(self._io_loop, **kwargs)
        self._response = None
        # Only from this point on is there anything for close() to release.
        self._closed = False

    def __del__(self):
        # Safe on partially constructed objects thanks to the class-level
        # ``_closed`` default.
        self.close()

    def close(self):
        """Closes the HTTPClient, freeing any resources used.

        Calling it more than once is harmless: only the first call closes
        the async client and the IOLoop.
        """
        if not self._closed:
            self._async_client.close()
            self._io_loop.close()
            self._closed = True

    def fetch(self, request, **kwargs):
        """Executes a request, returning an `HTTPResponse`.

        The request may be either a string URL or an `HTTPRequest` object.
        If it is a string, we construct an `HTTPRequest` using any additional
        kwargs: ``HTTPRequest(request, **kwargs)``

        If an error occurs during the fetch, we raise an `HTTPError`.
        """
        def callback(response):
            # Stash the result and stop the loop so start() below returns.
            self._response = response
            self._io_loop.stop()
        self._async_client.fetch(request, callback, **kwargs)
        self._io_loop.start()
        response = self._response
        # Drop our reference so the response is not kept alive by the client.
        self._response = None
        response.rethrow()
        return response
00096 
00097 
class AsyncHTTPClient(object):
    """A non-blocking HTTP client.

    Example usage::

        import ioloop

        def handle_request(response):
            if response.error:
                print "Error:", response.error
            else:
                print response.body
            ioloop.IOLoop.instance().stop()

        http_client = httpclient.AsyncHTTPClient()
        http_client.fetch("http://www.google.com/", handle_request)
        ioloop.IOLoop.instance().start()

    The constructor for this class is magic in several respects:  It actually
    creates an instance of an implementation-specific subclass, and instances
    are reused as a kind of pseudo-singleton (one per IOLoop).  The keyword
    argument force_instance=True can be used to suppress this singleton
    behavior.  Constructor arguments other than io_loop and force_instance
    are deprecated.  The implementation subclass as well as arguments to
    its constructor can be set with the static method configure()
    """
    # Implementation class and constructor kwargs chosen via configure().
    _impl_class = None
    _impl_kwargs = None

    _DEFAULT_MAX_CLIENTS = 10

    @classmethod
    def _async_clients(cls):
        """Returns the per-class cache mapping IOLoop -> client instance.

        The cache is a `weakref.WeakKeyDictionary` stored on the
        implementation class itself and created on first use.
        """
        assert cls is not AsyncHTTPClient, "should only be called on subclasses"
        # Check cls.__dict__ instead of hasattr(): hasattr() also sees a
        # cache created on a parent implementation class, which would make a
        # subclass share its parent's cache and hand back parent instances.
        if '_async_client_dict' not in cls.__dict__:
            cls._async_client_dict = weakref.WeakKeyDictionary()
        return cls._async_client_dict

    def __new__(cls, io_loop=None, max_clients=None, force_instance=False,
                **kwargs):
        """Returns the cached client for ``io_loop`` or builds a new one.

        :arg io_loop: loop to bind to; falls back to ``IOLoop.instance()``.
        :arg max_clients: forwarded to ``initialize``; when neither given
           here nor configured, ``_DEFAULT_MAX_CLIENTS`` is used.
        :arg bool force_instance: when true, always build a fresh instance
           and do not store it in the per-IOLoop cache.
        :arg kwargs: extra keyword arguments for ``initialize``; they
           override those recorded by `configure`.
        """
        io_loop = io_loop or IOLoop.instance()
        if cls is AsyncHTTPClient:
            if cls._impl_class is None:
                # Imported lazily, only when the default is actually needed.
                from tornado.simple_httpclient import SimpleAsyncHTTPClient
                AsyncHTTPClient._impl_class = SimpleAsyncHTTPClient
            impl = AsyncHTTPClient._impl_class
        else:
            impl = cls
        if io_loop in impl._async_clients() and not force_instance:
            return impl._async_clients()[io_loop]
        else:
            instance = super(AsyncHTTPClient, cls).__new__(impl)
            args = {}
            if cls._impl_kwargs:
                args.update(cls._impl_kwargs)
            args.update(kwargs)
            if max_clients is not None:
                # max_clients is special because it may be passed
                # positionally instead of by keyword
                args["max_clients"] = max_clients
            elif "max_clients" not in args:
                args["max_clients"] = AsyncHTTPClient._DEFAULT_MAX_CLIENTS
            # Implementations set themselves up in initialize().
            instance.initialize(io_loop, **args)
            if not force_instance:
                impl._async_clients()[io_loop] = instance
            return instance

    def close(self):
        """Destroys this http client, freeing any file descriptors used.
        Not needed in normal use, but may be helpful in unittests that
        create and destroy http clients.  No other methods may be called
        on the AsyncHTTPClient after close().
        """
        # Only evict the cache entry if it is this very instance; a
        # force_instance client must not remove the shared one.
        if self._async_clients().get(self.io_loop) is self:
            del self._async_clients()[self.io_loop]

    def fetch(self, request, callback, **kwargs):
        """Executes a request, calling callback with an `HTTPResponse`.

        The request may be either a string URL or an `HTTPRequest` object.
        If it is a string, we construct an `HTTPRequest` using any additional
        kwargs: ``HTTPRequest(request, **kwargs)``

        If an error occurs during the fetch, the HTTPResponse given to the
        callback has a non-None error attribute that contains the exception
        encountered during the request. You can call response.rethrow() to
        throw the exception (if any) in the callback.

        This base implementation always raises `NotImplementedError`;
        implementation subclasses override it.
        """
        raise NotImplementedError()

    @staticmethod
    def configure(impl, **kwargs):
        """Configures the AsyncHTTPClient subclass to use.

        AsyncHTTPClient() actually creates an instance of a subclass.
        This method may be called with either a class object or the
        fully-qualified name of such a class (or None to use the default,
        SimpleAsyncHTTPClient)

        If additional keyword arguments are given, they will be passed
        to the constructor of each subclass instance created.  The
        keyword argument max_clients determines the maximum number of
        simultaneous fetch() operations that can execute in parallel
        on each IOLoop.  Additional arguments may be supported depending
        on the implementation class in use.

        Raises `ValueError` if ``impl`` is a class that does not derive
        from `AsyncHTTPClient`.

        Example::

           AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
        """
        if isinstance(impl, (unicode, bytes_type)):
            impl = import_object(impl)
        if impl is not None and not issubclass(impl, AsyncHTTPClient):
            raise ValueError("Invalid AsyncHTTPClient implementation")
        AsyncHTTPClient._impl_class = impl
        AsyncHTTPClient._impl_kwargs = kwargs

    @staticmethod
    def _save_configuration():
        """Returns the current ``(impl_class, impl_kwargs)`` pair."""
        return (AsyncHTTPClient._impl_class, AsyncHTTPClient._impl_kwargs)

    @staticmethod
    def _restore_configuration(saved):
        """Reinstates a pair previously returned by `_save_configuration`."""
        AsyncHTTPClient._impl_class = saved[0]
        AsyncHTTPClient._impl_kwargs = saved[1]
00223 
00224 
class HTTPRequest(object):
    """HTTP client request object."""
    def __init__(self, url, method="GET", headers=None, body=None,
                 auth_username=None, auth_password=None,
                 connect_timeout=20.0, request_timeout=20.0,
                 if_modified_since=None, follow_redirects=True,
                 max_redirects=5, user_agent=None, use_gzip=True,
                 network_interface=None, streaming_callback=None,
                 header_callback=None, prepare_curl_callback=None,
                 proxy_host=None, proxy_port=None, proxy_username=None,
                 proxy_password='', allow_nonstandard_methods=False,
                 validate_cert=True, ca_certs=None,
                 allow_ipv6=None,
                 client_key=None, client_cert=None):
        """Creates an `HTTPRequest`.

        All parameters except `url` are optional.

        :arg string url: URL to fetch
        :arg string method: HTTP method, e.g. "GET" or "POST"
        :arg headers: Additional HTTP headers to pass on the request.  When
           omitted, an empty `~tornado.httputil.HTTPHeaders` is created.
        :type headers: `~tornado.httputil.HTTPHeaders` or `dict`
        :arg body: Request body; it is passed through
           `tornado.escape.utf8` before being stored
        :arg string auth_username: Username for HTTP "Basic" authentication
        :arg string auth_password: Password for HTTP "Basic" authentication
        :arg float connect_timeout: Timeout for initial connection in seconds
        :arg float request_timeout: Timeout for entire request in seconds
        :arg datetime if_modified_since: Timestamp for ``If-Modified-Since``
           header; when given, the header is written into `headers`
        :arg bool follow_redirects: Should redirects be followed automatically
           or return the 3xx response?
        :arg int max_redirects: Limit for `follow_redirects`
        :arg string user_agent: String to send as ``User-Agent`` header
        :arg bool use_gzip: Request gzip encoding from the server
        :arg string network_interface: Network interface to use for request
        :arg callable streaming_callback: If set, `streaming_callback` will
           be run with each chunk of data as it is received, and
           `~HTTPResponse.body` and `~HTTPResponse.buffer` will be empty in
           the final response.
        :arg callable header_callback: If set, `header_callback` will
           be run with each header line as it is received, and
           `~HTTPResponse.headers` will be empty in the final response.
        :arg callable prepare_curl_callback: If set, will be called with
           a `pycurl.Curl` object to allow the application to make additional
           `setopt` calls.
        :arg string proxy_host: HTTP proxy hostname.  To use proxies,
           `proxy_host` and `proxy_port` must be set; `proxy_username` and
           `proxy_password` are optional.  Proxies are currently only
           supported with `curl_httpclient`.
        :arg int proxy_port: HTTP proxy port
        :arg string proxy_username: HTTP proxy username
        :arg string proxy_password: HTTP proxy password
        :arg bool allow_nonstandard_methods: Allow unknown values for `method`
           argument?
        :arg bool validate_cert: For HTTPS requests, validate the server's
           certificate?
        :arg string ca_certs: filename of CA certificates in PEM format,
           or None to use defaults.  Note that in `curl_httpclient`, if
           any request uses a custom `ca_certs` file, they all must (they
           don't have to all use the same `ca_certs`, but it's not possible
           to mix requests with ca_certs and requests that use the defaults).
        :arg bool allow_ipv6: Use IPv6 when available?  Default is false in
           `simple_httpclient` and true in `curl_httpclient`
        :arg string client_key: Filename for client SSL key, if any
        :arg string client_cert: Filename for client SSL certificate, if any
        """
        headers = httputil.HTTPHeaders() if headers is None else headers
        if if_modified_since:
            # Convert the datetime's UTC time tuple to epoch seconds, then
            # render it as an HTTP date ("GMT" suffix).
            epoch_seconds = calendar.timegm(if_modified_since.utctimetuple())
            headers["If-Modified-Since"] = email.utils.formatdate(
                epoch_seconds, localtime=False, usegmt=True)

        # What to request.
        self.url = url
        self.method = method
        self.headers = headers
        self.body = utf8(body)
        self.user_agent = user_agent
        self.use_gzip = use_gzip
        self.allow_nonstandard_methods = allow_nonstandard_methods

        # Credentials.
        self.auth_username = auth_username
        self.auth_password = auth_password

        # Timeouts and redirect policy.
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        self.follow_redirects = follow_redirects
        self.max_redirects = max_redirects

        # Network selection and proxy settings.
        self.network_interface = network_interface
        self.allow_ipv6 = allow_ipv6
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_username = proxy_username
        self.proxy_password = proxy_password

        # TLS options.
        self.validate_cert = validate_cert
        self.ca_certs = ca_certs
        self.client_key = client_key
        self.client_cert = client_cert

        # Callbacks.
        self.streaming_callback = streaming_callback
        self.header_callback = header_callback
        self.prepare_curl_callback = prepare_curl_callback

        # Recorded last: the moment this request object was created.
        self.start_time = time.time()
00323 
00324 
class HTTPResponse(object):
    """HTTP Response object.

    Attributes:

    * request: HTTPRequest object

    * code: numeric HTTP status code, e.g. 200 or 404

    * headers: httputil.HTTPHeaders object

    * buffer: cStringIO object for response body

    * body: response body as string (created on demand from self.buffer)

    * effective_url: the ``effective_url`` argument, or ``request.url``
      when that argument is None

    * error: Exception object, if any

    * request_time: seconds from request start to finish

    * time_info: dictionary of diagnostic timing information from the request.
        Available data are subject to change, but currently uses timings
        available from http://curl.haxx.se/libcurl/c/curl_easy_getinfo.html,
        plus 'queue', which is the delay (if any) introduced by waiting for
        a slot under AsyncHTTPClient's max_clients setting.
    """
    def __init__(self, request, code, headers=None, buffer=None,
                 effective_url=None, error=None, request_time=None,
                 time_info=None):
        """Stores the response fields and derives ``error`` when not given.

        If ``error`` is None and ``code`` is outside the 2xx range, an
        `HTTPError` carrying this response is created and stored as
        ``self.error``.
        """
        self.request = request
        self.code = code
        if headers is not None:
            self.headers = headers
        else:
            self.headers = httputil.HTTPHeaders()
        self.buffer = buffer
        # Cache for the ``body`` property; filled on first access.
        self._body = None
        if effective_url is None:
            self.effective_url = request.url
        else:
            self.effective_url = effective_url
        if error is None:
            if self.code < 200 or self.code >= 300:
                self.error = HTTPError(self.code, response=self)
            else:
                self.error = None
        else:
            self.error = error
        self.request_time = request_time
        self.time_info = time_info or {}

    def _get_body(self):
        """Returns ``buffer.getvalue()`` (cached), or None without a buffer."""
        if self.buffer is None:
            return None
        elif self._body is None:
            self._body = self.buffer.getvalue()

        return self._body

    body = property(_get_body)

    def rethrow(self):
        """If there was an error on the request, raise an `HTTPError`."""
        if self.error:
            raise self.error

    def __repr__(self):
        # items() exists on both Python 2 and Python 3 (iteritems() is
        # Python 2 only); sorting by attribute name makes the text
        # deterministic instead of following dict ordering.
        args = ",".join("%s=%r" % item
                        for item in sorted(self.__dict__.items()))
        return "%s(%s)" % (self.__class__.__name__, args)
00393 
00394 
class HTTPError(Exception):
    """Exception thrown for an unsuccessful HTTP request.

    Attributes:

    code - HTTP error integer error code, e.g. 404.  Error code 599 is
           used when no HTTP response was received, e.g. for a timeout.

    response - HTTPResponse object, if any.

    The exception text has the form ``"HTTP <code>: <message>"``.

    Note that if follow_redirects is False, redirects become HTTPErrors,
    and you can look at error.response.headers['Location'] to see the
    destination of the redirect.
    """
    def __init__(self, code, message=None, response=None):
        self.code = code
        self.response = response
        if not message:
            # No (or empty) message supplied: fall back to the standard
            # reason phrase for the code, or "Unknown" if there is none.
            message = httplib.responses.get(code, "Unknown")
        Exception.__init__(self, "HTTP %d: %s" % (code, message))
00414 
00415 
00416 def main():
00417     from tornado.options import define, options, parse_command_line
00418     define("print_headers", type=bool, default=False)
00419     define("print_body", type=bool, default=True)
00420     define("follow_redirects", type=bool, default=True)
00421     define("validate_cert", type=bool, default=True)
00422     args = parse_command_line()
00423     client = HTTPClient()
00424     for arg in args:
00425         try:
00426             response = client.fetch(arg,
00427                                     follow_redirects=options.follow_redirects,
00428                                     validate_cert=options.validate_cert,
00429                                     )
00430         except HTTPError, e:
00431             if e.response is not None:
00432                 response = e.response
00433             else:
00434                 raise
00435         if options.print_headers:
00436             print response.headers
00437         if options.print_body:
00438             print response.body
00439     client.close()
00440 
00441 if __name__ == "__main__":
00442     main()


roswww
Author(s): Jonathan Mace
autogenerated on Thu Jan 2 2014 11:53:30