00001 """Blocking and non-blocking HTTP client interfaces.
00002
00003 This module defines a common interface shared by two implementations,
00004 `simple_httpclient` and `curl_httpclient`. Applications may either
00005 instantiate their chosen implementation class directly or use the
00006 `AsyncHTTPClient` class from this module, which selects an implementation
00007 that can be overridden with the `AsyncHTTPClient.configure` method.
00008
00009 The default implementation is `simple_httpclient`, and this is expected
00010 to be suitable for most users' needs. However, some applications may wish
00011 to switch to `curl_httpclient` for reasons such as the following:
00012
00013 * `curl_httpclient` has some features not found in `simple_httpclient`,
00014 including support for HTTP proxies and the ability to use a specified
00015 network interface.
00016
00017 * `curl_httpclient` is more likely to be compatible with sites that are
00018 not-quite-compliant with the HTTP spec, or sites that use little-exercised
00019 features of HTTP.
00020
00021 * `simple_httpclient` only supports SSL on Python 2.6 and above.
00022
00023 * `curl_httpclient` is faster.
00024
00025 * `curl_httpclient` was the default prior to Tornado 2.0.
00026
00027 Note that if you are using `curl_httpclient`, it is highly recommended that
00028 you use a recent version of ``libcurl`` and ``pycurl``. Currently the minimum
00029 supported version is 7.18.2, and the recommended version is 7.21.1 or newer.
00030 """
00031
00032 from __future__ import absolute_import, division, with_statement
00033
00034 import calendar
00035 import email.utils
00036 import httplib
00037 import time
00038 import weakref
00039
00040 from tornado.escape import utf8
00041 from tornado import httputil
00042 from tornado.ioloop import IOLoop
00043 from tornado.util import import_object, bytes_type
00044
00045
class HTTPClient(object):
    """A blocking HTTP client.

    This interface is provided for convenience and testing; most applications
    that are running an IOLoop will want to use `AsyncHTTPClient` instead.
    Typical usage looks like this::

        http_client = httpclient.HTTPClient()
        try:
            response = http_client.fetch("http://www.google.com/")
            print response.body
        except httpclient.HTTPError, e:
            print "Error:", e

    Each instance owns a private `IOLoop` and an asynchronous client bound
    to it; `fetch` runs that loop until the response callback fires.
    """
    def __init__(self, async_client_class=None, **kwargs):
        """Creates the private IOLoop and the underlying async client.

        :arg async_client_class: class used to build the underlying client;
            defaults to `AsyncHTTPClient`.
        :arg kwargs: passed through to ``async_client_class`` after the
            IOLoop argument.
        """
        # Start out "closed" so that, if anything below raises, the
        # __del__ -> close() path finds the attribute and does nothing
        # instead of failing with AttributeError on a half-built object.
        self._closed = True
        self._response = None
        self._io_loop = IOLoop()
        if async_client_class is None:
            async_client_class = AsyncHTTPClient
        self._async_client = async_client_class(self._io_loop, **kwargs)
        # Construction succeeded: there are now resources for close() to free.
        self._closed = False

    def __del__(self):
        self.close()

    def close(self):
        """Closes the HTTPClient, freeing any resources used.

        Safe to call more than once; only the first call on a fully
        constructed client has any effect.
        """
        if not self._closed:
            self._async_client.close()
            self._io_loop.close()
            self._closed = True

    def fetch(self, request, **kwargs):
        """Executes a request, returning an `HTTPResponse`.

        The request may be either a string URL or an `HTTPRequest` object.
        If it is a string, we construct an `HTTPRequest` using any additional
        kwargs: ``HTTPRequest(request, **kwargs)``

        If an error occurs during the fetch, we raise an `HTTPError`.
        """
        def callback(response):
            # Stash the result and stop the private loop so that the
            # start() call below returns.
            self._response = response
            self._io_loop.stop()
        self._async_client.fetch(request, callback, **kwargs)
        self._io_loop.start()
        response = self._response
        # Drop the reference so the response is not kept alive by the client.
        self._response = None
        response.rethrow()
        return response
00096
00097
class AsyncHTTPClient(object):
    """A non-blocking HTTP client.

    Example usage::

        import ioloop

        def handle_request(response):
            if response.error:
                print "Error:", response.error
            else:
                print response.body
            ioloop.IOLoop.instance().stop()

        http_client = httpclient.AsyncHTTPClient()
        http_client.fetch("http://www.google.com/", handle_request)
        ioloop.IOLoop.instance().start()

    The constructor for this class is magic in several respects:  It actually
    creates an instance of an implementation-specific subclass, and instances
    are reused as a kind of pseudo-singleton (one per IOLoop).  The keyword
    argument force_instance=True can be used to suppress this singleton
    behavior.  Constructor arguments other than io_loop and force_instance
    are deprecated.  The implementation subclass as well as arguments to
    its constructor can be set with the static method configure()
    """
    # Implementation class and constructor kwargs chosen via configure().
    _impl_class = None
    _impl_kwargs = None

    # Used for max_clients when neither the caller nor configure() sets it.
    _DEFAULT_MAX_CLIENTS = 10

    @classmethod
    def _async_clients(cls):
        """Returns the per-class cache mapping IOLoop -> client instance.

        The cache is a `weakref.WeakKeyDictionary` created on first use.
        """
        assert cls is not AsyncHTTPClient, "should only be called on subclasses"
        # Look in cls.__dict__ rather than using hasattr(): hasattr() also
        # finds a cache inherited from a parent implementation class, which
        # would make a subclass share (and return) its parent's instances.
        if '_async_client_dict' not in cls.__dict__:
            cls._async_client_dict = weakref.WeakKeyDictionary()
        return cls._async_client_dict

    def __new__(cls, io_loop=None, max_clients=None, force_instance=False,
                **kwargs):
        """Returns the client for ``io_loop``, creating it if necessary.

        :arg io_loop: IOLoop to bind to; defaults to ``IOLoop.instance()``.
        :arg max_clients: overrides any configured ``max_clients`` value.
        :arg force_instance: if true, always build a new instance and do
            not record it in the per-IOLoop cache.
        :arg kwargs: extra arguments for the implementation's
            ``initialize`` method; they override configured values.
        """
        io_loop = io_loop or IOLoop.instance()
        if cls is AsyncHTTPClient:
            if cls._impl_class is None:
                # Imported lazily, at first use rather than at module import.
                from tornado.simple_httpclient import SimpleAsyncHTTPClient
                AsyncHTTPClient._impl_class = SimpleAsyncHTTPClient
            impl = AsyncHTTPClient._impl_class
        else:
            impl = cls
        if io_loop in impl._async_clients() and not force_instance:
            return impl._async_clients()[io_loop]
        else:
            instance = super(AsyncHTTPClient, cls).__new__(impl)
            # Precedence, lowest to highest: configure() kwargs, explicit
            # kwargs, explicit max_clients, then the class default.
            args = {}
            if cls._impl_kwargs:
                args.update(cls._impl_kwargs)
            args.update(kwargs)
            if max_clients is not None:
                args["max_clients"] = max_clients
            elif "max_clients" not in args:
                args["max_clients"] = AsyncHTTPClient._DEFAULT_MAX_CLIENTS
            instance.initialize(io_loop, **args)
            if not force_instance:
                impl._async_clients()[io_loop] = instance
            return instance

    def close(self):
        """Destroys this http client, freeing any file descriptors used.
        Not needed in normal use, but may be helpful in unittests that
        create and destroy http clients.  No other methods may be called
        on the AsyncHTTPClient after close().
        """
        # Only evict the cache entry if it is this very instance; a
        # force_instance client is never stored in the cache.
        if self._async_clients().get(self.io_loop) is self:
            del self._async_clients()[self.io_loop]

    def fetch(self, request, callback, **kwargs):
        """Executes a request, calling callback with an `HTTPResponse`.

        The request may be either a string URL or an `HTTPRequest` object.
        If it is a string, we construct an `HTTPRequest` using any additional
        kwargs: ``HTTPRequest(request, **kwargs)``

        If an error occurs during the fetch, the HTTPResponse given to the
        callback has a non-None error attribute that contains the exception
        encountered during the request. You can call response.rethrow() to
        throw the exception (if any) in the callback.

        This base implementation always raises `NotImplementedError`;
        implementation subclasses provide the real behavior.
        """
        raise NotImplementedError()

    @staticmethod
    def configure(impl, **kwargs):
        """Configures the AsyncHTTPClient subclass to use.

        AsyncHTTPClient() actually creates an instance of a subclass.
        This method may be called with either a class object or the
        fully-qualified name of such a class (or None to use the default,
        SimpleAsyncHTTPClient)

        If additional keyword arguments are given, they will be passed
        to the constructor of each subclass instance created.  The
        keyword argument max_clients determines the maximum number of
        simultaneous fetch() operations that can execute in parallel
        on each IOLoop.  Additional arguments may be supported depending
        on the implementation class in use.

        Raises `ValueError` if ``impl`` is not None and not a subclass of
        `AsyncHTTPClient`.

        Example::

           AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
        """
        # A string names the class; resolve it to the class object first.
        if isinstance(impl, (unicode, bytes_type)):
            impl = import_object(impl)
        if impl is not None and not issubclass(impl, AsyncHTTPClient):
            raise ValueError("Invalid AsyncHTTPClient implementation")
        AsyncHTTPClient._impl_class = impl
        AsyncHTTPClient._impl_kwargs = kwargs

    @staticmethod
    def _save_configuration():
        """Returns the current ``(impl_class, impl_kwargs)`` pair."""
        return (AsyncHTTPClient._impl_class, AsyncHTTPClient._impl_kwargs)

    @staticmethod
    def _restore_configuration(saved):
        """Reinstates a pair previously returned by `_save_configuration`."""
        AsyncHTTPClient._impl_class = saved[0]
        AsyncHTTPClient._impl_kwargs = saved[1]
00223
00224
class HTTPRequest(object):
    """HTTP client request object."""
    def __init__(self, url, method="GET", headers=None, body=None,
                 auth_username=None, auth_password=None,
                 connect_timeout=20.0, request_timeout=20.0,
                 if_modified_since=None, follow_redirects=True,
                 max_redirects=5, user_agent=None, use_gzip=True,
                 network_interface=None, streaming_callback=None,
                 header_callback=None, prepare_curl_callback=None,
                 proxy_host=None, proxy_port=None, proxy_username=None,
                 proxy_password='', allow_nonstandard_methods=False,
                 validate_cert=True, ca_certs=None,
                 allow_ipv6=None,
                 client_key=None, client_cert=None):
        """Creates an `HTTPRequest`.

        All parameters except `url` are optional.

        :arg string url: URL to fetch
        :arg string method: HTTP method, e.g. "GET" or "POST"
        :arg headers: Additional HTTP headers to pass on the request
        :type headers: `~tornado.httputil.HTTPHeaders` or `dict`
        :arg body: HTTP request body; stored after being passed through
           `tornado.escape.utf8`
        :arg string auth_username: Username for HTTP "Basic" authentication
        :arg string auth_password: Password for HTTP "Basic" authentication
        :arg float connect_timeout: Timeout for initial connection in seconds
        :arg float request_timeout: Timeout for entire request in seconds
        :arg datetime if_modified_since: Timestamp for ``If-Modified-Since``
           header
        :arg bool follow_redirects: Should redirects be followed automatically
           or return the 3xx response?
        :arg int max_redirects: Limit for `follow_redirects`
        :arg string user_agent: String to send as ``User-Agent`` header
        :arg bool use_gzip: Request gzip encoding from the server
        :arg string network_interface: Network interface to use for request
        :arg callable streaming_callback: If set, `streaming_callback` will
           be run with each chunk of data as it is received, and
           `~HTTPResponse.body` and `~HTTPResponse.buffer` will be empty in
           the final response.
        :arg callable header_callback: If set, `header_callback` will
           be run with each header line as it is received, and
           `~HTTPResponse.headers` will be empty in the final response.
        :arg callable prepare_curl_callback: If set, will be called with
           a `pycurl.Curl` object to allow the application to make additional
           `setopt` calls.
        :arg string proxy_host: HTTP proxy hostname.  To use proxies,
           `proxy_host` and `proxy_port` must be set; `proxy_username` and
           `proxy_password` are optional.  Proxies are currently only
           supported with `curl_httpclient`.
        :arg int proxy_port: HTTP proxy port
        :arg string proxy_username: HTTP proxy username
        :arg string proxy_password: HTTP proxy password
        :arg bool allow_nonstandard_methods: Allow unknown values for `method`
           argument?
        :arg bool validate_cert: For HTTPS requests, validate the server's
           certificate?
        :arg string ca_certs: filename of CA certificates in PEM format,
           or None to use defaults.  Note that in `curl_httpclient`, if
           any request uses a custom `ca_certs` file, they all must (they
           don't have to all use the same `ca_certs`, but it's not possible
           to mix requests with ca_certs and requests that use the defaults).
        :arg bool allow_ipv6: Use IPv6 when available?  Default is false in
           `simple_httpclient` and true in `curl_httpclient`
        :arg string client_key: Filename for client SSL key, if any
        :arg string client_cert: Filename for client SSL certificate, if any
        """
        # Header preparation: the conditional-GET header is written into
        # whichever headers object ends up attached to the request.
        if headers is None:
            headers = httputil.HTTPHeaders()
        if if_modified_since:
            epoch_seconds = calendar.timegm(if_modified_since.utctimetuple())
            http_date = email.utils.formatdate(
                epoch_seconds, localtime=False, usegmt=True)
            headers["If-Modified-Since"] = http_date

        # What to request.
        self.url = url
        self.method = method
        self.headers = headers
        self.body = utf8(body)
        self.user_agent = user_agent
        self.use_gzip = use_gzip
        self.allow_nonstandard_methods = allow_nonstandard_methods

        # Credentials.
        self.auth_username = auth_username
        self.auth_password = auth_password

        # Proxy settings.
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_username = proxy_username
        self.proxy_password = proxy_password

        # Timeouts and redirect policy.
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        self.follow_redirects = follow_redirects
        self.max_redirects = max_redirects

        # Network and SSL options.
        self.network_interface = network_interface
        self.allow_ipv6 = allow_ipv6
        self.validate_cert = validate_cert
        self.ca_certs = ca_certs
        self.client_key = client_key
        self.client_cert = client_cert

        # Callbacks.
        self.streaming_callback = streaming_callback
        self.header_callback = header_callback
        self.prepare_curl_callback = prepare_curl_callback

        # Wall-clock time at which this request object was created.
        self.start_time = time.time()
00323
00324
class HTTPResponse(object):
    """HTTP Response object.

    Attributes:

    * request: HTTPRequest object

    * code: numeric HTTP status code, e.g. 200 or 404

    * headers: httputil.HTTPHeaders object

    * buffer: cStringIO object for response body

    * body: response body as string (created on demand from self.buffer)

    * effective_url: the ``effective_url`` argument if given, otherwise
      the request's own url

    * error: Exception object, if any

    * request_time: seconds from request start to finish

    * time_info: dictionary of diagnostic timing information from the request.
        Available data are subject to change, but currently uses timings
        available from http://curl.haxx.se/libcurl/c/curl_easy_getinfo.html,
        plus 'queue', which is the delay (if any) introduced by waiting for
        a slot under AsyncHTTPClient's max_clients setting.
    """
    def __init__(self, request, code, headers=None, buffer=None,
                 effective_url=None, error=None, request_time=None,
                 time_info=None):
        self.request = request
        self.code = code
        if headers is not None:
            self.headers = headers
        else:
            self.headers = httputil.HTTPHeaders()
        self.buffer = buffer
        # Cache for the body property; filled on first access.
        self._body = None
        if effective_url is None:
            self.effective_url = request.url
        else:
            self.effective_url = effective_url
        if error is None:
            # With no explicit error, any status outside 2xx becomes one.
            if self.code < 200 or self.code >= 300:
                self.error = HTTPError(self.code, response=self)
            else:
                self.error = None
        else:
            self.error = error
        self.request_time = request_time
        self.time_info = time_info or {}

    def _get_body(self):
        """Returns the buffer's contents (cached), or None without a buffer."""
        if self.buffer is None:
            return None
        elif self._body is None:
            self._body = self.buffer.getvalue()

        return self._body

    body = property(_get_body)

    def rethrow(self):
        """If there was an error on the request, raise an `HTTPError`."""
        if self.error:
            raise self.error

    def __repr__(self):
        # Walk the attribute names in sorted order so the output is
        # deterministic; plain dict indexing (rather than iteritems())
        # behaves the same on every Python version.
        attrs = self.__dict__
        args = ",".join("%s=%r" % (name, attrs[name])
                        for name in sorted(attrs))
        return "%s(%s)" % (self.__class__.__name__, args)
00393
00394
class HTTPError(Exception):
    """Exception thrown for an unsuccessful HTTP request.

    Attributes:

    code - HTTP error integer error code, e.g. 404.  Error code 599 is
           used when no HTTP response was received, e.g. for a timeout.

    response - HTTPResponse object, if any.

    Note that if follow_redirects is False, redirects become HTTPErrors,
    and you can look at error.response.headers['Location'] to see the
    destination of the redirect.
    """
    def __init__(self, code, message=None, response=None):
        # Fall back to the standard reason phrase for the status code, or
        # "Unknown" when the code has no registered phrase.
        if not message:
            message = httplib.responses.get(code, "Unknown")
        self.code = code
        self.response = response
        description = "HTTP %d: %s" % (code, message)
        Exception.__init__(self, description)
00414
00415
def main():
    """Command-line entry point: fetches each URL given as an argument.

    Defines the ``print_headers``, ``print_body``, ``follow_redirects`` and
    ``validate_cert`` options, then fetches every remaining command-line
    argument with a blocking `HTTPClient`, printing the response headers
    and/or body as selected.  An `HTTPError` that carries a response is
    reported like a normal response; one without a response is re-raised.
    """
    import sys
    from tornado.options import define, options, parse_command_line
    define("print_headers", type=bool, default=False)
    define("print_body", type=bool, default=True)
    define("follow_redirects", type=bool, default=True)
    define("validate_cert", type=bool, default=True)
    args = parse_command_line()
    client = HTTPClient()
    # try/finally so the client is closed even when an error is re-raised
    # part-way through the argument list.
    try:
        for arg in args:
            try:
                response = client.fetch(
                    arg,
                    follow_redirects=options.follow_redirects,
                    validate_cert=options.validate_cert,
                )
            except HTTPError:
                # Fetch the exception via sys.exc_info() so this clause
                # parses on every Python version ("except X, e" and
                # "except X as e" are mutually incompatible spellings).
                error = sys.exc_info()[1]
                if error.response is None:
                    raise
                response = error.response
            # Single-argument print(...) emits the same output whether
            # print is a statement or a function.
            if options.print_headers:
                print(response.headers)
            if options.print_body:
                print(response.body)
    finally:
        client.close()


if __name__ == "__main__":
    main()