00001 """Blocking and non-blocking HTTP client interfaces.
00002
00003 This module defines a common interface shared by two implementations,
00004 ``simple_httpclient`` and ``curl_httpclient``. Applications may either
00005 instantiate their chosen implementation class directly or use the
00006 `AsyncHTTPClient` class from this module, which selects an implementation
00007 that can be overridden with the `AsyncHTTPClient.configure` method.
00008
00009 The default implementation is ``simple_httpclient``, and this is expected
00010 to be suitable for most users' needs. However, some applications may wish
00011 to switch to ``curl_httpclient`` for reasons such as the following:
00012
00013 * ``curl_httpclient`` has some features not found in ``simple_httpclient``,
00014 including support for HTTP proxies and the ability to use a specified
00015 network interface.
00016
00017 * ``curl_httpclient`` is more likely to be compatible with sites that are
00018 not-quite-compliant with the HTTP spec, or sites that use little-exercised
00019 features of HTTP.
00020
00021 * ``curl_httpclient`` is faster.
00022
00023 * ``curl_httpclient`` was the default prior to Tornado 2.0.
00024
00025 Note that if you are using ``curl_httpclient``, it is highly
00026 recommended that you use a recent version of ``libcurl`` and
00027 ``pycurl``. Currently the minimum supported version of libcurl is
00028 7.21.1, and the minimum version of pycurl is 7.18.2. It is highly
recommended that your ``libcurl`` installation is built with an
asynchronous DNS resolver (threaded or c-ares), otherwise you may
00031 encounter various problems with request timeouts (for more
00032 information, see
00033 http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUTMS
00034 and comments in curl_httpclient.py).
00035
00036 To select ``curl_httpclient``, call `AsyncHTTPClient.configure` at startup::
00037
00038 AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
00039 """
00040
00041 from __future__ import absolute_import, division, print_function, with_statement
00042
00043 import functools
00044 import time
00045 import weakref
00046
00047 from tornado.concurrent import TracebackFuture
00048 from tornado.escape import utf8, native_str
00049 from tornado import httputil, stack_context
00050 from tornado.ioloop import IOLoop
00051 from tornado.util import Configurable
00052
00053
class HTTPClient(object):
    """A blocking HTTP client.

    This interface is provided for convenience and testing; most applications
    that are running an IOLoop will want to use `AsyncHTTPClient` instead.
    Typical usage looks like this::

        http_client = httpclient.HTTPClient()
        try:
            response = http_client.fetch("http://www.google.com/")
            print(response.body)
        except httpclient.HTTPError as e:
            print("Error:", e)
        http_client.close()
    """
    def __init__(self, async_client_class=None, **kwargs):
        """Create a private `.IOLoop` and an asynchronous client bound to it.

        :arg async_client_class: class used to build the underlying
            asynchronous client; defaults to `AsyncHTTPClient`. It is called
            with the private IOLoop as its only positional argument.
        :arg kwargs: passed through unchanged to ``async_client_class``.
        """
        # Mark the object as closed before doing anything that can raise, so
        # that ``__del__`` -> ``close()`` is a harmless no-op (instead of an
        # AttributeError on ``_closed``) if construction fails part-way.
        self._closed = True
        self._io_loop = IOLoop()
        if async_client_class is None:
            async_client_class = AsyncHTTPClient
        try:
            self._async_client = async_client_class(self._io_loop, **kwargs)
        except Exception:
            # Nobody else holds a reference to the private loop yet, so
            # release it here rather than leaking it.
            self._io_loop.close()
            raise
        self._closed = False

    def __del__(self):
        self.close()

    def close(self):
        """Closes the HTTPClient, freeing any resources used.

        Calling ``close()`` more than once is allowed; later calls do nothing.
        """
        if self._closed:
            return
        self._closed = True
        try:
            self._async_client.close()
        finally:
            # Close the private loop even if the client's close() raised.
            self._io_loop.close()

    def fetch(self, request, **kwargs):
        """Executes a request, returning an `HTTPResponse`.

        The request may be either a string URL or an `HTTPRequest` object.
        If it is a string, we construct an `HTTPRequest` using any additional
        kwargs: ``HTTPRequest(request, **kwargs)``

        If an error occurs during the fetch, we raise an `HTTPError`.
        """
        # Drive the private IOLoop until the asynchronous fetch completes.
        response = self._io_loop.run_sync(functools.partial(
            self._async_client.fetch, request, **kwargs))
        response.rethrow()
        return response
00099
00100
class AsyncHTTPClient(Configurable):
    """A non-blocking HTTP client.

    Example usage::

        def handle_request(response):
            if response.error:
                print("Error:", response.error)
            else:
                print(response.body)

        http_client = AsyncHTTPClient()
        http_client.fetch("http://www.google.com/", handle_request)

    Constructing this class is unusual in two ways. First, the object
    returned is an instance of an implementation-specific subclass.
    Second, instances behave as pseudo-singletons: one instance is
    shared per `.IOLoop`. Pass ``force_instance=True`` to bypass the
    sharing and get a fresh instance. Unless ``force_instance=True`` is
    used, ``io_loop`` is the only argument that should be given to the
    constructor; the implementation class and its constructor arguments
    are chosen with the class method `configure()`.

    Every implementation accepts a ``defaults`` keyword argument that
    supplies default values for `HTTPRequest` attributes. For example::

        AsyncHTTPClient.configure(
            None, defaults=dict(user_agent="MyUserAgent"))
        # or with force_instance:
        client = AsyncHTTPClient(force_instance=True,
                                 defaults=dict(user_agent="MyUserAgent"))
    """
    @classmethod
    def configurable_base(cls):
        """Return the root class of the configurable hierarchy."""
        return AsyncHTTPClient

    @classmethod
    def configurable_default(cls):
        """Return the implementation class used when none is configured."""
        # Imported inside the method rather than at module import time.
        from tornado.simple_httpclient import SimpleAsyncHTTPClient
        return SimpleAsyncHTTPClient

    @classmethod
    def _async_clients(cls):
        """Return the cache that maps each IOLoop to its shared client.

        The cache is a ``WeakKeyDictionary`` stored on the class under an
        attribute name derived from ``cls.__name__``; it is created the
        first time it is requested.
        """
        cache_attr = '_async_client_dict_' + cls.__name__
        try:
            return getattr(cls, cache_attr)
        except AttributeError:
            setattr(cls, cache_attr, weakref.WeakKeyDictionary())
            return getattr(cls, cache_attr)

    def __new__(cls, io_loop=None, force_instance=False, **kwargs):
        """Return the shared client for ``io_loop``, creating it if needed.

        With ``force_instance=True`` no cache is consulted or updated and a
        new instance is always built.
        """
        io_loop = io_loop or IOLoop.current()
        if force_instance:
            cache = None
        else:
            cache = cls._async_clients()
            if io_loop in cache:
                return cache[io_loop]
        client = super(AsyncHTTPClient, cls).__new__(cls, io_loop=io_loop,
                                                     **kwargs)
        # Record on the instance which cache (if any) it was placed in;
        # close() removes the instance from exactly that cache.
        client._instance_cache = cache
        if cache is not None:
            cache[client.io_loop] = client
        return client

    def initialize(self, io_loop, defaults=None):
        """Store the IOLoop and build the per-client request defaults.

        ``defaults`` (if given) is layered on top of a copy of
        ``HTTPRequest._DEFAULTS``.
        """
        merged = dict(HTTPRequest._DEFAULTS)
        if defaults is not None:
            merged.update(defaults)
        self.io_loop = io_loop
        self.defaults = merged
        self._closed = False

    def close(self):
        """Destroys this HTTP client, freeing any file descriptors used.

        Calling this is **not needed in normal use**, because
        `AsyncHTTPClient` objects are transparently reused. It is
        generally only required when the `.IOLoop` is being closed as
        well, or when the client was created with ``force_instance=True``.

        After ``close()`` no other method of the `AsyncHTTPClient` may be
        called. Raises ``RuntimeError`` if the instance cache does not
        hold this client for its IOLoop.
        """
        if self._closed:
            return
        self._closed = True
        cache = self._instance_cache
        if cache is None:
            return
        if cache.get(self.io_loop) is not self:
            raise RuntimeError("inconsistent AsyncHTTPClient cache")
        del cache[self.io_loop]

    def fetch(self, request, callback=None, **kwargs):
        """Executes a request, asynchronously returning an `HTTPResponse`.

        ``request`` is either a string URL or an `HTTPRequest`. A string
        is turned into a request with ``HTTPRequest(request, **kwargs)``.

        The return value is a `.Future` that resolves to an
        `HTTPResponse`. If the response carries an error (for example an
        `HTTPError` for a status code outside the 2xx range), the
        ``Future`` raises that error instead.

        When ``callback`` is supplied it is scheduled on the IOLoop with
        the `HTTPResponse`. The callback interface never raises
        `HTTPError` for you: inspect the response's ``error`` attribute
        or call its `~HTTPResponse.rethrow` method.

        Raises ``RuntimeError`` if the client has already been closed.
        """
        if self._closed:
            raise RuntimeError("fetch() called on closed AsyncHTTPClient")
        if not isinstance(request, HTTPRequest):
            request = HTTPRequest(url=request, **kwargs)
        # Replace the headers with a fresh HTTPHeaders built from the
        # current ones, so plain dicts become HTTPHeaders and later header
        # additions do not touch the mapping the caller passed in.
        request.headers = httputil.HTTPHeaders(request.headers)
        proxied = _RequestProxy(request, self.defaults)
        result_future = TracebackFuture()

        if callback is not None:
            callback = stack_context.wrap(callback)

            def deliver_to_callback(done):
                # Turn the future's outcome back into a response object.
                failure = done.exception()
                if failure is None:
                    outcome = done.result()
                elif (isinstance(failure, HTTPError) and
                        failure.response is not None):
                    outcome = failure.response
                else:
                    elapsed = time.time() - proxied.start_time
                    outcome = HTTPResponse(
                        proxied, 599, error=failure, request_time=elapsed)
                self.io_loop.add_callback(callback, outcome)
            result_future.add_done_callback(deliver_to_callback)

        def resolve_future(response):
            if response.error:
                result_future.set_exception(response.error)
                return
            result_future.set_result(response)

        self.fetch_impl(proxied, resolve_future)
        return result_future

    def fetch_impl(self, request, callback):
        """Perform the request; must be overridden by implementations."""
        raise NotImplementedError()

    @classmethod
    def configure(cls, impl, **kwargs):
        """Configures the `AsyncHTTPClient` subclass to use.

        ``AsyncHTTPClient()`` actually builds an instance of a subclass.
        ``impl`` may be a class object, the fully-qualified name of such a
        class, or ``None`` to select the default
        (``SimpleAsyncHTTPClient``).

        Extra keyword arguments are handed to the constructor of every
        subclass instance that gets created. ``max_clients`` sets the
        maximum number of `~AsyncHTTPClient.fetch()` operations that may
        run in parallel on each `.IOLoop`. Other arguments may be
        supported depending on the implementation class in use.

        Example::

           AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
        """
        super(AsyncHTTPClient, cls).configure(impl, **kwargs)
00272
00273
class HTTPRequest(object):
    """HTTP client request object."""

    # Default values for HTTPRequest parameters.
    # `AsyncHTTPClient.initialize` copies this dict (optionally overlaid with
    # user-supplied defaults), and `_RequestProxy` falls back to it for any
    # request attribute whose value is None.
    _DEFAULTS = dict(
        connect_timeout=20.0,
        request_timeout=20.0,
        follow_redirects=True,
        max_redirects=5,
        decompress_response=True,
        proxy_password='',
        allow_nonstandard_methods=False,
        validate_cert=True)

    def __init__(self, url, method="GET", headers=None, body=None,
                 auth_username=None, auth_password=None, auth_mode=None,
                 connect_timeout=None, request_timeout=None,
                 if_modified_since=None, follow_redirects=None,
                 max_redirects=None, user_agent=None, use_gzip=None,
                 network_interface=None, streaming_callback=None,
                 header_callback=None, prepare_curl_callback=None,
                 proxy_host=None, proxy_port=None, proxy_username=None,
                 proxy_password=None, allow_nonstandard_methods=None,
                 validate_cert=None, ca_certs=None,
                 allow_ipv6=None,
                 client_key=None, client_cert=None, body_producer=None,
                 expect_100_continue=False, decompress_response=None):
        r"""All parameters except ``url`` are optional.

        :arg string url: URL to fetch
        :arg string method: HTTP method, e.g. "GET" or "POST"
        :arg headers: Additional HTTP headers to pass on the request
        :type headers: `~tornado.httputil.HTTPHeaders` or `dict`
        :arg body: HTTP request body as a string (byte or unicode; if unicode
           the utf-8 encoding will be used)
        :arg body_producer: Callable used for lazy/asynchronous request bodies.
           It is called with one argument, a ``write`` function, and should
           return a `.Future`.  It should call the write function with new
           data as it becomes available.  The write function returns a
           `.Future` which can be used for flow control.
           Only one of ``body`` and ``body_producer`` may
           be specified.  ``body_producer`` is not supported on
           ``curl_httpclient``.  When using ``body_producer`` it is recommended
           to pass a ``Content-Length`` in the headers as otherwise chunked
           encoding will be used, and many servers do not support chunked
           encoding on requests.  New in Tornado 4.0
        :arg string auth_username: Username for HTTP authentication
        :arg string auth_password: Password for HTTP authentication
        :arg string auth_mode: Authentication mode; default is "basic".
           Allowed values are implementation-defined; ``curl_httpclient``
           supports "basic" and "digest"; ``simple_httpclient`` only supports
           "basic"
        :arg float connect_timeout: Timeout for initial connection in seconds
        :arg float request_timeout: Timeout for entire request in seconds
        :arg if_modified_since: Timestamp for ``If-Modified-Since`` header.
           Any value other than ``None`` (including ``0``, the epoch) causes
           the header to be set.
        :type if_modified_since: `datetime` or `float`
        :arg bool follow_redirects: Should redirects be followed automatically
           or return the 3xx response?
        :arg int max_redirects: Limit for ``follow_redirects``
        :arg string user_agent: String to send as ``User-Agent`` header
        :arg bool decompress_response: Request a compressed response from
           the server and decompress it after downloading.  Default is True.
           New in Tornado 4.0.
        :arg bool use_gzip: Deprecated alias for ``decompress_response``
           since Tornado 4.0.
        :arg string network_interface: Network interface to use for request.
           ``curl_httpclient`` only; see note below.
        :arg callable streaming_callback: If set, ``streaming_callback`` will
           be run with each chunk of data as it is received, and
           ``HTTPResponse.body`` and ``HTTPResponse.buffer`` will be empty in
           the final response.
        :arg callable header_callback: If set, ``header_callback`` will
           be run with each header line as it is received (including the
           first line, e.g. ``HTTP/1.0 200 OK\r\n``, and a final line
           containing only ``\r\n``.  All lines include the trailing newline
           characters).  ``HTTPResponse.headers`` will be empty in the final
           response.  This is most useful in conjunction with
           ``streaming_callback``, because it's the only way to get access to
           header data while the request is in progress.
        :arg callable prepare_curl_callback: If set, will be called with
           a ``pycurl.Curl`` object to allow the application to make additional
           ``setopt`` calls.
        :arg string proxy_host: HTTP proxy hostname.  To use proxies,
           ``proxy_host`` and ``proxy_port`` must be set; ``proxy_username`` and
           ``proxy_password`` are optional.  Proxies are currently only
           supported with ``curl_httpclient``.
        :arg int proxy_port: HTTP proxy port
        :arg string proxy_username: HTTP proxy username
        :arg string proxy_password: HTTP proxy password
        :arg bool allow_nonstandard_methods: Allow unknown values for ``method``
           argument?
        :arg bool validate_cert: For HTTPS requests, validate the server's
           certificate?
        :arg string ca_certs: filename of CA certificates in PEM format,
           or None to use defaults.  See note below when used with
           ``curl_httpclient``.
        :arg bool allow_ipv6: Use IPv6 when available?  Default is false in
           ``simple_httpclient`` and true in ``curl_httpclient``
        :arg string client_key: Filename for client SSL key, if any.  See
           note below when used with ``curl_httpclient``.
        :arg string client_cert: Filename for client SSL certificate, if any.
           See note below when used with ``curl_httpclient``.
        :arg bool expect_100_continue: If true, send the
           ``Expect: 100-continue`` header and wait for a continue response
           before sending the request body.  Only supported with
           simple_httpclient.

        .. note::

            When using ``curl_httpclient`` certain options may be
            inherited by subsequent fetches because ``pycurl`` does
            not allow them to be cleanly reset.  This applies to the
            ``ca_certs``, ``client_key``, ``client_cert``, and
            ``network_interface`` arguments.  If you use these
            options, you should pass them on every request (you don't
            have to always use the same values, but it's not possible
            to mix requests that specify these options with ones that
            use the defaults).

        .. versionadded:: 3.1
           The ``auth_mode`` argument.

        .. versionadded:: 4.0
           The ``body_producer`` and ``expect_100_continue`` arguments.
        """
        # Note that some of these attributes go through the property setters
        # defined further down in this class.
        self.headers = headers
        # Compare against None rather than relying on truthiness, so that a
        # legitimate falsy timestamp such as 0 (the epoch) is not dropped.
        if if_modified_since is not None:
            self.headers["If-Modified-Since"] = httputil.format_timestamp(
                if_modified_since)
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_username = proxy_username
        self.proxy_password = proxy_password
        self.url = url
        self.method = method
        self.body = body
        self.body_producer = body_producer
        self.auth_username = auth_username
        self.auth_password = auth_password
        self.auth_mode = auth_mode
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        self.follow_redirects = follow_redirects
        self.max_redirects = max_redirects
        self.user_agent = user_agent
        # ``use_gzip`` is the deprecated alias; it is only consulted when
        # ``decompress_response`` was not given.
        if decompress_response is not None:
            self.decompress_response = decompress_response
        else:
            self.decompress_response = use_gzip
        self.network_interface = network_interface
        self.streaming_callback = streaming_callback
        self.header_callback = header_callback
        self.prepare_curl_callback = prepare_curl_callback
        self.allow_nonstandard_methods = allow_nonstandard_methods
        self.validate_cert = validate_cert
        self.ca_certs = ca_certs
        self.allow_ipv6 = allow_ipv6
        self.client_key = client_key
        self.client_cert = client_cert
        self.expect_100_continue = expect_100_continue
        # Creation time; used elsewhere in this module to compute
        # ``request_time`` for synthesized 599 responses.
        self.start_time = time.time()

    @property
    def headers(self):
        return self._headers

    @headers.setter
    def headers(self, value):
        # ``None`` is replaced by an empty HTTPHeaders so the attribute can
        # always be used as a mapping.
        if value is None:
            self._headers = httputil.HTTPHeaders()
        else:
            self._headers = value

    @property
    def body(self):
        return self._body

    @body.setter
    def body(self, value):
        # The body is stored after being passed through ``utf8()``.
        self._body = utf8(value)

    # Each of the callback-style attributes below is stored after being
    # passed through ``stack_context.wrap``.

    @property
    def body_producer(self):
        return self._body_producer

    @body_producer.setter
    def body_producer(self, value):
        self._body_producer = stack_context.wrap(value)

    @property
    def streaming_callback(self):
        return self._streaming_callback

    @streaming_callback.setter
    def streaming_callback(self, value):
        self._streaming_callback = stack_context.wrap(value)

    @property
    def header_callback(self):
        return self._header_callback

    @header_callback.setter
    def header_callback(self, value):
        self._header_callback = stack_context.wrap(value)

    @property
    def prepare_curl_callback(self):
        return self._prepare_curl_callback

    @prepare_curl_callback.setter
    def prepare_curl_callback(self, value):
        self._prepare_curl_callback = stack_context.wrap(value)
00490
00491
class HTTPResponse(object):
    """HTTP Response object.

    Attributes:

    * request: HTTPRequest object

    * code: numeric HTTP status code, e.g. 200 or 404

    * reason: human-readable reason phrase describing the status code

    * headers: `tornado.httputil.HTTPHeaders` object

    * effective_url: final location of the resource after following any
      redirects

    * buffer: ``cStringIO`` object for response body

    * body: response body as string (created on demand from ``self.buffer``)

    * error: Exception object, if any

    * request_time: seconds from request start to finish

    * time_info: dictionary of diagnostic timing information from the request.
      Available data are subject to change, but currently uses timings
      available from http://curl.haxx.se/libcurl/c/curl_easy_getinfo.html,
      plus ``queue``, which is the delay (if any) introduced by waiting for
      a slot under `AsyncHTTPClient`'s ``max_clients`` setting.
    """
    def __init__(self, request, code, headers=None, buffer=None,
                 effective_url=None, error=None, request_time=None,
                 time_info=None, reason=None):
        # Keep the underlying request, not the defaults-proxy wrapped
        # around it.
        self.request = (request.request
                        if isinstance(request, _RequestProxy) else request)
        self.code = code
        self.reason = reason or httputil.responses.get(code, "Unknown")
        self.headers = httputil.HTTPHeaders() if headers is None else headers
        self.buffer = buffer
        # Filled in lazily by the ``body`` property.
        self._body = None
        self.effective_url = (request.url
                              if effective_url is None else effective_url)
        if error is not None:
            self.error = error
        elif 200 <= self.code < 300:
            self.error = None
        else:
            # No explicit error was supplied: any status outside 2xx is
            # reported as an HTTPError that points back at this response.
            self.error = HTTPError(self.code, message=self.reason,
                                   response=self)
        self.request_time = request_time
        self.time_info = time_info or {}

    def _get_body(self):
        """Return the body, reading it from ``buffer`` on first access."""
        if self.buffer is None:
            return None
        if self._body is None:
            self._body = self.buffer.getvalue()
        return self._body

    body = property(_get_body)

    def rethrow(self):
        """If there was an error on the request, raise an `HTTPError`."""
        if self.error:
            raise self.error

    def __repr__(self):
        fields = sorted(self.__dict__.items())
        rendered = ",".join("%s=%r" % pair for pair in fields)
        return "%s(%s)" % (self.__class__.__name__, rendered)
00570
00571
class HTTPError(Exception):
    """Exception thrown for an unsuccessful HTTP request.

    Attributes:

    * ``code`` - HTTP error integer error code, e.g. 404.  Error code 599 is
      used when no HTTP response was received, e.g. for a timeout.

    * ``message`` - the reason phrase used in the exception text; taken from
      the ``message`` argument, or looked up from ``code`` when that is empty.

    * ``response`` - `HTTPResponse` object, if any.

    Note that if ``follow_redirects`` is False, redirects become HTTPErrors,
    and you can look at ``error.response.headers['Location']`` to see the
    destination of the redirect.
    """
    def __init__(self, code, message=None, response=None):
        self.code = code
        self.message = message or httputil.responses.get(code, "Unknown")
        self.response = response
        Exception.__init__(self, "HTTP %d: %s" % (self.code, self.message))

    def __reduce__(self):
        # The default Exception.__reduce__ rebuilds the object as
        # ``cls(*self.args)``.  ``args`` holds only the formatted text, which
        # would be passed back in as ``code`` and make the ``%d`` formatting
        # in __init__ fail.  Rebuild from the real constructor arguments so
        # that copy and pickle work.
        return (self.__class__,
                (self.code, self.message, self.response),
                self.__dict__)
00591
00592
class _RequestProxy(object):
    """Combines an object with a dictionary of defaults.

    Used internally by AsyncHTTPClient implementations.
    """
    def __init__(self, request, defaults):
        self.defaults = defaults
        self.request = request

    def __getattr__(self, name):
        # Only reached for names not found on the proxy itself.  The wrapped
        # request wins whenever its value is not None; otherwise the value
        # comes from the defaults mapping (None if absent or no defaults).
        value = getattr(self.request, name)
        if value is None and self.defaults is not None:
            return self.defaults.get(name, None)
        return value
00610
00611
def main():
    """Command-line entry point: fetch each URL given as an argument.

    Defines the ``print_headers``, ``print_body``, ``follow_redirects`` and
    ``validate_cert`` options, then fetches every remaining command-line
    argument with a blocking `HTTPClient`, printing headers and/or body as
    requested.  An `HTTPError` that carries a response is treated as a
    normal result; one without a response is re-raised.
    """
    from tornado.options import define, options, parse_command_line
    define("print_headers", type=bool, default=False)
    define("print_body", type=bool, default=True)
    define("follow_redirects", type=bool, default=True)
    define("validate_cert", type=bool, default=True)
    args = parse_command_line()
    client = HTTPClient()
    # Close the client even when a fetch fails and the error propagates.
    try:
        for arg in args:
            try:
                response = client.fetch(
                    arg,
                    follow_redirects=options.follow_redirects,
                    validate_cert=options.validate_cert,
                )
            except HTTPError as e:
                if e.response is not None:
                    response = e.response
                else:
                    raise
            if options.print_headers:
                print(response.headers)
            if options.print_body:
                print(native_str(response.body))
    finally:
        client.close()


if __name__ == "__main__":
    main()