# -*- coding: utf-8 -*-
#   This library is free software; you can redistribute it and/or
#   modify it under the terms of the GNU Lesser General Public
#   License as published by the Free Software Foundation; either
#   version 2.1 of the License, or (at your option) any later version.
#
#   This library is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public
#   License along with this library; if not, write to the
#      Free Software Foundation, Inc.,
#      59 Temple Place, Suite 330,
#      Boston, MA  02111-1307  USA
#
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
23 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
26 >>> from keepalive import HTTPHandler
27 >>> keepalive_handler = HTTPHandler()
28 >>> opener = urllib2.build_opener(keepalive_handler)
29 >>> urllib2.install_opener(opener)
31 >>> fo = urllib2.urlopen('http://www.python.org')
33 If a connection to a given host is requested, and all of the existing
34 connections are still in use, another connection will be opened. If
35 the handler tries to use an existing connection but it fails in some
36 way, it will be closed and removed from the pool.
To remove the handler, simply re-run build_opener with no arguments, and
install that opener.
41 You can explicitly close connections by using the close_connection()
42 method of the returned file-like object (described below) or you can
43 use the handler methods:
  close_connection(host)
  close_all()
49 NOTE: using the close_connection and close_all methods of the handler
50 should be done with care when using multiple threads.
51 * there is nothing that prevents another thread from creating new
52 connections immediately after connections are closed
53 * no checks are done to prevent in-use connections from being closed
55 >>> keepalive_handler.close_all()
57 EXTRA ATTRIBUTES AND METHODS
59 Upon a status of 200, the object returned has a few additional
60 attributes and methods, which should not be used if you want to
61 remain consistent with the normal urllib2-returned objects:
63 close_connection() - close the connection to the host
64 readlines() - you know, readlines()
65 status - the return status (ie 404)
66 reason - english translation of status (ie 'File not found')
68 If you want the best of both worlds, use this inside an
69 AttributeError-catching try:
71 >>> try: status = fo.status
72 >>> except AttributeError: status = None
74 Unfortunately, these are ONLY there if status == 200, so it's not
75 easy to distinguish between non-200 responses. The reason is that
76 urllib2 tries to do clever things with error codes 301, 302, 401,
77 and 407, and it wraps the object upon return.
79 For python versions earlier than 2.4, you can avoid this fancy error
80 handling by setting the module-level global HANDLE_ERRORS to zero.
81 You see, prior to 2.4, it's the HTTP Handler's job to determine what
82 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
83 means "pass everything up". In python 2.4, however, this job no
84 longer belongs to the HTTP Handler and is now done by a NEW handler,
85 HTTPErrorProcessor. Here's the bottom line:
  If you are using python 2.3 or earlier, HANDLE_ERRORS works as follows:

    HANDLE_ERRORS == 1  (default) pass up 200, treat the rest as
                        errors
    HANDLE_ERRORS == 0  pass everything up, error processing is
                        left to the calling code

  In python 2.4 and later:

    HANDLE_ERRORS == 1  pass up 200, treat the rest as errors
    HANDLE_ERRORS == 0  (default) pass everything up, let the
                        other handlers (specifically,
                        HTTPErrorProcessor) decide what to do
98 In practice, setting the variable either way makes little difference
99 in python 2.4, so for the most consistent behavior across versions,
100 you probably just want to use the defaults, which will give you
101 exceptions on errors.
# Prior to python 2.4 the HTTP handler itself must decide which statuses
# to treat as errors (HANDLE_ERRORS == 1).  From 2.4 on, urllib2's
# HTTPErrorProcessor owns that job, so by default pass everything up.
if sys.version_info < (2, 4): HANDLE_ERRORS = 1
else: HANDLE_ERRORS = 0
class ConnectionManager:
    """Thread-safe bookkeeping for the keepalive connection pool.

    The connection manager must be able to:
      * keep track of all existing connections
      * map connections to hosts and back
      * remember which connections are ready (idle) for re-use
    """
    def __init__(self):
        # threading.Lock() is the supported spelling of the deprecated
        # thread.allocate_lock(); it guards all three maps below.
        import threading
        self._lock = threading.Lock()
        self._hostmap = {}  # map hosts to a list of connections
        self._connmap = {}  # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """register <connection> for <host> with the given ready state"""
        self._lock.acquire()
        try:
            if host not in self._hostmap: self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """forget <connection> entirely; unknown connections are ignored"""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                # drop empty host entries so get_all() stays tidy
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """mark <connection> idle (true) or busy; ignore unknown ones"""
        try: self._readymap[connection] = ready
        except KeyError: pass

    def get_ready_conn(self, host):
        """return a ready connection to <host>, marking it busy, else None"""
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """return a copy of the connection list for <host>, or, with no
        host, a copy of the whole host -> connections map"""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
180 class KeepAliveHandler:
182 self._cm = ConnectionManager()
184 #### Connection Management
185 def open_connections(self):
186 """return a list of connected hosts and the number of connections
187 to each. [('foo.com:80', 2), ('bar.org', 1)]"""
188 return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
190 def close_connection(self, host):
191 """close connection(s) to <host>
192 host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
193 no error occurs if there is no connection to that host."""
194 for h in self._cm.get_all(host):
199 """close all open connections"""
200 for host, conns in self._cm.get_all().items():
205 def _request_closed(self, request, host, connection):
206 """tells us that this request is now closed and the the
207 connection is ready for another request"""
208 self._cm.set_ready(connection, 1)
210 def _remove_connection(self, host, connection, close=0):
211 if close: connection.close()
212 self._cm.remove(connection)
214 #### Transaction Execution
215 def do_open(self, req):
216 host = req.get_host()
218 raise urllib2.URLError('no host given')
221 h = self._cm.get_ready_conn(host)
223 r = self._reuse_connection(h, req, host)
225 # if this response is non-None, then it worked and we're
226 # done. Break out, skipping the else block.
229 # connection is bad - possibly closed by server
230 # discard it and ask for the next free connection
233 h = self._cm.get_ready_conn(host)
235 # no (working) free connections were found. Create a new one.
236 h = self._get_connection(host)
237 if DEBUG: DEBUG.info("creating new connection to %s (%d)",
239 self._cm.add(host, h, 0)
240 self._start_transaction(h, req)
242 except (socket.error, httplib.HTTPException), err:
243 raise urllib2.URLError(err)
245 # if not a persistent connection, don't try to reuse it
246 if r.will_close: self._cm.remove(h)
248 if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
251 r._url = req.get_full_url()
257 if r.status == 200 or not HANDLE_ERRORS:
260 return self.parent.error('http', req, r,
261 r.status, r.msg, r.headers)
263 def _reuse_connection(self, h, req, host):
264 """start the transaction with a re-used connection
265 return a response object (r) upon success or None on failure.
266 This DOES not close or remove bad connections in cases where
267 it returns. However, if an unexpected exception occurs, it
268 will close and remove the connection before re-raising.
271 self._start_transaction(h, req)
273 # note: just because we got something back doesn't mean it
274 # worked. We'll check the version below, too.
275 except (socket.error, httplib.HTTPException):
278 # adding this block just in case we've missed
279 # something we will still raise the exception, but
280 # lets try and close the connection and remove it
281 # first. We previously got into a nasty loop
282 # where an exception was uncaught, and so the
283 # connection stayed open. On the next try, the
284 # same exception was raised, etc. The tradeoff is
285 # that it's now possible this call will raise
286 # a DIFFERENT exception
287 if DEBUG: DEBUG.error("unexpected exception - closing " + \
288 "connection to %s (%d)", host, id(h))
293 if r is None or r.version == 9:
294 # httplib falls back to assuming HTTP 0.9 if it gets a
295 # bad header back. This is most likely to happen if
296 # the socket has been closed by the server since we
297 # last used the connection.
298 if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
302 if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
306 def _start_transaction(self, h, req):
309 data = req.get_data()
310 h.putrequest('POST', req.get_selector(), skip_accept_encoding=1)
311 if not req.headers.has_key('Content-type'):
312 h.putheader('Content-type',
313 'application/x-www-form-urlencoded')
314 if not req.headers.has_key('Content-length'):
315 h.putheader('Content-length', '%d' % len(data))
317 h.putrequest('GET', req.get_selector(), skip_accept_encoding=1)
318 except (socket.error, httplib.HTTPException), err:
319 raise urllib2.URLError(err)
321 for args in self.parent.addheaders:
323 for k, v in req.headers.items():
329 def _get_connection(self, host):
330 return NotImplementedError
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    """keepalive-aware drop-in replacement for urllib2.HTTPHandler"""
    def __init__(self):
        KeepAliveHandler.__init__(self)

    def http_open(self, req):
        return self.do_open(req)

    def _get_connection(self, host):
        return HTTPConnection(host)
class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler):
    """keepalive-aware drop-in replacement for urllib2.HTTPSHandler"""
    def __init__(self, ssl_factory=None):
        KeepAliveHandler.__init__(self)
        if not ssl_factory:
            # fall back to the module-level default SSL factory
            ssl_factory = sslfactory.get_factory()
        self._ssl_factory = ssl_factory

    def https_open(self, req):
        return self.do_open(req)

    def _get_connection(self, host):
        return self._ssl_factory.get_https_connection(host)
class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods
    #
    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.
    #
    # the read method wraps the original to accomodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py

    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        # `strict` is accepted for interface compatibility but unused
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''          # readline()/read() look-ahead buffer
        self._rbufsize = 8096
        self._handler = None     # inserted by the handler later
        self._host = None        # (same)
        self._url = None         # (same)
        self._connection = None  # (same)

    _raw_read = httplib.HTTPResponse.read

    def close(self):
        # close the file object and hand the (possibly reusable)
        # connection back to the handler's pool
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                self._handler._request_closed(self, self._host,
                                              self._connection)

    def close_connection(self):
        """hard-close the underlying connection (no keepalive re-use)"""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()

    def info(self):
        return self.headers

    def geturl(self):
        return self._url

    def read(self, amt=None):
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and amt is not None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    def readline(self, limit=-1):
        data = ""
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint=0):
        total = 0
        lines = []  # renamed from `list` to avoid shadowing the builtin
        while 1:
            line = self.readline()
            if not line: break
            lines.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return lines
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class so responses support readline(),
    # readlines(), close_connection(), info() and geturl()
    response_class = HTTPResponse
class HTTPSConnection(httplib.HTTPSConnection):
    # use the modified response class (same reason as HTTPConnection)
    response_class = HTTPResponse
#########################################################################
#####   TEST FUNCTIONS
#########################################################################
464 def error_handler(url):
467 keepalive_handler = HTTPHandler()
468 opener = urllib2.build_opener(keepalive_handler)
469 urllib2.install_opener(opener)
470 pos = {0: 'off', 1: 'on'}
472 print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
475 fo = urllib2.urlopen(url)
478 try: status, reason = fo.status, fo.reason
479 except AttributeError: status, reason = None, None
481 print " EXCEPTION: %s" % e
484 print " status = %s, reason = %s" % (status, reason)
486 hosts = keepalive_handler.open_connections()
487 print "open connections:", hosts
488 keepalive_handler.close_all()
494 # first fetch the file with the normal http handler
495 opener = urllib2.build_opener()
496 urllib2.install_opener(opener)
497 fo = urllib2.urlopen(url)
501 print format % ('normal urllib', m.hexdigest())
503 # now install the keepalive handler and try again
504 opener = urllib2.build_opener(HTTPHandler())
505 urllib2.install_opener(opener)
507 fo = urllib2.urlopen(url)
511 print format % ('keepalive read', m.hexdigest())
513 fo = urllib2.urlopen(url)
521 print format % ('keepalive readline', m.hexdigest())
524 print ' making %i connections to:\n %s' % (N, url)
526 sys.stdout.write(' first using the normal urllib handlers')
527 # first use normal opener
528 opener = urllib2.build_opener()
529 urllib2.install_opener(opener)
531 print ' TIME: %.3f s' % t1
533 sys.stdout.write(' now using the keepalive handler ')
534 # now install the keepalive handler and try again
535 opener = urllib2.build_opener(HTTPHandler())
536 urllib2.install_opener(opener)
538 print ' TIME: %.3f s' % t2
539 print ' improvement factor: %.2f' % (t1/t2, )
541 def fetch(N, url, delay=0):
544 starttime = time.time()
546 if delay and i > 0: time.sleep(delay)
547 fo = urllib2.urlopen(url)
550 lens.append(len(foo))
551 diff = time.time() - starttime
557 print "WARNING: inconsistent length on read %i: %i" % (j, i)
561 def test_timeout(url):
565 def debug(self, msg, *args): print msg % args
566 info = warning = error = debug
568 print " fetching the file to establish a connection"
569 fo = urllib2.urlopen(url)
574 print " waiting %i seconds for the server to close the connection" % i
576 sys.stdout.write('\r %2i' % i)
580 sys.stderr.write('\r')
582 print " fetching the file a second time"
583 fo = urllib2.urlopen(url)
588 print ' data are identical'
590 print ' ERROR: DATA DIFFER'
596 print "checking error hander (do this on a non-200)"
597 try: error_handler(url)
599 print "exiting - exception will prevent further tests"
602 print "performing continuity test (making sure stuff isn't corrupted)"
605 print "performing speed comparison"
608 print "performing dropped-connection check"
611 if __name__ == '__main__':
618 print "%s <integer> <url>" % sys.argv[0]