# -*- coding: utf-8 -*-
#   This library is free software; you can redistribute it and/or
#   modify it under the terms of the GNU Lesser General Public
#   License as published by the Free Software Foundation; either
#   version 2.1 of the License, or (at your option) any later version.
#
#   This library is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public
#   License along with this library; if not, write to the
#      Free Software Foundation, Inc.,
#      59 Temple Place, Suite 330,
#      Boston, MA  02111-1307  USA
#
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
23 """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
26 >>> from keepalive import HTTPHandler
27 >>> keepalive_handler = HTTPHandler()
28 >>> opener = urllib2.build_opener(keepalive_handler)
29 >>> urllib2.install_opener(opener)
31 >>> fo = urllib2.urlopen('http://www.python.org')
33 If a connection to a given host is requested, and all of the existing
34 connections are still in use, another connection will be opened. If
35 the handler tries to use an existing connection but it fails in some
36 way, it will be closed and removed from the pool.
To remove the handler, simply re-run build_opener with no arguments, and
install that opener.
41 You can explicitly close connections by using the close_connection()
42 method of the returned file-like object (described below) or you can
43 use the handler methods:
  close_connection(host)
  close_all()
49 NOTE: using the close_connection and close_all methods of the handler
50 should be done with care when using multiple threads.
51 * there is nothing that prevents another thread from creating new
52 connections immediately after connections are closed
53 * no checks are done to prevent in-use connections from being closed
55 >>> keepalive_handler.close_all()
57 EXTRA ATTRIBUTES AND METHODS
59 Upon a status of 200, the object returned has a few additional
60 attributes and methods, which should not be used if you want to
61 remain consistent with the normal urllib2-returned objects:
63 close_connection() - close the connection to the host
64 readlines() - you know, readlines()
65 status - the return status (ie 404)
66 reason - english translation of status (ie 'File not found')
68 If you want the best of both worlds, use this inside an
69 AttributeError-catching try:
71 >>> try: status = fo.status
72 >>> except AttributeError: status = None
74 Unfortunately, these are ONLY there if status == 200, so it's not
75 easy to distinguish between non-200 responses. The reason is that
76 urllib2 tries to do clever things with error codes 301, 302, 401,
77 and 407, and it wraps the object upon return.
79 For python versions earlier than 2.4, you can avoid this fancy error
80 handling by setting the module-level global HANDLE_ERRORS to zero.
81 You see, prior to 2.4, it's the HTTP Handler's job to determine what
82 to handle specially, and what to just pass up. HANDLE_ERRORS == 0
83 means "pass everything up". In python 2.4, however, this job no
84 longer belongs to the HTTP Handler and is now done by a NEW handler,
85 HTTPErrorProcessor. Here's the bottom line:
  If you are using python 2.3 or earlier, HANDLE_ERRORS works as follows:

    HANDLE_ERRORS == 1  (default) pass up 200, treat the rest as
                        errors
    HANDLE_ERRORS == 0  pass everything up, error processing is
                        left to the calling code

  In python 2.4 and later:

    HANDLE_ERRORS == 1  pass up 200, treat the rest as errors
    HANDLE_ERRORS == 0  (default) pass everything up, let the
                        other handlers (specifically,
                        HTTPErrorProcessor) decide what to do
98 In practice, setting the variable either way makes little difference
99 in python 2.4, so for the most consistent behavior across versions,
100 you probably just want to use the defaults, which will give you
101 exceptions on errors.
# Prior to python 2.4 the HTTP handler itself must decide which statuses
# to treat as errors (HANDLE_ERRORS == 1).  From 2.4 on, urllib2's
# HTTPErrorProcessor owns that job, so by default pass everything up.
if sys.version_info < (2, 4): HANDLE_ERRORS = 1
else: HANDLE_ERRORS = 0
class ConnectionManager:
    """Thread-safe bookkeeping for the keepalive connection pool.

    The connection manager must be able to:
      * keep track of all existing connections
      * map connections to hosts and back
      * remember which connections are ready (idle) for re-use
    """
    def __init__(self):
        # threading.Lock() is the supported spelling of the deprecated
        # thread.allocate_lock(); it guards all three maps below.
        import threading
        self._lock = threading.Lock()
        self._hostmap = {}  # map hosts to a list of connections
        self._connmap = {}  # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """register <connection> for <host> with the given ready state"""
        self._lock.acquire()
        try:
            if host not in self._hostmap: self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """forget <connection> entirely; unknown connections are ignored"""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                # drop empty host entries so get_all() stays tidy
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """mark <connection> idle (true) or busy; ignore unknown ones"""
        try: self._readymap[connection] = ready
        except KeyError: pass

    def get_ready_conn(self, host):
        """return a ready connection to <host>, marking it busy, else None"""
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """return a copy of the connection list for <host>, or, with no
        host, a copy of the whole host -> connections map"""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
180 class KeepAliveHandler:
182 self._cm = ConnectionManager()
184 #### Connection Management
185 def open_connections(self):
186 """return a list of connected hosts and the number of connections
187 to each. [('foo.com:80', 2), ('bar.org', 1)]"""
188 return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
190 def close_connection(self, host):
191 """close connection(s) to <host>
192 host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
193 no error occurs if there is no connection to that host."""
194 for h in self._cm.get_all(host):
199 """close all open connections"""
200 for host, conns in self._cm.get_all().items():
205 def _request_closed(self, request, host, connection):
206 """tells us that this request is now closed and the the
207 connection is ready for another request"""
208 self._cm.set_ready(connection, 1)
210 def _remove_connection(self, host, connection, close=0):
211 if close: connection.close()
212 self._cm.remove(connection)
214 #### Transaction Execution
215 def do_open(self, req):
216 host = req.get_host()
218 raise urllib2.URLError('no host given')
221 h = self._cm.get_ready_conn(host)
223 r = self._reuse_connection(h, req, host)
225 # if this response is non-None, then it worked and we're
226 # done. Break out, skipping the else block.
229 # connection is bad - possibly closed by server
230 # discard it and ask for the next free connection
233 h = self._cm.get_ready_conn(host)
235 # no (working) free connections were found. Create a new one.
236 h = self._get_connection(host)
237 if DEBUG: DEBUG.info("creating new connection to %s (%d)",
239 self._cm.add(host, h, 0)
240 self._start_transaction(h, req)
242 except (socket.error, httplib.HTTPException), err:
243 raise urllib2.URLError(err)
245 # if not a persistent connection, don't try to reuse it
246 if r.will_close: self._cm.remove(h)
248 if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
251 r._url = req.get_full_url()
257 if r.status == 200 or not HANDLE_ERRORS:
260 return self.parent.error('http', req, r,
261 r.status, r.msg, r.headers)
263 def _reuse_connection(self, h, req, host):
264 """start the transaction with a re-used connection
265 return a response object (r) upon success or None on failure.
266 This DOES not close or remove bad connections in cases where
267 it returns. However, if an unexpected exception occurs, it
268 will close and remove the connection before re-raising.
271 self._start_transaction(h, req)
273 # note: just because we got something back doesn't mean it
274 # worked. We'll check the version below, too.
275 except (socket.error, httplib.HTTPException):
278 # adding this block just in case we've missed
279 # something we will still raise the exception, but
280 # lets try and close the connection and remove it
281 # first. We previously got into a nasty loop
282 # where an exception was uncaught, and so the
283 # connection stayed open. On the next try, the
284 # same exception was raised, etc. The tradeoff is
285 # that it's now possible this call will raise
286 # a DIFFERENT exception
287 if DEBUG: DEBUG.error("unexpected exception - closing " + \
288 "connection to %s (%d)", host, id(h))
293 if r is None or r.version == 9:
294 # httplib falls back to assuming HTTP 0.9 if it gets a
295 # bad header back. This is most likely to happen if
296 # the socket has been closed by the server since we
297 # last used the connection.
298 if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
302 if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
306 def _start_transaction(self, h, req):
309 data = req.get_data()
310 h.putrequest('POST', req.get_selector(), skip_accept_encoding=1)
311 if not req.headers.has_key('Content-type'):
312 h.putheader('Content-type',
313 'application/x-www-form-urlencoded')
314 if not req.headers.has_key('Content-length'):
315 h.putheader('Content-length', '%d' % len(data))
317 h.putrequest('GET', req.get_selector(), skip_accept_encoding=1)
318 except (socket.error, httplib.HTTPException), err:
319 raise urllib2.URLError(err)
321 for args in self.parent.addheaders:
323 for k, v in req.headers.items():
329 def _get_connection(self, host):
330 return NotImplementedError
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    """keepalive-aware drop-in replacement for urllib2.HTTPHandler"""
    def __init__(self):
        KeepAliveHandler.__init__(self)

    def http_open(self, req):
        return self.do_open(req)

    def _get_connection(self, host):
        return HTTPConnection(host)
class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler):
    """keepalive-aware drop-in replacement for urllib2.HTTPSHandler"""
    def __init__(self, ssl_factory=None):
        KeepAliveHandler.__init__(self)
        if not ssl_factory:
            # fall back to the module-level default SSL factory
            ssl_factory = sslfactory.get_factory()
        self._ssl_factory = ssl_factory

    def https_open(self, req):
        return self.do_open(req)

    def _get_connection(self, host):
        return self._ssl_factory.get_https_connection(host)
class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods
    #
    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.
    #
    # the read method wraps the original to accomodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py

    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        # `strict` is accepted for interface compatibility but unused
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''          # readline()/read() look-ahead buffer
        self._rbufsize = 8096
        self._handler = None     # inserted by the handler later
        self._host = None        # (same)
        self._url = None         # (same)
        self._connection = None  # (same)

    _raw_read = httplib.HTTPResponse.read

    def close(self):
        # close the file object and hand the (possibly reusable)
        # connection back to the handler's pool
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                self._handler._request_closed(self, self._host,
                                              self._connection)

    def close_connection(self):
        """hard-close the underlying connection (no keepalive re-use)"""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()

    def info(self):
        return self.headers

    def geturl(self):
        return self._url

    def read(self, amt=None):
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and amt is not None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    def readline(self, limit=-1):
        data = ""
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint=0):
        total = 0
        lines = []  # renamed from `list` to avoid shadowing the builtin
        while 1:
            line = self.readline()
            if not line: break
            lines.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return lines
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class so responses support readline(),
    # readlines(), close_connection(), info() and geturl()
    response_class = HTTPResponse
class HTTPSConnection(httplib.HTTPSConnection):
    # use the modified response class (same reason as HTTPConnection)
    response_class = HTTPResponse
#########################################################################
#####   TEST FUNCTIONS
#########################################################################
464 def error_handler(url):
467 keepalive_handler = HTTPHandler()
468 opener = urllib2.build_opener(keepalive_handler)
469 urllib2.install_opener(opener)
470 pos = {0: 'off', 1: 'on'}
472 print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
475 fo = urllib2.urlopen(url)
478 try: status, reason = fo.status, fo.reason
479 except AttributeError: status, reason = None, None
481 print " EXCEPTION: %s" % e
484 print " status = %s, reason = %s" % (status, reason)
486 hosts = keepalive_handler.open_connections()
487 print "open connections:", hosts
488 keepalive_handler.close_all()
494 # first fetch the file with the normal http handler
495 opener = urllib2.build_opener()
496 urllib2.install_opener(opener)
497 fo = urllib2.urlopen(url)
501 print format % ('normal urllib', m.hexdigest())
503 # now install the keepalive handler and try again
504 opener = urllib2.build_opener(HTTPHandler())
505 urllib2.install_opener(opener)
507 fo = urllib2.urlopen(url)
511 print format % ('keepalive read', m.hexdigest())
513 fo = urllib2.urlopen(url)
521 print format % ('keepalive readline', m.hexdigest())
524 print ' making %i connections to:\n %s' % (N, url)
526 sys.stdout.write(' first using the normal urllib handlers')
527 # first use normal opener
528 opener = urllib2.build_opener()
529 urllib2.install_opener(opener)
531 print ' TIME: %.3f s' % t1
533 sys.stdout.write(' now using the keepalive handler ')
534 # now install the keepalive handler and try again
535 opener = urllib2.build_opener(HTTPHandler())
536 urllib2.install_opener(opener)
538 print ' TIME: %.3f s' % t2
539 print ' improvement factor: %.2f' % (t1/t2, )
541 def fetch(N, url, delay=0):
544 starttime = time.time()
546 if delay and i > 0: time.sleep(delay)
547 fo = urllib2.urlopen(url)
550 lens.append(len(foo))
551 diff = time.time() - starttime
557 print "WARNING: inconsistent length on read %i: %i" % (j, i)
561 def test_timeout(url):
565 def debug(self, msg, *args): print msg % args
566 info = warning = error = debug
568 print " fetching the file to establish a connection"
569 fo = urllib2.urlopen(url)
574 print " waiting %i seconds for the server to close the connection" % i
576 sys.stdout.write('\r %2i' % i)
580 sys.stderr.write('\r')
582 print " fetching the file a second time"
583 fo = urllib2.urlopen(url)
588 print ' data are identical'
590 print ' ERROR: DATA DIFFER'
596 print "checking error hander (do this on a non-200)"
597 try: error_handler(url)
599 print "exiting - exception will prevent further tests"
602 print "performing continuity test (making sure stuff isn't corrupted)"
605 print "performing speed comparison"
608 print "performing dropped-connection check"
611 if __name__ == '__main__':
618 print "%s <integer> <url>" % sys.argv[0]