@@ -236,7 +236,7 @@ index 3e5f3b7..8eeaeda 100644
return (fb,lb)

diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..74a692c 100644
index e090e90..78c2e59 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
@@ -458,7 +458,26 @@ index e090e90..74a692c 100644
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -527,6 +608,29 @@ def _(st):
@@ -504,6 +585,7 @@ def _init_default_logger(logspec=None):
else: handler = logging.FileHandler(filename)
handler.setFormatter(formatter)
DBOBJ = logging.getLogger('urlgrabber')
+ DBOBJ.propagate = False
DBOBJ.addHandler(handler)
DBOBJ.setLevel(level)
except (KeyError, ImportError, ValueError):
@@ -512,8 +594,8 @@ def _init_default_logger(logspec=None):

def _log_package_state():
if not DEBUG: return
- DEBUG.info('urlgrabber version = %s' % __version__)
- DEBUG.info('trans function "_" = %s' % _)
+ DEBUG.debug('urlgrabber version = %s' % __version__)
+ DEBUG.debug('trans function "_" = %s' % _)

_init_default_logger()
_log_package_state()
@@ -527,6 +609,29 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################

@@ -488,7 +507,7 @@ index e090e90..74a692c 100644


class URLGrabError(IOError):
@@ -662,6 +766,7 @@ class URLParser:
@@ -662,6 +767,7 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
@@ -496,7 +515,7 @@ index e090e90..74a692c 100644
quote = opts.quote

if opts.prefix:
@@ -768,6 +873,41 @@ class URLGrabberOptions:
@@ -768,6 +874,41 @@ class URLGrabberOptions:
else: # throttle is a float
return self.bandwidth * self.throttle

@@ -538,7 +557,7 @@ index e090e90..74a692c 100644
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
@@ -791,30 +931,37 @@ class URLGrabberOptions:
@@ -791,30 +932,37 @@ class URLGrabberOptions:
provided here.
"""
self.progress_obj = None
@@ -577,7 +596,7 @@ index e090e90..74a692c 100644
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticityb
@@ -827,6 +974,12 @@ class URLGrabberOptions:
@@ -827,6 +975,12 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
@@ -590,7 +609,7 @@ index e090e90..74a692c 100644

def __repr__(self):
return self.format()
@@ -846,7 +999,18 @@ class URLGrabberOptions:
@@ -846,7 +1000,18 @@ class URLGrabberOptions:
s = s + indent + '}'
return s

@@ -610,7 +629,7 @@ index e090e90..74a692c 100644
"""Provides easy opening of URLs with a variety of options.

All options are specified as kwargs. Options may be specified when
@@ -872,7 +1036,6 @@ class URLGrabber:
@@ -872,7 +1037,6 @@ class URLGrabber:
# beware of infinite loops :)
tries = tries + 1
exception = None
@@ -618,7 +637,7 @@ index e090e90..74a692c 100644
callback = None
if DEBUG: DEBUG.info('attempt %i/%s: %s',
tries, opts.retry, args[0])
@@ -883,54 +1046,62 @@ class URLGrabber:
@@ -883,54 +1047,62 @@ class URLGrabber:
except URLGrabError, e:
exception = e
callback = opts.failure_callback
@@ -688,7 +707,7 @@ index e090e90..74a692c 100644
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
@@ -950,41 +1121,51 @@ class URLGrabber:
@@ -950,41 +1122,51 @@ class URLGrabber:

elif not opts.range:
if not opts.checkfunc is None:
@@ -755,7 +774,7 @@ index e090e90..74a692c 100644
if limit is not None:
limit = limit + 1

@@ -1000,12 +1181,8 @@ class URLGrabber:
@@ -1000,12 +1182,8 @@ class URLGrabber:
else: s = fo.read(limit)

if not opts.checkfunc is None:
@@ -770,7 +789,7 @@ index e090e90..74a692c 100644
finally:
fo.close()
return s
@@ -1020,6 +1197,7 @@ class URLGrabber:
@@ -1020,6 +1198,7 @@ class URLGrabber:
return s

def _make_callback(self, callback_obj):
@@ -778,7 +797,7 @@ index e090e90..74a692c 100644
if callable(callback_obj):
return callback_obj, (), {}
else:
@@ -1030,7 +1208,7 @@ class URLGrabber:
@@ -1030,7 +1209,7 @@ class URLGrabber:
default_grabber = URLGrabber()


@@ -787,7 +806,7 @@ index e090e90..74a692c 100644
def __init__(self, url, filename, opts):
self.fo = None
self._hdr_dump = ''
@@ -1052,10 +1230,13 @@ class PyCurlFileObject():
@@ -1052,10 +1231,13 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
@@ -803,7 +822,7 @@ index e090e90..74a692c 100644
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
@@ -1067,6 +1248,12 @@ class PyCurlFileObject():
@@ -1067,6 +1249,12 @@ class PyCurlFileObject():

def _retrieve(self, buf):
try:
@@ -816,7 +835,7 @@ index e090e90..74a692c 100644
if not self._prog_running:
if self.opts.progress_obj:
size = self.size + self._reget_length
@@ -1079,15 +1266,24 @@ class PyCurlFileObject():
@@ -1079,15 +1267,24 @@ class PyCurlFileObject():
self.opts.progress_obj.update(self._amount_read)

self._amount_read += len(buf)
@@ -843,7 +862,7 @@ index e090e90..74a692c 100644
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
@@ -1104,7 +1300,17 @@ class PyCurlFileObject():
@@ -1104,7 +1301,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
@@ -857,12 +876,12 @@ index e090e90..74a692c 100644
+
+ if len(self._hdr_dump) != 0 and buf == '\r\n':
+ self._hdr_ended = True
+ if DEBUG: DEBUG.info('header ended:')
+ if DEBUG: DEBUG.debug('header ended:')
+
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1113,8 +1319,10 @@ class PyCurlFileObject():
@@ -1113,8 +1320,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
@@ -873,7 +892,7 @@ index e090e90..74a692c 100644
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr

@@ -1127,6 +1335,9 @@ class PyCurlFileObject():
@@ -1127,6 +1336,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts

@@ -883,13 +902,14 @@ index e090e90..74a692c 100644

# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1347,21 @@ class PyCurlFileObject():
@@ -1136,11 +1348,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)

if DEBUG:
- if DEBUG:
+ if DEBUG and DEBUG.level <= 10:
self.curl_obj.setopt(pycurl.VERBOSE, True)
if opts.user_agent:
self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
@@ -905,7 +925,7 @@ index e090e90..74a692c 100644

# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1369,11 @@ class PyCurlFileObject():
@@ -1148,9 +1370,11 @@ class PyCurlFileObject():

# timeouts
timeout = 300
@@ -920,7 +940,7 @@ index e090e90..74a692c 100644

# ssl options
if self.scheme == 'https':
@@ -1158,13 +1381,16 @@ class PyCurlFileObject():
@@ -1158,13 +1382,16 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -938,7 +958,7 @@ index e090e90..74a692c 100644
if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
@@ -1187,28 +1413,26 @@ class PyCurlFileObject():
@@ -1187,28 +1414,26 @@ class PyCurlFileObject():
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))

@@ -983,7 +1003,7 @@ index e090e90..74a692c 100644

# our url
self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,39 +1452,36 @@ class PyCurlFileObject():
@@ -1228,39 +1453,36 @@ class PyCurlFileObject():

code = self.http_code
errcode = e.args[0]
@@ -1032,7 +1052,7 @@ index e090e90..74a692c 100644
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
@@ -1272,33 +1493,94 @@ class PyCurlFileObject():
@@ -1272,33 +1494,94 @@ class PyCurlFileObject():
elif errcode == 58:
msg = _("problem with the local client certificate")
err = URLGrabError(14, msg)
@@ -1135,7 +1155,7 @@ index e090e90..74a692c 100644

def _do_open(self):
self.curl_obj = _curl_cache
@@ -1333,7 +1615,11 @@ class PyCurlFileObject():
@@ -1333,7 +1616,11 @@ class PyCurlFileObject():

if self.opts.range:
rt = self.opts.range
@@ -1148,7 +1168,7 @@ index e090e90..74a692c 100644

if rt:
header = range_tuple_to_header(rt)
@@ -1434,21 +1720,46 @@ class PyCurlFileObject():
@@ -1434,21 +1721,46 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')

@@ -1202,7 +1222,7 @@ index e090e90..74a692c 100644
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
@@ -1526,17 +1837,20 @@ class PyCurlFileObject():
@@ -1526,17 +1838,20 @@ class PyCurlFileObject():
if self._prog_running:
downloaded += self._reget_length
self.opts.progress_obj.update(downloaded)
@@ -1228,7 +1248,7 @@ index e090e90..74a692c 100644

msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
@@ -1544,13 +1858,6 @@ class PyCurlFileObject():
@@ -1544,13 +1859,6 @@ class PyCurlFileObject():
return True
return False

@@ -1242,7 +1262,7 @@ index e090e90..74a692c 100644
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
@@ -1582,9 +1889,21 @@ class PyCurlFileObject():
@@ -1582,9 +1890,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()

@@ -1265,7 +1285,7 @@ index e090e90..74a692c 100644

#####################################################################
# DEPRECATED FUNCTIONS
@@ -1621,6 +1940,467 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
@@ -1621,6 +1941,478 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,


#####################################################################
@@ -1498,17 +1518,23 @@ index e090e90..74a692c 100644
+ host_con = {} # current host connection counts
+
+ def start(opts, tries):
+ opts.tries = tries
+ try:
+ dl.start(opts)
+ except OSError, e:
+ # can't spawn downloader, give up immediately
+ opts.exception = URLGrabError(5, exception2msg(e))
+ _run_callback(opts.failfunc, opts)
+ return
+
+ key, limit = opts.async
+ host_con[key] = host_con.get(key, 0) + 1
+ opts.tries = tries
+ if opts.progress_obj:
+ if opts.multi_progress_obj:
+ opts._progress = opts.multi_progress_obj.newMeter()
+ opts._progress.start(text=opts.text)
+ else:
+ opts._progress = time.time() # no updates
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+ dl.start(opts)
+
+ def perform():
+ for opts, size, ug_err in dl.perform():
@@ -1588,6 +1614,8 @@ index e090e90..74a692c 100644
+ # check global limit
+ while len(dl.running) >= default_grabber.opts.max_connections:
+ perform()
+ if DEBUG:
+ DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)
+
+ if opts.mirror_group:
+ mg, errors, failed, removed = opts.mirror_group
@@ -1636,6 +1664,9 @@ index e090e90..74a692c 100644
+ key, limit = opts.async
+ while host_con.get(key, 0) >= limit:
+ perform()
+ if DEBUG:
+ DEBUG.info('max_connections(%s): %d/%d', key, host_con.get(key, 0), limit)
+
+ start(opts, 1)
+ except IOError, e:
+ if e.errno != 4: raise