Update to latest HEAD.

- Fix parsing of FTP 213 responses
- Switch to max_connections=1 after timing out. BZ 853432
- max_connections=0 should imply the default limit.
epel9
Zdenek Pavlas 12 years ago
parent 4b9511117b
commit d2b26353b7
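
The FTP 213 fix deserves a word of context: FTP servers answer the SIZE command with "213 <size>", but the MDTM command also answers with code 213, carrying a 14-digit YYYYMMDDHHMMSS timestamp that must not be parsed as a file size. Below is a minimal standalone sketch of the guard this commit adds to the grabber.py hunk further down (the helper name is illustrative, not urlgrabber API):

def size_from_213(buf):
    # SIZE replies look like '213 4711'; MDTM replies like
    # '213 20130618094502' - a 14-digit timestamp, not a size.
    if not buf.startswith('213 '):
        return None
    s = buf[3:].strip()
    if len(s) >= 14:
        return None  # ignore MDTM responses
    return int(s)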

@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 28%{?dist}
+Release: 29%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
@@ -44,6 +44,12 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down
 %changelog
+* Tue Jun 18 2013 Zdenek Pavlas <zpavlas@redhat.com> - 3.9.1-29
+- Update to latest HEAD.
+- Fix parsing of FTP 213 responses
+- Switch to max_connections=1 after timing out. BZ 853432
+- max_connections=0 should imply the default limit.
+
 * Fri May 17 2013 Zdenek Pavlas <zpavlas@redhat.com> - 3.9.1-28
 - Update to latest HEAD.
 - Add the "minrate" option. BZ 964298

@@ -314,7 +314,7 @@ index 3e5f3b7..8eeaeda 100644
 return (fb,lb)
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..05ea9c3 100644
+index e090e90..6b409e3 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
 @@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
@@ -920,7 +920,7 @@ index e090e90..05ea9c3 100644
 if not self._prog_running:
 if self.opts.progress_obj:
 size = self.size + self._reget_length
-@@ -1079,23 +1274,40 @@ class PyCurlFileObject():
+@@ -1079,32 +1274,62 @@ class PyCurlFileObject():
 self.opts.progress_obj.update(self._amount_read)
 self._amount_read += len(buf)
@@ -967,7 +967,10 @@ index e090e90..05ea9c3 100644
 elif self.scheme in ['ftp']:
 s = None
 if buf.startswith('213 '):
-@@ -1104,7 +1316,18 @@ class PyCurlFileObject():
+ s = buf[3:].strip()
++ if len(s) >= 14:
++ s = None # ignore MDTM responses
+ elif buf.startswith('150 '):
 s = parse150(buf)
 if s:
 self.size = int(s)
@@ -987,7 +990,7 @@ index e090e90..05ea9c3 100644
 return len(buf)
 except KeyboardInterrupt:
 return pycurl.READFUNC_ABORT
-@@ -1113,8 +1336,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1338,10 @@ class PyCurlFileObject():
 if self._parsed_hdr:
 return self._parsed_hdr
 statusend = self._hdr_dump.find('\n')
@@ -998,7 +1001,7 @@ index e090e90..05ea9c3 100644
 self._parsed_hdr = mimetools.Message(hdrfp)
 return self._parsed_hdr
-@@ -1127,6 +1352,9 @@ class PyCurlFileObject():
+@@ -1127,6 +1354,9 @@ class PyCurlFileObject():
 if not opts:
 opts = self.opts
@@ -1008,7 +1011,7 @@ index e090e90..05ea9c3 100644
 # defaults we're always going to set
 self.curl_obj.setopt(pycurl.NOPROGRESS, False)
-@@ -1136,11 +1364,21 @@ class PyCurlFileObject():
+@@ -1136,11 +1366,21 @@ class PyCurlFileObject():
 self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
 self.curl_obj.setopt(pycurl.FAILONERROR, True)
 self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -1031,7 +1034,7 @@ index e090e90..05ea9c3 100644
 # maybe to be options later
 self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1386,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1388,11 @@ class PyCurlFileObject():
 # timeouts
 timeout = 300
@@ -1046,7 +1049,7 @@ index e090e90..05ea9c3 100644
 # ssl options
 if self.scheme == 'https':
-@@ -1158,13 +1398,16 @@ class PyCurlFileObject():
+@@ -1158,13 +1400,16 @@ class PyCurlFileObject():
 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
 self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
 self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -1064,7 +1067,7 @@ index e090e90..05ea9c3 100644
 if opts.ssl_cert_type:
 self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
 if opts.ssl_key_pass:
-@@ -1187,28 +1430,26 @@ class PyCurlFileObject():
+@@ -1187,28 +1432,26 @@ class PyCurlFileObject():
 if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
 self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
@@ -1109,7 +1112,7 @@ index e090e90..05ea9c3 100644
 # our url
 self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1228,39 +1469,26 @@ class PyCurlFileObject():
+@@ -1228,39 +1471,26 @@ class PyCurlFileObject():
 code = self.http_code
 errcode = e.args[0]
@@ -1155,7 +1158,7 @@ index e090e90..05ea9c3 100644
 # this is probably wrong but ultimately this is what happens
 # we have a legit http code and a pycurl 'writer failed' code
 # which almost always means something aborted it from outside
-@@ -1269,36 +1497,70 @@ class PyCurlFileObject():
+@@ -1269,36 +1499,70 @@ class PyCurlFileObject():
 # figure out what aborted the pycurl process FIXME
 raise KeyboardInterrupt
@@ -1251,7 +1254,7 @@ index e090e90..05ea9c3 100644
 def _do_open(self):
 self.curl_obj = _curl_cache
-@@ -1333,7 +1595,11 @@ class PyCurlFileObject():
+@@ -1333,7 +1597,11 @@ class PyCurlFileObject():
 if self.opts.range:
 rt = self.opts.range
@@ -1264,7 +1267,7 @@ index e090e90..05ea9c3 100644
 if rt:
 header = range_tuple_to_header(rt)
-@@ -1434,21 +1700,46 @@ class PyCurlFileObject():
+@@ -1434,21 +1702,46 @@ class PyCurlFileObject():
 #fh, self._temp_name = mkstemp()
 #self.fo = open(self._temp_name, 'wb')
@@ -1318,7 +1321,7 @@ index e090e90..05ea9c3 100644
 else:
 #self.fo = open(self._temp_name, 'r')
 self.fo.seek(0)
-@@ -1526,17 +1817,20 @@ class PyCurlFileObject():
+@@ -1526,17 +1819,20 @@ class PyCurlFileObject():
 if self._prog_running:
 downloaded += self._reget_length
 self.opts.progress_obj.update(downloaded)
@@ -1344,7 +1347,7 @@ index e090e90..05ea9c3 100644
 msg = _("Downloaded more than max size for %s: %s > %s") \
 % (self.url, cur, max_size)
-@@ -1544,13 +1838,6 @@ class PyCurlFileObject():
+@@ -1544,13 +1840,6 @@ class PyCurlFileObject():
 return True
 return False
@@ -1358,7 +1361,7 @@ index e090e90..05ea9c3 100644
 def read(self, amt=None):
 self._fill_buffer(amt)
 if amt is None:
-@@ -1582,9 +1869,21 @@ class PyCurlFileObject():
+@@ -1582,9 +1871,21 @@ class PyCurlFileObject():
 self.opts.progress_obj.end(self._amount_read)
 self.fo.close()
@@ -1381,7 +1384,7 @@ index e090e90..05ea9c3 100644
 #####################################################################
 # DEPRECATED FUNCTIONS
-@@ -1621,6 +1920,482 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+@@ -1621,6 +1922,489 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
 #####################################################################
@@ -1614,6 +1617,7 @@ index e090e90..05ea9c3 100644
 +
 + dl = _ExternalDownloaderPool()
 + host_con = {} # current host connection counts
++ single = set() # hosts in single connection mode
 +
 + def start(opts, tries):
 + opts.tries = tries
@@ -1660,6 +1664,10 @@ index e090e90..05ea9c3 100644
 +
 + if ug_err is None:
 + continue
++ if ug_err.errno == pycurl.E_OPERATION_TIMEOUTED:
++ # One possible cause is connection-limited server.
++ # Turn on the max_connections=1 override. BZ 853432
++ single.add(key)
 +
 + retry = opts.retry or 0
 + if opts.failure_callback:
@@ -1749,7 +1757,7 @@ index e090e90..05ea9c3 100644
 +
 + # update the current mirror and limit
 + key = best['mirror']
-+ limit = best.get('kwargs', {}).get('max_connections', 2)
++ limit = best.get('kwargs', {}).get('max_connections') or 2
 + opts.async = key, limit
 +
 + # update URL and proxy
@@ -1760,6 +1768,8 @@ index e090e90..05ea9c3 100644
 +
 + # check host limit, then start
 + key, limit = opts.async
++ if key in single:
++ limit = 1
 + while host_con.get(key, 0) >= limit:
 + perform()
 + if DEBUG:
@@ -1865,20 +1875,21 @@ index e090e90..05ea9c3 100644
 def _main_test():
 try: url, filename = sys.argv[1:3]
 diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
-index dad410b..7975f1b 100644
+index dad410b..5d3aa34 100644
 --- a/urlgrabber/mirror.py
 +++ b/urlgrabber/mirror.py
-@@ -76,6 +76,9 @@ CUSTOMIZATION
+@@ -76,6 +76,10 @@ CUSTOMIZATION
 'grabber' is omitted, the default grabber will be used. If
 kwargs are omitted, then (duh) they will not be used.
 + kwarg 'max_connections' limits the number of concurrent
-+ connections to this mirror.
++ connections to this mirror. When omitted or set to zero,
++ the default limit (2) will be used.
 +
 3) Pass keyword arguments when instantiating the mirror group.
 See, for example, the failure_callback argument.
-@@ -87,10 +90,14 @@ CUSTOMIZATION
+@@ -87,10 +91,14 @@ CUSTOMIZATION
 """
@@ -1894,7 +1905,7 @@ index dad410b..7975f1b 100644
 def _(st):
 return st
-@@ -126,7 +133,9 @@ class MirrorGroup:
+@@ -126,7 +134,9 @@ class MirrorGroup:
 files)
 * if the local list is ever exhausted, a URLGrabError will be
@@ -1905,7 +1916,7 @@ index dad410b..7975f1b 100644
 OPTIONS
-@@ -153,7 +162,8 @@ class MirrorGroup:
+@@ -153,7 +163,8 @@ class MirrorGroup:
 The 'fail' option will cause immediate failure by re-raising
 the exception and no further attempts to get the current
@@ -1915,7 +1926,7 @@ index dad410b..7975f1b 100644
 This dict can be set at instantiation time,
 mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
-@@ -184,6 +194,7 @@ class MirrorGroup:
+@@ -184,6 +195,7 @@ class MirrorGroup:
 obj.exception = < exception that was raised >
 obj.mirror = < the mirror that was tried >
@@ -1923,7 +1934,7 @@ index dad410b..7975f1b 100644
 obj.relative_url = < url relative to the mirror >
 obj.url = < full url that failed >
 # .url is just the combination of .mirror
-@@ -251,6 +262,17 @@ class MirrorGroup:
+@@ -251,6 +263,17 @@ class MirrorGroup:
 self.default_action = None
 self._process_kwargs(kwargs)
@@ -1941,7 +1952,7 @@ index dad410b..7975f1b 100644
 # if these values are found in **kwargs passed to one of the urlXXX
 # methods, they will be stripped before getting passed on to the
 # grabber
-@@ -263,7 +285,8 @@ class MirrorGroup:
+@@ -263,7 +286,8 @@ class MirrorGroup:
 def _parse_mirrors(self, mirrors):
 parsed_mirrors = []
 for m in mirrors:
@@ -1951,7 +1962,7 @@ index dad410b..7975f1b 100644
 parsed_mirrors.append(m)
 return parsed_mirrors
-@@ -280,7 +303,9 @@ class MirrorGroup:
+@@ -280,7 +304,9 @@ class MirrorGroup:
 # return a random mirror so that multiple mirrors get used
 # even without failures.
 if not gr.mirrors:
@@ -1962,7 +1973,7 @@ index dad410b..7975f1b 100644
 return gr.mirrors[gr._next]
 def _failure(self, gr, cb_obj):
-@@ -307,7 +332,9 @@ class MirrorGroup:
+@@ -307,7 +333,9 @@ class MirrorGroup:
 a.update(action)
 action = a
 self.increment_mirror(gr, action)
@@ -1973,7 +1984,7 @@ index dad410b..7975f1b 100644
 def increment_mirror(self, gr, action={}):
 """Tell the mirror object increment the mirror index
-@@ -377,35 +404,50 @@ class MirrorGroup:
+@@ -377,35 +405,50 @@ class MirrorGroup:
 gr.url = url
 gr.kw = dict(kw)
 self._load_gr(gr)
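
Taken together, the grabber.py changes above reduce the per-mirror connection limit to two rules: max_connections=0 (or an omitted kwarg) falls back to the default of 2, and any host that hit a timeout is forced into single-connection mode. A minimal sketch under those assumptions (function and variable names here are hypothetical, not urlgrabber API):

def effective_limit(mirror_kwargs, host, single):
    # 'single' is the set of hosts that previously timed out (BZ 853432)
    limit = mirror_kwargs.get('max_connections') or 2  # 0/None -> default of 2
    if host in single:
        limit = 1  # timed-out hosts fall back to one connection
    return limit

# e.g. a MirrorGroup-style mirror entry with an explicit limit of zero:
mirror = {'mirror': 'http://example.com/pub/', 'kwargs': {'max_connections': 0}}
print(effective_limit(mirror['kwargs'], mirror['mirror'], set()))  # -> 2

Note the 'or 2' instead of .get('max_connections', 2): that is precisely what makes a stored zero behave like the default limit, the third bullet of this commit.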
