Update to latest HEAD.

- Fix parsing of FTP 213 responses
- Switch to max_connections=1 after timing out. BZ 853432
- max_connections=0 should imply the default limit.
epel9
Zdenek Pavlas 12 years ago
parent 4b9511117b
commit d2b26353b7
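
The FTP 213 fix deserves a word of context: FTP servers answer the SIZE command with "213 <size>", but the MDTM command also answers with code 213, carrying a 14-digit YYYYMMDDHHMMSS timestamp that must not be parsed as a file size. Below is a minimal standalone sketch of the guard this commit adds to the grabber.py hunk further down (the helper name is illustrative, not urlgrabber API):

def size_from_213(buf):
    # SIZE replies look like '213 4711'; MDTM replies like
    # '213 20130618094502' - a 14-digit timestamp, not a size.
    if not buf.startswith('213 '):
        return None
    s = buf[3:].strip()
    if len(s) >= 14:
        return None  # ignore MDTM responses
    return int(s)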

@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 28%{?dist}
+Release: 29%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
@@ -44,6 +44,12 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down
 %changelog
+* Tue Jun 18 2013 Zdenek Pavlas <zpavlas@redhat.com> - 3.9.1-29
+- Update to latest HEAD.
+- Fix parsing of FTP 213 responses
+- Switch to max_connections=1 after timing out. BZ 853432
+- max_connections=0 should imply the default limit.
+
 * Fri May 17 2013 Zdenek Pavlas <zpavlas@redhat.com> - 3.9.1-28
 - Update to latest HEAD.
 - Add the "minrate" option. BZ 964298

@@ -314,7 +314,7 @@ index 3e5f3b7..8eeaeda 100644
 return (fb,lb)
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..05ea9c3 100644
+index e090e90..6b409e3 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
 @@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
@@ -920,7 +920,7 @@ index e090e90..05ea9c3 100644
 if not self._prog_running:
 if self.opts.progress_obj:
 size = self.size + self._reget_length
-@@ -1079,23 +1274,40 @@ class PyCurlFileObject():
+@@ -1079,32 +1274,62 @@ class PyCurlFileObject():
 self.opts.progress_obj.update(self._amount_read)
 self._amount_read += len(buf)
@@ -967,7 +967,10 @@ index e090e90..05ea9c3 100644
 elif self.scheme in ['ftp']:
 s = None
 if buf.startswith('213 '):
-@@ -1104,7 +1316,18 @@ class PyCurlFileObject():
+ s = buf[3:].strip()
++ if len(s) >= 14:
++ s = None # ignore MDTM responses
+ elif buf.startswith('150 '):
 s = parse150(buf)
 if s:
 self.size = int(s)
@@ -987,7 +990,7 @@ index e090e90..05ea9c3 100644
 return len(buf)
 except KeyboardInterrupt:
 return pycurl.READFUNC_ABORT
-@@ -1113,8 +1336,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1338,10 @@ class PyCurlFileObject():
 if self._parsed_hdr:
 return self._parsed_hdr
 statusend = self._hdr_dump.find('\n')
@@ -998,7 +1001,7 @@ index e090e90..05ea9c3 100644
 self._parsed_hdr = mimetools.Message(hdrfp)
 return self._parsed_hdr
-@@ -1127,6 +1352,9 @@ class PyCurlFileObject():
+@@ -1127,6 +1354,9 @@ class PyCurlFileObject():
 if not opts:
 opts = self.opts
@@ -1008,7 +1011,7 @@ index e090e90..05ea9c3 100644
 # defaults we're always going to set
 self.curl_obj.setopt(pycurl.NOPROGRESS, False)
-@@ -1136,11 +1364,21 @@ class PyCurlFileObject():
+@@ -1136,11 +1366,21 @@ class PyCurlFileObject():
 self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
 self.curl_obj.setopt(pycurl.FAILONERROR, True)
 self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -1031,7 +1034,7 @@ index e090e90..05ea9c3 100644
 # maybe to be options later
 self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1386,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1388,11 @@ class PyCurlFileObject():
 # timeouts
 timeout = 300
@@ -1046,7 +1049,7 @@ index e090e90..05ea9c3 100644
 # ssl options
 if self.scheme == 'https':
-@@ -1158,13 +1398,16 @@ class PyCurlFileObject():
+@@ -1158,13 +1400,16 @@ class PyCurlFileObject():
 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
 self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
 self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -1064,7 +1067,7 @@ index e090e90..05ea9c3 100644
 if opts.ssl_cert_type:
 self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
 if opts.ssl_key_pass:
-@@ -1187,28 +1430,26 @@ class PyCurlFileObject():
+@@ -1187,28 +1432,26 @@ class PyCurlFileObject():
 if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
 self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
@@ -1109,7 +1112,7 @@ index e090e90..05ea9c3 100644
 # our url
 self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1228,39 +1469,26 @@ class PyCurlFileObject():
+@@ -1228,39 +1471,26 @@ class PyCurlFileObject():
 code = self.http_code
 errcode = e.args[0]
@@ -1155,7 +1158,7 @@ index e090e90..05ea9c3 100644
 # this is probably wrong but ultimately this is what happens
 # we have a legit http code and a pycurl 'writer failed' code
 # which almost always means something aborted it from outside
-@@ -1269,36 +1497,70 @@ class PyCurlFileObject():
+@@ -1269,36 +1499,70 @@ class PyCurlFileObject():
 # figure out what aborted the pycurl process FIXME
 raise KeyboardInterrupt
@@ -1251,7 +1254,7 @@ index e090e90..05ea9c3 100644
 def _do_open(self):
 self.curl_obj = _curl_cache
-@@ -1333,7 +1595,11 @@ class PyCurlFileObject():
+@@ -1333,7 +1597,11 @@ class PyCurlFileObject():
 if self.opts.range:
 rt = self.opts.range
@@ -1264,7 +1267,7 @@ index e090e90..05ea9c3 100644
 if rt:
 header = range_tuple_to_header(rt)
-@@ -1434,21 +1700,46 @@ class PyCurlFileObject():
+@@ -1434,21 +1702,46 @@ class PyCurlFileObject():
 #fh, self._temp_name = mkstemp()
 #self.fo = open(self._temp_name, 'wb')
@@ -1318,7 +1321,7 @@ index e090e90..05ea9c3 100644
 else:
 #self.fo = open(self._temp_name, 'r')
 self.fo.seek(0)
-@@ -1526,17 +1817,20 @@ class PyCurlFileObject():
+@@ -1526,17 +1819,20 @@ class PyCurlFileObject():
 if self._prog_running:
 downloaded += self._reget_length
 self.opts.progress_obj.update(downloaded)
@@ -1344,7 +1347,7 @@ index e090e90..05ea9c3 100644
 msg = _("Downloaded more than max size for %s: %s > %s") \
 % (self.url, cur, max_size)
-@@ -1544,13 +1838,6 @@ class PyCurlFileObject():
+@@ -1544,13 +1840,6 @@ class PyCurlFileObject():
 return True
 return False
@@ -1358,7 +1361,7 @@ index e090e90..05ea9c3 100644
 def read(self, amt=None):
 self._fill_buffer(amt)
 if amt is None:
-@@ -1582,9 +1869,21 @@ class PyCurlFileObject():
+@@ -1582,9 +1871,21 @@ class PyCurlFileObject():
 self.opts.progress_obj.end(self._amount_read)
 self.fo.close()
@@ -1381,7 +1384,7 @@ index e090e90..05ea9c3 100644
 #####################################################################
 # DEPRECATED FUNCTIONS
-@@ -1621,6 +1920,482 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+@@ -1621,6 +1922,489 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
 #####################################################################
@@ -1614,6 +1617,7 @@ index e090e90..05ea9c3 100644
 +
 + dl = _ExternalDownloaderPool()
 + host_con = {} # current host connection counts
++ single = set() # hosts in single connection mode
 +
 + def start(opts, tries):
 + opts.tries = tries
@@ -1660,6 +1664,10 @@ index e090e90..05ea9c3 100644
 +
 + if ug_err is None:
 + continue
++ if ug_err.errno == pycurl.E_OPERATION_TIMEOUTED:
++ # One possible cause is connection-limited server.
++ # Turn on the max_connections=1 override. BZ 853432
++ single.add(key)
 +
 + retry = opts.retry or 0
 + if opts.failure_callback:
@@ -1749,7 +1757,7 @@ index e090e90..05ea9c3 100644
 +
 + # update the current mirror and limit
 + key = best['mirror']
-+ limit = best.get('kwargs', {}).get('max_connections', 2)
++ limit = best.get('kwargs', {}).get('max_connections') or 2
 + opts.async = key, limit
 +
 + # update URL and proxy
@@ -1760,6 +1768,8 @@ index e090e90..05ea9c3 100644
 +
 + # check host limit, then start
 + key, limit = opts.async
++ if key in single:
++ limit = 1
 + while host_con.get(key, 0) >= limit:
 + perform()
 + if DEBUG:
@@ -1865,20 +1875,21 @@ index e090e90..05ea9c3 100644
 def _main_test():
 try: url, filename = sys.argv[1:3]
 diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
-index dad410b..7975f1b 100644
+index dad410b..5d3aa34 100644
 --- a/urlgrabber/mirror.py
 +++ b/urlgrabber/mirror.py
-@@ -76,6 +76,9 @@ CUSTOMIZATION
+@@ -76,6 +76,10 @@ CUSTOMIZATION
 'grabber' is omitted, the default grabber will be used. If
 kwargs are omitted, then (duh) they will not be used.
 + kwarg 'max_connections' limits the number of concurrent
-+ connections to this mirror.
++ connections to this mirror. When omitted or set to zero,
++ the default limit (2) will be used.
 +
 3) Pass keyword arguments when instantiating the mirror group.
 See, for example, the failure_callback argument.
-@@ -87,10 +90,14 @@ CUSTOMIZATION
+@@ -87,10 +91,14 @@ CUSTOMIZATION
 """
@@ -1894,7 +1905,7 @@ index dad410b..7975f1b 100644
 def _(st):
 return st
-@@ -126,7 +133,9 @@ class MirrorGroup:
+@@ -126,7 +134,9 @@ class MirrorGroup:
 files)
 * if the local list is ever exhausted, a URLGrabError will be
@@ -1905,7 +1916,7 @@ index dad410b..7975f1b 100644
 OPTIONS
-@@ -153,7 +162,8 @@ class MirrorGroup:
+@@ -153,7 +163,8 @@ class MirrorGroup:
 The 'fail' option will cause immediate failure by re-raising
 the exception and no further attempts to get the current
@@ -1915,7 +1926,7 @@ index dad410b..7975f1b 100644
 This dict can be set at instantiation time,
 mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
-@@ -184,6 +194,7 @@ class MirrorGroup:
+@@ -184,6 +195,7 @@ class MirrorGroup:
 obj.exception = < exception that was raised >
 obj.mirror = < the mirror that was tried >
@@ -1923,7 +1934,7 @@ index dad410b..7975f1b 100644
 obj.relative_url = < url relative to the mirror >
 obj.url = < full url that failed >
 # .url is just the combination of .mirror
-@@ -251,6 +262,17 @@ class MirrorGroup:
+@@ -251,6 +263,17 @@ class MirrorGroup:
 self.default_action = None
 self._process_kwargs(kwargs)
@@ -1941,7 +1952,7 @@ index dad410b..7975f1b 100644
 # if these values are found in **kwargs passed to one of the urlXXX
 # methods, they will be stripped before getting passed on to the
 # grabber
-@@ -263,7 +285,8 @@ class MirrorGroup:
+@@ -263,7 +286,8 @@ class MirrorGroup:
 def _parse_mirrors(self, mirrors):
 parsed_mirrors = []
 for m in mirrors:
@@ -1951,7 +1962,7 @@ index dad410b..7975f1b 100644
 parsed_mirrors.append(m)
 return parsed_mirrors
-@@ -280,7 +303,9 @@ class MirrorGroup:
+@@ -280,7 +304,9 @@ class MirrorGroup:
 # return a random mirror so that multiple mirrors get used
 # even without failures.
 if not gr.mirrors:
@@ -1962,7 +1973,7 @@ index dad410b..7975f1b 100644
 return gr.mirrors[gr._next]
 def _failure(self, gr, cb_obj):
-@@ -307,7 +332,9 @@ class MirrorGroup:
+@@ -307,7 +333,9 @@ class MirrorGroup:
 a.update(action)
 action = a
 self.increment_mirror(gr, action)
@@ -1973,7 +1984,7 @@ index dad410b..7975f1b 100644
 def increment_mirror(self, gr, action={}):
 """Tell the mirror object increment the mirror index
-@@ -377,35 +404,50 @@ class MirrorGroup:
+@@ -377,35 +405,50 @@ class MirrorGroup:
 gr.url = url
 gr.kw = dict(kw)
 self._load_gr(gr)
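
Taken together, the grabber.py changes above reduce the per-mirror connection limit to two rules: max_connections=0 (or an omitted kwarg) falls back to the default of 2, and any host that hit a timeout is forced into single-connection mode. A minimal sketch under those assumptions (function and variable names here are hypothetical, not urlgrabber API):

def effective_limit(mirror_kwargs, host, single):
    # 'single' is the set of hosts that previously timed out (BZ 853432)
    limit = mirror_kwargs.get('max_connections') or 2  # 0/None -> default of 2
    if host in single:
        limit = 1  # timed-out hosts fall back to one connection
    return limit

# e.g. a MirrorGroup-style mirror entry with an explicit limit of zero:
mirror = {'mirror': 'http://example.com/pub/', 'kwargs': {'max_connections': 0}}
print(effective_limit(mirror['kwargs'], mirror['mirror'], set()))  # -> 2

Note the 'or 2' instead of .get('max_connections', 2): that is precisely what makes a stored zero behave like the default limit, the third bullet of this commit.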
