diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec index 15a948f..92a5cf6 100644 --- a/python-urlgrabber.spec +++ b/python-urlgrabber.spec @@ -3,7 +3,7 @@ Summary: A high-level cross-protocol url-grabber Name: python-urlgrabber Version: 3.9.1 -Release: 17%{?dist} +Release: 18%{?dist} Source0: urlgrabber-%{version}.tar.gz Patch1: urlgrabber-HEAD.patch @@ -44,6 +44,9 @@ rm -rf $RPM_BUILD_ROOT %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down %changelog +* Wed Aug 22 2012 Zdeněk Pavlas - 3.9.1-18 +- Update to latest HEAD, lots of enhancements. + * Wed Aug 10 2012 Zdeněk Pavlas - 3.9.1-17 - Fix a bug in progress display code. BZ 847105. diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch index 3d53ec6..56f8f77 100644 --- a/urlgrabber-HEAD.patch +++ b/urlgrabber-HEAD.patch @@ -233,10 +233,10 @@ index 3e5f3b7..8eeaeda 100644 return (fb,lb) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py -index e090e90..bdcdfe3 100644 +index e090e90..ffd5a10 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py -@@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs) +@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs) progress_obj = None a class instance that supports the following methods: @@ -245,7 +245,26 @@ index e090e90..bdcdfe3 100644 # length will be None if unknown po.update(read) # read == bytes read so far po.end() -@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs) + ++ multi_progress_obj = None ++ ++ a class instance that supports the following methods: ++ mo.start(total_files, total_size) ++ mo.newMeter() => meter ++ mo.removeMeter(meter) ++ mo.end() ++ ++ The 'meter' object is similar to progress_obj, but multiple ++ instances may be created and updated at the same time. ++ ++ When downloading multiple files in parallel and multi_progress_obj ++ is None progress_obj is used in compatibility mode: finished files ++ are shown but there's no in-progress display. ++ + text = None + + specifies alternative text to be passed to the progress meter +@@ -68,14 +83,14 @@ GENERAL ARGUMENTS (kwargs) (which can be set on default_grabber.throttle) is used. See BANDWIDTH THROTTLING for more information. @@ -267,7 +286,7 @@ index e090e90..bdcdfe3 100644 bandwidth = 0 -@@ -143,8 +143,12 @@ GENERAL ARGUMENTS (kwargs) +@@ -143,8 +158,12 @@ GENERAL ARGUMENTS (kwargs) note that proxy authentication information may be provided using normal URL constructs: proxies={ 'http' : 'http://user:host@foo:3128' } @@ -282,7 +301,7 @@ index e090e90..bdcdfe3 100644 prefix = None -@@ -198,6 +202,12 @@ GENERAL ARGUMENTS (kwargs) +@@ -198,6 +217,12 @@ GENERAL ARGUMENTS (kwargs) control, you should probably subclass URLParser and pass it in via the 'urlparser' option. @@ -295,7 +314,7 @@ index e090e90..bdcdfe3 100644 ssl_ca_cert = None this option can be used if M2Crypto is available and will be -@@ -211,43 +221,75 @@ GENERAL ARGUMENTS (kwargs) +@@ -211,43 +236,75 @@ GENERAL ARGUMENTS (kwargs) No-op when using the curl backend (default) @@ -380,7 +399,7 @@ index e090e90..bdcdfe3 100644 RETRY RELATED ARGUMENTS -@@ -328,6 +370,15 @@ RETRY RELATED ARGUMENTS +@@ -328,6 +385,15 @@ RETRY RELATED ARGUMENTS but it cannot (without severe trickiness) prevent the exception from being raised. 
@@ -396,7 +415,7 @@ index e090e90..bdcdfe3 100644 interrupt_callback = None This callback is called if KeyboardInterrupt is received at any -@@ -420,6 +471,7 @@ import time +@@ -420,6 +486,7 @@ import time import string import urllib import urllib2 @@ -404,7 +423,7 @@ index e090e90..bdcdfe3 100644 import mimetools import thread import types -@@ -428,9 +480,17 @@ import pycurl +@@ -428,9 +495,17 @@ import pycurl from ftplib import parse150 from StringIO import StringIO from httplib import HTTPException @@ -423,7 +442,7 @@ index e090e90..bdcdfe3 100644 ######################################################################## # MODULE INITIALIZATION ######################################################################## -@@ -439,6 +499,12 @@ try: +@@ -439,6 +514,12 @@ try: except: __version__ = '???' @@ -436,7 +455,7 @@ index e090e90..bdcdfe3 100644 ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. -@@ -527,6 +593,22 @@ def _(st): +@@ -527,6 +608,22 @@ def _(st): # END MODULE INITIALIZATION ######################################################################## @@ -459,7 +478,7 @@ index e090e90..bdcdfe3 100644 class URLGrabError(IOError): -@@ -662,6 +744,7 @@ class URLParser: +@@ -662,6 +759,7 @@ class URLParser: opts.quote = 0 --> do not quote it opts.quote = None --> guess """ @@ -467,7 +486,7 @@ index e090e90..bdcdfe3 100644 quote = opts.quote if opts.prefix: -@@ -768,6 +851,41 @@ class URLGrabberOptions: +@@ -768,6 +866,41 @@ class URLGrabberOptions: else: # throttle is a float return self.bandwidth * self.throttle @@ -509,7 +528,13 @@ index e090e90..bdcdfe3 100644 def derive(self, **kwargs): """Create a derived URLGrabberOptions instance. This method creates a new instance and overrides the -@@ -796,25 +914,31 @@ class URLGrabberOptions: +@@ -791,30 +924,37 @@ class URLGrabberOptions: + provided here. + """ + self.progress_obj = None ++ self.multi_progress_obj = None + self.throttle = 1.0 + self.bandwidth = 0 self.retry = None self.retrycodes = [-1,2,4,5,6,7] self.checkfunc = None @@ -542,7 +567,7 @@ index e090e90..bdcdfe3 100644 self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb self.ssl_context = None # no-op in pycurl self.ssl_verify_peer = True # check peer's cert for authenticityb -@@ -827,6 +951,12 @@ class URLGrabberOptions: +@@ -827,6 +967,12 @@ class URLGrabberOptions: self.size = None # if we know how big the thing we're getting is going # to be. this is ultimately a MAXIMUM size for the file self.max_header_size = 2097152 #2mb seems reasonable for maximum header size @@ -555,7 +580,7 @@ index e090e90..bdcdfe3 100644 def __repr__(self): return self.format() -@@ -846,7 +976,18 @@ class URLGrabberOptions: +@@ -846,7 +992,18 @@ class URLGrabberOptions: s = s + indent + '}' return s @@ -575,7 +600,7 @@ index e090e90..bdcdfe3 100644 """Provides easy opening of URLs with a variety of options. All options are specified as kwargs. 
Options may be specified when -@@ -872,7 +1013,6 @@ class URLGrabber: +@@ -872,7 +1029,6 @@ class URLGrabber: # beware of infinite loops :) tries = tries + 1 exception = None @@ -583,7 +608,7 @@ index e090e90..bdcdfe3 100644 callback = None if DEBUG: DEBUG.info('attempt %i/%s: %s', tries, opts.retry, args[0]) -@@ -883,23 +1023,24 @@ class URLGrabber: +@@ -883,54 +1039,62 @@ class URLGrabber: except URLGrabError, e: exception = e callback = opts.failure_callback @@ -611,24 +636,36 @@ index e090e90..bdcdfe3 100644 if (retrycode is not None) and (retrycode not in opts.retrycodes): if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising', retrycode, opts.retrycodes) -@@ -912,9 +1053,11 @@ class URLGrabber: + raise + +- def urlopen(self, url, **kwargs): ++ def urlopen(self, url, opts=None, **kwargs): + """open the url and return a file object + If a progress object or throttle value specified when this + object was created, then a special file object will be returned that supports them. The file object can be treated like any other file object. """ +- opts = self.opts.derive(**kwargs) + url = _to_utf8(url) - opts = self.opts.derive(**kwargs) ++ opts = (opts or self.opts).derive(**kwargs) if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) + opts.find_proxy(url, parts[0]) def retryfunc(opts, url): return PyCurlFileObject(url, filename=None, opts=opts) return self._retry(opts, retryfunc, url) -@@ -925,12 +1068,17 @@ class URLGrabber: + +- def urlgrab(self, url, filename=None, **kwargs): ++ def urlgrab(self, url, filename=None, opts=None, **kwargs): + """grab the file at and make a local copy at + If filename is none, the basename of the url is used. urlgrab returns the filename of the local file, which may be different from the passed-in filename if copy_local == 0. """ +- opts = self.opts.derive(**kwargs) + url = _to_utf8(url) - opts = self.opts.derive(**kwargs) ++ opts = (opts or self.opts).derive(**kwargs) if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) (scheme, host, path, parm, query, frag) = parts @@ -641,7 +678,7 @@ index e090e90..bdcdfe3 100644 if scheme == 'file' and not opts.copy_local: # just return the name of the local file - don't make a # copy currently -@@ -950,30 +1098,36 @@ class URLGrabber: +@@ -950,41 +1114,49 @@ class URLGrabber: elif not opts.range: if not opts.checkfunc is None: @@ -689,21 +726,24 @@ index e090e90..bdcdfe3 100644 + opts.exception = e + return _run_callback(opts.failfunc, opts) - def urlread(self, url, limit=None, **kwargs): +- def urlread(self, url, limit=None, **kwargs): ++ def urlread(self, url, limit=None, opts=None, **kwargs): """read the url into a string, up to 'limit' bytes -@@ -982,9 +1136,11 @@ class URLGrabber: + If the limit is exceeded, an exception will be thrown. 
Note + that urlread is NOT intended to be used as a way of saying "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' """ +- opts = self.opts.derive(**kwargs) + url = _to_utf8(url) - opts = self.opts.derive(**kwargs) ++ opts = (opts or self.opts).derive(**kwargs) if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) + opts.find_proxy(url, parts[0]) if limit is not None: limit = limit + 1 -@@ -1000,12 +1156,8 @@ class URLGrabber: +@@ -1000,12 +1172,8 @@ class URLGrabber: else: s = fo.read(limit) if not opts.checkfunc is None: @@ -718,7 +758,7 @@ index e090e90..bdcdfe3 100644 finally: fo.close() return s -@@ -1020,6 +1172,7 @@ class URLGrabber: +@@ -1020,6 +1188,7 @@ class URLGrabber: return s def _make_callback(self, callback_obj): @@ -726,7 +766,7 @@ index e090e90..bdcdfe3 100644 if callable(callback_obj): return callback_obj, (), {} else: -@@ -1030,7 +1183,7 @@ class URLGrabber: +@@ -1030,7 +1199,7 @@ class URLGrabber: default_grabber = URLGrabber() @@ -735,7 +775,7 @@ index e090e90..bdcdfe3 100644 def __init__(self, url, filename, opts): self.fo = None self._hdr_dump = '' -@@ -1052,10 +1205,11 @@ class PyCurlFileObject(): +@@ -1052,10 +1221,11 @@ class PyCurlFileObject(): self._reget_length = 0 self._prog_running = False self._error = (None, None) @@ -749,7 +789,7 @@ index e090e90..bdcdfe3 100644 def __getattr__(self, name): """This effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for -@@ -1085,9 +1239,14 @@ class PyCurlFileObject(): +@@ -1085,9 +1255,14 @@ class PyCurlFileObject(): return -1 def _hdr_retrieve(self, buf): @@ -765,7 +805,7 @@ index e090e90..bdcdfe3 100644 try: self._hdr_dump += buf # we have to get the size before we do the progress obj start -@@ -1104,7 +1263,17 @@ class PyCurlFileObject(): +@@ -1104,7 +1279,17 @@ class PyCurlFileObject(): s = parse150(buf) if s: self.size = int(s) @@ -784,7 +824,7 @@ index e090e90..bdcdfe3 100644 return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT -@@ -1113,8 +1282,10 @@ class PyCurlFileObject(): +@@ -1113,8 +1298,10 @@ class PyCurlFileObject(): if self._parsed_hdr: return self._parsed_hdr statusend = self._hdr_dump.find('\n') @@ -795,7 +835,7 @@ index e090e90..bdcdfe3 100644 self._parsed_hdr = mimetools.Message(hdrfp) return self._parsed_hdr -@@ -1127,6 +1298,9 @@ class PyCurlFileObject(): +@@ -1127,6 +1314,9 @@ class PyCurlFileObject(): if not opts: opts = self.opts @@ -805,7 +845,7 @@ index e090e90..bdcdfe3 100644 # defaults we're always going to set self.curl_obj.setopt(pycurl.NOPROGRESS, False) -@@ -1136,11 +1310,21 @@ class PyCurlFileObject(): +@@ -1136,11 +1326,21 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) @@ -827,7 +867,7 @@ index e090e90..bdcdfe3 100644 # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) -@@ -1148,9 +1332,11 @@ class PyCurlFileObject(): +@@ -1148,9 +1348,11 @@ class PyCurlFileObject(): # timeouts timeout = 300 @@ -842,7 +882,7 @@ index e090e90..bdcdfe3 100644 # ssl options if self.scheme == 'https': -@@ -1158,13 +1344,16 @@ class PyCurlFileObject(): +@@ -1158,13 +1360,16 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, 
opts.ssl_verify_peer) @@ -860,7 +900,7 @@ index e090e90..bdcdfe3 100644 if opts.ssl_cert_type: self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) if opts.ssl_key_pass: -@@ -1187,28 +1376,26 @@ class PyCurlFileObject(): +@@ -1187,28 +1392,26 @@ class PyCurlFileObject(): if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) @@ -905,7 +945,7 @@ index e090e90..bdcdfe3 100644 # our url self.curl_obj.setopt(pycurl.URL, self.url) -@@ -1228,12 +1415,14 @@ class PyCurlFileObject(): +@@ -1228,12 +1431,14 @@ class PyCurlFileObject(): code = self.http_code errcode = e.args[0] @@ -922,7 +962,7 @@ index e090e90..bdcdfe3 100644 # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code -@@ -1244,23 +1433,23 @@ class PyCurlFileObject(): +@@ -1244,23 +1449,23 @@ class PyCurlFileObject(): raise KeyboardInterrupt elif errcode == 28: @@ -953,7 +993,7 @@ index e090e90..bdcdfe3 100644 # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside -@@ -1272,33 +1461,94 @@ class PyCurlFileObject(): +@@ -1272,33 +1477,94 @@ class PyCurlFileObject(): elif errcode == 58: msg = _("problem with the local client certificate") err = URLGrabError(14, msg) @@ -1055,7 +1095,7 @@ index e090e90..bdcdfe3 100644 def _do_open(self): self.curl_obj = _curl_cache -@@ -1333,7 +1583,11 @@ class PyCurlFileObject(): +@@ -1333,7 +1599,11 @@ class PyCurlFileObject(): if self.opts.range: rt = self.opts.range @@ -1068,7 +1108,7 @@ index e090e90..bdcdfe3 100644 if rt: header = range_tuple_to_header(rt) -@@ -1434,21 +1688,46 @@ class PyCurlFileObject(): +@@ -1434,21 +1704,46 @@ class PyCurlFileObject(): #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') @@ -1122,7 +1162,7 @@ index e090e90..bdcdfe3 100644 else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) -@@ -1526,17 +1805,20 @@ class PyCurlFileObject(): +@@ -1526,17 +1821,20 @@ class PyCurlFileObject(): if self._prog_running: downloaded += self._reget_length self.opts.progress_obj.update(downloaded) @@ -1148,7 +1188,7 @@ index e090e90..bdcdfe3 100644 msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) -@@ -1544,13 +1826,6 @@ class PyCurlFileObject(): +@@ -1544,13 +1842,6 @@ class PyCurlFileObject(): return True return False @@ -1162,7 +1202,7 @@ index e090e90..bdcdfe3 100644 def read(self, amt=None): self._fill_buffer(amt) if amt is None: -@@ -1582,9 +1857,21 @@ class PyCurlFileObject(): +@@ -1582,9 +1873,21 @@ class PyCurlFileObject(): self.opts.progress_obj.end(self._amount_read) self.fo.close() @@ -1185,7 +1225,7 @@ index e090e90..bdcdfe3 100644 ##################################################################### # DEPRECATED FUNCTIONS -@@ -1621,6 +1908,445 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, +@@ -1621,6 +1924,458 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, ##################################################################### @@ -1309,7 +1349,7 @@ index e090e90..bdcdfe3 100644 + v = getattr(opts, k) + if v is None: continue + arg.append('%s=%s' % (k, _dumps(v))) -+ if opts.progress_obj: ++ if opts.progress_obj and opts.multi_progress_obj: + arg.append('progress_obj=True') + arg = ' '.join(arg) + if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url) @@ -1329,7 +1369,7 
@@ index e090e90..bdcdfe3 100644 + line = line.split(' ', 5) + _id, size = map(int, line[:2]) + if len(line) == 2: -+ self.running[_id].progress_obj.update(size) ++ self.running[_id]._progress.update(size) + continue + # job done + opts = self.running.pop(_id) @@ -1398,19 +1438,20 @@ index e090e90..bdcdfe3 100644 + +_async_queue = [] + -+def parallel_wait(meter = 'text'): ++def parallel_wait(meter=None): + '''Process queued requests in parallel. + ''' + -+ if meter: -+ count = total = 0 -+ for opts in _async_queue: -+ if opts.progress_obj: -+ count += 1 -+ total += opts.size -+ if meter == 'text': -+ from progress import TextMultiFileMeter -+ meter = TextMultiFileMeter() ++ # calculate total sizes ++ meters = {} ++ for opts in _async_queue: ++ if opts.progress_obj and opts.multi_progress_obj: ++ count, total = meters.get(opts.multi_progress_obj) or (0, 0) ++ meters[opts.multi_progress_obj] = count + 1, total + opts.size ++ ++ # start multi-file meters ++ for meter in meters: ++ count, total = meters[meter] + meter.start(count, total) + + dl = _ExternalDownloaderPool() @@ -1420,11 +1461,12 @@ index e090e90..bdcdfe3 100644 + key, limit = opts.async + host_con[key] = host_con.get(key, 0) + 1 + opts.tries = tries -+ if meter and opts.progress_obj: -+ opts.progress_obj = meter.newMeter() -+ opts.progress_obj.start(text=opts.text, basename=os.path.basename(opts.filename)) -+ else: -+ opts.progress_obj = None ++ if opts.progress_obj: ++ if opts.multi_progress_obj: ++ opts._progress = opts.multi_progress_obj.newMeter() ++ opts._progress.start(text=opts.text) ++ else: ++ opts._progress = time.time() # no updates + if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url) + dl.start(opts) + @@ -1432,15 +1474,16 @@ index e090e90..bdcdfe3 100644 + for opts, size, ug_err in dl.perform(): + key, limit = opts.async + host_con[key] -= 1 -+ m = opts.progress_obj -+ if m: -+ if ug_err: -+ m.failure(ug_err.args[1]) ++ if opts.progress_obj: ++ if opts.multi_progress_obj: ++ opts.multi_progress_obj.re.total += size - opts.size # correct totals ++ opts._progress.end(size) ++ opts.multi_progress_obj.removeMeter(opts._progress) + else: -+ # file size might have changed -+ meter.re.total += size - opts.size -+ m.end(size) -+ meter.removeMeter(m) ++ opts.progress_obj.start(text=opts.text, now=opts._progress) ++ opts.progress_obj.update(size) ++ opts.progress_obj.end(size) ++ del opts._progress + + if ug_err is None: + if opts.checkfunc: @@ -1460,13 +1503,15 @@ index e090e90..bdcdfe3 100644 + continue + + if opts.mirror_group: -+ mg, failed, removed = opts.mirror_group ++ mg, errors, failed, removed = opts.mirror_group ++ errors.append((opts.url, str(ug_err))) + failed[key] = failed.get(key, 0) + 1 + opts.mirror = key + opts.exception = ug_err + action = mg.default_action or {} + if mg.failure_callback: -+ opts.tries = sum(failed.values()) ++ opts.tries = len(errors) ++ action = dict(action) # update only the copy + action.update(_run_callback(mg.failure_callback, opts)) + if not action.get('fail', 0): + # mask this mirror and retry @@ -1474,6 +1519,8 @@ index e090e90..bdcdfe3 100644 + removed.add(key) + _async_queue.append(opts) + continue ++ # fail=1 from callback ++ ug_err.errors = errors + + # urlgrab failed + opts.exception = ug_err @@ -1494,11 +1541,11 @@ index e090e90..bdcdfe3 100644 + idx += 1 + + # check global limit -+ while len(dl.running) >= opts.max_connections: ++ while len(dl.running) >= default_grabber.opts.max_connections: + perform() + + if opts.mirror_group: -+ mg, failed, 
removed = opts.mirror_group ++ mg, errors, failed, removed = opts.mirror_group + + # find the best mirror + best = None @@ -1519,9 +1566,14 @@ index e090e90..bdcdfe3 100644 + + if best is None: + opts.exception = URLGrabError(256, _('No more mirrors to try.')) ++ opts.exception.errors = errors + _run_callback(opts.failfunc, opts) + continue + ++ # update the grabber object, apply mirror kwargs ++ grabber = best.get('grabber') or mg.grabber ++ opts.delegate = grabber.opts.derive(**best.get('kwargs', {})) ++ + # update the current mirror and limit + key = best['mirror'] + limit = best.get('kwargs', {}).get('max_connections', 2) @@ -1544,7 +1596,8 @@ index e090e90..bdcdfe3 100644 + + finally: + dl.abort() -+ if meter: meter.end() ++ for meter in meters: ++ meter.end() + del _async_queue[:] + _TH.save() + @@ -1632,20 +1685,24 @@ index e090e90..bdcdfe3 100644 def _main_test(): try: url, filename = sys.argv[1:3] diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py -index dad410b..ac78b34 100644 +index dad410b..b17be17 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -76,6 +76,9 @@ CUSTOMIZATION 'grabber' is omitted, the default grabber will be used. If kwargs are omitted, then (duh) they will not be used. -+ kwarg 'max_connections' is used to store the max connection -+ limit of this mirror. ++ kwarg 'max_connections' limits the number of concurrent ++ connections to this mirror. + 3) Pass keyword arguments when instantiating the mirror group. See, for example, the failure_callback argument. -@@ -90,7 +93,8 @@ CUSTOMIZATION +@@ -87,10 +90,12 @@ CUSTOMIZATION + """ + + ++import sys import random import thread # needed for locking to make this threadsafe @@ -1655,7 +1712,28 @@ index dad410b..ac78b34 100644 def _(st): return st -@@ -184,6 +188,7 @@ class MirrorGroup: +@@ -126,7 +131,9 @@ class MirrorGroup: + files) + + * if the local list is ever exhausted, a URLGrabError will be +- raised (errno=256, no more mirrors) ++ raised (errno=256, No more mirrors). The 'errors' attribute ++ holds a list of (full_url, errmsg) tuples. This contains ++ all URLs tried and the corresponding error messages. + + OPTIONS + +@@ -153,7 +160,8 @@ class MirrorGroup: + + The 'fail' option will cause immediate failure by re-raising + the exception and no further attempts to get the current +- download. ++ download. As in the "No more mirrors" case, the 'errors' ++ attribute is set in the exception object. + + This dict can be set at instantiation time, + mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) +@@ -184,6 +192,7 @@ class MirrorGroup: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried > @@ -1663,7 +1741,7 @@ index dad410b..ac78b34 100644 obj.relative_url = < url relative to the mirror > obj.url = < full url that failed > # .url is just the combination of .mirror -@@ -263,7 +268,8 @@ class MirrorGroup: +@@ -263,7 +272,8 @@ class MirrorGroup: def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: @@ -1673,7 +1751,35 @@ index dad410b..ac78b34 100644 parsed_mirrors.append(m) return parsed_mirrors -@@ -382,7 +388,9 @@ class MirrorGroup: +@@ -280,7 +290,9 @@ class MirrorGroup: + # return a random mirror so that multiple mirrors get used + # even without failures. 
+ if not gr.mirrors: +- raise URLGrabError(256, _('No more mirrors to try.')) ++ e = URLGrabError(256, _('No more mirrors to try.')) ++ e.errors = gr.errors ++ raise e + return gr.mirrors[gr._next] + + def _failure(self, gr, cb_obj): +@@ -307,7 +319,9 @@ class MirrorGroup: + a.update(action) + action = a + self.increment_mirror(gr, action) +- if action and action.get('fail', 0): raise ++ if action and action.get('fail', 0): ++ sys.exc_info()[1].errors = gr.errors ++ raise + + def increment_mirror(self, gr, action={}): + """Tell the mirror object increment the mirror index +@@ -377,35 +391,50 @@ class MirrorGroup: + gr.url = url + gr.kw = dict(kw) + self._load_gr(gr) ++ gr.errors = [] + + for k in self.options: try: del kw[k] except KeyError: pass @@ -1682,8 +1788,21 @@ index dad410b..ac78b34 100644 + tries += 1 mirrorchoice = self._get_mirror(gr) fullurl = self._join_url(mirrorchoice['mirror'], gr.url) - kwargs = dict(mirrorchoice.get('kwargs', {})) -@@ -399,13 +407,24 @@ class MirrorGroup: +- kwargs = dict(mirrorchoice.get('kwargs', {})) +- kwargs.update(kw) + grabber = mirrorchoice.get('grabber') or self.grabber ++ # apply mirrorchoice kwargs on top of grabber.opts ++ opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {})) + func_ref = getattr(grabber, func) + if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) + try: +- return func_ref( *(fullurl,), **kwargs ) ++ return func_ref( *(fullurl,), opts=opts, **kw ) + except URLGrabError, e: + if DEBUG: DEBUG.info('MIRROR: failed') ++ gr.errors.append((fullurl, str(e))) + obj = CallbackObject() + obj.exception = e obj.mirror = mirrorchoice['mirror'] obj.relative_url = gr.url obj.url = fullurl @@ -1695,7 +1814,7 @@ index dad410b..ac78b34 100644 kw['filename'] = filename + if kw.get('async'): + # enable mirror failovers in async path -+ kw['mirror_group'] = self, {}, set() ++ kw['mirror_group'] = self, [], {}, set() + kw['relative_url'] = url + else: + kw.pop('failfunc', None)
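
A minimal driver sketch for the parallel-download path introduced by this patch, assuming the async=(key, max_connections) keyword, the failfunc callback, and the module-level parallel_wait() entry point shown above; the mirror host, URLs and file names are placeholders, and the exact queueing behaviour of urlgrab() with async set is inferred from the patch, not guaranteed.

    # Python 2, matching the urlgrabber code base.
    from urlgrabber.grabber import default_grabber, parallel_wait

    def failed(cb_obj):
        # failfunc: called once per request instead of raising URLGrabError;
        # cb_obj.exception holds the error, cb_obj.url the failed URL.
        print 'download failed: %s (%s)' % (cb_obj.url, cb_obj.exception)

    urls = ['http://mirror.example.com/repo/pkg-%d.rpm' % i for i in range(1, 4)]
    for url in urls:
        # async=(key, limit): requests sharing the same key are capped at
        # 'limit' simultaneous connections; urlgrab() only queues the request.
        default_grabber.urlgrab(url, url.split('/')[-1],
                                async=('mirror.example.com', 3),
                                failfunc=failed)

    # drain the queue; downloads run in the external urlgrabber-ext-down
    # helper processes installed by this package
    parallel_wait()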
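The multi_progress_obj hook documented above pairs with the multi-file meter shipped in urlgrabber.progress; the sketch below assumes TextMultiFileMeter provides the start/newMeter/removeMeter/end interface this patch expects and accepts a no-argument constructor. URLs and file names are again placeholders.

    from urlgrabber.grabber import default_grabber, parallel_wait
    from urlgrabber.progress import TextMeter, TextMultiFileMeter

    # Single-file requests keep using progress_obj; parallel requests get a
    # per-file meter from multi_progress_obj.  If multi_progress_obj is left
    # as None, the "compatibility mode" described above is used instead.
    default_grabber.opts.progress_obj = TextMeter()
    default_grabber.opts.multi_progress_obj = TextMultiFileMeter()

    default_grabber.urlgrab('http://mirror.example.com/a.iso', 'a.iso',
                            async=('mirror.example.com', 2))
    default_grabber.urlgrab('http://mirror.example.com/b.iso', 'b.iso',
                            async=('mirror.example.com', 2))
    parallel_wait()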
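A sketch of consuming the new 'errors' attribute that this patch attaches to the errno-256 "No more mirrors" failure (and to fail=1 callback aborts); the mirror list, the per-mirror max_connections kwarg and the package path are illustrative only.

    from urlgrabber.grabber import URLGrabber, URLGrabError
    from urlgrabber.mirror import MirrorGroup

    mirrors = [
        # per-mirror kwargs may now carry a max_connections limit
        {'mirror': 'http://mirror-a.example.com/repo',
         'kwargs': {'max_connections': 2}},
        {'mirror': 'http://mirror-b.example.com/repo'},
    ]
    mg = MirrorGroup(URLGrabber(retry=2), mirrors)
    try:
        mg.urlgrab('packages/foo-1.0.rpm', 'foo-1.0.rpm')
    except URLGrabError, e:
        if e.errno == 256:
            # e.errors: one (full_url, error_message) tuple per attempt,
            # covering every mirror that was tried before giving up
            for url, msg in e.errors:
                print '%s: %s' % (url, msg)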