Update to latest HEAD.

epel9
Zdeněk Pavlas 13 years ago
parent 651b5b9090
commit 5228c98fd8

@@ -3,7 +3,7 @@
Summary: A high-level cross-protocol url-grabber
Name: python-urlgrabber
Version: 3.9.1
-Release: 17%{?dist}
+Release: 18%{?dist}
Source0: urlgrabber-%{version}.tar.gz
Patch1: urlgrabber-HEAD.patch
@@ -44,6 +44,9 @@ rm -rf $RPM_BUILD_ROOT
%attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down
%changelog
+* Wed Aug 22 2012 Zdeněk Pavlas <zpavlas@redhat.com> - 3.9.1-18
+- Update to latest HEAD, lots of enhancements.
+
* Wed Aug 10 2012 Zdeněk Pavlas <zpavlas@redhat.com> - 3.9.1-17
- Fix a bug in progress display code. BZ 847105.

@@ -233,10 +233,10 @@ index 3e5f3b7..8eeaeda 100644
return (fb,lb)
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..bdcdfe3 100644
+index e090e90..ffd5a10 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
-@@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs)
+@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
progress_obj = None
a class instance that supports the following methods:
@@ -245,7 +286,7 @@ index e090e90..bdcdfe3 100644
# length will be None if unknown
po.update(read) # read == bytes read so far
po.end()
-@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
+ multi_progress_obj = None
+
+ a class instance that supports the following methods:
+ mo.start(total_files, total_size)
+ mo.newMeter() => meter
+ mo.removeMeter(meter)
+ mo.end()
+
+ The 'meter' object is similar to progress_obj, but multiple
+ instances may be created and updated at the same time.
+
+ When downloading multiple files in parallel and multi_progress_obj
+ is None, progress_obj is used in compatibility mode: finished files
+ are shown but there's no in-progress display.
+
text = None
specifies alternative text to be passed to the progress meter
+@@ -68,14 +83,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
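The multi_progress_obj contract documented above is small enough to stub out. The following is a minimal sketch, not urlgrabber code; the class names and output format are invented for illustration:

    import sys

    class SimpleMultiMeter:
        # minimal multi_progress_obj sketch: mo.start(), mo.newMeter(),
        # mo.removeMeter(), mo.end(), per the docstring above
        def __init__(self):
            self.finished = 0
        def start(self, total_files, total_size):
            self.total_files, self.total_size = total_files, total_size
        def newMeter(self):
            return _SubMeter()
        def removeMeter(self, meter):
            self.finished += 1
            sys.stdout.write('%d/%d: %s done\n' % (
                self.finished, self.total_files, meter.text))
        def end(self):
            sys.stdout.write('all files done\n')

    class _SubMeter:
        # per-file meter, driven like progress_obj (start/update/end)
        def __init__(self):
            self.text = None
        def start(self, text=None, **kwargs):
            self.text = text
        def update(self, amount_read):
            pass
        def end(self, amount_read):
            pass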
@@ -267,7 +286,7 @@ index e090e90..bdcdfe3 100644
bandwidth = 0
-@@ -143,8 +143,12 @@ GENERAL ARGUMENTS (kwargs)
+@@ -143,8 +158,12 @@ GENERAL ARGUMENTS (kwargs)
note that proxy authentication information may be provided using
normal URL constructs:
proxies={ 'http' : 'http://user:host@foo:3128' }
@@ -282,7 +301,7 @@ index e090e90..bdcdfe3 100644
prefix = None
-@@ -198,6 +202,12 @@ GENERAL ARGUMENTS (kwargs)
+@@ -198,6 +217,12 @@ GENERAL ARGUMENTS (kwargs)
control, you should probably subclass URLParser and pass it in via
the 'urlparser' option.
@@ -295,7 +314,7 @@ index e090e90..bdcdfe3 100644
ssl_ca_cert = None
this option can be used if M2Crypto is available and will be
-@@ -211,43 +221,75 @@ GENERAL ARGUMENTS (kwargs)
+@@ -211,43 +236,75 @@ GENERAL ARGUMENTS (kwargs)
No-op when using the curl backend (default)
@@ -380,7 +399,7 @@ index e090e90..bdcdfe3 100644
RETRY RELATED ARGUMENTS
-@@ -328,6 +370,15 @@ RETRY RELATED ARGUMENTS
+@@ -328,6 +385,15 @@ RETRY RELATED ARGUMENTS
but it cannot (without severe trickiness) prevent the exception
from being raised.
@@ -396,7 +415,7 @@ index e090e90..bdcdfe3 100644
interrupt_callback = None
This callback is called if KeyboardInterrupt is received at any
-@@ -420,6 +471,7 @@ import time
+@@ -420,6 +486,7 @@ import time
import string
import urllib
import urllib2
@@ -404,7 +423,7 @@ index e090e90..bdcdfe3 100644
import mimetools
import thread
import types
-@@ -428,9 +480,17 @@ import pycurl
+@@ -428,9 +495,17 @@ import pycurl
from ftplib import parse150
from StringIO import StringIO
from httplib import HTTPException
@@ -423,7 +442,7 @@ index e090e90..bdcdfe3 100644
########################################################################
# MODULE INITIALIZATION
########################################################################
-@@ -439,6 +499,12 @@ try:
+@@ -439,6 +514,12 @@ try:
except:
__version__ = '???'
@@ -436,7 +455,7 @@ index e090e90..bdcdfe3 100644
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
-@@ -527,6 +593,22 @@ def _(st):
+@@ -527,6 +608,22 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################
@@ -459,7 +478,7 @@ index e090e90..bdcdfe3 100644
class URLGrabError(IOError):
-@@ -662,6 +744,7 @@ class URLParser:
+@@ -662,6 +759,7 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
@@ -467,7 +486,7 @@ index e090e90..bdcdfe3 100644
quote = opts.quote
if opts.prefix:
-@@ -768,6 +851,41 @@ class URLGrabberOptions:
+@@ -768,6 +866,41 @@ class URLGrabberOptions:
else: # throttle is a float
return self.bandwidth * self.throttle
@@ -509,7 +528,13 @@ index e090e90..bdcdfe3 100644
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
-@@ -796,25 +914,31 @@ class URLGrabberOptions:
+@@ -791,30 +924,37 @@ class URLGrabberOptions:
provided here.
"""
self.progress_obj = None
+ self.multi_progress_obj = None
self.throttle = 1.0
self.bandwidth = 0
self.retry = None
self.retrycodes = [-1,2,4,5,6,7]
self.checkfunc = None
@@ -542,7 +567,7 @@ index e090e90..bdcdfe3 100644
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticityb
-@@ -827,6 +951,12 @@ class URLGrabberOptions:
+@@ -827,6 +967,12 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
@@ -555,7 +580,7 @@ index e090e90..bdcdfe3 100644
def __repr__(self):
return self.format()
-@@ -846,7 +976,18 @@ class URLGrabberOptions:
+@@ -846,7 +992,18 @@ class URLGrabberOptions:
s = s + indent + '}'
return s
@@ -575,7 +600,7 @@ index e090e90..bdcdfe3 100644
"""Provides easy opening of URLs with a variety of options.
All options are specified as kwargs. Options may be specified when
-@@ -872,7 +1013,6 @@ class URLGrabber:
+@@ -872,7 +1029,6 @@ class URLGrabber:
# beware of infinite loops :)
tries = tries + 1
exception = None
@@ -583,7 +608,7 @@ index e090e90..bdcdfe3 100644
callback = None
if DEBUG: DEBUG.info('attempt %i/%s: %s',
tries, opts.retry, args[0])
-@@ -883,23 +1023,24 @@ class URLGrabber:
+@@ -883,54 +1039,62 @@ class URLGrabber:
except URLGrabError, e:
exception = e
callback = opts.failure_callback
@@ -611,24 +636,36 @@ index e090e90..bdcdfe3 100644
if (retrycode is not None) and (retrycode not in opts.retrycodes):
if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
retrycode, opts.retrycodes)
@@ -912,9 +1053,11 @@ class URLGrabber: raise
- def urlopen(self, url, **kwargs):
+ def urlopen(self, url, opts=None, **kwargs):
"""open the url and return a file object
If a progress object or throttle value specified when this
object was created, then a special file object will be
returned that supports them. The file object can be treated
like any other file object.
"""
- opts = self.opts.derive(**kwargs)
+ url = _to_utf8(url)
+ opts = (opts or self.opts).derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
def retryfunc(opts, url):
return PyCurlFileObject(url, filename=None, opts=opts)
return self._retry(opts, retryfunc, url)
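With the new opts parameter, per-call kwargs are layered on top of either the passed-in options object or the grabber's defaults via derive(). A usage sketch (URL and timeout value are placeholders):

    from urlgrabber.grabber import default_grabber

    # kwargs override the defaults for this call only;
    # default_grabber.opts itself is left untouched
    fo = default_grabber.urlopen('http://example.com/file.xml', timeout=30)
    try:
        data = fo.read()
    finally:
        fo.close()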
-@@ -925,12 +1068,17 @@ class URLGrabber:
- def urlgrab(self, url, filename=None, **kwargs):
+ def urlgrab(self, url, filename=None, opts=None, **kwargs):
"""grab the file at <url> and make a local copy at <filename>
If filename is none, the basename of the url is used.
urlgrab returns the filename of the local file, which may be
different from the passed-in filename if copy_local == 0.
"""
- opts = self.opts.derive(**kwargs)
+ url = _to_utf8(url)
+ opts = (opts or self.opts).derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
@@ -641,7 +678,7 @@ index e090e90..bdcdfe3 100644
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
-@@ -950,30 +1098,36 @@ class URLGrabber:
+@@ -950,41 +1114,49 @@ class URLGrabber:
elif not opts.range:
if not opts.checkfunc is None:
@ -689,21 +726,24 @@ index e090e90..bdcdfe3 100644
+ opts.exception = e
+ return _run_callback(opts.failfunc, opts)
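The failfunc hook above turns a failed urlgrab into a callback result instead of a raised exception. A hedged sketch of a caller, assuming the callback receives the options object shown carrying .exception:

    from urlgrabber.grabber import default_grabber

    def failfunc(obj):
        # obj.exception holds the URLGrabError that would have been raised
        print 'download failed: %s' % obj.exception
        return None  # urlgrab() returns this instead of raising

    default_grabber.urlgrab('http://example.com/big.iso', 'big.iso',
                            failfunc=failfunc)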
- def urlread(self, url, limit=None, **kwargs):
+ def urlread(self, url, limit=None, opts=None, **kwargs):
"""read the url into a string, up to 'limit' bytes """read the url into a string, up to 'limit' bytes
@@ -982,9 +1136,11 @@ class URLGrabber: If the limit is exceeded, an exception will be thrown. Note
that urlread is NOT intended to be used as a way of saying
"I want the first N bytes" but rather 'read the whole file "I want the first N bytes" but rather 'read the whole file
into memory, but don't use too much' into memory, but don't use too much'
""" """
- opts = self.opts.derive(**kwargs)
+ url = _to_utf8(url)
+ opts = (opts or self.opts).derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
if limit is not None:
limit = limit + 1
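As the docstring stresses, limit is a cap, not a byte range; the limit + 1 trick above lets the implementation detect overflow. For example:

    from urlgrabber.grabber import default_grabber, URLGrabError

    try:
        # fails (rather than truncates) if the body exceeds 64 KiB
        text = default_grabber.urlread('http://example.com/repomd.xml',
                                       limit=65536)
    except URLGrabError, e:
        print 'fetch failed or file too large:', e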
-@@ -1000,12 +1156,8 @@ class URLGrabber:
+@@ -1000,12 +1172,8 @@ class URLGrabber:
else: s = fo.read(limit)
if not opts.checkfunc is None:
@@ -718,7 +758,7 @@ index e090e90..bdcdfe3 100644
finally:
fo.close()
return s
-@@ -1020,6 +1172,7 @@ class URLGrabber:
+@@ -1020,6 +1188,7 @@ class URLGrabber:
return s
def _make_callback(self, callback_obj):
@@ -726,7 +766,7 @@ index e090e90..bdcdfe3 100644
if callable(callback_obj):
return callback_obj, (), {}
else:
-@@ -1030,7 +1183,7 @@ class URLGrabber:
+@@ -1030,7 +1199,7 @@ class URLGrabber:
default_grabber = URLGrabber()
@@ -735,7 +775,7 @@
def __init__(self, url, filename, opts):
self.fo = None
self._hdr_dump = ''
-@@ -1052,10 +1205,11 @@ class PyCurlFileObject():
+@@ -1052,10 +1221,11 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
@@ -749,7 +789,7 @@
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
-@@ -1085,9 +1239,14 @@ class PyCurlFileObject():
+@@ -1085,9 +1255,14 @@ class PyCurlFileObject():
return -1
def _hdr_retrieve(self, buf):
@@ -765,7 +805,7 @@
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
-@@ -1104,7 +1263,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1279,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
@@ -784,7 +824,7 @@
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
-@@ -1113,8 +1282,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1298,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
@@ -795,7 +835,7 @@
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
-@@ -1127,6 +1298,9 @@ class PyCurlFileObject():
+@@ -1127,6 +1314,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts
@@ -805,7 +845,7 @@
# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
-@@ -1136,11 +1310,21 @@ class PyCurlFileObject():
+@@ -1136,11 +1326,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -827,7 +867,7 @@
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1332,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1348,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
@@ -842,7 +882,7 @@
# ssl options
if self.scheme == 'https':
-@@ -1158,13 +1344,16 @@ class PyCurlFileObject():
+@@ -1158,13 +1360,16 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -860,7 +900,7 @@
if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
-@@ -1187,28 +1376,26 @@ class PyCurlFileObject():
+@@ -1187,28 +1392,26 @@ class PyCurlFileObject():
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
@@ -905,7 +945,7 @@
# our url
self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1228,12 +1415,14 @@ class PyCurlFileObject():
+@@ -1228,12 +1431,14 @@ class PyCurlFileObject():
code = self.http_code
errcode = e.args[0]
@@ -922,7 +962,7 @@
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
-@@ -1244,23 +1433,23 @@ class PyCurlFileObject():
+@@ -1244,23 +1449,23 @@ class PyCurlFileObject():
raise KeyboardInterrupt
elif errcode == 28:
@@ -953,7 +993,7 @@
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
-@@ -1272,33 +1461,94 @@ class PyCurlFileObject():
+@@ -1272,33 +1477,94 @@ class PyCurlFileObject():
elif errcode == 58:
msg = _("problem with the local client certificate")
err = URLGrabError(14, msg)
@@ -1055,7 +1095,7 @@
def _do_open(self):
self.curl_obj = _curl_cache
-@@ -1333,7 +1583,11 @@ class PyCurlFileObject():
+@@ -1333,7 +1599,11 @@ class PyCurlFileObject():
if self.opts.range:
rt = self.opts.range
@@ -1068,7 +1108,7 @@
if rt:
header = range_tuple_to_header(rt)
-@@ -1434,21 +1688,46 @@ class PyCurlFileObject():
+@@ -1434,21 +1704,46 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
@ -1122,7 +1162,7 @@ index e090e90..bdcdfe3 100644
else: else:
#self.fo = open(self._temp_name, 'r') #self.fo = open(self._temp_name, 'r')
self.fo.seek(0) self.fo.seek(0)
@@ -1526,17 +1805,20 @@ class PyCurlFileObject(): @@ -1526,17 +1821,20 @@ class PyCurlFileObject():
if self._prog_running: if self._prog_running:
downloaded += self._reget_length downloaded += self._reget_length
self.opts.progress_obj.update(downloaded) self.opts.progress_obj.update(downloaded)
@@ -1148,7 +1188,7 @@
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
-@@ -1544,13 +1826,6 @@ class PyCurlFileObject():
+@@ -1544,13 +1842,6 @@ class PyCurlFileObject():
return True
return False
@@ -1162,7 +1202,7 @@
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
-@@ -1582,9 +1857,21 @@ class PyCurlFileObject():
+@@ -1582,9 +1873,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
@@ -1185,7 +1225,7 @@
#####################################################################
# DEPRECATED FUNCTIONS
-@@ -1621,6 +1908,445 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+@@ -1621,6 +1924,458 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
#####################################################################
@ -1309,7 +1349,7 @@ index e090e90..bdcdfe3 100644
+ v = getattr(opts, k)
+ if v is None: continue
+ arg.append('%s=%s' % (k, _dumps(v)))
-+ if opts.progress_obj:
++ if opts.progress_obj and opts.multi_progress_obj:
+ arg.append('progress_obj=True')
+ arg = ' '.join(arg)
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
@@ -1329,7 +1369,7 @@
+ line = line.split(' ', 5)
+ _id, size = map(int, line[:2])
+ if len(line) == 2:
-+ self.running[_id].progress_obj.update(size)
++ self.running[_id]._progress.update(size)
+ continue
+ # job done
+ opts = self.running.pop(_id)
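The visible lines imply a small line-oriented reply protocol from urlgrabber-ext-down: two integer fields mean a progress update, a longer record ends the job. A re-statement of that parsing as a standalone sketch (field meanings past the first two are assumptions):

    def _parse_reply(line, running):
        # 'line' is one reply record from the downloader process
        fields = line.split(' ', 5)
        _id, size = int(fields[0]), int(fields[1])
        if len(fields) == 2:
            running[_id]._progress.update(size)    # mid-transfer update
            return None
        return running.pop(_id), size, fields[2:]  # job finished (or failed)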
@@ -1398,19 +1438,20 @@
+
+_async_queue = []
+
-+def parallel_wait(meter = 'text'):
++def parallel_wait(meter=None):
+'''Process queued requests in parallel.
+'''
+
-+ if meter:
++ # calculate total sizes
-+ count = total = 0
++ meters = {}
+ for opts in _async_queue:
-+ if opts.progress_obj:
++ if opts.progress_obj and opts.multi_progress_obj:
-+ count += 1
++ count, total = meters.get(opts.multi_progress_obj) or (0, 0)
-+ total += opts.size
++ meters[opts.multi_progress_obj] = count + 1, total + opts.size
-+ if meter == 'text':
++
-+ from progress import TextMultiFileMeter
++ # start multi-file meters
-+ meter = TextMultiFileMeter()
++ for meter in meters:
+ count, total = meters[meter]
+ meter.start(count, total)
+
+ dl = _ExternalDownloaderPool()
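A sketch of driving the queue; the async=(key, limit) form is inferred from the key, limit = opts.async unpacking below, and the host and file names are placeholders:

    from urlgrabber.grabber import default_grabber, parallel_wait

    for n in (1, 2, 3):
        # async=(key, limit): 'key' groups requests, 'limit' caps
        # concurrent connections for that group
        default_grabber.urlgrab('http://mirror.example.com/pkg%d.rpm' % n,
                                'pkg%d.rpm' % n,
                                async=('mirror.example.com', 2))
    parallel_wait()  # drains _async_queue, driving any meters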
@@ -1420,11 +1461,12 @@
+ key, limit = opts.async
+ host_con[key] = host_con.get(key, 0) + 1
+ opts.tries = tries
-+ if meter and opts.progress_obj:
++ if opts.progress_obj:
-+ opts.progress_obj = meter.newMeter()
++ if opts.multi_progress_obj:
-+ opts.progress_obj.start(text=opts.text, basename=os.path.basename(opts.filename))
++ opts._progress = opts.multi_progress_obj.newMeter()
-+ else:
++ opts._progress.start(text=opts.text)
-+ opts.progress_obj = None
++ else:
+ opts._progress = time.time() # no updates
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+ dl.start(opts)
+
@@ -1432,15 +1474,16 @@
+ for opts, size, ug_err in dl.perform():
+ key, limit = opts.async
+ host_con[key] -= 1
-+ m = opts.progress_obj
++ if opts.progress_obj:
-+ if m:
++ if opts.multi_progress_obj:
-+ if ug_err:
++ opts.multi_progress_obj.re.total += size - opts.size # correct totals
-+ m.failure(ug_err.args[1])
++ opts._progress.end(size)
+ opts.multi_progress_obj.removeMeter(opts._progress)
+ else:
-+ # file size might have changed
++ opts.progress_obj.start(text=opts.text, now=opts._progress)
-+ meter.re.total += size - opts.size
++ opts.progress_obj.update(size)
-+ m.end(size)
++ opts.progress_obj.end(size)
-+ meter.removeMeter(m)
++ del opts._progress
+
+ if ug_err is None:
+ if opts.checkfunc:
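In the compatibility branch above, a plain progress_obj is driven start/update/end in one burst per finished file. Even a trivial meter therefore still reports completions; a hypothetical minimal one:

    class OneShotMeter:
        # enough of the progress_obj interface for the compatibility path
        def start(self, text=None, now=None, **kwargs):
            self.text = text
        def update(self, amount_read):
            pass
        def end(self, amount_read):
            print 'finished %s (%d bytes)' % (self.text, amount_read)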
@@ -1460,13 +1503,15 @@
+ continue
+
+ if opts.mirror_group:
-+ mg, failed, removed = opts.mirror_group
++ mg, errors, failed, removed = opts.mirror_group
+ errors.append((opts.url, str(ug_err)))
+ failed[key] = failed.get(key, 0) + 1
+ opts.mirror = key
+ opts.exception = ug_err
+ action = mg.default_action or {}
+ if mg.failure_callback:
-+ opts.tries = sum(failed.values())
++ opts.tries = len(errors)
+ action = dict(action) # update only the copy
+ action.update(_run_callback(mg.failure_callback, opts))
+ if not action.get('fail', 0):
+ # mask this mirror and retry
@@ -1474,6 +1519,8 @@
+ removed.add(key)
+ _async_queue.append(opts)
+ continue
+ # fail=1 from callback
+ ug_err.errors = errors
+
+ # urlgrab failed
+ opts.exception = ug_err
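This mirrors the synchronous MirrorGroup failover: the callback may return an action dict, and fail=1 stops the retries. A hedged example callback:

    def failure_callback(obj):
        # obj.tries == len(errors) so far, per the hunk above
        if obj.tries >= 3:
            return {'fail': 1}   # give up; the exception gains .errors
        return {}                # mask this mirror and requeue the download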
@@ -1494,11 +1541,11 @@
+ idx += 1
+
+ # check global limit
-+ while len(dl.running) >= opts.max_connections:
++ while len(dl.running) >= default_grabber.opts.max_connections:
+ perform()
+
+ if opts.mirror_group:
-+ mg, failed, removed = opts.mirror_group
++ mg, errors, failed, removed = opts.mirror_group
+
+ # find the best mirror
+ best = None
@@ -1519,9 +1566,14 @@
+
+ if best is None:
+ opts.exception = URLGrabError(256, _('No more mirrors to try.'))
+ opts.exception.errors = errors
+ _run_callback(opts.failfunc, opts)
+ continue
+
+ # update the grabber object, apply mirror kwargs
+ grabber = best.get('grabber') or mg.grabber
+ opts.delegate = grabber.opts.derive(**best.get('kwargs', {}))
+
+ # update the current mirror and limit
+ key = best['mirror']
+ limit = best.get('kwargs', {}).get('max_connections', 2)
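The selection logic itself is elided by the hunk; based only on the surrounding bookkeeping (the removed set, host_con counts, and per-mirror max_connections defaulting to 2), a purely hypothetical version might look like:

    def pick_best(mirrors, host_con, removed):
        # prefer an unmasked mirror with a free connection slot
        best = None
        for m in mirrors:
            key = m['mirror']
            if key in removed:
                continue
            limit = m.get('kwargs', {}).get('max_connections', 2)
            if host_con.get(key, 0) >= limit:
                continue
            if best is None or host_con.get(key, 0) < host_con.get(best['mirror'], 0):
                best = m
        return best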
@@ -1544,7 +1596,8 @@
+
+ finally:
+ dl.abort()
-+ if meter: meter.end()
++ for meter in meters:
+ meter.end()
+ del _async_queue[:]
+ _TH.save()
+
@@ -1632,20 +1685,24 @@
def _main_test():
try: url, filename = sys.argv[1:3]
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
-index dad410b..ac78b34 100644
+index dad410b..b17be17 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -76,6 +76,9 @@ CUSTOMIZATION
'grabber' is omitted, the default grabber will be used. If
kwargs are omitted, then (duh) they will not be used.
-+ kwarg 'max_connections' is used to store the max connection
-+ limit of this mirror.
++ kwarg 'max_connections' limits the number of concurrent
++ connections to this mirror.
+
3) Pass keyword arguments when instantiating the mirror group.
See, for example, the failure_callback argument.
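A sketch of a mirror list using the documented kwarg (hosts are placeholders; the default limit of 2 comes from the grabber hunk above):

    from urlgrabber.grabber import default_grabber
    from urlgrabber.mirror import MirrorGroup

    mirrors = [
        {'mirror': 'http://m1.example.com/repo/',
         'kwargs': {'max_connections': 4}},
        {'mirror': 'http://m2.example.com/repo/'},  # default limit applies
    ]
    mg = MirrorGroup(default_grabber, mirrors)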
-@@ -90,7 +93,8 @@ CUSTOMIZATION
+@@ -87,10 +90,12 @@ CUSTOMIZATION
"""
+import sys
import random
import thread # needed for locking to make this threadsafe
@@ -1655,7 +1712,28 @@
def _(st):
return st
@@ -184,6 +188,7 @@ class MirrorGroup: @@ -126,7 +131,9 @@ class MirrorGroup:
files)
* if the local list is ever exhausted, a URLGrabError will be
- raised (errno=256, no more mirrors)
+ raised (errno=256, No more mirrors). The 'errors' attribute
+ holds a list of (full_url, errmsg) tuples. This contains
+ all URLs tried and the corresponding error messages.
OPTIONS
@@ -153,7 +160,8 @@ class MirrorGroup:
The 'fail' option will cause immediate failure by re-raising
the exception and no further attempts to get the current
- download.
+ download. As in the "No more mirrors" case, the 'errors'
+ attribute is set in the exception object.
This dict can be set at instantiation time,
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
@@ -184,6 +192,7 @@ class MirrorGroup:
obj.exception = < exception that was raised > obj.exception = < exception that was raised >
obj.mirror = < the mirror that was tried > obj.mirror = < the mirror that was tried >
@ -1663,7 +1741,7 @@ index dad410b..ac78b34 100644
obj.relative_url = < url relative to the mirror > obj.relative_url = < url relative to the mirror >
obj.url = < full url that failed > obj.url = < full url that failed >
# .url is just the combination of .mirror # .url is just the combination of .mirror
@@ -263,7 +268,8 @@ class MirrorGroup: @@ -263,7 +272,8 @@ class MirrorGroup:
def _parse_mirrors(self, mirrors): def _parse_mirrors(self, mirrors):
parsed_mirrors = [] parsed_mirrors = []
for m in mirrors: for m in mirrors:
@ -1673,7 +1751,35 @@ index dad410b..ac78b34 100644
parsed_mirrors.append(m) parsed_mirrors.append(m)
return parsed_mirrors return parsed_mirrors
@@ -382,7 +388,9 @@ class MirrorGroup: @@ -280,7 +290,9 @@ class MirrorGroup:
# return a random mirror so that multiple mirrors get used
# even without failures.
if not gr.mirrors:
- raise URLGrabError(256, _('No more mirrors to try.'))
+ e = URLGrabError(256, _('No more mirrors to try.'))
+ e.errors = gr.errors
+ raise e
return gr.mirrors[gr._next]
def _failure(self, gr, cb_obj):
@@ -307,7 +319,9 @@ class MirrorGroup:
a.update(action)
action = a
self.increment_mirror(gr, action)
- if action and action.get('fail', 0): raise
+ if action and action.get('fail', 0):
+ sys.exc_info()[1].errors = gr.errors
+ raise
def increment_mirror(self, gr, action={}):
"""Tell the mirror object increment the mirror index
@@ -377,35 +391,50 @@ class MirrorGroup:
gr.url = url
gr.kw = dict(kw)
self._load_gr(gr)
+ gr.errors = []
for k in self.options:
try: del kw[k] try: del kw[k]
except KeyError: pass except KeyError: pass
@ -1682,8 +1788,21 @@ index dad410b..ac78b34 100644
+ tries += 1 + tries += 1
mirrorchoice = self._get_mirror(gr) mirrorchoice = self._get_mirror(gr)
fullurl = self._join_url(mirrorchoice['mirror'], gr.url) fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
kwargs = dict(mirrorchoice.get('kwargs', {})) - kwargs = dict(mirrorchoice.get('kwargs', {}))
@@ -399,13 +407,24 @@ class MirrorGroup: - kwargs.update(kw)
grabber = mirrorchoice.get('grabber') or self.grabber
+ # apply mirrorchoice kwargs on top of grabber.opts
+ opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {}))
func_ref = getattr(grabber, func)
if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
try:
- return func_ref( *(fullurl,), **kwargs )
+ return func_ref( *(fullurl,), opts=opts, **kw )
except URLGrabError, e:
if DEBUG: DEBUG.info('MIRROR: failed')
+ gr.errors.append((fullurl, str(e)))
obj = CallbackObject()
obj.exception = e
obj.mirror = mirrorchoice['mirror'] obj.mirror = mirrorchoice['mirror']
obj.relative_url = gr.url obj.relative_url = gr.url
obj.url = fullurl obj.url = fullurl
@ -1695,7 +1814,7 @@ index dad410b..ac78b34 100644
kw['filename'] = filename kw['filename'] = filename
+ if kw.get('async'): + if kw.get('async'):
+ # enable mirror failovers in async path + # enable mirror failovers in async path
+ kw['mirror_group'] = self, {}, set() + kw['mirror_group'] = self, [], {}, set()
+ kw['relative_url'] = url + kw['relative_url'] = url
+ else: + else:
+ kw.pop('failfunc', None) + kw.pop('failfunc', None)
