|
|
|
@ -236,7 +236,7 @@ index 3e5f3b7..8eeaeda 100644
|
|
|
|
|
return (fb,lb)
|
|
|
|
|
|
|
|
|
|
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
|
|
|
|
|
index e090e90..01218b0 100644
|
|
|
|
|
index e090e90..74a692c 100644
|
|
|
|
|
--- a/urlgrabber/grabber.py
|
|
|
|
|
+++ b/urlgrabber/grabber.py
|
|
|
|
|
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
@ -458,7 +458,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
########################################################################
|
|
|
|
|
# functions for debugging output. These functions are here because they
|
|
|
|
|
# are also part of the module initialization.
|
|
|
|
|
@@ -527,6 +608,22 @@ def _(st):
|
|
|
|
|
@@ -527,6 +608,29 @@ def _(st):
|
|
|
|
|
# END MODULE INITIALIZATION
|
|
|
|
|
########################################################################
|
|
|
|
|
|
|
|
|
@ -475,13 +475,20 @@ index e090e90..01218b0 100644
|
|
|
|
|
+ obj = obj.encode('utf-8', errors)
|
|
|
|
|
+ return obj
|
|
|
|
|
+
|
|
|
|
|
+def exception2msg(e):
|
|
|
|
|
+ try:
|
|
|
|
|
+ return str(e)
|
|
|
|
|
+ except UnicodeEncodeError:
|
|
|
|
|
+ # always use byte strings
|
|
|
|
|
+ return unicode(e).encode('utf8')
|
|
|
|
|
+
|
|
|
|
|
+########################################################################
|
|
|
|
|
+# END UTILITY FUNCTIONS
|
|
|
|
|
+########################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class URLGrabError(IOError):
|
|
|
|
|
@@ -662,6 +759,7 @@ class URLParser:
|
|
|
|
|
@@ -662,6 +766,7 @@ class URLParser:
|
|
|
|
|
opts.quote = 0 --> do not quote it
|
|
|
|
|
opts.quote = None --> guess
|
|
|
|
|
"""
|
|
|
|
@ -489,7 +496,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
quote = opts.quote
|
|
|
|
|
|
|
|
|
|
if opts.prefix:
|
|
|
|
|
@@ -768,6 +866,41 @@ class URLGrabberOptions:
|
|
|
|
|
@@ -768,6 +873,41 @@ class URLGrabberOptions:
|
|
|
|
|
else: # throttle is a float
|
|
|
|
|
return self.bandwidth * self.throttle
|
|
|
|
|
|
|
|
|
@ -531,7 +538,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
def derive(self, **kwargs):
|
|
|
|
|
"""Create a derived URLGrabberOptions instance.
|
|
|
|
|
This method creates a new instance and overrides the
|
|
|
|
|
@@ -791,30 +924,37 @@ class URLGrabberOptions:
|
|
|
|
|
@@ -791,30 +931,37 @@ class URLGrabberOptions:
|
|
|
|
|
provided here.
|
|
|
|
|
"""
|
|
|
|
|
self.progress_obj = None
|
|
|
|
@ -570,7 +577,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
|
|
|
|
|
self.ssl_context = None # no-op in pycurl
|
|
|
|
|
self.ssl_verify_peer = True # check peer's cert for authenticityb
|
|
|
|
|
@@ -827,6 +967,12 @@ class URLGrabberOptions:
|
|
|
|
|
@@ -827,6 +974,12 @@ class URLGrabberOptions:
|
|
|
|
|
self.size = None # if we know how big the thing we're getting is going
|
|
|
|
|
# to be. this is ultimately a MAXIMUM size for the file
|
|
|
|
|
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
|
|
|
|
@ -583,7 +590,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
|
return self.format()
|
|
|
|
|
@@ -846,7 +992,18 @@ class URLGrabberOptions:
|
|
|
|
|
@@ -846,7 +999,18 @@ class URLGrabberOptions:
|
|
|
|
|
s = s + indent + '}'
|
|
|
|
|
return s
|
|
|
|
|
|
|
|
|
@ -603,7 +610,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
"""Provides easy opening of URLs with a variety of options.
|
|
|
|
|
|
|
|
|
|
All options are specified as kwargs. Options may be specified when
|
|
|
|
|
@@ -872,7 +1029,6 @@ class URLGrabber:
|
|
|
|
|
@@ -872,7 +1036,6 @@ class URLGrabber:
|
|
|
|
|
# beware of infinite loops :)
|
|
|
|
|
tries = tries + 1
|
|
|
|
|
exception = None
|
|
|
|
@ -611,7 +618,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
callback = None
|
|
|
|
|
if DEBUG: DEBUG.info('attempt %i/%s: %s',
|
|
|
|
|
tries, opts.retry, args[0])
|
|
|
|
|
@@ -883,54 +1039,62 @@ class URLGrabber:
|
|
|
|
|
@@ -883,54 +1046,62 @@ class URLGrabber:
|
|
|
|
|
except URLGrabError, e:
|
|
|
|
|
exception = e
|
|
|
|
|
callback = opts.failure_callback
|
|
|
|
@ -681,7 +688,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
if scheme == 'file' and not opts.copy_local:
|
|
|
|
|
# just return the name of the local file - don't make a
|
|
|
|
|
# copy currently
|
|
|
|
|
@@ -950,41 +1114,51 @@ class URLGrabber:
|
|
|
|
|
@@ -950,41 +1121,51 @@ class URLGrabber:
|
|
|
|
|
|
|
|
|
|
elif not opts.range:
|
|
|
|
|
if not opts.checkfunc is None:
|
|
|
|
@ -748,7 +755,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
if limit is not None:
|
|
|
|
|
limit = limit + 1
|
|
|
|
|
|
|
|
|
|
@@ -1000,12 +1174,8 @@ class URLGrabber:
|
|
|
|
|
@@ -1000,12 +1181,8 @@ class URLGrabber:
|
|
|
|
|
else: s = fo.read(limit)
|
|
|
|
|
|
|
|
|
|
if not opts.checkfunc is None:
|
|
|
|
@ -763,7 +770,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
finally:
|
|
|
|
|
fo.close()
|
|
|
|
|
return s
|
|
|
|
|
@@ -1020,6 +1190,7 @@ class URLGrabber:
|
|
|
|
|
@@ -1020,6 +1197,7 @@ class URLGrabber:
|
|
|
|
|
return s
|
|
|
|
|
|
|
|
|
|
def _make_callback(self, callback_obj):
|
|
|
|
@ -771,7 +778,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
if callable(callback_obj):
|
|
|
|
|
return callback_obj, (), {}
|
|
|
|
|
else:
|
|
|
|
|
@@ -1030,7 +1201,7 @@ class URLGrabber:
|
|
|
|
|
@@ -1030,7 +1208,7 @@ class URLGrabber:
|
|
|
|
|
default_grabber = URLGrabber()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -780,7 +787,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
def __init__(self, url, filename, opts):
|
|
|
|
|
self.fo = None
|
|
|
|
|
self._hdr_dump = ''
|
|
|
|
|
@@ -1052,10 +1223,13 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1052,10 +1230,13 @@ class PyCurlFileObject():
|
|
|
|
|
self._reget_length = 0
|
|
|
|
|
self._prog_running = False
|
|
|
|
|
self._error = (None, None)
|
|
|
|
@ -796,7 +803,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
def __getattr__(self, name):
|
|
|
|
|
"""This effectively allows us to wrap at the instance level.
|
|
|
|
|
Any attribute not found in _this_ object will be searched for
|
|
|
|
|
@@ -1067,6 +1241,12 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1067,6 +1248,12 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
def _retrieve(self, buf):
|
|
|
|
|
try:
|
|
|
|
@ -809,7 +816,18 @@ index e090e90..01218b0 100644
|
|
|
|
|
if not self._prog_running:
|
|
|
|
|
if self.opts.progress_obj:
|
|
|
|
|
size = self.size + self._reget_length
|
|
|
|
|
@@ -1085,9 +1265,14 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1079,15 +1266,24 @@ class PyCurlFileObject():
|
|
|
|
|
self.opts.progress_obj.update(self._amount_read)
|
|
|
|
|
|
|
|
|
|
self._amount_read += len(buf)
|
|
|
|
|
- self.fo.write(buf)
|
|
|
|
|
+ try:
|
|
|
|
|
+ self.fo.write(buf)
|
|
|
|
|
+ except IOError, e:
|
|
|
|
|
+ self._cb_error = URLGrabError(16, exception2msg(e))
|
|
|
|
|
+ return -1
|
|
|
|
|
return len(buf)
|
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
|
return -1
|
|
|
|
|
|
|
|
|
|
def _hdr_retrieve(self, buf):
|
|
|
|
@ -825,7 +843,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
try:
|
|
|
|
|
self._hdr_dump += buf
|
|
|
|
|
# we have to get the size before we do the progress obj start
|
|
|
|
|
@@ -1104,7 +1289,17 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1104,7 +1300,17 @@ class PyCurlFileObject():
|
|
|
|
|
s = parse150(buf)
|
|
|
|
|
if s:
|
|
|
|
|
self.size = int(s)
|
|
|
|
@ -844,7 +862,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
return len(buf)
|
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
|
return pycurl.READFUNC_ABORT
|
|
|
|
|
@@ -1113,8 +1308,10 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1113,8 +1319,10 @@ class PyCurlFileObject():
|
|
|
|
|
if self._parsed_hdr:
|
|
|
|
|
return self._parsed_hdr
|
|
|
|
|
statusend = self._hdr_dump.find('\n')
|
|
|
|
@ -855,7 +873,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
self._parsed_hdr = mimetools.Message(hdrfp)
|
|
|
|
|
return self._parsed_hdr
|
|
|
|
|
|
|
|
|
|
@@ -1127,6 +1324,9 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1127,6 +1335,9 @@ class PyCurlFileObject():
|
|
|
|
|
if not opts:
|
|
|
|
|
opts = self.opts
|
|
|
|
|
|
|
|
|
@ -865,7 +883,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
|
|
|
|
|
# defaults we're always going to set
|
|
|
|
|
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
|
|
|
|
|
@@ -1136,11 +1336,21 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1136,11 +1347,21 @@ class PyCurlFileObject():
|
|
|
|
|
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
|
|
|
|
|
self.curl_obj.setopt(pycurl.FAILONERROR, True)
|
|
|
|
|
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
|
|
|
|
@ -887,7 +905,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
|
|
|
|
|
# maybe to be options later
|
|
|
|
|
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
|
|
|
|
|
@@ -1148,9 +1358,11 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1148,9 +1369,11 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
# timeouts
|
|
|
|
|
timeout = 300
|
|
|
|
@ -897,12 +915,12 @@ index e090e90..01218b0 100644
|
|
|
|
|
+ if hasattr(opts, 'timeout'):
|
|
|
|
|
+ timeout = int(opts.timeout or 0)
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1000)
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
|
|
|
|
|
|
|
|
|
|
# ssl options
|
|
|
|
|
if self.scheme == 'https':
|
|
|
|
|
@@ -1158,13 +1370,16 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1158,13 +1381,16 @@ class PyCurlFileObject():
|
|
|
|
|
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
|
|
|
|
|
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
|
|
|
|
|
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
|
|
|
|
@ -920,7 +938,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
if opts.ssl_cert_type:
|
|
|
|
|
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
|
|
|
|
|
if opts.ssl_key_pass:
|
|
|
|
|
@@ -1187,28 +1402,26 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1187,28 +1413,26 @@ class PyCurlFileObject():
|
|
|
|
|
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
|
|
|
|
|
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
|
|
|
|
|
|
|
|
|
@ -965,7 +983,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
|
|
|
|
|
# our url
|
|
|
|
|
self.curl_obj.setopt(pycurl.URL, self.url)
|
|
|
|
|
@@ -1228,12 +1441,14 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1228,39 +1452,36 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
code = self.http_code
|
|
|
|
|
errcode = e.args[0]
|
|
|
|
@ -974,16 +992,19 @@ index e090e90..01218b0 100644
|
|
|
|
|
if self._error[0]:
|
|
|
|
|
errcode = self._error[0]
|
|
|
|
|
|
|
|
|
|
if errcode == 23 and code >= 200 and code < 299:
|
|
|
|
|
- if errcode == 23 and code >= 200 and code < 299:
|
|
|
|
|
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
|
|
|
|
|
+ err.url = errurl
|
|
|
|
|
|
|
|
|
|
-
|
|
|
|
|
+ if errcode == 23 and 200 <= code <= 299:
|
|
|
|
|
# this is probably wrong but ultimately this is what happens
|
|
|
|
|
# we have a legit http code and a pycurl 'writer failed' code
|
|
|
|
|
@@ -1244,23 +1459,23 @@ class PyCurlFileObject():
|
|
|
|
|
raise KeyboardInterrupt
|
|
|
|
|
# which almost always means something aborted it from outside
|
|
|
|
|
# since we cannot know what it is -I'm banking on it being
|
|
|
|
|
# a ctrl-c. XXXX - if there's a way of going back two raises to
|
|
|
|
|
# figure out what aborted the pycurl process FIXME
|
|
|
|
|
- raise KeyboardInterrupt
|
|
|
|
|
+ raise getattr(self, '_cb_error', KeyboardInterrupt)
|
|
|
|
|
|
|
|
|
|
elif errcode == 28:
|
|
|
|
|
- err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
|
|
|
|
@ -1008,12 +1029,10 @@ index e090e90..01218b0 100644
|
|
|
|
|
elif errcode == 42:
|
|
|
|
|
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
|
|
|
|
|
+ err.url = errurl
|
|
|
|
|
# this is probably wrong but ultimately this is what happens
|
|
|
|
|
# we have a legit http code and a pycurl 'writer failed' code
|
|
|
|
|
# which almost always means something aborted it from outside
|
|
|
|
|
@@ -1272,33 +1487,94 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1272,33 +1493,94 @@ class PyCurlFileObject():
|
|
|
|
|
elif errcode == 58:
|
|
|
|
|
msg = _("problem with the local client certificate")
|
|
|
|
|
err = URLGrabError(14, msg)
|
|
|
|
@ -1040,8 +1059,9 @@ index e090e90..01218b0 100644
|
|
|
|
|
+ err.url = errurl
|
|
|
|
|
raise err
|
|
|
|
|
|
|
|
|
|
elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
|
|
|
|
|
- elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
|
|
|
|
|
- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
|
|
|
|
|
+ elif str(e.args[1]) == '' and code and not 200 <= code <= 299:
|
|
|
|
|
+ if self.scheme in ['http', 'https']:
|
|
|
|
|
+ if self.http_code in responses:
|
|
|
|
|
+ resp = responses[self.http_code]
|
|
|
|
@ -1115,7 +1135,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
|
|
|
|
|
def _do_open(self):
|
|
|
|
|
self.curl_obj = _curl_cache
|
|
|
|
|
@@ -1333,7 +1609,11 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1333,7 +1615,11 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
if self.opts.range:
|
|
|
|
|
rt = self.opts.range
|
|
|
|
@ -1128,7 +1148,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
|
|
|
|
|
if rt:
|
|
|
|
|
header = range_tuple_to_header(rt)
|
|
|
|
|
@@ -1434,21 +1714,46 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1434,21 +1720,46 @@ class PyCurlFileObject():
|
|
|
|
|
#fh, self._temp_name = mkstemp()
|
|
|
|
|
#self.fo = open(self._temp_name, 'wb')
|
|
|
|
|
|
|
|
|
@ -1182,7 +1202,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
else:
|
|
|
|
|
#self.fo = open(self._temp_name, 'r')
|
|
|
|
|
self.fo.seek(0)
|
|
|
|
|
@@ -1526,17 +1831,20 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1526,17 +1837,20 @@ class PyCurlFileObject():
|
|
|
|
|
if self._prog_running:
|
|
|
|
|
downloaded += self._reget_length
|
|
|
|
|
self.opts.progress_obj.update(downloaded)
|
|
|
|
@ -1208,7 +1228,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
|
|
|
|
|
msg = _("Downloaded more than max size for %s: %s > %s") \
|
|
|
|
|
% (self.url, cur, max_size)
|
|
|
|
|
@@ -1544,13 +1852,6 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1544,13 +1858,6 @@ class PyCurlFileObject():
|
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
@ -1222,7 +1242,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
def read(self, amt=None):
|
|
|
|
|
self._fill_buffer(amt)
|
|
|
|
|
if amt is None:
|
|
|
|
|
@@ -1582,9 +1883,21 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1582,9 +1889,21 @@ class PyCurlFileObject():
|
|
|
|
|
self.opts.progress_obj.end(self._amount_read)
|
|
|
|
|
self.fo.close()
|
|
|
|
|
|
|
|
|
@ -1245,7 +1265,7 @@ index e090e90..01218b0 100644
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
|
# DEPRECATED FUNCTIONS
|
|
|
|
|
@@ -1621,6 +1934,466 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
|
|
|
|
|
@@ -1621,6 +1940,467 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
@ -1580,11 +1600,12 @@ index e090e90..01218b0 100644
|
|
|
|
|
+ if key in removed: continue
|
|
|
|
|
+
|
|
|
|
|
+ # estimate mirror speed
|
|
|
|
|
+ speed = _TH.estimate(key)
|
|
|
|
|
+ speed, fail = _TH.estimate(key)
|
|
|
|
|
+ speed /= 1 + host_con.get(key, 0)
|
|
|
|
|
+
|
|
|
|
|
+ # order by: least failures, private flag, best speed
|
|
|
|
|
+ private = mirror.get('kwargs', {}).get('private', False)
|
|
|
|
|
+ # ignore 'private' flag if there were failures
|
|
|
|
|
+ private = not fail and mirror.get('kwargs', {}).get('private', False)
|
|
|
|
|
+ speed = -failed.get(key, 0), private, speed
|
|
|
|
|
+ if best is None or speed > best_speed:
|
|
|
|
|
+ best = mirror
|
|
|
|
@ -1701,19 +1722,19 @@ index e090e90..01218b0 100644
|
|
|
|
|
+
|
|
|
|
|
+ default_speed = default_grabber.opts.default_speed
|
|
|
|
|
+ try: speed, fail, ts = _TH.hosts[host]
|
|
|
|
|
+ except KeyError: return default_speed
|
|
|
|
|
+ except KeyError: return default_speed, 0
|
|
|
|
|
+
|
|
|
|
|
+ speed *= 2**-fail
|
|
|
|
|
+ k = 2**((ts - time.time()) / default_grabber.opts.half_life)
|
|
|
|
|
+ speed = k * speed + (1 - k) * default_speed
|
|
|
|
|
+ return speed
|
|
|
|
|
+ return speed, fail
|
|
|
|
|
+
|
|
|
|
|
+#####################################################################
|
|
|
|
|
# TESTING
|
|
|
|
|
def _main_test():
|
|
|
|
|
try: url, filename = sys.argv[1:3]
|
|
|
|
|
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
|
|
|
|
|
index dad410b..b17be17 100644
|
|
|
|
|
index dad410b..7975f1b 100644
|
|
|
|
|
--- a/urlgrabber/mirror.py
|
|
|
|
|
+++ b/urlgrabber/mirror.py
|
|
|
|
|
@@ -76,6 +76,9 @@ CUSTOMIZATION
|
|
|
|
@ -1726,7 +1747,7 @@ index dad410b..b17be17 100644
|
|
|
|
|
3) Pass keyword arguments when instantiating the mirror group.
|
|
|
|
|
See, for example, the failure_callback argument.
|
|
|
|
|
|
|
|
|
|
@@ -87,10 +90,12 @@ CUSTOMIZATION
|
|
|
|
|
@@ -87,10 +90,14 @@ CUSTOMIZATION
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -1737,10 +1758,12 @@ index dad410b..b17be17 100644
|
|
|
|
|
-from grabber import URLGrabError, CallbackObject, DEBUG
|
|
|
|
|
+from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
|
|
|
|
|
+from grabber import _run_callback, _do_raise
|
|
|
|
|
+from grabber import exception2msg
|
|
|
|
|
+from grabber import _TH
|
|
|
|
|
|
|
|
|
|
def _(st):
|
|
|
|
|
return st
|
|
|
|
|
@@ -126,7 +131,9 @@ class MirrorGroup:
|
|
|
|
|
@@ -126,7 +133,9 @@ class MirrorGroup:
|
|
|
|
|
files)
|
|
|
|
|
|
|
|
|
|
* if the local list is ever exhausted, a URLGrabError will be
|
|
|
|
@ -1751,7 +1774,7 @@ index dad410b..b17be17 100644
|
|
|
|
|
|
|
|
|
|
OPTIONS
|
|
|
|
|
|
|
|
|
|
@@ -153,7 +160,8 @@ class MirrorGroup:
|
|
|
|
|
@@ -153,7 +162,8 @@ class MirrorGroup:
|
|
|
|
|
|
|
|
|
|
The 'fail' option will cause immediate failure by re-raising
|
|
|
|
|
the exception and no further attempts to get the current
|
|
|
|
@ -1761,7 +1784,7 @@ index dad410b..b17be17 100644
|
|
|
|
|
|
|
|
|
|
This dict can be set at instantiation time,
|
|
|
|
|
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
|
|
|
|
|
@@ -184,6 +192,7 @@ class MirrorGroup:
|
|
|
|
|
@@ -184,6 +194,7 @@ class MirrorGroup:
|
|
|
|
|
|
|
|
|
|
obj.exception = < exception that was raised >
|
|
|
|
|
obj.mirror = < the mirror that was tried >
|
|
|
|
@ -1769,7 +1792,25 @@ index dad410b..b17be17 100644
|
|
|
|
|
obj.relative_url = < url relative to the mirror >
|
|
|
|
|
obj.url = < full url that failed >
|
|
|
|
|
# .url is just the combination of .mirror
|
|
|
|
|
@@ -263,7 +272,8 @@ class MirrorGroup:
|
|
|
|
|
@@ -251,6 +262,17 @@ class MirrorGroup:
|
|
|
|
|
self.default_action = None
|
|
|
|
|
self._process_kwargs(kwargs)
|
|
|
|
|
|
|
|
|
|
+ # use the same algorithm as parallel downloader to initially sort
|
|
|
|
|
+ # the mirror list (sort by speed, but prefer live private mirrors)
|
|
|
|
|
+ def estimate(m):
|
|
|
|
|
+ speed, fail = _TH.estimate(m['mirror'])
|
|
|
|
|
+ private = not fail and m.get('kwargs', {}).get('private', False)
|
|
|
|
|
+ return private, speed
|
|
|
|
|
+
|
|
|
|
|
+ # update the initial order. since sorting is stable, the relative
|
|
|
|
|
+ # order of unknown (not used yet) hosts is retained.
|
|
|
|
|
+ self.mirrors.sort(key=estimate, reverse=True)
|
|
|
|
|
+
|
|
|
|
|
# if these values are found in **kwargs passed to one of the urlXXX
|
|
|
|
|
# methods, they will be stripped before getting passed on to the
|
|
|
|
|
# grabber
|
|
|
|
|
@@ -263,7 +285,8 @@ class MirrorGroup:
|
|
|
|
|
def _parse_mirrors(self, mirrors):
|
|
|
|
|
parsed_mirrors = []
|
|
|
|
|
for m in mirrors:
|
|
|
|
@ -1779,7 +1820,7 @@ index dad410b..b17be17 100644
|
|
|
|
|
parsed_mirrors.append(m)
|
|
|
|
|
return parsed_mirrors
|
|
|
|
|
|
|
|
|
|
@@ -280,7 +290,9 @@ class MirrorGroup:
|
|
|
|
|
@@ -280,7 +303,9 @@ class MirrorGroup:
|
|
|
|
|
# return a random mirror so that multiple mirrors get used
|
|
|
|
|
# even without failures.
|
|
|
|
|
if not gr.mirrors:
|
|
|
|
@ -1790,7 +1831,7 @@ index dad410b..b17be17 100644
|
|
|
|
|
return gr.mirrors[gr._next]
|
|
|
|
|
|
|
|
|
|
def _failure(self, gr, cb_obj):
|
|
|
|
|
@@ -307,7 +319,9 @@ class MirrorGroup:
|
|
|
|
|
@@ -307,7 +332,9 @@ class MirrorGroup:
|
|
|
|
|
a.update(action)
|
|
|
|
|
action = a
|
|
|
|
|
self.increment_mirror(gr, action)
|
|
|
|
@ -1801,7 +1842,7 @@ index dad410b..b17be17 100644
|
|
|
|
|
|
|
|
|
|
def increment_mirror(self, gr, action={}):
|
|
|
|
|
"""Tell the mirror object increment the mirror index
|
|
|
|
|
@@ -377,35 +391,50 @@ class MirrorGroup:
|
|
|
|
|
@@ -377,35 +404,50 @@ class MirrorGroup:
|
|
|
|
|
gr.url = url
|
|
|
|
|
gr.kw = dict(kw)
|
|
|
|
|
self._load_gr(gr)
|
|
|
|
@ -1828,7 +1869,7 @@ index dad410b..b17be17 100644
|
|
|
|
|
+ return func_ref( *(fullurl,), opts=opts, **kw )
|
|
|
|
|
except URLGrabError, e:
|
|
|
|
|
if DEBUG: DEBUG.info('MIRROR: failed')
|
|
|
|
|
+ gr.errors.append((fullurl, str(e)))
|
|
|
|
|
+ gr.errors.append((fullurl, exception2msg(e)))
|
|
|
|
|
obj = CallbackObject()
|
|
|
|
|
obj.exception = e
|
|
|
|
|
obj.mirror = mirrorchoice['mirror']
|
|
|
|
|