|
|
|
@ -314,7 +314,7 @@ index 3e5f3b7..8eeaeda 100644
|
|
|
|
|
return (fb,lb)
|
|
|
|
|
|
|
|
|
|
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
|
|
|
|
|
index e090e90..05ea9c3 100644
|
|
|
|
|
index e090e90..6b409e3 100644
|
|
|
|
|
--- a/urlgrabber/grabber.py
|
|
|
|
|
+++ b/urlgrabber/grabber.py
|
|
|
|
|
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
@ -920,7 +920,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
if not self._prog_running:
|
|
|
|
|
if self.opts.progress_obj:
|
|
|
|
|
size = self.size + self._reget_length
|
|
|
|
|
@@ -1079,23 +1274,40 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1079,32 +1274,62 @@ class PyCurlFileObject():
|
|
|
|
|
self.opts.progress_obj.update(self._amount_read)
|
|
|
|
|
|
|
|
|
|
self._amount_read += len(buf)
|
|
|
|
@ -967,7 +967,10 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
elif self.scheme in ['ftp']:
|
|
|
|
|
s = None
|
|
|
|
|
if buf.startswith('213 '):
|
|
|
|
|
@@ -1104,7 +1316,18 @@ class PyCurlFileObject():
|
|
|
|
|
s = buf[3:].strip()
|
|
|
|
|
+ if len(s) >= 14:
|
|
|
|
|
+ s = None # ignore MDTM responses
|
|
|
|
|
elif buf.startswith('150 '):
|
|
|
|
|
s = parse150(buf)
|
|
|
|
|
if s:
|
|
|
|
|
self.size = int(s)
|
|
|
|
@ -987,7 +990,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
return len(buf)
|
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
|
return pycurl.READFUNC_ABORT
|
|
|
|
|
@@ -1113,8 +1336,10 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1113,8 +1338,10 @@ class PyCurlFileObject():
|
|
|
|
|
if self._parsed_hdr:
|
|
|
|
|
return self._parsed_hdr
|
|
|
|
|
statusend = self._hdr_dump.find('\n')
|
|
|
|
@ -998,7 +1001,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
self._parsed_hdr = mimetools.Message(hdrfp)
|
|
|
|
|
return self._parsed_hdr
|
|
|
|
|
|
|
|
|
|
@@ -1127,6 +1352,9 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1127,6 +1354,9 @@ class PyCurlFileObject():
|
|
|
|
|
if not opts:
|
|
|
|
|
opts = self.opts
|
|
|
|
|
|
|
|
|
@ -1008,7 +1011,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
|
|
|
|
|
# defaults we're always going to set
|
|
|
|
|
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
|
|
|
|
|
@@ -1136,11 +1364,21 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1136,11 +1366,21 @@ class PyCurlFileObject():
|
|
|
|
|
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
|
|
|
|
|
self.curl_obj.setopt(pycurl.FAILONERROR, True)
|
|
|
|
|
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
|
|
|
|
@ -1031,7 +1034,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
|
|
|
|
|
# maybe to be options later
|
|
|
|
|
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
|
|
|
|
|
@@ -1148,9 +1386,11 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1148,9 +1388,11 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
# timeouts
|
|
|
|
|
timeout = 300
|
|
|
|
@ -1046,7 +1049,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
|
|
|
|
|
# ssl options
|
|
|
|
|
if self.scheme == 'https':
|
|
|
|
|
@@ -1158,13 +1398,16 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1158,13 +1400,16 @@ class PyCurlFileObject():
|
|
|
|
|
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
|
|
|
|
|
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
|
|
|
|
|
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
|
|
|
|
@ -1064,7 +1067,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
if opts.ssl_cert_type:
|
|
|
|
|
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
|
|
|
|
|
if opts.ssl_key_pass:
|
|
|
|
|
@@ -1187,28 +1430,26 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1187,28 +1432,26 @@ class PyCurlFileObject():
|
|
|
|
|
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
|
|
|
|
|
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
|
|
|
|
|
|
|
|
|
@ -1109,7 +1112,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
|
|
|
|
|
# our url
|
|
|
|
|
self.curl_obj.setopt(pycurl.URL, self.url)
|
|
|
|
|
@@ -1228,39 +1469,26 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1228,39 +1471,26 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
code = self.http_code
|
|
|
|
|
errcode = e.args[0]
|
|
|
|
@ -1155,7 +1158,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
# this is probably wrong but ultimately this is what happens
|
|
|
|
|
# we have a legit http code and a pycurl 'writer failed' code
|
|
|
|
|
# which almost always means something aborted it from outside
|
|
|
|
|
@@ -1269,36 +1497,70 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1269,36 +1499,70 @@ class PyCurlFileObject():
|
|
|
|
|
# figure out what aborted the pycurl process FIXME
|
|
|
|
|
raise KeyboardInterrupt
|
|
|
|
|
|
|
|
|
@ -1251,7 +1254,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
|
|
|
|
|
def _do_open(self):
|
|
|
|
|
self.curl_obj = _curl_cache
|
|
|
|
|
@@ -1333,7 +1595,11 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1333,7 +1597,11 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
if self.opts.range:
|
|
|
|
|
rt = self.opts.range
|
|
|
|
@ -1264,7 +1267,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
|
|
|
|
|
if rt:
|
|
|
|
|
header = range_tuple_to_header(rt)
|
|
|
|
|
@@ -1434,21 +1700,46 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1434,21 +1702,46 @@ class PyCurlFileObject():
|
|
|
|
|
#fh, self._temp_name = mkstemp()
|
|
|
|
|
#self.fo = open(self._temp_name, 'wb')
|
|
|
|
|
|
|
|
|
@ -1318,7 +1321,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
else:
|
|
|
|
|
#self.fo = open(self._temp_name, 'r')
|
|
|
|
|
self.fo.seek(0)
|
|
|
|
|
@@ -1526,17 +1817,20 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1526,17 +1819,20 @@ class PyCurlFileObject():
|
|
|
|
|
if self._prog_running:
|
|
|
|
|
downloaded += self._reget_length
|
|
|
|
|
self.opts.progress_obj.update(downloaded)
|
|
|
|
@ -1344,7 +1347,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
|
|
|
|
|
msg = _("Downloaded more than max size for %s: %s > %s") \
|
|
|
|
|
% (self.url, cur, max_size)
|
|
|
|
|
@@ -1544,13 +1838,6 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1544,13 +1840,6 @@ class PyCurlFileObject():
|
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
@ -1358,7 +1361,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
def read(self, amt=None):
|
|
|
|
|
self._fill_buffer(amt)
|
|
|
|
|
if amt is None:
|
|
|
|
|
@@ -1582,9 +1869,21 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1582,9 +1871,21 @@ class PyCurlFileObject():
|
|
|
|
|
self.opts.progress_obj.end(self._amount_read)
|
|
|
|
|
self.fo.close()
|
|
|
|
|
|
|
|
|
@ -1381,7 +1384,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
|
# DEPRECATED FUNCTIONS
|
|
|
|
|
@@ -1621,6 +1920,482 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
|
|
|
|
|
@@ -1621,6 +1922,489 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
@ -1614,6 +1617,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
+
|
|
|
|
|
+ dl = _ExternalDownloaderPool()
|
|
|
|
|
+ host_con = {} # current host connection counts
|
|
|
|
|
+ single = set() # hosts in single connection mode
|
|
|
|
|
+
|
|
|
|
|
+ def start(opts, tries):
|
|
|
|
|
+ opts.tries = tries
|
|
|
|
@ -1660,6 +1664,10 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
+
|
|
|
|
|
+ if ug_err is None:
|
|
|
|
|
+ continue
|
|
|
|
|
+ if ug_err.errno == pycurl.E_OPERATION_TIMEOUTED:
|
|
|
|
|
+ # One possible cause is connection-limited server.
|
|
|
|
|
+ # Turn on the max_connections=1 override. BZ 853432
|
|
|
|
|
+ single.add(key)
|
|
|
|
|
+
|
|
|
|
|
+ retry = opts.retry or 0
|
|
|
|
|
+ if opts.failure_callback:
|
|
|
|
@ -1749,7 +1757,7 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
+
|
|
|
|
|
+ # update the current mirror and limit
|
|
|
|
|
+ key = best['mirror']
|
|
|
|
|
+ limit = best.get('kwargs', {}).get('max_connections', 2)
|
|
|
|
|
+ limit = best.get('kwargs', {}).get('max_connections') or 2
|
|
|
|
|
+ opts.async = key, limit
|
|
|
|
|
+
|
|
|
|
|
+ # update URL and proxy
|
|
|
|
@ -1760,6 +1768,8 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
+
|
|
|
|
|
+ # check host limit, then start
|
|
|
|
|
+ key, limit = opts.async
|
|
|
|
|
+ if key in single:
|
|
|
|
|
+ limit = 1
|
|
|
|
|
+ while host_con.get(key, 0) >= limit:
|
|
|
|
|
+ perform()
|
|
|
|
|
+ if DEBUG:
|
|
|
|
@ -1865,20 +1875,21 @@ index e090e90..05ea9c3 100644
|
|
|
|
|
def _main_test():
|
|
|
|
|
try: url, filename = sys.argv[1:3]
|
|
|
|
|
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
|
|
|
|
|
index dad410b..7975f1b 100644
|
|
|
|
|
index dad410b..5d3aa34 100644
|
|
|
|
|
--- a/urlgrabber/mirror.py
|
|
|
|
|
+++ b/urlgrabber/mirror.py
|
|
|
|
|
@@ -76,6 +76,9 @@ CUSTOMIZATION
|
|
|
|
|
@@ -76,6 +76,10 @@ CUSTOMIZATION
|
|
|
|
|
'grabber' is omitted, the default grabber will be used. If
|
|
|
|
|
kwargs are omitted, then (duh) they will not be used.
|
|
|
|
|
|
|
|
|
|
+ kwarg 'max_connections' limits the number of concurrent
|
|
|
|
|
+ connections to this mirror.
|
|
|
|
|
+ connections to this mirror. When omitted or set to zero,
|
|
|
|
|
+ the default limit (2) will be used.
|
|
|
|
|
+
|
|
|
|
|
3) Pass keyword arguments when instantiating the mirror group.
|
|
|
|
|
See, for example, the failure_callback argument.
|
|
|
|
|
|
|
|
|
|
@@ -87,10 +90,14 @@ CUSTOMIZATION
|
|
|
|
|
@@ -87,10 +91,14 @@ CUSTOMIZATION
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -1894,7 +1905,7 @@ index dad410b..7975f1b 100644
|
|
|
|
|
|
|
|
|
|
def _(st):
|
|
|
|
|
return st
|
|
|
|
|
@@ -126,7 +133,9 @@ class MirrorGroup:
|
|
|
|
|
@@ -126,7 +134,9 @@ class MirrorGroup:
|
|
|
|
|
files)
|
|
|
|
|
|
|
|
|
|
* if the local list is ever exhausted, a URLGrabError will be
|
|
|
|
@ -1905,7 +1916,7 @@ index dad410b..7975f1b 100644
|
|
|
|
|
|
|
|
|
|
OPTIONS
|
|
|
|
|
|
|
|
|
|
@@ -153,7 +162,8 @@ class MirrorGroup:
|
|
|
|
|
@@ -153,7 +163,8 @@ class MirrorGroup:
|
|
|
|
|
|
|
|
|
|
The 'fail' option will cause immediate failure by re-raising
|
|
|
|
|
the exception and no further attempts to get the current
|
|
|
|
@ -1915,7 +1926,7 @@ index dad410b..7975f1b 100644
|
|
|
|
|
|
|
|
|
|
This dict can be set at instantiation time,
|
|
|
|
|
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
|
|
|
|
|
@@ -184,6 +194,7 @@ class MirrorGroup:
|
|
|
|
|
@@ -184,6 +195,7 @@ class MirrorGroup:
|
|
|
|
|
|
|
|
|
|
obj.exception = < exception that was raised >
|
|
|
|
|
obj.mirror = < the mirror that was tried >
|
|
|
|
@ -1923,7 +1934,7 @@ index dad410b..7975f1b 100644
|
|
|
|
|
obj.relative_url = < url relative to the mirror >
|
|
|
|
|
obj.url = < full url that failed >
|
|
|
|
|
# .url is just the combination of .mirror
|
|
|
|
|
@@ -251,6 +262,17 @@ class MirrorGroup:
|
|
|
|
|
@@ -251,6 +263,17 @@ class MirrorGroup:
|
|
|
|
|
self.default_action = None
|
|
|
|
|
self._process_kwargs(kwargs)
|
|
|
|
|
|
|
|
|
@ -1941,7 +1952,7 @@ index dad410b..7975f1b 100644
|
|
|
|
|
# if these values are found in **kwargs passed to one of the urlXXX
|
|
|
|
|
# methods, they will be stripped before getting passed on to the
|
|
|
|
|
# grabber
|
|
|
|
|
@@ -263,7 +285,8 @@ class MirrorGroup:
|
|
|
|
|
@@ -263,7 +286,8 @@ class MirrorGroup:
|
|
|
|
|
def _parse_mirrors(self, mirrors):
|
|
|
|
|
parsed_mirrors = []
|
|
|
|
|
for m in mirrors:
|
|
|
|
@ -1951,7 +1962,7 @@ index dad410b..7975f1b 100644
|
|
|
|
|
parsed_mirrors.append(m)
|
|
|
|
|
return parsed_mirrors
|
|
|
|
|
|
|
|
|
|
@@ -280,7 +303,9 @@ class MirrorGroup:
|
|
|
|
|
@@ -280,7 +304,9 @@ class MirrorGroup:
|
|
|
|
|
# return a random mirror so that multiple mirrors get used
|
|
|
|
|
# even without failures.
|
|
|
|
|
if not gr.mirrors:
|
|
|
|
@ -1962,7 +1973,7 @@ index dad410b..7975f1b 100644
|
|
|
|
|
return gr.mirrors[gr._next]
|
|
|
|
|
|
|
|
|
|
def _failure(self, gr, cb_obj):
|
|
|
|
|
@@ -307,7 +332,9 @@ class MirrorGroup:
|
|
|
|
|
@@ -307,7 +333,9 @@ class MirrorGroup:
|
|
|
|
|
a.update(action)
|
|
|
|
|
action = a
|
|
|
|
|
self.increment_mirror(gr, action)
|
|
|
|
@ -1973,7 +1984,7 @@ index dad410b..7975f1b 100644
|
|
|
|
|
|
|
|
|
|
def increment_mirror(self, gr, action={}):
|
|
|
|
|
"""Tell the mirror object increment the mirror index
|
|
|
|
|
@@ -377,35 +404,50 @@ class MirrorGroup:
|
|
|
|
|
@@ -377,35 +405,50 @@ class MirrorGroup:
|
|
|
|
|
gr.url = url
|
|
|
|
|
gr.kw = dict(kw)
|
|
|
|
|
self._load_gr(gr)
|
|
|
|
|