@@ -233,10 +233,10 @@ index 3e5f3b7..8eeaeda 100644
return (fb,lb)
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..bdcdfe3 100644
index e090e90..ffd5a10 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs)
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
progress_obj = None
a class instance that supports the following methods:
@@ -245,7 +245,26 @@ index e090e90..bdcdfe3 100644
# length will be None if unknown
po.update(read) # read == bytes read so far
po.end()
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
+ multi_progress_obj = None
+
+ a class instance that supports the following methods:
+ mo.start(total_files, total_size)
+ mo.newMeter() => meter
+ mo.removeMeter(meter)
+ mo.end()
+
+ The 'meter' object is similar to progress_obj, but multiple
+ instances may be created and updated at the same time.
+
+ When downloading multiple files in parallel and multi_progress_obj
+ is None, progress_obj is used in compatibility mode: finished files
+ are shown but there's no in-progress display.
+
text = None
specifies alternative text to be passed to the progress meter
@@ -68,14 +83,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
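For reference, a minimal sketch of an object satisfying the multi_progress_obj interface documented above. Only start/newMeter/removeMeter/end come from the patch; the per-file meter methods (start/update/end/failure) are inferred from how parallel_wait() drives them later in this diff:

    class SimpleMeter:
        # Hypothetical per-file meter; the method set is inferred from
        # the parallel_wait() hunks below, not spelled out by the patch.
        def start(self, text=None, **kwargs):
            self.text = text
        def update(self, amount_read):
            pass                     # redraw this file's progress line
        def end(self, amount_read=None):
            pass                     # file finished
        def failure(self, message):
            pass                     # file failed; message is the error text

    class SimpleMultiFileMeter:
        def start(self, total_files, total_size):
            self.total_files, self.total_size = total_files, total_size
            self.meters = []
        def newMeter(self):
            m = SimpleMeter()
            self.meters.append(m)
            return m
        def removeMeter(self, meter):
            self.meters.remove(meter)
        def end(self):
            self.meters = []         # all downloads done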
@@ -267,7 +286,7 @@ index e090e90..bdcdfe3 100644
bandwidth = 0
@@ -143,8 +143,12 @@ GENERAL ARGUMENTS (kwargs)
@@ -143,8 +158,12 @@ GENERAL ARGUMENTS (kwargs)
note that proxy authentication information may be provided using
normal URL constructs:
proxies={ 'http' : 'http://user:host@foo:3128' }
@@ -282,7 +301,7 @@ index e090e90..bdcdfe3 100644
prefix = None
@@ -198,6 +202,12 @@ GENERAL ARGUMENTS (kwargs)
@@ -198,6 +217,12 @@ GENERAL ARGUMENTS (kwargs)
control, you should probably subclass URLParser and pass it in via
the 'urlparser' option.
@@ -295,7 +314,7 @@ index e090e90..bdcdfe3 100644
ssl_ca_cert = None
this option can be used if M2Crypto is available and will be
@@ -211,43 +221,75 @@ GENERAL ARGUMENTS (kwargs)
@@ -211,43 +236,75 @@ GENERAL ARGUMENTS (kwargs)
No-op when using the curl backend (default)
@@ -380,7 +399,7 @@ index e090e90..bdcdfe3 100644
RETRY RELATED ARGUMENTS
@@ -328,6 +370,15 @@ RETRY RELATED ARGUMENTS
@@ -328,6 +385,15 @@ RETRY RELATED ARGUMENTS
but it cannot (without severe trickiness) prevent the exception
from being raised.
@@ -396,7 +415,7 @@ index e090e90..bdcdfe3 100644
interrupt_callback = None
This callback is called if KeyboardInterrupt is received at any
@@ -420,6 +471,7 @@ import time
@@ -420,6 +486,7 @@ import time
import string
import urllib
import urllib2
@@ -404,7 +423,7 @@ index e090e90..bdcdfe3 100644
import mimetools
import thread
import types
@@ -428,9 +480,17 @@ import pycurl
@@ -428,9 +495,17 @@ import pycurl
from ftplib import parse150
from StringIO import StringIO
from httplib import HTTPException
@@ -423,7 +442,7 @@ index e090e90..bdcdfe3 100644
########################################################################
# MODULE INITIALIZATION
########################################################################
@@ -439,6 +499,12 @@ try:
@@ -439,6 +514,12 @@ try:
except:
__version__ = '???'
@@ -436,7 +455,7 @@ index e090e90..bdcdfe3 100644
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -527,6 +593,22 @@ def _(st):
@@ -527,6 +608,22 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################
@@ -459,7 +478,7 @@ index e090e90..bdcdfe3 100644
class URLGrabError(IOError):
@@ -662,6 +744,7 @@ class URLParser:
@@ -662,6 +759,7 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
@@ -467,7 +486,7 @@ index e090e90..bdcdfe3 100644
quote = opts.quote
if opts.prefix:
@@ -768,6 +851,41 @@ class URLGrabberOptions:
@@ -768,6 +866,41 @@ class URLGrabberOptions:
else: # throttle is a float
return self.bandwidth * self.throttle
@@ -509,7 +528,13 @@ index e090e90..bdcdfe3 100644
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
@@ -796,25 +914,31 @@ class URLGrabberOptions:
@@ -791,30 +924,37 @@ class URLGrabberOptions:
provided here.
"""
self.progress_obj = None
+ self.multi_progress_obj = None
self.throttle = 1.0
self.bandwidth = 0
self.retry = None
self.retrycodes = [-1,2,4,5,6,7]
self.checkfunc = None
@@ -542,7 +567,7 @@ index e090e90..bdcdfe3 100644
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticity
@@ -827,6 +951,12 @@ class URLGrabberOptions:
@@ -827,6 +967,12 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
@@ -555,7 +580,7 @@ index e090e90..bdcdfe3 100644
def __repr__(self):
return self.format()
@@ -846,7 +976,18 @@ class URLGrabberOptions:
@@ -846,7 +992,18 @@ class URLGrabberOptions:
s = s + indent + '}'
return s
@@ -575,7 +600,7 @@ index e090e90..bdcdfe3 100644
"""Provides easy opening of URLs with a variety of options.
All options are specified as kwargs. Options may be specified when
@@ -872,7 +1013,6 @@ class URLGrabber:
@@ -872,7 +1029,6 @@ class URLGrabber:
# beware of infinite loops :)
tries = tries + 1
exception = None
@@ -583,7 +608,7 @@ index e090e90..bdcdfe3 100644
callback = None
if DEBUG: DEBUG.info('attempt %i/%s: %s',
tries, opts.retry, args[0])
@@ -883,23 +1023,24 @@ class URLGrabber:
@@ -883,54 +1039,62 @@ class URLGrabber:
except URLGrabError, e:
exception = e
callback = opts.failure_callback
@@ -611,24 +636,36 @@ index e090e90..bdcdfe3 100644
if (retrycode is not None) and (retrycode not in opts.retrycodes):
if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
retrycode, opts.retrycodes)
@@ -912,9 +1053,11 @@ class URLGrabber:
raise
- def urlopen(self, url, **kwargs):
+ def urlopen(self, url, opts=None, **kwargs):
"""open the url and return a file object
If a progress object or throttle value specified when this
object was created, then a special file object will be
returned that supports them. The file object can be treated
like any other file object.
"""
- opts = self.opts.derive(**kwargs)
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
+ opts = (opts or self.opts).derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
def retryfunc(opts, url):
return PyCurlFileObject(url, filename=None, opts=opts)
return self._retry(opts, retryfunc, url)
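The new opts argument lets a caller (MirrorGroup, later in this diff) hand in a pre-built URLGrabberOptions object; per-call kwargs are still layered on top through derive(). A rough usage sketch (URL and option values are illustrative):

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(retry=3)
    base = g.opts.derive(timeout=30)       # pre-derived options object
    # same effect as g.urlopen(url, timeout=30, text='pkg'):
    fo = g.urlopen('http://example.com/some/file', opts=base, text='pkg')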
@@ -925,12 +1068,17 @@ class URLGrabber:
- def urlgrab(self, url, filename=None, **kwargs):
+ def urlgrab(self, url, filename=None, opts=None, **kwargs):
"""grab the file at <url> and make a local copy at <filename>
If filename is none, the basename of the url is used.
urlgrab returns the filename of the local file, which may be
different from the passed-in filename if copy_local == 0.
"""
- opts = self.opts.derive(**kwargs)
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
+ opts = (opts or self.opts).derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
@@ -641,7 +678,7 @@ index e090e90..bdcdfe3 100644
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
@@ -950,30 +1098,36 @@ class URLGrabber:
@@ -950,41 +1114,49 @@ class URLGrabber:
elif not opts.range:
if not opts.checkfunc is None:
@@ -689,21 +726,24 @@ index e090e90..bdcdfe3 100644
+ opts.exception = e
+ return _run_callback(opts.failfunc, opts)
def urlread(self, url, limit=None, **kwargs):
- def urlread(self, url, limit=None, **kwargs):
+ def urlread(self, url, limit=None, opts=None, **kwargs):
"""read the url into a string, up to 'limit' bytes
@@ -982,9 +1136,11 @@ class URLGrabber:
If the limit is exceeded, an exception will be thrown. Note
that urlread is NOT intended to be used as a way of saying
"I want the first N bytes" but rather 'read the whole file
into memory, but don't use too much'
"""
- opts = self.opts.derive(**kwargs)
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
+ opts = (opts or self.opts).derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
if limit is not None:
limit = limit + 1
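The limit is bumped by one so the read can tell "exactly limit bytes" from "more than limit bytes" and raise in the latter case. A usage sketch (URL illustrative):

    from urlgrabber.grabber import default_grabber, URLGrabError

    try:
        data = default_grabber.urlread('http://example.com/repomd.xml',
                                       limit=1024 * 1024)  # at most 1 MB
    except URLGrabError, e:
        print 'fetch failed or file exceeded the limit:', e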
@@ -1000,12 +1156,8 @@ class URLGrabber:
@@ -1000,12 +1172,8 @@ class URLGrabber:
else: s = fo.read(limit)
if not opts.checkfunc is None:
@@ -718,7 +758,7 @@ index e090e90..bdcdfe3 100644
finally:
fo.close()
return s
@@ -1020,6 +1172,7 @@ class URLGrabber:
@@ -1020,6 +1188,7 @@ class URLGrabber:
return s
def _make_callback(self, callback_obj):
@@ -726,7 +766,7 @@ index e090e90..bdcdfe3 100644
if callable(callback_obj):
return callback_obj, (), {}
else:
@@ -1030,7 +1183,7 @@ class URLGrabber:
@@ -1030,7 +1199,7 @@ class URLGrabber:
default_grabber = URLGrabber()
@@ -735,7 +775,7 @@ index e090e90..bdcdfe3 100644
def __init__(self, url, filename, opts):
self.fo = None
self._hdr_dump = ''
@@ -1052,10 +1205,11 @@ class PyCurlFileObject():
@@ -1052,10 +1221,11 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
@@ -749,7 +789,7 @@ index e090e90..bdcdfe3 100644
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
@@ -1085,9 +1239,14 @@ class PyCurlFileObject():
@@ -1085,9 +1255,14 @@ class PyCurlFileObject():
return -1
def _hdr_retrieve(self, buf):
@@ -765,7 +805,7 @@ index e090e90..bdcdfe3 100644
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
@@ -1104,7 +1263,17 @@ class PyCurlFileObject():
@@ -1104,7 +1279,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
@@ -784,7 +824,7 @@ index e090e90..bdcdfe3 100644
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1113,8 +1282,10 @@ class PyCurlFileObject():
@@ -1113,8 +1298,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
@@ -795,7 +835,7 @@ index e090e90..bdcdfe3 100644
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
@@ -1127,6 +1298,9 @@ class PyCurlFileObject():
@@ -1127,6 +1314,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts
@@ -805,7 +845,7 @@ index e090e90..bdcdfe3 100644
# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1310,21 @@ class PyCurlFileObject():
@@ -1136,11 +1326,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -827,7 +867,7 @@ index e090e90..bdcdfe3 100644
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1332,11 @@ class PyCurlFileObject():
@@ -1148,9 +1348,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
@@ -842,7 +882,7 @@ index e090e90..bdcdfe3 100644
# ssl options
if self.scheme == 'https':
@@ -1158,13 +1344,16 @@ class PyCurlFileObject():
@@ -1158,13 +1360,16 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -860,7 +900,7 @@ index e090e90..bdcdfe3 100644
if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
@@ -1187,28 +1376,26 @@ class PyCurlFileObject():
@@ -1187,28 +1392,26 @@ class PyCurlFileObject():
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
@@ -905,7 +945,7 @@ index e090e90..bdcdfe3 100644
# our url
self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,12 +1415,14 @@ class PyCurlFileObject():
@@ -1228,12 +1431,14 @@ class PyCurlFileObject():
code = self.http_code
errcode = e.args[0]
@@ -922,7 +962,7 @@ index e090e90..bdcdfe3 100644
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
@@ -1244,23 +1433,23 @@ class PyCurlFileObject():
@@ -1244,23 +1449,23 @@ class PyCurlFileObject():
raise KeyboardInterrupt
elif errcode == 28:
@@ -953,7 +993,7 @@ index e090e90..bdcdfe3 100644
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
@@ -1272,33 +1461,94 @@ class PyCurlFileObject():
@@ -1272,33 +1477,94 @@ class PyCurlFileObject():
elif errcode == 58:
msg = _("problem with the local client certificate")
err = URLGrabError(14, msg)
@@ -1055,7 +1095,7 @@ index e090e90..bdcdfe3 100644
def _do_open(self):
self.curl_obj = _curl_cache
@@ -1333,7 +1583,11 @@ class PyCurlFileObject():
@@ -1333,7 +1599,11 @@ class PyCurlFileObject():
if self.opts.range:
rt = self.opts.range
@@ -1068,7 +1108,7 @@ index e090e90..bdcdfe3 100644
if rt:
header = range_tuple_to_header(rt)
@@ -1434,21 +1688,46 @@ class PyCurlFileObject():
@@ -1434,21 +1704,46 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
@@ -1122,7 +1162,7 @@ index e090e90..bdcdfe3 100644
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
@@ -1526,17 +1805,20 @@ class PyCurlFileObject():
@@ -1526,17 +1821,20 @@ class PyCurlFileObject():
if self._prog_running:
downloaded += self._reget_length
self.opts.progress_obj.update(downloaded)
@@ -1148,7 +1188,7 @@ index e090e90..bdcdfe3 100644
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
@@ -1544,13 +1826,6 @@ class PyCurlFileObject():
@@ -1544,13 +1842,6 @@ class PyCurlFileObject():
return True
return False
@@ -1162,7 +1202,7 @@ index e090e90..bdcdfe3 100644
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
@@ -1582,9 +1857,21 @@ class PyCurlFileObject():
@@ -1582,9 +1873,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
@@ -1185,7 +1225,7 @@ index e090e90..bdcdfe3 100644
#####################################################################
# DEPRECATED FUNCTIONS
@@ -1621,6 +1908,445 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
@@ -1621,6 +1924,458 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
#####################################################################
@@ -1309,7 +1349,7 @@ index e090e90..bdcdfe3 100644
+ v = getattr(opts, k)
+ if v is None: continue
+ arg.append('%s=%s' % (k, _dumps(v)))
+ if opts.progress_obj:
+ if opts.progress_obj and opts.multi_progress_obj:
+ arg.append('progress_obj=True')
+ arg = ' '.join(arg)
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
@@ -1329,7 +1369,7 @@ index e090e90..bdcdfe3 100644
+ line = line.split(' ', 5)
+ _id, size = map(int, line[:2])
+ if len(line) == 2:
+ self.running[_id].progress_obj.update(size)
+ self.running[_id]._progress.update(size)
+ continue
+ # job done
+ opts = self.running.pop(_id)
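As the parsing above suggests, the external downloader process reports one space-separated status line per event: a two-field line "<id> <size>" is a progress update, anything longer marks the job as done. A standalone sketch of that framing; the meaning of any fields past the first two is an assumption, only the id and byte count are evident here:

    def parse_status_line(line):
        fields = line.split(' ', 5)
        _id, size = map(int, fields[:2])
        if len(fields) == 2:
            return 'progress', _id, size       # periodic byte-count update
        return 'done', _id, size, fields[2:]   # completion; extra fields assumed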
@@ -1398,19 +1438,20 @@ index e090e90..bdcdfe3 100644
+
+_async_queue = []
+
+def parallel_wait(meter = 'text'):
+def parallel_wait(meter=None):
+ '''Process queued requests in parallel.
+ '''
+
+ if meter:
+ count = total = 0
+ # calculate total sizes
+ meters = {}
+ for opts in _async_queue:
+ if opts.progress_obj:
+ count += 1
+ total += opts.size
+ if meter == 'text':
+ from progress import TextMultiFileMeter
+ meter = TextMultiFileMeter()
+ if opts.progress_obj and opts.multi_progress_obj:
+ count, total = meters.get(opts.multi_progress_obj) or (0, 0)
+ meters[opts.multi_progress_obj] = count + 1, total + opts.size
+
+ # start multi-file meters
+ for meter in meters:
+ count, total = meters[meter]
+ meter.start(count, total)
+
+ dl = _ExternalDownloaderPool()
@@ -1420,11 +1461,12 @@ index e090e90..bdcdfe3 100644
+ key, limit = opts.async
+ host_con[key] = host_con.get(key, 0) + 1
+ opts.tries = tries
+ if meter and opts.progress_obj:
+ opts.progress_obj = meter.newMeter()
+ opts.progress_obj.start(text=opts.text, basename=os.path.basename(opts.filename))
+ if opts.progress_obj:
+ if opts.multi_progress_obj:
+ opts._progress = opts.multi_progress_obj.newMeter()
+ opts._progress.start(text=opts.text)
+ else:
+ opts.progress_obj = None
+ opts._progress = time.time() # no updates
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+ dl.start(opts)
+
@@ -1432,15 +1474,16 @@ index e090e90..bdcdfe3 100644
+ for opts, size, ug_err in dl.perform():
+ key, limit = opts.async
+ host_con[key] -= 1
+ m = opts.progress_obj
+ if m:
+ if ug_err:
+ m.failure(ug_err.args[1])
+ if opts.progress_obj:
+ if opts.multi_progress_obj:
+ opts.multi_progress_obj.re.total += size - opts.size # correct totals
+ opts._progress.end(size)
+ opts.multi_progress_obj.removeMeter(opts._progress)
+ else:
+ # file size might have changed
+ meter.re.total += size - opts.size
+ m.end(size)
+ meter.removeMeter(m)
+ opts.progress_obj.start(text=opts.text, now=opts._progress)
+ opts.progress_obj.update(size)
+ opts.progress_obj.end(size)
+ del opts._progress
+
+ if ug_err is None:
+ if opts.checkfunc:
@@ -1460,13 +1503,15 @@ index e090e90..bdcdfe3 100644
+ continue
+
+ if opts.mirror_group:
+ mg, failed, removed = opts.mirror_group
+ mg, errors, failed, removed = opts.mirror_group
+ errors.append((opts.url, str(ug_err)))
+ failed[key] = failed.get(key, 0) + 1
+ opts.mirror = key
+ opts.exception = ug_err
+ action = mg.default_action or {}
+ if mg.failure_callback:
+ opts.tries = sum(failed.values())
+ opts.tries = len(errors)
+ action = dict(action) # update only the copy
+ action.update(_run_callback(mg.failure_callback, opts))
+ if not action.get('fail', 0):
+ # mask this mirror and retry
@@ -1474,6 +1519,8 @@ index e090e90..bdcdfe3 100644
+ removed.add(key)
+ _async_queue.append(opts)
+ continue
+ # fail=1 from callback
+ ug_err.errors = errors
+
+ # urlgrab failed
+ opts.exception = ug_err
@@ -1494,11 +1541,11 @@ index e090e90..bdcdfe3 100644
+ idx += 1
+
+ # check global limit
+ while len(dl.running) >= opts.max_connections:
+ while len(dl.running) >= default_grabber.opts.max_connections:
+ perform()
+
+ if opts.mirror_group:
+ mg, failed, removed = opts.mirror_group
+ mg, errors, failed, removed = opts.mirror_group
+
+ # find the best mirror
+ best = None
@@ -1519,9 +1566,14 @@ index e090e90..bdcdfe3 100644
+
+ if best is None:
+ opts.exception = URLGrabError(256, _('No more mirrors to try.'))
+ opts.exception.errors = errors
+ _run_callback(opts.failfunc, opts)
+ continue
+
+ # update the grabber object, apply mirror kwargs
+ grabber = best.get('grabber') or mg.grabber
+ opts.delegate = grabber.opts.derive(**best.get('kwargs', {}))
+
+ # update the current mirror and limit
+ key = best['mirror']
+ limit = best.get('kwargs', {}).get('max_connections', 2)
@@ -1544,7 +1596,8 @@ index e090e90..bdcdfe3 100644
+
+ finally:
+ dl.abort()
+ if meter: meter.end()
+ for meter in meters:
+ meter.end()
+ del _async_queue[:]
+ _TH.save()
+
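Taken together, a client queues requests with the async option and then drains the queue; a usage sketch, with illustrative hostnames (the async=(key, limit) option and parallel_wait() are what this patch adds):

    from urlgrabber.grabber import urlgrab, parallel_wait

    for name in ('a.rpm', 'b.rpm', 'c.rpm'):
        # async=(key, limit): at most 2 concurrent connections for this key
        urlgrab('http://mirror.example.com/repo/' + name, filename=name,
                async=('mirror.example.com', 2))
    parallel_wait()   # blocks until every queued download finished or failed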
@@ -1632,20 +1685,24 @@ index e090e90..bdcdfe3 100644
def _main_test():
try: url, filename = sys.argv[1:3]
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index dad410b..ac78b34 100644
index dad410b..b17be17 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -76,6 +76,9 @@ CUSTOMIZATION
'grabber' is omitted, the default grabber will be used. If
kwargs are omitted, then (duh) they will not be used.
+ kwarg 'max_connections' is used to store the max connection
+ limit of this mirror.
+ kwarg 'max_connections' limits the number of concurrent
+ connections to this mirror.
+
3) Pass keyword arguments when instantiating the mirror group.
See, for example, the failure_callback argument.
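A mirror list carrying the new kwarg might look like this (hosts illustrative):

    from urlgrabber.grabber import default_grabber
    from urlgrabber.mirror import MirrorGroup

    mirrors = [
        {'mirror': 'http://mirror1.example.com/repo/',
         'kwargs': {'max_connections': 4}},
        {'mirror': 'http://mirror2.example.com/repo/'},   # default limit
    ]
    mg = MirrorGroup(default_grabber, mirrors)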
@@ -90,7 +93,8 @@ CUSTOMIZATION
@@ -87,10 +90,12 @@ CUSTOMIZATION
"""
+import sys
import random
import thread # needed for locking to make this threadsafe
@@ -1655,7 +1712,28 @@ index dad410b..ac78b34 100644
def _(st):
return st
@@ -184,6 +188,7 @@ class MirrorGroup:
@@ -126,7 +131,9 @@ class MirrorGroup:
files)
* if the local list is ever exhausted, a URLGrabError will be
- raised (errno=256, no more mirrors)
+ raised (errno=256, No more mirrors). The 'errors' attribute
+ holds a list of (full_url, errmsg) tuples. This contains
+ all URLs tried and the corresponding error messages.
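Continuing the MirrorGroup sketch above, a caller can inspect the new attribute when the mirror list is exhausted:

    from urlgrabber.grabber import URLGrabError

    try:
        mg.urlgrab('repodata/repomd.xml', filename='repomd.xml')
    except URLGrabError, e:
        if e.errno == 256:
            for url, msg in e.errors:    # every URL tried, with its error
                print '%s -> %s' % (url, msg)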
OPTIONS
@@ -153,7 +160,8 @@ class MirrorGroup:
The 'fail' option will cause immediate failure by re-raising
the exception and no further attempts to get the current
- download.
+ download. As in the "No more mirrors" case, the 'errors'
+ attribute is set in the exception object.
This dict can be set at instantiation time,
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
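A failure_callback sketch using these actions; the callback receives the callback object described below, and returning {'fail': 0} keeps trying the remaining mirrors while {'fail': 1} re-raises at once:

    def on_mirror_failure(cb_obj):
        print 'mirror %s failed: %s' % (cb_obj.mirror, cb_obj.exception)
        return {'fail': 0}    # keep going; exhausting the list still raises

    mg = MirrorGroup(default_grabber, mirrors,
                     failure_callback=on_mirror_failure)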
@@ -184,6 +192,7 @@ class MirrorGroup:
obj.exception = < exception that was raised >
obj.mirror = < the mirror that was tried >
@@ -1663,7 +1741,7 @@ index dad410b..ac78b34 100644
obj.relative_url = < url relative to the mirror >
obj.url = < full url that failed >
# .url is just the combination of .mirror
@@ -263,7 +268,8 @@ class MirrorGroup:
@@ -263,7 +272,8 @@ class MirrorGroup:
def _parse_mirrors(self, mirrors):
parsed_mirrors = []
for m in mirrors:
@@ -1673,7 +1751,35 @@ index dad410b..ac78b34 100644
parsed_mirrors.append(m)
return parsed_mirrors
@@ -382,7 +388,9 @@ class MirrorGroup:
@@ -280,7 +290,9 @@ class MirrorGroup:
# return a random mirror so that multiple mirrors get used
# even without failures.
if not gr.mirrors:
- raise URLGrabError(256, _('No more mirrors to try.'))
+ e = URLGrabError(256, _('No more mirrors to try.'))
+ e.errors = gr.errors
+ raise e
return gr.mirrors[gr._next]
def _failure(self, gr, cb_obj):
@@ -307,7 +319,9 @@ class MirrorGroup:
a.update(action)
action = a
self.increment_mirror(gr, action)
- if action and action.get('fail', 0): raise
+ if action and action.get('fail', 0):
+ sys.exc_info()[1].errors = gr.errors
+ raise
def increment_mirror(self, gr, action={}):
"""Tell the mirror object increment the mirror index
@@ -377,35 +391,50 @@ class MirrorGroup:
gr.url = url
gr.kw = dict(kw)
self._load_gr(gr)
+ gr.errors = []
for k in self.options:
try: del kw[k]
except KeyError: pass
@@ -1682,8 +1788,21 @@ index dad410b..ac78b34 100644
+ tries += 1
mirrorchoice = self._get_mirror(gr)
fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
kwargs = dict(mirrorchoice.get('kwargs', {}))
@@ -399,13 +407,24 @@ class MirrorGroup:
- kwargs = dict(mirrorchoice.get('kwargs', {}))
- kwargs.update(kw)
grabber = mirrorchoice.get('grabber') or self.grabber
+ # apply mirrorchoice kwargs on top of grabber.opts
+ opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {}))
func_ref = getattr(grabber, func)
if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
try:
- return func_ref( *(fullurl,), **kwargs )
+ return func_ref( *(fullurl,), opts=opts, **kw )
except URLGrabError, e:
if DEBUG: DEBUG.info('MIRROR: failed')
+ gr.errors.append((fullurl, str(e)))
obj = CallbackObject()
obj.exception = e
obj.mirror = mirrorchoice['mirror']
obj.relative_url = gr.url
obj.url = fullurl
@@ -1695,7 +1814,7 @@ index dad410b..ac78b34 100644
kw['filename'] = filename
+ if kw.get('async'):
+ # enable mirror failovers in async path
+ kw['mirror_group'] = self, {}, set()
+ kw['mirror_group'] = self, [], {}, set()
+ kw['relative_url'] = url
+ else:
+ kw.pop('failfunc', None)