@@ -236,7 +236,7 @@ index 3e5f3b7..8eeaeda 100644
return (fb,lb)

diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..74a692c 100644
index e090e90..78c2e59 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
@@ -458,7 +458,26 @@ index e090e90..74a692c 100644
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -527,6 +608,29 @@ def _(st):
@@ -504,6 +585,7 @@ def _init_default_logger(logspec=None):
else: handler = logging.FileHandler(filename)
handler.setFormatter(formatter)
DBOBJ = logging.getLogger('urlgrabber')
+ DBOBJ.propagate = False
DBOBJ.addHandler(handler)
DBOBJ.setLevel(level)
except (KeyError, ImportError, ValueError):
@@ -512,8 +594,8 @@ def _init_default_logger(logspec=None):

def _log_package_state():
if not DEBUG: return
- DEBUG.info('urlgrabber version = %s' % __version__)
- DEBUG.info('trans function "_" = %s' % _)
+ DEBUG.debug('urlgrabber version = %s' % __version__)
+ DEBUG.debug('trans function "_" = %s' % _)

_init_default_logger()
_log_package_state()
@@ -527,6 +609,29 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################

@@ -488,7 +507,7 @@ index e090e90..74a692c 100644


class URLGrabError(IOError):
@@ -662,6 +766,7 @@ class URLParser:
@@ -662,6 +767,7 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
@@ -496,7 +515,7 @@ index e090e90..74a692c 100644
quote = opts.quote

if opts.prefix:
@@ -768,6 +873,41 @@ class URLGrabberOptions:
@@ -768,6 +874,41 @@ class URLGrabberOptions:
else: # throttle is a float
return self.bandwidth * self.throttle

@@ -538,7 +557,7 @@ index e090e90..74a692c 100644
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
@@ -791,30 +931,37 @@ class URLGrabberOptions:
@@ -791,30 +932,37 @@ class URLGrabberOptions:
provided here.
"""
self.progress_obj = None
@@ -577,7 +596,7 @@ index e090e90..74a692c 100644
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticityb
@@ -827,6 +974,12 @@ class URLGrabberOptions:
@@ -827,6 +975,12 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
@@ -590,7 +609,7 @@ index e090e90..74a692c 100644

def __repr__(self):
return self.format()
@@ -846,7 +999,18 @@ class URLGrabberOptions:
@@ -846,7 +1000,18 @@ class URLGrabberOptions:
s = s + indent + '}'
return s

@@ -610,7 +629,7 @@ index e090e90..74a692c 100644
"""Provides easy opening of URLs with a variety of options.

All options are specified as kwargs. Options may be specified when
@@ -872,7 +1036,6 @@ class URLGrabber:
@@ -872,7 +1037,6 @@ class URLGrabber:
# beware of infinite loops :)
tries = tries + 1
exception = None
@@ -618,7 +637,7 @@ index e090e90..74a692c 100644
callback = None
if DEBUG: DEBUG.info('attempt %i/%s: %s',
tries, opts.retry, args[0])
@@ -883,54 +1046,62 @@ class URLGrabber:
@@ -883,54 +1047,62 @@ class URLGrabber:
except URLGrabError, e:
exception = e
callback = opts.failure_callback
@@ -688,7 +707,7 @@ index e090e90..74a692c 100644
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
@@ -950,41 +1121,51 @@ class URLGrabber:
@@ -950,41 +1122,51 @@ class URLGrabber:

elif not opts.range:
if not opts.checkfunc is None:
@@ -755,7 +774,7 @@ index e090e90..74a692c 100644
if limit is not None:
limit = limit + 1

@@ -1000,12 +1181,8 @@ class URLGrabber:
@@ -1000,12 +1182,8 @@ class URLGrabber:
else: s = fo.read(limit)

if not opts.checkfunc is None:
@@ -770,7 +789,7 @@ index e090e90..74a692c 100644
finally:
fo.close()
return s
@@ -1020,6 +1197,7 @@ class URLGrabber:
@@ -1020,6 +1198,7 @@ class URLGrabber:
return s

def _make_callback(self, callback_obj):
@@ -778,7 +797,7 @@ index e090e90..74a692c 100644
if callable(callback_obj):
return callback_obj, (), {}
else:
@@ -1030,7 +1208,7 @@ class URLGrabber:
@@ -1030,7 +1209,7 @@ class URLGrabber:
default_grabber = URLGrabber()


@@ -787,7 +806,7 @@ index e090e90..74a692c 100644
def __init__(self, url, filename, opts):
self.fo = None
self._hdr_dump = ''
@@ -1052,10 +1230,13 @@ class PyCurlFileObject():
@@ -1052,10 +1231,13 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
@@ -803,7 +822,7 @@ index e090e90..74a692c 100644
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
@@ -1067,6 +1248,12 @@ class PyCurlFileObject():
@@ -1067,6 +1249,12 @@ class PyCurlFileObject():

def _retrieve(self, buf):
try:
@@ -816,7 +835,7 @@ index e090e90..74a692c 100644
if not self._prog_running:
if self.opts.progress_obj:
size = self.size + self._reget_length
@@ -1079,15 +1266,24 @@ class PyCurlFileObject():
@@ -1079,15 +1267,24 @@ class PyCurlFileObject():
self.opts.progress_obj.update(self._amount_read)

self._amount_read += len(buf)
@@ -843,7 +862,7 @@ index e090e90..74a692c 100644
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
@@ -1104,7 +1300,17 @@ class PyCurlFileObject():
@@ -1104,7 +1301,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
@@ -857,12 +876,12 @@ index e090e90..74a692c 100644
+
+ if len(self._hdr_dump) != 0 and buf == '\r\n':
+ self._hdr_ended = True
+ if DEBUG: DEBUG.info('header ended:')
+ if DEBUG: DEBUG.debug('header ended:')
+
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1113,8 +1319,10 @@ class PyCurlFileObject():
@@ -1113,8 +1320,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
@@ -873,7 +892,7 @@ index e090e90..74a692c 100644
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr

@@ -1127,6 +1335,9 @@ class PyCurlFileObject():
@@ -1127,6 +1336,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts

@@ -883,13 +902,14 @@ index e090e90..74a692c 100644

# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1347,21 @@ class PyCurlFileObject():
@@ -1136,11 +1348,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)

if DEBUG:
- if DEBUG:
+ if DEBUG and DEBUG.level <= 10:
self.curl_obj.setopt(pycurl.VERBOSE, True)
if opts.user_agent:
self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
@@ -905,7 +925,7 @@ index e090e90..74a692c 100644

# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1369,11 @@ class PyCurlFileObject():
@@ -1148,9 +1370,11 @@ class PyCurlFileObject():

# timeouts
timeout = 300
@@ -920,7 +940,7 @@ index e090e90..74a692c 100644

# ssl options
if self.scheme == 'https':
@@ -1158,13 +1381,16 @@ class PyCurlFileObject():
@@ -1158,13 +1382,16 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -938,7 +958,7 @@ index e090e90..74a692c 100644
if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
@@ -1187,28 +1413,26 @@ class PyCurlFileObject():
@@ -1187,28 +1414,26 @@ class PyCurlFileObject():
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))

@@ -983,7 +1003,7 @@ index e090e90..74a692c 100644

# our url
self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,39 +1452,36 @@ class PyCurlFileObject():
@@ -1228,39 +1453,36 @@ class PyCurlFileObject():

code = self.http_code
errcode = e.args[0]
@@ -1032,7 +1052,7 @@ index e090e90..74a692c 100644
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
@@ -1272,33 +1493,94 @@ class PyCurlFileObject():
@@ -1272,33 +1494,94 @@ class PyCurlFileObject():
elif errcode == 58:
msg = _("problem with the local client certificate")
err = URLGrabError(14, msg)
@@ -1135,7 +1155,7 @@ index e090e90..74a692c 100644

def _do_open(self):
self.curl_obj = _curl_cache
@@ -1333,7 +1615,11 @@ class PyCurlFileObject():
@@ -1333,7 +1616,11 @@ class PyCurlFileObject():

if self.opts.range:
rt = self.opts.range
@@ -1148,7 +1168,7 @@ index e090e90..74a692c 100644

if rt:
header = range_tuple_to_header(rt)
@@ -1434,21 +1720,46 @@ class PyCurlFileObject():
@@ -1434,21 +1721,46 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')

@@ -1202,7 +1222,7 @@ index e090e90..74a692c 100644
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
@@ -1526,17 +1837,20 @@ class PyCurlFileObject():
@@ -1526,17 +1838,20 @@ class PyCurlFileObject():
if self._prog_running:
downloaded += self._reget_length
self.opts.progress_obj.update(downloaded)
@@ -1228,7 +1248,7 @@ index e090e90..74a692c 100644

msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
@@ -1544,13 +1858,6 @@ class PyCurlFileObject():
@@ -1544,13 +1859,6 @@ class PyCurlFileObject():
return True
return False

@@ -1242,7 +1262,7 @@ index e090e90..74a692c 100644
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
@@ -1582,9 +1889,21 @@ class PyCurlFileObject():
@@ -1582,9 +1890,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()

@@ -1265,7 +1285,7 @@ index e090e90..74a692c 100644

#####################################################################
# DEPRECATED FUNCTIONS
@@ -1621,6 +1940,467 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
@@ -1621,6 +1941,478 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,


#####################################################################
@@ -1498,17 +1518,23 @@ index e090e90..74a692c 100644
+ host_con = {} # current host connection counts
+
+ def start(opts, tries):
+ opts.tries = tries
+ try:
+ dl.start(opts)
+ except OSError, e:
+ # can't spawn downloader, give up immediately
+ opts.exception = URLGrabError(5, exception2msg(e))
+ _run_callback(opts.failfunc, opts)
+ return
+
+ key, limit = opts.async
+ host_con[key] = host_con.get(key, 0) + 1
+ opts.tries = tries
+ if opts.progress_obj:
+ if opts.multi_progress_obj:
+ opts._progress = opts.multi_progress_obj.newMeter()
+ opts._progress.start(text=opts.text)
+ else:
+ opts._progress = time.time() # no updates
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+ dl.start(opts)
+
+ def perform():
+ for opts, size, ug_err in dl.perform():
@@ -1588,6 +1614,8 @@ index e090e90..74a692c 100644
+ # check global limit
+ while len(dl.running) >= default_grabber.opts.max_connections:
+ perform()
+ if DEBUG:
+ DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)
+
+ if opts.mirror_group:
+ mg, errors, failed, removed = opts.mirror_group
@@ -1636,6 +1664,9 @@ index e090e90..74a692c 100644
+ key, limit = opts.async
+ while host_con.get(key, 0) >= limit:
+ perform()
+ if DEBUG:
+ DEBUG.info('max_connections(%s): %d/%d', key, host_con.get(key, 0), limit)
+
+ start(opts, 1)
+ except IOError, e:
+ if e.errno != 4: raise