update to latest head, multi-downloader

epel9
Zdeněk Pavlas 13 years ago
parent dadfd07b88
commit a16417493c

@ -0,0 +1,904 @@
diff --git a/scripts/urlgrabber-ext-down b/scripts/urlgrabber-ext-down
new file mode 100755
index 0000000..c37e6a8
--- /dev/null
+++ b/scripts/urlgrabber-ext-down
@@ -0,0 +1,55 @@
+#! /usr/bin/python
+# A very simple external downloader
+
+import time, os, errno, sys
+from urlgrabber.grabber import \
+ _readlines, URLGrabberOptions, _loads, \
+ PyCurlFileObject, URLGrabError
+
+def write(fmt, *arg):
+ try: os.write(1, fmt % arg)
+ except OSError, e:
+ if e.args[0] != errno.EPIPE: raise
+ sys.exit(1)
+
+class ProxyProgress:
+ def start(self, *d1, **d2):
+ self.next_update = 0
+ def update(self, _amount_read):
+ t = time.time()
+ if t < self.next_update: return
+ self.next_update = t + 0.31
+ write('%d %d\n', self._id, _amount_read)
+
+def main():
+ import signal
+ signal.signal(signal.SIGINT, lambda n, f: sys.exit(1))
+ cnt = 0
+ while True:
+ lines = _readlines(0)
+ if not lines: break
+ for line in lines:
+ cnt += 1
+ opts = URLGrabberOptions()
+ opts._id = cnt
+ for k in line.split(' '):
+ k, v = k.split('=', 1)
+ setattr(opts, k, _loads(v))
+ if opts.progress_obj:
+ opts.progress_obj = ProxyProgress()
+ opts.progress_obj._id = cnt
+ tm = time.time()
+ try:
+ fo = PyCurlFileObject(opts.url, opts.filename, opts)
+ fo._do_grab()
+ fo.fo.close()
+ size = fo._amount_read
+ dlsz = size - fo._reget_length
+ ug_err = 'OK'
+ except URLGrabError, e:
+ size = dlsz = 0
+ ug_err = '%d %s' % e.args
+ write('%d %d %d %.3f %s\n', opts._id, size, dlsz, time.time() - tm, ug_err)
+
+if __name__ == '__main__':
+ main()
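For reference, the helper above speaks a simple line protocol with the parent grabber over stdin/stdout. A hypothetical exchange (values illustrative; the option strings are produced by _dumps()) might look like:

    request on stdin, one download per line, as key=value pairs:
        url='http://example.com/f.rpm' filename='/tmp/f.rpm' progress_obj=True
    progress reports on stdout ("<id> <bytes read so far>"):
        1 8192
        1 65536
    final status ("<id> <size> <downloaded> <seconds> <'OK' or error code and message>"):
        1 131072 131072 0.812 OK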
diff --git a/setup.py b/setup.py
index d0b87b8..bfa4a18 100644
--- a/setup.py
+++ b/setup.py
@@ -15,8 +15,10 @@ url = _urlgrabber.__url__
packages = ['urlgrabber']
package_dir = {'urlgrabber':'urlgrabber'}
scripts = ['scripts/urlgrabber']
-data_files = [('share/doc/' + name + '-' + version,
- ['README','LICENSE', 'TODO', 'ChangeLog'])]
+data_files = [
+ ('share/doc/' + name + '-' + version, ['README','LICENSE', 'TODO', 'ChangeLog']),
+ ('libexec', ['scripts/urlgrabber-ext-down']),
+]
options = { 'clean' : { 'all' : 1 } }
classifiers = [
'Development Status :: 4 - Beta',
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index 38ae1f7..094be77 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -263,6 +263,33 @@ GENERAL ARGUMENTS (kwargs)
What type of name to IP resolving to use, default is to do both IPV4 and
IPV6.
+ async = (key, limit)
+
+ When this option is set, the urlgrab() request is not processed immediately
+ but queued. parallel_wait() then processes the queued grabs in parallel,
+ limiting the number of connections in each 'key' group to at most 'limit'.
+
+ max_connections
+
+ The global connection limit.
+
+ timedhosts
+
+ The filename of the host download statistics. If defined, urlgrabber
+ will update the stats at the end of every download. At the end of
+ parallel_wait(), the updated stats are saved. If synchronous grabs
+ are used, you should call th_save().
+
+ default_speed, half_life
+
+ These options only affect the async mirror selection code.
+ The default_speed option sets the speed estimate for mirrors
+ we have never downloaded from, and defaults to 1 MB/s.
+
+ The speed estimate also drifts exponentially from the speed
+ actually measured back to the default speed, with a default
+ half-life of 30 days.
+
RETRY RELATED ARGUMENTS
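For orientation, a minimal sketch of driving the queued API documented above (the URLs, the host-derived 'key' and the limit of 2 are illustrative, not mandated by the patch):

    from urlgrabber.grabber import default_grabber, parallel_wait

    def failed(obj):
        # obj.exception carries the URLGrabError of the failed request
        print 'failed: %s (%s)' % (obj.url, obj.exception)

    for url in ('http://host1.example.com/a.rpm', 'http://host2.example.com/b.rpm'):
        # queue the request; at most 2 connections per 'key' group
        default_grabber.urlgrab(url, async=(url.split('/')[2], 2), failfunc=failed)

    parallel_wait()  # process the queue, honoring per-key and global limits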
@@ -343,6 +370,15 @@ RETRY RELATED ARGUMENTS
but it cannot (without severe trickiness) prevent the exception
from being raised.
+ failfunc = None
+
+ The callback that gets called when a urlgrab() request fails.
+ If defined, urlgrab() calls it instead of raising URLGrabError.
+ The callback syntax is identical to that of failure_callback.
+
+ Unlike failure_callback, it is called only once. Its primary
+ purpose is to allow using urlgrab() without a try/except block.
+
interrupt_callback = None
This callback is called if KeyboardInterrupt is received at any
@@ -444,7 +480,7 @@ import pycurl
from ftplib import parse150
from StringIO import StringIO
from httplib import HTTPException
-import socket
+import socket, select, fcntl
from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
try:
@@ -878,6 +914,7 @@ class URLGrabberOptions:
self.retry = None
self.retrycodes = [-1,2,4,5,6,7]
self.checkfunc = None
+ self.failfunc = _do_raise
self.copy_local = 0
self.close_connection = 0
self.range = None
@@ -886,6 +923,7 @@ class URLGrabberOptions:
self.keepalive = 1
self.proxies = None
self.libproxy = False
+ self.proxy = None
self.reget = None
self.failure_callback = None
self.interrupt_callback = None
@@ -913,6 +951,12 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
+ self.async = None # blocking by default
+ self.mirror_group = None
+ self.max_connections = 5
+ self.timedhosts = None
+ self.half_life = 30*24*60*60 # 30 days
+ self.default_speed = 1e6 # 1 MB/s
def __repr__(self):
return self.format()
@@ -932,6 +976,17 @@ class URLGrabberOptions:
s = s + indent + '}'
return s
+def _do_raise(obj):
+ raise obj.exception
+
+def _run_callback(cb, obj):
+ if not cb:
+ return
+ if callable(cb):
+ return cb(obj)
+ cb, arg, karg = cb
+ return cb(obj, *arg, **karg)
+
class URLGrabber(object):
"""Provides easy opening of URLs with a variety of options.
@@ -977,10 +1032,9 @@ class URLGrabber(object):
if DEBUG: DEBUG.info('exception: %s', exception)
if callback:
if DEBUG: DEBUG.info('calling callback: %s', callback)
- cb_func, cb_args, cb_kwargs = self._make_callback(callback)
obj = CallbackObject(exception=exception, url=args[0],
tries=tries, retry=opts.retry)
- cb_func(obj, *cb_args, **cb_kwargs)
+ _run_callback(callback, obj)
if (opts.retry is None) or (tries == opts.retry):
if DEBUG: DEBUG.info('retries exceeded, re-raising')
@@ -1043,30 +1097,36 @@ class URLGrabber(object):
elif not opts.range:
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.filename = path
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(filename=path, url=url)
+ _run_callback(opts.checkfunc, obj)
return path
+ if opts.async:
+ opts.url = url
+ opts.filename = filename
+ opts.size = int(opts.size or 0)
+ _async_queue.append(opts)
+ return filename
+
def retryfunc(opts, url, filename):
+ tm = time.time()
fo = PyCurlFileObject(url, filename, opts)
try:
fo._do_grab()
+ _TH.update(url, fo._amount_read - fo._reget_length, time.time() - tm, None)
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.filename = filename
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(filename=filename, url=url)
+ _run_callback(opts.checkfunc, obj)
finally:
fo.close()
return filename
- return self._retry(opts, retryfunc, url, filename)
+ try:
+ return self._retry(opts, retryfunc, url, filename)
+ except URLGrabError, e:
+ _TH.update(url, 0, 0, e)
+ opts.exception = e
+ return _run_callback(opts.failfunc, opts)
def urlread(self, url, limit=None, **kwargs):
"""read the url into a string, up to 'limit' bytes
@@ -1095,12 +1155,8 @@ class URLGrabber(object):
else: s = fo.read(limit)
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.data = s
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(data=s, url=url)
+ _run_callback(opts.checkfunc, obj)
finally:
fo.close()
return s
@@ -1115,6 +1171,7 @@ class URLGrabber(object):
return s
def _make_callback(self, callback_obj):
+ # not used, left for compatibility
if callable(callback_obj):
return callback_obj, (), {}
else:
@@ -1346,14 +1403,8 @@ class PyCurlFileObject(object):
return
try:
- e = None
self.curl_obj.perform()
- except pycurl.error, e: pass
- self._do_perform_exc(e)
-
- def _do_perform_exc(self, e):
- # handle pycurl exception 'e'
- if e:
+ except pycurl.error, e:
# XXX - break some of these out a bit more clearly
# to other URLGrabErrors from
# http://curl.haxx.se/libcurl/c/libcurl-errors.html
@@ -1607,7 +1658,22 @@ class PyCurlFileObject(object):
_was_filename = False
if type(self.filename) in types.StringTypes and self.filename:
_was_filename = True
- self._do_open_fo()
+ self._prog_reportname = str(self.filename)
+ self._prog_basename = os.path.basename(self.filename)
+
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ self.fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
else:
self._prog_reportname = 'MEMORY'
self._prog_basename = 'MEMORY'
@@ -1627,7 +1693,29 @@ class PyCurlFileObject(object):
raise e
if _was_filename:
- self._do_close_fo()
+ # close it up
+ self.fo.flush()
+ self.fo.close()
+
+ # Set the URL where we got it from:
+ if xattr is not None:
+ # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+ try:
+ xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+ except:
+ pass # URL too long. = IOError ... ignore everything.
+
+ # set the time
+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+ if mod_time != -1:
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
# re open it
try:
self.fo = open(self.filename, 'r')
@@ -1643,47 +1731,6 @@ class PyCurlFileObject(object):
self._complete = True
- def _do_open_fo(self):
- self._prog_reportname = str(self.filename)
- self._prog_basename = os.path.basename(self.filename)
- if self.append: mode = 'ab'
- else: mode = 'wb'
-
- if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
- (self.filename, mode))
- try:
- self.fo = open(self.filename, mode)
- except IOError, e:
- err = URLGrabError(16, _(\
- 'error opening local file from %s, IOError: %s') % (self.url, e))
- err.url = self.url
- raise err
-
- def _do_close_fo(self):
- # close it up
- self.fo.flush()
- self.fo.close()
-
- # Set the URL where we got it from:
- if xattr is not None:
- # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
- try:
- xattr.set(self.filename, 'user.xdg.origin.url', self.url)
- except:
- pass # URL too long. = IOError ... ignore everything.
-
- # set the time
- mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
- if mod_time != -1:
- try:
- os.utime(self.filename, (mod_time, mod_time))
- except OSError, e:
- err = URLGrabError(16, _(\
- 'error setting timestamp on file %s from %s, OSError: %s')
- % (self.filename, self.url, e))
- err.url = self.url
- raise err
-
def _fill_buffer(self, amt=None):
"""fill the buffer to contain at least 'amt' bytes by reading
from the underlying file object. If amt is None, then it will
@@ -1858,6 +1905,425 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
#####################################################################
+# Serializer + parser: A replacement of the rather bulky Json code.
+#
+# - handles basic python literals, lists and tuples.
+# - serialized strings never contain ' ' or '\n'
+#
+#####################################################################
+
+_quoter_map = {}
+for c in '%[(,)] \n':
+ _quoter_map[c] = '%%%02x' % ord(c)
+del c
+
+def _dumps(v):
+ if v is None: return 'None'
+ if v is True: return 'True'
+ if v is False: return 'False'
+ if type(v) in (int, long, float):
+ return str(v)
+ if type(v) == unicode:
+ v = v.encode('UTF8')
+ if type(v) == str:
+ def quoter(c): return _quoter_map.get(c, c)
+ return "'%s'" % ''.join(map(quoter, v))
+ if type(v) == tuple:
+ return "(%s)" % ','.join(map(_dumps, v))
+ if type(v) == list:
+ return "[%s]" % ','.join(map(_dumps, v))
+ raise TypeError, 'Can\'t serialize %s' % v
+
+def _loads(s):
+ def decode(v):
+ if v == 'None': return None
+ if v == 'True': return True
+ if v == 'False': return False
+ try: return int(v)
+ except ValueError: pass
+ try: return float(v)
+ except ValueError: pass
+ if len(v) >= 2 and v[0] == v[-1] == "'":
+ ret = []; i = 1
+ while True:
+ j = v.find('%', i)
+ ret.append(v[i:j]) # skips the final "'"
+ if j == -1: break
+ ret.append(chr(int(v[j + 1:j + 3], 16)))
+ i = j + 3
+ v = ''.join(ret)
+ return v
+ stk = None
+ l = []
+ i = j = 0
+ while True:
+ if j == len(s) or s[j] in ',)]':
+ if j > i:
+ l.append(decode(s[i:j]))
+ if j == len(s): break
+ if s[j] in ')]':
+ if s[j] == ')':
+ l = tuple(l)
+ stk[0].append(l)
+ l, stk = stk
+ i = j = j + 1
+ elif s[j] in '[(':
+ stk = l, stk
+ l = []
+ i = j = j + 1
+ else:
+ j += 1 # safe because '[(,)]' are quoted
+ if stk: raise ValueError
+ if len(l) == 1: l = l[0]
+ return l
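As a quick illustration (values chosen arbitrarily), the serializer round-trips nested literals, quoting the few reserved characters with %xx escapes:

    s = _dumps(('a b', [1, 2], None))    # -> "('a%20b',[1,2],None)"
    assert _loads(s) == ('a b', [1, 2], None)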
+
+
+#####################################################################
+# External downloader process
+#####################################################################
+
+def _readlines(fd):
+ buf = os.read(fd, 4096)
+ if not buf: return None
+ # whole lines only, no buffering
+ while buf[-1] != '\n':
+ buf += os.read(fd, 4096)
+ return buf[:-1].split('\n')
+
+import subprocess
+
+class _ExternalDownloader:
+ def __init__(self):
+ self.popen = subprocess.Popen(
+ '/usr/libexec/urlgrabber-ext-down',
+ stdin = subprocess.PIPE,
+ stdout = subprocess.PIPE,
+ )
+ self.stdin = self.popen.stdin.fileno()
+ self.stdout = self.popen.stdout.fileno()
+ self.running = {}
+ self.cnt = 0
+
+ # list of options we pass to downloader
+ _options = (
+ 'url', 'filename',
+ 'timeout', 'close_connection', 'keepalive',
+ 'throttle', 'bandwidth', 'range', 'reget',
+ 'user_agent', 'http_headers', 'ftp_headers',
+ 'proxy', 'prefix', 'username', 'password',
+ 'ssl_ca_cert',
+ 'ssl_cert', 'ssl_cert_type',
+ 'ssl_key', 'ssl_key_type',
+ 'ssl_key_pass',
+ 'ssl_verify_peer', 'ssl_verify_host',
+ 'size', 'max_header_size', 'ip_resolve',
+ )
+
+ def start(self, opts):
+ arg = []
+ for k in self._options:
+ v = getattr(opts, k)
+ if v is None: continue
+ arg.append('%s=%s' % (k, _dumps(v)))
+ if opts.progress_obj:
+ arg.append('progress_obj=True')
+ arg = ' '.join(arg)
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+
+ self.cnt += 1
+ self.running[self.cnt] = opts
+ os.write(self.stdin, arg +'\n')
+
+ def perform(self):
+ ret = []
+ lines = _readlines(self.stdout)
+ if not lines:
+ if DEBUG: DEBUG.info('downloader died')
+ raise KeyboardInterrupt
+ for line in lines:
+ # parse downloader output
+ line = line.split(' ', 5)
+ _id, size = map(int, line[:2])
+ if len(line) == 2:
+ opts = self.running[_id]
+ m = opts.progress_obj
+ if m:
+ if not m.last_update_time:
+ m.start(text = opts.text)
+ m.update(size)
+ continue
+ # job done
+ opts = self.running.pop(_id)
+ if line[4] == 'OK':
+ ug_err = None
+ if DEBUG: DEBUG.info('success')
+ else:
+ ug_err = URLGrabError(int(line[4]), line[5])
+ if DEBUG: DEBUG.info('failure: %s', ug_err)
+ _TH.update(opts.url, int(line[2]), float(line[3]), ug_err)
+ ret.append((opts, size, ug_err))
+ return ret
+
+ def abort(self):
+ self.popen.stdin.close()
+ self.popen.stdout.close()
+ self.popen.wait()
+
+class _ExternalDownloaderPool:
+ def __init__(self):
+ self.epoll = select.epoll()
+ self.running = {}
+ self.cache = {}
+
+ def start(self, opts):
+ host = urlparse.urlsplit(opts.url).netloc
+ dl = self.cache.pop(host, None)
+ if not dl:
+ dl = _ExternalDownloader()
+ fl = fcntl.fcntl(dl.stdin, fcntl.F_GETFD)
+ fcntl.fcntl(dl.stdin, fcntl.F_SETFD, fl | fcntl.FD_CLOEXEC)
+ self.epoll.register(dl.stdout, select.EPOLLIN)
+ self.running[dl.stdout] = dl
+ dl.start(opts)
+
+ def perform(self):
+ ret = []
+ for fd, event in self.epoll.poll():
+ assert event & select.EPOLLIN
+ done = self.running[fd].perform()
+ if not done: continue
+ assert len(done) == 1
+ ret.extend(done)
+
+ # dl finished, move it to the cache
+ host = urlparse.urlsplit(done[0][0].url).netloc
+ if host in self.cache: self.cache[host].abort()
+ self.epoll.unregister(fd)
+ self.cache[host] = self.running.pop(fd)
+ return ret
+
+ def abort(self):
+ for dl in self.running.values():
+ self.epoll.unregister(dl.stdout)
+ dl.abort()
+ for dl in self.cache.values():
+ dl.abort()
+
+
+#####################################################################
+# High level async API
+#####################################################################
+
+_async_queue = []
+
+def parallel_wait(meter = 'text'):
+ '''Process queued requests in parallel.
+ '''
+
+ if meter:
+ count = total = 0
+ for opts in _async_queue:
+ count += 1
+ total += opts.size
+ if meter == 'text':
+ from progress import TextMultiFileMeter
+ meter = TextMultiFileMeter()
+ meter.start(count, total)
+
+ dl = _ExternalDownloaderPool()
+ host_con = {} # current host connection counts
+
+ def start(opts, tries):
+ key, limit = opts.async
+ host_con[key] = host_con.get(key, 0) + 1
+ opts.tries = tries
+ opts.progress_obj = meter and meter.newMeter()
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+ dl.start(opts)
+
+ def perform():
+ for opts, size, ug_err in dl.perform():
+ key, limit = opts.async
+ host_con[key] -= 1
+ if meter:
+ m = opts.progress_obj
+ m.basename = os.path.basename(opts.filename)
+ if ug_err:
+ m.failure(ug_err.args[1])
+ else:
+ # file size might have changed
+ meter.re.total += size - opts.size
+ m.end(size)
+ meter.removeMeter(m)
+
+ if ug_err is None:
+ if opts.checkfunc:
+ try: _run_callback(opts.checkfunc, opts)
+ except URLGrabError, ug_err: pass
+ if ug_err is None:
+ continue
+
+ retry = opts.retry or 0
+ if opts.failure_callback:
+ opts.exception = ug_err
+ try: _run_callback(opts.failure_callback, opts)
+ except URLGrabError, ug_err:
+ retry = 0 # no retries
+ if opts.tries < retry and ug_err.args[0] in opts.retrycodes:
+ start(opts, opts.tries + 1) # simple retry
+ continue
+
+ if opts.mirror_group:
+ mg, failed = opts.mirror_group
+ opts.mirror = key
+ opts.exception = ug_err
+ action = _run_callback(mg.failure_callback, opts)
+ if not (action and action.get('fail')):
+ # mask this mirror and retry
+ failed.add(key)
+ _async_queue.append(opts)
+ continue
+
+ # urlgrab failed
+ opts.exception = ug_err
+ _run_callback(opts.failfunc, opts)
+
+ try:
+ idx = 0
+ while True:
+ if idx >= len(_async_queue):
+ # the queue is empty
+ if not dl.running: break
+ # pending dl may extend it
+ perform()
+ continue
+
+ # handle next request
+ opts = _async_queue[idx]
+ idx += 1
+
+ # check global limit
+ while len(dl.running) >= opts.max_connections:
+ perform()
+
+ if opts.mirror_group:
+ mg, failed = opts.mirror_group
+
+ # find the best mirror
+ best = None
+ for mirror in mg.mirrors:
+ key = mirror['mirror']
+ if key in failed: continue
+
+ # estimate mirror speed
+ speed = _TH.estimate(key)
+ speed /= 1 + host_con.get(key, 0)
+ if best is None or speed > best_speed:
+ best = mirror
+ best_speed = speed
+
+ if best is None:
+ opts.exception = URLGrabError(256, _('No more mirrors to try.'))
+ _run_callback(opts.failfunc, opts)
+ continue
+
+ # update the current mirror and limit
+ key = best['mirror']
+ limit = best.get('kwargs', {}).get('max_connections', 3)
+ opts.async = key, limit
+
+ # update URL and proxy
+ url = mg._join_url(key, opts.relative_url)
+ url, parts = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
+ opts.url = url
+
+ # check host limit, then start
+ key, limit = opts.async
+ while host_con.get(key, 0) >= limit:
+ perform()
+ start(opts, 1)
+ except IOError, e:
+ if e.errno != 4: raise
+ raise KeyboardInterrupt
+
+ finally:
+ dl.abort()
+ if meter: meter.end()
+ del _async_queue[:]
+ _TH.save()
+
+
+#####################################################################
+# Host bandwidth estimation
+#####################################################################
+
+class _TH:
+ hosts = {}
+ dirty = None
+
+ @staticmethod
+ def load():
+ filename = default_grabber.opts.timedhosts
+ if filename and _TH.dirty is None:
+ try:
+ for line in open(filename):
+ host, speed, fail, ts = line.split()
+ _TH.hosts[host] = int(speed), int(fail), int(ts)
+ except IOError: pass
+ _TH.dirty = False
+
+ @staticmethod
+ def save():
+ filename = default_grabber.opts.timedhosts
+ if filename and _TH.dirty is True:
+ tmp = '%s.%d' % (filename, os.getpid())
+ try:
+ f = open(tmp, 'w')
+ for host in _TH.hosts:
+ f.write(host + ' %d %d %d\n' % _TH.hosts[host])
+ f.close()
+ os.rename(tmp, filename)
+ except IOError: pass
+ _TH.dirty = False
+
+ @staticmethod
+ def update(url, dl_size, dl_time, ug_err):
+ _TH.load()
+ host = urlparse.urlsplit(url).netloc
+ speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0)
+ now = time.time()
+
+ if ug_err is None:
+ # k1: the older, the less useful
+ # k2: if it was <1MiB, don't trust it much
+ # speeds vary, use 10:1 smoothing
+ k1 = 2**((ts - now) / default_grabber.opts.half_life)
+ k2 = min(dl_size / 1e6, 1.0) / 10
+ speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2)
+ fail = 0
+ elif getattr(ug_err, 'code', None) == 404:
+ fail = 0 # alive, at least
+ else:
+ fail += 1 # seems dead
+
+ _TH.hosts[host] = speed, fail, now
+ _TH.dirty = True
+
+ @staticmethod
+ def estimate(url):
+ _TH.load()
+ host = urlparse.urlsplit(url).netloc
+ default_speed = default_grabber.opts.default_speed
+ try: speed, fail, ts = _TH.hosts[host]
+ except KeyError: return default_speed
+
+ speed *= 2**-fail
+ k = 2**((ts - time.time()) / default_grabber.opts.half_life)
+ speed = k * speed + (1 - k) * default_speed
+ return speed
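A worked example of the estimate (numbers illustrative): a host last measured exactly one half_life (30 days) ago at 4 MB/s, with no recorded failures, decays halfway back towards the 1 MB/s default:

    k = 2 ** (-30*24*60*60 / float(default_grabber.opts.half_life))    # = 0.5
    estimate = k * 4e6 + (1 - k) * default_grabber.opts.default_speed  # = 2.5e6 B/s

Each recorded failure additionally halves the estimate (the 2**-fail factor above).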
+
+#####################################################################
# TESTING
def _main_test():
try: url, filename = sys.argv[1:3]
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index 8731aed..d699b61 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -76,6 +76,9 @@ CUSTOMIZATION
'grabber' is omitted, the default grabber will be used. If
kwargs are omitted, then (duh) they will not be used.
+ kwarg 'max_connections' is used to store the max connection
+ limit of this mirror.
+
3) Pass keyword arguments when instantiating the mirror group.
See, for example, the failure_callback argument.
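For instance, a mirror entry carrying the 'max_connections' kwarg described above might look like this (hypothetical host; the value feeds the per-mirror connection limit used by parallel_wait()):

    {'mirror': 'http://mirror1.example.com/pub/', 'kwargs': {'max_connections': 5}}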
@@ -91,6 +94,7 @@ import random
import thread # needed for locking to make this threadsafe
from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
+from grabber import _run_callback, _do_raise, _async_queue
def _(st):
return st
@@ -254,7 +258,7 @@ class MirrorGroup:
# if these values are found in **kwargs passed to one of the urlXXX
# methods, they will be stripped before getting passed on to the
# grabber
- options = ['default_action', 'failure_callback']
+ options = ['default_action', 'failure_callback', 'failfunc']
def _process_kwargs(self, kwargs):
self.failure_callback = kwargs.get('failure_callback')
@@ -403,10 +407,25 @@ class MirrorGroup:
self._failure(gr, obj)
def urlgrab(self, url, filename=None, **kwargs):
+ if kwargs.get('async'):
+ opts = self.grabber.opts.derive(**kwargs)
+ opts.mirror_group = self, set()
+ opts.relative_url = _to_utf8(url)
+
+ opts.url = 'http://tbd'
+ opts.filename = filename
+ opts.size = int(opts.size or 0)
+ _async_queue.append(opts)
+ return filename
+
kw = dict(kwargs)
kw['filename'] = filename
func = 'urlgrab'
- return self._mirror_try(func, url, kw)
+ try:
+ return self._mirror_try(func, url, kw)
+ except URLGrabError, e:
+ obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
+ return _run_callback(kwargs.get('failfunc', _do_raise), obj)
def urlopen(self, url, **kwargs):
kw = dict(kwargs)
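By way of illustration, the async path added above can be combined with a MirrorGroup roughly as follows (mirror URLs and file names are hypothetical; any true 'async' value queues the request, and the actual key/limit pair is filled in later from the mirror chosen by parallel_wait()):

    from urlgrabber.grabber import default_grabber, parallel_wait
    from urlgrabber.mirror import MirrorGroup

    def failed(obj):
        print 'download failed: %s' % obj.exception

    mg = MirrorGroup(default_grabber, ['http://m1.example.com/repo/',
                                       'http://m2.example.com/repo/'])
    mg.urlgrab('packages/foo.rpm', 'foo.rpm', async=True, failfunc=failed)
    parallel_wait()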
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index 3d7e99a..4c126c5 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -576,7 +576,6 @@ class TextMultiFileMeter(MultiFileMeter):
self.fo.write(out)
finally:
self._lock.release()
- self._do_update_meter(meter, now)
def _do_failure_meter(self, meter, message, now):
self._lock.acquire()
@@ -599,15 +598,6 @@ class TextMultiFileMeter(MultiFileMeter):
pass
finally:
self._lock.release()
-
- def _do_end(self, now):
- self._do_update_meter(None, now)
- self._lock.acquire()
- try:
- self.fo.write('\n')
- self.fo.flush()
- finally:
- self._lock.release()
######################################################################
# support classes and functions

@@ -3,9 +3,10 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 11%{?dist}
+Release: 12%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
+Patch2: multi-downloader.patch
 License: LGPLv2+
 Group: Development/Libraries
@@ -24,6 +25,7 @@ authentication, proxies and more.
 %prep
 %setup -q -n urlgrabber-%{version}
 %patch1 -p1
+%patch2 -p1
 
 %build
 python setup.py build
@@ -41,8 +43,13 @@ rm -rf $RPM_BUILD_ROOT
 %doc ChangeLog LICENSE README TODO
 %{python_sitelib}/urlgrabber*
 %{_bindir}/urlgrabber
+%attr(0755,root,root) /usr/libexec/urlgrabber-ext-down
 
 %changelog
+* Mon May 14 2012 Zdeněk Pavlas <zpavlas@redhat.com> - 3.9.1-12
+- Update to latest HEAD
+- Merge multi-downloader patches
+
 * Sat Jan 14 2012 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 3.9.1-11
 - Rebuilt for https://fedoraproject.org/wiki/Fedora_17_Mass_Rebuild

@ -1,3 +1,16 @@
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/scripts/urlgrabber b/scripts/urlgrabber diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644 index 518e512..09cd896 100644
--- a/scripts/urlgrabber --- a/scripts/urlgrabber
@ -125,9 +138,18 @@ index 3e5f3b7..8eeaeda 100644
return (fb,lb) return (fb,lb)
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..b2770c5 100644 index e090e90..38ae1f7 100644
--- a/urlgrabber/grabber.py --- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py
@@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs)
progress_obj = None
a class instance that supports the following methods:
- po.start(filename, url, basename, length, text)
+ po.start(filename, url, basename, size, now, text)
# length will be None if unknown
po.update(read) # read == bytes read so far
po.end()
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs) @@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See (which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information. BANDWIDTH THROTTLING for more information.
@ -150,7 +172,22 @@ index e090e90..b2770c5 100644
bandwidth = 0 bandwidth = 0
@@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs) @@ -143,8 +143,12 @@ GENERAL ARGUMENTS (kwargs)
note that proxy authentication information may be provided using
normal URL constructs:
proxies={ 'http' : 'http://user:host@foo:3128' }
- Lastly, if proxies is None, the default environment settings will
- be used.
+
+ libproxy = False
+
+ Use the libproxy module (if installed) to find proxies.
+ The libproxy code is only used if the proxies dictionary
+ does not provide any proxies.
prefix = None
@@ -198,6 +202,12 @@ GENERAL ARGUMENTS (kwargs)
control, you should probably subclass URLParser and pass it in via control, you should probably subclass URLParser and pass it in via
the 'urlparser' option. the 'urlparser' option.
@ -163,11 +200,57 @@ index e090e90..b2770c5 100644
ssl_ca_cert = None ssl_ca_cert = None
this option can be used if M2Crypto is available and will be this option can be used if M2Crypto is available and will be
@@ -248,6 +254,11 @@ GENERAL ARGUMENTS (kwargs) @@ -211,43 +221,48 @@ GENERAL ARGUMENTS (kwargs)
No-op when using the curl backend (default)
- self.ssl_verify_peer = True
+ ssl_verify_peer = True
Check the server's certificate to make sure it is valid with what our CA validates
- self.ssl_verify_host = True
+ ssl_verify_host = True
Check the server's hostname to make sure it matches the certificate DN
- self.ssl_key = None
+ ssl_key = None
Path to the key the client should use to connect/authenticate with
- self.ssl_key_type = 'PEM'
+ ssl_key_type = 'PEM'
PEM or DER - format of key
- self.ssl_cert = None
+ ssl_cert = None
Path to the ssl certificate the client should use to to authenticate with
- self.ssl_cert_type = 'PEM'
+ ssl_cert_type = 'PEM'
PEM or DER - format of certificate
- self.ssl_key_pass = None
+ ssl_key_pass = None
password to access the ssl_key
- self.size = None
+ size = None
size (in bytes) or Maximum size of the thing being downloaded.
This is mostly to keep us from exploding with an endless datastream
- self.max_header_size = 2097152
+ max_header_size = 2097152
Maximum size (in bytes) of the headers. Maximum size (in bytes) of the headers.
+ self.ip_resolve = 'whatever' + ip_resolve = 'whatever'
+ +
+ What type of name to IP resolving to use, default is to do both IPV4 and + What type of name to IP resolving to use, default is to do both IPV4 and
+ IPV6. + IPV6.
@ -175,7 +258,7 @@ index e090e90..b2770c5 100644
RETRY RELATED ARGUMENTS RETRY RELATED ARGUMENTS
@@ -420,6 +431,7 @@ import time @@ -420,6 +435,7 @@ import time
import string import string
import urllib import urllib
import urllib2 import urllib2
@ -183,7 +266,22 @@ index e090e90..b2770c5 100644
import mimetools import mimetools
import thread import thread
import types import types
@@ -439,6 +451,12 @@ try: @@ -431,6 +447,14 @@ from httplib import HTTPException
import socket
from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+try:
+ import xattr
+ if not hasattr(xattr, 'set'):
+ xattr = None # This is a "newer" API.
+except ImportError:
+ xattr = None
+
+
########################################################################
# MODULE INITIALIZATION
########################################################################
@@ -439,6 +463,12 @@ try:
except: except:
__version__ = '???' __version__ = '???'
@ -196,7 +294,7 @@ index e090e90..b2770c5 100644
######################################################################## ########################################################################
# functions for debugging output. These functions are here because they # functions for debugging output. These functions are here because they
# are also part of the module initialization. # are also part of the module initialization.
@@ -527,6 +545,22 @@ def _(st): @@ -527,6 +557,22 @@ def _(st):
# END MODULE INITIALIZATION # END MODULE INITIALIZATION
######################################################################## ########################################################################
@ -219,7 +317,7 @@ index e090e90..b2770c5 100644
class URLGrabError(IOError): class URLGrabError(IOError):
@@ -662,6 +696,7 @@ class URLParser: @@ -662,6 +708,7 @@ class URLParser:
opts.quote = 0 --> do not quote it opts.quote = 0 --> do not quote it
opts.quote = None --> guess opts.quote = None --> guess
""" """
@ -227,15 +325,59 @@ index e090e90..b2770c5 100644
quote = opts.quote quote = opts.quote
if opts.prefix: if opts.prefix:
@@ -800,6 +835,7 @@ class URLGrabberOptions: @@ -768,6 +815,41 @@ class URLGrabberOptions:
else: # throttle is a float
return self.bandwidth * self.throttle
+ def find_proxy(self, url, scheme):
+ """Find the proxy to use for this URL.
+ Use the proxies dictionary first, then libproxy.
+ """
+ self.proxy = None
+ if scheme not in ('ftp', 'http', 'https'):
+ return
+
+ if self.proxies:
+ proxy = self.proxies.get(scheme)
+ if proxy is None:
+ if scheme == 'http':
+ proxy = self.proxies.get('https')
+ elif scheme == 'https':
+ proxy = self.proxies.get('http')
+ if proxy == '_none_':
+ proxy = ''
+ self.proxy = proxy
+ return
+
+ if self.libproxy:
+ global _libproxy_cache
+ if _libproxy_cache is None:
+ try:
+ import libproxy
+ _libproxy_cache = libproxy.ProxyFactory()
+ except:
+ _libproxy_cache = False
+ if _libproxy_cache:
+ for proxy in _libproxy_cache.getProxies(url):
+ if proxy.startswith('http://'):
+ if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
+ self.proxy = proxy
+ break
+
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
@@ -800,21 +882,25 @@ class URLGrabberOptions:
self.close_connection = 0 self.close_connection = 0
self.range = None self.range = None
self.user_agent = 'urlgrabber/%s' % __version__ self.user_agent = 'urlgrabber/%s' % __version__
+ self.ip_resolve = None + self.ip_resolve = None
self.keepalive = 1 self.keepalive = 1
self.proxies = None self.proxies = None
+ self.libproxy = False
self.reget = None self.reget = None
@@ -808,13 +844,15 @@ class URLGrabberOptions: self.failure_callback = None
self.interrupt_callback = None
self.prefix = None self.prefix = None
self.opener = None self.opener = None
self.cache_openers = True self.cache_openers = True
@ -252,7 +394,7 @@ index e090e90..b2770c5 100644
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticityb self.ssl_verify_peer = True # check peer's cert for authenticityb
@@ -846,7 +884,7 @@ class URLGrabberOptions: @@ -846,7 +932,7 @@ class URLGrabberOptions:
s = s + indent + '}' s = s + indent + '}'
return s return s
@ -261,8 +403,28 @@ index e090e90..b2770c5 100644
"""Provides easy opening of URLs with a variety of options. """Provides easy opening of URLs with a variety of options.
All options are specified as kwargs. Options may be specified when All options are specified as kwargs. Options may be specified when
@@ -931,6 +969,9 @@ class URLGrabber: @@ -912,9 +998,11 @@ class URLGrabber:
returned that supports them. The file object can be treated
like any other file object.
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
def retryfunc(opts, url):
return PyCurlFileObject(url, filename=None, opts=opts)
return self._retry(opts, retryfunc, url)
@@ -925,12 +1013,17 @@ class URLGrabber:
urlgrab returns the filename of the local file, which may be
different from the passed-in filename if copy_local == 0.
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts (scheme, host, path, parm, query, frag) = parts
+ opts.find_proxy(url, scheme)
if filename is None: if filename is None:
filename = os.path.basename( urllib.unquote(path) ) filename = os.path.basename( urllib.unquote(path) )
+ if not filename: + if not filename:
@ -271,7 +433,19 @@ index e090e90..b2770c5 100644
if scheme == 'file' and not opts.copy_local: if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a # just return the name of the local file - don't make a
# copy currently # copy currently
@@ -1030,7 +1071,7 @@ class URLGrabber: @@ -982,9 +1075,11 @@ class URLGrabber:
"I want the first N bytes" but rather 'read the whole file
into memory, but don't use too much'
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
if limit is not None:
limit = limit + 1
@@ -1030,7 +1125,7 @@ class URLGrabber:
default_grabber = URLGrabber() default_grabber = URLGrabber()
@ -280,7 +454,7 @@ index e090e90..b2770c5 100644
def __init__(self, url, filename, opts): def __init__(self, url, filename, opts):
self.fo = None self.fo = None
self._hdr_dump = '' self._hdr_dump = ''
@@ -1052,9 +1093,15 @@ class PyCurlFileObject(): @@ -1052,10 +1147,11 @@ class PyCurlFileObject():
self._reget_length = 0 self._reget_length = 0
self._prog_running = False self._prog_running = False
self._error = (None, None) self._error = (None, None)
@ -289,15 +463,12 @@ index e090e90..b2770c5 100644
+ self._hdr_ended = False + self._hdr_ended = False
self._do_open() self._do_open()
-
+ +
+ def geturl(self):
+ """ Provide the geturl() method, used to be got from
+ urllib.addinfourl, via. urllib.URLopener.* """
+ return self.url
def __getattr__(self, name): def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level. """This effectively allows us to wrap at the instance level.
@@ -1085,9 +1132,14 @@ class PyCurlFileObject(): Any attribute not found in _this_ object will be searched for
@@ -1085,9 +1181,14 @@ class PyCurlFileObject():
return -1 return -1
def _hdr_retrieve(self, buf): def _hdr_retrieve(self, buf):
@ -313,7 +484,7 @@ index e090e90..b2770c5 100644
try: try:
self._hdr_dump += buf self._hdr_dump += buf
# we have to get the size before we do the progress obj start # we have to get the size before we do the progress obj start
@@ -1104,7 +1156,17 @@ class PyCurlFileObject(): @@ -1104,7 +1205,17 @@ class PyCurlFileObject():
s = parse150(buf) s = parse150(buf)
if s: if s:
self.size = int(s) self.size = int(s)
@ -332,7 +503,7 @@ index e090e90..b2770c5 100644
return len(buf) return len(buf)
except KeyboardInterrupt: except KeyboardInterrupt:
return pycurl.READFUNC_ABORT return pycurl.READFUNC_ABORT
@@ -1113,8 +1175,10 @@ class PyCurlFileObject(): @@ -1113,8 +1224,10 @@ class PyCurlFileObject():
if self._parsed_hdr: if self._parsed_hdr:
return self._parsed_hdr return self._parsed_hdr
statusend = self._hdr_dump.find('\n') statusend = self._hdr_dump.find('\n')
@ -343,7 +514,17 @@ index e090e90..b2770c5 100644
self._parsed_hdr = mimetools.Message(hdrfp) self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr return self._parsed_hdr
@@ -1136,11 +1200,21 @@ class PyCurlFileObject(): @@ -1127,6 +1240,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts
+ # keepalives
+ if not opts.keepalive:
+ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1252,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@ -365,7 +546,7 @@ index e090e90..b2770c5 100644
# maybe to be options later # maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1222,11 @@ class PyCurlFileObject(): @@ -1148,9 +1274,11 @@ class PyCurlFileObject():
# timeouts # timeouts
timeout = 300 timeout = 300
@ -380,11 +561,50 @@ index e090e90..b2770c5 100644
# ssl options # ssl options
if self.scheme == 'https': if self.scheme == 'https':
@@ -1203,12 +1279,19 @@ class PyCurlFileObject(): @@ -1158,13 +1286,16 @@ class PyCurlFileObject():
if proxy == '_none_': proxy = "" self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.PROXY, proxy) self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
- self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+ if opts.ssl_verify_host: # 1 is meaningless to curl
+ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
if opts.ssl_key:
self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
if opts.ssl_key_type:
self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
if opts.ssl_cert:
self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+ # if we have a client side cert - turn off reuse b/c nss is odd
+ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
@@ -1187,28 +1318,24 @@ class PyCurlFileObject():
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
- # proxy settings
- if opts.proxies:
- for (scheme, proxy) in opts.proxies.items():
- if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
- if scheme not in ('ftp'):
- continue
- else:
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
- elif self.scheme in ('http', 'https'):
- if scheme not in ('http', 'https'):
- continue
- else:
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
-
- # FIXME username/password/auth settings - # FIXME username/password/auth settings
+ # proxy
+ if opts.proxy is not None:
+ self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
+ self.curl_obj.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
+
+ if opts.username and opts.password: + if opts.username and opts.password:
+ if self.scheme in ('http', 'https'): + if self.scheme in ('http', 'https'):
+ self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY) + self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
@ -402,7 +622,23 @@ index e090e90..b2770c5 100644
# our url # our url
self.curl_obj.setopt(pycurl.URL, self.url) self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,12 +1311,14 @@ class PyCurlFileObject(): @@ -1219,8 +1346,14 @@ class PyCurlFileObject():
return
try:
+ e = None
self.curl_obj.perform()
- except pycurl.error, e:
+ except pycurl.error, e: pass
+ self._do_perform_exc(e)
+
+ def _do_perform_exc(self, e):
+ # handle pycurl exception 'e'
+ if e:
# XXX - break some of these out a bit more clearly
# to other URLGrabErrors from
# http://curl.haxx.se/libcurl/c/libcurl-errors.html
@@ -1228,12 +1361,14 @@ class PyCurlFileObject():
code = self.http_code code = self.http_code
errcode = e.args[0] errcode = e.args[0]
@ -419,7 +655,7 @@ index e090e90..b2770c5 100644
# this is probably wrong but ultimately this is what happens # this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code # we have a legit http code and a pycurl 'writer failed' code
@@ -1244,23 +1329,23 @@ class PyCurlFileObject(): @@ -1244,23 +1379,23 @@ class PyCurlFileObject():
raise KeyboardInterrupt raise KeyboardInterrupt
elif errcode == 28: elif errcode == 28:
@ -450,7 +686,7 @@ index e090e90..b2770c5 100644
# this is probably wrong but ultimately this is what happens # this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code # we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside # which almost always means something aborted it from outside
@@ -1272,33 +1357,93 @@ class PyCurlFileObject(): @@ -1272,33 +1407,94 @@ class PyCurlFileObject():
elif errcode == 58: elif errcode == 58:
msg = _("problem with the local client certificate") msg = _("problem with the local client certificate")
err = URLGrabError(14, msg) err = URLGrabError(14, msg)
@ -519,6 +755,7 @@ index e090e90..b2770c5 100644
+ 42 : _("Aborted by callback"), + 42 : _("Aborted by callback"),
+ 47 : _("Too many redirects"), + 47 : _("Too many redirects"),
+ 51 : _("Peer certificate failed verification"), + 51 : _("Peer certificate failed verification"),
+ 52 : _("Got nothing: SSL certificate expired?"),
+ 53 : _("SSL engine not found"), + 53 : _("SSL engine not found"),
+ 54 : _("SSL engine set failed"), + 54 : _("SSL engine set failed"),
+ 55 : _("Network error send()"), + 55 : _("Network error send()"),
@ -545,13 +782,13 @@ index e090e90..b2770c5 100644
+ else: + else:
+ if self._error[1]: + if self._error[1]:
+ msg = self._error[1] + msg = self._error[1]
+ err = URLGRabError(14, msg) + err = URLGrabError(14, msg)
+ err.url = urllib.unquote(self.url) + err.url = urllib.unquote(self.url)
+ raise err + raise err
def _do_open(self): def _do_open(self):
self.curl_obj = _curl_cache self.curl_obj = _curl_cache
@@ -1333,7 +1478,11 @@ class PyCurlFileObject(): @@ -1333,7 +1529,11 @@ class PyCurlFileObject():
if self.opts.range: if self.opts.range:
rt = self.opts.range rt = self.opts.range
@ -564,36 +801,54 @@ index e090e90..b2770c5 100644
if rt: if rt:
header = range_tuple_to_header(rt) header = range_tuple_to_header(rt)
@@ -1434,9 +1583,13 @@ class PyCurlFileObject(): @@ -1407,22 +1607,7 @@ class PyCurlFileObject():
_was_filename = False
if type(self.filename) in types.StringTypes and self.filename:
_was_filename = True
- self._prog_reportname = str(self.filename)
- self._prog_basename = os.path.basename(self.filename)
-
- if self.append: mode = 'ab'
- else: mode = 'wb'
-
- if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
- (self.filename, mode))
- try:
- self.fo = open(self.filename, mode)
- except IOError, e:
- err = URLGrabError(16, _(\
- 'error opening local file from %s, IOError: %s') % (self.url, e))
- err.url = self.url
- raise err
-
+ self._do_open_fo()
else:
self._prog_reportname = 'MEMORY'
self._prog_basename = 'MEMORY'
@@ -1434,27 +1619,71 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp() #fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb') #self.fo = open(self._temp_name, 'wb')
- -
- self._do_perform() - self._do_perform()
- -
-
-
- if _was_filename:
- # close it up
+ try: + try:
+ self._do_perform() + self._do_perform()
+ except URLGrabError, e: + except URLGrabError, e:
+ self.fo.flush() self.fo.flush()
+ self.fo.close() self.fo.close()
- # set the time
- mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
- if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ raise e + raise e
+ +
+ if _was_filename:
+ self._do_close_fo()
if _was_filename:
@@ -1446,9 +1599,23 @@ class PyCurlFileObject():
# set the time
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
# re open it # re open it
- self.fo = open(self.filename, 'r') - self.fo = open(self.filename, 'r')
+ try: + try:
@ -607,7 +862,61 @@ index e090e90..b2770c5 100644
else: else:
#self.fo = open(self._temp_name, 'r') #self.fo = open(self._temp_name, 'r')
self.fo.seek(0) self.fo.seek(0)
@@ -1532,11 +1699,14 @@ class PyCurlFileObject():
self._complete = True
+ def _do_open_fo(self):
+ self._prog_reportname = str(self.filename)
+ self._prog_basename = os.path.basename(self.filename)
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ self.fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ def _do_close_fo(self):
+ # close it up
+ self.fo.flush()
+ self.fo.close()
+
+ # Set the URL where we got it from:
+ if xattr is not None:
+ # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+ try:
+ xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+ except:
+ pass # URL too long. = IOError ... ignore everything.
+
+ # set the time
+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+ if mod_time != -1:
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
+
def _fill_buffer(self, amt=None):
"""fill the buffer to contain at least 'amt' bytes by reading
from the underlying file object. If amt is None, then it will
@@ -1526,17 +1755,20 @@ class PyCurlFileObject():
if self._prog_running:
downloaded += self._reget_length
self.opts.progress_obj.update(downloaded)
- except KeyboardInterrupt:
+ except (KeyboardInterrupt, IOError):
return -1
def _over_max_size(self, cur, max_size=None): def _over_max_size(self, cur, max_size=None):
if not max_size: if not max_size:
@ -626,7 +935,7 @@ index e090e90..b2770c5 100644
msg = _("Downloaded more than max size for %s: %s > %s") \ msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size) % (self.url, cur, max_size)
@@ -1544,13 +1714,6 @@ class PyCurlFileObject(): @@ -1544,13 +1776,6 @@ class PyCurlFileObject():
return True return True
return False return False
@ -640,7 +949,7 @@ index e090e90..b2770c5 100644
def read(self, amt=None): def read(self, amt=None):
self._fill_buffer(amt) self._fill_buffer(amt)
if amt is None: if amt is None:
@@ -1582,9 +1745,21 @@ class PyCurlFileObject(): @@ -1582,9 +1807,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read) self.opts.progress_obj.end(self._amount_read)
self.fo.close() self.fo.close()
@ -658,7 +967,7 @@ index e090e90..b2770c5 100644
+ _curl_cache.close() + _curl_cache.close()
+ _curl_cache = pycurl.Curl() + _curl_cache = pycurl.Curl()
+ +
+ +_libproxy_cache = None
+ +
##################################################################### #####################################################################
@ -687,10 +996,199 @@ index dad410b..8731aed 100644
return parsed_mirrors return parsed_mirrors
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644 index dd07c6a..3d7e99a 100644
--- a/urlgrabber/progress.py --- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py +++ b/urlgrabber/progress.py
@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0): @@ -211,6 +211,21 @@ def text_meter_total_size(size, downloaded=0):
# 4. + ( 5, total: 32)
#
+def _term_add_bar(tl, bar_max_length, pc):
+ blen = bar_max_length
+ bar = '='*int(blen * pc)
+ if (blen * pc) - int(blen * pc) >= 0.5:
+ bar += '-'
+ return tl.add(' [%-*.*s]' % (blen, blen, bar))
+
+def _term_add_end(tl, osize, size):
+ if osize is not None:
+ if size > osize: # Is ??? better? Really need something to say < vs >.
+ return tl.add(' !!! '), True
+ elif size != osize:
+ return tl.add(' ... '), True
+ return tl.add(' ' * 5), False
+
class TextMeter(BaseMeter):
def __init__(self, fo=sys.stderr):
BaseMeter.__init__(self)
@@ -259,13 +274,10 @@ class TextMeter(BaseMeter):
ui_rate = tl.add(' %5sB/s' % ave_dl)
# Make text grow a bit before we start growing the bar too
blen = 4 + tl.rest_split(8 + 8 + 4)
- bar = '='*int(blen * frac)
- if (blen * frac) - int(blen * frac) >= 0.5:
- bar += '-'
- ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
- out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
- ui_sofar_pc, ui_pc, ui_bar,
- ui_rate, ui_size, ui_time, ui_end)
+ ui_bar = _term_add_bar(tl, blen, frac)
+ out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_pc, ui_bar,
+ ui_rate,ui_size,ui_time, ui_end)
self.fo.write(out)
self.fo.flush()
@@ -284,12 +296,7 @@ class TextMeter(BaseMeter):
tl = TerminalLine(8)
ui_size = tl.add(' | %5sB' % total_size)
ui_time = tl.add(' %9s' % total_time)
- not_done = self.size is not None and amount_read != self.size
- if not_done:
- ui_end = tl.add(' ... ')
- else:
- ui_end = tl.add(' ' * 5)
-
+ ui_end, not_done = _term_add_end(tl, self.size, amount_read)
out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
ui_size, ui_time, ui_end)
self.fo.write(out)
@@ -331,12 +338,21 @@ class MultiFileHelper(BaseMeter):
def message(self, message):
self.master.message_meter(self, message)
+class _FakeLock:
+ def acquire(self):
+ pass
+ def release(self):
+ pass
+
class MultiFileMeter:
helperclass = MultiFileHelper
- def __init__(self):
+ def __init__(self, threaded=True):
self.meters = []
self.in_progress_meters = []
- self._lock = thread.allocate_lock()
+ if threaded:
+ self._lock = thread.allocate_lock()
+ else:
+ self._lock = _FakeLock()
self.update_period = 0.3 # seconds
self.numfiles = None
@@ -369,6 +385,7 @@ class MultiFileMeter:
def end(self, now=None):
if now is None: now = time.time()
+ self.re.update(self._amount_read(), now)
self._do_end(now)
def _do_end(self, now):
@@ -466,11 +483,20 @@ class MultiFileMeter:
class TextMultiFileMeter(MultiFileMeter):
- def __init__(self, fo=sys.stderr):
+ def __init__(self, fo=sys.stderr, threaded=True):
self.fo = fo
- MultiFileMeter.__init__(self)
+ MultiFileMeter.__init__(self, threaded)
# files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
+# New output, like TextMeter output...
+# update: Size, All files
+# -----------------------
+# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA
+# 8-22 1 3-4 1 6-12 1 8 3 6 1 9 1 3 1
+# end
+# ---
+# <text> | <file size> <file elapsed time>
+# 8-56 3 6 1 9 5
def _do_update_meter(self, meter, now):
self._lock.acquire()
try:
@@ -480,7 +506,7 @@ class TextMultiFileMeter(MultiFileMeter):
tf = self.numfiles or 1
pf = 100 * float(df)/tf + 0.49
dd = self.re.last_amount_read
- td = self.total_size
+ td = self.re.total
pd = 100 * (self.re.fraction_read() or 0) + 0.49
dt = self.re.elapsed_time()
rt = self.re.remaining_time()
@@ -491,9 +517,33 @@ class TextMultiFileMeter(MultiFileMeter):
ftd = format_number(td) + 'B'
fdt = format_time(dt, 1)
ftt = format_time(tt, 1)
-
- out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
- self.fo.write('\r' + out)
+
+ frac = self.re.fraction_read() or 0
+ ave_dl = format_number(self.re.average_rate())
+ text = meter.text or meter.basename
+ if tf > 1:
+ text = '(%u/%u): %s' % (df+1, tf, text)
+
+ # Include text + ui_rate in minimal
+ tl = TerminalLine(8, 8+1+8)
+
+ ui_size = tl.add(' | %5sB' % format_number(dd))
+
+ ui_time = tl.add(' %9s' % format_time(rt))
+ ui_end = tl.add(' ETA ')
+
+ ui_sofar_pc = tl.add(' %i%%' % pf,
+ full_len=len(" (100%)"))
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+
+ # Make text grow a bit before we start growing the bar too
+ blen = 4 + tl.rest_split(8 + 8 + 4)
+ ui_bar = _term_add_bar(tl, blen, frac)
+ out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_bar,
+ ui_rate, ui_size, ui_time,
+ ui_end)
+ self.fo.write(out)
self.fo.flush()
finally:
self._lock.release()
@@ -502,15 +552,28 @@ class TextMultiFileMeter(MultiFileMeter):
self._lock.acquire()
try:
format = "%-30.30s %6.6s %8.8s %9.9s"
- fn = meter.basename
+ fn = meter.text or meter.basename
size = meter.last_amount_read
fsize = format_number(size) + 'B'
et = meter.re.elapsed_time()
fet = format_time(et, 1)
- frate = format_number(size / et) + 'B/s'
-
- out = '%-79.79s' % (format % (fn, fsize, fet, frate))
- self.fo.write('\r' + out + '\n')
+ frate = format_number(et and size / et) + 'B/s'
+ df = self.finished_files
+ tf = self.numfiles or 1
+
+ total_time = format_time(et)
+ total_size = format_number(size)
+ text = meter.text or meter.basename
+ if tf > 1:
+ text = '(%u/%u): %s' % (df, tf, text)
+
+ tl = TerminalLine(8)
+ ui_size = tl.add(' | %5sB' % total_size)
+ ui_time = tl.add(' %9s' % total_time)
+ ui_end, not_done = _term_add_end(tl, meter.size, size)
+ out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
+ ui_size, ui_time, ui_end)
+ self.fo.write(out)
finally:
self._lock.release()
self._do_update_meter(meter, now)
@@ -658,6 +721,8 @@ def format_time(seconds, use_hours=0):
if seconds is None or seconds < 0: if seconds is None or seconds < 0:
if use_hours: return '--:--:--' if use_hours: return '--:--:--'
else: return '--:--' else: return '--:--'
