update to latest head, multi-downloader

epel9
Zdeněk Pavlas 13 years ago
parent dadfd07b88
commit a16417493c

@ -0,0 +1,904 @@
diff --git a/scripts/urlgrabber-ext-down b/scripts/urlgrabber-ext-down
new file mode 100755
index 0000000..c37e6a8
--- /dev/null
+++ b/scripts/urlgrabber-ext-down
@@ -0,0 +1,55 @@
+#! /usr/bin/python
+# A very simple external downloader
+
+import time, os, errno, sys
+from urlgrabber.grabber import \
+ _readlines, URLGrabberOptions, _loads, \
+ PyCurlFileObject, URLGrabError
+
+def write(fmt, *arg):
+ try: os.write(1, fmt % arg)
+ except OSError, e:
+ if e.args[0] != errno.EPIPE: raise
+ sys.exit(1)
+
+class ProxyProgress:
+ def start(self, *d1, **d2):
+ self.next_update = 0
+ def update(self, _amount_read):
+ t = time.time()
+ if t < self.next_update: return
+ self.next_update = t + 0.31
+ write('%d %d\n', self._id, _amount_read)
+
+def main():
+ import signal
+ signal.signal(signal.SIGINT, lambda n, f: sys.exit(1))
+ cnt = 0
+ while True:
+ lines = _readlines(0)
+ if not lines: break
+ for line in lines:
+ cnt += 1
+ opts = URLGrabberOptions()
+ opts._id = cnt
+ for k in line.split(' '):
+ k, v = k.split('=', 1)
+ setattr(opts, k, _loads(v))
+ if opts.progress_obj:
+ opts.progress_obj = ProxyProgress()
+ opts.progress_obj._id = cnt
+ tm = time.time()
+ try:
+ fo = PyCurlFileObject(opts.url, opts.filename, opts)
+ fo._do_grab()
+ fo.fo.close()
+ size = fo._amount_read
+ dlsz = size - fo._reget_length
+ ug_err = 'OK'
+ except URLGrabError, e:
+ size = dlsz = 0
+ ug_err = '%d %s' % e.args
+ write('%d %d %d %.3f %s\n', opts._id, size, dlsz, time.time() - tm, ug_err)
+
+if __name__ == '__main__':
+ main()
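The parent process and this helper speak a simple line protocol: each request is one line of space-separated key=value pairs (values encoded with _dumps), and the helper replies with progress lines of the form "<id> <bytes read>" plus one completion line "<id> <size> <downloaded> <seconds> <status>", where <status> is OK or an error code followed by a message. A hypothetical exchange (URL, path and byte counts are invented for illustration):

  request  :  url='http://example.com/f' filename='/tmp/f' progress_obj=True
  progress :  1 4096
  finished :  1 10240 10240 0.153 OK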
diff --git a/setup.py b/setup.py
index d0b87b8..bfa4a18 100644
--- a/setup.py
+++ b/setup.py
@@ -15,8 +15,10 @@ url = _urlgrabber.__url__
packages = ['urlgrabber']
package_dir = {'urlgrabber':'urlgrabber'}
scripts = ['scripts/urlgrabber']
-data_files = [('share/doc/' + name + '-' + version,
- ['README','LICENSE', 'TODO', 'ChangeLog'])]
+data_files = [
+ ('share/doc/' + name + '-' + version, ['README','LICENSE', 'TODO', 'ChangeLog']),
+ ('libexec', ['scripts/urlgrabber-ext-down']),
+]
options = { 'clean' : { 'all' : 1 } }
classifiers = [
'Development Status :: 4 - Beta',
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index 38ae1f7..094be77 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -263,6 +263,33 @@ GENERAL ARGUMENTS (kwargs)
What type of name to IP resolving to use, default is to do both IPV4 and
IPV6.
+ async = (key, limit)
+
+ When this option is set, the urlgrab() request is not processed
+ immediately but queued. parallel_wait() then processes grabs in
+ parallel, limiting the number of connections in each 'key' group
+ to at most 'limit'.
+
+ max_connections
+
+ The global connection limit.
+
+ timedhosts
+
+ The filename of the host download statistics. If defined, urlgrabber
+ will update the stats at the end of every download. At the end of
+ parallel_wait(), the updated stats are saved. If synchronous grabs
+ are used, you should call th_save().
+
+ default_speed, half_life
+
+ These options only affect the async mirror selection code.
+ The default_speed option sets the speed estimate for mirrors
+ we have never downloaded from, and defaults to 1 MB/s.
+
+ The speed estimate also drifts exponentially from the speed
+ actually measured to the default speed, with a default
+ half-life of 30 days.
+
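A minimal usage sketch of the queued interface (the URLs, filenames and limit below are illustrative, not part of the API):

from urlgrabber.grabber import default_grabber, parallel_wait

for name in ('a', 'b', 'c'):
    # group the grabs under one key, at most 2 parallel connections for it
    default_grabber.urlgrab('http://example.com/%s' % name,
                            filename='/tmp/%s' % name,
                            async=('example.com', 2),
                            failfunc=lambda obj: None)  # swallow failures instead of raising
parallel_wait()  # drives the queue; saves timedhosts stats if configured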
RETRY RELATED ARGUMENTS
@@ -343,6 +370,15 @@ RETRY RELATED ARGUMENTS
but it cannot (without severe trickiness) prevent the exception
from being raised.
+ failfunc = None
+
+ The callback that gets called when a urlgrab() request fails.
+ If defined, urlgrab() calls it instead of raising URLGrabError.
+ Callback syntax is identical to failure_callback.
+
+ Unlike failure_callback, it is called only once. Its primary
+ purpose is to use urlgrab() without a try/except block.
+
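A sketch of failfunc in the synchronous case (the URL and handler are illustrative):

from urlgrabber.grabber import default_grabber

failures = []
def failed(obj):
    # obj.exception is the URLGrabError that would otherwise be raised
    failures.append(obj.exception)

default_grabber.urlgrab('http://example.com/missing', '/tmp/missing',
                        failfunc=failed)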
interrupt_callback = None
This callback is called if KeyboardInterrupt is received at any
@@ -444,7 +480,7 @@ import pycurl
from ftplib import parse150
from StringIO import StringIO
from httplib import HTTPException
-import socket
+import socket, select, fcntl
from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
try:
@@ -878,6 +914,7 @@ class URLGrabberOptions:
self.retry = None
self.retrycodes = [-1,2,4,5,6,7]
self.checkfunc = None
+ self.failfunc = _do_raise
self.copy_local = 0
self.close_connection = 0
self.range = None
@@ -886,6 +923,7 @@ class URLGrabberOptions:
self.keepalive = 1
self.proxies = None
self.libproxy = False
+ self.proxy = None
self.reget = None
self.failure_callback = None
self.interrupt_callback = None
@@ -913,6 +951,12 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
+ self.async = None # blocking by default
+ self.mirror_group = None
+ self.max_connections = 5
+ self.timedhosts = None
+ self.half_life = 30*24*60*60 # 30 days
+ self.default_speed = 1e6 # 1 MB/s
def __repr__(self):
return self.format()
@@ -932,6 +976,17 @@ class URLGrabberOptions:
s = s + indent + '}'
return s
+def _do_raise(obj):
+ raise obj.exception
+
+def _run_callback(cb, obj):
+ if not cb:
+ return
+ if callable(cb):
+ return cb(obj)
+ cb, arg, karg = cb
+ return cb(obj, *arg, **karg)
+
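_run_callback() accepts either a bare callable or the (callable, args, kwargs) tuple form that checkfunc and failure_callback already support. A small illustration with made-up names:

from urlgrabber.grabber import URLGrabber

def check(obj, tag=''):
    print 'verified %s %s' % (obj.url, tag)

g1 = URLGrabber(checkfunc=check)                       # invoked as check(obj)
g2 = URLGrabber(checkfunc=(check, ('mirror A',), {}))  # invoked as check(obj, 'mirror A')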
class URLGrabber(object):
"""Provides easy opening of URLs with a variety of options.
@@ -977,10 +1032,9 @@ class URLGrabber(object):
if DEBUG: DEBUG.info('exception: %s', exception)
if callback:
if DEBUG: DEBUG.info('calling callback: %s', callback)
- cb_func, cb_args, cb_kwargs = self._make_callback(callback)
obj = CallbackObject(exception=exception, url=args[0],
tries=tries, retry=opts.retry)
- cb_func(obj, *cb_args, **cb_kwargs)
+ _run_callback(callback, obj)
if (opts.retry is None) or (tries == opts.retry):
if DEBUG: DEBUG.info('retries exceeded, re-raising')
@@ -1043,30 +1097,36 @@ class URLGrabber(object):
elif not opts.range:
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.filename = path
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(filename=path, url=url)
+ _run_callback(opts.checkfunc, obj)
return path
+ if opts.async:
+ opts.url = url
+ opts.filename = filename
+ opts.size = int(opts.size or 0)
+ _async_queue.append(opts)
+ return filename
+
def retryfunc(opts, url, filename):
+ tm = time.time()
fo = PyCurlFileObject(url, filename, opts)
try:
fo._do_grab()
+ _TH.update(url, fo._amount_read - fo._reget_length, time.time() - tm, None)
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.filename = filename
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(filename=filename, url=url)
+ _run_callback(opts.checkfunc, obj)
finally:
fo.close()
return filename
- return self._retry(opts, retryfunc, url, filename)
+ try:
+ return self._retry(opts, retryfunc, url, filename)
+ except URLGrabError, e:
+ _TH.update(url, 0, 0, e)
+ opts.exception = e
+ return _run_callback(opts.failfunc, opts)
def urlread(self, url, limit=None, **kwargs):
"""read the url into a string, up to 'limit' bytes
@@ -1095,12 +1155,8 @@ class URLGrabber(object):
else: s = fo.read(limit)
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.data = s
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(data=s, url=url)
+ _run_callback(opts.checkfunc, obj)
finally:
fo.close()
return s
@@ -1115,6 +1171,7 @@ class URLGrabber(object):
return s
def _make_callback(self, callback_obj):
+ # not used, left for compatibility
if callable(callback_obj):
return callback_obj, (), {}
else:
@@ -1346,14 +1403,8 @@ class PyCurlFileObject(object):
return
try:
- e = None
self.curl_obj.perform()
- except pycurl.error, e: pass
- self._do_perform_exc(e)
-
- def _do_perform_exc(self, e):
- # handle pycurl exception 'e'
- if e:
+ except pycurl.error, e:
# XXX - break some of these out a bit more clearly
# to other URLGrabErrors from
# http://curl.haxx.se/libcurl/c/libcurl-errors.html
@@ -1607,7 +1658,22 @@ class PyCurlFileObject(object):
_was_filename = False
if type(self.filename) in types.StringTypes and self.filename:
_was_filename = True
- self._do_open_fo()
+ self._prog_reportname = str(self.filename)
+ self._prog_basename = os.path.basename(self.filename)
+
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ self.fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
else:
self._prog_reportname = 'MEMORY'
self._prog_basename = 'MEMORY'
@@ -1627,7 +1693,29 @@ class PyCurlFileObject(object):
raise e
if _was_filename:
- self._do_close_fo()
+ # close it up
+ self.fo.flush()
+ self.fo.close()
+
+ # Set the URL where we got it from:
+ if xattr is not None:
+ # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+ try:
+ xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+ except:
+ pass # URL too long. = IOError ... ignore everything.
+
+ # set the time
+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+ if mod_time != -1:
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
# re open it
try:
self.fo = open(self.filename, 'r')
@@ -1643,47 +1731,6 @@ class PyCurlFileObject(object):
self._complete = True
- def _do_open_fo(self):
- self._prog_reportname = str(self.filename)
- self._prog_basename = os.path.basename(self.filename)
- if self.append: mode = 'ab'
- else: mode = 'wb'
-
- if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
- (self.filename, mode))
- try:
- self.fo = open(self.filename, mode)
- except IOError, e:
- err = URLGrabError(16, _(\
- 'error opening local file from %s, IOError: %s') % (self.url, e))
- err.url = self.url
- raise err
-
- def _do_close_fo(self):
- # close it up
- self.fo.flush()
- self.fo.close()
-
- # Set the URL where we got it from:
- if xattr is not None:
- # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
- try:
- xattr.set(self.filename, 'user.xdg.origin.url', self.url)
- except:
- pass # URL too long. = IOError ... ignore everything.
-
- # set the time
- mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
- if mod_time != -1:
- try:
- os.utime(self.filename, (mod_time, mod_time))
- except OSError, e:
- err = URLGrabError(16, _(\
- 'error setting timestamp on file %s from %s, OSError: %s')
- % (self.filename, self.url, e))
- err.url = self.url
- raise err
-
def _fill_buffer(self, amt=None):
"""fill the buffer to contain at least 'amt' bytes by reading
from the underlying file object. If amt is None, then it will
@@ -1858,6 +1905,425 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
#####################################################################
+# Serializer + parser: A replacement of the rather bulky Json code.
+#
+# - handles basic python literals, lists and tuples.
+# - serialized strings never contain ' ' or '\n'
+#
+#####################################################################
+
+_quoter_map = {}
+for c in '%[(,)] \n':
+ _quoter_map[c] = '%%%02x' % ord(c)
+del c
+
+def _dumps(v):
+ if v is None: return 'None'
+ if v is True: return 'True'
+ if v is False: return 'False'
+ if type(v) in (int, long, float):
+ return str(v)
+ if type(v) == unicode:
+ v = v.encode('UTF8')
+ if type(v) == str:
+ def quoter(c): return _quoter_map.get(c, c)
+ return "'%s'" % ''.join(map(quoter, v))
+ if type(v) == tuple:
+ return "(%s)" % ','.join(map(_dumps, v))
+ if type(v) == list:
+ return "[%s]" % ','.join(map(_dumps, v))
+ raise TypeError, 'Can\'t serialize %s' % v
+
+def _loads(s):
+ def decode(v):
+ if v == 'None': return None
+ if v == 'True': return True
+ if v == 'False': return False
+ try: return int(v)
+ except ValueError: pass
+ try: return float(v)
+ except ValueError: pass
+ if len(v) >= 2 and v[0] == v[-1] == "'":
+ ret = []; i = 1
+ while True:
+ j = v.find('%', i)
+ ret.append(v[i:j]) # skips the final "'"
+ if j == -1: break
+ ret.append(chr(int(v[j + 1:j + 3], 16)))
+ i = j + 3
+ v = ''.join(ret)
+ return v
+ stk = None
+ l = []
+ i = j = 0
+ while True:
+ if j == len(s) or s[j] in ',)]':
+ if j > i:
+ l.append(decode(s[i:j]))
+ if j == len(s): break
+ if s[j] in ')]':
+ if s[j] == ')':
+ l = tuple(l)
+ stk[0].append(l)
+ l, stk = stk
+ i = j = j + 1
+ elif s[j] in '[(':
+ stk = l, stk
+ l = []
+ i = j = j + 1
+ else:
+ j += 1 # safe because '[(,)]' are quoted
+ if stk: raise ValueError
+ if len(l) == 1: l = l[0]
+ return l
+
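A round-trip sketch of the serializer (values chosen only to exercise the quoting; note that the encoded form never contains a space or newline):

v = ['http://example.com/a b', ('range', 0, 1024), None, True, 3.5]
s = _dumps(v)   # "['http://example.com/a%20b',('range',0,1024),None,True,3.5]"
assert _loads(s) == v
assert ' ' not in s and '\n' not in s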
+
+#####################################################################
+# External downloader process
+#####################################################################
+
+def _readlines(fd):
+ buf = os.read(fd, 4096)
+ if not buf: return None
+ # whole lines only, no buffering
+ while buf[-1] != '\n':
+ buf += os.read(fd, 4096)
+ return buf[:-1].split('\n')
+
+import subprocess
+
+class _ExternalDownloader:
+ def __init__(self):
+ self.popen = subprocess.Popen(
+ '/usr/libexec/urlgrabber-ext-down',
+ stdin = subprocess.PIPE,
+ stdout = subprocess.PIPE,
+ )
+ self.stdin = self.popen.stdin.fileno()
+ self.stdout = self.popen.stdout.fileno()
+ self.running = {}
+ self.cnt = 0
+
+ # list of options we pass to downloader
+ _options = (
+ 'url', 'filename',
+ 'timeout', 'close_connection', 'keepalive',
+ 'throttle', 'bandwidth', 'range', 'reget',
+ 'user_agent', 'http_headers', 'ftp_headers',
+ 'proxy', 'prefix', 'username', 'password',
+ 'ssl_ca_cert',
+ 'ssl_cert', 'ssl_cert_type',
+ 'ssl_key', 'ssl_key_type',
+ 'ssl_key_pass',
+ 'ssl_verify_peer', 'ssl_verify_host',
+ 'size', 'max_header_size', 'ip_resolve',
+ )
+
+ def start(self, opts):
+ arg = []
+ for k in self._options:
+ v = getattr(opts, k)
+ if v is None: continue
+ arg.append('%s=%s' % (k, _dumps(v)))
+ if opts.progress_obj:
+ arg.append('progress_obj=True')
+ arg = ' '.join(arg)
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+
+ self.cnt += 1
+ self.running[self.cnt] = opts
+ os.write(self.stdin, arg +'\n')
+
+ def perform(self):
+ ret = []
+ lines = _readlines(self.stdout)
+ if not lines:
+ if DEBUG: DEBUG.info('downloader died')
+ raise KeyboardInterrupt
+ for line in lines:
+ # parse downloader output
+ line = line.split(' ', 5)
+ _id, size = map(int, line[:2])
+ if len(line) == 2:
+ opts = self.running[_id]
+ m = opts.progress_obj
+ if m:
+ if not m.last_update_time:
+ m.start(text = opts.text)
+ m.update(size)
+ continue
+ # job done
+ opts = self.running.pop(_id)
+ if line[4] == 'OK':
+ ug_err = None
+ if DEBUG: DEBUG.info('success')
+ else:
+ ug_err = URLGrabError(int(line[4]), line[5])
+ if DEBUG: DEBUG.info('failure: %s', ug_err)
+ _TH.update(opts.url, int(line[2]), float(line[3]), ug_err)
+ ret.append((opts, size, ug_err))
+ return ret
+
+ def abort(self):
+ self.popen.stdin.close()
+ self.popen.stdout.close()
+ self.popen.wait()
+
+class _ExternalDownloaderPool:
+ def __init__(self):
+ self.epoll = select.epoll()
+ self.running = {}
+ self.cache = {}
+
+ def start(self, opts):
+ host = urlparse.urlsplit(opts.url).netloc
+ dl = self.cache.pop(host, None)
+ if not dl:
+ dl = _ExternalDownloader()
+ fl = fcntl.fcntl(dl.stdin, fcntl.F_GETFD)
+ fcntl.fcntl(dl.stdin, fcntl.F_SETFD, fl | fcntl.FD_CLOEXEC)
+ self.epoll.register(dl.stdout, select.EPOLLIN)
+ self.running[dl.stdout] = dl
+ dl.start(opts)
+
+ def perform(self):
+ ret = []
+ for fd, event in self.epoll.poll():
+ assert event & select.EPOLLIN
+ done = self.running[fd].perform()
+ if not done: continue
+ assert len(done) == 1
+ ret.extend(done)
+
+ # dl finished, move it to the cache
+ host = urlparse.urlsplit(done[0][0].url).netloc
+ if host in self.cache: self.cache[host].abort()
+ self.epoll.unregister(fd)
+ self.cache[host] = self.running.pop(fd)
+ return ret
+
+ def abort(self):
+ for dl in self.running.values():
+ self.epoll.unregister(dl.stdout)
+ dl.abort()
+ for dl in self.cache.values():
+ dl.abort()
+
+
+#####################################################################
+# High level async API
+#####################################################################
+
+_async_queue = []
+
+def parallel_wait(meter = 'text'):
+ '''Process queued requests in parallel.
+ '''
+
+ if meter:
+ count = total = 0
+ for opts in _async_queue:
+ count += 1
+ total += opts.size
+ if meter == 'text':
+ from progress import TextMultiFileMeter
+ meter = TextMultiFileMeter()
+ meter.start(count, total)
+
+ dl = _ExternalDownloaderPool()
+ host_con = {} # current host connection counts
+
+ def start(opts, tries):
+ key, limit = opts.async
+ host_con[key] = host_con.get(key, 0) + 1
+ opts.tries = tries
+ opts.progress_obj = meter and meter.newMeter()
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+ dl.start(opts)
+
+ def perform():
+ for opts, size, ug_err in dl.perform():
+ key, limit = opts.async
+ host_con[key] -= 1
+ if meter:
+ m = opts.progress_obj
+ m.basename = os.path.basename(opts.filename)
+ if ug_err:
+ m.failure(ug_err.args[1])
+ else:
+ # file size might have changed
+ meter.re.total += size - opts.size
+ m.end(size)
+ meter.removeMeter(m)
+
+ if ug_err is None:
+ if opts.checkfunc:
+ try: _run_callback(opts.checkfunc, opts)
+ except URLGrabError, ug_err: pass
+ if ug_err is None:
+ continue
+
+ retry = opts.retry or 0
+ if opts.failure_callback:
+ opts.exception = ug_err
+ try: _run_callback(opts.failure_callback, opts)
+ except URLGrabError, ug_err:
+ retry = 0 # no retries
+ if opts.tries < retry and ug_err.args[0] in opts.retrycodes:
+ start(opts, opts.tries + 1) # simple retry
+ continue
+
+ if opts.mirror_group:
+ mg, failed = opts.mirror_group
+ opts.mirror = key
+ opts.exception = ug_err
+ action = _run_callback(mg.failure_callback, opts)
+ if not (action and action.get('fail')):
+ # mask this mirror and retry
+ failed.add(key)
+ _async_queue.append(opts)
+ continue
+
+ # urlgrab failed
+ opts.exception = ug_err
+ _run_callback(opts.failfunc, opts)
+
+ try:
+ idx = 0
+ while True:
+ if idx >= len(_async_queue):
+ # the queue is empty
+ if not dl.running: break
+ # pending dl may extend it
+ perform()
+ continue
+
+ # handle next request
+ opts = _async_queue[idx]
+ idx += 1
+
+ # check global limit
+ while len(dl.running) >= opts.max_connections:
+ perform()
+
+ if opts.mirror_group:
+ mg, failed = opts.mirror_group
+
+ # find the best mirror
+ best = None
+ for mirror in mg.mirrors:
+ key = mirror['mirror']
+ if key in failed: continue
+
+ # estimate mirror speed
+ speed = _TH.estimate(key)
+ speed /= 1 + host_con.get(key, 0)
+ if best is None or speed > best_speed:
+ best = mirror
+ best_speed = speed
+
+ if best is None:
+ opts.exception = URLGrabError(256, _('No more mirrors to try.'))
+ _run_callback(opts.failfunc, opts)
+ continue
+
+ # update the current mirror and limit
+ key = best['mirror']
+ limit = best.get('kwargs', {}).get('max_connections', 3)
+ opts.async = key, limit
+
+ # update URL and proxy
+ url = mg._join_url(key, opts.relative_url)
+ url, parts = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
+ opts.url = url
+
+ # check host limit, then start
+ key, limit = opts.async
+ while host_con.get(key, 0) >= limit:
+ perform()
+ start(opts, 1)
+ except IOError, e:
+ if e.errno != 4: raise
+ raise KeyboardInterrupt
+
+ finally:
+ dl.abort()
+ if meter: meter.end()
+ del _async_queue[:]
+ _TH.save()
+
+
+#####################################################################
+# Host bandwidth estimation
+#####################################################################
+
+class _TH:
+ hosts = {}
+ dirty = None
+
+ @staticmethod
+ def load():
+ filename = default_grabber.opts.timedhosts
+ if filename and _TH.dirty is None:
+ try:
+ for line in open(filename):
+ host, speed, fail, ts = line.split()
+ _TH.hosts[host] = int(speed), int(fail), int(ts)
+ except IOError: pass
+ _TH.dirty = False
+
+ @staticmethod
+ def save():
+ filename = default_grabber.opts.timedhosts
+ if filename and _TH.dirty is True:
+ tmp = '%s.%d' % (filename, os.getpid())
+ try:
+ f = open(tmp, 'w')
+ for host in _TH.hosts:
+ f.write(host + ' %d %d %d\n' % _TH.hosts[host])
+ f.close()
+ os.rename(tmp, filename)
+ except IOError: pass
+ _TH.dirty = False
+
+ @staticmethod
+ def update(url, dl_size, dl_time, ug_err):
+ _TH.load()
+ host = urlparse.urlsplit(url).netloc
+ speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0)
+ now = time.time()
+
+ if ug_err is None:
+ # k1: the older, the less useful
+ # k2: if it was <1MiB, don't trust it much
+ # speeds vary, use 10:1 smoothing
+ k1 = 2**((ts - now) / default_grabber.opts.half_life)
+ k2 = min(dl_size / 1e6, 1.0) / 10
+ speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2)
+ fail = 0
+ elif getattr(ug_err, 'code', None) == 404:
+ fail = 0 # alive, at least
+ else:
+ fail += 1 # seems dead
+
+ _TH.hosts[host] = speed, fail, now
+ _TH.dirty = True
+
+ @staticmethod
+ def estimate(url):
+ _TH.load()
+ host = urlparse.urlsplit(url).netloc
+ default_speed = default_grabber.opts.default_speed
+ try: speed, fail, ts = _TH.hosts[host]
+ except KeyError: return default_speed
+
+ speed *= 2**-fail
+ k = 2**((ts - time.time()) / default_grabber.opts.half_life)
+ speed = k * speed + (1 - k) * default_speed
+ return speed
+
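The timedhosts file stores one "host speed fail timestamp" line per host, and estimate() decays the recorded speed toward default_speed. A worked example with illustrative numbers, for a host last measured at 4 MB/s exactly one half_life ago:

speed, fail, age = 4e6, 0, 30*24*60*60
k = 2.0 ** (-float(age) / (30*24*60*60))      # 0.5 after one half-life
est = (2.0 ** -fail) * speed * k + (1 - k) * 1e6
# est == 2.5e6 (2.5 MB/s); with fail=1 the measured speed is halved
# first, giving 1.5e6 instead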
+#####################################################################
# TESTING
def _main_test():
try: url, filename = sys.argv[1:3]
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index 8731aed..d699b61 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -76,6 +76,9 @@ CUSTOMIZATION
'grabber' is omitted, the default grabber will be used. If
kwargs are omitted, then (duh) they will not be used.
+ kwarg 'max_connections' is used to store the max connection
+ limit of this mirror.
+
3) Pass keyword arguments when instantiating the mirror group.
See, for example, the failure_callback argument.
@@ -91,6 +94,7 @@ import random
import thread # needed for locking to make this threadsafe
from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
+from grabber import _run_callback, _do_raise, _async_queue
def _(st):
return st
@@ -254,7 +258,7 @@ class MirrorGroup:
# if these values are found in **kwargs passed to one of the urlXXX
# methods, they will be stripped before getting passed on to the
# grabber
- options = ['default_action', 'failure_callback']
+ options = ['default_action', 'failure_callback', 'failfunc']
def _process_kwargs(self, kwargs):
self.failure_callback = kwargs.get('failure_callback')
@@ -403,10 +407,25 @@ class MirrorGroup:
self._failure(gr, obj)
def urlgrab(self, url, filename=None, **kwargs):
+ if kwargs.get('async'):
+ opts = self.grabber.opts.derive(**kwargs)
+ opts.mirror_group = self, set()
+ opts.relative_url = _to_utf8(url)
+
+ opts.url = 'http://tbd'
+ opts.filename = filename
+ opts.size = int(opts.size or 0)
+ _async_queue.append(opts)
+ return filename
+
kw = dict(kwargs)
kw['filename'] = filename
func = 'urlgrab'
- return self._mirror_try(func, url, kw)
+ try:
+ return self._mirror_try(func, url, kw)
+ except URLGrabError, e:
+ obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
+ return _run_callback(kwargs.get('failfunc', _do_raise), obj)
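A sketch of an asynchronous grab through a mirror group (hosts, paths and limits are illustrative; the per-mirror 'max_connections' kwarg feeds the async scheduler, which otherwise assumes 3 connections per mirror):

from urlgrabber.grabber import default_grabber, parallel_wait
from urlgrabber.mirror import MirrorGroup

mirrors = [
    {'mirror': 'http://a.example.com/repo/', 'kwargs': {'max_connections': 4}},
    {'mirror': 'http://b.example.com/repo/'},
]
mg = MirrorGroup(default_grabber, mirrors)

def failed(obj):
    print 'giving up on %s: %s' % (obj.url, obj.exception)

# the (key, limit) given here only marks the request as queued; the async
# code re-keys it to whichever mirror gets selected
mg.urlgrab('packages/foo.rpm', '/tmp/foo.rpm', async=('repo', 1), failfunc=failed)
parallel_wait()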
def urlopen(self, url, **kwargs):
kw = dict(kwargs)
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index 3d7e99a..4c126c5 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -576,7 +576,6 @@ class TextMultiFileMeter(MultiFileMeter):
self.fo.write(out)
finally:
self._lock.release()
- self._do_update_meter(meter, now)
def _do_failure_meter(self, meter, message, now):
self._lock.acquire()
@@ -599,15 +598,6 @@ class TextMultiFileMeter(MultiFileMeter):
pass
finally:
self._lock.release()
-
- def _do_end(self, now):
- self._do_update_meter(None, now)
- self._lock.acquire()
- try:
- self.fo.write('\n')
- self.fo.flush()
- finally:
- self._lock.release()
######################################################################
# support classes and functions

@ -3,9 +3,10 @@
Summary: A high-level cross-protocol url-grabber
Name: python-urlgrabber
Version: 3.9.1
Release: 11%{?dist}
Release: 12%{?dist}
Source0: urlgrabber-%{version}.tar.gz
Patch1: urlgrabber-HEAD.patch
Patch2: multi-downloader.patch
License: LGPLv2+
Group: Development/Libraries
@ -24,6 +25,7 @@ authentication, proxies and more.
%prep
%setup -q -n urlgrabber-%{version}
%patch1 -p1
%patch2 -p1
%build
python setup.py build
@ -41,8 +43,13 @@ rm -rf $RPM_BUILD_ROOT
%doc ChangeLog LICENSE README TODO
%{python_sitelib}/urlgrabber*
%{_bindir}/urlgrabber
%attr(0755,root,root) /usr/libexec/urlgrabber-ext-down
%changelog
* Mon May 14 2012 Zdeněk Pavlas <zpavlas@redhat.com> - 3.9.1-12
- Update to latest HEAD
- Merge multi-downloader patches
* Sat Jan 14 2012 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 3.9.1-11
- Rebuilt for https://fedoraproject.org/wiki/Fedora_17_Mass_Rebuild

@ -1,3 +1,16 @@
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
@ -125,9 +138,18 @@ index 3e5f3b7..8eeaeda 100644
return (fb,lb)
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..b2770c5 100644
index e090e90..38ae1f7 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs)
progress_obj = None
a class instance that supports the following methods:
- po.start(filename, url, basename, length, text)
+ po.start(filename, url, basename, size, now, text)
# length will be None if unknown
po.update(read) # read == bytes read so far
po.end()
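A minimal progress object matching the signature above (a sketch; the real meters live in urlgrabber.progress):

import sys

class SimpleProgress:
    def start(self, filename=None, url=None, basename=None,
              size=None, now=None, text=None):
        self.size = size                    # None when the length is unknown
    def update(self, read):
        sys.stderr.write('\r%d bytes read' % read)
    def end(self, read=None):
        sys.stderr.write('\n')

# e.g. default_grabber.urlgrab(url, progress_obj=SimpleProgress())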
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
@ -150,7 +172,22 @@ index e090e90..b2770c5 100644
bandwidth = 0
@@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs)
@@ -143,8 +143,12 @@ GENERAL ARGUMENTS (kwargs)
note that proxy authentication information may be provided using
normal URL constructs:
proxies={ 'http' : 'http://user:host@foo:3128' }
- Lastly, if proxies is None, the default environment settings will
- be used.
+
+ libproxy = False
+
+ Use the libproxy module (if installed) to find proxies.
+ The libproxy code is only used if the proxies dictionary
+ does not provide any proxies.
prefix = None
@@ -198,6 +202,12 @@ GENERAL ARGUMENTS (kwargs)
control, you should probably subclass URLParser and pass it in via
the 'urlparser' option.
@ -163,11 +200,57 @@ index e090e90..b2770c5 100644
ssl_ca_cert = None
this option can be used if M2Crypto is available and will be
@@ -248,6 +254,11 @@ GENERAL ARGUMENTS (kwargs)
@@ -211,43 +221,48 @@ GENERAL ARGUMENTS (kwargs)
No-op when using the curl backend (default)
- self.ssl_verify_peer = True
+ ssl_verify_peer = True
Check the server's certificate to make sure it is valid with what our CA validates
- self.ssl_verify_host = True
+ ssl_verify_host = True
Check the server's hostname to make sure it matches the certificate DN
- self.ssl_key = None
+ ssl_key = None
Path to the key the client should use to connect/authenticate with
- self.ssl_key_type = 'PEM'
+ ssl_key_type = 'PEM'
PEM or DER - format of key
- self.ssl_cert = None
+ ssl_cert = None
Path to the ssl certificate the client should use to to authenticate with
- self.ssl_cert_type = 'PEM'
+ ssl_cert_type = 'PEM'
PEM or DER - format of certificate
- self.ssl_key_pass = None
+ ssl_key_pass = None
password to access the ssl_key
- self.size = None
+ size = None
size (in bytes) or Maximum size of the thing being downloaded.
This is mostly to keep us from exploding with an endless datastream
- self.max_header_size = 2097152
+ max_header_size = 2097152
Maximum size (in bytes) of the headers.
+ self.ip_resolve = 'whatever'
+ ip_resolve = 'whatever'
+
+ What type of name to IP resolving to use, default is to do both IPV4 and
+ IPV6.
@ -175,7 +258,7 @@ index e090e90..b2770c5 100644
RETRY RELATED ARGUMENTS
@@ -420,6 +431,7 @@ import time
@@ -420,6 +435,7 @@ import time
import string
import urllib
import urllib2
@ -183,7 +266,22 @@ index e090e90..b2770c5 100644
import mimetools
import thread
import types
@@ -439,6 +451,12 @@ try:
@@ -431,6 +447,14 @@ from httplib import HTTPException
import socket
from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+try:
+ import xattr
+ if not hasattr(xattr, 'set'):
+ xattr = None # This is a "newer" API.
+except ImportError:
+ xattr = None
+
+
########################################################################
# MODULE INITIALIZATION
########################################################################
@@ -439,6 +463,12 @@ try:
except:
__version__ = '???'
@ -196,7 +294,7 @@ index e090e90..b2770c5 100644
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -527,6 +545,22 @@ def _(st):
@@ -527,6 +557,22 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################
@ -219,7 +317,7 @@ index e090e90..b2770c5 100644
class URLGrabError(IOError):
@@ -662,6 +696,7 @@ class URLParser:
@@ -662,6 +708,7 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
@ -227,15 +325,59 @@ index e090e90..b2770c5 100644
quote = opts.quote
if opts.prefix:
@@ -800,6 +835,7 @@ class URLGrabberOptions:
@@ -768,6 +815,41 @@ class URLGrabberOptions:
else: # throttle is a float
return self.bandwidth * self.throttle
+ def find_proxy(self, url, scheme):
+ """Find the proxy to use for this URL.
+ Use the proxies dictionary first, then libproxy.
+ """
+ self.proxy = None
+ if scheme not in ('ftp', 'http', 'https'):
+ return
+
+ if self.proxies:
+ proxy = self.proxies.get(scheme)
+ if proxy is None:
+ if scheme == 'http':
+ proxy = self.proxies.get('https')
+ elif scheme == 'https':
+ proxy = self.proxies.get('http')
+ if proxy == '_none_':
+ proxy = ''
+ self.proxy = proxy
+ return
+
+ if self.libproxy:
+ global _libproxy_cache
+ if _libproxy_cache is None:
+ try:
+ import libproxy
+ _libproxy_cache = libproxy.ProxyFactory()
+ except:
+ _libproxy_cache = False
+ if _libproxy_cache:
+ for proxy in _libproxy_cache.getProxies(url):
+ if proxy.startswith('http://'):
+ if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
+ self.proxy = proxy
+ break
+
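A sketch of how the proxies dictionary is consulted by find_proxy (hosts are illustrative):

from urlgrabber.grabber import default_grabber

opts = default_grabber.opts.derive(
    proxies={'http': 'http://proxy.example.com:3128', 'ftp': '_none_'})
opts.find_proxy('http://example.com/a', 'http')
# opts.proxy is now 'http://proxy.example.com:3128'
opts.find_proxy('ftp://example.com/b', 'ftp')
# opts.proxy is now '' -- '_none_' explicitly disables the proxy for ftp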
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
@@ -800,21 +882,25 @@ class URLGrabberOptions:
self.close_connection = 0
self.range = None
self.user_agent = 'urlgrabber/%s' % __version__
+ self.ip_resolve = None
self.keepalive = 1
self.proxies = None
+ self.libproxy = False
self.reget = None
@@ -808,13 +844,15 @@ class URLGrabberOptions:
self.failure_callback = None
self.interrupt_callback = None
self.prefix = None
self.opener = None
self.cache_openers = True
@ -252,7 +394,7 @@ index e090e90..b2770c5 100644
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticityb
@@ -846,7 +884,7 @@ class URLGrabberOptions:
@@ -846,7 +932,7 @@ class URLGrabberOptions:
s = s + indent + '}'
return s
@ -261,8 +403,28 @@ index e090e90..b2770c5 100644
"""Provides easy opening of URLs with a variety of options.
All options are specified as kwargs. Options may be specified when
@@ -931,6 +969,9 @@ class URLGrabber:
@@ -912,9 +998,11 @@ class URLGrabber:
returned that supports them. The file object can be treated
like any other file object.
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
def retryfunc(opts, url):
return PyCurlFileObject(url, filename=None, opts=opts)
return self._retry(opts, retryfunc, url)
@@ -925,12 +1013,17 @@ class URLGrabber:
urlgrab returns the filename of the local file, which may be
different from the passed-in filename if copy_local == 0.
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
+ opts.find_proxy(url, scheme)
if filename is None:
filename = os.path.basename( urllib.unquote(path) )
+ if not filename:
@ -271,7 +433,19 @@ index e090e90..b2770c5 100644
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
@@ -1030,7 +1071,7 @@ class URLGrabber:
@@ -982,9 +1075,11 @@ class URLGrabber:
"I want the first N bytes" but rather 'read the whole file
into memory, but don't use too much'
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
if limit is not None:
limit = limit + 1
@@ -1030,7 +1125,7 @@ class URLGrabber:
default_grabber = URLGrabber()
@ -280,7 +454,7 @@ index e090e90..b2770c5 100644
def __init__(self, url, filename, opts):
self.fo = None
self._hdr_dump = ''
@@ -1052,9 +1093,15 @@ class PyCurlFileObject():
@@ -1052,10 +1147,11 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
@ -289,15 +463,12 @@ index e090e90..b2770c5 100644
+ self._hdr_ended = False
self._do_open()
-
+
+ def geturl(self):
+ """ Provide the geturl() method, used to be got from
+ urllib.addinfourl, via. urllib.URLopener.* """
+ return self.url
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
@@ -1085,9 +1132,14 @@ class PyCurlFileObject():
Any attribute not found in _this_ object will be searched for
@@ -1085,9 +1181,14 @@ class PyCurlFileObject():
return -1
def _hdr_retrieve(self, buf):
@ -313,7 +484,7 @@ index e090e90..b2770c5 100644
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
@@ -1104,7 +1156,17 @@ class PyCurlFileObject():
@@ -1104,7 +1205,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
@ -332,7 +503,7 @@ index e090e90..b2770c5 100644
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1113,8 +1175,10 @@ class PyCurlFileObject():
@@ -1113,8 +1224,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
@ -343,7 +514,17 @@ index e090e90..b2770c5 100644
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
@@ -1136,11 +1200,21 @@ class PyCurlFileObject():
@@ -1127,6 +1240,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts
+ # keepalives
+ if not opts.keepalive:
+ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1252,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@ -365,7 +546,7 @@ index e090e90..b2770c5 100644
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1222,11 @@ class PyCurlFileObject():
@@ -1148,9 +1274,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
@ -380,11 +561,50 @@ index e090e90..b2770c5 100644
# ssl options
if self.scheme == 'https':
@@ -1203,12 +1279,19 @@ class PyCurlFileObject():
if proxy == '_none_': proxy = ""
self.curl_obj.setopt(pycurl.PROXY, proxy)
@@ -1158,13 +1286,16 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
- self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+ if opts.ssl_verify_host: # 1 is meaningless to curl
+ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
if opts.ssl_key:
self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
if opts.ssl_key_type:
self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
if opts.ssl_cert:
self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+ # if we have a client side cert - turn off reuse b/c nss is odd
+ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
@@ -1187,28 +1318,24 @@ class PyCurlFileObject():
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
- # proxy settings
- if opts.proxies:
- for (scheme, proxy) in opts.proxies.items():
- if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
- if scheme not in ('ftp'):
- continue
- else:
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
- elif self.scheme in ('http', 'https'):
- if scheme not in ('http', 'https'):
- continue
- else:
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
-
- # FIXME username/password/auth settings
+ # proxy
+ if opts.proxy is not None:
+ self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
+ self.curl_obj.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
+
+ if opts.username and opts.password:
+ if self.scheme in ('http', 'https'):
+ self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
@ -402,7 +622,23 @@ index e090e90..b2770c5 100644
# our url
self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,12 +1311,14 @@ class PyCurlFileObject():
@@ -1219,8 +1346,14 @@ class PyCurlFileObject():
return
try:
+ e = None
self.curl_obj.perform()
- except pycurl.error, e:
+ except pycurl.error, e: pass
+ self._do_perform_exc(e)
+
+ def _do_perform_exc(self, e):
+ # handle pycurl exception 'e'
+ if e:
# XXX - break some of these out a bit more clearly
# to other URLGrabErrors from
# http://curl.haxx.se/libcurl/c/libcurl-errors.html
@@ -1228,12 +1361,14 @@ class PyCurlFileObject():
code = self.http_code
errcode = e.args[0]
@ -419,7 +655,7 @@ index e090e90..b2770c5 100644
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
@@ -1244,23 +1329,23 @@ class PyCurlFileObject():
@@ -1244,23 +1379,23 @@ class PyCurlFileObject():
raise KeyboardInterrupt
elif errcode == 28:
@ -450,7 +686,7 @@ index e090e90..b2770c5 100644
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
@@ -1272,33 +1357,93 @@ class PyCurlFileObject():
@@ -1272,33 +1407,94 @@ class PyCurlFileObject():
elif errcode == 58:
msg = _("problem with the local client certificate")
err = URLGrabError(14, msg)
@ -519,6 +755,7 @@ index e090e90..b2770c5 100644
+ 42 : _("Aborted by callback"),
+ 47 : _("Too many redirects"),
+ 51 : _("Peer certificate failed verification"),
+ 52 : _("Got nothing: SSL certificate expired?"),
+ 53 : _("SSL engine not found"),
+ 54 : _("SSL engine set failed"),
+ 55 : _("Network error send()"),
@ -545,13 +782,13 @@ index e090e90..b2770c5 100644
+ else:
+ if self._error[1]:
+ msg = self._error[1]
+ err = URLGRabError(14, msg)
+ err = URLGrabError(14, msg)
+ err.url = urllib.unquote(self.url)
+ raise err
def _do_open(self):
self.curl_obj = _curl_cache
@@ -1333,7 +1478,11 @@ class PyCurlFileObject():
@@ -1333,7 +1529,11 @@ class PyCurlFileObject():
if self.opts.range:
rt = self.opts.range
@ -564,36 +801,54 @@ index e090e90..b2770c5 100644
if rt:
header = range_tuple_to_header(rt)
@@ -1434,9 +1583,13 @@ class PyCurlFileObject():
@@ -1407,22 +1607,7 @@ class PyCurlFileObject():
_was_filename = False
if type(self.filename) in types.StringTypes and self.filename:
_was_filename = True
- self._prog_reportname = str(self.filename)
- self._prog_basename = os.path.basename(self.filename)
-
- if self.append: mode = 'ab'
- else: mode = 'wb'
-
- if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
- (self.filename, mode))
- try:
- self.fo = open(self.filename, mode)
- except IOError, e:
- err = URLGrabError(16, _(\
- 'error opening local file from %s, IOError: %s') % (self.url, e))
- err.url = self.url
- raise err
-
+ self._do_open_fo()
else:
self._prog_reportname = 'MEMORY'
self._prog_basename = 'MEMORY'
@@ -1434,27 +1619,71 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
-
- self._do_perform()
-
-
-
- if _was_filename:
- # close it up
+ try:
+ self._do_perform()
+ except URLGrabError, e:
+ self.fo.flush()
+ self.fo.close()
self.fo.flush()
self.fo.close()
- # set the time
- mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
- if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ raise e
+
if _was_filename:
@@ -1446,9 +1599,23 @@ class PyCurlFileObject():
# set the time
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
+ if _was_filename:
+ self._do_close_fo()
# re open it
- self.fo = open(self.filename, 'r')
+ try:
@ -607,7 +862,61 @@ index e090e90..b2770c5 100644
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
@@ -1532,11 +1699,14 @@ class PyCurlFileObject():
self._complete = True
+ def _do_open_fo(self):
+ self._prog_reportname = str(self.filename)
+ self._prog_basename = os.path.basename(self.filename)
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ self.fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ def _do_close_fo(self):
+ # close it up
+ self.fo.flush()
+ self.fo.close()
+
+ # Set the URL where we got it from:
+ if xattr is not None:
+ # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+ try:
+ xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+ except:
+ pass # URL too long. = IOError ... ignore everything.
+
+ # set the time
+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+ if mod_time != -1:
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
+
def _fill_buffer(self, amt=None):
"""fill the buffer to contain at least 'amt' bytes by reading
from the underlying file object. If amt is None, then it will
@@ -1526,17 +1755,20 @@ class PyCurlFileObject():
if self._prog_running:
downloaded += self._reget_length
self.opts.progress_obj.update(downloaded)
- except KeyboardInterrupt:
+ except (KeyboardInterrupt, IOError):
return -1
def _over_max_size(self, cur, max_size=None):
if not max_size:
@ -626,7 +935,7 @@ index e090e90..b2770c5 100644
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
@@ -1544,13 +1714,6 @@ class PyCurlFileObject():
@@ -1544,13 +1776,6 @@ class PyCurlFileObject():
return True
return False
@ -640,7 +949,7 @@ index e090e90..b2770c5 100644
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
@@ -1582,9 +1745,21 @@ class PyCurlFileObject():
@@ -1582,9 +1807,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
@ -658,7 +967,7 @@ index e090e90..b2770c5 100644
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
+
+
+_libproxy_cache = None
+
#####################################################################
@ -687,10 +996,199 @@ index dad410b..8731aed 100644
return parsed_mirrors
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644
index dd07c6a..3d7e99a 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):
@@ -211,6 +211,21 @@ def text_meter_total_size(size, downloaded=0):
# 4. + ( 5, total: 32)
#
+def _term_add_bar(tl, bar_max_length, pc):
+ blen = bar_max_length
+ bar = '='*int(blen * pc)
+ if (blen * pc) - int(blen * pc) >= 0.5:
+ bar += '-'
+ return tl.add(' [%-*.*s]' % (blen, blen, bar))
+
+def _term_add_end(tl, osize, size):
+ if osize is not None:
+ if size > osize: # Is ??? better? Really need something to say < vs >.
+ return tl.add(' !!! '), True
+ elif size != osize:
+ return tl.add(' ... '), True
+ return tl.add(' ' * 5), False
+
class TextMeter(BaseMeter):
def __init__(self, fo=sys.stderr):
BaseMeter.__init__(self)
@@ -259,13 +274,10 @@ class TextMeter(BaseMeter):
ui_rate = tl.add(' %5sB/s' % ave_dl)
# Make text grow a bit before we start growing the bar too
blen = 4 + tl.rest_split(8 + 8 + 4)
- bar = '='*int(blen * frac)
- if (blen * frac) - int(blen * frac) >= 0.5:
- bar += '-'
- ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
- out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
- ui_sofar_pc, ui_pc, ui_bar,
- ui_rate, ui_size, ui_time, ui_end)
+ ui_bar = _term_add_bar(tl, blen, frac)
+ out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_pc, ui_bar,
+ ui_rate,ui_size,ui_time, ui_end)
self.fo.write(out)
self.fo.flush()
@@ -284,12 +296,7 @@ class TextMeter(BaseMeter):
tl = TerminalLine(8)
ui_size = tl.add(' | %5sB' % total_size)
ui_time = tl.add(' %9s' % total_time)
- not_done = self.size is not None and amount_read != self.size
- if not_done:
- ui_end = tl.add(' ... ')
- else:
- ui_end = tl.add(' ' * 5)
-
+ ui_end, not_done = _term_add_end(tl, self.size, amount_read)
out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
ui_size, ui_time, ui_end)
self.fo.write(out)
@@ -331,12 +338,21 @@ class MultiFileHelper(BaseMeter):
def message(self, message):
self.master.message_meter(self, message)
+class _FakeLock:
+ def acquire(self):
+ pass
+ def release(self):
+ pass
+
class MultiFileMeter:
helperclass = MultiFileHelper
- def __init__(self):
+ def __init__(self, threaded=True):
self.meters = []
self.in_progress_meters = []
- self._lock = thread.allocate_lock()
+ if threaded:
+ self._lock = thread.allocate_lock()
+ else:
+ self._lock = _FakeLock()
self.update_period = 0.3 # seconds
self.numfiles = None
@@ -369,6 +385,7 @@ class MultiFileMeter:
def end(self, now=None):
if now is None: now = time.time()
+ self.re.update(self._amount_read(), now)
self._do_end(now)
def _do_end(self, now):
@@ -466,11 +483,20 @@ class MultiFileMeter:
class TextMultiFileMeter(MultiFileMeter):
- def __init__(self, fo=sys.stderr):
+ def __init__(self, fo=sys.stderr, threaded=True):
self.fo = fo
- MultiFileMeter.__init__(self)
+ MultiFileMeter.__init__(self, threaded)
# files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
+# New output, like TextMeter output...
+# update: Size, All files
+# -----------------------
+# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA
+# 8-22 1 3-4 1 6-12 1 8 3 6 1 9 1 3 1
+# end
+# ---
+# <text> | <file size> <file elapsed time>
+# 8-56 3 6 1 9 5
def _do_update_meter(self, meter, now):
self._lock.acquire()
try:
@@ -480,7 +506,7 @@ class TextMultiFileMeter(MultiFileMeter):
tf = self.numfiles or 1
pf = 100 * float(df)/tf + 0.49
dd = self.re.last_amount_read
- td = self.total_size
+ td = self.re.total
pd = 100 * (self.re.fraction_read() or 0) + 0.49
dt = self.re.elapsed_time()
rt = self.re.remaining_time()
@@ -491,9 +517,33 @@ class TextMultiFileMeter(MultiFileMeter):
ftd = format_number(td) + 'B'
fdt = format_time(dt, 1)
ftt = format_time(tt, 1)
-
- out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
- self.fo.write('\r' + out)
+
+ frac = self.re.fraction_read() or 0
+ ave_dl = format_number(self.re.average_rate())
+ text = meter.text or meter.basename
+ if tf > 1:
+ text = '(%u/%u): %s' % (df+1, tf, text)
+
+ # Include text + ui_rate in minimal
+ tl = TerminalLine(8, 8+1+8)
+
+ ui_size = tl.add(' | %5sB' % format_number(dd))
+
+ ui_time = tl.add(' %9s' % format_time(rt))
+ ui_end = tl.add(' ETA ')
+
+ ui_sofar_pc = tl.add(' %i%%' % pf,
+ full_len=len(" (100%)"))
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+
+ # Make text grow a bit before we start growing the bar too
+ blen = 4 + tl.rest_split(8 + 8 + 4)
+ ui_bar = _term_add_bar(tl, blen, frac)
+ out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_bar,
+ ui_rate, ui_size, ui_time,
+ ui_end)
+ self.fo.write(out)
self.fo.flush()
finally:
self._lock.release()
@@ -502,15 +552,28 @@ class TextMultiFileMeter(MultiFileMeter):
self._lock.acquire()
try:
format = "%-30.30s %6.6s %8.8s %9.9s"
- fn = meter.basename
+ fn = meter.text or meter.basename
size = meter.last_amount_read
fsize = format_number(size) + 'B'
et = meter.re.elapsed_time()
fet = format_time(et, 1)
- frate = format_number(size / et) + 'B/s'
-
- out = '%-79.79s' % (format % (fn, fsize, fet, frate))
- self.fo.write('\r' + out + '\n')
+ frate = format_number(et and size / et) + 'B/s'
+ df = self.finished_files
+ tf = self.numfiles or 1
+
+ total_time = format_time(et)
+ total_size = format_number(size)
+ text = meter.text or meter.basename
+ if tf > 1:
+ text = '(%u/%u): %s' % (df, tf, text)
+
+ tl = TerminalLine(8)
+ ui_size = tl.add(' | %5sB' % total_size)
+ ui_time = tl.add(' %9s' % total_time)
+ ui_end, not_done = _term_add_end(tl, meter.size, size)
+ out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
+ ui_size, ui_time, ui_end)
+ self.fo.write(out)
finally:
self._lock.release()
self._do_update_meter(meter, now)
@@ -658,6 +721,8 @@ def format_time(seconds, use_hours=0):
if seconds is None or seconds < 0:
if use_hours: return '--:--:--'
else: return '--:--'
