commit 7c74f526dd761b647d6bb6a7b7d6c285fe78bdb8 Author: Zdeněk Pavlas Date: Fri May 18 15:38:44 2012 +0200 timedhosts: fix file:// profiling. BZ 822632. - Do not profile absolute file:// URLs. - Give a hint to _TH.update() which baseurl was used so we may profile file:// mirrors, too. - Strip username and password from stored hostnames. diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py index 094be77..be85f92 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -2060,7 +2060,7 @@ class _ExternalDownloader: else: ug_err = URLGrabError(int(line[4]), line[5]) if DEBUG: DEBUG.info('failure: %s', err) - _TH.update(opts.url, int(line[2]), float(line[3]), ug_err) + _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async[0]) ret.append((opts, size, ug_err)) return ret @@ -2268,7 +2268,7 @@ class _TH: if filename and _TH.dirty is None: try: for line in open(filename): - host, speed, fail, ts = line.split() + host, speed, fail, ts = line.split(' ', 3) _TH.hosts[host] = int(speed), int(fail), int(ts) except IOError: pass _TH.dirty = False @@ -2288,9 +2288,14 @@ class _TH: _TH.dirty = False @staticmethod - def update(url, dl_size, dl_time, ug_err): + def update(url, dl_size, dl_time, ug_err, baseurl=None): _TH.load() - host = urlparse.urlsplit(url).netloc + + # Use hostname from URL. If it's a file:// URL, use baseurl. + # If no baseurl, do not update timedhosts. + host = urlparse.urlsplit(url).netloc.split('@')[-1] or baseurl + if not host: return + speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0) now = time.time() @@ -2311,9 +2316,12 @@ class _TH: _TH.dirty = True @staticmethod - def estimate(url): + def estimate(baseurl): _TH.load() - host = urlparse.urlsplit(url).netloc + + # Use just the hostname, unless it's a file:// baseurl. + host = urlparse.urlsplit(baseurl).netloc.split('@')[-1] or baseurl + default_speed = default_grabber.opts.default_speed try: speed, fail, ts = _TH.hosts[host] except KeyError: return default_speed commit fa6a17c29e9dea3ccd2d384039b305f027a5b75e Author: Zdeněk Pavlas Date: Mon May 21 09:06:13 2012 +0200 timedhosts: sanity check on dl_time - handle the dl_time <= 0 case - relative validity of calculated speed now depends on dl_time instead of dl_size. (that's where the random error is) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py index be85f92..73e14aa 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -2301,11 +2301,12 @@ class _TH: if ug_err is None: # k1: the older, the less useful - # k2: if it was <1MiB, don't trust it much + # k2: <500ms readings are less reliable # speeds vary, use 10:1 smoothing k1 = 2**((ts - now) / default_grabber.opts.half_life) - k2 = min(dl_size / 1e6, 1.0) / 10 - speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2) + k2 = min(dl_time / .500, 1.0) / 10 + if k2 > 0: + speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2) fail = 0 elif getattr(ug_err, 'code', None) == 404: fail = 0 # alive, at least