Update to latest HEAD.

Branch: epel9
Author: Zdeněk Pavlas <zpavlas@redhat.com>
Date: Thu Dec 6 2012
parent add16b8996
commit b14cebc2e8

@@ -3,7 +3,7 @@
Summary: A high-level cross-protocol url-grabber
Name: python-urlgrabber
Version: 3.9.1
-Release: 21%{?dist}
+Release: 22%{?dist}
Source0: urlgrabber-%{version}.tar.gz
Patch1: urlgrabber-HEAD.patch
@@ -44,6 +44,10 @@ rm -rf $RPM_BUILD_ROOT
%attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down
%changelog
+* Thu Dec 6 2012 Zdeněk Pavlas <zpavlas@redhat.com> - 3.9.1-22
+- Update to latest HEAD.
+- Improve URLGRABBER_DEBUG, add max_connections. BZ 853432
* Thu Nov 1 2012 Zdeněk Pavlas <zpavlas@redhat.com> - 3.9.1-21
- Update to latest HEAD.
- Get rid of "HTTP 200 OK" errors. BZ 871835.
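Note: the two 3.9.1-22 changelog items map to runtime knobs that appear later in this patch. A minimal usage sketch, assuming the "level,filespec" form that _init_default_logger reads from the environment and the default_grabber.opts.max_connections option this patch introduces:

    # Hedged sketch; option names come from this patch, exact values are assumed.
    import os
    os.environ['URLGRABBER_DEBUG'] = 'INFO,-'    # assumed "level,filespec"; '-' means stderr

    import urlgrabber.grabber as grabber         # the logger is configured at import time
    grabber.default_grabber.opts.max_connections = 5   # global cap used by the parallel code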

@@ -236,7 +236,7 @@ index 3e5f3b7..8eeaeda 100644
return (fb,lb)
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..74a692c 100644
+index e090e90..78c2e59 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
@@ -458,7 +458,26 @@ index e090e90..74a692c 100644
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
-@@ -527,6 +608,29 @@ def _(st):
@@ -504,6 +585,7 @@ def _init_default_logger(logspec=None):
else: handler = logging.FileHandler(filename)
handler.setFormatter(formatter)
DBOBJ = logging.getLogger('urlgrabber')
+ DBOBJ.propagate = False
DBOBJ.addHandler(handler)
DBOBJ.setLevel(level)
except (KeyError, ImportError, ValueError):
@@ -512,8 +594,8 @@ def _init_default_logger(logspec=None):
def _log_package_state():
if not DEBUG: return
- DEBUG.info('urlgrabber version = %s' % __version__)
- DEBUG.info('trans function "_" = %s' % _)
+ DEBUG.debug('urlgrabber version = %s' % __version__)
+ DEBUG.debug('trans function "_" = %s' % _)
_init_default_logger()
_log_package_state()
+@@ -527,6 +609,29 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################
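Note: the DBOBJ.propagate = False line added above keeps urlgrabber's log records from also reaching handlers installed on the root logger, which would otherwise print every message twice. A self-contained demonstration of the general logging behavior (plain stdlib, not urlgrabber code):

    import logging, sys

    logging.basicConfig(stream=sys.stderr)              # application's root handler
    log = logging.getLogger('urlgrabber')
    log.addHandler(logging.StreamHandler(sys.stderr))   # module's own handler
    log.setLevel(logging.INFO)

    log.info('hello')       # emitted twice: own handler, then propagated to root
    log.propagate = False
    log.info('world')       # emitted once: root handlers no longer see it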
@@ -488,7 +507,7 @@ index e090e90..74a692c 100644
class URLGrabError(IOError):
-@@ -662,6 +766,7 @@ class URLParser:
+@@ -662,6 +767,7 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
@@ -496,7 +515,7 @@ index e090e90..74a692c 100644
quote = opts.quote
if opts.prefix:
-@@ -768,6 +873,41 @@ class URLGrabberOptions:
+@@ -768,6 +874,41 @@ class URLGrabberOptions:
else: # throttle is a float
return self.bandwidth * self.throttle
@@ -538,7 +557,7 @@ index e090e90..74a692c 100644
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
-@@ -791,30 +931,37 @@ class URLGrabberOptions:
+@@ -791,30 +932,37 @@ class URLGrabberOptions:
provided here.
"""
self.progress_obj = None
@@ -577,7 +596,7 @@ index e090e90..74a692c 100644
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticity
-@@ -827,6 +974,12 @@ class URLGrabberOptions:
+@@ -827,6 +975,12 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
@@ -590,7 +609,7 @@ index e090e90..74a692c 100644
def __repr__(self):
return self.format()
-@@ -846,7 +999,18 @@ class URLGrabberOptions:
+@@ -846,7 +1000,18 @@ class URLGrabberOptions:
s = s + indent + '}'
return s
@@ -610,7 +629,7 @@ index e090e90..74a692c 100644
"""Provides easy opening of URLs with a variety of options.
All options are specified as kwargs. Options may be specified when
-@@ -872,7 +1036,6 @@ class URLGrabber:
+@@ -872,7 +1037,6 @@ class URLGrabber:
# beware of infinite loops :)
tries = tries + 1
exception = None
@@ -618,7 +637,7 @@ index e090e90..74a692c 100644
callback = None
if DEBUG: DEBUG.info('attempt %i/%s: %s',
tries, opts.retry, args[0])
-@@ -883,54 +1046,62 @@ class URLGrabber:
+@@ -883,54 +1047,62 @@ class URLGrabber:
except URLGrabError, e:
exception = e
callback = opts.failure_callback
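Note: the retry loop reworked here drives the documented retry and failure_callback options. A hedged sketch of the calling side (API per the module docstring; the URL is a placeholder):

    from urlgrabber import urlgrab
    from urlgrabber.grabber import URLGrabError

    def on_failure(cb_obj):
        # cb_obj carries the failed attempt's exception (per the callback docs)
        print 'attempt failed:', cb_obj.exception

    try:
        urlgrab('http://example.com/big.iso', 'big.iso',
                retry=3, failure_callback=on_failure)
    except URLGrabError, e:
        print 'gave up:', e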
@@ -688,7 +707,7 @@ index e090e90..74a692c 100644
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
-@@ -950,41 +1121,51 @@ class URLGrabber:
+@@ -950,41 +1122,51 @@ class URLGrabber:
elif not opts.range:
if not opts.checkfunc is None:
@@ -755,7 +774,7 @@ index e090e90..74a692c 100644
if limit is not None:
limit = limit + 1
-@@ -1000,12 +1181,8 @@ class URLGrabber:
+@@ -1000,12 +1182,8 @@ class URLGrabber:
else: s = fo.read(limit)
if not opts.checkfunc is None:
@@ -770,7 +789,7 @@ index e090e90..74a692c 100644
finally:
fo.close()
return s
-@@ -1020,6 +1197,7 @@ class URLGrabber:
+@@ -1020,6 +1198,7 @@ class URLGrabber:
return s
def _make_callback(self, callback_obj):
@@ -778,7 +797,7 @@ index e090e90..74a692c 100644
if callable(callback_obj):
return callback_obj, (), {}
else:
-@@ -1030,7 +1208,7 @@ class URLGrabber:
+@@ -1030,7 +1209,7 @@ class URLGrabber:
default_grabber = URLGrabber()
@@ -787,7 +806,7 @@ index e090e90..74a692c 100644
def __init__(self, url, filename, opts):
self.fo = None
self._hdr_dump = ''
-@@ -1052,10 +1230,13 @@ class PyCurlFileObject():
+@@ -1052,10 +1231,13 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
@@ -803,7 +822,7 @@ index e090e90..74a692c 100644
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
-@@ -1067,6 +1248,12 @@ class PyCurlFileObject():
+@@ -1067,6 +1249,12 @@ class PyCurlFileObject():
def _retrieve(self, buf):
try:
@@ -816,7 +835,7 @@ index e090e90..74a692c 100644
if not self._prog_running:
if self.opts.progress_obj:
size = self.size + self._reget_length
-@@ -1079,15 +1266,24 @@ class PyCurlFileObject():
+@@ -1079,15 +1267,24 @@ class PyCurlFileObject():
self.opts.progress_obj.update(self._amount_read)
self._amount_read += len(buf)
@@ -843,7 +862,7 @@ index e090e90..74a692c 100644
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
-@@ -1104,7 +1300,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1301,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
@@ -857,12 +876,12 @@ index e090e90..74a692c 100644
+
+ if len(self._hdr_dump) != 0 and buf == '\r\n':
+ self._hdr_ended = True
-+ if DEBUG: DEBUG.info('header ended:')
++ if DEBUG: DEBUG.debug('header ended:')
+
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
-@@ -1113,8 +1319,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1320,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
@@ -873,7 +892,7 @@ index e090e90..74a692c 100644
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
-@@ -1127,6 +1335,9 @@ class PyCurlFileObject():
+@@ -1127,6 +1336,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts
@@ -883,13 +902,14 @@ index e090e90..74a692c 100644
# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
-@@ -1136,11 +1347,21 @@ class PyCurlFileObject():
+@@ -1136,11 +1348,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
if DEBUG:
- if DEBUG:
+ if DEBUG and DEBUG.level <= 10:
self.curl_obj.setopt(pycurl.VERBOSE, True)
if opts.user_agent:
self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
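Note: the DEBUG.level <= 10 test above enables pycurl's VERBOSE chatter only when the urlgrabber logger is configured at DEBUG (numeric value 10) or lower; INFO-level logging no longer floods the log with raw libcurl output. The comparison, spelled out:

    import logging
    log = logging.getLogger('urlgrabber')
    log.setLevel(logging.INFO)
    assert not log.level <= logging.DEBUG   # INFO (20) > DEBUG (10): VERBOSE stays off
    log.setLevel(logging.DEBUG)
    assert log.level <= logging.DEBUG       # now pycurl.VERBOSE would be switched on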
@@ -905,7 +925,7 @@ index e090e90..74a692c 100644
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1369,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1370,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
@@ -920,7 +940,7 @@ index e090e90..74a692c 100644
# ssl options
if self.scheme == 'https':
-@@ -1158,13 +1381,16 @@ class PyCurlFileObject():
+@@ -1158,13 +1382,16 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -938,7 +958,7 @@ index e090e90..74a692c 100644
if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
-@@ -1187,28 +1413,26 @@ class PyCurlFileObject():
+@@ -1187,28 +1414,26 @@ class PyCurlFileObject():
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
@@ -983,7 +1003,7 @@ index e090e90..74a692c 100644
# our url
self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1228,39 +1452,36 @@ class PyCurlFileObject():
+@@ -1228,39 +1453,36 @@ class PyCurlFileObject():
code = self.http_code
errcode = e.args[0]
@@ -1032,7 +1052,7 @@ index e090e90..74a692c 100644
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
-@@ -1272,33 +1493,94 @@ class PyCurlFileObject():
+@@ -1272,33 +1494,94 @@ class PyCurlFileObject():
elif errcode == 58:
msg = _("problem with the local client certificate")
err = URLGrabError(14, msg)
@@ -1135,7 +1155,7 @@ index e090e90..74a692c 100644
def _do_open(self):
self.curl_obj = _curl_cache
-@@ -1333,7 +1615,11 @@ class PyCurlFileObject():
+@@ -1333,7 +1616,11 @@ class PyCurlFileObject():
if self.opts.range:
rt = self.opts.range
@@ -1148,7 +1168,7 @@ index e090e90..74a692c 100644
if rt:
header = range_tuple_to_header(rt)
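Note: range_tuple_to_header comes from urlgrabber.byterange (its return (fb,lb) context appears near the top of this patch) and turns a (start, end) byte tuple into an HTTP Range header; the reget branch added above synthesizes such a tuple from the bytes already on disk. A hedged example, exact return format assumed:

    from urlgrabber.byterange import range_tuple_to_header

    # (start, end) with the end offset exclusive, python-slice style
    hdr = range_tuple_to_header((512, 1024))
    # expected to look like 'bytes=512-1023'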
-@@ -1434,21 +1720,46 @@ class PyCurlFileObject():
+@@ -1434,21 +1721,46 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
@@ -1202,7 +1222,7 @@ index e090e90..74a692c 100644
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
-@@ -1526,17 +1837,20 @@ class PyCurlFileObject():
+@@ -1526,17 +1838,20 @@ class PyCurlFileObject():
if self._prog_running:
downloaded += self._reget_length
self.opts.progress_obj.update(downloaded)
@@ -1228,7 +1248,7 @@ index e090e90..74a692c 100644
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
-@@ -1544,13 +1858,6 @@ class PyCurlFileObject():
+@@ -1544,13 +1859,6 @@ class PyCurlFileObject():
return True
return False
@@ -1242,7 +1262,7 @@ index e090e90..74a692c 100644
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
-@@ -1582,9 +1889,21 @@ class PyCurlFileObject():
+@@ -1582,9 +1890,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
@@ -1265,7 +1285,7 @@ index e090e90..74a692c 100644
#####################################################################
# DEPRECATED FUNCTIONS
-@@ -1621,6 +1940,467 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+@@ -1621,6 +1941,478 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
#####################################################################
@@ -1498,17 +1518,23 @@ index e090e90..74a692c 100644
+ host_con = {} # current host connection counts
+
+ def start(opts, tries):
+ opts.tries = tries
+ try:
+ dl.start(opts)
+ except OSError, e:
+ # can't spawn downloader, give up immediately
+ opts.exception = URLGrabError(5, exception2msg(e))
+ _run_callback(opts.failfunc, opts)
+ return
+
+ key, limit = opts.async
+ host_con[key] = host_con.get(key, 0) + 1
+ opts.tries = tries
+ if opts.progress_obj:
+ if opts.multi_progress_obj:
+ opts._progress = opts.multi_progress_obj.newMeter()
+ opts._progress.start(text=opts.text)
+ else:
+ opts._progress = time.time() # no updates
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+ dl.start(opts)
+
+ def perform():
+ for opts, size, ug_err in dl.perform():
@@ -1588,6 +1614,8 @@ index e090e90..74a692c 100644
+ # check global limit
+ while len(dl.running) >= default_grabber.opts.max_connections:
+ perform()
+ if DEBUG:
+ DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)
+
+ if opts.mirror_group:
+ mg, errors, failed, removed = opts.mirror_group
@@ -1636,6 +1664,9 @@ index e090e90..74a692c 100644
+ key, limit = opts.async
+ while host_con.get(key, 0) >= limit:
+ perform()
+ if DEBUG:
+ DEBUG.info('max_connections(%s): %d/%d', key, host_con.get(key, 0), limit)
+
+ start(opts, 1)
+ except IOError, e:
+ if e.errno != 4: raise
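Note: taken together, the new download logic above throttles parallel transfers at two levels: a global ceiling (default_grabber.opts.max_connections, checked against len(dl.running)) and a per-host ceiling carried in opts.async = (key, limit). A hedged sketch of the calling convention, with the URL and limits as placeholders:

    from urlgrabber.grabber import default_grabber, parallel_wait

    default_grabber.opts.max_connections = 10    # global cap on running downloads

    # async=(key, limit): at most `limit` simultaneous connections per `key`
    # (typically the mirror host). Names per this patch; exact API assumed.
    for i in range(20):
        default_grabber.urlgrab('http://example.com/f%d' % i, 'f%d' % i,
                                async=('example.com', 4))
    parallel_wait()    # drives perform() until the queue drains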
