diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec index 68d3eff..98e7bc7 100644 --- a/python-urlgrabber.spec +++ b/python-urlgrabber.spec @@ -3,7 +3,7 @@ Summary: A high-level cross-protocol url-grabber Name: python-urlgrabber Version: 3.9.1 -Release: 30%{?dist} +Release: 31%{?dist} Source0: urlgrabber-%{version}.tar.gz Patch1: urlgrabber-HEAD.patch @@ -44,6 +44,13 @@ rm -rf $RPM_BUILD_ROOT %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down %changelog +* Thu Aug 29 2013 Zdenek Pavlas - 3.9.1-31 +- Update to latest HEAD. +- add ftp_disable_epsv option. BZ 849177 +- Spelling fixes. +- docs: throttling is per-connection, suggest max_connections=1. BZ 998263 +- More robust "Content-Length" parsing. BZ 1000841 + * Sun Aug 04 2013 Fedora Release Engineering - 3.9.1-30 - Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch index 8ae9c9a..b6a75da 100644 --- a/urlgrabber-HEAD.patch +++ b/urlgrabber-HEAD.patch @@ -11,6 +11,19 @@ index 0000000..1ffe416 +*.kdev* +*.kateproject +ipython.log* +diff --git a/README b/README +index 5fd378b..2718d3c 100644 +--- a/README ++++ b/README +@@ -19,7 +19,7 @@ You can build rpms by running + python setup.py bdist_rpm + + The rpms (both source and "binary") will be specific to the current +-distrubution/version and may not be portable to others. This is ++distribution/version and may not be portable to others. This is + because they will be built for the currently installed python. + + keepalive.py and byterange.py are generic urllib2 extension modules and diff --git a/scripts/urlgrabber b/scripts/urlgrabber index 518e512..07881b3 100644 --- a/scripts/urlgrabber @@ -189,6 +202,63 @@ index 50c6348..5fb43f9 100644 base_ftp = 'ftp://localhost/test/' # set to a proftp server only. 
we're working around a couple of +diff --git a/test/munittest.py b/test/munittest.py +index 96230b8..16a61ae 100644 +--- a/test/munittest.py ++++ b/test/munittest.py +@@ -113,7 +113,7 @@ import types + __all__ = ['TestResult', 'TestCase', 'TestSuite', 'TextTestRunner', + 'TestLoader', 'FunctionTestCase', 'main', 'defaultTestLoader'] + +-# Expose obsolete functions for backwards compatability ++# Expose obsolete functions for backwards compatibility + __all__.extend(['getTestCaseNames', 'makeSuite', 'findTestCases']) + + +@@ -410,7 +410,7 @@ class TestCase: + (default 7) and comparing to zero. + + Note that decimal places (from zero) is usually not the same +- as significant digits (measured from the most signficant digit). ++ as significant digits (measured from the most significant digit). + """ + if round(second-first, places) != 0: + raise self.failureException, \ +@@ -422,7 +422,7 @@ class TestCase: + (default 7) and comparing to zero. + + Note that decimal places (from zero) is usually not the same +- as significant digits (measured from the most signficant digit). ++ as significant digits (measured from the most significant digit). + """ + if round(second-first, places) == 0: + raise self.failureException, \ +diff --git a/test/test_byterange.py b/test/test_byterange.py +index 96f1573..fe7e105 100644 +--- a/test/test_byterange.py ++++ b/test/test_byterange.py +@@ -56,7 +56,7 @@ class RangeableFileObjectTestCase(TestCase): + """RangeableFileObject.seek() poor mans version.. + + We just delete the seek method from StringIO so we can +- excercise RangeableFileObject when the file object supplied ++ exercise RangeableFileObject when the file object supplied + doesn't support seek. 
+ """ + seek = StringIO.seek +diff --git a/test/test_grabber.py b/test/test_grabber.py +index eecdbcf..d3a7692 100644 +--- a/test/test_grabber.py ++++ b/test/test_grabber.py +@@ -86,7 +86,7 @@ class FileObjectTests(TestCase): + + class HTTPTests(TestCase): + def test_reference_file(self): +- "download refernce file via HTTP" ++ "download reference file via HTTP" + filename = tempfile.mktemp() + grabber.urlgrab(ref_http, filename) + diff --git a/test/test_mirror.py b/test/test_mirror.py index 70fe069..6fdb668 100644 --- a/test/test_mirror.py @@ -260,9 +330,18 @@ index 70fe069..6fdb668 100644 tl = TestLoader() return tl.loadTestsFromModule(sys.modules[__name__]) diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py -index 3e5f3b7..8eeaeda 100644 +index 3e5f3b7..5efa160 100644 --- a/urlgrabber/byterange.py +++ b/urlgrabber/byterange.py +@@ -40,7 +40,7 @@ class HTTPRangeHandler(urllib2.BaseHandler): + + This was extremely simple. The Range header is a HTTP feature to + begin with so all this class does is tell urllib2 that the +- "206 Partial Content" reponse from the HTTP server is what we ++ "206 Partial Content" response from the HTTP server is what we + expected. + + Example: @@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler): def http_error_416(self, req, fp, code, msg, hdrs): @@ -305,6 +384,15 @@ index 3e5f3b7..8eeaeda 100644 else: retrlen = lb - fb fp = RangeableFileObject(fp, (0,retrlen)) +@@ -442,7 +442,7 @@ def range_tuple_normalize(range_tup): + Return a tuple whose first element is guaranteed to be an int + and whose second element will be '' (meaning: the last byte) or + an int. Finally, return None if the normalized tuple == (0,'') +- as that is equivelant to retrieving the entire file. ++ as that is equivalent to retrieving the entire file. 
+ """ + if range_tup is None: return None + # handle first byte @@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup): # check if range is over the entire file if (fb,lb) == (0,''): return None @@ -314,9 +402,18 @@ index 3e5f3b7..8eeaeda 100644 return (fb,lb) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py -index e090e90..6b409e3 100644 +index e090e90..30a8bdb 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py +@@ -35,7 +35,7 @@ GENERAL ARGUMENTS (kwargs) + close_connection = 0 [0|1] + + tells URLGrabber to close the connection after a file has been +- transfered. This is ignored unless the download happens with the ++ transferred. This is ignored unless the download happens with the + http keepalive handler (keepalive=1). Otherwise, the connection + is left open for further use. The module level default for this + option is 0 (keepalive connections will not be closed). @@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs) progress_obj = None @@ -373,6 +470,15 @@ index e090e90..6b409e3 100644 bandwidth = 0 +@@ -91,7 +112,7 @@ GENERAL ARGUMENTS (kwargs) + range to retrieve. Either or both of the values may set to + None. If first_byte is None, byte offset 0 is assumed. If + last_byte is None, the last byte available is assumed. Note that +- the range specification is python-like in that (0,10) will yeild ++ the range specification is python-like in that (0,10) will yield + the first 10 bytes of the file. + + If set to None, no range will be used. 
@@ -143,8 +164,12 @@ GENERAL ARGUMENTS (kwargs) note that proxy authentication information may be provided using normal URL constructs: @@ -401,7 +507,7 @@ index e090e90..6b409e3 100644 ssl_ca_cert = None this option can be used if M2Crypto is available and will be -@@ -211,43 +242,75 @@ GENERAL ARGUMENTS (kwargs) +@@ -211,43 +242,82 @@ GENERAL ARGUMENTS (kwargs) No-op when using the curl backend (default) @@ -482,11 +588,18 @@ index e090e90..6b409e3 100644 + The speed estimate also drifts exponentially from the speed + actually measured to the default speed, with default + period of 30 days. ++ ++ ftp_disable_epsv = False ++ ++ False, True ++ ++ This option disables Extended Passive Mode (the EPSV command) ++ which does not work correctly on some buggy ftp servers. + RETRY RELATED ARGUMENTS -@@ -328,6 +391,15 @@ RETRY RELATED ARGUMENTS +@@ -328,6 +398,15 @@ RETRY RELATED ARGUMENTS but it cannot (without severe trickiness) prevent the exception from being raised. @@ -502,7 +615,19 @@ index e090e90..6b409e3 100644 interrupt_callback = None This callback is called if KeyboardInterrupt is received at any -@@ -420,6 +492,7 @@ import time +@@ -368,6 +447,11 @@ BANDWIDTH THROTTLING + is a float and bandwidth == 0, throttling is disabled. If None, the + module-level default (which can be set with set_bandwidth) is used. + ++ Note that when multiple downloads run simultaneously (multiprocessing ++ or the parallel urlgrab() feature is used) the total bandwidth might ++ exceed the throttle limit. You may want to also set max_connections=1 ++ or scale your throttle option down accordingly. ++ + THROTTLING EXAMPLES: + + Lets say you have a 100 Mbps connection. 
This is (about) 10^8 bits +@@ -420,6 +504,7 @@ import time import string import urllib import urllib2 @@ -510,7 +635,7 @@ index e090e90..6b409e3 100644 import mimetools import thread import types -@@ -428,9 +501,17 @@ import pycurl +@@ -428,9 +513,17 @@ import pycurl from ftplib import parse150 from StringIO import StringIO from httplib import HTTPException @@ -529,7 +654,7 @@ index e090e90..6b409e3 100644 ######################################################################## # MODULE INITIALIZATION ######################################################################## -@@ -439,6 +520,12 @@ try: +@@ -439,6 +532,12 @@ try: except: __version__ = '???' @@ -542,7 +667,16 @@ index e090e90..6b409e3 100644 ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. -@@ -504,6 +591,7 @@ def _init_default_logger(logspec=None): +@@ -482,7 +581,7 @@ def _init_default_logger(logspec=None): + URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout + URLGRABBER_DEBUG=INFO # log info and higher to stderr + +- This funtion is called during module initialization. It is not ++ This function is called during module initialization. It is not + intended to be called from outside. 
The only reason it is a + function at all is to keep the module-level namespace tidy and to + collect the code into a nice block.''' +@@ -504,6 +603,7 @@ def _init_default_logger(logspec=None): else: handler = logging.FileHandler(filename) handler.setFormatter(formatter) DBOBJ = logging.getLogger('urlgrabber') @@ -550,7 +684,7 @@ index e090e90..6b409e3 100644 DBOBJ.addHandler(handler) DBOBJ.setLevel(level) except (KeyError, ImportError, ValueError): -@@ -512,8 +600,8 @@ def _init_default_logger(logspec=None): +@@ -512,8 +612,8 @@ def _init_default_logger(logspec=None): def _log_package_state(): if not DEBUG: return @@ -561,7 +695,7 @@ index e090e90..6b409e3 100644 _init_default_logger() _log_package_state() -@@ -527,6 +615,29 @@ def _(st): +@@ -527,6 +627,29 @@ def _(st): # END MODULE INITIALIZATION ######################################################################## @@ -591,7 +725,7 @@ index e090e90..6b409e3 100644 class URLGrabError(IOError): -@@ -662,6 +773,7 @@ class URLParser: +@@ -662,6 +785,7 @@ class URLParser: opts.quote = 0 --> do not quote it opts.quote = None --> guess """ @@ -599,7 +733,7 @@ index e090e90..6b409e3 100644 quote = opts.quote if opts.prefix: -@@ -768,6 +880,41 @@ class URLGrabberOptions: +@@ -768,6 +892,41 @@ class URLGrabberOptions: else: # throttle is a float return self.bandwidth * self.throttle @@ -641,7 +775,7 @@ index e090e90..6b409e3 100644 def derive(self, **kwargs): """Create a derived URLGrabberOptions instance. This method creates a new instance and overrides the -@@ -791,30 +938,38 @@ class URLGrabberOptions: +@@ -791,30 +950,38 @@ class URLGrabberOptions: provided here. 
""" self.progress_obj = None @@ -681,7 +815,7 @@ index e090e90..6b409e3 100644 self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb self.ssl_context = None # no-op in pycurl self.ssl_verify_peer = True # check peer's cert for authenticityb -@@ -827,6 +982,12 @@ class URLGrabberOptions: +@@ -827,6 +994,13 @@ class URLGrabberOptions: self.size = None # if we know how big the thing we're getting is going # to be. this is ultimately a MAXIMUM size for the file self.max_header_size = 2097152 #2mb seems reasonable for maximum header size @@ -691,10 +825,11 @@ index e090e90..6b409e3 100644 + self.timedhosts = None + self.half_life = 30*24*60*60 # 30 days + self.default_speed = 1e6 # 1 MBit ++ self.ftp_disable_epsv = False def __repr__(self): return self.format() -@@ -846,7 +1007,18 @@ class URLGrabberOptions: +@@ -846,7 +1020,18 @@ class URLGrabberOptions: s = s + indent + '}' return s @@ -714,7 +849,7 @@ index e090e90..6b409e3 100644 """Provides easy opening of URLs with a variety of options. All options are specified as kwargs. 
Options may be specified when -@@ -872,7 +1044,6 @@ class URLGrabber: +@@ -872,7 +1057,6 @@ class URLGrabber: # beware of infinite loops :) tries = tries + 1 exception = None @@ -722,7 +857,7 @@ index e090e90..6b409e3 100644 callback = None if DEBUG: DEBUG.info('attempt %i/%s: %s', tries, opts.retry, args[0]) -@@ -883,54 +1054,62 @@ class URLGrabber: +@@ -883,54 +1067,62 @@ class URLGrabber: except URLGrabError, e: exception = e callback = opts.failure_callback @@ -792,7 +927,7 @@ index e090e90..6b409e3 100644 if scheme == 'file' and not opts.copy_local: # just return the name of the local file - don't make a # copy currently -@@ -950,41 +1129,51 @@ class URLGrabber: +@@ -950,41 +1142,51 @@ class URLGrabber: elif not opts.range: if not opts.checkfunc is None: @@ -859,7 +994,7 @@ index e090e90..6b409e3 100644 if limit is not None: limit = limit + 1 -@@ -1000,12 +1189,8 @@ class URLGrabber: +@@ -1000,12 +1202,8 @@ class URLGrabber: else: s = fo.read(limit) if not opts.checkfunc is None: @@ -874,7 +1009,7 @@ index e090e90..6b409e3 100644 finally: fo.close() return s -@@ -1020,6 +1205,7 @@ class URLGrabber: +@@ -1020,6 +1218,7 @@ class URLGrabber: return s def _make_callback(self, callback_obj): @@ -882,7 +1017,7 @@ index e090e90..6b409e3 100644 if callable(callback_obj): return callback_obj, (), {} else: -@@ -1030,7 +1216,7 @@ class URLGrabber: +@@ -1030,7 +1229,7 @@ class URLGrabber: default_grabber = URLGrabber() @@ -891,7 +1026,7 @@ index e090e90..6b409e3 100644 def __init__(self, url, filename, opts): self.fo = None self._hdr_dump = '' -@@ -1052,10 +1238,13 @@ class PyCurlFileObject(): +@@ -1052,10 +1251,13 @@ class PyCurlFileObject(): self._reget_length = 0 self._prog_running = False self._error = (None, None) @@ -907,7 +1042,7 @@ index e090e90..6b409e3 100644 def __getattr__(self, name): """This effectively allows us to wrap at the instance level. 
Any attribute not found in _this_ object will be searched for -@@ -1067,6 +1256,12 @@ class PyCurlFileObject(): +@@ -1067,6 +1269,12 @@ class PyCurlFileObject(): def _retrieve(self, buf): try: @@ -920,7 +1055,7 @@ index e090e90..6b409e3 100644 if not self._prog_running: if self.opts.progress_obj: size = self.size + self._reget_length -@@ -1079,32 +1274,62 @@ class PyCurlFileObject(): +@@ -1079,32 +1287,62 @@ class PyCurlFileObject(): self.opts.progress_obj.update(self._amount_read) self._amount_read += len(buf) @@ -953,7 +1088,7 @@ index e090e90..6b409e3 100644 - length = buf.split(':')[1] - self.size = int(length) + if self.scheme in ['http','https']: -+ if buf.lower().find('content-length') != -1: ++ if buf.lower().find('content-length:') != -1: + length = buf.split(':')[1] + self.size = int(length) + elif self.append and self._hdr_dump == '' and ' 200 ' in buf: @@ -990,7 +1125,7 @@ index e090e90..6b409e3 100644 return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT -@@ -1113,8 +1338,10 @@ class PyCurlFileObject(): +@@ -1113,8 +1351,10 @@ class PyCurlFileObject(): if self._parsed_hdr: return self._parsed_hdr statusend = self._hdr_dump.find('\n') @@ -1001,7 +1136,7 @@ index e090e90..6b409e3 100644 self._parsed_hdr = mimetools.Message(hdrfp) return self._parsed_hdr -@@ -1127,6 +1354,9 @@ class PyCurlFileObject(): +@@ -1127,6 +1367,9 @@ class PyCurlFileObject(): if not opts: opts = self.opts @@ -1011,7 +1146,7 @@ index e090e90..6b409e3 100644 # defaults we're always going to set self.curl_obj.setopt(pycurl.NOPROGRESS, False) -@@ -1136,11 +1366,21 @@ class PyCurlFileObject(): +@@ -1136,11 +1379,21 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) @@ -1034,7 +1169,7 @@ index e090e90..6b409e3 100644 # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) -@@ -1148,9 +1388,11 @@ class 
PyCurlFileObject(): +@@ -1148,9 +1401,11 @@ class PyCurlFileObject(): # timeouts timeout = 300 @@ -1049,7 +1184,7 @@ index e090e90..6b409e3 100644 # ssl options if self.scheme == 'https': -@@ -1158,13 +1400,16 @@ class PyCurlFileObject(): +@@ -1158,13 +1413,16 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) @@ -1067,7 +1202,7 @@ index e090e90..6b409e3 100644 if opts.ssl_cert_type: self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) if opts.ssl_key_pass: -@@ -1187,28 +1432,26 @@ class PyCurlFileObject(): +@@ -1187,29 +1445,31 @@ class PyCurlFileObject(): if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) @@ -1108,11 +1243,17 @@ index e090e90..6b409e3 100644 if opts.data: self.curl_obj.setopt(pycurl.POST, True) - self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data)) +- + self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data)) - ++ ++ # ftp ++ if opts.ftp_disable_epsv: ++ self.curl_obj.setopt(pycurl.FTP_USE_EPSV, False) ++ # our url self.curl_obj.setopt(pycurl.URL, self.url) -@@ -1228,39 +1471,26 @@ class PyCurlFileObject(): + +@@ -1228,39 +1488,26 @@ class PyCurlFileObject(): code = self.http_code errcode = e.args[0] @@ -1158,7 +1299,7 @@ index e090e90..6b409e3 100644 # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside -@@ -1269,36 +1499,70 @@ class PyCurlFileObject(): +@@ -1269,36 +1516,70 @@ class PyCurlFileObject(): # figure out what aborted the pycurl process FIXME raise KeyboardInterrupt @@ -1254,7 +1395,7 @@ index e090e90..6b409e3 100644 def _do_open(self): self.curl_obj = _curl_cache -@@ -1333,7 +1597,11 @@ class PyCurlFileObject(): +@@ -1333,7 +1614,11 @@ class 
PyCurlFileObject(): if self.opts.range: rt = self.opts.range @@ -1267,7 +1408,7 @@ index e090e90..6b409e3 100644 if rt: header = range_tuple_to_header(rt) -@@ -1434,21 +1702,46 @@ class PyCurlFileObject(): +@@ -1434,21 +1719,46 @@ class PyCurlFileObject(): #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') @@ -1321,7 +1462,7 @@ index e090e90..6b409e3 100644 else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) -@@ -1526,17 +1819,20 @@ class PyCurlFileObject(): +@@ -1526,17 +1836,20 @@ class PyCurlFileObject(): if self._prog_running: downloaded += self._reget_length self.opts.progress_obj.update(downloaded) @@ -1347,7 +1488,7 @@ index e090e90..6b409e3 100644 msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) -@@ -1544,13 +1840,6 @@ class PyCurlFileObject(): +@@ -1544,13 +1857,6 @@ class PyCurlFileObject(): return True return False @@ -1361,7 +1502,7 @@ index e090e90..6b409e3 100644 def read(self, amt=None): self._fill_buffer(amt) if amt is None: -@@ -1582,9 +1871,21 @@ class PyCurlFileObject(): +@@ -1582,9 +1888,21 @@ class PyCurlFileObject(): self.opts.progress_obj.end(self._amount_read) self.fo.close() @@ -1384,7 +1525,7 @@ index e090e90..6b409e3 100644 ##################################################################### # DEPRECATED FUNCTIONS -@@ -1621,6 +1922,489 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, +@@ -1621,6 +1939,490 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0, ##################################################################### @@ -1500,6 +1641,7 @@ index e090e90..6b409e3 100644 + 'ssl_key_pass', + 'ssl_verify_peer', 'ssl_verify_host', + 'size', 'max_header_size', 'ip_resolve', ++ 'ftp_disable_epsv' + ) + + def start(self, opts): @@ -1875,7 +2017,7 @@ index e090e90..6b409e3 100644 def _main_test(): try: url, filename = sys.argv[1:3] diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py -index dad410b..5d3aa34 100644 +index 
dad410b..988a309 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -76,6 +76,10 @@ CUSTOMIZATION @@ -1926,7 +2068,12 @@ index dad410b..5d3aa34 100644 This dict can be set at instantiation time, mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) -@@ -184,6 +195,7 @@ class MirrorGroup: +@@ -180,10 +191,11 @@ class MirrorGroup: + etc). Otherwise, it is assumed to be the callable object + itself. The callback will be passed a grabber.CallbackObject + instance along with args and kwargs (if present). The following +- attributes are defined withing the instance: ++ attributes are defined within the instance: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried >