diff --git a/fix-stringio.patch b/fix-stringio.patch deleted file mode 100644 index cbb1e96..0000000 --- a/fix-stringio.patch +++ /dev/null @@ -1,31 +0,0 @@ -From 62f94b534289b7ab0bda879962bf6efb124a9930 Mon Sep 17 00:00:00 2001 -From: Tomas Radej -Date: Tue, 2 Sep 2014 12:52:50 +0200 -Subject: [PATCH] StringIO patch - ---- - urlgrabber/grabber.py | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py -index 35c091e..26335d1 100644 ---- a/urlgrabber/grabber.py -+++ b/urlgrabber/grabber.py -@@ -1298,12 +1298,12 @@ class PyCurlFileObject(object): - start = self._range[0] - pos - stop = self._range[1] - pos - if start < len(buf) and stop > 0: -- if not six.PY3 or isinstance(self.fo, StringIO): -+ if not six.PY3 and isinstance(self.fo, StringIO): - self.fo.write(buf[max(start, 0):stop].decode('utf-8')) - else: - self.fo.write(buf[max(start, 0):stop]) - else: -- if not six.PY3 or isinstance(self.fo, StringIO): -+ if not six.PY3 and isinstance(self.fo, StringIO): - self.fo.write(buf.decode('utf-8')) - else: - self.fo.write(buf) --- -1.9.3 - diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec index aa07e19..353327a 100644 --- a/python-urlgrabber.spec +++ b/python-urlgrabber.spec @@ -3,13 +3,13 @@ Summary: A high-level cross-protocol url-grabber Name: python-urlgrabber Version: 3.10.1 -Release: 4%{?dist} +Release: 5%{?dist} Source0: http://urlgrabber.baseurl.org/download/urlgrabber-%{version}.tar.gz Patch1: urlgrabber-HEAD.patch Patch2: BZ-1051554-speed-on-404-mirror.patch Patch3: port-to-python3.patch Patch4: port-tests-to-python3.patch -Patch5: fix-stringio.patch +Patch5: urlgrabber-stringio.patch License: LGPLv2+ Group: Development/Libraries @@ -54,6 +54,9 @@ rm -rf $RPM_BUILD_ROOT %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down %changelog +* Tue Sep 09 2014 Tomas Radej - 3.10.1-5 +- Really fixed UTF behaviour + * Tue Sep 02 2014 Tomas Radej - 3.10.1-4 - Fixed UTF behaviour (bz #1135632) diff --git a/urlgrabber-stringio.patch b/urlgrabber-stringio.patch new file mode 100644 index 0000000..0560a49 --- /dev/null +++ b/urlgrabber-stringio.patch @@ -0,0 +1,433 @@ +diff --git a/test/test_grabber.py b/test/test_grabber.py +index bd36d66..bd54329 100644 +--- a/test/test_grabber.py ++++ b/test/test_grabber.py +@@ -42,7 +42,7 @@ from urlgrabber.progress import text_progress_meter + class FileObjectTests(TestCase): + + def setUp(self): +- self.filename = tempfile.mktemp() ++ _, self.filename = tempfile.mkstemp() + fo = open(self.filename, 'wb') + fo.write(reference_data.encode('utf-8')) + fo.close() +@@ -61,35 +61,36 @@ class FileObjectTests(TestCase): + def test_readall(self): + "PYCurlFileObject .read() method" + s = self.wrapper.read() +- self.fo_output.write(s) ++ self.fo_output.write(unicode(s) if not six.PY3 else s) + self.assert_(reference_data == self.fo_output.getvalue()) + + def test_readline(self): + "PyCurlFileObject .readline() method" + while 1: + s = self.wrapper.readline() +- self.fo_output.write(s) ++ self.fo_output.write(unicode(s) if not six.PY3 else s) + if not s: break + self.assert_(reference_data == self.fo_output.getvalue()) + + def test_readlines(self): + "PyCurlFileObject .readlines() method" + li = self.wrapper.readlines() +- self.fo_output.write(''.join(li)) ++ out = ''.join(li) ++ self.fo_output.write(unicode(out) if not six.PY3 else out) + self.assert_(reference_data == self.fo_output.getvalue()) + + def test_smallread(self): + "PyCurlFileObject .read(N) with small N" + while 1: + s = self.wrapper.read(23) +- self.fo_output.write(s) ++ self.fo_output.write(unicode(s) if not six.PY3 else s) + if not s: break + self.assert_(reference_data == self.fo_output.getvalue()) + + class HTTPTests(TestCase): + def test_reference_file(self): + "download reference file via HTTP" +- filename = tempfile.mktemp() ++ _, filename = tempfile.mkstemp() + grabber.urlgrab(ref_http, filename) + + fo = open(filename, 'rb' if not six.PY3 else 'r') +@@ -123,7 +124,7 @@ class URLGrabberModuleTestCase(TestCase): + + def test_urlgrab(self): + "module-level urlgrab() function" +- outfile = tempfile.mktemp() ++ _, outfile = tempfile.mkstemp() + filename = urlgrabber.urlgrab('http://www.python.org', + filename=outfile) + os.unlink(outfile) +@@ -367,7 +368,7 @@ class CheckfuncTestCase(TestCase): + def setUp(self): + cf = (self._checkfunc, ('foo',), {'bar': 'baz'}) + self.g = grabber.URLGrabber(checkfunc=cf) +- self.filename = tempfile.mktemp() ++ _, self.filename = tempfile.mkstemp() + self.data = short_reference_data + + def tearDown(self): +@@ -440,7 +441,7 @@ class RegetTestBase: + def setUp(self): + self.ref = short_reference_data + self.grabber = grabber.URLGrabber(reget='check_timestamp') +- self.filename = tempfile.mktemp() ++ _, self.filename = tempfile.mkstemp() + self.hl = len(self.ref) / 2 + self.url = 'OVERRIDE THIS' + +@@ -522,7 +523,7 @@ class HTTPRegetTests(FTPRegetTests): + class FileRegetTests(HTTPRegetTests): + def setUp(self): + self.ref = short_reference_data +- tmp = tempfile.mktemp() ++ _, tmp = tempfile.mkstemp() + tmpfo = open(tmp, 'wb' if not six.PY3 else 'w') + tmpfo.write(self.ref) + tmpfo.close() +@@ -534,7 +535,7 @@ class FileRegetTests(HTTPRegetTests): + + self.grabber = grabber.URLGrabber(reget='check_timestamp', + copy_local=1) +- self.filename = tempfile.mktemp() ++ _, self.filename = tempfile.mkstemp() + self.hl = len(self.ref) / 2 + + def tearDown(self): +diff --git a/test/test_mirror.py b/test/test_mirror.py +index c46cd33..b923dd1 100644 +--- a/test/test_mirror.py ++++ b/test/test_mirror.py +@@ -50,7 +50,7 @@ class BasicTests(TestCase): + + def test_urlgrab(self): + """MirrorGroup.urlgrab""" +- filename = tempfile.mktemp() ++ _, filename = tempfile.mkstemp() + url = 'short_reference' + self.mg.urlgrab(url, filename) + +@@ -84,7 +84,7 @@ class SubclassTests(TestCase): + def fetchwith(self, mgclass): + self.mg = mgclass(self.g, self.fullmirrors) + +- filename = tempfile.mktemp() ++ _, filename = tempfile.mkstemp() + url = 'short_reference' + self.mg.urlgrab(url, filename) + +@@ -137,7 +137,7 @@ class BadMirrorTests(TestCase): + + def test_simple_grab(self): + """test that a bad mirror raises URLGrabError""" +- filename = tempfile.mktemp() ++ _, filename = tempfile.mkstemp() + url = 'reference' + self.assertRaises(URLGrabError, self.mg.urlgrab, url, filename) + +@@ -150,7 +150,7 @@ class FailoverTests(TestCase): + + def test_simple_grab(self): + """test that a the MG fails over past a bad mirror""" +- filename = tempfile.mktemp() ++ _, filename = tempfile.mkstemp() + url = 'reference' + elist = [] + def cb(e, elist=elist): elist.append(e) +diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py +index ffaed8e..95287fc 100644 +--- a/urlgrabber/byterange.py ++++ b/urlgrabber/byterange.py +@@ -27,7 +27,7 @@ from six.moves import urllib + + DEBUG = None + +-from io import StringIO ++from io import BytesIO + + class RangeError(IOError): + """Error raised when an unsatisfiable range is requested.""" +@@ -238,8 +238,8 @@ class FileRangeHandler(urllib.request.FileHandler): + raise RangeError(9, 'Requested Range Not Satisfiable') + size = (lb - fb) + fo = RangeableFileObject(fo, (fb,lb)) +- headers = email.message.Message(StringIO( +- 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % ++ headers = email.message.Message(BytesIO( ++ b'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified))) + return urllib.addinfourl(fo, headers, 'file:'+file) + +@@ -323,13 +323,13 @@ class FTPRangeHandler(urllib.request.FTPHandler): + fp = RangeableFileObject(fp, (0,retrlen)) + # -- range support modifications end here + +- headers = "" ++ headers = b"" + mtype = mimetypes.guess_type(req.get_full_url())[0] + if mtype: +- headers += "Content-Type: %s\n" % mtype ++ headers += b"Content-Type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: +- headers += "Content-Length: %d\n" % retrlen +- sf = StringIO(headers) ++ headers += b"Content-Length: %d\n" % retrlen ++ sf = BytesIO(headers) + headers = email.message.Message(sf) + return addinfourl(fp, headers, req.get_full_url()) + except ftplib.all_errors as msg: +diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py +index 35c091e..69cd113 100644 +--- a/urlgrabber/grabber.py ++++ b/urlgrabber/grabber.py +@@ -516,7 +516,7 @@ from six.moves import urllib + from six.moves.http_client import responses, HTTPException + from urlgrabber.byterange import range_tuple_normalize, range_tuple_to_header, RangeError + +-from io import StringIO ++from io import BytesIO + + try: + import xattr +@@ -1235,7 +1235,7 @@ default_grabber = URLGrabber() + class PyCurlFileObject(object): + def __init__(self, url, filename, opts): + self.fo = None +- self._hdr_dump = '' ++ self._hdr_dump = b'' + self._parsed_hdr = None + self.url = url + self.scheme = urllib.parse.urlsplit(self.url)[0] +@@ -1246,7 +1246,7 @@ class PyCurlFileObject(object): + if self.opts.reget == 'check_timestamp': + raise NotImplementedError("check_timestamp regets are not implemented in this ver of urlgrabber. Please report this.") + self._complete = False +- self._rbuf = '' ++ self._rbuf = b'' + self._rbufsize = 1024*8 + self._ttime = time.time() + self._tsize = 0 +@@ -1298,15 +1298,9 @@ class PyCurlFileObject(object): + start = self._range[0] - pos + stop = self._range[1] - pos + if start < len(buf) and stop > 0: +- if not six.PY3 or isinstance(self.fo, StringIO): +- self.fo.write(buf[max(start, 0):stop].decode('utf-8')) +- else: +- self.fo.write(buf[max(start, 0):stop]) ++ self.fo.write(buf[max(start, 0):stop]) + else: +- if not six.PY3 or isinstance(self.fo, StringIO): +- self.fo.write(buf.decode('utf-8')) +- else: +- self.fo.write(buf) ++ self.fo.write(buf) + except IOError as e: + self._cb_error = URLGrabError(16, exception2msg(e)) + return -1 +@@ -1316,7 +1310,7 @@ class PyCurlFileObject(object): + + def _hdr_retrieve(self, buf): + if self._hdr_ended: +- self._hdr_dump = '' ++ self._hdr_dump = b'' + self.size = 0 + self._hdr_ended = False + +@@ -1328,12 +1322,12 @@ class PyCurlFileObject(object): + # but we can't do that w/o making it do 2 connects, which sucks + # so we cheat and stuff it in here in the hdr_retrieve + if self.scheme in ['http','https']: +- content_length_str = 'content-length:' if not six.PY3 else b'content-length:' ++ content_length_str = b'content-length:' + if buf.lower().find(content_length_str) != -1: +- split_str = ':' if not six.PY3 else b':' ++ split_str = b':' + length = buf.split(split_str)[1] + self.size = int(length) +- elif (self.append or self.opts.range) and self._hdr_dump == '' and b' 200 ' in buf: ++ elif (self.append or self.opts.range) and self._hdr_dump == b'' and b' 200 ' in buf: + # reget was attempted but server sends it all + # undo what we did in _build_range() + self.append = False +@@ -1349,20 +1343,19 @@ class PyCurlFileObject(object): + if len(s) >= 14: + s = None # ignore MDTM responses + elif buf.startswith(b'150 '): +- s = parse150(buf if not six.PY3 else buf.decode('utf-8')) ++ s = parse150(buf.decode('utf-8')) # Necessary in Python 3, doesn't hurt in Python 2 + if s: + self.size = int(s) + +- location_str = 'location' if not six.PY3 else b'location' ++ location_str = b'location' + if buf.lower().find(location_str) != -1: +- buf_compat = buf if not six.PY3 else buf.decode('utf-8') +- location = ':'.join(buf_compat.split(':')[1:]) ++ location = b':'.join(buf.split(b':')[1:]) + location = location.strip() + self.scheme = urllib.parse.urlsplit(location)[0] + self.url = location + +- self._hdr_dump += buf if not six.PY3 else buf.decode('utf-8') +- end_str = '\r\n' if not six.PY3 else b'\r\n' ++ self._hdr_dump += buf ++ end_str = b'\r\n' + if len(self._hdr_dump) != 0 and buf == end_str: + self._hdr_ended = True + if DEBUG: DEBUG.debug('header ended:') +@@ -1374,12 +1367,12 @@ class PyCurlFileObject(object): + def _return_hdr_obj(self): + if self._parsed_hdr: + return self._parsed_hdr +- statusend = self._hdr_dump.find('\n') ++ statusend = self._hdr_dump.find(b'\n') + statusend += 1 # ridiculous as it may seem. +- hdrfp = StringIO() ++ hdrfp = BytesIO() + hdrfp.write(self._hdr_dump[statusend:]) + hdrfp.seek(0) +- self._parsed_hdr = Message(hdrfp) ++ self._parsed_hdr = Message(hdrfp) + return self._parsed_hdr + + hdr = property(_return_hdr_obj) +@@ -1709,7 +1702,7 @@ class PyCurlFileObject(object): + return (fo, hdr) + + def _do_grab(self): +- """dump the file to a filename or StringIO buffer""" ++ """dump the file to a filename or BytesIO buffer""" + + if self._complete: + return +@@ -1739,7 +1732,7 @@ class PyCurlFileObject(object): + self._prog_basename = 'MEMORY' + + +- self.fo = StringIO() ++ self.fo = BytesIO() + # if this is to be a tempfile instead.... + # it just makes crap in the tempdir + #fh, self._temp_name = mkstemp() +@@ -1778,7 +1771,7 @@ class PyCurlFileObject(object): + raise err + # re open it + try: +- self.fo = open(self.filename, 'r') ++ self.fo = open(self.filename, 'rb') + except IOError as e: + err = URLGrabError(16, _(\ + 'error opening file from %s, IOError: %s') % (self.url, e)) +@@ -1853,7 +1846,7 @@ class PyCurlFileObject(object): + #if self.opts.progress_obj: + # self.opts.progress_obj.update(self._amount_read) + +- self._rbuf = ''.join(buf) ++ self._rbuf = b''.join(buf) + return + + def _progress_update(self, download_total, downloaded, upload_total, uploaded): +@@ -1888,28 +1881,40 @@ class PyCurlFileObject(object): + def read(self, amt=None): + self._fill_buffer(amt) + if amt is None: +- s, self._rbuf = self._rbuf, '' ++ s, self._rbuf = self._rbuf, b'' + else: + s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:] +- return s ++ return s if not six.PY3 else s.decode('utf-8') + + def readline(self, limit=-1): + if not self._complete: self._do_grab() +- return self.fo.readline() ++ return self.fo.readline() if not six.PY3 else self.fo.readline().decode('utf-8') + +- i = self._rbuf.find('\n') ++ i = self._rbuf.find(b'\n') + while i < 0 and not (0 < limit <= len(self._rbuf)): + L = len(self._rbuf) + self._fill_buffer(L + self._rbufsize) + if not len(self._rbuf) > L: break +- i = self._rbuf.find('\n', L) ++ i = self._rbuf.find(b'\n', L) + + if i < 0: i = len(self._rbuf) + else: i = i+1 + if 0 <= limit < len(self._rbuf): i = limit + + s, self._rbuf = self._rbuf[:i], self._rbuf[i:] +- return s ++ return s if not six.PY3 else s.decode('utf-8') ++ ++ # This was added here because we need to wrap self.fo readlines (which will ++ # always return bytes) in correct decoding ++ def readlines(self, *args, **kwargs): ++ if not six.PY3: ++ return [line for line in self.fo.readlines(*args, **kwargs)] ++ else: ++ return self._py3readlines(*args, **kwargs) ++ ++ def _py3readlines(self, *args, **kwargs): ++ for line in self.fo.readlines(*args, **kwargs): ++ yield line.decode('utf-8') + + def close(self): + if self._prog_running: +@@ -2055,11 +2060,9 @@ def _readlines(fd): + buf = os.read(fd, 4096) + if not buf: return None + # whole lines only, no buffering +- buf_compat = buf if not six.PY3 else buf.decode('utf-8') +- while buf_compat[-1] != '\n': ++ while buf.decode('utf-8')[-1] != '\n': + buf += os.read(fd, 4096) +- buf_compat = buf if not six.PY3 else buf.decode('utf-8') +- return buf_compat[:-1].split('\n') ++ return buf.decode('utf-8')[:-1].split('\n') + + import subprocess + +@@ -2403,7 +2406,7 @@ class _TH: + if filename and _TH.dirty is None: + try: + now = int(time.time()) +- for line in open(filename): ++ for line in open(filename, 'rb'): + host, speed, fail, ts = line.rsplit(' ', 3) + _TH.hosts[host] = int(speed), int(fail), min(int(ts), now) + except IOError: pass +@@ -2415,7 +2418,7 @@ class _TH: + if filename and _TH.dirty is True: + tmp = '%s.%d' % (filename, os.getpid()) + try: +- f = open(tmp, 'w') ++ f = open(tmp, 'wb') + for host in _TH.hosts: + f.write(host + ' %d %d %d\n' % _TH.hosts[host]) + f.close() +@@ -2536,7 +2539,7 @@ def _file_object_test(filename=None): + if filename is None: + filename = __file__ + print('using file "%s" for comparisons' % filename) +- fo = open(filename) ++ fo = open(filename, 'rb') + s_input = fo.read() + fo.close() + +@@ -2544,8 +2547,8 @@ def _file_object_test(filename=None): + _test_file_object_readall, + _test_file_object_readline, + _test_file_object_readlines]: +- fo_input = StringIO(s_input) +- fo_output = StringIO() ++ fo_input = BytesIO(s_input) ++ fo_output = BytesIO() + wrapper = PyCurlFileObject(fo_input, None, 0) + print('testing %-30s ' % testfunc.__name__, testfunc(wrapper, fo_output)) + s_output = fo_output.getvalue()