diff --git a/test/test_grabber.py b/test/test_grabber.py
index bd36d66..bd54329 100644
--- a/test/test_grabber.py
+++ b/test/test_grabber.py
@@ -42,7 +42,7 @@ from urlgrabber.progress import text_progress_meter
 class FileObjectTests(TestCase):
 
     def setUp(self):
-        self.filename = tempfile.mktemp()
+        _, self.filename = tempfile.mkstemp()
         fo = open(self.filename, 'wb')
         fo.write(reference_data.encode('utf-8'))
         fo.close()
@@ -61,35 +61,36 @@ class FileObjectTests(TestCase):
     def test_readall(self):
         "PYCurlFileObject .read() method"
         s = self.wrapper.read()
-        self.fo_output.write(s)
+        self.fo_output.write(unicode(s) if not six.PY3 else s)
         self.assert_(reference_data == self.fo_output.getvalue())
 
     def test_readline(self):
         "PyCurlFileObject .readline() method"
         while 1:
             s = self.wrapper.readline()
-            self.fo_output.write(s)
+            self.fo_output.write(unicode(s) if not six.PY3 else s)
             if not s: break
         self.assert_(reference_data == self.fo_output.getvalue())
 
     def test_readlines(self):
         "PyCurlFileObject .readlines() method"
         li = self.wrapper.readlines()
-        self.fo_output.write(''.join(li))
+        out = ''.join(li)
+        self.fo_output.write(unicode(out) if not six.PY3 else out)
         self.assert_(reference_data == self.fo_output.getvalue())
 
     def test_smallread(self):
         "PyCurlFileObject .read(N) with small N"
         while 1:
             s = self.wrapper.read(23)
-            self.fo_output.write(s)
+            self.fo_output.write(unicode(s) if not six.PY3 else s)
             if not s: break
         self.assert_(reference_data == self.fo_output.getvalue())
 
 class HTTPTests(TestCase):
     def test_reference_file(self):
         "download reference file via HTTP"
-        filename = tempfile.mktemp()
+        _, filename = tempfile.mkstemp()
         grabber.urlgrab(ref_http, filename)
 
         fo = open(filename, 'rb' if not six.PY3 else 'r')
@@ -123,7 +124,7 @@ class URLGrabberModuleTestCase(TestCase):
 
     def test_urlgrab(self):
         "module-level urlgrab() function"
-        outfile = tempfile.mktemp()
+        _, outfile = tempfile.mkstemp()
         filename = urlgrabber.urlgrab('http://www.python.org',
                                       filename=outfile)
         os.unlink(outfile)
@@ -367,7 +368,7 @@ class CheckfuncTestCase(TestCase):
     def setUp(self):
         cf = (self._checkfunc, ('foo',), {'bar': 'baz'})
         self.g = grabber.URLGrabber(checkfunc=cf)
-        self.filename = tempfile.mktemp()
+        _, self.filename = tempfile.mkstemp()
         self.data = short_reference_data
 
     def tearDown(self):
@@ -440,7 +441,7 @@ class RegetTestBase:
     def setUp(self):
         self.ref = short_reference_data
         self.grabber = grabber.URLGrabber(reget='check_timestamp')
-        self.filename = tempfile.mktemp()
+        _, self.filename = tempfile.mkstemp()
         self.hl = len(self.ref) / 2
 
         self.url = 'OVERRIDE THIS'
@@ -522,7 +523,7 @@ class HTTPRegetTests(FTPRegetTests):
 class FileRegetTests(HTTPRegetTests):
     def setUp(self):
         self.ref = short_reference_data
-        tmp = tempfile.mktemp()
+        _, tmp = tempfile.mkstemp()
         tmpfo = open(tmp, 'wb' if not six.PY3 else 'w')
         tmpfo.write(self.ref)
         tmpfo.close()
@@ -534,7 +535,7 @@ class FileRegetTests(HTTPRegetTests):
 
         self.grabber = grabber.URLGrabber(reget='check_timestamp',
                                           copy_local=1)
-        self.filename = tempfile.mktemp()
+        _, self.filename = tempfile.mkstemp()
         self.hl = len(self.ref) / 2
 
     def tearDown(self):
diff --git a/test/test_mirror.py b/test/test_mirror.py
index c46cd33..b923dd1 100644
--- a/test/test_mirror.py
+++ b/test/test_mirror.py
@@ -50,7 +50,7 @@ class BasicTests(TestCase):
 
     def test_urlgrab(self):
         """MirrorGroup.urlgrab"""
-        filename = tempfile.mktemp()
+        _, filename = tempfile.mkstemp()
         url = 'short_reference'
         self.mg.urlgrab(url, filename)
 
@@ -84,7 +84,7 @@ class SubclassTests(TestCase):
 
     def fetchwith(self, mgclass):
         self.mg = mgclass(self.g, self.fullmirrors)
-        filename = tempfile.mktemp()
+        _, filename = tempfile.mkstemp()
         url = 'short_reference'
         self.mg.urlgrab(url, filename)
 
@@ -137,7 +137,7 @@ class BadMirrorTests(TestCase):
 
     def test_simple_grab(self):
         """test that a bad mirror raises URLGrabError"""
-        filename = tempfile.mktemp()
+        _, filename = tempfile.mkstemp()
         url = 'reference'
         self.assertRaises(URLGrabError, self.mg.urlgrab, url, filename)
 
@@ -150,7 +150,7 @@ class FailoverTests(TestCase):
 
     def test_simple_grab(self):
         """test that a the MG fails over past a bad mirror"""
-        filename = tempfile.mktemp()
+        _, filename = tempfile.mkstemp()
         url = 'reference'
         elist = []
         def cb(e, elist=elist): elist.append(e)
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index ffaed8e..95287fc 100644
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -27,7 +27,7 @@ from six.moves import urllib
 
 DEBUG = None
 
-from io import StringIO
+from io import BytesIO
 
 class RangeError(IOError):
     """Error raised when an unsatisfiable range is requested."""
@@ -238,8 +238,8 @@
                 raise RangeError(9, 'Requested Range Not Satisfiable')
             size = (lb - fb)
             fo = RangeableFileObject(fo, (fb,lb))
-        headers = email.message.Message(StringIO(
-            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
+        headers = email.message.Message(BytesIO(
+            b'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
             (mtype or 'text/plain', size, modified)))
         return urllib.addinfourl(fo, headers, 'file:'+file)
 
@@ -323,13 +323,13 @@ class FTPRangeHandler(urllib.request.FTPHandler):
                 fp = RangeableFileObject(fp, (0,retrlen))
             # -- range support modifications end here
 
-            headers = ""
+            headers = b""
             mtype = mimetypes.guess_type(req.get_full_url())[0]
             if mtype:
-                headers += "Content-Type: %s\n" % mtype
+                headers += b"Content-Type: %s\n" % mtype
             if retrlen is not None and retrlen >= 0:
-                headers += "Content-Length: %d\n" % retrlen
-            sf = StringIO(headers)
+                headers += b"Content-Length: %d\n" % retrlen
+            sf = BytesIO(headers)
             headers = email.message.Message(sf)
             return addinfourl(fp, headers, req.get_full_url())
         except ftplib.all_errors as msg:
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index 35c091e..69cd113 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -516,7 +516,7 @@ from six.moves import urllib
 from six.moves.http_client import responses, HTTPException
 from urlgrabber.byterange import range_tuple_normalize, range_tuple_to_header, RangeError
 
-from io import StringIO
+from io import BytesIO
 
 try:
     import xattr
@@ -1235,7 +1235,7 @@ default_grabber = URLGrabber()
 class PyCurlFileObject(object):
     def __init__(self, url, filename, opts):
         self.fo = None
-        self._hdr_dump = ''
+        self._hdr_dump = b''
         self._parsed_hdr = None
         self.url = url
         self.scheme = urllib.parse.urlsplit(self.url)[0]
@@ -1246,7 +1246,7 @@
         if self.opts.reget == 'check_timestamp':
             raise NotImplementedError("check_timestamp regets are not implemented in this ver of urlgrabber. Please report this.")
         self._complete = False
-        self._rbuf = ''
+        self._rbuf = b''
         self._rbufsize = 1024*8
         self._ttime = time.time()
         self._tsize = 0
@@ -1298,15 +1298,9 @@
                 start = self._range[0] - pos
                 stop = self._range[1] - pos
                 if start < len(buf) and stop > 0:
-                    if not six.PY3 or isinstance(self.fo, StringIO):
-                        self.fo.write(buf[max(start, 0):stop].decode('utf-8'))
-                    else:
-                        self.fo.write(buf[max(start, 0):stop])
+                    self.fo.write(buf[max(start, 0):stop])
             else:
-                if not six.PY3 or isinstance(self.fo, StringIO):
-                    self.fo.write(buf.decode('utf-8'))
-                else:
-                    self.fo.write(buf)
+                self.fo.write(buf)
         except IOError as e:
             self._cb_error = URLGrabError(16, exception2msg(e))
             return -1
@@ -1316,7 +1310,7 @@
 
     def _hdr_retrieve(self, buf):
         if self._hdr_ended:
-            self._hdr_dump = ''
+            self._hdr_dump = b''
             self.size = 0
             self._hdr_ended = False
 
@@ -1328,12 +1322,12 @@
         # but we can't do that w/o making it do 2 connects, which sucks
         # so we cheat and stuff it in here in the hdr_retrieve
         if self.scheme in ['http','https']:
-            content_length_str = 'content-length:' if not six.PY3 else b'content-length:'
+            content_length_str = b'content-length:'
             if buf.lower().find(content_length_str) != -1:
-                split_str = ':' if not six.PY3 else b':'
+                split_str = b':'
                 length = buf.split(split_str)[1]
                 self.size = int(length)
-            elif (self.append or self.opts.range) and self._hdr_dump == '' and b' 200 ' in buf:
+            elif (self.append or self.opts.range) and self._hdr_dump == b'' and b' 200 ' in buf:
                 # reget was attempted but server sends it all
                 # undo what we did in _build_range()
                 self.append = False
@@ -1349,20 +1343,19 @@
                 if len(s) >= 14:
                     s = None # ignore MDTM responses
             elif buf.startswith(b'150 '):
-                s = parse150(buf if not six.PY3 else buf.decode('utf-8'))
+                s = parse150(buf.decode('utf-8')) # Necessary in Python 3, doesn't hurt in Python 2
             if s:
                 self.size = int(s)
 
-        location_str = 'location' if not six.PY3 else b'location'
+        location_str = b'location'
         if buf.lower().find(location_str) != -1:
-            buf_compat = buf if not six.PY3 else buf.decode('utf-8')
-            location = ':'.join(buf_compat.split(':')[1:])
+            location = b':'.join(buf.split(b':')[1:])
             location = location.strip()
             self.scheme = urllib.parse.urlsplit(location)[0]
             self.url = location
 
-        self._hdr_dump += buf if not six.PY3 else buf.decode('utf-8')
-        end_str = '\r\n' if not six.PY3 else b'\r\n'
+        self._hdr_dump += buf
+        end_str = b'\r\n'
         if len(self._hdr_dump) != 0 and buf == end_str:
             self._hdr_ended = True
             if DEBUG: DEBUG.debug('header ended:')
@@ -1374,12 +1367,12 @@
     def _return_hdr_obj(self):
         if self._parsed_hdr:
             return self._parsed_hdr
-        statusend = self._hdr_dump.find('\n')
+        statusend = self._hdr_dump.find(b'\n')
         statusend += 1 # ridiculous as it may seem.
-        hdrfp = StringIO()
+        hdrfp = BytesIO()
         hdrfp.write(self._hdr_dump[statusend:])
         hdrfp.seek(0)
-        self._parsed_hdr = Message(hdrfp)
+        self._parsed_hdr = Message(hdrfp)
         return self._parsed_hdr
 
     hdr = property(_return_hdr_obj)
@@ -1709,7 +1702,7 @@
         return (fo, hdr)
 
     def _do_grab(self):
-        """dump the file to a filename or StringIO buffer"""
+        """dump the file to a filename or BytesIO buffer"""
 
         if self._complete:
             return
@@ -1739,7 +1732,7 @@
 
             self._prog_basename = 'MEMORY'
-            self.fo = StringIO()
+            self.fo = BytesIO()
             # if this is to be a tempfile instead....
             # it just makes crap in the tempdir
             #fh, self._temp_name = mkstemp()
 
@@ -1778,7 +1771,7 @@
                 raise err
             # re open it
             try:
-                self.fo = open(self.filename, 'r')
+                self.fo = open(self.filename, 'rb')
             except IOError as e:
                 err = URLGrabError(16, _(\
                     'error opening file from %s, IOError: %s') % (self.url, e))
@@ -1853,7 +1846,7 @@
         #if self.opts.progress_obj:
         #    self.opts.progress_obj.update(self._amount_read)
 
-        self._rbuf = ''.join(buf)
+        self._rbuf = b''.join(buf)
         return
 
     def _progress_update(self, download_total, downloaded, upload_total, uploaded):
@@ -1888,28 +1881,40 @@
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None:
-            s, self._rbuf = self._rbuf, ''
+            s, self._rbuf = self._rbuf, b''
         else:
            s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
-        return s
+        return s if not six.PY3 else s.decode('utf-8')
 
     def readline(self, limit=-1):
         if not self._complete:
             self._do_grab()
-            return self.fo.readline()
+            return self.fo.readline() if not six.PY3 else self.fo.readline().decode('utf-8')
 
-        i = self._rbuf.find('\n')
+        i = self._rbuf.find(b'\n')
         while i < 0 and not (0 < limit <= len(self._rbuf)):
             L = len(self._rbuf)
             self._fill_buffer(L + self._rbufsize)
             if not len(self._rbuf) > L: break
-            i = self._rbuf.find('\n', L)
+            i = self._rbuf.find(b'\n', L)
 
         if i < 0: i = len(self._rbuf)
         else: i = i+1
         if 0 <= limit < len(self._rbuf): i = limit
         s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
-        return s
+        return s if not six.PY3 else s.decode('utf-8')
+
+    # This was added here because we need to wrap self.fo readlines (which will
+    # always return bytes) in correct decoding
+    def readlines(self, *args, **kwargs):
+        if not six.PY3:
+            return [line for line in self.fo.readlines(*args, **kwargs)]
+        else:
+            return self._py3readlines(*args, **kwargs)
+
+    def _py3readlines(self, *args, **kwargs):
+        for line in self.fo.readlines(*args, **kwargs):
+            yield line.decode('utf-8')
 
     def close(self):
         if self._prog_running:
@@ -2055,11 +2060,9 @@
 def _readlines(fd):
     buf = os.read(fd, 4096)
     if not buf: return None
     # whole lines only, no buffering
-    buf_compat = buf if not six.PY3 else buf.decode('utf-8')
-    while buf_compat[-1] != '\n':
+    while buf.decode('utf-8')[-1] != '\n':
         buf += os.read(fd, 4096)
-        buf_compat = buf if not six.PY3 else buf.decode('utf-8')
-    return buf_compat[:-1].split('\n')
+    return buf.decode('utf-8')[:-1].split('\n')
 
 import subprocess
@@ -2403,7 +2406,7 @@ class _TH:
         if filename and _TH.dirty is None:
             try:
                 now = int(time.time())
-                for line in open(filename):
+                for line in open(filename, 'rb'):
                     host, speed, fail, ts = line.rsplit(' ', 3)
                     _TH.hosts[host] = int(speed), int(fail), min(int(ts), now)
             except IOError: pass
@@ -2415,7 +2418,7 @@
         if filename and _TH.dirty is True:
             tmp = '%s.%d' % (filename, os.getpid())
             try:
-                f = open(tmp, 'w')
+                f = open(tmp, 'wb')
                 for host in _TH.hosts:
                     f.write(host + ' %d %d %d\n' % _TH.hosts[host])
                 f.close()
@@ -2536,7 +2539,7 @@ def _file_object_test(filename=None):
     if filename is None:
         filename = __file__
     print('using file "%s" for comparisons' % filename)
-    fo = open(filename)
+    fo = open(filename, 'rb')
     s_input = fo.read()
     fo.close()
 
@@ -2544,8 +2547,8 @@
                      _test_file_object_readall,
                      _test_file_object_readline,
                      _test_file_object_readlines]:
-        fo_input = StringIO(s_input)
-        fo_output = StringIO()
+        fo_input = BytesIO(s_input)
+        fo_output = BytesIO()
         wrapper = PyCurlFileObject(fo_input, None, 0)
         print('testing %-30s ' % testfunc.__name__, testfunc(wrapper, fo_output))
         s_output = fo_output.getvalue()
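
Note on the two patterns this patch applies throughout. tempfile.mkstemp() creates the file atomically and returns an (fd, path) pair, unlike the race-prone tempfile.mktemp() it replaces; and PyCurlFileObject now keeps every internal buffer (_rbuf, _hdr_dump, the BytesIO backing store) as bytes, decoding to str only at the public read()/readline()/readlines() boundary on Python 3. A minimal sketch of both patterns, for illustration only (not part of the patch; the BytesBackedReader name is hypothetical):

    import os
    import tempfile

    import six

    # mkstemp() returns (fd, path); closing the fd avoids leaking a
    # descriptor when only the path is needed.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    os.unlink(path)

    class BytesBackedReader(object):
        # Hypothetical reader mirroring the buffering scheme above:
        # bytes on the inside, native str handed out at the boundary.
        def __init__(self):
            self._rbuf = b''            # internal buffer is always bytes

        def _feed(self, chunk):
            self._rbuf += chunk         # chunks arrive as bytes (as from pycurl)

        def read(self, amt=None):
            if amt is None:
                s, self._rbuf = self._rbuf, b''
            else:
                s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
            # decode only here: Python 3 callers get str, Python 2 callers
            # keep receiving the native str (bytes) they always did
            return s if not six.PY3 else s.decode('utf-8')

    r = BytesBackedReader()
    r._feed(b'hello\n')
    print(r.read())                     # 'hello\n' as native str on 2 and 3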