commit 7d6b90e17d333535549e2d3ec1cf41845a9b876f
Author: Tomas Radej
Date:   Wed Aug 20 13:32:18 2014 +0200

    Ported main code
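The bulk of this commit is mechanical: Python 2's "except E, e" and three-argument
"raise" statements become forms that parse on both interpreters, with six.reraise()
carrying the original traceback. A minimal sketch of the pattern applied throughout
(the function name here is illustrative only):

    import sys
    import six

    def fetch():
        raise IOError('ftp error', 'no host given')

    try:
        fetch()
    except IOError as e:                 # Python 2 only: except IOError, e:
        error = IOError('ftp failed', e)
        # Python 2 only: raise IOError, ('ftp failed', e), sys.exc_info()[2]
        six.reraise(error.__class__, error, sys.exc_info()[2])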
""" def open_local_file(self, req): - import mimetypes - import mimetools host = req.get_host() file = req.get_selector() - localfile = urllib.url2pathname(file) + localfile = urllib.request.url2pathname(file) stats = os.stat(localfile) size = stats[stat.ST_SIZE] - modified = rfc822.formatdate(stats[stat.ST_MTIME]) + modified = email.utils.formatdate(stats[stat.ST_MTIME]) mtype = mimetypes.guess_type(file)[0] if host: - host, port = urllib.splitport(host) + host, port = urllib.parse.splitport(host) if port or socket.gethostbyname(host) not in self.get_names(): - raise urllib2.URLError('file not on local host') + raise urllib.error.URLError('file not on local host') fo = open(localfile,'rb') brange = req.headers.get('Range',None) brange = range_header_to_tuple(brange) assert brange != () if brange: (fb,lb) = brange - if lb == '': lb = size + if lb == '': + lb = size if fb < 0 or fb > size or lb > size: raise RangeError(9, 'Requested Range Not Satisfiable') size = (lb - fb) fo = RangeableFileObject(fo, (fb,lb)) - headers = mimetools.Message(StringIO( + headers = email.message.Message(StringIO( 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % (mtype or 'text/plain', size, modified))) return urllib.addinfourl(fo, headers, 'file:'+file) @@ -254,42 +251,39 @@ class FileRangeHandler(urllib2.FileHandler): # follows: # -- range support modifications start/end here -from urllib import splitport, splituser, splitpasswd, splitattr, \ - unquote, addclosehook, addinfourl import ftplib import socket import sys -import mimetypes -import mimetools +from six.moves.urllib.parse import urlparse, unquote + +# Very old functions and classes, undocumented in current Python releases +if six.PY3: + from urllib.request import splitattr + from urllib.response import addinfourl +else: + from urllib import splitattr + from urllib import addinfourl -class FTPRangeHandler(urllib2.FTPHandler): + +class FTPRangeHandler(urllib.request.FTPHandler): def ftp_open(self, req): host = req.get_host() if not host: - raise IOError, ('ftp error', 'no host given') - host, port = splitport(host) - if port is None: - port = ftplib.FTP_PORT - else: - port = int(port) + raise IOError('ftp error', 'no host given') - # username/password handling - user, host = splituser(host) - if user: - user, passwd = splitpasswd(user) - else: - passwd = None + parsed = urlparse(host) + port = parsed.port or ftplib.FTP_PORT + user = unquote(parsed.username or '') + passwd = unquote(parsed.passwd or '') host = unquote(host) - user = unquote(user or '') - passwd = unquote(passwd or '') try: host = socket.gethostbyname(host) - except socket.error, msg: - raise urllib2.URLError(msg) + except socket.error as msg: + raise urllib.error.URLError(msg) path, attrs = splitattr(req.get_selector()) dirs = path.split('/') - dirs = map(unquote, dirs) + dirs = list(map(unquote, dirs)) dirs, file = dirs[:-1], dirs[-1] if dirs and not dirs[0]: dirs = dirs[1:] @@ -336,24 +330,36 @@ class FTPRangeHandler(urllib2.FTPHandler): if retrlen is not None and retrlen >= 0: headers += "Content-Length: %d\n" % retrlen sf = StringIO(headers) - headers = mimetools.Message(sf) + headers = email.message.Message(sf) return addinfourl(fp, headers, req.get_full_url()) - except ftplib.all_errors, msg: - raise IOError, ('ftp error', msg), sys.exc_info()[2] + except ftplib.all_errors as msg: + error = IOError('ftp error', msg) + six.reraise(error.__class__, error, sys.exc_info()[2]) def connect_ftp(self, user, passwd, host, port, dirs): fw = ftpwrapper(user, passwd, host, port, 
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index 5efa160..ffaed8e 100644
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -18,24 +18,22 @@
 #   Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko

+import email.utils
+import mimetypes
 import os
+import six
 import stat
-import urllib
-import urllib2
-import rfc822
+from six.moves import urllib

 DEBUG = None

-try:
-    from cStringIO import StringIO
-except ImportError, msg:
-    from StringIO import StringIO
+from io import StringIO

 class RangeError(IOError):
     """Error raised when an unsatisfiable range is requested."""
     pass

-class HTTPRangeHandler(urllib2.BaseHandler):
+class HTTPRangeHandler(urllib.request.BaseHandler):
     """Handler that enables HTTP Range headers.

     This was extremely simple. The Range header is a HTTP feature to
@@ -120,7 +118,7 @@ class RangeableFileObject:
            in self.fo.  This includes methods."""
         if hasattr(self.fo, name):
             return getattr(self.fo, name)
-        raise AttributeError, name
+        raise AttributeError(name)

     def tell(self):
         """Return the position within the range.
@@ -211,37 +209,36 @@ class RangeableFileObject:
                 raise RangeError(9, 'Requested Range Not Satisfiable')
             pos+= bufsize

-class FileRangeHandler(urllib2.FileHandler):
+class FileRangeHandler(urllib.request.FileHandler):
     """FileHandler subclass that adds Range support.
     This class handles Range headers exactly like an HTTP server would.
     """
     def open_local_file(self, req):
-        import mimetypes
-        import mimetools
         host = req.get_host()
         file = req.get_selector()
-        localfile = urllib.url2pathname(file)
+        localfile = urllib.request.url2pathname(file)
         stats = os.stat(localfile)
         size = stats[stat.ST_SIZE]
-        modified = rfc822.formatdate(stats[stat.ST_MTIME])
+        modified = email.utils.formatdate(stats[stat.ST_MTIME])
         mtype = mimetypes.guess_type(file)[0]
         if host:
-            host, port = urllib.splitport(host)
+            host, port = urllib.parse.splitport(host)
             if port or socket.gethostbyname(host) not in self.get_names():
-                raise urllib2.URLError('file not on local host')
+                raise urllib.error.URLError('file not on local host')
         fo = open(localfile,'rb')
         brange = req.headers.get('Range',None)
         brange = range_header_to_tuple(brange)
         assert brange != ()
         if brange:
             (fb,lb) = brange
-            if lb == '': lb = size
+            if lb == '':
+                lb = size
             if fb < 0 or fb > size or lb > size:
                 raise RangeError(9, 'Requested Range Not Satisfiable')
             size = (lb - fb)
             fo = RangeableFileObject(fo, (fb,lb))
-        headers = mimetools.Message(StringIO(
-            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
-            (mtype or 'text/plain', size, modified)))
-        return urllib.addinfourl(fo, headers, 'file:'+file)
+        headers = email.message_from_string(
+            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
+            (mtype or 'text/plain', size, modified))
+        return addinfourl(fo, headers, 'file:'+file)
@@ -254,42 +251,39 @@ class FileRangeHandler(urllib2.FileHandler):
 # follows:
 # -- range support modifications start/end here

-from urllib import splitport, splituser, splitpasswd, splitattr, \
-    unquote, addclosehook, addinfourl
 import ftplib
 import socket
 import sys
-import mimetypes
-import mimetools
+from six.moves.urllib.parse import urlparse, unquote
+
+# Very old functions and classes, undocumented in current Python releases
+if six.PY3:
+    from urllib.request import splitattr
+    from urllib.response import addinfourl
+else:
+    from urllib import splitattr
+    from urllib import addinfourl

-class FTPRangeHandler(urllib2.FTPHandler):
+
+class FTPRangeHandler(urllib.request.FTPHandler):
     def ftp_open(self, req):
         host = req.get_host()
         if not host:
-            raise IOError, ('ftp error', 'no host given')
-        host, port = splitport(host)
-        if port is None:
-            port = ftplib.FTP_PORT
-        else:
-            port = int(port)
+            raise IOError('ftp error', 'no host given')

-        # username/password handling
-        user, host = splituser(host)
-        if user:
-            user, passwd = splitpasswd(user)
-        else:
-            passwd = None
+        parsed = urlparse('ftp://%s' % host)
+        port = parsed.port or ftplib.FTP_PORT
+        user = unquote(parsed.username or '')
+        passwd = unquote(parsed.password or '')
+        host = parsed.hostname
         host = unquote(host)
-        user = unquote(user or '')
-        passwd = unquote(passwd or '')

         try:
             host = socket.gethostbyname(host)
-        except socket.error, msg:
-            raise urllib2.URLError(msg)
+        except socket.error as msg:
+            raise urllib.error.URLError(msg)
         path, attrs = splitattr(req.get_selector())
         dirs = path.split('/')
-        dirs = map(unquote, dirs)
+        dirs = list(map(unquote, dirs))
         dirs, file = dirs[:-1], dirs[-1]
         if dirs and not dirs[0]:
             dirs = dirs[1:]
@@ -336,24 +330,36 @@ class FTPRangeHandler(urllib2.FTPHandler):
             if retrlen is not None and retrlen >= 0:
                 headers += "Content-Length: %d\n" % retrlen
             sf = StringIO(headers)
-            headers = mimetools.Message(sf)
+            headers = email.message_from_file(sf)
             return addinfourl(fp, headers, req.get_full_url())
-        except ftplib.all_errors, msg:
-            raise IOError, ('ftp error', msg), sys.exc_info()[2]
+        except ftplib.all_errors as msg:
+            error = IOError('ftp error', msg)
+            six.reraise(error.__class__, error, sys.exc_info()[2])

     def connect_ftp(self, user, passwd, host, port, dirs):
         fw = ftpwrapper(user, passwd, host, port, dirs)
         return fw

-class ftpwrapper(urllib.ftpwrapper):
+# Very old functions and classes, undocumented in current Python releases
+if six.PY3:
+    from urllib.request import ftpwrapper, addclosehook
+else:
+    from urllib import ftpwrapper, addclosehook
+
+
+class ftpwrapper(ftpwrapper):
     # range support note:
     #    this ftpwrapper code is copied directly from
     #    urllib. The only enhancement is to add the rest
     #    argument and pass it on to ftp.ntransfercmd
     def retrfile(self, file, type, rest=None):
         self.endtransfer()
-        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
-        else: cmd = 'TYPE ' + type; isdir = 0
+        if type in ('d', 'D'):
+            cmd = 'TYPE A'
+            isdir = 1
+        else:
+            cmd = 'TYPE ' + type
+            isdir = 0
         try:
             self.ftp.voidcmd(cmd)
         except ftplib.all_errors:
@@ -364,22 +370,23 @@ class ftpwrapper(urllib.ftpwrapper):
             # Use nlst to see if the file exists at all
             try:
                 self.ftp.nlst(file)
-            except ftplib.error_perm, reason:
-                raise IOError, ('ftp error', reason), sys.exc_info()[2]
+            except ftplib.error_perm as reason:
+                error = IOError('ftp error', reason)
+                six.reraise(error.__class__, error, sys.exc_info()[2])
             # Restore the transfer mode!
             self.ftp.voidcmd(cmd)
             # Try to retrieve as a file
             try:
                 cmd = 'RETR ' + file
                 conn = self.ftp.ntransfercmd(cmd, rest)
-            except ftplib.error_perm, reason:
+            except ftplib.error_perm as reason:
                 if str(reason)[:3] == '501':
                     # workaround for REST not supported error
                     fp, retrlen = self.retrfile(file, type)
                     fp = RangeableFileObject(fp, (rest,''))
                     return (fp, retrlen)
                 elif str(reason)[:3] != '550':
-                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
+                    error = IOError('ftp error', reason)
+                    six.reraise(error.__class__, error, sys.exc_info()[2])
         if not conn:
             # Set transfer mode to ASCII!
             self.ftp.voidcmd('TYPE A')
@@ -458,6 +465,7 @@ def range_tuple_normalize(range_tup):
     # check if range is over the entire file
     if (fb,lb) == (0,''): return None
     # check that the range is valid
-    if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
+    if lb != '' and lb < fb:
+        raise RangeError(9, 'Invalid byte range: %s-%s' % (fb, lb))
     return (fb,lb)
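byterange.py exists so the grabber can issue partial downloads. With the handler
ported, Python 3 usage would look roughly like this sketch (the URL is a
placeholder; RangeError is what the handler raises for a 416 response):

    import urllib.request
    from urlgrabber.byterange import HTTPRangeHandler, RangeError

    opener = urllib.request.build_opener(HTTPRangeHandler())
    req = urllib.request.Request('http://www.example.com/file.txt')
    req.add_header('Range', 'bytes=0-99')   # ask for the first 100 bytes
    try:
        data = opener.open(req).read()
    except RangeError:
        data = None                         # server could not satisfy the range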
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index f8deeb8..35c091e 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -499,22 +499,24 @@ BANDWIDTH THROTTLING

 import os
 import sys
-import urlparse
 import time
+import collections
+import fcntl
+import pycurl
+import select
+import six
+import socket
+import stat
 import string
-import urllib
-import urllib2
-from httplib import responses
-import mimetools
-import thread
 import types
-import stat
-import pycurl
+from email import message_from_file
 from ftplib import parse150
-from StringIO import StringIO
-from httplib import HTTPException
-import socket, select, fcntl
-from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+from six.moves import _thread as thread
+from six.moves import urllib
+from six.moves.http_client import responses, HTTPException
+from urlgrabber.byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+
+from io import StringIO

 try:
     import xattr
@@ -535,7 +537,7 @@ except:
 try:
     # this part isn't going to do much - need to talk to gettext
     from i18n import _
-except ImportError, msg:
+except ImportError as msg:
     def _(st): return st

 ########################################################################
@@ -635,6 +637,8 @@ def _(st):

 def _to_utf8(obj, errors='replace'):
     '''convert 'unicode' to an encoded utf-8 byte string '''
+    if six.PY3:
+        return obj
     # stolen from yum.i18n
     if isinstance(obj, unicode):
         obj = obj.encode('utf-8', errors)
@@ -791,14 +795,14 @@ class URLParser:
         if opts.prefix:
             url = self.add_prefix(url, opts.prefix)

-        parts = urlparse.urlparse(url)
+        parts = urllib.parse.urlparse(url)
         (scheme, host, path, parm, query, frag) = parts

-        if not scheme or (len(scheme) == 1 and scheme in string.letters):
+        if not scheme or (len(scheme) == 1 and scheme in string.ascii_letters):
             # if a scheme isn't specified, we guess that it's "file:"
             if url[0] not in '/\\':
                 url = os.path.abspath(url)
-            url = 'file:' + urllib.pathname2url(url)
-            parts = urlparse.urlparse(url)
+            url = 'file:' + urllib.request.pathname2url(url)
+            parts = urllib.parse.urlparse(url)
             quote = 0     # pathname2url quotes, so we won't do it again

         if scheme in ['http', 'https']:
@@ -809,7 +813,7 @@ class URLParser:
         if quote:
             parts = self.quote(parts)
-        url = urlparse.urlunparse(parts)
+        url = urllib.parse.urlunparse(parts)
         return url, parts

     def add_prefix(self, url, prefix):
@@ -833,7 +837,7 @@ class URLParser:
         passing into urlgrabber.
         """
         (scheme, host, path, parm, query, frag) = parts
-        path = urllib.quote(path)
+        path = urllib.parse.quote(path)
         return (scheme, host, path, parm, query, frag)

     hexvals = '0123456789ABCDEF'
@@ -850,7 +854,7 @@ class URLParser:
         (scheme, host, path, parm, query, frag) = parts
         if ' ' in path: return 1
-        ind = string.find(path, '%')
+        ind = path.find('%')
         if ind > -1:
             while ind > -1:
                 if len(path) < ind+3:
                     return 1
@@ -859,7 +863,7 @@ class URLParser:
                 code = path[ind+1:ind+3]
                 if code[0] not in self.hexvals or \
                    code[1] not in self.hexvals:
                     return 1
-                ind = string.find(path, '%', ind+1)
+                ind = path.find('%', ind+1)
             return 0
         return 1
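URLParser's scheme guessing is why pathname2url moves to urllib.request: a bare
path (or a one-letter Windows drive "scheme") is rewritten as a file: URL before
parsing. A sketch of what the ported branch computes (the path is illustrative):

    import os
    from six.moves import urllib

    url = 'tmp/some file'          # no scheme, so treat it as a local path
    url = os.path.abspath(url)
    url = 'file:' + urllib.request.pathname2url(url)
    parts = urllib.parse.urlparse(url)
    # parts.scheme == 'file'; the space is already %-quoted by pathname2url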
""" - if self.throttle <= 0: + if self.throttle is None or self.throttle <= 0: return 0 elif type(self.throttle) == type(0): return float(self.throttle) @@ -937,7 +941,7 @@ class URLGrabberOptions: def _set_attributes(self, **kwargs): """Update object attributes with those provided in kwargs.""" self.__dict__.update(kwargs) - if kwargs.has_key('range'): + if 'range' in kwargs: # normalize the supplied range value self.range = range_tuple_normalize(self.range) if not self.reget in [None, 'simple', 'check_timestamp']: @@ -1006,7 +1010,7 @@ class URLGrabberOptions: return self.format() def format(self, indent=' '): - keys = self.__dict__.keys() + keys = list(self.__dict__.keys()) if self.delegate is not None: keys.remove('delegate') keys.sort() @@ -1026,7 +1030,7 @@ def _do_raise(obj): def _run_callback(cb, obj): if not cb: return - if callable(cb): + if isinstance(cb, collections.Callable): return cb(obj) cb, arg, karg = cb return cb(obj, *arg, **karg) @@ -1058,16 +1062,15 @@ class URLGrabber(object): tries = tries + 1 exception = None callback = None - if DEBUG: DEBUG.info('attempt %i/%s: %s', - tries, opts.retry, args[0]) + if DEBUG: DEBUG.info('attempt %i/%s: %s', tries, opts.retry, args[0]) try: - r = apply(func, (opts,) + args, {}) + r = func(*(opts,) + args, **{}) if DEBUG: DEBUG.info('success') return r - except URLGrabError, e: + except URLGrabError as e: exception = e callback = opts.failure_callback - except KeyboardInterrupt, e: + except KeyboardInterrupt as e: exception = e callback = opts.interrupt_callback if not callback: @@ -1082,13 +1085,13 @@ class URLGrabber(object): if (opts.retry is None) or (tries == opts.retry): if DEBUG: DEBUG.info('retries exceeded, re-raising') - raise + raise exception retrycode = getattr(exception, 'errno', None) if (retrycode is not None) and (retrycode not in opts.retrycodes): if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising', retrycode, opts.retrycodes) - raise + raise exception def urlopen(self, url, opts=None, **kwargs): """open the url and return a file object @@ -1119,14 +1122,14 @@ class URLGrabber(object): (scheme, host, path, parm, query, frag) = parts opts.find_proxy(url, scheme) if filename is None: - filename = os.path.basename( urllib.unquote(path) ) + filename = os.path.basename( urllib.parse.unquote(path) ) if not filename: # This is better than nothing. 
@@ -1235,13 +1238,13 @@ class PyCurlFileObject(object):
         self._hdr_dump = ''
         self._parsed_hdr = None
         self.url = url
-        self.scheme = urlparse.urlsplit(self.url)[0]
+        self.scheme = urllib.parse.urlsplit(self.url)[0]
         self.filename = filename
         self.append = False
         self.reget_time = None
         self.opts = opts
         if self.opts.reget == 'check_timestamp':
-            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
+            raise NotImplementedError("check_timestamp regets are not implemented in this ver of urlgrabber. Please report this.")
         self._complete = False
         self._rbuf = ''
         self._rbufsize = 1024*8
@@ -1266,7 +1269,7 @@ class PyCurlFileObject(object):
         if hasattr(self.fo, name):
             return getattr(self.fo, name)

-        raise AttributeError, name
+        raise AttributeError(name)

     def _retrieve(self, buf):
         try:
@@ -1280,7 +1283,7 @@ class PyCurlFileObject(object):
                 if self.opts.progress_obj:
                     size  = self.size + self._reget_length
                     self.opts.progress_obj.start(self._prog_reportname,
-                                                 urllib.unquote(self.url),
+                                                 urllib.parse.unquote(self.url),
                                                  self._prog_basename,
                                                  size=size,
                                                  text=self.opts.text)
@@ -1295,10 +1298,16 @@ class PyCurlFileObject(object):
                     start = self._range[0] - pos
                     stop = self._range[1] - pos
                     if start < len(buf) and stop > 0:
-                        self.fo.write(buf[max(start, 0):stop])
+                        if isinstance(self.fo, StringIO):
+                            self.fo.write(buf[max(start, 0):stop].decode('utf-8'))
+                        else:
+                            self.fo.write(buf[max(start, 0):stop])
                 else:
-                    self.fo.write(buf)
-        except IOError, e:
+                    if isinstance(self.fo, StringIO):
+                        self.fo.write(buf.decode('utf-8'))
+                    else:
+                        self.fo.write(buf)
+        except IOError as e:
             self._cb_error = URLGrabError(16, exception2msg(e))
             return -1
         return len(buf)
@@ -1319,10 +1328,12 @@ class PyCurlFileObject(object):
             # but we can't do that w/o making it do 2 connects, which sucks
             # so we cheat and stuff it in here in the hdr_retrieve
             if self.scheme in ['http','https']:
-                if buf.lower().find('content-length:') != -1:
-                    length = buf.split(':')[1]
+                content_length_str = 'content-length:' if not six.PY3 else b'content-length:'
+                if buf.lower().find(content_length_str) != -1:
+                    split_str = ':' if not six.PY3 else b':'
+                    length = buf.split(split_str)[1]
                     self.size = int(length)
-                elif (self.append or self.opts.range) and self._hdr_dump == '' and ' 200 ' in buf:
+                elif (self.append or self.opts.range) and self._hdr_dump == '' and b' 200 ' in buf:
                     # reget was attempted but server sends it all
                     # undo what we did in _build_range()
                     self.append = False
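The six.PY3 conditionals above exist because pycurl delivers header lines to the
HEADERFUNCTION callback as bytes on Python 3 but str on Python 2. An arguably
simpler approach would decode once at the top of the callback; a sketch, not what
the commit does (HTTP header bytes decode losslessly as latin-1):

    def normalize_header_line(buf):
        # pycurl hands the header callback bytes on Python 3
        if isinstance(buf, bytes):
            buf = buf.decode('iso-8859-1')
        return buf

    line = normalize_header_line(b'Content-Length: 1234\r\n')
    if line.lower().startswith('content-length:'):
        size = int(line.split(':')[1])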
@@ -1333,23 +1344,26 @@ class PyCurlFileObject(object):
                     self.fo.truncate(0)
             elif self.scheme in ['ftp']:
                 s = None
-                if buf.startswith('213 '):
+                if buf.startswith(b'213 '):
                     s = buf[3:].strip()
                     if len(s) >= 14:
                         s = None # ignore MDTM responses
-                elif buf.startswith('150 '):
-                    s = parse150(buf)
+                elif buf.startswith(b'150 '):
+                    s = parse150(buf if not six.PY3 else buf.decode('utf-8'))
                 if s:
                     self.size = int(s)

-            if buf.lower().find('location') != -1:
-                location = ':'.join(buf.split(':')[1:])
+            location_str = 'location' if not six.PY3 else b'location'
+            if buf.lower().find(location_str) != -1:
+                buf_compat = buf if not six.PY3 else buf.decode('utf-8')
+                location = ':'.join(buf_compat.split(':')[1:])
                 location = location.strip()
-                self.scheme = urlparse.urlsplit(location)[0]
+                self.scheme = urllib.parse.urlsplit(location)[0]
                 self.url = location

-        self._hdr_dump += buf
-        if len(self._hdr_dump) != 0 and buf == '\r\n':
+        self._hdr_dump += buf if not six.PY3 else buf.decode('utf-8')
+        end_str = '\r\n' if not six.PY3 else b'\r\n'
+        if len(self._hdr_dump) != 0 and buf == end_str:
             self._hdr_ended = True
             if DEBUG: DEBUG.debug('header ended:')
@@ -1365,7 +1379,7 @@ class PyCurlFileObject(object):
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
         hdrfp.seek(0)
-        self._parsed_hdr = mimetools.Message(hdrfp)
+        self._parsed_hdr = message_from_file(hdrfp)
         return self._parsed_hdr

     hdr = property(_return_hdr_obj)
@@ -1490,7 +1504,7 @@ class PyCurlFileObject(object):

         try:
             self.curl_obj.perform()
-        except pycurl.error, e:
+        except pycurl.error as e:
             # XXX - break some of these out a bit more clearly
             # to other URLGrabErrors from
             # http://curl.haxx.se/libcurl/c/libcurl-errors.html
@@ -1498,7 +1512,7 @@ class PyCurlFileObject(object):

             code = self.http_code
             errcode = e.args[0]
-            errurl = urllib.unquote(self.url)
+            errurl = urllib.parse.unquote(self.url)

             if self._error[0]:
                 errcode = self._error[0]
@@ -1588,7 +1602,7 @@ class PyCurlFileObject(object):
             if self._error[1]:
                 msg = self._error[1]
                 err = URLGrabError(14, msg)
-                err.url = urllib.unquote(self.url)
+                err.url = urllib.parse.unquote(self.url)
                 raise err

     def _do_open(self):
@@ -1605,7 +1619,7 @@ class PyCurlFileObject(object):
     def _build_range(self):
         reget_length = 0
         rt = None
-        if self.opts.reget and type(self.filename) in types.StringTypes:
+        if self.opts.reget and isinstance(self.filename, six.string_types):
             # we have reget turned on and we're dumping to a file
             try:
                 s = os.stat(self.filename)
@@ -1655,22 +1669,22 @@ class PyCurlFileObject(object):
             else:
                 fo = opener.open(req)
             hdr = fo.info()
-        except ValueError, e:
+        except ValueError as e:
             err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
             err.url = self.url
             raise err

-        except RangeError, e:
+        except RangeError as e:
             err = URLGrabError(9, _('%s on %s') % (e, self.url))
             err.url = self.url
             raise err
-        except urllib2.HTTPError, e:
+        except urllib.error.HTTPError as e:
             new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
             new_e.code = e.code
             new_e.exception = e
             new_e.url = self.url
             raise new_e
-        except IOError, e:
+        except IOError as e:
             if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
                 err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
                 err.url = self.url
@@ -1680,12 +1694,12 @@ class PyCurlFileObject(object):
                 err.url = self.url
                 raise err

-        except OSError, e:
+        except OSError as e:
             err = URLGrabError(5, _('%s on %s') % (e, self.url))
             err.url = self.url
             raise err

-        except HTTPException, e:
+        except HTTPException as e:
             err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
                             (e.__class__.__name__, self.url, e))
             err.url = self.url
             raise err
@@ -1700,19 +1714,21 @@ class PyCurlFileObject(object):
         if self._complete:
             return
         _was_filename = False
-        if type(self.filename) in types.StringTypes and self.filename:
+        if self.filename and isinstance(self.filename, six.string_types):
             _was_filename = True
             self._prog_reportname = str(self.filename)
             self._prog_basename = os.path.basename(self.filename)

-            if self.append: mode = 'ab'
-            else: mode = 'wb'
+            if self.append:
+                mode = 'ab'
+            else:
+                mode = 'wb'

-            if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
-                                 (self.filename, mode))
+            if DEBUG:
+                DEBUG.info('opening local file "%s" with mode %s' % (self.filename, mode))
             try:
                 self.fo = open(self.filename, mode)
-            except IOError, e:
+            except IOError as e:
                 err = URLGrabError(16, _(\
                   'error opening local file from %s, IOError: %s') % (self.url, e))
                 err.url = self.url
@@ -1731,7 +1747,7 @@ class PyCurlFileObject(object):

         try:
             self._do_perform()
-        except URLGrabError, e:
+        except URLGrabError as e:
             self.fo.flush()
             self.fo.close()
             raise e
@@ -1754,7 +1770,7 @@ class PyCurlFileObject(object):
                 if mod_time != -1:
                     try:
                         os.utime(self.filename, (mod_time, mod_time))
-                    except OSError, e:
+                    except OSError as e:
                         err = URLGrabError(16, _(\
                           'error setting timestamp on file %s from %s, OSError: %s')
                                    % (self.filename, self.url, e))
@@ -1763,7 +1779,7 @@ class PyCurlFileObject(object):
             # re open it
             try:
                 self.fo = open(self.filename, 'r')
-            except IOError, e:
+            except IOError as e:
                 err = URLGrabError(16, _(\
                   'error opening file from %s, IOError: %s') % (self.url, e))
                 err.url = self.url
@@ -1809,25 +1825,27 @@ class PyCurlFileObject(object):
             else: readamount = min(amt, self._rbufsize)
             try:
                 new = self.fo.read(readamount)
-            except socket.error, e:
+            except socket.error as e:
                 err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
                 err.url = self.url
                 raise err
-            except socket.timeout, e:
-                raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+            except socket.timeout as e:
+                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
                 err.url = self.url
                 raise err
-            except IOError, e:
-                raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e))
+            except IOError as e:
+                err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
                 err.url = self.url
                 raise err
             newsize = len(new)
-            if not newsize: break # no more to read
+            if not newsize:
+                break # no more to read

-            if amt: amt = amt - newsize
+            if amt:
+                amt = amt - newsize
             buf.append(new)
             bufsize = bufsize + newsize
             self._tsize = newsize
@@ -1835,7 +1853,7 @@ class PyCurlFileObject(object):
             #if self.opts.progress_obj:
             #    self.opts.progress_obj.update(self._amount_read)

-        self._rbuf = string.join(buf, '')
+        self._rbuf = ''.join(buf)
         return

     def _progress_update(self, download_total, downloaded, upload_total, uploaded):
@@ -1879,12 +1897,12 @@ class PyCurlFileObject(object):
             if not self._complete: self._do_grab()
             return self.fo.readline()

-        i = string.find(self._rbuf, '\n')
+        i = self._rbuf.find('\n')
         while i < 0 and not (0 < limit <= len(self._rbuf)):
             L = len(self._rbuf)
             self._fill_buffer(L + self._rbufsize)
             if not len(self._rbuf) > L: break
-            i = string.find(self._rbuf, '\n', L)
+            i = self._rbuf.find('\n', L)

         if i < 0: i = len(self._rbuf)
         else: i = i+1
@@ -1968,9 +1986,9 @@ def _dumps(v):
     if v is None: return 'None'
     if v is True: return 'True'
     if v is False: return 'False'
-    if type(v) in (int, long, float):
+    if type(v) in six.integer_types + (float,):
         return str(v)
-    if type(v) == unicode:
+    if not six.PY3 and type(v) == unicode:
         v = v.encode('UTF8')
     if type(v) == str:
         def quoter(c): return _quoter_map.get(c, c)
@@ -1979,17 +1997,21 @@ def _dumps(v):
         return "(%s)" % ','.join(map(_dumps, v))
     if type(v) == list:
         return "[%s]" % ','.join(map(_dumps, v))
-    raise TypeError, 'Can\'t serialize %s' % v
+    raise TypeError('Can\'t serialize %s' % v)

 def _loads(s):
     def decode(v):
         if v == 'None': return None
         if v == 'True': return True
         if v == 'False': return False
-        try: return int(v)
-        except ValueError: pass
-        try: return float(v)
-        except ValueError: pass
+        try:
+            return int(v)
+        except ValueError:
+            pass
+        try:
+            return float(v)
+        except ValueError:
+            pass
         if len(v) >= 2 and v[0] == v[-1] == "'":
             ret = []; i = 1
             while True:
@@ -2033,9 +2055,11 @@ def _readlines(fd):
     buf = os.read(fd, 4096)
     if not buf: return None
     # whole lines only, no buffering
-    while buf[-1] != '\n':
+    buf_compat = buf if not six.PY3 else buf.decode('utf-8')
+    while buf_compat[-1] != '\n':
         buf += os.read(fd, 4096)
-    return buf[:-1].split('\n')
+        buf_compat = buf if not six.PY3 else buf.decode('utf-8')
+    return buf_compat[:-1].split('\n')

 import subprocess

@@ -2071,7 +2095,8 @@ class _ExternalDownloader:
         arg = []
         for k in self._options:
             v = getattr(opts, k)
-            if v is None: continue
+            if v is None:
+                continue
             arg.append('%s=%s' % (k, _dumps(v)))
         if opts.progress_obj and opts.multi_progress_obj:
             arg.append('progress_obj=True')
@@ -2080,7 +2105,8 @@ class _ExternalDownloader:

         self.cnt += 1
         self.running[self.cnt] = opts
-        os.write(self.stdin, arg +'\n')
+        result = arg +'\n'
+        os.write(self.stdin, result if not six.PY3 else result.encode('utf-8'))

     def perform(self):
         ret = []
@@ -2091,7 +2117,7 @@ class _ExternalDownloader:
         for line in lines:
             # parse downloader output
             line = line.split(' ', 6)
-            _id, size = map(int, line[:2])
+            _id, size = list(map(int, line[:2]))
             if len(line) == 2:
                 self.running[_id]._progress.update(size)
                 continue
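_dumps/_loads define the line-oriented wire format between the grabber and its
external downloader process, which is why os.write() above now encodes on
Python 3 (it requires bytes there, str on Python 2). A sketch of the framing
(the pipe and option line are illustrative):

    import os
    import six

    def write_line(fd, line):
        # the downloader protocol is one newline-terminated option line per job
        if six.PY3:
            line = line.encode('utf-8')
        os.write(fd, line)

    r, w = os.pipe()
    write_line(w, "retry=3 range=(0,1024)\n")
    print(os.read(r, 4096))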
@@ -2121,7 +2147,7 @@ class _ExternalDownloaderPool:
         self.cache = {}

     def start(self, opts):
-        host = urlparse.urlsplit(opts.url).netloc
+        host = urllib.parse.urlsplit(opts.url).netloc
         dl = self.cache.pop(host, None)
         if not dl:
             dl = _ExternalDownloader()
@@ -2144,8 +2170,9 @@ class _ExternalDownloaderPool:
             ret.extend(done)

             # dl finished, move it to the cache
-            host = urlparse.urlsplit(done[0][0].url).netloc
-            if host in self.cache: self.cache[host].abort()
+            host = urllib.parse.urlsplit(done[0][0].url).netloc
+            if host in self.cache:
+                self.cache[host].abort()
             self.epoll.unregister(fd)
             self.cache[host] = self.running.pop(fd)
         return ret
@@ -2189,7 +2216,7 @@ def parallel_wait(meter=None):
             opts.tries = tries
             try:
                 dl.start(opts)
-            except OSError, e:
+            except OSError as e:
                 # can't spawn downloader, give up immediately
                 opts.exception = URLGrabError(5, exception2msg(e))
                 _run_callback(opts.failfunc, opts)
@@ -2212,7 +2239,8 @@ def parallel_wait(meter=None):
             if ug_err is None:
                 if opts.checkfunc:
                     try: _run_callback(opts.checkfunc, opts)
-                    except URLGrabError, ug_err: pass
+                    except URLGrabError as e:
+                        ug_err = e

             if opts.progress_obj:
                 if opts.multi_progress_obj:
@@ -2242,8 +2270,9 @@ def parallel_wait(meter=None):
             retry = opts.retry or 0
             if opts.failure_callback:
                 opts.exception = ug_err
-                try: _run_callback(opts.failure_callback, opts)
-                except URLGrabError, ug_err:
+                try:
+                    _run_callback(opts.failure_callback, opts)
+                except URLGrabError as e:
+                    ug_err = e
                     retry = 0 # no retries
             if opts.tries < retry and ug_err.errno in opts.retrycodes:
                 start(opts, opts.tries + 1) # simple retry
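The two checkfunc/failure_callback hunks need more than the mechanical "as"
rewrite: "except URLGrabError, ug_err" rebound the outer ug_err on Python 2,
but PEP 3110 makes the exception name local to the handler and deletes it on
exit, so the port must copy it out explicitly. A sketch of the behavior being
preserved:

    ug_err = None

    def checkfunc():
        raise ValueError('check failed')

    try:
        checkfunc()
    except ValueError as e:
        # 'e' is unbound again once this handler exits on Python 3,
        # so capture it in the enclosing scope by hand
        ug_err = e

    print(ug_err)   # the exception outlives the handler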
@@ -2293,8 +2322,7 @@ def parallel_wait(meter=None):

             # check global limit
             while len(dl.running) >= default_grabber.opts.max_connections:
                 perform()
-            if DEBUG:
-                DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)
+            if DEBUG: DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)

             if opts.mirror_group:
                 mg, errors, failed, removed = opts.mirror_group
@@ -2345,12 +2373,12 @@ def parallel_wait(meter=None):
                     limit = 1
                 while host_con.get(key, 0) >= (limit or 2):
                     perform()
-                if DEBUG:
-                    DEBUG.info('max_connections(%s): %d/%s', key, host_con.get(key, 0), limit)
+                if DEBUG: DEBUG.info('max_connections(%s): %d/%s', key, host_con.get(key, 0), limit)

                 start(opts, 1)
-    except IOError, e:
-        if e.errno != 4: raise
+    except IOError as e:
+        if e.errno != 4:
+            raise
         raise KeyboardInterrupt

     finally:
@@ -2399,7 +2427,7 @@ class _TH:
     def update(url, dl_size, dl_time, ug_err, baseurl=None):
         # Use hostname from URL.  If it's a file:// URL, use baseurl.
         # If no baseurl, do not update timedhosts.
-        host = urlparse.urlsplit(url).netloc.split('@')[-1] or baseurl
+        host = urllib.parse.urlsplit(url).netloc.split('@')[-1] or baseurl
         if not host:
             return
         _TH.load()
@@ -2431,7 +2459,7 @@ class _TH:
         _TH.load()

         # Use just the hostname, unless it's a file:// baseurl.
-        host = urlparse.urlsplit(baseurl).netloc.split('@')[-1] or baseurl
+        host = urllib.parse.urlsplit(baseurl).netloc.split('@')[-1] or baseurl

         default_speed = default_grabber.opts.default_speed
         try: speed, fail, ts = _TH.hosts[host]
@@ -2447,68 +2475,67 @@ class _TH:
 def _main_test():
     try: url, filename = sys.argv[1:3]
     except ValueError:
-        print 'usage:', sys.argv[0], \
-              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+        print('usage:', sys.argv[0],
+              '<url> <filename> [copy_local=0|1] [close_connection=0|1]')
         sys.exit()

     kwargs = {}
     for a in sys.argv[3:]:
-        k, v = string.split(a, '=', 1)
+        k, v = a.split('=', 1)
         kwargs[k] = int(v)

     set_throttle(1.0)
     set_bandwidth(32 * 1024)
-    print "throttle: %s,  throttle bandwidth: %s B/s" % (default_grabber.throttle,
-                                                        default_grabber.bandwidth)
+    print("throttle: %s,  throttle bandwidth: %s B/s" % (default_grabber.throttle,
+                                                        default_grabber.bandwidth))

-    try: from progress import text_progress_meter
-    except ImportError, e: pass
+    try: from .progress import text_progress_meter
+    except ImportError: pass
     else: kwargs['progress_obj'] = text_progress_meter()

-    try: name = apply(urlgrab, (url, filename), kwargs)
-    except URLGrabError, e: print e
-    else: print 'LOCAL FILE:', name
+    try: name = urlgrab(url, filename, **kwargs)
+    except URLGrabError as e: print(e)
+    else: print('LOCAL FILE:', name)

 def _retry_test():
     try: url, filename = sys.argv[1:3]
     except ValueError:
-        print 'usage:', sys.argv[0], \
-              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+        print('usage:', sys.argv[0],
+              '<url> <filename> [copy_local=0|1] [close_connection=0|1]')
         sys.exit()

     kwargs = {}
     for a in sys.argv[3:]:
-        k, v = string.split(a, '=', 1)
+        k, v = a.split('=', 1)
         kwargs[k] = int(v)

-    try: from progress import text_progress_meter
-    except ImportError, e: pass
+    try: from .progress import text_progress_meter
+    except ImportError: pass
     else: kwargs['progress_obj'] = text_progress_meter()

     def cfunc(filename, hello, there='foo'):
-        print hello, there
+        print(hello, there)
         import random
         rnum = random.random()
         if rnum < .5:
-            print 'forcing retry'
+            print('forcing retry')
             raise URLGrabError(-1, 'forcing retry')
         if rnum < .75:
-            print 'forcing failure'
+            print('forcing failure')
             raise URLGrabError(-2, 'forcing immediate failure')
-        print 'success'
+        print('success')
         return

     kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
-    try: name = apply(retrygrab, (url, filename), kwargs)
-    except URLGrabError, e: print e
-    else: print 'LOCAL FILE:', name
+    try: name = retrygrab(url, filename, **kwargs)
+    except URLGrabError as e: print(e)
+    else: print('LOCAL FILE:', name)

 def _file_object_test(filename=None):
-    import cStringIO
     if filename is None:
         filename = __file__
-    print 'using file "%s" for comparisons' % filename
+    print('using file "%s" for comparisons' % filename)
     fo = open(filename)
     s_input = fo.read()
     fo.close()
@@ -2517,14 +2544,13 @@ def _file_object_test(filename=None):
                      _test_file_object_readall,
                      _test_file_object_readline,
                      _test_file_object_readlines]:
-        fo_input = cStringIO.StringIO(s_input)
-        fo_output = cStringIO.StringIO()
+        fo_input = StringIO(s_input)
+        fo_output = StringIO()
         wrapper = PyCurlFileObject(fo_input, None, 0)
-        print 'testing %-30s ' % testfunc.__name__,
-        testfunc(wrapper, fo_output)
+        print('testing %-30s ' % testfunc.__name__, end=' ')
+        testfunc(wrapper, fo_output)
         s_output = fo_output.getvalue()
-        if s_output == s_input: print 'passed'
-        else: print 'FAILED'
+        if s_output == s_input: print('passed')
+        else: print('FAILED')

 def _test_file_object_smallread(wrapper, fo_output):
     while 1:
@@ -2544,7 +2570,7 @@ def _test_file_object_readline(wrapper, fo_output):

 def _test_file_object_readlines(wrapper, fo_output):
     li = wrapper.readlines()
-    fo_output.write(string.join(li, ''))
+    fo_output.write(''.join(li))

 if __name__ == '__main__':
     _main_test()
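_main_test drives the throttle interface whose guard, raw_throttle(), gained the
None check earlier in this file's diff. Semantics are unchanged: an int throttle
is an absolute bytes/second cap, a float is a multiplier applied to bandwidth,
and None or 0 disables limiting. A sketch (values illustrative):

    from urlgrabber.grabber import set_throttle, set_bandwidth, default_grabber

    set_bandwidth(32 * 1024)   # nominal connection speed, in bytes/second
    set_throttle(0.5)          # float: fraction of bandwidth, here 16 KiB/s
    print(default_grabber.opts.raw_throttle())
    set_throttle(10000)        # int: absolute cap of 10000 bytes/second
    set_throttle(None)         # throttling off; raw_throttle() returns 0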
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index 988a309..f3c2664 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -92,13 +92,14 @@ CUSTOMIZATION

 import sys
+import six
 import random
-import thread  # needed for locking to make this threadsafe
+from six.moves import _thread as thread  # needed for locking to make this threadsafe

-from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
-from grabber import _run_callback, _do_raise
-from grabber import exception2msg
-from grabber import _TH
+from urlgrabber.grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
+from urlgrabber.grabber import _run_callback, _do_raise
+from urlgrabber.grabber import exception2msg
+from urlgrabber.grabber import _TH

 def _(st): return st

@@ -286,7 +287,7 @@ class MirrorGroup:
     def _parse_mirrors(self, mirrors):
         parsed_mirrors = []
         for m in mirrors:
-            if isinstance(m, basestring):
+            if isinstance(m, six.string_types):
                 m = {'mirror': _to_utf8(m)}
             parsed_mirrors.append(m)
         return parsed_mirrors
@@ -423,7 +424,7 @@ class MirrorGroup:
         if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
         try:
             return func_ref( *(fullurl,), opts=opts, **kw )
-        except URLGrabError, e:
+        except URLGrabError as e:
             if DEBUG: DEBUG.info('MIRROR: failed')
             gr.errors.append((fullurl, exception2msg(e)))
             obj = CallbackObject()
@@ -446,7 +447,7 @@ class MirrorGroup:
         func = 'urlgrab'
         try:
             return self._mirror_try(func, url, kw)
-        except URLGrabError, e:
+        except URLGrabError as e:
             obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
             return _run_callback(kwargs.get('failfunc', _do_raise), obj)
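mirror.py keeps the public MirrorGroup behavior intact: urlgrab tries each
mirror in turn, and the "except URLGrabError as e" hunks above are the failover
path. Typical usage, sketched with placeholder mirror URLs:

    from urlgrabber.grabber import URLGrabber
    from urlgrabber.mirror import MirrorGroup

    g = URLGrabber(retry=2)
    mg = MirrorGroup(g, ['http://mirror-a.example.com/repo/',
                         'http://mirror-b.example.com/repo/'])
    local = mg.urlgrab('packages/foo.rpm')   # falls through to the next mirror on failure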
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index 9b77c54..2235397 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -19,9 +19,10 @@

 import sys
+import six
 import time
 import math
-import thread
+from six.moves import _thread as thread
 import fcntl
 import struct
 import termios
@@ -606,7 +607,7 @@ class TextMultiFileMeter(MultiFileMeter):
         try:
             format = "%-30.30s %6.6s %s"
             fn = meter.text or meter.basename
-            if type(message) in (type(''), type(u'')):
+            if isinstance(message, six.string_types):
                 message = message.splitlines()
             if not message: message = ['']
             out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
@@ -778,7 +779,7 @@ def format_number(number, SI=0, space=' '):
         depth = depth + 1
         number = number / step

-    if type(number) == type(1) or type(number) == type(1L):
+    if type(number) in six.integer_types:
         # it's an int or a long, which means it didn't get divided,
         # which means it's already short enough
         format = '%i%s%s'
@@ -806,7 +807,7 @@ def _tst(fn, cur, tot, beg, size, *args):
     tm.end(size)

 def _mtst(datas, *args):
-    print '-' * 79
+    print('-' * 79)
     tm = TextMultiFileMeter(threaded=False)
     dl_sizes = {}
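The two progress.py checks are where six's type aliases do the work: long and
unicode are gone in Python 3, so integer and string tests go through
six.integer_types and six.string_types (the message check above originally
meant (str, unicode)). A sketch of the idiom:

    import six

    def is_intlike(n):
        return isinstance(n, six.integer_types)   # (int, long) on PY2; (int,) on PY3

    def is_stringlike(s):
        return isinstance(s, six.string_types)    # basestring on PY2; str on PY3

    print(is_intlike(7), is_stringlike(u'done'))  # True True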