latest head

15 years ago · a5c865f04d
parent 36ff3aaff0
commit a5c865f04d
2 changed files with 299 additions and 37 deletions
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 8%{?dist}
+Release: 9%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
@ -43,6 +43,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 %changelog
+* Fri Sep  3 2010 Seth Vidal <skvidal at fedoraproject.org> - 3.9.1-9
+- new update to latest head with a number of patches collected from 
+ older bug reports.
 * Mon Aug 30 2010 Seth Vidal <skvidal at fedoraproject.org> - 3.9.1-8
 - update to latest head patches
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@ -1,16 +1,3 @@
 diff --git a/.gitignore b/.gitignore
 new file mode 100644
 index 0000000..1ffe416
 --- /dev/null
 +++ b/.gitignore
@@ -0,0 +1,7 @@
 +*.py[co]
 +MANIFEST
 +dist
 +build
 +*.kdev*
 +*.kateproject
 +ipython.log*
 diff --git a/scripts/urlgrabber b/scripts/urlgrabber
 index 518e512..09cd896 100644
 --- a/scripts/urlgrabber
@ -83,8 +70,62 @@ index 50c6348..5fb43f9 100644
 base_ftp  = 'ftp://localhost/test/'
 # set to a proftp server only. we're working around a couple of
 diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
 index 3e5f3b7..8eeaeda 100644
 --- a/urlgrabber/byterange.py
 +++ b/urlgrabber/byterange.py
@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
     def http_error_416(self, req, fp, code, msg, hdrs):
         # HTTP's Range Not Satisfiable error
 -        raise RangeError('Requested Range Not Satisfiable')
 +        raise RangeError(9, 'Requested Range Not Satisfiable')
 class HTTPSRangeHandler(HTTPRangeHandler):
     """ Range Header support for HTTPS. """
@@ -208,7 +208,7 @@ class RangeableFileObject:
                 bufsize = offset - pos
             buf = self.fo.read(bufsize)
             if len(buf) != bufsize:
 -                raise RangeError('Requested Range Not Satisfiable')
 +                raise RangeError(9, 'Requested Range Not Satisfiable')
             pos+= bufsize
 class FileRangeHandler(urllib2.FileHandler):
@@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler):
             (fb,lb) = brange
             if lb == '': lb = size
             if fb < 0 or fb > size or lb > size:
 -                raise RangeError('Requested Range Not Satisfiable')
 +                raise RangeError(9, 'Requested Range Not Satisfiable')
             size = (lb - fb)
             fo = RangeableFileObject(fo, (fb,lb))
         headers = mimetools.Message(StringIO(
@@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler):
                 (fb,lb) = range_tup
                 if lb == '': 
                     if retrlen is None or retrlen == 0:
 -                        raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
 +                        raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
                     lb = retrlen
                     retrlen = lb - fb
                     if retrlen < 0:
                         # beginning of range is larger than file
 -                        raise RangeError('Requested Range Not Satisfiable')
 +                        raise RangeError(9, 'Requested Range Not Satisfiable')
                 else:
                     retrlen = lb - fb
                     fp = RangeableFileObject(fp, (0,retrlen))
@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup):
     # check if range is over the entire file
     if (fb,lb) == (0,''): return None
     # check that the range is valid
 -    if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
 +    if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
     return (fb,lb)
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..0c78857 100644
+index e090e90..b2770c5 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
@ -109,7 +150,20 @@ index e090e90..0c78857 100644
   bandwidth = 0
-@@ -248,6 +248,11 @@ GENERAL ARGUMENTS (kwargs)
+@@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs)
     control, you should probably subclass URLParser and pass it in via
     the 'urlparser' option.
 +  username = None
 +    username to use for simple http auth - is automatically quoted for special characters
 +
 +  password = None
 +    password to use for simple http auth - is automatically quoted for special characters
 +
   ssl_ca_cert = None
     this option can be used if M2Crypto is available and will be
@@ -248,6 +254,11 @@ GENERAL ARGUMENTS (kwargs)
     Maximum size (in bytes) of the headers.
@ -121,7 +175,15 @@ index e090e90..0c78857 100644
 RETRY RELATED ARGUMENTS
-@@ -439,6 +444,12 @@ try:
+@@ -420,6 +431,7 @@ import time
 import string
 import urllib
 import urllib2
 +from httplib import responses
 import mimetools
 import thread
 import types
@@ -439,6 +451,12 @@ try:
 except:
     __version__ = '???'
@ -134,7 +196,38 @@ index e090e90..0c78857 100644
 ########################################################################
 # functions for debugging output.  These functions are here because they
 # are also part of the module initialization.
-@@ -800,6 +811,7 @@ class URLGrabberOptions:
+@@ -527,6 +545,22 @@ def _(st):
 #                 END MODULE INITIALIZATION
 ########################################################################
 +########################################################################
 +#                 UTILITY FUNCTIONS
 +########################################################################
 +
 +# These functions are meant to be utilities for the urlgrabber library to use.
 +
 +def _to_utf8(obj, errors='replace'):
 +    '''convert 'unicode' to an encoded utf-8 byte string '''
 +    # stolen from yum.i18n
 +    if isinstance(obj, unicode):
 +        obj = obj.encode('utf-8', errors)
 +    return obj
 +
 +########################################################################
 +#                 END UTILITY FUNCTIONS
 +########################################################################
 class URLGrabError(IOError):
@@ -662,6 +696,7 @@ class URLParser:
           opts.quote = 0     --> do not quote it
           opts.quote = None  --> guess
         """
 +        url = _to_utf8(url)
         quote = opts.quote
         if opts.prefix:
@@ -800,6 +835,7 @@ class URLGrabberOptions:
         self.close_connection = 0
         self.range = None
         self.user_agent = 'urlgrabber/%s' % __version__
@ -142,7 +235,7 @@ index e090e90..0c78857 100644
         self.keepalive = 1
         self.proxies = None
         self.reget = None
-@@ -808,7 +820,7 @@ class URLGrabberOptions:
+@@ -808,13 +844,15 @@ class URLGrabberOptions:
         self.prefix = None
         self.opener = None
         self.cache_openers = True
@ -151,7 +244,24 @@ index e090e90..0c78857 100644
         self.text = None
         self.http_headers = None
         self.ftp_headers = None
-@@ -931,6 +943,9 @@ class URLGrabber:
+         self.data = None
         self.urlparser = URLParser()
         self.quote = None
 +        self.username = None
 +        self.password = None
         self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
         self.ssl_context = None # no-op in pycurl
         self.ssl_verify_peer = True # check peer's cert for authenticityb
@@ -846,7 +884,7 @@ class URLGrabberOptions:
         s = s + indent + '}'
         return s
 -class URLGrabber:
 +class URLGrabber(object):
     """Provides easy opening of URLs with a variety of options.
     All options are specified as kwargs. Options may be specified when
@@ -931,6 +969,9 @@ class URLGrabber:
         (scheme, host, path, parm, query, frag) = parts
         if filename is None:
             filename = os.path.basename( urllib.unquote(path) )
@ -161,7 +271,16 @@ index e090e90..0c78857 100644
         if scheme == 'file' and not opts.copy_local:
             # just return the name of the local file - don't make a 
             # copy currently
-@@ -1052,9 +1067,15 @@ class PyCurlFileObject():
+@@ -1030,7 +1071,7 @@ class URLGrabber:
 default_grabber = URLGrabber()
 -class PyCurlFileObject():
 +class PyCurlFileObject(object):
     def __init__(self, url, filename, opts):
         self.fo = None
         self._hdr_dump = ''
@@ -1052,9 +1093,15 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
@ -178,7 +297,7 @@ index e090e90..0c78857 100644
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
-@@ -1085,9 +1106,14 @@ class PyCurlFileObject():
+@@ -1085,9 +1132,14 @@ class PyCurlFileObject():
             return -1
     def _hdr_retrieve(self, buf):
@ -194,7 +313,7 @@ index e090e90..0c78857 100644
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
-@@ -1104,7 +1130,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1156,17 @@ class PyCurlFileObject():
                     s = parse150(buf)
                 if s:
                     self.size = int(s)
@ -213,7 +332,7 @@ index e090e90..0c78857 100644
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
-@@ -1113,8 +1149,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1175,10 @@ class PyCurlFileObject():
         if self._parsed_hdr:
             return self._parsed_hdr
         statusend = self._hdr_dump.find('\n')
@ -224,7 +343,7 @@ index e090e90..0c78857 100644
         self._parsed_hdr =  mimetools.Message(hdrfp)
         return self._parsed_hdr
-@@ -1136,11 +1174,21 @@ class PyCurlFileObject():
+@@ -1136,11 +1200,21 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@ -246,7 +365,7 @@ index e090e90..0c78857 100644
         # maybe to be options later
         self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1196,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1222,11 @@ class PyCurlFileObject():
         # timeouts
         timeout = 300
@ -261,26 +380,115 @@ index e090e90..0c78857 100644
         # ssl options
         if self.scheme == 'https':
-@@ -1276,7 +1326,7 @@ class PyCurlFileObject():
+@@ -1203,12 +1279,19 @@ class PyCurlFileObject():
                         if proxy == '_none_': proxy = ""
                         self.curl_obj.setopt(pycurl.PROXY, proxy)
 -        # FIXME username/password/auth settings
 +        if opts.username and opts.password:
 +            if self.scheme in ('http', 'https'):
 +                self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
 +
 +            if opts.username and opts.password:
 +                # apparently when applying them as curlopts they do not require quoting of any kind
 +                userpwd = '%s:%s' % (opts.username, opts.password)
 +                self.curl_obj.setopt(pycurl.USERPWD, userpwd)
         #posts - simple - expects the fields as they are
         if opts.data:
             self.curl_obj.setopt(pycurl.POST, True)
 -            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
 +            self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
         # our url
         self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,12 +1311,14 @@ class PyCurlFileObject():
             code = self.http_code
             errcode = e.args[0]
 +            errurl = urllib.unquote(self.url)
 +            
             if self._error[0]:
                 errcode = self._error[0]
             if errcode == 23 and code >= 200 and code < 299:
 -                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
 -                err.url = self.url
 +                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
 +                err.url = errurl
                 # this is probably wrong but ultimately this is what happens
                 # we have a legit http code and a pycurl 'writer failed' code
@@ -1244,23 +1329,23 @@ class PyCurlFileObject():
                 raise KeyboardInterrupt
             elif errcode == 28:
 -                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
 -                err.url = self.url
 +                err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
 +                err.url = errurl
                 raise err
             elif errcode == 35:
                 msg = _("problem making ssl connection")
                 err = URLGrabError(14, msg)
 -                err.url = self.url
 +                err.url = errurl
                 raise err
             elif errcode == 37:
 -                msg = _("Could not open/read %s") % (self.url)
 +                msg = _("Could not open/read %s") % (errurl)
                 err = URLGrabError(14, msg)
 -                err.url = self.url
 +                err.url = errurl
                 raise err
             elif errcode == 42:
 -                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
 -                err.url = self.url
 +                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
 +                err.url = errurl
                 # this is probably wrong but ultimately this is what happens
                 # we have a legit http code and a pycurl 'writer failed' code
                 # which almost always means something aborted it from outside
@@ -1272,33 +1357,93 @@ class PyCurlFileObject():
             elif errcode == 58:
                 msg = _("problem with the local client certificate")
                 err = URLGrabError(14, msg)
 -                err.url = self.url
 +                err.url = errurl
                 raise err
             elif errcode == 60:
 -                msg = _("client cert cannot be verified or client cert incorrect")
 +                msg = _("Peer cert cannot be verified or peer cert invalid")
                 err = URLGrabError(14, msg)
-                 err.url = self.url
+-                err.url = self.url
 +                err.url = errurl
                 raise err
-@@ -1291,14 +1341,70 @@ class PyCurlFileObject():
+             
             elif errcode == 63:
                 if self._error[1]:
                     msg = self._error[1]
                 else:
 -                    msg = _("Max download size exceeded on %s") % (self.url)
 +                    msg = _("Max download size exceeded on %s") % (errurl)
                 err = URLGrabError(14, msg)
 -                err.url = self.url
 +                err.url = errurl
                 raise err
             elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
 -                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
 +                if self.scheme in ['http', 'https']:
-+                    msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+                    if self.http_code in responses:
 +                        resp = responses[self.http_code]
 +                        msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
 +                    else:
 +                        msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
 +                elif self.scheme in ['ftp']:
-+                    msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
+                    msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
 +                else:
-+                    msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
+                    msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
             else:
 -                msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
 +                pyerr2str = { 5 : _("Couldn't resolve proxy"),
@ -338,12 +546,25 @@ index e090e90..0c78857 100644
 +            if self._error[1]:
 +                msg = self._error[1]
 +                err = URLGrabError(14, msg)
-+                err.url = self.url
+                err.url = urllib.unquote(self.url)
 +                raise err
     def _do_open(self):
         self.curl_obj = _curl_cache
-@@ -1434,9 +1540,13 @@ class PyCurlFileObject():
+@@ -1333,7 +1478,11 @@ class PyCurlFileObject():
         if self.opts.range:
             rt = self.opts.range
 -            if rt[0]: rt = (rt[0] + reget_length, rt[1])
 +            
 +            if rt[0] is None:
 +                rt = (0, rt[1])
 +            rt = (rt[0] + reget_length, rt[1])
 +            
         if rt:
             header = range_tuple_to_header(rt)
@@ -1434,9 +1583,13 @@ class PyCurlFileObject():
             #fh, self._temp_name = mkstemp()
             #self.fo = open(self._temp_name, 'wb')
@ -360,7 +581,7 @@ index e090e90..0c78857 100644
         if _was_filename:
-@@ -1446,9 +1556,23 @@ class PyCurlFileObject():
+@@ -1446,9 +1599,23 @@ class PyCurlFileObject():
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
@ -386,7 +607,7 @@ index e090e90..0c78857 100644
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
-@@ -1532,11 +1656,14 @@ class PyCurlFileObject():
+@@ -1532,11 +1699,14 @@ class PyCurlFileObject():
     def _over_max_size(self, cur, max_size=None):
         if not max_size:
@ -405,7 +626,21 @@ index e090e90..0c78857 100644
             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
-@@ -1582,9 +1709,21 @@ class PyCurlFileObject():
+@@ -1544,13 +1714,6 @@ class PyCurlFileObject():
             return True
         return False
 -    def _to_utf8(self, obj, errors='replace'):
 -        '''convert 'unicode' to an encoded utf-8 byte string '''
 -        # stolen from yum.i18n
 -        if isinstance(obj, unicode):
 -            obj = obj.encode('utf-8', errors)
 -        return obj
 -        
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None:
@@ -1582,9 +1745,21 @@ class PyCurlFileObject():
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()
@ -428,6 +663,29 @@ index e090e90..0c78857 100644
 #####################################################################
 # DEPRECATED FUNCTIONS
 diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
 index dad410b..8731aed 100644
 --- a/urlgrabber/mirror.py
 +++ b/urlgrabber/mirror.py
@@ -90,7 +90,7 @@ CUSTOMIZATION
 import random
 import thread  # needed for locking to make this threadsafe
 -from grabber import URLGrabError, CallbackObject, DEBUG
 +from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
 def _(st): 
     return st
@@ -263,7 +263,8 @@ class MirrorGroup:
     def _parse_mirrors(self, mirrors):
         parsed_mirrors = []
         for m in mirrors:
 -            if type(m) == type(''): m = {'mirror': m}
 +            if isinstance(m, basestring):
 +                m = {'mirror': _to_utf8(m)}
             parsed_mirrors.append(m)
         return parsed_mirrors
 diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
 index dd07c6a..45eb248 100644
 --- a/urlgrabber/progress.py