diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec
index 9b49ca5..8908845 100644
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 8%{?dist}
+Release: 9%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
 
@@ -43,6 +43,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 
 %changelog
+* Fri Sep 3 2010 Seth Vidal - 3.9.1-9
+- update to latest head, with a number of patches collected from
+  older bug reports.
+
 * Mon Aug 30 2010 Seth Vidal - 3.9.1-8
 - update to latest head patches
 
diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch
index 6b97585..6627a1f 100644
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@@ -1,16 +1,3 @@
-diff --git a/.gitignore b/.gitignore
-new file mode 100644
-index 0000000..1ffe416
---- /dev/null
-+++ b/.gitignore
-@@ -0,0 +1,7 @@
-+*.py[co]
-+MANIFEST
-+dist
-+build
-+*.kdev*
-+*.kateproject
-+ipython.log*
 diff --git a/scripts/urlgrabber b/scripts/urlgrabber
 index 518e512..09cd896 100644
 --- a/scripts/urlgrabber
@@ -83,8 +70,62 @@ index 50c6348..5fb43f9 100644
     base_ftp = 'ftp://localhost/test/'
 
 # set to a proftp server only. we're working around a couple of
+diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
+index 3e5f3b7..8eeaeda 100644
+--- a/urlgrabber/byterange.py
++++ b/urlgrabber/byterange.py
+@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
+ 
+     def http_error_416(self, req, fp, code, msg, hdrs):
+         # HTTP's Range Not Satisfiable error
+-        raise RangeError('Requested Range Not Satisfiable')
++        raise RangeError(9, 'Requested Range Not Satisfiable')
+ 
+ class HTTPSRangeHandler(HTTPRangeHandler):
+     """ Range Header support for HTTPS. 
""" +@@ -208,7 +208,7 @@ class RangeableFileObject: + bufsize = offset - pos + buf = self.fo.read(bufsize) + if len(buf) != bufsize: +- raise RangeError('Requested Range Not Satisfiable') ++ raise RangeError(9, 'Requested Range Not Satisfiable') + pos+= bufsize + + class FileRangeHandler(urllib2.FileHandler): +@@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler): + (fb,lb) = brange + if lb == '': lb = size + if fb < 0 or fb > size or lb > size: +- raise RangeError('Requested Range Not Satisfiable') ++ raise RangeError(9, 'Requested Range Not Satisfiable') + size = (lb - fb) + fo = RangeableFileObject(fo, (fb,lb)) + headers = mimetools.Message(StringIO( +@@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler): + (fb,lb) = range_tup + if lb == '': + if retrlen is None or retrlen == 0: +- raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.') ++ raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.') + lb = retrlen + retrlen = lb - fb + if retrlen < 0: + # beginning of range is larger than file +- raise RangeError('Requested Range Not Satisfiable') ++ raise RangeError(9, 'Requested Range Not Satisfiable') + else: + retrlen = lb - fb + fp = RangeableFileObject(fp, (0,retrlen)) +@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup): + # check if range is over the entire file + if (fb,lb) == (0,''): return None + # check that the range is valid +- if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb)) ++ if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb)) + return (fb,lb) + diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py -index e090e90..0c78857 100644 +index e090e90..b2770c5 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs) @@ -109,7 +150,20 @@ index e090e90..0c78857 100644 bandwidth = 0 -@@ -248,6 +248,11 @@ GENERAL ARGUMENTS (kwargs) +@@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs) + control, you should probably subclass URLParser and pass it in via + the 'urlparser' option. + ++ username = None ++ username to use for simple http auth - is automatically quoted for special characters ++ ++ password = None ++ password to use for simple http auth - is automatically quoted for special characters ++ + ssl_ca_cert = None + + this option can be used if M2Crypto is available and will be +@@ -248,6 +254,11 @@ GENERAL ARGUMENTS (kwargs) Maximum size (in bytes) of the headers. @@ -121,7 +175,15 @@ index e090e90..0c78857 100644 RETRY RELATED ARGUMENTS -@@ -439,6 +444,12 @@ try: +@@ -420,6 +431,7 @@ import time + import string + import urllib + import urllib2 ++from httplib import responses + import mimetools + import thread + import types +@@ -439,6 +451,12 @@ try: except: __version__ = '???' @@ -134,7 +196,38 @@ index e090e90..0c78857 100644 ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. -@@ -800,6 +811,7 @@ class URLGrabberOptions: +@@ -527,6 +545,22 @@ def _(st): + # END MODULE INITIALIZATION + ######################################################################## + ++######################################################################## ++# UTILITY FUNCTIONS ++######################################################################## ++ ++# These functions are meant to be utilities for the urlgrabber library to use. 
++ ++def _to_utf8(obj, errors='replace'): ++ '''convert 'unicode' to an encoded utf-8 byte string ''' ++ # stolen from yum.i18n ++ if isinstance(obj, unicode): ++ obj = obj.encode('utf-8', errors) ++ return obj ++ ++######################################################################## ++# END UTILITY FUNCTIONS ++######################################################################## + + + class URLGrabError(IOError): +@@ -662,6 +696,7 @@ class URLParser: + opts.quote = 0 --> do not quote it + opts.quote = None --> guess + """ ++ url = _to_utf8(url) + quote = opts.quote + + if opts.prefix: +@@ -800,6 +835,7 @@ class URLGrabberOptions: self.close_connection = 0 self.range = None self.user_agent = 'urlgrabber/%s' % __version__ @@ -142,7 +235,7 @@ index e090e90..0c78857 100644 self.keepalive = 1 self.proxies = None self.reget = None -@@ -808,7 +820,7 @@ class URLGrabberOptions: +@@ -808,13 +844,15 @@ class URLGrabberOptions: self.prefix = None self.opener = None self.cache_openers = True @@ -151,7 +244,24 @@ index e090e90..0c78857 100644 self.text = None self.http_headers = None self.ftp_headers = None -@@ -931,6 +943,9 @@ class URLGrabber: + self.data = None + self.urlparser = URLParser() + self.quote = None ++ self.username = None ++ self.password = None + self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb + self.ssl_context = None # no-op in pycurl + self.ssl_verify_peer = True # check peer's cert for authenticityb +@@ -846,7 +884,7 @@ class URLGrabberOptions: + s = s + indent + '}' + return s + +-class URLGrabber: ++class URLGrabber(object): + """Provides easy opening of URLs with a variety of options. + + All options are specified as kwargs. Options may be specified when +@@ -931,6 +969,9 @@ class URLGrabber: (scheme, host, path, parm, query, frag) = parts if filename is None: filename = os.path.basename( urllib.unquote(path) ) @@ -161,7 +271,16 @@ index e090e90..0c78857 100644 if scheme == 'file' and not opts.copy_local: # just return the name of the local file - don't make a # copy currently -@@ -1052,9 +1067,15 @@ class PyCurlFileObject(): +@@ -1030,7 +1071,7 @@ class URLGrabber: + default_grabber = URLGrabber() + + +-class PyCurlFileObject(): ++class PyCurlFileObject(object): + def __init__(self, url, filename, opts): + self.fo = None + self._hdr_dump = '' +@@ -1052,9 +1093,15 @@ class PyCurlFileObject(): self._reget_length = 0 self._prog_running = False self._error = (None, None) @@ -178,7 +297,7 @@ index e090e90..0c78857 100644 def __getattr__(self, name): """This effectively allows us to wrap at the instance level. 
-@@ -1085,9 +1106,14 @@ class PyCurlFileObject(): +@@ -1085,9 +1132,14 @@ class PyCurlFileObject(): return -1 def _hdr_retrieve(self, buf): @@ -194,7 +313,7 @@ index e090e90..0c78857 100644 try: self._hdr_dump += buf # we have to get the size before we do the progress obj start -@@ -1104,7 +1130,17 @@ class PyCurlFileObject(): +@@ -1104,7 +1156,17 @@ class PyCurlFileObject(): s = parse150(buf) if s: self.size = int(s) @@ -213,7 +332,7 @@ index e090e90..0c78857 100644 return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT -@@ -1113,8 +1149,10 @@ class PyCurlFileObject(): +@@ -1113,8 +1175,10 @@ class PyCurlFileObject(): if self._parsed_hdr: return self._parsed_hdr statusend = self._hdr_dump.find('\n') @@ -224,7 +343,7 @@ index e090e90..0c78857 100644 self._parsed_hdr = mimetools.Message(hdrfp) return self._parsed_hdr -@@ -1136,11 +1174,21 @@ class PyCurlFileObject(): +@@ -1136,11 +1200,21 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) @@ -246,7 +365,7 @@ index e090e90..0c78857 100644 # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) -@@ -1148,9 +1196,11 @@ class PyCurlFileObject(): +@@ -1148,9 +1222,11 @@ class PyCurlFileObject(): # timeouts timeout = 300 @@ -261,26 +380,115 @@ index e090e90..0c78857 100644 # ssl options if self.scheme == 'https': -@@ -1276,7 +1326,7 @@ class PyCurlFileObject(): +@@ -1203,12 +1279,19 @@ class PyCurlFileObject(): + if proxy == '_none_': proxy = "" + self.curl_obj.setopt(pycurl.PROXY, proxy) + +- # FIXME username/password/auth settings ++ if opts.username and opts.password: ++ if self.scheme in ('http', 'https'): ++ self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY) ++ ++ if opts.username and opts.password: ++ # apparently when applying them as curlopts they do not require quoting of any kind ++ userpwd = '%s:%s' % (opts.username, opts.password) ++ self.curl_obj.setopt(pycurl.USERPWD, userpwd) + + #posts - simple - expects the fields as they are + if opts.data: + self.curl_obj.setopt(pycurl.POST, True) +- self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data)) ++ self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data)) + + # our url + self.curl_obj.setopt(pycurl.URL, self.url) +@@ -1228,12 +1311,14 @@ class PyCurlFileObject(): + + code = self.http_code + errcode = e.args[0] ++ errurl = urllib.unquote(self.url) ++ + if self._error[0]: + errcode = self._error[0] + + if errcode == 23 and code >= 200 and code < 299: +- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) +- err.url = self.url ++ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e)) ++ err.url = errurl + + # this is probably wrong but ultimately this is what happens + # we have a legit http code and a pycurl 'writer failed' code +@@ -1244,23 +1329,23 @@ class PyCurlFileObject(): + raise KeyboardInterrupt + + elif errcode == 28: +- err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) +- err.url = self.url ++ err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e)) ++ err.url = errurl + raise err + elif errcode == 35: + msg = _("problem making ssl connection") + err = URLGrabError(14, msg) +- err.url = self.url ++ err.url = errurl + raise err + elif errcode == 37: +- msg = _("Could not open/read %s") % (self.url) ++ msg = _("Could not open/read %s") % (errurl) + err = URLGrabError(14, msg) +- err.url = self.url ++ 
err.url = errurl
             raise err
 
         elif errcode == 42:
-            err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-            err.url = self.url
+            err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+            err.url = errurl
             # this is probably wrong but ultimately this is what happens
             # we have a legit http code and a pycurl 'writer failed' code
             # which almost always means something aborted it from outside
@@ -1272,33 +1357,93 @@ class PyCurlFileObject():
         elif errcode == 58:
             msg = _("problem with the local client certificate")
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err
 
         elif errcode == 60:
-            msg = _("client cert cannot be verified or client cert incorrect")
+            msg = _("Peer cert cannot be verified or peer cert invalid")
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err
 
         elif errcode == 63:
             if self._error[1]:
                 msg = self._error[1]
             else:
-                msg = _("Max download size exceeded on %s") % (self.url)
+                msg = _("Max download size exceeded on %s") % (errurl)
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err
 
         elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-            msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+            if self.scheme in ['http', 'https']:
+                if self.http_code in responses:
+                    resp = responses[self.http_code]
+                    msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
+                else:
+                    msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
+            elif self.scheme in ['ftp']:
+                msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
+            else:
+                msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
         else:
-            msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+            pyerr2str = { 5 : _("Couldn't resolve proxy"),
+                          6 : _("Couldn't resolve host"),
+                          7 : _("Couldn't connect"),
+                          8 : _("Bad reply to FTP server"),
+                          9 : _("Access denied"),
+                         11 : _("Bad reply to FTP pass"),
+                         13 : _("Bad reply to FTP pasv"),
+                         14 : _("Bad reply to FTP 227"),
+                         15 : _("Couldn't get FTP host"),
+                         17 : _("Couldn't set FTP type"),
+                         18 : _("Partial file"),
+                         19 : _("FTP RETR command failed"),
+                         22 : _("HTTP returned error"),
+                         23 : _("Write error"),
+                         25 : _("Upload failed"),
+                         26 : _("Read error"),
+                         27 : _("Out of Memory"),
+                         28 : _("Operation timed out"),
+                         30 : _("FTP PORT command failed"),
+                         31 : _("FTP REST command failed"),
+                         33 : _("Range failed"),
+                         34 : _("HTTP POST failed"),
+                         35 : _("SSL CONNECT failed"),
+                         36 : _("Couldn't resume download"),
+                         37 : _("Couldn't read file"),
+                         42 : _("Aborted by callback"),
+                         47 : _("Too many redirects"),
+                         51 : _("Peer certificate failed verification"),
+                         53 : _("SSL engine not found"),
+                         54 : _("SSL engine set failed"),
+                         55 : _("Network error send()"),
+                         56 : _("Network error recv()"),
+                         58 : _("Local certificate failed"),
+                         59 : _("SSL set cipher failed"),
+                         60 : _("Local CA certificate failed"),
+                         61 : _("HTTP bad transfer encoding"),
+                         63 : _("Maximum file size exceeded"),
+                         64 : _("FTP SSL failed"),
+                         67 : _("Authentication failure"),
+                         70 : _("Out of disk space on server"),
+                         73 : _("Remote file already exists"),
+                         77 : _("Problem with the SSL CA cert (path? access rights?)"),
+                        }
+            errstr = str(e.args[1]) or pyerr2str.get(errcode, _('<Unknown>'))
+            msg = 'curl#%s - "%s"' % (errcode, errstr)
 
         code = errcode
         err = URLGrabError(14, msg)
         err.code = code
         err.exception = e
         raise err
+        else:
+            if self._error[1]:
+                msg = self._error[1]
+                err = URLGrabError(14, msg)
+                err.url = urllib.unquote(self.url)
+                raise err
 
     def _do_open(self):
         self.curl_obj = _curl_cache
@@ -1333,7 +1478,11 @@ class PyCurlFileObject():
 
         if self.opts.range:
             rt = self.opts.range
-            if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
+            if rt[0] is None:
+                rt = (0, rt[1])
+            rt = (rt[0] + reget_length, rt[1])
+
 
         if rt:
             header = range_tuple_to_header(rt)
@@ -1434,9 +1583,13 @@ class PyCurlFileObject():
             #fh, self._temp_name = mkstemp()
             #self.fo = open(self._temp_name, 'wb')
 
-            self.fo = open(self.filename, 'wb')
-
+            try:
+                self.fo = open(self.filename, 'wb')
+            except IOError, e:
+                err = URLGrabError(16, _('error opening local file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
 
         if _was_filename:
@@ -1446,9 +1599,23 @@ class PyCurlFileObject():
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
-                os.utime(self.filename, (mod_time, mod_time))
+                try:
+                    os.utime(self.filename, (mod_time, mod_time))
+                except OSError, e:
+                    err = URLGrabError(16, _('error setting timestamp on file %s from %s, OSError: %s')
+                              % (self.filename, self.url, e))
+                    err.url = self.url
+                    raise err
             # re open it
-            self.fo = open(self.filename, 'r')
+            try:
+                self.fo = open(self.filename, 'r')
+            except IOError, e:
+                err = URLGrabError(16, _('error opening file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
@@ -1532,11 +1699,14 @@ class PyCurlFileObject():
 
     def _over_max_size(self, cur, max_size=None):
 
         if not max_size:
-            max_size = self.size
-        if self.opts.size: # if we set an opts size use that, no matter what
-            max_size = self.opts.size
+            if not self.opts.size:
+                max_size = self.size
+            else:
+                max_size = self.opts.size
+
         if not max_size: return False # if we have None for all of the Max then this is dumb
-        if cur > max_size + max_size*.10:
 
+        if cur > max_size + max_size*.10:
             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
@@ -1544,13 +1714,6 @@ class 
PyCurlFileObject(): + return True + return False + +- def _to_utf8(self, obj, errors='replace'): +- '''convert 'unicode' to an encoded utf-8 byte string ''' +- # stolen from yum.i18n +- if isinstance(obj, unicode): +- obj = obj.encode('utf-8', errors) +- return obj +- + def read(self, amt=None): + self._fill_buffer(amt) + if amt is None: +@@ -1582,9 +1745,21 @@ class PyCurlFileObject(): self.opts.progress_obj.end(self._amount_read) self.fo.close() @@ -428,6 +663,29 @@ index e090e90..0c78857 100644 ##################################################################### # DEPRECATED FUNCTIONS +diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py +index dad410b..8731aed 100644 +--- a/urlgrabber/mirror.py ++++ b/urlgrabber/mirror.py +@@ -90,7 +90,7 @@ CUSTOMIZATION + import random + import thread # needed for locking to make this threadsafe + +-from grabber import URLGrabError, CallbackObject, DEBUG ++from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 + + def _(st): + return st +@@ -263,7 +263,8 @@ class MirrorGroup: + def _parse_mirrors(self, mirrors): + parsed_mirrors = [] + for m in mirrors: +- if type(m) == type(''): m = {'mirror': m} ++ if isinstance(m, basestring): ++ m = {'mirror': _to_utf8(m)} + parsed_mirrors.append(m) + return parsed_mirrors + diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py index dd07c6a..45eb248 100644 --- a/urlgrabber/progress.py
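
--
Note for reviewers, not part of the patch: the most user-visible addition above
is the new username/password pair on URLGrabberOptions, which the curl setup
code hands to pycurl as CURLOPT_USERPWD (and, for http/https, enables
CURLOPT_HTTPAUTH = HTTPAUTH_ANY). A minimal sketch of how the new options are
meant to be exercised once this build is installed -- the host, path, and
credentials below are made up for illustration:

    # Python 2, matching the codebase this patch targets
    from urlgrabber.grabber import URLGrabber, URLGrabError

    g = URLGrabber(retry=3, timeout=30,
                   username='backupuser',          # hypothetical account
                   password='not-a-real-secret')
    try:
        g.urlgrab('http://repo.example.com/repodata/repomd.xml',
                  filename='/tmp/repomd.xml')
    except URLGrabError, e:
        # curl-level failures still surface as URLGrabError errno 14, but
        # with this patch the message comes from the new pyerr2str table
        # ('curl#NN - "..."') instead of a bare 'PYCURL ERROR NN'.
        print 'download failed: %s' % e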