diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
+++ b/scripts/urlgrabber
@@ -115,6 +115,7 @@ options:
           including quotes in the case of strings.
           e.g. --user_agent='"foobar/2.0"'
 
+  --output FILE
   -o FILE          write output to FILE, otherwise the basename of the
                    url will be used
   -O               print the names of saved files to STDOUT
@@ -170,12 +171,17 @@ class client_options:
         return ug_options, ug_defaults
 
     def process_command_line(self):
-        short_options = 'vd:hoOpD'
+        short_options = 'vd:ho:OpD'
         long_options = ['profile', 'repeat=', 'verbose=',
-                        'debug=', 'help', 'progress']
+                        'debug=', 'help', 'progress', 'output=']
         ug_long = [ o + '=' for o in self.ug_options ]
-        optlist, args = getopt.getopt(sys.argv[1:], short_options,
-                                      long_options + ug_long)
+        try:
+            optlist, args = getopt.getopt(sys.argv[1:], short_options,
+                                          long_options + ug_long)
+        except getopt.GetoptError, e:
+            print >>sys.stderr, "Error:", e
+            self.help([], ret=1)
+
         self.verbose = 0
         self.debug = None
         self.outputfile = None
@@ -193,6 +199,7 @@ class client_options:
             if o == '--verbose': self.verbose = v
             if o == '-v': self.verbose += 1
             if o == '-o': self.outputfile = v
+            if o == '--output': self.outputfile = v
             if o == '-p' or o == '--progress': self.progress = 1
             if o == '-d' or o == '--debug': self.debug = v
             if o == '--profile': self.profile = 1
@@ -222,7 +229,7 @@ class client_options:
             print "ERROR: cannot use -o when grabbing multiple files"
             sys.exit(1)
 
-    def help(self, args):
+    def help(self, args, ret=0):
         if not args:
             print MAINHELP
         else:
@@ -234,7 +241,7 @@ class client_options:
                     self.help_ug_option(a)
                 else:
                     print 'ERROR: no help on command "%s"' % a
-        sys.exit(0)
+        sys.exit(ret)
 
     def help_doc(self):
         print __doc__
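The net effect of the scripts/urlgrabber hunks: --output FILE becomes a long-form
alias for -o FILE (note the new 'o:' in short_options; the old string 'vd:hoOpD'
mistakenly declared -o as a flag taking no argument), and a malformed command line
now prints the getopt error to stderr and exits with status 1 through
help([], ret=1) instead of dying with a traceback. A minimal standalone sketch of
the same pattern (illustrative only; the parse() wrapper is not part of the patch):

    import getopt
    import sys

    def parse(argv):
        try:
            optlist, args = getopt.getopt(argv, 'vd:ho:OpD',
                                          ['profile', 'repeat=', 'verbose=',
                                           'debug=', 'help', 'progress',
                                           'output='])
        except getopt.GetoptError, e:
            # same behaviour the script now gets via self.help([], ret=1)
            print >>sys.stderr, "Error:", e
            sys.exit(1)
        return optlist, args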
""" @@ -208,7 +208,7 @@ class RangeableFileObject: bufsize = offset - pos buf = self.fo.read(bufsize) if len(buf) != bufsize: - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') pos+= bufsize class FileRangeHandler(urllib2.FileHandler): @@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler): (fb,lb) = brange if lb == '': lb = size if fb < 0 or fb > size or lb > size: - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') size = (lb - fb) fo = RangeableFileObject(fo, (fb,lb)) headers = mimetools.Message(StringIO( @@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler): (fb,lb) = range_tup if lb == '': if retrlen is None or retrlen == 0: - raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.') + raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.') lb = retrlen retrlen = lb - fb if retrlen < 0: # beginning of range is larger than file - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') else: retrlen = lb - fb fp = RangeableFileObject(fp, (0,retrlen)) @@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup): # check if range is over the entire file if (fb,lb) == (0,''): return None # check that the range is valid - if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb)) + if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb)) return (fb,lb) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py index e090e90..b2770c5 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs) (which can be set on default_grabber.throttle) is used. See BANDWIDTH THROTTLING for more information. - timeout = None + timeout = 300 - a positive float expressing the number of seconds to wait for socket - operations. If the value is None or 0.0, socket operations will block - forever. Setting this option causes urlgrabber to call the settimeout - method on the Socket object used for the request. See the Python - documentation on settimeout for more information. - http://www.python.org/doc/current/lib/socket-objects.html + a positive integer expressing the number of seconds to wait before + timing out attempts to connect to a server. If the value is None + or 0, connection attempts will not time out. The timeout is passed + to the underlying pycurl object as its CONNECTTIMEOUT option, see + the curl documentation on CURLOPT_CONNECTTIMEOUT for more information. + http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT bandwidth = 0 @@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs) control, you should probably subclass URLParser and pass it in via the 'urlparser' option. + username = None + username to use for simple http auth - is automatically quoted for special characters + + password = None + password to use for simple http auth - is automatically quoted for special characters + ssl_ca_cert = None this option can be used if M2Crypto is available and will be @@ -248,6 +254,11 @@ GENERAL ARGUMENTS (kwargs) Maximum size (in bytes) of the headers. + self.ip_resolve = 'whatever' + + What type of name to IP resolving to use, default is to do both IPV4 and + IPV6. 
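All of the byterange.py hunks are one fix repeated: RangeError is now raised
IOError-style with an explicit error number, 9, matching URLGrabError's documented
code 9 ('requested byte range not satisfiable'), so callers no longer receive a
bare string. A sketch of what a caller sees afterwards (hypothetical usage; only
the two-argument RangeError itself comes from the patch):

    from urlgrabber.byterange import RangeError

    try:
        raise RangeError(9, 'Requested Range Not Satisfiable')
    except RangeError, e:
        # e.args is now (errno, message) rather than a lone string
        print e.args[0], '-', e.args[1]   # 9 - Requested Range Not Satisfiable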
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..b2770c5 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
     (which can be set on default_grabber.throttle) is used. See
     BANDWIDTH THROTTLING for more information.
 
-  timeout = None
+  timeout = 300
 
-    a positive float expressing the number of seconds to wait for socket
-    operations. If the value is None or 0.0, socket operations will block
-    forever. Setting this option causes urlgrabber to call the settimeout
-    method on the Socket object used for the request. See the Python
-    documentation on settimeout for more information.
-    http://www.python.org/doc/current/lib/socket-objects.html
+    a positive integer expressing the number of seconds to wait before
+    timing out attempts to connect to a server. If the value is None
+    or 0, connection attempts will not time out. The timeout is passed
+    to the underlying pycurl object as its CONNECTTIMEOUT option; see
+    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
 
   bandwidth = 0
 
@@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs)
     control, you should probably subclass URLParser and pass it in via
     the 'urlparser' option.
 
+  username = None
+    username to use for simple HTTP auth; quoted automatically for special characters
+
+  password = None
+    password to use for simple HTTP auth; quoted automatically for special characters
+
   ssl_ca_cert = None
 
     this option can be used if M2Crypto is available and will be
@@ -248,6 +254,11 @@
 
     Maximum size (in bytes) of the headers.
 
+  ip_resolve = 'whatever'
+
+     What kind of name-to-IP resolution to use; the default 'whatever'
+     allows both IPv4 and IPv6, while 'ipv4' or 'ipv6' forces one family.
+
 
 RETRY RELATED ARGUMENTS
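Taken together, the docstring hunks above introduce three user-visible knobs: a
default connect timeout of 300 seconds, username/password for simple auth, and
ip_resolve. A short usage sketch (the URL and credentials are placeholders):

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(timeout=30,          # becomes pycurl CONNECTTIMEOUT
                   username='user',     # simple auth, wired up further down
                   password='secret',
                   ip_resolve='ipv4')   # 'whatever' (default), 'ipv4' or 'ipv6'
    g.urlgrab('http://example.com/some/file', filename='some.file')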
@@ -420,6 +431,7 @@ import time
 import string
 import urllib
 import urllib2
+from httplib import responses
 import mimetools
 import thread
 import types
@@ -439,6 +451,12 @@ try:
 except:
     __version__ = '???'
 
+try:
+    # this part isn't going to do much - need to talk to gettext
+    from i18n import _
+except ImportError, msg:
+    def _(st): return st
+
 ########################################################################
 #                 functions for debugging output.  These functions are
 #                 here because they are also part of the module initialization.
@@ -527,6 +545,22 @@ def _(st):
 #                 END MODULE INITIALIZATION
 ########################################################################
 
+########################################################################
+#                 UTILITY FUNCTIONS
+########################################################################
+
+# These functions are meant to be utilities for the urlgrabber library to use.
+
+def _to_utf8(obj, errors='replace'):
+    '''convert 'unicode' to an encoded utf-8 byte string '''
+    # stolen from yum.i18n
+    if isinstance(obj, unicode):
+        obj = obj.encode('utf-8', errors)
+    return obj
+
+########################################################################
+#                 END UTILITY FUNCTIONS
+########################################################################
 
 
 class URLGrabError(IOError):
@@ -662,6 +696,7 @@ class URLParser:
         opts.quote = 0     --> do not quote it
         opts.quote = None  --> guess
         """
+        url = _to_utf8(url)
         quote = opts.quote
 
         if opts.prefix:
@@ -800,6 +835,7 @@ class URLGrabberOptions:
         self.close_connection = 0
         self.range = None
         self.user_agent = 'urlgrabber/%s' % __version__
+        self.ip_resolve = None
         self.keepalive = 1
         self.proxies = None
         self.reget = None
@@ -808,13 +844,15 @@ class URLGrabberOptions:
         self.prefix = None
         self.opener = None
         self.cache_openers = True
-        self.timeout = None
+        self.timeout = 300
         self.text = None
         self.http_headers = None
        self.ftp_headers = None
         self.data = None
         self.urlparser = URLParser()
         self.quote = None
+        self.username = None
+        self.password = None
         self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
         self.ssl_context = None # no-op in pycurl
         self.ssl_verify_peer = True # check peer's cert for authenticity
@@ -846,7 +884,7 @@ class URLGrabberOptions:
         s = s + indent + '}'
         return s
 
-class URLGrabber:
+class URLGrabber(object):
     """Provides easy opening of URLs with a variety of options.
 
     All options are specified as kwargs. Options may be specified when
@@ -931,6 +969,9 @@ class URLGrabber:
         (scheme, host, path, parm, query, frag) = parts
         if filename is None:
             filename = os.path.basename( urllib.unquote(path) )
+            if not filename:
+                # This is better than nothing.
+                filename = 'index.html'
         if scheme == 'file' and not opts.copy_local:
             # just return the name of the local file - don't make a
             # copy currently
@@ -1030,7 +1071,7 @@ class URLGrabber:
 
 default_grabber = URLGrabber()
 
-class PyCurlFileObject():
+class PyCurlFileObject(object):
     def __init__(self, url, filename, opts):
         self.fo = None
         self._hdr_dump = ''
@@ -1052,9 +1093,15 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
-        self.size = None
+        self.size = 0
+        self._hdr_ended = False
         self._do_open()
+
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+            urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url
 
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
@@ -1085,9 +1132,14 @@
             return -1
 
     def _hdr_retrieve(self, buf):
+        if self._hdr_ended:
+            self._hdr_dump = ''
+            self.size = 0
+            self._hdr_ended = False
+
         if self._over_max_size(cur=len(self._hdr_dump), 
                                max_size=self.opts.max_header_size):
-            return -1 
+            return -1
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
@@ -1104,7 +1156,17 @@
                     s = parse150(buf)
                 if s:
                     self.size = int(s)
-            
+
+            if buf.lower().find('location') != -1:
+                location = ':'.join(buf.split(':')[1:])
+                location = location.strip()
+                self.scheme = urlparse.urlsplit(location)[0]
+                self.url = location
+
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                self._hdr_ended = True
+                if DEBUG: DEBUG.info('header ended:')
+
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
@@ -1113,8 +1175,10 @@
         if self._parsed_hdr:
             return self._parsed_hdr
         statusend = self._hdr_dump.find('\n')
+        statusend += 1 # ridiculous as it may seem.
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
+        hdrfp.seek(0)
         self._parsed_hdr = mimetools.Message(hdrfp)
         return self._parsed_hdr
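The _hdr_ended bookkeeping exists because a single transfer with FOLLOWLOCATION
enabled can deliver several header blocks, one per redirect, each terminated by a
bare CRLF line; when a new block starts, the previous dump and size must be
discarded. The Location sniffing likewise keeps self.url and self.scheme pointing
at the final destination. A reduced, standalone sketch of that state machine
(illustrative; it matches on the header-name prefix, whereas the patch's
find('location') would also match any header merely containing that substring):

    import urlparse

    class HeaderSink(object):
        '''Collects one header block at a time, like _hdr_retrieve above.'''
        def __init__(self, url):
            self.url = url
            self.scheme = urlparse.urlsplit(url)[0]
            self.dump = ''
            self.ended = False

        def write(self, line):
            if self.ended:                 # a redirect started a new block
                self.dump, self.ended = '', False
            self.dump += line
            if line.lower().startswith('location:'):
                self.url = line.split(':', 1)[1].strip()
                self.scheme = urlparse.urlsplit(self.url)[0]
            if self.dump and line == '\r\n':
                self.ended = True          # blank line ends this block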
@@ -1136,11 +1200,21 @@
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
 
         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
         if opts.user_agent:
             self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+        if opts.ip_resolve:
+            # Default is: IPRESOLVE_WHATEVER
+            ipr = opts.ip_resolve.lower()
+            if ipr == 'whatever': # Do we need this?
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_WHATEVER)
+            if ipr == 'ipv4':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+            if ipr == 'ipv6':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
 
         # maybe to be options later
         self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1222,11 @@
 
         # timeouts
         timeout = 300
-        if opts.timeout:
-            timeout = int(opts.timeout)
-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        if hasattr(opts, 'timeout'):
+            timeout = int(opts.timeout or 0)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
 
         # ssl options
         if self.scheme == 'https':
@@ -1203,12 +1279,19 @@
             if proxy == '_none_': proxy = ""
             self.curl_obj.setopt(pycurl.PROXY, proxy)
 
-        # FIXME username/password/auth settings
+        if opts.username and opts.password:
+            if self.scheme in ('http', 'https'):
+                self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
+
+            if opts.username and opts.password:
+                # apparently when applying them as curlopts they do not require quoting of any kind
+                userpwd = '%s:%s' % (opts.username, opts.password)
+                self.curl_obj.setopt(pycurl.USERPWD, userpwd)
 
         #posts - simple - expects the fields as they are
         if opts.data:
             self.curl_obj.setopt(pycurl.POST, True)
-            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
+            self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
 
         # our url
         self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,12 +1311,14 @@
             code = self.http_code
             errcode = e.args[0]
+            errurl = urllib.unquote(self.url)
+
             if self._error[0]:
                 errcode = self._error[0]
 
             if errcode == 23 and code >= 200 and code < 299:
-                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+                err.url = errurl
 
                 # this is probably wrong but ultimately this is what happens
                 # we have a legit http code and a pycurl 'writer failed' code
@@ -1244,23 +1329,23 @@
                 raise KeyboardInterrupt
 
             elif errcode == 28:
-                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
+                err.url = errurl
                 raise err
             elif errcode == 35:
                 msg = _("problem making ssl connection")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
             elif errcode == 37:
-                msg = _("Could not open/read %s") % (self.url)
+                msg = _("Could not open/read %s") % (errurl)
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
             elif errcode == 42:
-                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+                err.url = errurl
                 # this is probably wrong but ultimately this is what happens
                 # we have a legit http code and a pycurl 'writer failed' code
                 # which almost always means something aborted it from outside
@@ -1272,33 +1357,93 @@
             elif errcode == 58:
                 msg = _("problem with the local client certificate")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
 
             elif errcode == 60:
-                msg = _("client cert cannot be verified or client cert incorrect")
+                msg = _("Peer cert cannot be verified or peer cert invalid")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
 
             elif errcode == 63:
                 if self._error[1]:
                     msg = self._error[1]
                 else:
-                    msg = _("Max download size exceeded on %s") % (self.url)
+                    msg = _("Max download size exceeded on %s") % (errurl)
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
 
             elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+                if self.scheme in ['http', 'https']:
+                    if self.http_code in responses:
+                        resp = responses[self.http_code]
+                        msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
+                    else:
+                        msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
+                elif self.scheme in ['ftp']:
+                    msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
+                else:
+                    msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
             else:
-                msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+                pyerr2str = { 5 : _("Couldn't resolve proxy"),
+                              6 : _("Couldn't resolve host"),
+                              7 : _("Couldn't connect"),
+                              8 : _("Bad reply to FTP server"),
+                              9 : _("Access denied"),
+                              11 : _("Bad reply to FTP pass"),
+                              13 : _("Bad reply to FTP pasv"),
+                              14 : _("Bad reply to FTP 227"),
+                              15 : _("Couldn't get FTP host"),
+                              17 : _("Couldn't set FTP type"),
+                              18 : _("Partial file"),
+                              19 : _("FTP RETR command failed"),
+                              22 : _("HTTP returned error"),
+                              23 : _("Write error"),
+                              25 : _("Upload failed"),
+                              26 : _("Read error"),
+                              27 : _("Out of Memory"),
+                              28 : _("Operation timed out"),
+                              30 : _("FTP PORT command failed"),
+                              31 : _("FTP REST command failed"),
+                              33 : _("Range failed"),
+                              34 : _("HTTP POST failed"),
+                              35 : _("SSL CONNECT failed"),
+                              36 : _("Couldn't resume download"),
+                              37 : _("Couldn't read file"),
+                              42 : _("Aborted by callback"),
+                              47 : _("Too many redirects"),
+                              51 : _("Peer certificate failed verification"),
+                              53 : _("SSL engine not found"),
+                              54 : _("SSL engine set failed"),
+                              55 : _("Network error send()"),
+                              56 : _("Network error recv()"),
+                              58 : _("Local certificate failed"),
+                              59 : _("SSL set cipher failed"),
+                              60 : _("Local CA certificate failed"),
+                              61 : _("HTTP bad transfer encoding"),
+                              63 : _("Maximum file size exceeded"),
+                              64 : _("FTP SSL failed"),
+                              67 : _("Authentication failure"),
+                              70 : _("Out of disk space on server"),
+                              73 : _("Remote file exists"),
+                            }
+                errstr = str(e.args[1])
+                if not errstr:
+                    errstr = pyerr2str.get(errcode, '')
+                msg = 'curl#%s - "%s"' % (errcode, errstr)
                 code = errcode
             err = URLGrabError(14, msg)
             err.code = code
             err.exception = e
             raise err
+        else:
+            if self._error[1]:
+                msg = self._error[1]
+                err = URLGrabError(14, msg)
+                err.url = urllib.unquote(self.url)
+                raise err
 
     def _do_open(self):
         self.curl_obj = _curl_cache
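With this mapping in place, a pycurl failure always surfaces as URLGrabError(14)
carrying a readable 'curl#N - "..."' message even when pycurl supplies an empty
error string, and plain HTTP failures pick up the reason phrase from
httplib.responses. Roughly what a caller sees (sketch; the host is a placeholder
and the exact message depends on the curl build):

    from urlgrabber.grabber import urlgrab, URLGrabError

    try:
        urlgrab('http://no-such-host.invalid/file')
    except URLGrabError, e:
        # e.errno is the urlgrabber error number (14 here); e.code, where
        # set, is the raw pycurl error code, e.g. 6 for a resolver failure,
        # so str(e) reads something like: curl#6 - "Couldn't resolve host"
        print e.errno, getattr(e, 'code', None), e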
_("SSL engine set failed"), + 55 : _("Network error send()"), + 56 : _("Network error recv()"), + 58 : _("Local certificate failed"), + 59 : _("SSL set cipher failed"), + 60 : _("Local CA certificate failed"), + 61 : _("HTTP bad transfer encoding"), + 63 : _("Maximum file size exceeded"), + 64 : _("FTP SSL failed"), + 67 : _("Authentication failure"), + 70 : _("Out of disk space on server"), + 73 : _("Remove file exists"), + } + errstr = str(e.args[1]) + if not errstr: + errstr = pyerr2str.get(errcode, '') + msg = 'curl#%s - "%s"' % (errcode, errstr) code = errcode err = URLGrabError(14, msg) err.code = code err.exception = e raise err + else: + if self._error[1]: + msg = self._error[1] + err = URLGRabError(14, msg) + err.url = urllib.unquote(self.url) + raise err def _do_open(self): self.curl_obj = _curl_cache @@ -1333,7 +1478,11 @@ class PyCurlFileObject(): if self.opts.range: rt = self.opts.range - if rt[0]: rt = (rt[0] + reget_length, rt[1]) + + if rt[0] is None: + rt = (0, rt[1]) + rt = (rt[0] + reget_length, rt[1]) + if rt: header = range_tuple_to_header(rt) @@ -1434,9 +1583,13 @@ class PyCurlFileObject(): #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') - - self._do_perform() - + try: + self._do_perform() + except URLGrabError, e: + self.fo.flush() + self.fo.close() + raise e + if _was_filename: @@ -1446,9 +1599,23 @@ class PyCurlFileObject(): # set the time mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME) if mod_time != -1: - os.utime(self.filename, (mod_time, mod_time)) + try: + os.utime(self.filename, (mod_time, mod_time)) + except OSError, e: + err = URLGrabError(16, _(\ + 'error setting timestamp on file %s from %s, OSError: %s') + % (self.filename, self.url, e)) + err.url = self.url + raise err # re open it - self.fo = open(self.filename, 'r') + try: + self.fo = open(self.filename, 'r') + except IOError, e: + err = URLGrabError(16, _(\ + 'error opening file from %s, IOError: %s') % (self.url, e)) + err.url = self.url + raise err + else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) @@ -1532,11 +1699,14 @@ class PyCurlFileObject(): def _over_max_size(self, cur, max_size=None): if not max_size: - max_size = self.size - if self.opts.size: # if we set an opts size use that, no matter what - max_size = self.opts.size + if not self.opts.size: + max_size = self.size + else: + max_size = self.opts.size + if not max_size: return False # if we have None for all of the Max then this is dumb - if cur > max_size + max_size*.10: + + if cur > int(float(max_size) * 1.10): msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) @@ -1544,13 +1714,6 @@ class PyCurlFileObject(): return True return False - def _to_utf8(self, obj, errors='replace'): - '''convert 'unicode' to an encoded utf-8 byte string ''' - # stolen from yum.i18n - if isinstance(obj, unicode): - obj = obj.encode('utf-8', errors) - return obj - def read(self, amt=None): self._fill_buffer(amt) if amt is None: @@ -1582,9 +1745,21 @@ class PyCurlFileObject(): self.opts.progress_obj.end(self._amount_read) self.fo.close() - + def geturl(self): + """ Provide the geturl() method, used to be got from + urllib.addinfourl, via. 
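Because _curl_cache is a single module-level handle reused across requests,
cached DNS results and connection state survive from one grab to the next;
reset_curl_obj() is the escape hatch for long-running processes whose network or
resolver configuration changes underneath them. Hypothetical usage:

    import urlgrabber.grabber as grabber

    # ... interfaces or resolv.conf changed while we were running ...
    grabber.reset_curl_obj()   # drop the cached handle and its DNS cache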
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index dad410b..8731aed 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -90,7 +90,7 @@ CUSTOMIZATION
 import random
 import thread  # needed for locking to make this threadsafe
 
-from grabber import URLGrabError, CallbackObject, DEBUG
+from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
 
 def _(st): 
     return st
@@ -263,7 +263,8 @@ class MirrorGroup:
     def _parse_mirrors(self, mirrors):
         parsed_mirrors = []
         for m in mirrors:
-            if type(m) == type(''): m = {'mirror': m}
+            if isinstance(m, basestring):
+                m = {'mirror': _to_utf8(m)}
             parsed_mirrors.append(m)
         return parsed_mirrors
 
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):
     if seconds is None or seconds < 0:
         if use_hours: return '--:--:--'
         else:         return '--:--'
+    elif seconds == float('inf'):
+        return 'Infinite'
     else:
         seconds = int(seconds)
         minutes = seconds / 60
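The new branch keeps an infinite ETA (which falls out of a zero-rate estimate)
from ever reaching int(seconds), where it would raise OverflowError in Python 2.
Expected behaviour after the patch (illustrative session):

    >>> from urlgrabber.progress import format_time
    >>> format_time(90)
    '01:30'
    >>> format_time(None)
    '--:--'
    >>> format_time(float('inf'))   # previously: OverflowError from int()
    'Infinite'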