python-urlgrabber/urlgrabber-HEAD.patch


diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
+++ b/scripts/urlgrabber
@@ -115,6 +115,7 @@ options:
including quotes in the case of strings.
e.g. --user_agent='"foobar/2.0"'
+ --output FILE
-o FILE write output to FILE, otherwise the basename of the
url will be used
-O print the names of saved files to STDOUT
@@ -170,12 +171,17 @@ class client_options:
return ug_options, ug_defaults
def process_command_line(self):
- short_options = 'vd:hoOpD'
+ short_options = 'vd:ho:OpD'
long_options = ['profile', 'repeat=', 'verbose=',
- 'debug=', 'help', 'progress']
+ 'debug=', 'help', 'progress', 'output=']
ug_long = [ o + '=' for o in self.ug_options ]
- optlist, args = getopt.getopt(sys.argv[1:], short_options,
- long_options + ug_long)
+ try:
+ optlist, args = getopt.getopt(sys.argv[1:], short_options,
+ long_options + ug_long)
+ except getopt.GetoptError, e:
+ print >>sys.stderr, "Error:", e
+ self.help([], ret=1)
+
self.verbose = 0
self.debug = None
self.outputfile = None
@@ -193,6 +199,7 @@ class client_options:
if o == '--verbose': self.verbose = v
if o == '-v': self.verbose += 1
if o == '-o': self.outputfile = v
+ if o == '--output': self.outputfile = v
if o == '-p' or o == '--progress': self.progress = 1
if o == '-d' or o == '--debug': self.debug = v
if o == '--profile': self.profile = 1
@@ -222,7 +229,7 @@ class client_options:
print "ERROR: cannot use -o when grabbing multiple files"
sys.exit(1)
- def help(self, args):
+ def help(self, args, ret=0):
if not args:
print MAINHELP
else:
@@ -234,7 +241,7 @@ class client_options:
self.help_ug_option(a)
else:
print 'ERROR: no help on command "%s"' % a
- sys.exit(0)
+ sys.exit(ret)
def help_doc(self):
print __doc__
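
A minimal standalone sketch of the option-handling pattern the hunk above introduces (the option table is pared down and the argument values are invented): unknown options now print an error to stderr and exit non-zero instead of raising a raw getopt traceback.

    import getopt, sys

    def parse(argv):
        # mirror the patched process_command_line(): catch bad options,
        # report them on stderr, and exit with a failure status
        try:
            optlist, args = getopt.getopt(argv, 'vo:', ['output=', 'verbose'])
        except getopt.GetoptError, e:
            print >> sys.stderr, "Error:", e
            sys.exit(1)
        outputfile = None
        for o, v in optlist:
            if o in ('-o', '--output'):
                outputfile = v
        return outputfile, args

    # e.g. parse(['--output', 'index.html', 'http://example.com/'])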
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
from munittest import *
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
base_ftp = 'ftp://localhost/test/'
# set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index 3e5f3b7..8eeaeda 100644
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
def http_error_416(self, req, fp, code, msg, hdrs):
# HTTP's Range Not Satisfiable error
- raise RangeError('Requested Range Not Satisfiable')
+ raise RangeError(9, 'Requested Range Not Satisfiable')
class HTTPSRangeHandler(HTTPRangeHandler):
""" Range Header support for HTTPS. """
@@ -208,7 +208,7 @@ class RangeableFileObject:
bufsize = offset - pos
buf = self.fo.read(bufsize)
if len(buf) != bufsize:
- raise RangeError('Requested Range Not Satisfiable')
+ raise RangeError(9, 'Requested Range Not Satisfiable')
pos+= bufsize
class FileRangeHandler(urllib2.FileHandler):
@@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler):
(fb,lb) = brange
if lb == '': lb = size
if fb < 0 or fb > size or lb > size:
- raise RangeError('Requested Range Not Satisfiable')
+ raise RangeError(9, 'Requested Range Not Satisfiable')
size = (lb - fb)
fo = RangeableFileObject(fo, (fb,lb))
headers = mimetools.Message(StringIO(
@@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler):
(fb,lb) = range_tup
if lb == '':
if retrlen is None or retrlen == 0:
- raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
+ raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
lb = retrlen
retrlen = lb - fb
if retrlen < 0:
# beginning of range is larger than file
- raise RangeError('Requested Range Not Satisfiable')
+ raise RangeError(9, 'Requested Range Not Satisfiable')
else:
retrlen = lb - fb
fp = RangeableFileObject(fp, (0,retrlen))
@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup):
# check if range is over the entire file
if (fb,lb) == (0,''): return None
# check that the range is valid
- if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
+ if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
return (fb,lb)
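
RangeError is raised through URLGrabError, which subclasses IOError, so it expects an (errno, strerror) pair; the hunks above simply supply the missing error number 9. A rough sketch of why the two-argument form matters, using stand-in classes rather than the real ones:

    class URLGrabError(IOError):
        pass    # stand-in for urlgrabber.grabber.URLGrabError

    class RangeError(URLGrabError):
        pass    # stand-in for urlgrabber.byterange.RangeError

    try:
        raise RangeError(9, 'Requested Range Not Satisfiable')
    except RangeError, e:
        # with both arguments present, IOError fills in errno and strerror
        assert e.errno == 9
        assert e.strerror == 'Requested Range Not Satisfiable'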
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..38ae1f7 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs)
progress_obj = None
a class instance that supports the following methods:
- po.start(filename, url, basename, length, text)
+ po.start(filename, url, basename, size, now, text)
# length will be None if unknown
po.update(read) # read == bytes read so far
po.end()
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
- timeout = None
+ timeout = 300
- a positive float expressing the number of seconds to wait for socket
- operations. If the value is None or 0.0, socket operations will block
- forever. Setting this option causes urlgrabber to call the settimeout
- method on the Socket object used for the request. See the Python
- documentation on settimeout for more information.
- http://www.python.org/doc/current/lib/socket-objects.html
+ a positive integer expressing the number of seconds to wait before
+ timing out attempts to connect to a server. If the value is None
+ or 0, connection attempts will not time out. The timeout is passed
+ to the underlying pycurl object as its CONNECTTIMEOUT option, see
+ the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
bandwidth = 0
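
A hedged sketch of the libcurl options the new wording refers to (not the module's own code path, just the pycurl calls it describes; the patch also wires the same value into LOW_SPEED_LIMIT/LOW_SPEED_TIME further down, so stalled transfers time out as well):

    import pycurl

    timeout = 300                               # urlgrabber's new default
    c = pycurl.Curl()
    c.setopt(pycurl.CONNECTTIMEOUT, timeout)    # bound the connect phase
    c.setopt(pycurl.LOW_SPEED_LIMIT, 1)         # under 1 byte/s ...
    c.setopt(pycurl.LOW_SPEED_TIME, timeout)    # ... for `timeout` seconds aborts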
@@ -143,8 +143,12 @@ GENERAL ARGUMENTS (kwargs)
note that proxy authentication information may be provided using
normal URL constructs:
proxies={ 'http' : 'http://user:host@foo:3128' }
- Lastly, if proxies is None, the default environment settings will
- be used.
+
+ libproxy = False
+
+ Use the libproxy module (if installed) to find proxies.
+ The libproxy code is only used if the proxies dictionary
+ does not provide any proxies.
prefix = None
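
A short usage sketch of the proxy options as documented above (the host names and URL are invented; '_none_' explicitly disables the proxy for a scheme):

    from urlgrabber.grabber import urlgrab

    proxies = {'http': 'http://user:password@proxycache.example:3128',
               'ftp': '_none_'}
    urlgrab('http://example.com/file.iso', 'file.iso',
            proxies=proxies, libproxy=False)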
@@ -198,6 +202,12 @@ GENERAL ARGUMENTS (kwargs)
control, you should probably subclass URLParser and pass it in via
the 'urlparser' option.
+ username = None
+ username to use for simple http auth - is automatically quoted for special characters
+
+ password = None
+ password to use for simple http auth - is automatically quoted for special characters
+
ssl_ca_cert = None
this option can be used if M2Crypto is available and will be
@@ -211,43 +221,48 @@ GENERAL ARGUMENTS (kwargs)
No-op when using the curl backend (default)
- self.ssl_verify_peer = True
+ ssl_verify_peer = True
Check the server's certificate to make sure it is valid with what our CA validates
- self.ssl_verify_host = True
+ ssl_verify_host = True
Check the server's hostname to make sure it matches the certificate DN
- self.ssl_key = None
+ ssl_key = None
Path to the key the client should use to connect/authenticate with
- self.ssl_key_type = 'PEM'
+ ssl_key_type = 'PEM'
PEM or DER - format of key
- self.ssl_cert = None
+ ssl_cert = None
Path to the ssl certificate the client should use to to authenticate with
- self.ssl_cert_type = 'PEM'
+ ssl_cert_type = 'PEM'
PEM or DER - format of certificate
- self.ssl_key_pass = None
+ ssl_key_pass = None
password to access the ssl_key
- self.size = None
+ size = None
size (in bytes) or Maximum size of the thing being downloaded.
This is mostly to keep us from exploding with an endless datastream
- self.max_header_size = 2097152
+ max_header_size = 2097152
Maximum size (in bytes) of the headers.
+ ip_resolve = 'whatever'
+
+ What type of name to IP resolving to use, default is to do both IPV4 and
+ IPV6.
+
RETRY RELATED ARGUMENTS
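
A sketch of the new ip_resolve and size options in use (values are illustrative):

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(ip_resolve='ipv4',       # resolve IPv4 addresses only
                   size=10 * 1024 * 1024)   # abort if more than ~10 MB arrives
    g.urlgrab('http://example.com/small.img', 'small.img')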
@@ -420,6 +435,7 @@ import time
import string
import urllib
import urllib2
+from httplib import responses
import mimetools
import thread
import types
@@ -431,6 +447,14 @@ from httplib import HTTPException
import socket
from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+try:
+ import xattr
+ if not hasattr(xattr, 'set'):
+ xattr = None # This is a "newer" API.
+except ImportError:
+ xattr = None
+
+
########################################################################
# MODULE INITIALIZATION
########################################################################
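
The optional xattr import is only used later, in _do_close_fo(), to tag downloaded files with their origin URL. A hedged sketch of that usage, assuming the pyxattr module-level API the patch itself relies on (the path and URL are made up):

    try:
        import xattr                     # pyxattr; entirely optional
    except ImportError:
        xattr = None

    if xattr is not None and hasattr(xattr, 'set'):
        # record where the file came from, per the freedesktop.org convention
        xattr.set('/tmp/somefile', 'user.xdg.origin.url',
                  'http://example.com/somefile')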
@@ -439,6 +463,12 @@ try:
except:
__version__ = '???'
+try:
+ # this part isn't going to do much - need to talk to gettext
+ from i18n import _
+except ImportError, msg:
+ def _(st): return st
+
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -527,6 +557,22 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################
+########################################################################
+# UTILITY FUNCTIONS
+########################################################################
+
+# These functions are meant to be utilities for the urlgrabber library to use.
+
+def _to_utf8(obj, errors='replace'):
+ '''convert 'unicode' to an encoded utf-8 byte string '''
+ # stolen from yum.i18n
+ if isinstance(obj, unicode):
+ obj = obj.encode('utf-8', errors)
+ return obj
+
+########################################################################
+# END UTILITY FUNCTIONS
+########################################################################
class URLGrabError(IOError):
@@ -662,6 +708,7 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
+ url = _to_utf8(url)
quote = opts.quote
if opts.prefix:
@@ -768,6 +815,41 @@ class URLGrabberOptions:
else: # throttle is a float
return self.bandwidth * self.throttle
+ def find_proxy(self, url, scheme):
+ """Find the proxy to use for this URL.
+ Use the proxies dictionary first, then libproxy.
+ """
+ self.proxy = None
+ if scheme not in ('ftp', 'http', 'https'):
+ return
+
+ if self.proxies:
+ proxy = self.proxies.get(scheme)
+ if proxy is None:
+ if scheme == 'http':
+ proxy = self.proxies.get('https')
+ elif scheme == 'https':
+ proxy = self.proxies.get('http')
+ if proxy == '_none_':
+ proxy = ''
+ self.proxy = proxy
+ return
+
+ if self.libproxy:
+ global _libproxy_cache
+ if _libproxy_cache is None:
+ try:
+ import libproxy
+ _libproxy_cache = libproxy.ProxyFactory()
+ except:
+ _libproxy_cache = False
+ if _libproxy_cache:
+ for proxy in _libproxy_cache.getProxies(url):
+ if proxy.startswith('http://'):
+ if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
+ self.proxy = proxy
+ break
+
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
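
A usage sketch of the new find_proxy() resolution order, built on a derived options object (the proxy host is invented):

    from urlgrabber.grabber import default_grabber

    opts = default_grabber.opts.derive(
        proxies={'http': 'http://cache.example:3128', 'https': '_none_'})

    opts.find_proxy('http://example.com/path', 'http')
    assert opts.proxy == 'http://cache.example:3128'

    opts.find_proxy('https://example.com/path', 'https')
    assert opts.proxy == ''      # '_none_' means "explicitly no proxy"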
@@ -800,21 +882,25 @@ class URLGrabberOptions:
self.close_connection = 0
self.range = None
self.user_agent = 'urlgrabber/%s' % __version__
+ self.ip_resolve = None
self.keepalive = 1
self.proxies = None
+ self.libproxy = False
self.reget = None
self.failure_callback = None
self.interrupt_callback = None
self.prefix = None
self.opener = None
self.cache_openers = True
- self.timeout = None
+ self.timeout = 300
self.text = None
self.http_headers = None
self.ftp_headers = None
self.data = None
self.urlparser = URLParser()
self.quote = None
+ self.username = None
+ self.password = None
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticityb
@@ -846,7 +932,7 @@ class URLGrabberOptions:
s = s + indent + '}'
return s
-class URLGrabber:
+class URLGrabber(object):
"""Provides easy opening of URLs with a variety of options.
All options are specified as kwargs. Options may be specified when
@@ -912,9 +998,11 @@ class URLGrabber:
returned that supports them. The file object can be treated
like any other file object.
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
def retryfunc(opts, url):
return PyCurlFileObject(url, filename=None, opts=opts)
return self._retry(opts, retryfunc, url)
@@ -925,12 +1013,17 @@ class URLGrabber:
urlgrab returns the filename of the local file, which may be
different from the passed-in filename if copy_local == 0.
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
+ opts.find_proxy(url, scheme)
if filename is None:
filename = os.path.basename( urllib.unquote(path) )
+ if not filename:
+ # This is better than nothing.
+ filename = 'index.html'
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
@@ -982,9 +1075,11 @@ class URLGrabber:
"I want the first N bytes" but rather 'read the whole file
into memory, but don't use too much'
"""
+ url = _to_utf8(url)
opts = self.opts.derive(**kwargs)
if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
(url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
if limit is not None:
limit = limit + 1
@@ -1030,7 +1125,7 @@ class URLGrabber:
default_grabber = URLGrabber()
-class PyCurlFileObject():
+class PyCurlFileObject(object):
def __init__(self, url, filename, opts):
self.fo = None
self._hdr_dump = ''
@@ -1052,10 +1147,11 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
- self.size = None
+ self.size = 0
+ self._hdr_ended = False
self._do_open()
-
+
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
@@ -1085,9 +1181,14 @@ class PyCurlFileObject():
return -1
def _hdr_retrieve(self, buf):
+ if self._hdr_ended:
+ self._hdr_dump = ''
+ self.size = 0
+ self._hdr_ended = False
+
if self._over_max_size(cur=len(self._hdr_dump),
max_size=self.opts.max_header_size):
- return -1
+ return -1
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
@@ -1104,7 +1205,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
-
+
+ if buf.lower().find('location') != -1:
+ location = ':'.join(buf.split(':')[1:])
+ location = location.strip()
+ self.scheme = urlparse.urlsplit(location)[0]
+ self.url = location
+
+ if len(self._hdr_dump) != 0 and buf == '\r\n':
+ self._hdr_ended = True
+ if DEBUG: DEBUG.info('header ended:')
+
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1113,8 +1224,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
+ statusend += 1 # ridiculous as it may seem.
hdrfp = StringIO()
hdrfp.write(self._hdr_dump[statusend:])
+ hdrfp.seek(0)
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
@@ -1127,6 +1240,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts
+ # keepalives
+ if not opts.keepalive:
+ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1252,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
if DEBUG:
self.curl_obj.setopt(pycurl.VERBOSE, True)
if opts.user_agent:
self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+ if opts.ip_resolve:
+ # Default is: IPRESOLVE_WHATEVER
+ ipr = opts.ip_resolve.lower()
+ if ipr == 'whatever': # Do we need this?
+ self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER)
+ if ipr == 'ipv4':
+ self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+ if ipr == 'ipv6':
+ self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1274,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
- if opts.timeout:
- timeout = int(opts.timeout)
- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ if hasattr(opts, 'timeout'):
+ timeout = int(opts.timeout or 0)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
if self.scheme == 'https':
@@ -1158,13 +1286,16 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
- self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+ if opts.ssl_verify_host: # 1 is meaningless to curl
+ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
if opts.ssl_key:
self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
if opts.ssl_key_type:
self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
if opts.ssl_cert:
self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+ # if we have a client side cert - turn off reuse b/c nss is odd
+ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
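
The SSL_VERIFYHOST change matters because libcurl treats 1 as a near-useless debug value; only 2 actually checks that the certificate matches the host name. A pycurl-level sketch of the same decision:

    import pycurl

    c = pycurl.Curl()
    ssl_verify_host = True
    if ssl_verify_host:
        c.setopt(pycurl.SSL_VERIFYHOST, 2)   # 1 is meaningless to curl
    else:
        c.setopt(pycurl.SSL_VERIFYHOST, 0)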
@@ -1187,28 +1318,24 @@ class PyCurlFileObject():
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
- # proxy settings
- if opts.proxies:
- for (scheme, proxy) in opts.proxies.items():
- if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
- if scheme not in ('ftp'):
- continue
- else:
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
- elif self.scheme in ('http', 'https'):
- if scheme not in ('http', 'https'):
- continue
- else:
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
-
- # FIXME username/password/auth settings
+ # proxy
+ if opts.proxy is not None:
+ self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
+ self.curl_obj.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
+
+ if opts.username and opts.password:
+ if self.scheme in ('http', 'https'):
+ self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
+
+ if opts.username and opts.password:
+ # apparently when applying them as curlopts they do not require quoting of any kind
+ userpwd = '%s:%s' % (opts.username, opts.password)
+ self.curl_obj.setopt(pycurl.USERPWD, userpwd)
#posts - simple - expects the fields as they are
if opts.data:
self.curl_obj.setopt(pycurl.POST, True)
- self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
+ self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
# our url
self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1219,8 +1346,14 @@ class PyCurlFileObject():
return
try:
+ e = None
self.curl_obj.perform()
- except pycurl.error, e:
+ except pycurl.error, e: pass
+ self._do_perform_exc(e)
+
+ def _do_perform_exc(self, e):
+ # handle pycurl exception 'e'
+ if e:
# XXX - break some of these out a bit more clearly
# to other URLGrabErrors from
# http://curl.haxx.se/libcurl/c/libcurl-errors.html
@@ -1228,12 +1361,14 @@ class PyCurlFileObject():
code = self.http_code
errcode = e.args[0]
+ errurl = urllib.unquote(self.url)
+
if self._error[0]:
errcode = self._error[0]
if errcode == 23 and code >= 200 and code < 299:
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
- err.url = self.url
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+ err.url = errurl
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
@@ -1244,23 +1379,23 @@ class PyCurlFileObject():
raise KeyboardInterrupt
elif errcode == 28:
- err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
- err.url = self.url
+ err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
+ err.url = errurl
raise err
elif errcode == 35:
msg = _("problem making ssl connection")
err = URLGrabError(14, msg)
- err.url = self.url
+ err.url = errurl
raise err
elif errcode == 37:
- msg = _("Could not open/read %s") % (self.url)
+ msg = _("Could not open/read %s") % (errurl)
err = URLGrabError(14, msg)
- err.url = self.url
+ err.url = errurl
raise err
elif errcode == 42:
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
- err.url = self.url
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+ err.url = errurl
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
@@ -1272,33 +1407,94 @@ class PyCurlFileObject():
elif errcode == 58:
msg = _("problem with the local client certificate")
err = URLGrabError(14, msg)
- err.url = self.url
+ err.url = errurl
raise err
elif errcode == 60:
- msg = _("client cert cannot be verified or client cert incorrect")
+ msg = _("Peer cert cannot be verified or peer cert invalid")
err = URLGrabError(14, msg)
- err.url = self.url
+ err.url = errurl
raise err
elif errcode == 63:
if self._error[1]:
msg = self._error[1]
else:
- msg = _("Max download size exceeded on %s") % (self.url)
+ msg = _("Max download size exceeded on %s") % ()
err = URLGrabError(14, msg)
- err.url = self.url
+ err.url = errurl
raise err
elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+ if self.scheme in ['http', 'https']:
+ if self.http_code in responses:
+ resp = responses[self.http_code]
+ msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
+ else:
+ msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
+ elif self.scheme in ['ftp']:
+ msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
+ else:
+ msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
else:
- msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+ pyerr2str = { 5 : _("Couldn't resolve proxy"),
+ 6 : _("Couldn't resolve host"),
+ 7 : _("Couldn't connect"),
+ 8 : _("Bad reply to FTP server"),
+ 9 : _("Access denied"),
+ 11 : _("Bad reply to FTP pass"),
+ 13 : _("Bad reply to FTP pasv"),
+ 14 : _("Bad reply to FTP 227"),
+ 15 : _("Couldn't get FTP host"),
+ 17 : _("Couldn't set FTP type"),
+ 18 : _("Partial file"),
+ 19 : _("FTP RETR command failed"),
+ 22 : _("HTTP returned error"),
+ 23 : _("Write error"),
+ 25 : _("Upload failed"),
+ 26 : _("Read error"),
+ 27 : _("Out of Memory"),
+ 28 : _("Operation timed out"),
+ 30 : _("FTP PORT command failed"),
+ 31 : _("FTP REST command failed"),
+ 33 : _("Range failed"),
+ 34 : _("HTTP POST failed"),
+ 35 : _("SSL CONNECT failed"),
+ 36 : _("Couldn't resume download"),
+ 37 : _("Couldn't read file"),
+ 42 : _("Aborted by callback"),
+ 47 : _("Too many redirects"),
+ 51 : _("Peer certificate failed verification"),
+ 52 : _("Got nothing: SSL certificate expired?"),
+ 53 : _("SSL engine not found"),
+ 54 : _("SSL engine set failed"),
+ 55 : _("Network error send()"),
+ 56 : _("Network error recv()"),
+ 58 : _("Local certificate failed"),
+ 59 : _("SSL set cipher failed"),
+ 60 : _("Local CA certificate failed"),
+ 61 : _("HTTP bad transfer encoding"),
+ 63 : _("Maximum file size exceeded"),
+ 64 : _("FTP SSL failed"),
+ 67 : _("Authentication failure"),
+ 70 : _("Out of disk space on server"),
+ 73 : _("Remove file exists"),
+ }
+ errstr = str(e.args[1])
+ if not errstr:
+ errstr = pyerr2str.get(errcode, '<Unknown>')
+ msg = 'curl#%s - "%s"' % (errcode, errstr)
code = errcode
err = URLGrabError(14, msg)
err.code = code
err.exception = e
raise err
+ else:
+ if self._error[1]:
+ msg = self._error[1]
+ err = URLGrabError(14, msg)
+ err.url = urllib.unquote(self.url)
+ raise err
def _do_open(self):
self.curl_obj = _curl_cache
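
A small sketch of how the new lookup table turns a bare pycurl error into a readable message when libcurl hands back an empty error string (the table here is truncated to three entries):

    pyerr2str = {6: "Couldn't resolve host",
                 28: "Operation timed out",
                 37: "Couldn't read file"}

    def curl_error_to_msg(errcode, errstr):
        # fall back to the table when libcurl gives us nothing useful
        if not errstr:
            errstr = pyerr2str.get(errcode, '<Unknown>')
        return 'curl#%s - "%s"' % (errcode, errstr)

    assert curl_error_to_msg(28, '') == 'curl#28 - "Operation timed out"'
    assert curl_error_to_msg(99, '') == 'curl#99 - "<Unknown>"'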
@@ -1333,7 +1529,11 @@ class PyCurlFileObject():
if self.opts.range:
rt = self.opts.range
- if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
+ if rt[0] is None:
+ rt = (0, rt[1])
+ rt = (rt[0] + reget_length, rt[1])
+
if rt:
header = range_tuple_to_header(rt)
@@ -1407,22 +1607,7 @@ class PyCurlFileObject():
_was_filename = False
if type(self.filename) in types.StringTypes and self.filename:
_was_filename = True
- self._prog_reportname = str(self.filename)
- self._prog_basename = os.path.basename(self.filename)
-
- if self.append: mode = 'ab'
- else: mode = 'wb'
-
- if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
- (self.filename, mode))
- try:
- self.fo = open(self.filename, mode)
- except IOError, e:
- err = URLGrabError(16, _(\
- 'error opening local file from %s, IOError: %s') % (self.url, e))
- err.url = self.url
- raise err
-
+ self._do_open_fo()
else:
self._prog_reportname = 'MEMORY'
self._prog_basename = 'MEMORY'
@@ -1434,27 +1619,71 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
-
- self._do_perform()
-
-
-
- if _was_filename:
- # close it up
+ try:
+ self._do_perform()
+ except URLGrabError, e:
self.fo.flush()
self.fo.close()
- # set the time
- mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
- if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ raise e
+
+ if _was_filename:
+ self._do_close_fo()
# re open it
- self.fo = open(self.filename, 'r')
+ try:
+ self.fo = open(self.filename, 'r')
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
self._complete = True
+ def _do_open_fo(self):
+ self._prog_reportname = str(self.filename)
+ self._prog_basename = os.path.basename(self.filename)
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ self.fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ def _do_close_fo(self):
+ # close it up
+ self.fo.flush()
+ self.fo.close()
+
+ # Set the URL where we got it from:
+ if xattr is not None:
+ # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+ try:
+ xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+ except:
+ pass # URL too long. = IOError ... ignore everything.
+
+ # set the time
+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+ if mod_time != -1:
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
+
def _fill_buffer(self, amt=None):
"""fill the buffer to contain at least 'amt' bytes by reading
from the underlying file object. If amt is None, then it will
@@ -1526,17 +1755,20 @@ class PyCurlFileObject():
if self._prog_running:
downloaded += self._reget_length
self.opts.progress_obj.update(downloaded)
- except KeyboardInterrupt:
+ except (KeyboardInterrupt, IOError):
return -1
def _over_max_size(self, cur, max_size=None):
if not max_size:
- max_size = self.size
- if self.opts.size: # if we set an opts size use that, no matter what
- max_size = self.opts.size
+ if not self.opts.size:
+ max_size = self.size
+ else:
+ max_size = self.opts.size
+
if not max_size: return False # if we have None for all of the Max then this is dumb
- if cur > max_size + max_size*.10:
+
+ if cur > int(float(max_size) * 1.10):
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
@@ -1544,13 +1776,6 @@ class PyCurlFileObject():
return True
return False
- def _to_utf8(self, obj, errors='replace'):
- '''convert 'unicode' to an encoded utf-8 byte string '''
- # stolen from yum.i18n
- if isinstance(obj, unicode):
- obj = obj.encode('utf-8', errors)
- return obj
-
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
@@ -1582,9 +1807,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
-
+ def geturl(self):
+ """ Provide the geturl() method, used to be got from
+ urllib.addinfourl, via. urllib.URLopener.* """
+ return self.url
+
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+def reset_curl_obj():
+ """To make sure curl has reread the network/dns info we force a reload"""
+ global _curl_cache
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
+
+_libproxy_cache = None
+
#####################################################################
# DEPRECATED FUNCTIONS
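
A usage sketch of the two small additions at the end of grabber.py (the URL is illustrative):

    from urlgrabber import grabber

    # force libcurl to re-read resolver/network state, e.g. after a network change
    grabber.reset_curl_obj()

    fo = grabber.urlopen('http://example.com/')
    print fo.geturl()     # final URL, mirroring urllib's geturl() behaviour
    fo.close()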
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index dad410b..8731aed 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -90,7 +90,7 @@ CUSTOMIZATION
import random
import thread # needed for locking to make this threadsafe
-from grabber import URLGrabError, CallbackObject, DEBUG
+from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
def _(st):
return st
@@ -263,7 +263,8 @@ class MirrorGroup:
def _parse_mirrors(self, mirrors):
parsed_mirrors = []
for m in mirrors:
- if type(m) == type(''): m = {'mirror': m}
+ if isinstance(m, basestring):
+ m = {'mirror': _to_utf8(m)}
parsed_mirrors.append(m)
return parsed_mirrors
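
A sketch of what the relaxed _parse_mirrors() accepts: str, unicode, and dict entries all end up as byte-string mirror dicts (the mirror hosts are invented):

    from urlgrabber.grabber import URLGrabber
    from urlgrabber.mirror import MirrorGroup

    mirrors = [u'http://mirror1.example/repo/',     # unicode gets utf-8 encoded
               'http://mirror2.example/repo/',      # plain str passes through
               {'mirror': 'http://mirror3.example/repo/'}]
    mg = MirrorGroup(URLGrabber(), mirrors)
    mg.urlgrab('Packages/foo.rpm', 'foo.rpm')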
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..3d7e99a 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -211,6 +211,21 @@ def text_meter_total_size(size, downloaded=0):
# 4. + ( 5, total: 32)
#
+def _term_add_bar(tl, bar_max_length, pc):
+ blen = bar_max_length
+ bar = '='*int(blen * pc)
+ if (blen * pc) - int(blen * pc) >= 0.5:
+ bar += '-'
+ return tl.add(' [%-*.*s]' % (blen, blen, bar))
+
+def _term_add_end(tl, osize, size):
+ if osize is not None:
+ if size > osize: # Is ??? better? Really need something to say < vs >.
+ return tl.add(' !!! '), True
+ elif size != osize:
+ return tl.add(' ... '), True
+ return tl.add(' ' * 5), False
+
class TextMeter(BaseMeter):
def __init__(self, fo=sys.stderr):
BaseMeter.__init__(self)
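
The bar helper rounds to half-characters by appending '-'. A standalone sketch of the same rounding rule without the TerminalLine bookkeeping:

    def _bar_string(blen, pc):
        # same logic as _term_add_bar, minus the tl.add() width accounting
        bar = '=' * int(blen * pc)
        if (blen * pc) - int(blen * pc) >= 0.5:
            bar += '-'
        return ' [%-*.*s]' % (blen, blen, bar)

    assert _bar_string(10, 0.50) == ' [=====     ]'
    assert _bar_string(10, 0.56) == ' [=====-    ]'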
@@ -259,13 +274,10 @@ class TextMeter(BaseMeter):
ui_rate = tl.add(' %5sB/s' % ave_dl)
# Make text grow a bit before we start growing the bar too
blen = 4 + tl.rest_split(8 + 8 + 4)
- bar = '='*int(blen * frac)
- if (blen * frac) - int(blen * frac) >= 0.5:
- bar += '-'
- ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
- out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
- ui_sofar_pc, ui_pc, ui_bar,
- ui_rate, ui_size, ui_time, ui_end)
+ ui_bar = _term_add_bar(tl, blen, frac)
+ out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_pc, ui_bar,
+ ui_rate,ui_size,ui_time, ui_end)
self.fo.write(out)
self.fo.flush()
@@ -284,12 +296,7 @@ class TextMeter(BaseMeter):
tl = TerminalLine(8)
ui_size = tl.add(' | %5sB' % total_size)
ui_time = tl.add(' %9s' % total_time)
- not_done = self.size is not None and amount_read != self.size
- if not_done:
- ui_end = tl.add(' ... ')
- else:
- ui_end = tl.add(' ' * 5)
-
+ ui_end, not_done = _term_add_end(tl, self.size, amount_read)
out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
ui_size, ui_time, ui_end)
self.fo.write(out)
@@ -331,12 +338,21 @@ class MultiFileHelper(BaseMeter):
def message(self, message):
self.master.message_meter(self, message)
+class _FakeLock:
+ def acquire(self):
+ pass
+ def release(self):
+ pass
+
class MultiFileMeter:
helperclass = MultiFileHelper
- def __init__(self):
+ def __init__(self, threaded=True):
self.meters = []
self.in_progress_meters = []
- self._lock = thread.allocate_lock()
+ if threaded:
+ self._lock = thread.allocate_lock()
+ else:
+ self._lock = _FakeLock()
self.update_period = 0.3 # seconds
self.numfiles = None
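
A usage sketch of the new threaded flag, which lets single-threaded callers skip real locking (the output stream choice is just an example):

    import sys
    from urlgrabber.progress import TextMultiFileMeter

    # no real lock is taken when everything runs in one thread
    meter = TextMultiFileMeter(fo=sys.stderr, threaded=False)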
@@ -369,6 +385,7 @@ class MultiFileMeter:
def end(self, now=None):
if now is None: now = time.time()
+ self.re.update(self._amount_read(), now)
self._do_end(now)
def _do_end(self, now):
@@ -466,11 +483,20 @@ class MultiFileMeter:
class TextMultiFileMeter(MultiFileMeter):
- def __init__(self, fo=sys.stderr):
+ def __init__(self, fo=sys.stderr, threaded=True):
self.fo = fo
- MultiFileMeter.__init__(self)
+ MultiFileMeter.__init__(self, threaded)
# files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
+# New output, like TextMeter output...
+# update: Size, All files
+# -----------------------
+# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA
+# 8-22 1 3-4 1 6-12 1 8 3 6 1 9 1 3 1
+# end
+# ---
+# <text> | <file size> <file elapsed time>
+# 8-56 3 6 1 9 5
def _do_update_meter(self, meter, now):
self._lock.acquire()
try:
@@ -480,7 +506,7 @@ class TextMultiFileMeter(MultiFileMeter):
tf = self.numfiles or 1
pf = 100 * float(df)/tf + 0.49
dd = self.re.last_amount_read
- td = self.total_size
+ td = self.re.total
pd = 100 * (self.re.fraction_read() or 0) + 0.49
dt = self.re.elapsed_time()
rt = self.re.remaining_time()
@@ -491,9 +517,33 @@ class TextMultiFileMeter(MultiFileMeter):
ftd = format_number(td) + 'B'
fdt = format_time(dt, 1)
ftt = format_time(tt, 1)
-
- out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
- self.fo.write('\r' + out)
+
+ frac = self.re.fraction_read() or 0
+ ave_dl = format_number(self.re.average_rate())
+ text = meter.text or meter.basename
+ if tf > 1:
+ text = '(%u/%u): %s' % (df+1, tf, text)
+
+ # Include text + ui_rate in minimal
+ tl = TerminalLine(8, 8+1+8)
+
+ ui_size = tl.add(' | %5sB' % format_number(dd))
+
+ ui_time = tl.add(' %9s' % format_time(rt))
+ ui_end = tl.add(' ETA ')
+
+ ui_sofar_pc = tl.add(' %i%%' % pf,
+ full_len=len(" (100%)"))
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+
+ # Make text grow a bit before we start growing the bar too
+ blen = 4 + tl.rest_split(8 + 8 + 4)
+ ui_bar = _term_add_bar(tl, blen, frac)
+ out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_bar,
+ ui_rate, ui_size, ui_time,
+ ui_end)
+ self.fo.write(out)
self.fo.flush()
finally:
self._lock.release()
@@ -502,15 +552,28 @@ class TextMultiFileMeter(MultiFileMeter):
self._lock.acquire()
try:
format = "%-30.30s %6.6s %8.8s %9.9s"
- fn = meter.basename
+ fn = meter.text or meter.basename
size = meter.last_amount_read
fsize = format_number(size) + 'B'
et = meter.re.elapsed_time()
fet = format_time(et, 1)
- frate = format_number(size / et) + 'B/s'
-
- out = '%-79.79s' % (format % (fn, fsize, fet, frate))
- self.fo.write('\r' + out + '\n')
+ frate = format_number(et and size / et) + 'B/s'
+ df = self.finished_files
+ tf = self.numfiles or 1
+
+ total_time = format_time(et)
+ total_size = format_number(size)
+ text = meter.text or meter.basename
+ if tf > 1:
+ text = '(%u/%u): %s' % (df, tf, text)
+
+ tl = TerminalLine(8)
+ ui_size = tl.add(' | %5sB' % total_size)
+ ui_time = tl.add(' %9s' % total_time)
+ ui_end, not_done = _term_add_end(tl, meter.size, size)
+ out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
+ ui_size, ui_time, ui_end)
+ self.fo.write(out)
finally:
self._lock.release()
self._do_update_meter(meter, now)
@@ -658,6 +721,8 @@ def format_time(seconds, use_hours=0):
if seconds is None or seconds < 0:
if use_hours: return '--:--:--'
else: return '--:--'
+ elif seconds == float('inf'):
+ return 'Infinite'
else:
seconds = int(seconds)
minutes = seconds / 60
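
A quick check of the new infinity case (remaining-time estimates become infinite when the measured rate is zero):

    from urlgrabber.progress import format_time

    assert format_time(None) == '--:--'
    assert format_time(float('inf')) == 'Infinite'
    assert format_time(90) == '01:30'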