python-urlgrabber/urlgrabber-HEAD.patch


diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
+++ b/scripts/urlgrabber
@@ -115,6 +115,7 @@ options:
including quotes in the case of strings.
e.g. --user_agent='"foobar/2.0"'
+ --output FILE
-o FILE write output to FILE, otherwise the basename of the
url will be used
-O print the names of saved files to STDOUT
@@ -170,12 +171,17 @@ class client_options:
return ug_options, ug_defaults
def process_command_line(self):
- short_options = 'vd:hoOpD'
+ short_options = 'vd:ho:OpD'
long_options = ['profile', 'repeat=', 'verbose=',
- 'debug=', 'help', 'progress']
+ 'debug=', 'help', 'progress', 'output=']
ug_long = [ o + '=' for o in self.ug_options ]
- optlist, args = getopt.getopt(sys.argv[1:], short_options,
- long_options + ug_long)
+ try:
+ optlist, args = getopt.getopt(sys.argv[1:], short_options,
+ long_options + ug_long)
+ except getopt.GetoptError, e:
+ print >>sys.stderr, "Error:", e
+ self.help([], ret=1)
+
self.verbose = 0
self.debug = None
self.outputfile = None
@@ -193,6 +199,7 @@ class client_options:
if o == '--verbose': self.verbose = v
if o == '-v': self.verbose += 1
if o == '-o': self.outputfile = v
+ if o == '--output': self.outputfile = v
if o == '-p' or o == '--progress': self.progress = 1
if o == '-d' or o == '--debug': self.debug = v
if o == '--profile': self.profile = 1
@@ -222,7 +229,7 @@ class client_options:
print "ERROR: cannot use -o when grabbing multiple files"
sys.exit(1)
- def help(self, args):
+ def help(self, args, ret=0):
if not args:
print MAINHELP
else:
@@ -234,7 +241,7 @@ class client_options:
self.help_ug_option(a)
else:
print 'ERROR: no help on command "%s"' % a
- sys.exit(0)
+ sys.exit(ret)
def help_doc(self):
print __doc__
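
For context, the getopt change above makes bad command-line options exit cleanly instead of dumping a traceback. A minimal standalone sketch of the same pattern (Python 2, to match the codebase; the option names here are hypothetical):

    import getopt
    import sys

    def parse_args(argv):
        # Same pattern as the hunk above: report bad options on stderr
        # and exit non-zero instead of raising through to a traceback.
        try:
            optlist, args = getopt.getopt(argv, 'vo:', ['verbose', 'output='])
        except getopt.GetoptError, e:
            print >>sys.stderr, "Error:", e
            sys.exit(1)
        return optlist, args

    if __name__ == '__main__':
        print parse_args(sys.argv[1:])
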
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
from munittest import *
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
base_ftp = 'ftp://localhost/test/'
# set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..0c78857 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
- timeout = None
+ timeout = 300
- a positive float expressing the number of seconds to wait for socket
- operations. If the value is None or 0.0, socket operations will block
- forever. Setting this option causes urlgrabber to call the settimeout
- method on the Socket object used for the request. See the Python
- documentation on settimeout for more information.
- http://www.python.org/doc/current/lib/socket-objects.html
+ a positive integer expressing the number of seconds to wait before
+ timing out attempts to connect to a server. If the value is None
+ or 0, connection attempts will not time out. The timeout is passed
+ to the underlying pycurl object as its CONNECTTIMEOUT option, see
+ the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
bandwidth = 0
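
The rewritten docstring reflects that the timeout now maps to libcurl's connect phase rather than to socket.settimeout(). A minimal pycurl sketch of what the option does (placeholder URL, not from the patch):

    import pycurl
    from StringIO import StringIO

    buf = StringIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/')      # placeholder URL
    c.setopt(pycurl.WRITEFUNCTION, buf.write)
    # Give up if the connection is not established within 300 seconds;
    # 0 would mean "never time out the connect phase".
    c.setopt(pycurl.CONNECTTIMEOUT, 300)
    c.perform()
    c.close()
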
@@ -248,6 +248,11 @@ GENERAL ARGUMENTS (kwargs)
Maximum size (in bytes) of the headers.
+ ip_resolve = 'whatever'
+
+ What type of name-to-IP resolution to use; the default is to try
+ both IPv4 and IPv6.
+
RETRY RELATED ARGUMENTS
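
A hedged usage sketch for the new ip_resolve option (URL and filename are placeholders):

    from urlgrabber.grabber import urlgrab

    # Force IPv4-only lookups for this fetch; 'ipv6' and 'whatever'
    # (the default: accept either) are the other recognized values.
    urlgrab('http://example.com/some.rpm', filename='/tmp/some.rpm',
            ip_resolve='ipv4')
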
@@ -439,6 +444,12 @@ try:
except:
__version__ = '???'
+try:
+ # this part isn't going to do much - need to talk to gettext
+ from i18n import _
+except ImportError, msg:
+ def _(st): return st
+
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -800,6 +811,7 @@ class URLGrabberOptions:
self.close_connection = 0
self.range = None
self.user_agent = 'urlgrabber/%s' % __version__
+ self.ip_resolve = None
self.keepalive = 1
self.proxies = None
self.reget = None
@@ -808,7 +820,7 @@ class URLGrabberOptions:
self.prefix = None
self.opener = None
self.cache_openers = True
- self.timeout = None
+ self.timeout = 300
self.text = None
self.http_headers = None
self.ftp_headers = None
@@ -931,6 +943,9 @@ class URLGrabber:
(scheme, host, path, parm, query, frag) = parts
if filename is None:
filename = os.path.basename( urllib.unquote(path) )
+ if not filename:
+ # This is better than nothing.
+ filename = 'index.html'
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
@@ -1052,9 +1067,15 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
- self.size = None
+ self.size = 0
+ self._hdr_ended = False
self._do_open()
+
+ def geturl(self):
+ """ Provide the geturl() method, used to be got from
+ urllib.addinfourl, via. urllib.URLopener.* """
+ return self.url
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
@@ -1085,9 +1106,14 @@ class PyCurlFileObject():
return -1
def _hdr_retrieve(self, buf):
+ if self._hdr_ended:
+ self._hdr_dump = ''
+ self.size = 0
+ self._hdr_ended = False
+
if self._over_max_size(cur=len(self._hdr_dump),
max_size=self.opts.max_header_size):
- return -1
+ return -1
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
@@ -1104,7 +1130,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
-
+
+ if buf.lower().find('location') != -1:
+ location = ':'.join(buf.split(':')[1:])
+ location = location.strip()
+ self.scheme = urlparse.urlsplit(location)[0]
+ self.url = location
+
+ if len(self._hdr_dump) != 0 and buf == '\r\n':
+ self._hdr_ended = True
+ if DEBUG: DEBUG.info('header ended:')
+
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
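
The callback above detects the blank '\r\n' line that ends each header block, so a redirect's headers replace rather than extend the previous block, and it records any Location header as the new effective URL. A simplified standalone sketch of that parsing (sample headers invented for illustration):

    def feed_header_lines(lines):
        hdr_dump = ''
        url = None
        for buf in lines:
            if buf.lower().find('location') != -1:
                # Rejoin after the first ':' since the URL itself
                # contains colons (e.g. 'http://').
                url = ':'.join(buf.split(':')[1:]).strip()
            if len(hdr_dump) != 0 and buf == '\r\n':
                # Blank line ends this block; a redirect's headers
                # may follow, so start clean.
                hdr_dump = ''
            else:
                hdr_dump += buf
        return url

    print feed_header_lines(['HTTP/1.1 302 Found\r\n',
                             'Location: http://mirror.example/f\r\n',
                             '\r\n',
                             'HTTP/1.1 200 OK\r\n',
                             '\r\n'])
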
@@ -1113,8 +1149,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
+ statusend += 1 # move past the newline so the status line is fully excluded
hdrfp = StringIO()
hdrfp.write(self._hdr_dump[statusend:])
+ hdrfp.seek(0)
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
@@ -1136,11 +1174,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
if DEBUG:
self.curl_obj.setopt(pycurl.VERBOSE, True)
if opts.user_agent:
self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+ if opts.ip_resolve:
+ # Default is: IPRESOLVE_WHATEVER
+ ipr = opts.ip_resolve.lower()
+ if ipr == 'whatever': # Do we need this?
+ self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER)
+ if ipr == 'ipv4':
+ self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+ if ipr == 'ipv6':
+ self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1196,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
- if opts.timeout:
- timeout = int(opts.timeout)
- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ if hasattr(opts, 'timeout'):
+ timeout = int(opts.timeout or 0)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
if self.scheme == 'https':
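
Alongside the connect timeout, the hunk adds a stall detector: libcurl aborts the transfer if throughput stays below LOW_SPEED_LIMIT bytes per second for LOW_SPEED_TIME seconds. The same pair of options on a bare pycurl handle (placeholder URL):

    import pycurl

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/big.iso')   # placeholder
    c.setopt(pycurl.CONNECTTIMEOUT, 300)
    # Abort if we average under 1 byte/sec for 300 consecutive seconds,
    # i.e. treat a stalled download the same as a timeout.
    c.setopt(pycurl.LOW_SPEED_LIMIT, 1)
    c.setopt(pycurl.LOW_SPEED_TIME, 300)
    c.perform()
    c.close()
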
@@ -1276,7 +1326,7 @@ class PyCurlFileObject():
raise err
elif errcode == 60:
- msg = _("client cert cannot be verified or client cert incorrect")
+ msg = _("Peer cert cannot be verified or peer cert invalid")
err = URLGrabError(14, msg)
err.url = self.url
raise err
@@ -1291,14 +1341,70 @@ class PyCurlFileObject():
raise err
elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+ if self.scheme in ['http', 'https']:
+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+ elif self.scheme in ['ftp']:
+ msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
+ else:
+ msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
else:
- msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+ pyerr2str = { 5 : _("Couldn't resolve proxy"),
+ 6 : _("Couldn't resolve host"),
+ 7 : _("Couldn't connect"),
+ 8 : _("Bad reply to FTP server"),
+ 9 : _("Access denied"),
+ 11 : _("Bad reply to FTP pass"),
+ 13 : _("Bad reply to FTP pasv"),
+ 14 : _("Bad reply to FTP 227"),
+ 15 : _("Couldn't get FTP host"),
+ 17 : _("Couldn't set FTP type"),
+ 18 : _("Partial file"),
+ 19 : _("FTP RETR command failed"),
+ 22 : _("HTTP returned error"),
+ 23 : _("Write error"),
+ 25 : _("Upload failed"),
+ 26 : _("Read error"),
+ 27 : _("Out of Memory"),
+ 28 : _("Operation timed out"),
+ 30 : _("FTP PORT command failed"),
+ 31 : _("FTP REST command failed"),
+ 33 : _("Range failed"),
+ 34 : _("HTTP POST failed"),
+ 35 : _("SSL CONNECT failed"),
+ 36 : _("Couldn't resume download"),
+ 37 : _("Couldn't read file"),
+ 42 : _("Aborted by callback"),
+ 47 : _("Too many redirects"),
+ 51 : _("Peer certificate failed verification"),
+ 53 : _("SSL engine not found"),
+ 54 : _("SSL engine set failed"),
+ 55 : _("Network error send()"),
+ 56 : _("Network error recv()"),
+ 58 : _("Local certificate failed"),
+ 59 : _("SSL set cipher failed"),
+ 60 : _("Local CA certificate failed"),
+ 61 : _("HTTP bad transfer encoding"),
+ 63 : _("Maximum file size exceeded"),
+ 64 : _("FTP SSL failed"),
+ 67 : _("Authentication failure"),
+ 70 : _("Out of disk space on server"),
+ 73 : _("Remove file exists"),
+ }
+ errstr = str(e.args[1])
+ if not errstr:
+ errstr = pyerr2str.get(errcode, '<Unknown>')
+ msg = 'curl#%s - "%s"' % (errcode, errstr)
code = errcode
err = URLGrabError(14, msg)
err.code = code
err.exception = e
raise err
+ else:
+ if self._error[1]:
+ msg = self._error[1]
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
def _do_open(self):
self.curl_obj = _curl_cache
@@ -1434,9 +1540,13 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
-
- self._do_perform()
-
+ try:
+ self._do_perform()
+ except URLGrabError, e:
+ self.fo.flush()
+ self.fo.close()
+ raise e
+
if _was_filename:
@@ -1446,9 +1556,23 @@ class PyCurlFileObject():
# set the time
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
# re open it
- self.fo = open(self.filename, 'r')
+ try:
+ self.fo = open(self.filename, 'r')
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
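
The error handling added above wraps os.utime() and open() failures in URLGrabError(16) so callers see a uniform failure type. In isolation, the timestamp-preserving step looks roughly like this (path handling simplified; RuntimeError stands in for URLGrabError):

    import os

    def set_mtime(path, mod_time):
        # libcurl reports -1 when the server gave no usable file time,
        # so only touch the timestamp when we really have one.
        if mod_time != -1:
            try:
                os.utime(path, (mod_time, mod_time))   # (atime, mtime)
            except OSError, e:
                # stand-in for the patch's URLGrabError(16, ...)
                raise RuntimeError('error setting timestamp on %s: %s'
                                   % (path, e))
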
@@ -1532,11 +1656,14 @@ class PyCurlFileObject():
def _over_max_size(self, cur, max_size=None):
if not max_size:
- max_size = self.size
- if self.opts.size: # if we set an opts size use that, no matter what
- max_size = self.opts.size
+ if not self.opts.size:
+ max_size = self.size
+ else:
+ max_size = self.opts.size
+
if not max_size: return False # if we have None for all of the Max then this is dumb
- if cur > max_size + max_size*.10:
+
+ if cur > int(float(max_size) * 1.10):
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
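
The rewritten size check allows 10% slack over the expected size before aborting the download. A self-contained sketch of the comparison, with made-up numbers:

    def over_max_size(cur, max_size):
        # Allow 10% over the advertised size before calling it an error.
        if not max_size:
            return False   # no limit known; nothing to enforce
        return cur > int(float(max_size) * 1.10)

    print over_max_size(1100001, 1000000)   # True  - more than 10% over
    print over_max_size(1050000, 1000000)   # False - within the slack
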
@@ -1582,9 +1709,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
-
+ def geturl(self):
+ """ Provide the geturl() method, used to be got from
+ urllib.addinfourl, via. urllib.URLopener.* """
+ return self.url
+
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+def reset_curl_obj():
+ """To make sure curl has reread the network/dns info we force a reload"""
+ global _curl_cache
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
+
+
+
#####################################################################
# DEPRECATED FUNCTIONS
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):
if seconds is None or seconds < 0:
if use_hours: return '--:--:--'
else: return '--:--'
+ elif seconds == float('inf'):
+ return 'Infinite'
else:
seconds = int(seconds)
minutes = seconds / 60
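
With this change format_time() reports 'Infinite' for an ETA of float('inf'), which arises when the measured rate is zero, instead of failing in int(). A condensed, runnable rendering of the resulting function:

    def format_time(seconds, use_hours=0):
        if seconds is None or seconds < 0:
            if use_hours: return '--:--:--'
            else:         return '--:--'
        elif seconds == float('inf'):
            return 'Infinite'
        else:
            seconds = int(seconds)
            minutes = seconds / 60
            seconds = seconds % 60
            if use_hours:
                hours = minutes / 60
                minutes = minutes % 60
                return '%02i:%02i:%02i' % (hours, minutes, seconds)
            else:
                return '%02i:%02i' % (minutes, seconds)

    print format_time(75)             # 01:15
    print format_time(float('inf'))   # Infinite
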