|
|
@ -1,16 +1,3 @@
|
|
|
|
diff --git a/.gitignore b/.gitignore
|
|
|
|
|
|
|
|
new file mode 100644
|
|
|
|
|
|
|
|
index 0000000..1ffe416
|
|
|
|
|
|
|
|
--- /dev/null
|
|
|
|
|
|
|
|
+++ b/.gitignore
|
|
|
|
|
|
|
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
+*.py[co]
|
|
|
|
|
|
|
|
+MANIFEST
|
|
|
|
|
|
|
|
+dist
|
|
|
|
|
|
|
|
+build
|
|
|
|
|
|
|
|
+*.kdev*
|
|
|
|
|
|
|
|
+*.kateproject
|
|
|
|
|
|
|
|
+ipython.log*
|
|
|
|
|
|
|
|
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
|
|
|
|
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
|
|
|
|
index 518e512..09cd896 100644
|
|
|
|
index 518e512..09cd896 100644
|
|
|
|
--- a/scripts/urlgrabber
|
|
|
|
--- a/scripts/urlgrabber
|
|
|
@ -83,8 +70,62 @@ index 50c6348..5fb43f9 100644
|
|
|
|
base_ftp = 'ftp://localhost/test/'
|
|
|
|
base_ftp = 'ftp://localhost/test/'
|
|
|
|
|
|
|
|
|
|
|
|
# set to a proftp server only. we're working around a couple of
|
|
|
|
# set to a proftp server only. we're working around a couple of
|
|
|
|
|
|
|
|
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
|
|
|
|
|
|
|
|
index 3e5f3b7..8eeaeda 100644
|
|
|
|
|
|
|
|
--- a/urlgrabber/byterange.py
|
|
|
|
|
|
|
|
+++ b/urlgrabber/byterange.py
|
|
|
|
|
|
|
|
@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def http_error_416(self, req, fp, code, msg, hdrs):
|
|
|
|
|
|
|
|
# HTTP's Range Not Satisfiable error
|
|
|
|
|
|
|
|
- raise RangeError('Requested Range Not Satisfiable')
|
|
|
|
|
|
|
|
+ raise RangeError(9, 'Requested Range Not Satisfiable')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class HTTPSRangeHandler(HTTPRangeHandler):
|
|
|
|
|
|
|
|
""" Range Header support for HTTPS. """
|
|
|
|
|
|
|
|
@@ -208,7 +208,7 @@ class RangeableFileObject:
|
|
|
|
|
|
|
|
bufsize = offset - pos
|
|
|
|
|
|
|
|
buf = self.fo.read(bufsize)
|
|
|
|
|
|
|
|
if len(buf) != bufsize:
|
|
|
|
|
|
|
|
- raise RangeError('Requested Range Not Satisfiable')
|
|
|
|
|
|
|
|
+ raise RangeError(9, 'Requested Range Not Satisfiable')
|
|
|
|
|
|
|
|
pos+= bufsize
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FileRangeHandler(urllib2.FileHandler):
|
|
|
|
|
|
|
|
@@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler):
|
|
|
|
|
|
|
|
(fb,lb) = brange
|
|
|
|
|
|
|
|
if lb == '': lb = size
|
|
|
|
|
|
|
|
if fb < 0 or fb > size or lb > size:
|
|
|
|
|
|
|
|
- raise RangeError('Requested Range Not Satisfiable')
|
|
|
|
|
|
|
|
+ raise RangeError(9, 'Requested Range Not Satisfiable')
|
|
|
|
|
|
|
|
size = (lb - fb)
|
|
|
|
|
|
|
|
fo = RangeableFileObject(fo, (fb,lb))
|
|
|
|
|
|
|
|
headers = mimetools.Message(StringIO(
|
|
|
|
|
|
|
|
@@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler):
|
|
|
|
|
|
|
|
(fb,lb) = range_tup
|
|
|
|
|
|
|
|
if lb == '':
|
|
|
|
|
|
|
|
if retrlen is None or retrlen == 0:
|
|
|
|
|
|
|
|
- raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
|
|
|
|
|
|
|
|
+ raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
|
|
|
|
|
|
|
|
lb = retrlen
|
|
|
|
|
|
|
|
retrlen = lb - fb
|
|
|
|
|
|
|
|
if retrlen < 0:
|
|
|
|
|
|
|
|
# beginning of range is larger than file
|
|
|
|
|
|
|
|
- raise RangeError('Requested Range Not Satisfiable')
|
|
|
|
|
|
|
|
+ raise RangeError(9, 'Requested Range Not Satisfiable')
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
retrlen = lb - fb
|
|
|
|
|
|
|
|
fp = RangeableFileObject(fp, (0,retrlen))
|
|
|
|
|
|
|
|
@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup):
|
|
|
|
|
|
|
|
# check if range is over the entire file
|
|
|
|
|
|
|
|
if (fb,lb) == (0,''): return None
|
|
|
|
|
|
|
|
# check that the range is valid
|
|
|
|
|
|
|
|
- if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
|
|
|
|
|
|
|
|
+ if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
|
|
|
|
|
|
|
|
return (fb,lb)
|
|
|
|
|
|
|
|
|
|
|
|
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
|
|
|
|
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
|
|
|
|
index e090e90..0c78857 100644
|
|
|
|
index e090e90..b2770c5 100644
|
|
|
|
--- a/urlgrabber/grabber.py
|
|
|
|
--- a/urlgrabber/grabber.py
|
|
|
|
+++ b/urlgrabber/grabber.py
|
|
|
|
+++ b/urlgrabber/grabber.py
|
|
|
|
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
@ -109,7 +150,20 @@ index e090e90..0c78857 100644
|
|
|
|
|
|
|
|
|
|
|
|
bandwidth = 0
|
|
|
|
bandwidth = 0
|
|
|
|
|
|
|
|
|
|
|
|
@@ -248,6 +248,11 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
@@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
|
|
|
|
control, you should probably subclass URLParser and pass it in via
|
|
|
|
|
|
|
|
the 'urlparser' option.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+ username = None
|
|
|
|
|
|
|
|
+ username to use for simple http auth - is automatically quoted for special characters
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
+ password = None
|
|
|
|
|
|
|
|
+ password to use for simple http auth - is automatically quoted for special characters
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
ssl_ca_cert = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
this option can be used if M2Crypto is available and will be
|
|
|
|
|
|
|
|
@@ -248,6 +254,11 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
Maximum size (in bytes) of the headers.
|
|
|
|
Maximum size (in bytes) of the headers.
|
|
|
|
|
|
|
|
|
|
|
@ -121,7 +175,15 @@ index e090e90..0c78857 100644
|
|
|
|
|
|
|
|
|
|
|
|
RETRY RELATED ARGUMENTS
|
|
|
|
RETRY RELATED ARGUMENTS
|
|
|
|
|
|
|
|
|
|
|
|
@@ -439,6 +444,12 @@ try:
|
|
|
|
@@ -420,6 +431,7 @@ import time
|
|
|
|
|
|
|
|
import string
|
|
|
|
|
|
|
|
import urllib
|
|
|
|
|
|
|
|
import urllib2
|
|
|
|
|
|
|
|
+from httplib import responses
|
|
|
|
|
|
|
|
import mimetools
|
|
|
|
|
|
|
|
import thread
|
|
|
|
|
|
|
|
import types
|
|
|
|
|
|
|
|
@@ -439,6 +451,12 @@ try:
|
|
|
|
except:
|
|
|
|
except:
|
|
|
|
__version__ = '???'
|
|
|
|
__version__ = '???'
|
|
|
|
|
|
|
|
|
|
|
@ -134,7 +196,38 @@ index e090e90..0c78857 100644
|
|
|
|
########################################################################
|
|
|
|
########################################################################
|
|
|
|
# functions for debugging output. These functions are here because they
|
|
|
|
# functions for debugging output. These functions are here because they
|
|
|
|
# are also part of the module initialization.
|
|
|
|
# are also part of the module initialization.
|
|
|
|
@@ -800,6 +811,7 @@ class URLGrabberOptions:
|
|
|
|
@@ -527,6 +545,22 @@ def _(st):
|
|
|
|
|
|
|
|
# END MODULE INITIALIZATION
|
|
|
|
|
|
|
|
########################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+########################################################################
|
|
|
|
|
|
|
|
+# UTILITY FUNCTIONS
|
|
|
|
|
|
|
|
+########################################################################
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
+# These functions are meant to be utilities for the urlgrabber library to use.
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
+def _to_utf8(obj, errors='replace'):
|
|
|
|
|
|
|
|
+ '''convert 'unicode' to an encoded utf-8 byte string '''
|
|
|
|
|
|
|
|
+ # stolen from yum.i18n
|
|
|
|
|
|
|
|
+ if isinstance(obj, unicode):
|
|
|
|
|
|
|
|
+ obj = obj.encode('utf-8', errors)
|
|
|
|
|
|
|
|
+ return obj
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
+########################################################################
|
|
|
|
|
|
|
|
+# END UTILITY FUNCTIONS
|
|
|
|
|
|
|
|
+########################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class URLGrabError(IOError):
|
|
|
|
|
|
|
|
@@ -662,6 +696,7 @@ class URLParser:
|
|
|
|
|
|
|
|
opts.quote = 0 --> do not quote it
|
|
|
|
|
|
|
|
opts.quote = None --> guess
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
+ url = _to_utf8(url)
|
|
|
|
|
|
|
|
quote = opts.quote
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if opts.prefix:
|
|
|
|
|
|
|
|
@@ -800,6 +835,7 @@ class URLGrabberOptions:
|
|
|
|
self.close_connection = 0
|
|
|
|
self.close_connection = 0
|
|
|
|
self.range = None
|
|
|
|
self.range = None
|
|
|
|
self.user_agent = 'urlgrabber/%s' % __version__
|
|
|
|
self.user_agent = 'urlgrabber/%s' % __version__
|
|
|
@ -142,7 +235,7 @@ index e090e90..0c78857 100644
|
|
|
|
self.keepalive = 1
|
|
|
|
self.keepalive = 1
|
|
|
|
self.proxies = None
|
|
|
|
self.proxies = None
|
|
|
|
self.reget = None
|
|
|
|
self.reget = None
|
|
|
|
@@ -808,7 +820,7 @@ class URLGrabberOptions:
|
|
|
|
@@ -808,13 +844,15 @@ class URLGrabberOptions:
|
|
|
|
self.prefix = None
|
|
|
|
self.prefix = None
|
|
|
|
self.opener = None
|
|
|
|
self.opener = None
|
|
|
|
self.cache_openers = True
|
|
|
|
self.cache_openers = True
|
|
|
@ -151,7 +244,24 @@ index e090e90..0c78857 100644
|
|
|
|
self.text = None
|
|
|
|
self.text = None
|
|
|
|
self.http_headers = None
|
|
|
|
self.http_headers = None
|
|
|
|
self.ftp_headers = None
|
|
|
|
self.ftp_headers = None
|
|
|
|
@@ -931,6 +943,9 @@ class URLGrabber:
|
|
|
|
self.data = None
|
|
|
|
|
|
|
|
self.urlparser = URLParser()
|
|
|
|
|
|
|
|
self.quote = None
|
|
|
|
|
|
|
|
+ self.username = None
|
|
|
|
|
|
|
|
+ self.password = None
|
|
|
|
|
|
|
|
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
|
|
|
|
|
|
|
|
self.ssl_context = None # no-op in pycurl
|
|
|
|
|
|
|
|
self.ssl_verify_peer = True # check peer's cert for authenticityb
|
|
|
|
|
|
|
|
@@ -846,7 +884,7 @@ class URLGrabberOptions:
|
|
|
|
|
|
|
|
s = s + indent + '}'
|
|
|
|
|
|
|
|
return s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-class URLGrabber:
|
|
|
|
|
|
|
|
+class URLGrabber(object):
|
|
|
|
|
|
|
|
"""Provides easy opening of URLs with a variety of options.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
All options are specified as kwargs. Options may be specified when
|
|
|
|
|
|
|
|
@@ -931,6 +969,9 @@ class URLGrabber:
|
|
|
|
(scheme, host, path, parm, query, frag) = parts
|
|
|
|
(scheme, host, path, parm, query, frag) = parts
|
|
|
|
if filename is None:
|
|
|
|
if filename is None:
|
|
|
|
filename = os.path.basename( urllib.unquote(path) )
|
|
|
|
filename = os.path.basename( urllib.unquote(path) )
|
|
|
@ -161,7 +271,16 @@ index e090e90..0c78857 100644
|
|
|
|
if scheme == 'file' and not opts.copy_local:
|
|
|
|
if scheme == 'file' and not opts.copy_local:
|
|
|
|
# just return the name of the local file - don't make a
|
|
|
|
# just return the name of the local file - don't make a
|
|
|
|
# copy currently
|
|
|
|
# copy currently
|
|
|
|
@@ -1052,9 +1067,15 @@ class PyCurlFileObject():
|
|
|
|
@@ -1030,7 +1071,7 @@ class URLGrabber:
|
|
|
|
|
|
|
|
default_grabber = URLGrabber()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-class PyCurlFileObject():
|
|
|
|
|
|
|
|
+class PyCurlFileObject(object):
|
|
|
|
|
|
|
|
def __init__(self, url, filename, opts):
|
|
|
|
|
|
|
|
self.fo = None
|
|
|
|
|
|
|
|
self._hdr_dump = ''
|
|
|
|
|
|
|
|
@@ -1052,9 +1093,15 @@ class PyCurlFileObject():
|
|
|
|
self._reget_length = 0
|
|
|
|
self._reget_length = 0
|
|
|
|
self._prog_running = False
|
|
|
|
self._prog_running = False
|
|
|
|
self._error = (None, None)
|
|
|
|
self._error = (None, None)
|
|
|
@ -178,7 +297,7 @@ index e090e90..0c78857 100644
|
|
|
|
|
|
|
|
|
|
|
|
def __getattr__(self, name):
|
|
|
|
def __getattr__(self, name):
|
|
|
|
"""This effectively allows us to wrap at the instance level.
|
|
|
|
"""This effectively allows us to wrap at the instance level.
|
|
|
|
@@ -1085,9 +1106,14 @@ class PyCurlFileObject():
|
|
|
|
@@ -1085,9 +1132,14 @@ class PyCurlFileObject():
|
|
|
|
return -1
|
|
|
|
return -1
|
|
|
|
|
|
|
|
|
|
|
|
def _hdr_retrieve(self, buf):
|
|
|
|
def _hdr_retrieve(self, buf):
|
|
|
@ -194,7 +313,7 @@ index e090e90..0c78857 100644
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
self._hdr_dump += buf
|
|
|
|
self._hdr_dump += buf
|
|
|
|
# we have to get the size before we do the progress obj start
|
|
|
|
# we have to get the size before we do the progress obj start
|
|
|
|
@@ -1104,7 +1130,17 @@ class PyCurlFileObject():
|
|
|
|
@@ -1104,7 +1156,17 @@ class PyCurlFileObject():
|
|
|
|
s = parse150(buf)
|
|
|
|
s = parse150(buf)
|
|
|
|
if s:
|
|
|
|
if s:
|
|
|
|
self.size = int(s)
|
|
|
|
self.size = int(s)
|
|
|
@ -213,7 +332,7 @@ index e090e90..0c78857 100644
|
|
|
|
return len(buf)
|
|
|
|
return len(buf)
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
return pycurl.READFUNC_ABORT
|
|
|
|
return pycurl.READFUNC_ABORT
|
|
|
|
@@ -1113,8 +1149,10 @@ class PyCurlFileObject():
|
|
|
|
@@ -1113,8 +1175,10 @@ class PyCurlFileObject():
|
|
|
|
if self._parsed_hdr:
|
|
|
|
if self._parsed_hdr:
|
|
|
|
return self._parsed_hdr
|
|
|
|
return self._parsed_hdr
|
|
|
|
statusend = self._hdr_dump.find('\n')
|
|
|
|
statusend = self._hdr_dump.find('\n')
|
|
|
@ -224,7 +343,7 @@ index e090e90..0c78857 100644
|
|
|
|
self._parsed_hdr = mimetools.Message(hdrfp)
|
|
|
|
self._parsed_hdr = mimetools.Message(hdrfp)
|
|
|
|
return self._parsed_hdr
|
|
|
|
return self._parsed_hdr
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1136,11 +1174,21 @@ class PyCurlFileObject():
|
|
|
|
@@ -1136,11 +1200,21 @@ class PyCurlFileObject():
|
|
|
|
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
|
|
|
|
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
|
|
|
|
self.curl_obj.setopt(pycurl.FAILONERROR, True)
|
|
|
|
self.curl_obj.setopt(pycurl.FAILONERROR, True)
|
|
|
|
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
|
|
|
|
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
|
|
|
@ -246,7 +365,7 @@ index e090e90..0c78857 100644
|
|
|
|
|
|
|
|
|
|
|
|
# maybe to be options later
|
|
|
|
# maybe to be options later
|
|
|
|
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
|
|
|
|
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
|
|
|
|
@@ -1148,9 +1196,11 @@ class PyCurlFileObject():
|
|
|
|
@@ -1148,9 +1222,11 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
|
|
# timeouts
|
|
|
|
# timeouts
|
|
|
|
timeout = 300
|
|
|
|
timeout = 300
|
|
|
@ -261,26 +380,115 @@ index e090e90..0c78857 100644
|
|
|
|
|
|
|
|
|
|
|
|
# ssl options
|
|
|
|
# ssl options
|
|
|
|
if self.scheme == 'https':
|
|
|
|
if self.scheme == 'https':
|
|
|
|
@@ -1276,7 +1326,7 @@ class PyCurlFileObject():
|
|
|
|
@@ -1203,12 +1279,19 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
if proxy == '_none_': proxy = ""
|
|
|
|
|
|
|
|
self.curl_obj.setopt(pycurl.PROXY, proxy)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
- # FIXME username/password/auth settings
|
|
|
|
|
|
|
|
+ if opts.username and opts.password:
|
|
|
|
|
|
|
|
+ if self.scheme in ('http', 'https'):
|
|
|
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
+ if opts.username and opts.password:
|
|
|
|
|
|
|
|
+ # apparently when applying them as curlopts they do not require quoting of any kind
|
|
|
|
|
|
|
|
+ userpwd = '%s:%s' % (opts.username, opts.password)
|
|
|
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.USERPWD, userpwd)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#posts - simple - expects the fields as they are
|
|
|
|
|
|
|
|
if opts.data:
|
|
|
|
|
|
|
|
self.curl_obj.setopt(pycurl.POST, True)
|
|
|
|
|
|
|
|
- self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
|
|
|
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# our url
|
|
|
|
|
|
|
|
self.curl_obj.setopt(pycurl.URL, self.url)
|
|
|
|
|
|
|
|
@@ -1228,12 +1311,14 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
code = self.http_code
|
|
|
|
|
|
|
|
errcode = e.args[0]
|
|
|
|
|
|
|
|
+ errurl = urllib.unquote(self.url)
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
if self._error[0]:
|
|
|
|
|
|
|
|
errcode = self._error[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if errcode == 23 and code >= 200 and code < 299:
|
|
|
|
|
|
|
|
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
|
|
|
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
|
|
|
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
|
|
|
|
|
|
|
|
+ err.url = errurl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# this is probably wrong but ultimately this is what happens
|
|
|
|
|
|
|
|
# we have a legit http code and a pycurl 'writer failed' code
|
|
|
|
|
|
|
|
@@ -1244,23 +1329,23 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
raise KeyboardInterrupt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
elif errcode == 28:
|
|
|
|
|
|
|
|
- err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
|
|
|
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
|
|
|
+ err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
|
|
|
|
|
|
|
|
+ err.url = errurl
|
|
|
|
|
|
|
|
raise err
|
|
|
|
|
|
|
|
elif errcode == 35:
|
|
|
|
|
|
|
|
msg = _("problem making ssl connection")
|
|
|
|
|
|
|
|
err = URLGrabError(14, msg)
|
|
|
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
|
|
|
+ err.url = errurl
|
|
|
|
|
|
|
|
raise err
|
|
|
|
|
|
|
|
elif errcode == 37:
|
|
|
|
|
|
|
|
- msg = _("Could not open/read %s") % (self.url)
|
|
|
|
|
|
|
|
+ msg = _("Could not open/read %s") % (errurl)
|
|
|
|
|
|
|
|
err = URLGrabError(14, msg)
|
|
|
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
|
|
|
+ err.url = errurl
|
|
|
|
|
|
|
|
raise err
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
elif errcode == 42:
|
|
|
|
|
|
|
|
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
|
|
|
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
|
|
|
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
|
|
|
|
|
|
|
|
+ err.url = errurl
|
|
|
|
|
|
|
|
# this is probably wrong but ultimately this is what happens
|
|
|
|
|
|
|
|
# we have a legit http code and a pycurl 'writer failed' code
|
|
|
|
|
|
|
|
# which almost always means something aborted it from outside
|
|
|
|
|
|
|
|
@@ -1272,33 +1357,93 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
elif errcode == 58:
|
|
|
|
|
|
|
|
msg = _("problem with the local client certificate")
|
|
|
|
|
|
|
|
err = URLGrabError(14, msg)
|
|
|
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
|
|
|
+ err.url = errurl
|
|
|
|
raise err
|
|
|
|
raise err
|
|
|
|
|
|
|
|
|
|
|
|
elif errcode == 60:
|
|
|
|
elif errcode == 60:
|
|
|
|
- msg = _("client cert cannot be verified or client cert incorrect")
|
|
|
|
- msg = _("client cert cannot be verified or client cert incorrect")
|
|
|
|
+ msg = _("Peer cert cannot be verified or peer cert invalid")
|
|
|
|
+ msg = _("Peer cert cannot be verified or peer cert invalid")
|
|
|
|
err = URLGrabError(14, msg)
|
|
|
|
err = URLGrabError(14, msg)
|
|
|
|
err.url = self.url
|
|
|
|
- err.url = self.url
|
|
|
|
|
|
|
|
+ err.url = errurl
|
|
|
|
raise err
|
|
|
|
raise err
|
|
|
|
@@ -1291,14 +1341,70 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
|
|
elif errcode == 63:
|
|
|
|
|
|
|
|
if self._error[1]:
|
|
|
|
|
|
|
|
msg = self._error[1]
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
- msg = _("Max download size exceeded on %s") % (self.url)
|
|
|
|
|
|
|
|
+ msg = _("Max download size exceeded on %s") % (errurl)
|
|
|
|
|
|
|
|
err = URLGrabError(14, msg)
|
|
|
|
|
|
|
|
- err.url = self.url
|
|
|
|
|
|
|
|
+ err.url = errurl
|
|
|
|
raise err
|
|
|
|
raise err
|
|
|
|
|
|
|
|
|
|
|
|
elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
|
|
|
|
elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
|
|
|
|
- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
|
|
|
|
- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
|
|
|
|
+ if self.scheme in ['http', 'https']:
|
|
|
|
+ if self.scheme in ['http', 'https']:
|
|
|
|
+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
|
|
|
|
+ if self.http_code in responses:
|
|
|
|
|
|
|
|
+ resp = responses[self.http_code]
|
|
|
|
|
|
|
|
+ msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
|
|
|
|
|
|
|
|
+ else:
|
|
|
|
|
|
|
|
+ msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
|
|
|
|
+ elif self.scheme in ['ftp']:
|
|
|
|
+ elif self.scheme in ['ftp']:
|
|
|
|
+ msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
|
|
|
|
+ msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
|
|
|
|
+ else:
|
|
|
|
+ else:
|
|
|
|
+ msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
|
|
|
|
+ msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
- msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
|
|
|
|
- msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
|
|
|
|
+ pyerr2str = { 5 : _("Couldn't resolve proxy"),
|
|
|
|
+ pyerr2str = { 5 : _("Couldn't resolve proxy"),
|
|
|
@ -338,12 +546,25 @@ index e090e90..0c78857 100644
|
|
|
|
+ if self._error[1]:
|
|
|
|
+ if self._error[1]:
|
|
|
|
+ msg = self._error[1]
|
|
|
|
+ msg = self._error[1]
|
|
|
|
+ err = URLGrabError(14, msg)
|
|
|
|
+ err = URLGrabError(14, msg)
|
|
|
|
+ err.url = self.url
|
|
|
|
+ err.url = urllib.unquote(self.url)
|
|
|
|
+ raise err
|
|
|
|
+ raise err
|
|
|
|
|
|
|
|
|
|
|
|
def _do_open(self):
|
|
|
|
def _do_open(self):
|
|
|
|
self.curl_obj = _curl_cache
|
|
|
|
self.curl_obj = _curl_cache
|
|
|
|
@@ -1434,9 +1540,13 @@ class PyCurlFileObject():
|
|
|
|
@@ -1333,7 +1478,11 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if self.opts.range:
|
|
|
|
|
|
|
|
rt = self.opts.range
|
|
|
|
|
|
|
|
- if rt[0]: rt = (rt[0] + reget_length, rt[1])
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
+ if rt[0] is None:
|
|
|
|
|
|
|
|
+ rt = (0, rt[1])
|
|
|
|
|
|
|
|
+ rt = (rt[0] + reget_length, rt[1])
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if rt:
|
|
|
|
|
|
|
|
header = range_tuple_to_header(rt)
|
|
|
|
|
|
|
|
@@ -1434,9 +1583,13 @@ class PyCurlFileObject():
|
|
|
|
#fh, self._temp_name = mkstemp()
|
|
|
|
#fh, self._temp_name = mkstemp()
|
|
|
|
#self.fo = open(self._temp_name, 'wb')
|
|
|
|
#self.fo = open(self._temp_name, 'wb')
|
|
|
|
|
|
|
|
|
|
|
@ -360,7 +581,7 @@ index e090e90..0c78857 100644
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if _was_filename:
|
|
|
|
if _was_filename:
|
|
|
|
@@ -1446,9 +1556,23 @@ class PyCurlFileObject():
|
|
|
|
@@ -1446,9 +1599,23 @@ class PyCurlFileObject():
|
|
|
|
# set the time
|
|
|
|
# set the time
|
|
|
|
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
|
|
|
|
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
|
|
|
|
if mod_time != -1:
|
|
|
|
if mod_time != -1:
|
|
|
@ -386,7 +607,7 @@ index e090e90..0c78857 100644
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
#self.fo = open(self._temp_name, 'r')
|
|
|
|
#self.fo = open(self._temp_name, 'r')
|
|
|
|
self.fo.seek(0)
|
|
|
|
self.fo.seek(0)
|
|
|
|
@@ -1532,11 +1656,14 @@ class PyCurlFileObject():
|
|
|
|
@@ -1532,11 +1699,14 @@ class PyCurlFileObject():
|
|
|
|
def _over_max_size(self, cur, max_size=None):
|
|
|
|
def _over_max_size(self, cur, max_size=None):
|
|
|
|
|
|
|
|
|
|
|
|
if not max_size:
|
|
|
|
if not max_size:
|
|
|
@ -405,7 +626,21 @@ index e090e90..0c78857 100644
|
|
|
|
|
|
|
|
|
|
|
|
msg = _("Downloaded more than max size for %s: %s > %s") \
|
|
|
|
msg = _("Downloaded more than max size for %s: %s > %s") \
|
|
|
|
% (self.url, cur, max_size)
|
|
|
|
% (self.url, cur, max_size)
|
|
|
|
@@ -1582,9 +1709,21 @@ class PyCurlFileObject():
|
|
|
|
@@ -1544,13 +1714,6 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
- def _to_utf8(self, obj, errors='replace'):
|
|
|
|
|
|
|
|
- '''convert 'unicode' to an encoded utf-8 byte string '''
|
|
|
|
|
|
|
|
- # stolen from yum.i18n
|
|
|
|
|
|
|
|
- if isinstance(obj, unicode):
|
|
|
|
|
|
|
|
- obj = obj.encode('utf-8', errors)
|
|
|
|
|
|
|
|
- return obj
|
|
|
|
|
|
|
|
-
|
|
|
|
|
|
|
|
def read(self, amt=None):
|
|
|
|
|
|
|
|
self._fill_buffer(amt)
|
|
|
|
|
|
|
|
if amt is None:
|
|
|
|
|
|
|
|
@@ -1582,9 +1745,21 @@ class PyCurlFileObject():
|
|
|
|
self.opts.progress_obj.end(self._amount_read)
|
|
|
|
self.opts.progress_obj.end(self._amount_read)
|
|
|
|
self.fo.close()
|
|
|
|
self.fo.close()
|
|
|
|
|
|
|
|
|
|
|
@ -428,6 +663,29 @@ index e090e90..0c78857 100644
|
|
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
#####################################################################
|
|
|
|
# DEPRECATED FUNCTIONS
|
|
|
|
# DEPRECATED FUNCTIONS
|
|
|
|
|
|
|
|
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
|
|
|
|
|
|
|
|
index dad410b..8731aed 100644
|
|
|
|
|
|
|
|
--- a/urlgrabber/mirror.py
|
|
|
|
|
|
|
|
+++ b/urlgrabber/mirror.py
|
|
|
|
|
|
|
|
@@ -90,7 +90,7 @@ CUSTOMIZATION
|
|
|
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
import thread # needed for locking to make this threadsafe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-from grabber import URLGrabError, CallbackObject, DEBUG
|
|
|
|
|
|
|
|
+from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _(st):
|
|
|
|
|
|
|
|
return st
|
|
|
|
|
|
|
|
@@ -263,7 +263,8 @@ class MirrorGroup:
|
|
|
|
|
|
|
|
def _parse_mirrors(self, mirrors):
|
|
|
|
|
|
|
|
parsed_mirrors = []
|
|
|
|
|
|
|
|
for m in mirrors:
|
|
|
|
|
|
|
|
- if type(m) == type(''): m = {'mirror': m}
|
|
|
|
|
|
|
|
+ if isinstance(m, basestring):
|
|
|
|
|
|
|
|
+ m = {'mirror': _to_utf8(m)}
|
|
|
|
|
|
|
|
parsed_mirrors.append(m)
|
|
|
|
|
|
|
|
return parsed_mirrors
|
|
|
|
|
|
|
|
|
|
|
|
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
|
|
|
|
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
|
|
|
|
index dd07c6a..45eb248 100644
|
|
|
|
index dd07c6a..45eb248 100644
|
|
|
|
--- a/urlgrabber/progress.py
|
|
|
|
--- a/urlgrabber/progress.py
|
|
|
|