|
|
|
@ -11,6 +11,19 @@ index 0000000..1ffe416
|
|
|
|
|
+*.kdev*
|
|
|
|
|
+*.kateproject
|
|
|
|
|
+ipython.log*
|
|
|
|
|
diff --git a/README b/README
|
|
|
|
|
index 5fd378b..2718d3c 100644
|
|
|
|
|
--- a/README
|
|
|
|
|
+++ b/README
|
|
|
|
|
@@ -19,7 +19,7 @@ You can build rpms by running
|
|
|
|
|
python setup.py bdist_rpm
|
|
|
|
|
|
|
|
|
|
The rpms (both source and "binary") will be specific to the current
|
|
|
|
|
-distrubution/version and may not be portable to others. This is
|
|
|
|
|
+distribution/version and may not be portable to others. This is
|
|
|
|
|
because they will be built for the currently installed python.
|
|
|
|
|
|
|
|
|
|
keepalive.py and byterange.py are generic urllib2 extension modules and
|
|
|
|
|
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
|
|
|
|
|
index 518e512..07881b3 100644
|
|
|
|
|
--- a/scripts/urlgrabber
|
|
|
|
@ -189,6 +202,63 @@ index 50c6348..5fb43f9 100644
|
|
|
|
|
base_ftp = 'ftp://localhost/test/'
|
|
|
|
|
|
|
|
|
|
# set to a proftp server only. we're working around a couple of
|
|
|
|
|
diff --git a/test/munittest.py b/test/munittest.py
|
|
|
|
|
index 96230b8..16a61ae 100644
|
|
|
|
|
--- a/test/munittest.py
|
|
|
|
|
+++ b/test/munittest.py
|
|
|
|
|
@@ -113,7 +113,7 @@ import types
|
|
|
|
|
__all__ = ['TestResult', 'TestCase', 'TestSuite', 'TextTestRunner',
|
|
|
|
|
'TestLoader', 'FunctionTestCase', 'main', 'defaultTestLoader']
|
|
|
|
|
|
|
|
|
|
-# Expose obsolete functions for backwards compatability
|
|
|
|
|
+# Expose obsolete functions for backwards compatibility
|
|
|
|
|
__all__.extend(['getTestCaseNames', 'makeSuite', 'findTestCases'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -410,7 +410,7 @@ class TestCase:
|
|
|
|
|
(default 7) and comparing to zero.
|
|
|
|
|
|
|
|
|
|
Note that decimal places (from zero) is usually not the same
|
|
|
|
|
- as significant digits (measured from the most signficant digit).
|
|
|
|
|
+ as significant digits (measured from the most significant digit).
|
|
|
|
|
"""
|
|
|
|
|
if round(second-first, places) != 0:
|
|
|
|
|
raise self.failureException, \
|
|
|
|
|
@@ -422,7 +422,7 @@ class TestCase:
|
|
|
|
|
(default 7) and comparing to zero.
|
|
|
|
|
|
|
|
|
|
Note that decimal places (from zero) is usually not the same
|
|
|
|
|
- as significant digits (measured from the most signficant digit).
|
|
|
|
|
+ as significant digits (measured from the most significant digit).
|
|
|
|
|
"""
|
|
|
|
|
if round(second-first, places) == 0:
|
|
|
|
|
raise self.failureException, \
|
|
|
|
|
diff --git a/test/test_byterange.py b/test/test_byterange.py
|
|
|
|
|
index 96f1573..fe7e105 100644
|
|
|
|
|
--- a/test/test_byterange.py
|
|
|
|
|
+++ b/test/test_byterange.py
|
|
|
|
|
@@ -56,7 +56,7 @@ class RangeableFileObjectTestCase(TestCase):
|
|
|
|
|
"""RangeableFileObject.seek() poor mans version..
|
|
|
|
|
|
|
|
|
|
We just delete the seek method from StringIO so we can
|
|
|
|
|
- excercise RangeableFileObject when the file object supplied
|
|
|
|
|
+ exercise RangeableFileObject when the file object supplied
|
|
|
|
|
doesn't support seek.
|
|
|
|
|
"""
|
|
|
|
|
seek = StringIO.seek
|
|
|
|
|
diff --git a/test/test_grabber.py b/test/test_grabber.py
|
|
|
|
|
index eecdbcf..d3a7692 100644
|
|
|
|
|
--- a/test/test_grabber.py
|
|
|
|
|
+++ b/test/test_grabber.py
|
|
|
|
|
@@ -86,7 +86,7 @@ class FileObjectTests(TestCase):
|
|
|
|
|
|
|
|
|
|
class HTTPTests(TestCase):
|
|
|
|
|
def test_reference_file(self):
|
|
|
|
|
- "download refernce file via HTTP"
|
|
|
|
|
+ "download reference file via HTTP"
|
|
|
|
|
filename = tempfile.mktemp()
|
|
|
|
|
grabber.urlgrab(ref_http, filename)
|
|
|
|
|
|
|
|
|
|
diff --git a/test/test_mirror.py b/test/test_mirror.py
|
|
|
|
|
index 70fe069..6fdb668 100644
|
|
|
|
|
--- a/test/test_mirror.py
|
|
|
|
@ -260,9 +330,18 @@ index 70fe069..6fdb668 100644
|
|
|
|
|
tl = TestLoader()
|
|
|
|
|
return tl.loadTestsFromModule(sys.modules[__name__])
|
|
|
|
|
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
|
|
|
|
|
index 3e5f3b7..8eeaeda 100644
|
|
|
|
|
index 3e5f3b7..5efa160 100644
|
|
|
|
|
--- a/urlgrabber/byterange.py
|
|
|
|
|
+++ b/urlgrabber/byterange.py
|
|
|
|
|
@@ -40,7 +40,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
|
|
|
|
|
|
|
|
|
|
This was extremely simple. The Range header is a HTTP feature to
|
|
|
|
|
begin with so all this class does is tell urllib2 that the
|
|
|
|
|
- "206 Partial Content" reponse from the HTTP server is what we
|
|
|
|
|
+ "206 Partial Content" response from the HTTP server is what we
|
|
|
|
|
expected.
|
|
|
|
|
|
|
|
|
|
Example:
|
|
|
|
|
@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
|
|
|
|
|
|
|
|
|
|
def http_error_416(self, req, fp, code, msg, hdrs):
|
|
|
|
@ -305,6 +384,15 @@ index 3e5f3b7..8eeaeda 100644
|
|
|
|
|
else:
|
|
|
|
|
retrlen = lb - fb
|
|
|
|
|
fp = RangeableFileObject(fp, (0,retrlen))
|
|
|
|
|
@@ -442,7 +442,7 @@ def range_tuple_normalize(range_tup):
|
|
|
|
|
Return a tuple whose first element is guaranteed to be an int
|
|
|
|
|
and whose second element will be '' (meaning: the last byte) or
|
|
|
|
|
an int. Finally, return None if the normalized tuple == (0,'')
|
|
|
|
|
- as that is equivelant to retrieving the entire file.
|
|
|
|
|
+ as that is equivalent to retrieving the entire file.
|
|
|
|
|
"""
|
|
|
|
|
if range_tup is None: return None
|
|
|
|
|
# handle first byte
|
|
|
|
|
@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup):
|
|
|
|
|
# check if range is over the entire file
|
|
|
|
|
if (fb,lb) == (0,''): return None
|
|
|
|
@ -314,9 +402,18 @@ index 3e5f3b7..8eeaeda 100644
|
|
|
|
|
return (fb,lb)
|
|
|
|
|
|
|
|
|
|
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
|
|
|
|
|
index e090e90..6b409e3 100644
|
|
|
|
|
index e090e90..30a8bdb 100644
|
|
|
|
|
--- a/urlgrabber/grabber.py
|
|
|
|
|
+++ b/urlgrabber/grabber.py
|
|
|
|
|
@@ -35,7 +35,7 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
|
close_connection = 0 [0|1]
|
|
|
|
|
|
|
|
|
|
tells URLGrabber to close the connection after a file has been
|
|
|
|
|
- transfered. This is ignored unless the download happens with the
|
|
|
|
|
+ transferred. This is ignored unless the download happens with the
|
|
|
|
|
http keepalive handler (keepalive=1). Otherwise, the connection
|
|
|
|
|
is left open for further use. The module level default for this
|
|
|
|
|
option is 0 (keepalive connections will not be closed).
|
|
|
|
|
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
|
progress_obj = None
|
|
|
|
|
|
|
|
|
@ -373,6 +470,15 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
bandwidth = 0
|
|
|
|
|
|
|
|
|
|
@@ -91,7 +112,7 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
|
range to retrieve. Either or both of the values may set to
|
|
|
|
|
None. If first_byte is None, byte offset 0 is assumed. If
|
|
|
|
|
last_byte is None, the last byte available is assumed. Note that
|
|
|
|
|
- the range specification is python-like in that (0,10) will yeild
|
|
|
|
|
+ the range specification is python-like in that (0,10) will yield
|
|
|
|
|
the first 10 bytes of the file.
|
|
|
|
|
|
|
|
|
|
If set to None, no range will be used.
|
|
|
|
|
@@ -143,8 +164,12 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
|
note that proxy authentication information may be provided using
|
|
|
|
|
normal URL constructs:
|
|
|
|
@ -401,7 +507,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
ssl_ca_cert = None
|
|
|
|
|
|
|
|
|
|
this option can be used if M2Crypto is available and will be
|
|
|
|
|
@@ -211,43 +242,75 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
|
@@ -211,43 +242,82 @@ GENERAL ARGUMENTS (kwargs)
|
|
|
|
|
No-op when using the curl backend (default)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -482,11 +588,18 @@ index e090e90..6b409e3 100644
|
|
|
|
|
+ The speed estimate also drifts exponentially from the speed
|
|
|
|
|
+ actually measured to the default speed, with default
|
|
|
|
|
+ period of 30 days.
|
|
|
|
|
+
|
|
|
|
|
+ ftp_disable_epsv = False
|
|
|
|
|
+
|
|
|
|
|
+ False, True
|
|
|
|
|
+
|
|
|
|
|
+ This option disables Extended Passive Mode (the EPSV command)
|
|
|
|
|
+ which does not work correctly on some buggy ftp servers.
|
|
|
|
|
+
|
|
|
|
|
|
|
|
|
|
RETRY RELATED ARGUMENTS
|
|
|
|
|
|
|
|
|
|
@@ -328,6 +391,15 @@ RETRY RELATED ARGUMENTS
|
|
|
|
|
@@ -328,6 +398,15 @@ RETRY RELATED ARGUMENTS
|
|
|
|
|
but it cannot (without severe trickiness) prevent the exception
|
|
|
|
|
from being raised.
|
|
|
|
|
|
|
|
|
@ -502,7 +615,19 @@ index e090e90..6b409e3 100644
|
|
|
|
|
interrupt_callback = None
|
|
|
|
|
|
|
|
|
|
This callback is called if KeyboardInterrupt is received at any
|
|
|
|
|
@@ -420,6 +492,7 @@ import time
|
|
|
|
|
@@ -368,6 +447,11 @@ BANDWIDTH THROTTLING
|
|
|
|
|
is a float and bandwidth == 0, throttling is disabled. If None, the
|
|
|
|
|
module-level default (which can be set with set_bandwidth) is used.
|
|
|
|
|
|
|
|
|
|
+ Note that when multiple downloads run simultaneously (multiprocessing
|
|
|
|
|
+ or the parallel urlgrab() feature is used) the total bandwidth might
|
|
|
|
|
+ exceed the throttle limit. You may want to also set max_connections=1
|
|
|
|
|
+ or scale your throttle option down accordingly.
|
|
|
|
|
+
|
|
|
|
|
THROTTLING EXAMPLES:
|
|
|
|
|
|
|
|
|
|
Lets say you have a 100 Mbps connection. This is (about) 10^8 bits
|
|
|
|
|
@@ -420,6 +504,7 @@ import time
|
|
|
|
|
import string
|
|
|
|
|
import urllib
|
|
|
|
|
import urllib2
|
|
|
|
@ -510,7 +635,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
import mimetools
|
|
|
|
|
import thread
|
|
|
|
|
import types
|
|
|
|
|
@@ -428,9 +501,17 @@ import pycurl
|
|
|
|
|
@@ -428,9 +513,17 @@ import pycurl
|
|
|
|
|
from ftplib import parse150
|
|
|
|
|
from StringIO import StringIO
|
|
|
|
|
from httplib import HTTPException
|
|
|
|
@ -529,7 +654,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
########################################################################
|
|
|
|
|
# MODULE INITIALIZATION
|
|
|
|
|
########################################################################
|
|
|
|
|
@@ -439,6 +520,12 @@ try:
|
|
|
|
|
@@ -439,6 +532,12 @@ try:
|
|
|
|
|
except:
|
|
|
|
|
__version__ = '???'
|
|
|
|
|
|
|
|
|
@ -542,7 +667,16 @@ index e090e90..6b409e3 100644
|
|
|
|
|
########################################################################
|
|
|
|
|
# functions for debugging output. These functions are here because they
|
|
|
|
|
# are also part of the module initialization.
|
|
|
|
|
@@ -504,6 +591,7 @@ def _init_default_logger(logspec=None):
|
|
|
|
|
@@ -482,7 +581,7 @@ def _init_default_logger(logspec=None):
|
|
|
|
|
URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
|
|
|
|
|
URLGRABBER_DEBUG=INFO # log info and higher to stderr
|
|
|
|
|
|
|
|
|
|
- This funtion is called during module initialization. It is not
|
|
|
|
|
+ This function is called during module initialization. It is not
|
|
|
|
|
intended to be called from outside. The only reason it is a
|
|
|
|
|
function at all is to keep the module-level namespace tidy and to
|
|
|
|
|
collect the code into a nice block.'''
|
|
|
|
|
@@ -504,6 +603,7 @@ def _init_default_logger(logspec=None):
|
|
|
|
|
else: handler = logging.FileHandler(filename)
|
|
|
|
|
handler.setFormatter(formatter)
|
|
|
|
|
DBOBJ = logging.getLogger('urlgrabber')
|
|
|
|
@ -550,7 +684,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
DBOBJ.addHandler(handler)
|
|
|
|
|
DBOBJ.setLevel(level)
|
|
|
|
|
except (KeyError, ImportError, ValueError):
|
|
|
|
|
@@ -512,8 +600,8 @@ def _init_default_logger(logspec=None):
|
|
|
|
|
@@ -512,8 +612,8 @@ def _init_default_logger(logspec=None):
|
|
|
|
|
|
|
|
|
|
def _log_package_state():
|
|
|
|
|
if not DEBUG: return
|
|
|
|
@ -561,7 +695,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
_init_default_logger()
|
|
|
|
|
_log_package_state()
|
|
|
|
|
@@ -527,6 +615,29 @@ def _(st):
|
|
|
|
|
@@ -527,6 +627,29 @@ def _(st):
|
|
|
|
|
# END MODULE INITIALIZATION
|
|
|
|
|
########################################################################
|
|
|
|
|
|
|
|
|
@ -591,7 +725,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class URLGrabError(IOError):
|
|
|
|
|
@@ -662,6 +773,7 @@ class URLParser:
|
|
|
|
|
@@ -662,6 +785,7 @@ class URLParser:
|
|
|
|
|
opts.quote = 0 --> do not quote it
|
|
|
|
|
opts.quote = None --> guess
|
|
|
|
|
"""
|
|
|
|
@ -599,7 +733,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
quote = opts.quote
|
|
|
|
|
|
|
|
|
|
if opts.prefix:
|
|
|
|
|
@@ -768,6 +880,41 @@ class URLGrabberOptions:
|
|
|
|
|
@@ -768,6 +892,41 @@ class URLGrabberOptions:
|
|
|
|
|
else: # throttle is a float
|
|
|
|
|
return self.bandwidth * self.throttle
|
|
|
|
|
|
|
|
|
@ -641,7 +775,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
def derive(self, **kwargs):
|
|
|
|
|
"""Create a derived URLGrabberOptions instance.
|
|
|
|
|
This method creates a new instance and overrides the
|
|
|
|
|
@@ -791,30 +938,38 @@ class URLGrabberOptions:
|
|
|
|
|
@@ -791,30 +950,38 @@ class URLGrabberOptions:
|
|
|
|
|
provided here.
|
|
|
|
|
"""
|
|
|
|
|
self.progress_obj = None
|
|
|
|
@ -681,7 +815,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
|
|
|
|
|
self.ssl_context = None # no-op in pycurl
|
|
|
|
|
self.ssl_verify_peer = True # check peer's cert for authenticityb
|
|
|
|
|
@@ -827,6 +982,12 @@ class URLGrabberOptions:
|
|
|
|
|
@@ -827,6 +994,13 @@ class URLGrabberOptions:
|
|
|
|
|
self.size = None # if we know how big the thing we're getting is going
|
|
|
|
|
# to be. this is ultimately a MAXIMUM size for the file
|
|
|
|
|
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
|
|
|
|
@ -691,10 +825,11 @@ index e090e90..6b409e3 100644
|
|
|
|
|
+ self.timedhosts = None
|
|
|
|
|
+ self.half_life = 30*24*60*60 # 30 days
|
|
|
|
|
+ self.default_speed = 1e6 # 1 MBit
|
|
|
|
|
+ self.ftp_disable_epsv = False
|
|
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
|
return self.format()
|
|
|
|
|
@@ -846,7 +1007,18 @@ class URLGrabberOptions:
|
|
|
|
|
@@ -846,7 +1020,18 @@ class URLGrabberOptions:
|
|
|
|
|
s = s + indent + '}'
|
|
|
|
|
return s
|
|
|
|
|
|
|
|
|
@ -714,7 +849,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
"""Provides easy opening of URLs with a variety of options.
|
|
|
|
|
|
|
|
|
|
All options are specified as kwargs. Options may be specified when
|
|
|
|
|
@@ -872,7 +1044,6 @@ class URLGrabber:
|
|
|
|
|
@@ -872,7 +1057,6 @@ class URLGrabber:
|
|
|
|
|
# beware of infinite loops :)
|
|
|
|
|
tries = tries + 1
|
|
|
|
|
exception = None
|
|
|
|
@ -722,7 +857,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
callback = None
|
|
|
|
|
if DEBUG: DEBUG.info('attempt %i/%s: %s',
|
|
|
|
|
tries, opts.retry, args[0])
|
|
|
|
|
@@ -883,54 +1054,62 @@ class URLGrabber:
|
|
|
|
|
@@ -883,54 +1067,62 @@ class URLGrabber:
|
|
|
|
|
except URLGrabError, e:
|
|
|
|
|
exception = e
|
|
|
|
|
callback = opts.failure_callback
|
|
|
|
@ -792,7 +927,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
if scheme == 'file' and not opts.copy_local:
|
|
|
|
|
# just return the name of the local file - don't make a
|
|
|
|
|
# copy currently
|
|
|
|
|
@@ -950,41 +1129,51 @@ class URLGrabber:
|
|
|
|
|
@@ -950,41 +1142,51 @@ class URLGrabber:
|
|
|
|
|
|
|
|
|
|
elif not opts.range:
|
|
|
|
|
if not opts.checkfunc is None:
|
|
|
|
@ -859,7 +994,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
if limit is not None:
|
|
|
|
|
limit = limit + 1
|
|
|
|
|
|
|
|
|
|
@@ -1000,12 +1189,8 @@ class URLGrabber:
|
|
|
|
|
@@ -1000,12 +1202,8 @@ class URLGrabber:
|
|
|
|
|
else: s = fo.read(limit)
|
|
|
|
|
|
|
|
|
|
if not opts.checkfunc is None:
|
|
|
|
@ -874,7 +1009,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
finally:
|
|
|
|
|
fo.close()
|
|
|
|
|
return s
|
|
|
|
|
@@ -1020,6 +1205,7 @@ class URLGrabber:
|
|
|
|
|
@@ -1020,6 +1218,7 @@ class URLGrabber:
|
|
|
|
|
return s
|
|
|
|
|
|
|
|
|
|
def _make_callback(self, callback_obj):
|
|
|
|
@ -882,7 +1017,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
if callable(callback_obj):
|
|
|
|
|
return callback_obj, (), {}
|
|
|
|
|
else:
|
|
|
|
|
@@ -1030,7 +1216,7 @@ class URLGrabber:
|
|
|
|
|
@@ -1030,7 +1229,7 @@ class URLGrabber:
|
|
|
|
|
default_grabber = URLGrabber()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -891,7 +1026,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
def __init__(self, url, filename, opts):
|
|
|
|
|
self.fo = None
|
|
|
|
|
self._hdr_dump = ''
|
|
|
|
|
@@ -1052,10 +1238,13 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1052,10 +1251,13 @@ class PyCurlFileObject():
|
|
|
|
|
self._reget_length = 0
|
|
|
|
|
self._prog_running = False
|
|
|
|
|
self._error = (None, None)
|
|
|
|
@ -907,7 +1042,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
def __getattr__(self, name):
|
|
|
|
|
"""This effectively allows us to wrap at the instance level.
|
|
|
|
|
Any attribute not found in _this_ object will be searched for
|
|
|
|
|
@@ -1067,6 +1256,12 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1067,6 +1269,12 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
def _retrieve(self, buf):
|
|
|
|
|
try:
|
|
|
|
@ -920,7 +1055,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
if not self._prog_running:
|
|
|
|
|
if self.opts.progress_obj:
|
|
|
|
|
size = self.size + self._reget_length
|
|
|
|
|
@@ -1079,32 +1274,62 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1079,32 +1287,62 @@ class PyCurlFileObject():
|
|
|
|
|
self.opts.progress_obj.update(self._amount_read)
|
|
|
|
|
|
|
|
|
|
self._amount_read += len(buf)
|
|
|
|
@ -953,7 +1088,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
- length = buf.split(':')[1]
|
|
|
|
|
- self.size = int(length)
|
|
|
|
|
+ if self.scheme in ['http','https']:
|
|
|
|
|
+ if buf.lower().find('content-length') != -1:
|
|
|
|
|
+ if buf.lower().find('content-length:') != -1:
|
|
|
|
|
+ length = buf.split(':')[1]
|
|
|
|
|
+ self.size = int(length)
|
|
|
|
|
+ elif self.append and self._hdr_dump == '' and ' 200 ' in buf:
|
|
|
|
@ -990,7 +1125,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
return len(buf)
|
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
|
return pycurl.READFUNC_ABORT
|
|
|
|
|
@@ -1113,8 +1338,10 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1113,8 +1351,10 @@ class PyCurlFileObject():
|
|
|
|
|
if self._parsed_hdr:
|
|
|
|
|
return self._parsed_hdr
|
|
|
|
|
statusend = self._hdr_dump.find('\n')
|
|
|
|
@ -1001,7 +1136,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
self._parsed_hdr = mimetools.Message(hdrfp)
|
|
|
|
|
return self._parsed_hdr
|
|
|
|
|
|
|
|
|
|
@@ -1127,6 +1354,9 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1127,6 +1367,9 @@ class PyCurlFileObject():
|
|
|
|
|
if not opts:
|
|
|
|
|
opts = self.opts
|
|
|
|
|
|
|
|
|
@ -1011,7 +1146,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
# defaults we're always going to set
|
|
|
|
|
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
|
|
|
|
|
@@ -1136,11 +1366,21 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1136,11 +1379,21 @@ class PyCurlFileObject():
|
|
|
|
|
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
|
|
|
|
|
self.curl_obj.setopt(pycurl.FAILONERROR, True)
|
|
|
|
|
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
|
|
|
|
@ -1034,7 +1169,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
# maybe to be options later
|
|
|
|
|
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
|
|
|
|
|
@@ -1148,9 +1388,11 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1148,9 +1401,11 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
# timeouts
|
|
|
|
|
timeout = 300
|
|
|
|
@ -1049,7 +1184,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
# ssl options
|
|
|
|
|
if self.scheme == 'https':
|
|
|
|
|
@@ -1158,13 +1400,16 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1158,13 +1413,16 @@ class PyCurlFileObject():
|
|
|
|
|
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
|
|
|
|
|
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
|
|
|
|
|
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
|
|
|
|
@ -1067,7 +1202,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
if opts.ssl_cert_type:
|
|
|
|
|
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
|
|
|
|
|
if opts.ssl_key_pass:
|
|
|
|
|
@@ -1187,28 +1432,26 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1187,29 +1445,31 @@ class PyCurlFileObject():
|
|
|
|
|
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
|
|
|
|
|
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
|
|
|
|
|
|
|
|
|
@ -1108,11 +1243,17 @@ index e090e90..6b409e3 100644
|
|
|
|
|
if opts.data:
|
|
|
|
|
self.curl_obj.setopt(pycurl.POST, True)
|
|
|
|
|
- self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
|
|
|
|
|
-
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # ftp
|
|
|
|
|
+ if opts.ftp_disable_epsv:
|
|
|
|
|
+ self.curl_obj.setopt(pycurl.FTP_USE_EPSV, False)
|
|
|
|
|
+
|
|
|
|
|
# our url
|
|
|
|
|
self.curl_obj.setopt(pycurl.URL, self.url)
|
|
|
|
|
@@ -1228,39 +1471,26 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
@@ -1228,39 +1488,26 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
code = self.http_code
|
|
|
|
|
errcode = e.args[0]
|
|
|
|
@ -1158,7 +1299,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
# this is probably wrong but ultimately this is what happens
|
|
|
|
|
# we have a legit http code and a pycurl 'writer failed' code
|
|
|
|
|
# which almost always means something aborted it from outside
|
|
|
|
|
@@ -1269,36 +1499,70 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1269,36 +1516,70 @@ class PyCurlFileObject():
|
|
|
|
|
# figure out what aborted the pycurl process FIXME
|
|
|
|
|
raise KeyboardInterrupt
|
|
|
|
|
|
|
|
|
@ -1254,7 +1395,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
def _do_open(self):
|
|
|
|
|
self.curl_obj = _curl_cache
|
|
|
|
|
@@ -1333,7 +1597,11 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1333,7 +1614,11 @@ class PyCurlFileObject():
|
|
|
|
|
|
|
|
|
|
if self.opts.range:
|
|
|
|
|
rt = self.opts.range
|
|
|
|
@ -1267,7 +1408,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
if rt:
|
|
|
|
|
header = range_tuple_to_header(rt)
|
|
|
|
|
@@ -1434,21 +1702,46 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1434,21 +1719,46 @@ class PyCurlFileObject():
|
|
|
|
|
#fh, self._temp_name = mkstemp()
|
|
|
|
|
#self.fo = open(self._temp_name, 'wb')
|
|
|
|
|
|
|
|
|
@ -1321,7 +1462,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
else:
|
|
|
|
|
#self.fo = open(self._temp_name, 'r')
|
|
|
|
|
self.fo.seek(0)
|
|
|
|
|
@@ -1526,17 +1819,20 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1526,17 +1836,20 @@ class PyCurlFileObject():
|
|
|
|
|
if self._prog_running:
|
|
|
|
|
downloaded += self._reget_length
|
|
|
|
|
self.opts.progress_obj.update(downloaded)
|
|
|
|
@ -1347,7 +1488,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
msg = _("Downloaded more than max size for %s: %s > %s") \
|
|
|
|
|
% (self.url, cur, max_size)
|
|
|
|
|
@@ -1544,13 +1840,6 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1544,13 +1857,6 @@ class PyCurlFileObject():
|
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
@ -1361,7 +1502,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
def read(self, amt=None):
|
|
|
|
|
self._fill_buffer(amt)
|
|
|
|
|
if amt is None:
|
|
|
|
|
@@ -1582,9 +1871,21 @@ class PyCurlFileObject():
|
|
|
|
|
@@ -1582,9 +1888,21 @@ class PyCurlFileObject():
|
|
|
|
|
self.opts.progress_obj.end(self._amount_read)
|
|
|
|
|
self.fo.close()
|
|
|
|
|
|
|
|
|
@ -1384,7 +1525,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
|
# DEPRECATED FUNCTIONS
|
|
|
|
|
@@ -1621,6 +1922,489 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
|
|
|
|
|
@@ -1621,6 +1939,490 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
@ -1500,6 +1641,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
+ 'ssl_key_pass',
|
|
|
|
|
+ 'ssl_verify_peer', 'ssl_verify_host',
|
|
|
|
|
+ 'size', 'max_header_size', 'ip_resolve',
|
|
|
|
|
+ 'ftp_disable_epsv'
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ def start(self, opts):
|
|
|
|
@ -1875,7 +2017,7 @@ index e090e90..6b409e3 100644
|
|
|
|
|
def _main_test():
|
|
|
|
|
try: url, filename = sys.argv[1:3]
|
|
|
|
|
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
|
|
|
|
|
index dad410b..5d3aa34 100644
|
|
|
|
|
index dad410b..988a309 100644
|
|
|
|
|
--- a/urlgrabber/mirror.py
|
|
|
|
|
+++ b/urlgrabber/mirror.py
|
|
|
|
|
@@ -76,6 +76,10 @@ CUSTOMIZATION
|
|
|
|
@ -1926,7 +2068,12 @@ index dad410b..5d3aa34 100644
|
|
|
|
|
|
|
|
|
|
This dict can be set at instantiation time,
|
|
|
|
|
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
|
|
|
|
|
@@ -184,6 +195,7 @@ class MirrorGroup:
|
|
|
|
|
@@ -180,10 +191,11 @@ class MirrorGroup:
|
|
|
|
|
etc). Otherwise, it is assumed to be the callable object
|
|
|
|
|
itself. The callback will be passed a grabber.CallbackObject
|
|
|
|
|
instance along with args and kwargs (if present). The following
|
|
|
|
|
- attributes are defined withing the instance:
|
|
|
|
|
+ attributes are defined within the instance:
|
|
|
|
|
|
|
|
|
|
obj.exception = < exception that was raised >
|
|
|
|
|
obj.mirror = < the mirror that was tried >
|
|
|
|
|