parent
21a8790d62
commit
69dd5be1bf
@ -1,385 +1,36 @@
|
|||||||
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
|
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
|
||||||
index cf51dff..979b4c1 100644
|
index e090e90..c4916d5 100644
|
||||||
--- a/urlgrabber/grabber.py
|
--- a/urlgrabber/grabber.py
|
||||||
+++ b/urlgrabber/grabber.py
|
+++ b/urlgrabber/grabber.py
|
||||||
@@ -402,11 +402,11 @@ import urllib
|
@@ -1052,7 +1052,7 @@ class PyCurlFileObject():
|
||||||
import urllib2
|
self._reget_length = 0
|
||||||
import mimetools
|
self._prog_running = False
|
||||||
import thread
|
self._error = (None, None)
|
||||||
+import types
|
- self.size = None
|
||||||
from stat import * # S_* and ST_*
|
+ self.size = 0
|
||||||
import pycurl
|
self._do_open()
|
||||||
from ftplib import parse150
|
|
||||||
from StringIO import StringIO
|
|
||||||
-from tempfile import mkstemp
|
|
||||||
|
|
||||||
########################################################################
|
|
||||||
# MODULE INITIALIZATION
|
|
||||||
@@ -467,6 +467,13 @@ except AttributeError:
|
|
||||||
TimeoutError = None
|
|
||||||
have_socket_timeout = False
|
|
||||||
|
|
||||||
+try:
|
|
||||||
+ import signal
|
|
||||||
+ from signal import SIGPIPE, SIG_IGN
|
|
||||||
+ signal.signal(signal.SIGPIPE, signal.SIG_IGN)
|
|
||||||
+except ImportError:
|
|
||||||
+ pass
|
|
||||||
+
|
|
||||||
########################################################################
|
|
||||||
# functions for debugging output. These functions are here because they
|
|
||||||
# are also part of the module initialization.
|
|
||||||
@@ -859,8 +866,15 @@ class URLGrabberOptions:
|
|
||||||
self.data = None
|
|
||||||
self.urlparser = URLParser()
|
|
||||||
self.quote = None
|
|
||||||
- self.ssl_ca_cert = None
|
|
||||||
- self.ssl_context = None
|
|
||||||
+ self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
|
|
||||||
+ self.ssl_context = None # no-op in pycurl
|
|
||||||
+ self.ssl_verify_peer = True # check peer's cert for authenticityb
|
|
||||||
+ self.ssl_verify_host = True # make sure who they are and who the cert is for matches
|
|
||||||
+ self.ssl_key = None # client key
|
|
||||||
+ self.ssl_key_type = 'PEM' #(or DER)
|
|
||||||
+ self.ssl_cert = None # client cert
|
|
||||||
+ self.ssl_cert_type = 'PEM' # (or DER)
|
|
||||||
+ self.ssl_key_pass = None # password to access the key
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return self.format()
|
|
||||||
@@ -1219,7 +1233,7 @@ class URLGrabberFileObject:
|
|
||||||
self.append = 0
|
|
||||||
reget_length = 0
|
|
||||||
rt = None
|
|
||||||
- if have_range and self.opts.reget and type(self.filename) == type(''):
|
|
||||||
+ if have_range and self.opts.reget and type(self.filename) in types.StringTypes:
|
|
||||||
# we have reget turned on and we're dumping to a file
|
|
||||||
try:
|
|
||||||
s = os.stat(self.filename)
|
|
||||||
@@ -1450,9 +1464,11 @@ class PyCurlFileObject():
|
|
||||||
self.scheme = urlparse.urlsplit(self.url)[0]
|
|
||||||
self.filename = filename
|
|
||||||
self.append = False
|
|
||||||
+ self.reget_time = None
|
|
||||||
self.opts = opts
|
|
||||||
+ if self.opts.reget == 'check_timestamp':
|
|
||||||
+ raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
|
|
||||||
self._complete = False
|
|
||||||
- self.reget_time = None
|
|
||||||
self._rbuf = ''
|
|
||||||
self._rbufsize = 1024*8
|
|
||||||
self._ttime = time.time()
|
|
||||||
@@ -1474,39 +1490,45 @@ class PyCurlFileObject():
|
|
||||||
raise AttributeError, name
|
|
||||||
|
|
||||||
def _retrieve(self, buf):
|
|
||||||
- if not self._prog_running:
|
|
||||||
- if self.opts.progress_obj:
|
|
||||||
- size = self.size + self._reget_length
|
|
||||||
- self.opts.progress_obj.start(self._prog_reportname,
|
|
||||||
- urllib.unquote(self.url),
|
|
||||||
- self._prog_basename,
|
|
||||||
- size=size,
|
|
||||||
- text=self.opts.text)
|
|
||||||
- self._prog_running = True
|
|
||||||
- self.opts.progress_obj.update(self._amount_read)
|
|
||||||
-
|
|
||||||
- self._amount_read += len(buf)
|
|
||||||
- self.fo.write(buf)
|
|
||||||
- return len(buf)
|
|
||||||
-
|
|
||||||
+ try:
|
|
||||||
+ if not self._prog_running:
|
|
||||||
+ if self.opts.progress_obj:
|
|
||||||
+ size = self.size + self._reget_length
|
|
||||||
+ self.opts.progress_obj.start(self._prog_reportname,
|
|
||||||
+ urllib.unquote(self.url),
|
|
||||||
+ self._prog_basename,
|
|
||||||
+ size=size,
|
|
||||||
+ text=self.opts.text)
|
|
||||||
+ self._prog_running = True
|
|
||||||
+ self.opts.progress_obj.update(self._amount_read)
|
|
||||||
+
|
|
||||||
+ self._amount_read += len(buf)
|
|
||||||
+ self.fo.write(buf)
|
|
||||||
+ return len(buf)
|
|
||||||
+ except KeyboardInterrupt:
|
|
||||||
+ return -1
|
|
||||||
+
|
|
||||||
def _hdr_retrieve(self, buf):
|
|
||||||
- self._hdr_dump += buf
|
|
||||||
- # we have to get the size before we do the progress obj start
|
|
||||||
- # but we can't do that w/o making it do 2 connects, which sucks
|
|
||||||
- # so we cheat and stuff it in here in the hdr_retrieve
|
|
||||||
- if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
|
|
||||||
- length = buf.split(':')[1]
|
|
||||||
- self.size = int(length)
|
|
||||||
- elif self.scheme in ['ftp']:
|
|
||||||
- s = None
|
|
||||||
- if buf.startswith('213 '):
|
|
||||||
- s = buf[3:].strip()
|
|
||||||
- elif buf.startswith('150 '):
|
|
||||||
- s = parse150(buf)
|
|
||||||
- if s:
|
|
||||||
- self.size = s
|
|
||||||
-
|
|
||||||
- return len(buf)
|
|
||||||
+ try:
|
|
||||||
+ self._hdr_dump += buf
|
|
||||||
+ # we have to get the size before we do the progress obj start
|
|
||||||
+ # but we can't do that w/o making it do 2 connects, which sucks
|
|
||||||
+ # so we cheat and stuff it in here in the hdr_retrieve
|
|
||||||
+ if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
|
|
||||||
+ length = buf.split(':')[1]
|
|
||||||
+ self.size = int(length)
|
|
||||||
+ elif self.scheme in ['ftp']:
|
|
||||||
+ s = None
|
|
||||||
+ if buf.startswith('213 '):
|
|
||||||
+ s = buf[3:].strip()
|
|
||||||
+ elif buf.startswith('150 '):
|
|
||||||
+ s = parse150(buf)
|
|
||||||
+ if s:
|
|
||||||
+ self.size = s
|
|
||||||
+
|
|
||||||
+ return len(buf)
|
|
||||||
+ except KeyboardInterrupt:
|
|
||||||
+ return pycurl.READFUNC_ABORT
|
|
||||||
|
|
||||||
def _return_hdr_obj(self):
|
|
||||||
if self._parsed_hdr:
|
|
||||||
@@ -1528,11 +1550,13 @@ class PyCurlFileObject():
|
|
||||||
|
|
||||||
|
|
||||||
# defaults we're always going to set
|
|
||||||
- self.curl_obj.setopt(pycurl.NOPROGRESS, 0)
|
|
||||||
+ self.curl_obj.setopt(pycurl.NOPROGRESS, False)
|
|
||||||
+ self.curl_obj.setopt(pycurl.NOSIGNAL, True)
|
|
||||||
self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
|
|
||||||
self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
|
|
||||||
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
|
|
||||||
- self.curl_obj.setopt(pycurl.FAILONERROR, 1)
|
|
||||||
+ self.curl_obj.setopt(pycurl.FAILONERROR, True)
|
|
||||||
+ self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
|
|
||||||
|
|
||||||
if DEBUG:
|
|
||||||
self.curl_obj.setopt(pycurl.VERBOSE, True)
|
|
||||||
@@ -1540,19 +1564,32 @@ class PyCurlFileObject():
|
|
||||||
self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
|
|
||||||
|
|
||||||
# maybe to be options later
|
|
||||||
- self.curl_obj.setopt(pycurl.FOLLOWLOCATION, 1)
|
|
||||||
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
|
|
||||||
self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
|
|
||||||
- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, 30)
|
|
||||||
|
|
||||||
# timeouts
|
@@ -1299,6 +1299,12 @@ class PyCurlFileObject():
|
||||||
timeout = 300
|
err.code = code
|
||||||
if opts.timeout:
|
|
||||||
timeout = int(opts.timeout)
|
|
||||||
- self.curl_obj.setopt(pycurl.TIMEOUT, timeout)
|
|
||||||
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
|
|
||||||
+
|
|
||||||
# ssl options
|
|
||||||
if self.scheme == 'https':
|
|
||||||
if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
|
|
||||||
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
|
|
||||||
+ self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
|
|
||||||
+ self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
|
|
||||||
+ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
|
|
||||||
+ if opts.ssl_key:
|
|
||||||
+ self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
|
|
||||||
+ if opts.ssl_key_type:
|
|
||||||
+ self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
|
|
||||||
+ if opts.ssl_cert:
|
|
||||||
+ self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
|
|
||||||
+ if opts.ssl_cert_type:
|
|
||||||
+ self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
|
|
||||||
+ if opts.ssl_key_pass:
|
|
||||||
+ self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
|
|
||||||
|
|
||||||
#headers:
|
|
||||||
if opts.http_headers and self.scheme in ('http', 'https'):
|
|
||||||
@@ -1578,19 +1615,21 @@ class PyCurlFileObject():
|
|
||||||
if scheme not in ('ftp'):
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
+ if proxy == '_none_': proxy = ""
|
|
||||||
self.curl_obj.setopt(pycurl.PROXY, proxy)
|
|
||||||
elif self.scheme in ('http', 'https'):
|
|
||||||
if scheme not in ('http', 'https'):
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
+ if proxy == '_none_': proxy = ""
|
|
||||||
self.curl_obj.setopt(pycurl.PROXY, proxy)
|
|
||||||
-
|
|
||||||
- # username/password/auth settings
|
|
||||||
+
|
|
||||||
+ # FIXME username/password/auth settings
|
|
||||||
|
|
||||||
#posts - simple - expects the fields as they are
|
|
||||||
if opts.data:
|
|
||||||
self.curl_obj.setopt(pycurl.POST, True)
|
|
||||||
- self.curl_obj.setopt(pycurl.POSTFIELDS, opts.data)
|
|
||||||
+ self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
|
|
||||||
|
|
||||||
# our url
|
|
||||||
self.curl_obj.setopt(pycurl.URL, self.url)
|
|
||||||
@@ -1607,18 +1646,62 @@ class PyCurlFileObject():
|
|
||||||
# to other URLGrabErrors from
|
|
||||||
# http://curl.haxx.se/libcurl/c/libcurl-errors.html
|
|
||||||
# this covers e.args[0] == 22 pretty well - which will be common
|
|
||||||
- if str(e.args[1]) == '': # fake it until you make it
|
|
||||||
+ code = self.http_code
|
|
||||||
+ if e.args[0] == 23 and code >= 200 and code < 299:
|
|
||||||
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
|
|
||||||
+ err.url = self.url
|
|
||||||
+ # this is probably wrong but ultimately this is what happens
|
|
||||||
+ # we have a legit http code and a pycurl 'writer failed' code
|
|
||||||
+ # which almost always means something aborted it from outside
|
|
||||||
+ # since we cannot know what it is -I'm banking on it being
|
|
||||||
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
|
|
||||||
+ # figure out what aborted the pycurl process FIXME
|
|
||||||
+ raise KeyboardInterrupt
|
|
||||||
+
|
|
||||||
+ elif e.args[0] == 28:
|
|
||||||
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
|
|
||||||
+ err.url = self.url
|
|
||||||
+ raise err
|
|
||||||
+ elif e.args[0] == 35:
|
|
||||||
+ msg = _("problem making ssl connection")
|
|
||||||
+ err = URLGrabError(14, msg)
|
|
||||||
+ err.url = self.url
|
|
||||||
+ raise err
|
|
||||||
+
|
|
||||||
+ if e.args[0] == 42:
|
|
||||||
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
|
|
||||||
+ err.url = self.url
|
|
||||||
+ # this is probably wrong but ultimately this is what happens
|
|
||||||
+ # we have a legit http code and a pycurl 'writer failed' code
|
|
||||||
+ # which almost always means something aborted it from outside
|
|
||||||
+ # since we cannot know what it is -I'm banking on it being
|
|
||||||
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
|
|
||||||
+ # figure out what aborted the pycurl process FIXME
|
|
||||||
+ raise KeyboardInterrupt
|
|
||||||
+
|
|
||||||
+ elif e.args[0] == 58:
|
|
||||||
+ msg = _("problem with the local client certificate")
|
|
||||||
+ err = URLGrabError(14, msg)
|
|
||||||
+ err.url = self.url
|
|
||||||
+ raise err
|
|
||||||
+
|
|
||||||
+ elif e.args[0] == 60:
|
|
||||||
+ msg = _("client cert cannot be verified or client cert incorrect")
|
|
||||||
+ err = URLGrabError(14, msg)
|
|
||||||
+ err.url = self.url
|
|
||||||
+ raise err
|
|
||||||
+
|
|
||||||
+ elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
|
|
||||||
msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
|
|
||||||
else:
|
|
||||||
- msg = str(e.args[1])
|
|
||||||
+ msg = 'PYCURL ERROR %s - "%s"' % (e.args[0], str(e.args[1]))
|
|
||||||
+ code = e.args[0]
|
|
||||||
err = URLGrabError(14, msg)
|
|
||||||
- err.code = self.http_code
|
|
||||||
+ err.code = code
|
|
||||||
err.exception = e
|
err.exception = e
|
||||||
raise err
|
raise err
|
||||||
-
|
+ else:
|
||||||
+
|
+ if self._error[1]:
|
||||||
|
+ msg = self._error[1]
|
||||||
|
+ err = URLGRabError(14, msg)
|
||||||
|
+ err.url = self.url
|
||||||
|
+ raise err
|
||||||
|
|
||||||
def _do_open(self):
|
def _do_open(self):
|
||||||
- self.append = False
|
|
||||||
- self.reget_time = None
|
|
||||||
self.curl_obj = _curl_cache
|
self.curl_obj = _curl_cache
|
||||||
self.curl_obj.reset() # reset all old settings away, just in case
|
@@ -1536,7 +1542,8 @@ class PyCurlFileObject():
|
||||||
# setup any ranges
|
if self.opts.size: # if we set an opts size use that, no matter what
|
||||||
@@ -1630,11 +1713,9 @@ class PyCurlFileObject():
|
max_size = self.opts.size
|
||||||
pass
|
if not max_size: return False # if we have None for all of the Max then this is dumb
|
||||||
|
- if cur > max_size + max_size*.10:
|
||||||
def _build_range(self):
|
|
||||||
- self.reget_time = None
|
|
||||||
- self.append = False
|
|
||||||
reget_length = 0
|
|
||||||
rt = None
|
|
||||||
- if self.opts.reget and type(self.filename) == type(''):
|
|
||||||
+ if self.opts.reget and type(self.filename) in types.StringTypes:
|
|
||||||
# we have reget turned on and we're dumping to a file
|
|
||||||
try:
|
|
||||||
s = os.stat(self.filename)
|
|
||||||
@@ -1726,10 +1807,10 @@ class PyCurlFileObject():
|
|
||||||
if self._complete:
|
|
||||||
return
|
|
||||||
|
|
||||||
- if self.filename:
|
|
||||||
+ if self.filename is not None:
|
|
||||||
self._prog_reportname = str(self.filename)
|
|
||||||
self._prog_basename = os.path.basename(self.filename)
|
|
||||||
-
|
|
||||||
+
|
|
||||||
if self.append: mode = 'ab'
|
|
||||||
else: mode = 'wb'
|
|
||||||
|
|
||||||
@@ -1746,19 +1827,23 @@ class PyCurlFileObject():
|
|
||||||
else:
|
|
||||||
self._prog_reportname = 'MEMORY'
|
|
||||||
self._prog_basename = 'MEMORY'
|
|
||||||
- fh, self._temp_name = mkstemp()
|
|
||||||
+
|
+
|
||||||
|
+ if cur > int(float(max_size) * 1.10):
|
||||||
- self.fo = open(self._temp_name, 'wb')
|
|
||||||
+ self.fo = StringIO()
|
|
||||||
+ # if this is to be a tempfile instead....
|
|
||||||
+ # it just makes crap in the tempdir
|
|
||||||
+ #fh, self._temp_name = mkstemp()
|
|
||||||
+ #self.fo = open(self._temp_name, 'wb')
|
|
||||||
|
|
||||||
|
|
||||||
self._do_perform()
|
|
||||||
|
|
||||||
|
|
||||||
- # close it up
|
msg = _("Downloaded more than max size for %s: %s > %s") \
|
||||||
- self.fo.flush()
|
% (self.url, cur, max_size)
|
||||||
- self.fo.close()
|
|
||||||
|
|
||||||
if self.filename:
|
|
||||||
+ # close it up
|
|
||||||
+ self.fo.flush()
|
|
||||||
+ self.fo.close()
|
|
||||||
# set the time
|
|
||||||
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
|
|
||||||
if mod_time != -1:
|
|
||||||
@@ -1766,7 +1851,8 @@ class PyCurlFileObject():
|
|
||||||
# re open it
|
|
||||||
self.fo = open(self.filename, 'r')
|
|
||||||
else:
|
|
||||||
- self.fo = open(self._temp_name, 'r')
|
|
||||||
+ #self.fo = open(self._temp_name, 'r')
|
|
||||||
+ self.fo.seek(0)
|
|
||||||
|
|
||||||
self._complete = True
|
|
||||||
|
|
||||||
@@ -1834,10 +1920,20 @@ class PyCurlFileObject():
|
|
||||||
return
|
|
||||||
|
|
||||||
def _progress_update(self, download_total, downloaded, upload_total, uploaded):
|
|
||||||
- if self._prog_running:
|
|
||||||
- downloaded += self._reget_length
|
|
||||||
- self.opts.progress_obj.update(downloaded)
|
|
||||||
-
|
|
||||||
+ try:
|
|
||||||
+ if self._prog_running:
|
|
||||||
+ downloaded += self._reget_length
|
|
||||||
+ self.opts.progress_obj.update(downloaded)
|
|
||||||
+ except KeyboardInterrupt:
|
|
||||||
+ return -1
|
|
||||||
+
|
|
||||||
+ def _to_utf8(self, obj, errors='replace'):
|
|
||||||
+ '''convert 'unicode' to an encoded utf-8 byte string '''
|
|
||||||
+ # stolen from yum.i18n
|
|
||||||
+ if isinstance(obj, unicode):
|
|
||||||
+ obj = obj.encode('utf-8', errors)
|
|
||||||
+ return obj
|
|
||||||
+
|
|
||||||
def read(self, amt=None):
|
|
||||||
self._fill_buffer(amt)
|
|
||||||
if amt is None:
|
|
||||||
|
Loading…
Reference in new issue