diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index cf51dff..9692219 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -402,11 +402,11 @@
 import urllib
 import urllib2
 import mimetools
 import thread
+import types
 from stat import * # S_* and ST_*
 import pycurl
 from ftplib import parse150
 from StringIO import StringIO
-from tempfile import mkstemp
 ########################################################################
 # MODULE INITIALIZATION
@@ -467,6 +467,13 @@
 except AttributeError:
     TimeoutError = None
     have_socket_timeout = False
+try:
+    import signal
+    from signal import SIGPIPE, SIG_IGN
+    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
+except ImportError:
+    pass
+
 ########################################################################
 # functions for debugging output. These functions are here because they
 # are also part of the module initialization.
@@ -859,8 +866,15 @@ class URLGrabberOptions:
         self.data = None
         self.urlparser = URLParser()
         self.quote = None
-        self.ssl_ca_cert = None
-        self.ssl_context = None
+        self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
+        self.ssl_context = None # no-op in pycurl
+        self.ssl_verify_peer = True # check peer's cert for authenticity
+        self.ssl_verify_host = True # make sure who they are and who the cert is for matches
+        self.ssl_key = None # client key
+        self.ssl_key_type = 'PEM' # (or DER)
+        self.ssl_cert = None # client cert
+        self.ssl_cert_type = 'PEM' # (or DER)
+        self.ssl_key_pass = None # password to access the key
 
     def __repr__(self):
         return self.format()
@@ -1219,7 +1233,7 @@ class URLGrabberFileObject:
         self.append = 0
         reget_length = 0
         rt = None
-        if have_range and self.opts.reget and type(self.filename) == type(''):
+        if have_range and self.opts.reget and type(self.filename) in types.StringTypes:
             # we have reget turned on and we're dumping to a file
             try:
                 s = os.stat(self.filename)
@@ -1450,9 +1464,11 @@ class PyCurlFileObject():
         self.scheme = urlparse.urlsplit(self.url)[0]
         self.filename = filename
         self.append = False
+        self.reget_time = None
         self.opts = opts
+        if self.opts.reget == 'check_timestamp':
+            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
         self._complete = False
-        self.reget_time = None
         self._rbuf = ''
         self._rbufsize = 1024*8
         self._ttime = time.time()
@@ -1474,39 +1490,45 @@ class PyCurlFileObject():
         raise AttributeError, name
 
     def _retrieve(self, buf):
-        if not self._prog_running:
-            if self.opts.progress_obj:
-                size = self.size + self._reget_length
-                self.opts.progress_obj.start(self._prog_reportname,
-                                             urllib.unquote(self.url),
-                                             self._prog_basename,
-                                             size=size,
-                                             text=self.opts.text)
-                self._prog_running = True
-                self.opts.progress_obj.update(self._amount_read)
-
-        self._amount_read += len(buf)
-        self.fo.write(buf)
-        return len(buf)
-
+        try:
+            if not self._prog_running:
+                if self.opts.progress_obj:
+                    size = self.size + self._reget_length
+                    self.opts.progress_obj.start(self._prog_reportname,
+                                                 urllib.unquote(self.url),
+                                                 self._prog_basename,
+                                                 size=size,
+                                                 text=self.opts.text)
+                    self._prog_running = True
+                    self.opts.progress_obj.update(self._amount_read)
+
+            self._amount_read += len(buf)
+            self.fo.write(buf)
+            return len(buf)
+        except KeyboardInterrupt:
+            return pycurl.READFUNC_ABORT
+
     def _hdr_retrieve(self, buf):
-        self._hdr_dump += buf
-        # we have to get the size before we do the progress obj start
-        # but we can't do that w/o making it do 2 connects, which sucks
-        # so we cheat and stuff it in here in the hdr_retrieve
-        if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
-            length = buf.split(':')[1]
-            self.size = int(length)
-        elif self.scheme in ['ftp']:
-            s = None
-            if buf.startswith('213 '):
-                s = buf[3:].strip()
-            elif buf.startswith('150 '):
-                s = parse150(buf)
-            if s:
-                self.size = s
-
-        return len(buf)
+        try:
+            self._hdr_dump += buf
+            # we have to get the size before we do the progress obj start
+            # but we can't do that w/o making it do 2 connects, which sucks
+            # so we cheat and stuff it in here in the hdr_retrieve
+            if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+                length = buf.split(':')[1]
+                self.size = int(length)
+            elif self.scheme in ['ftp']:
+                s = None
+                if buf.startswith('213 '):
+                    s = buf[3:].strip()
+                elif buf.startswith('150 '):
+                    s = parse150(buf)
+                if s:
+                    self.size = s
+
+            return len(buf)
+        except KeyboardInterrupt:
+            return pycurl.READFUNC_ABORT
 
     def _return_hdr_obj(self):
         if self._parsed_hdr:
@@ -1528,11 +1550,13 @@
 
         # defaults we're always going to set
-        self.curl_obj.setopt(pycurl.NOPROGRESS, 0)
+        self.curl_obj.setopt(pycurl.NOPROGRESS, False)
+        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
         self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
         self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
-        self.curl_obj.setopt(pycurl.FAILONERROR, 1)
+        self.curl_obj.setopt(pycurl.FAILONERROR, True)
+        self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
 
         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
 
@@ -1540,19 +1564,32 @@
             self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
 
         # maybe to be options later
-        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, 1)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
         self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
-        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, 30)
 
         # timeouts
         timeout = 300
         if opts.timeout:
            timeout = int(opts.timeout)
-        self.curl_obj.setopt(pycurl.TIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+
        # ssl options
        if self.scheme == 'https':
            if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
                self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
+                self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
+            self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
+            self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+            if opts.ssl_key:
+                self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
+            if opts.ssl_key_type:
+                self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
+            if opts.ssl_cert:
+                self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+            if opts.ssl_cert_type:
+                self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
+            if opts.ssl_key_pass:
+                self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
 
        #headers:
        if opts.http_headers and self.scheme in ('http', 'https'):
@@ -1590,7 +1627,7 @@
         #posts - simple - expects the fields as they are
         if opts.data:
             self.curl_obj.setopt(pycurl.POST, True)
-            self.curl_obj.setopt(pycurl.POSTFIELDS, opts.data)
+            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
 
         # our url
         self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1607,18 +1644,51 @@
             # to other URLGrabErrors from
             # http://curl.haxx.se/libcurl/c/libcurl-errors.html
             # this covers e.args[0] == 22 pretty well - which will be common
-            if str(e.args[1]) == '': # fake it until you make it
+            code = self.http_code
+            if e.args[0] == 23 and code >= 200 and code < 299:
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
+                err.url = self.url
+                # this is probably wrong but ultimately this is what happens
+                # we have a legit http code and a pycurl 'writer failed' code
+                # which almost always means something aborted it from outside
+                # since we cannot know what it is -I'm banking on it being
+                # a ctrl-c. XXXX - if there's a way of going back two raises to
+                # figure out what aborted the pycurl process FIXME
+                raise KeyboardInterrupt
+
+            elif e.args[0] == 28:
+                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+            elif e.args[0] == 35:
+                msg = _("problem making ssl connection")
+                err = URLGrabError(14, msg)
+                err.url = self.url
+                raise err
+
+            elif e.args[0] == 58:
+                msg = _("problem with the local client certificate")
+                err = URLGrabError(14, msg)
+                err.url = self.url
+                raise err
+
+            elif e.args[0] == 60:
+                msg = _("client cert cannot be verified or client cert incorrect")
+                err = URLGrabError(14, msg)
+                err.url = self.url
+                raise err
+
+            elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
                 msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
             else:
-                msg = str(e.args[1])
+                msg = 'PYCURL ERROR %s - "%s"' % (e.args[0], str(e.args[1]))
+                code = e.args[0]
             err = URLGrabError(14, msg)
-            err.code = self.http_code
+            err.code = code
             err.exception = e
             raise err
-
+
     def _do_open(self):
-        self.append = False
-        self.reget_time = None
         self.curl_obj = _curl_cache
         self.curl_obj.reset() # reset all old settings away, just in case
         # setup any ranges
@@ -1630,11 +1700,9 @@
             pass
 
     def _build_range(self):
-        self.reget_time = None
-        self.append = False
         reget_length = 0
         rt = None
-        if self.opts.reget and type(self.filename) == type(''):
+        if self.opts.reget and type(self.filename) in types.StringTypes:
             # we have reget turned on and we're dumping to a file
             try:
                 s = os.stat(self.filename)
@@ -1726,10 +1794,10 @@
         if self._complete:
             return
 
-        if self.filename:
+        if self.filename is not None:
             self._prog_reportname = str(self.filename)
             self._prog_basename = os.path.basename(self.filename)
-
+
             if self.append: mode = 'ab'
             else: mode = 'wb'
 
@@ -1746,19 +1814,23 @@
         else:
             self._prog_reportname = 'MEMORY'
             self._prog_basename = 'MEMORY'
-            fh, self._temp_name = mkstemp()
+
-            self.fo = open(self._temp_name, 'wb')
+            self.fo = StringIO()
+            # if this is to be a tempfile instead....
+            # it just makes crap in the tempdir
+            #fh, self._temp_name = mkstemp()
+            #self.fo = open(self._temp_name, 'wb')
 
         self._do_perform()
 
-        # close it up
-        self.fo.flush()
-        self.fo.close()
         if self.filename:
+            # close it up
+            self.fo.flush()
+            self.fo.close()
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
@@ -1766,7 +1838,8 @@
             # re open it
             self.fo = open(self.filename, 'r')
         else:
-            self.fo = open(self._temp_name, 'r')
+            #self.fo = open(self._temp_name, 'r')
+            self.fo.seek(0)
 
         self._complete = True
@@ -1838,6 +1911,13 @@
                 downloaded += self._reget_length
             self.opts.progress_obj.update(downloaded)
 
+    def _to_utf8(self, obj, errors='replace'):
+        '''convert 'unicode' to an encoded utf-8 byte string '''
+        # stolen from yum.i18n
+        if isinstance(obj, unicode):
+            obj = obj.encode('utf-8', errors)
+        return obj
+
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None:
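
Reviewer note, not part of the patch: a minimal sketch of how the SSL options added to URLGrabberOptions and wired into the curl setup above might be exercised through the existing URLGrabber API. The URL, certificate/key paths, and password below are placeholder values.

    from urlgrabber.grabber import URLGrabber, URLGrabError

    # each kwarg maps onto the pycurl option noted in the comment
    g = URLGrabber(ssl_ca_cert='/etc/pki/tls/certs/ca-bundle.crt',  # CAPATH + CAINFO
                   ssl_cert='/path/to/client-cert.pem',             # SSLCERT
                   ssl_key='/path/to/client-key.pem',               # SSLKEY
                   ssl_key_pass='secret',                           # SSLKEYPASSWD
                   ssl_verify_peer=True,                            # SSL_VERIFYPEER
                   ssl_verify_host=True)                            # SSL_VERIFYHOST
    try:
        g.urlgrab('https://example.com/some/file', filename='/tmp/some-file')
    except URLGrabError, e:
        print e.errno, e.strerror

ssl_key_type and ssl_cert_type default to 'PEM', so they only need to be passed explicitly for DER-format files.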
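Also worth noting for callers: _do_perform() now maps common pycurl error codes onto URLGrabError instead of passing bare curl messages through. A sketch of the resulting caller-side behavior, again with placeholder URL and paths:

    from urlgrabber.grabber import urlgrab, URLGrabError

    try:
        urlgrab('https://example.com/repodata/repomd.xml',
                filename='/tmp/repomd.xml', timeout=5.0)
    except URLGrabError, e:
        if e.errno == 12:
            print 'timed out (curl error 28):', e
        elif e.errno == 14:
            # ssl connect failure (35), client cert problems (58/60),
            # or the generic 'PYCURL ERROR n' fallback, with e.code
            # carrying the http status or raw curl error number
            print 'fatal grab error:', e
    except KeyboardInterrupt:
        # curl error 23 plus a 2xx http code is treated as an abort
        # coming out of _retrieve()/_hdr_retrieve() (e.g. a ctrl-c)
        print 'transfer aborted'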
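And one sketch for the POSTFIELDS change: the likely motivation for routing opts.data through the new _to_utf8() helper is that pycurl wants a byte string, so unicode form data has to be encoded before setopt(). Placeholder URL:

    from urlgrabber.grabber import urlgrab

    # unicode form data is encoded to a utf-8 byte string by _to_utf8()
    # before it reaches pycurl.POSTFIELDS
    urlgrab('http://example.com/cgi-bin/submit', filename='/tmp/reply',
            data=u'comment=caf\xe9')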