diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index cf51dff..cea47e3 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -402,11 +402,11 @@ import urllib
 import urllib2
 import mimetools
 import thread
+import types
 from stat import * # S_* and ST_*
 import pycurl
 from ftplib import parse150
 from StringIO import StringIO
-from tempfile import mkstemp
 
 ########################################################################
 #                         MODULE INITIALIZATION
@@ -467,6 +467,13 @@ except AttributeError:
     TimeoutError = None
     have_socket_timeout = False
 
+try:
+    import signal
+    from signal import SIGPIPE, SIG_IGN
+    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
+except ImportError:
+    pass
+
 ########################################################################
 # functions for debugging output. These functions are here because they
 # are also part of the module initialization.
@@ -1219,7 +1226,7 @@ class URLGrabberFileObject:
         self.append = 0
         reget_length = 0
         rt = None
-        if have_range and self.opts.reget and type(self.filename) == type(''):
+        if have_range and self.opts.reget and type(self.filename) in types.StringTypes:
             # we have reget turned on and we're dumping to a file
             try:
                 s = os.stat(self.filename)
@@ -1450,9 +1457,11 @@ class PyCurlFileObject():
         self.scheme = urlparse.urlsplit(self.url)[0]
         self.filename = filename
         self.append = False
+        self.reget_time = None
         self.opts = opts
+        if self.opts.reget == 'check_timestamp':
+            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
         self._complete = False
-        self.reget_time = None
         self._rbuf = ''
         self._rbufsize = 1024*8
         self._ttime = time.time()
@@ -1474,39 +1483,45 @@ class PyCurlFileObject():
         raise AttributeError, name
 
     def _retrieve(self, buf):
-        if not self._prog_running:
-            if self.opts.progress_obj:
-                size = self.size + self._reget_length
-                self.opts.progress_obj.start(self._prog_reportname,
-                                             urllib.unquote(self.url),
-                                             self._prog_basename,
-                                             size=size,
-                                             text=self.opts.text)
-                self._prog_running = True
-                self.opts.progress_obj.update(self._amount_read)
-
-        self._amount_read += len(buf)
-        self.fo.write(buf)
-        return len(buf)
-
+        try:
+            if not self._prog_running:
+                if self.opts.progress_obj:
+                    size = self.size + self._reget_length
+                    self.opts.progress_obj.start(self._prog_reportname,
+                                                 urllib.unquote(self.url),
+                                                 self._prog_basename,
+                                                 size=size,
+                                                 text=self.opts.text)
+                    self._prog_running = True
+                    self.opts.progress_obj.update(self._amount_read)
+
+            self._amount_read += len(buf)
+            self.fo.write(buf)
+            return len(buf)
+        except KeyboardInterrupt:
+            return pycurl.READFUNC_ABORT
+
     def _hdr_retrieve(self, buf):
-        self._hdr_dump += buf
-        # we have to get the size before we do the progress obj start
-        # but we can't do that w/o making it do 2 connects, which sucks
-        # so we cheat and stuff it in here in the hdr_retrieve
-        if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
-            length = buf.split(':')[1]
-            self.size = int(length)
-        elif self.scheme in ['ftp']:
-            s = None
-            if buf.startswith('213 '):
-                s = buf[3:].strip()
-            elif buf.startswith('150 '):
-                s = parse150(buf)
-            if s:
-                self.size = s
-
-        return len(buf)
+        try:
+            self._hdr_dump += buf
+            # we have to get the size before we do the progress obj start
+            # but we can't do that w/o making it do 2 connects, which sucks
+            # so we cheat and stuff it in here in the hdr_retrieve
+            if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+                length = buf.split(':')[1]
+                self.size = int(length)
+            elif self.scheme in ['ftp']:
+                s = None
+                if buf.startswith('213 '):
+                    s = buf[3:].strip()
+                elif buf.startswith('150 '):
+                    s = parse150(buf)
+                if s:
+                    self.size = s
+
+            return len(buf)
+        except KeyboardInterrupt:
+            return pycurl.READFUNC_ABORT
 
     def _return_hdr_obj(self):
         if self._parsed_hdr:
@@ -1528,11 +1543,13 @@ class PyCurlFileObject():
 
         # defaults we're always going to set
-        self.curl_obj.setopt(pycurl.NOPROGRESS, 0)
+        self.curl_obj.setopt(pycurl.NOPROGRESS, False)
+        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
         self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
         self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
-        self.curl_obj.setopt(pycurl.FAILONERROR, 1)
+        self.curl_obj.setopt(pycurl.FAILONERROR, True)
+        self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
 
         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
@@ -1540,15 +1557,15 @@ class PyCurlFileObject():
             self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
 
         # maybe to be options later
-        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, 1)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
         self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
-        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, 30)
 
         # timeouts
         timeout = 300
         if opts.timeout:
             timeout = int(opts.timeout)
-        self.curl_obj.setopt(pycurl.TIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+
         # ssl options
         if self.scheme == 'https':
             if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
@@ -1607,18 +1624,33 @@ class PyCurlFileObject():
             # to other URLGrabErrors from
             # http://curl.haxx.se/libcurl/c/libcurl-errors.html
             # this covers e.args[0] == 22 pretty well - which will be common
+            code = self.http_code
+            if e.args[0] == 28:
+                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+            elif e.args[0] == 23 and code >= 200 and code < 299:
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
+                err.url = self.url
+                # this is probably wrong but ultimately this is what happens
+                # we have a legit http code and a pycurl 'writer failed' code
+                # which almost always means something aborted it from outside
+                # since we cannot know what it is -I'm banking on it being
+                # a ctrl-c. XXXX - if there's a way of going back two raises to
+                # figure out what aborted the pycurl process FIXME
+                raise KeyboardInterrupt
+
             if str(e.args[1]) == '': # fake it until you make it
                 msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
             else:
                 msg = str(e.args[1])
             err = URLGrabError(14, msg)
-            err.code = self.http_code
+            err.code = code
             err.exception = e
             raise err
-            
+    
     def _do_open(self):
-        self.append = False
-        self.reget_time = None
         self.curl_obj = _curl_cache
         self.curl_obj.reset() # reset all old settings away, just in case
         # setup any ranges
@@ -1630,11 +1662,9 @@ class PyCurlFileObject():
             pass
 
     def _build_range(self):
-        self.reget_time = None
-        self.append = False
         reget_length = 0
         rt = None
-        if self.opts.reget and type(self.filename) == type(''):
+        if self.opts.reget and type(self.filename) in types.StringTypes:
             # we have reget turned on and we're dumping to a file
             try:
                 s = os.stat(self.filename)
@@ -1729,7 +1759,7 @@ class PyCurlFileObject():
         if self.filename:
             self._prog_reportname = str(self.filename)
             self._prog_basename = os.path.basename(self.filename)
-            
+
             if self.append: mode = 'ab'
             else: mode = 'wb'
@@ -1746,19 +1776,23 @@ class PyCurlFileObject():
         else:
             self._prog_reportname = 'MEMORY'
             self._prog_basename = 'MEMORY'
-            fh, self._temp_name = mkstemp()
+            
-            self.fo = open(self._temp_name, 'wb')
+            self.fo = StringIO()
+            # if this is to be a tempfile instead....
+            # it just makes crap in the tempdir
+            #fh, self._temp_name = mkstemp()
+            #self.fo = open(self._temp_name, 'wb')
 
         self._do_perform()
-        # close it up
-        self.fo.flush()
-        self.fo.close()
 
         if self.filename:
+            # close it up
+            self.fo.flush()
+            self.fo.close()
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
@@ -1766,7 +1800,8 @@ class PyCurlFileObject():
             # re open it
             self.fo = open(self.filename, 'r')
         else:
-            self.fo = open(self._temp_name, 'r')
+            #self.fo = open(self._temp_name, 'r')
+            self.fo.seek(0)
 
         self._complete = True
@@ -1838,6 +1873,7 @@ class PyCurlFileObject():
             downloaded += self._reget_length
             self.opts.progress_obj.update(downloaded)
 
+
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None: