python-urlgrabber/urlgrabber-HEAD.patch

diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index cf51dff..9692219 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -402,11 +402,11 @@ import urllib
 import urllib2
 import mimetools
 import thread
+import types
 from stat import *  # S_* and ST_*
 import pycurl
 from ftplib import parse150
 from StringIO import StringIO
-from tempfile import mkstemp

 ########################################################################
 #                     MODULE INITIALIZATION
@@ -467,6 +467,13 @@ except AttributeError:
     TimeoutError = None
     have_socket_timeout = False

+try:
+    import signal
+    from signal import SIGPIPE, SIG_IGN
+    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
+except ImportError:
+    pass
+
 ########################################################################
 # functions for debugging output.  These functions are here because they
 # are also part of the module initialization.
@@ -859,8 +866,15 @@ class URLGrabberOptions:
         self.data = None
         self.urlparser = URLParser()
         self.quote = None
-        self.ssl_ca_cert = None
-        self.ssl_context = None
+        self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
+        self.ssl_context = None # no-op in pycurl
+        self.ssl_verify_peer = True # check peer's cert for authenticity
+        self.ssl_verify_host = True # make sure who they are and who the cert is for matches
+        self.ssl_key = None # client key
+        self.ssl_key_type = 'PEM' #(or DER)
+        self.ssl_cert = None # client cert
+        self.ssl_cert_type = 'PEM' # (or DER)
+        self.ssl_key_pass = None # password to access the key

     def __repr__(self):
         return self.format()
@@ -1219,7 +1233,7 @@ class URLGrabberFileObject:
         self.append = 0
         reget_length = 0
         rt = None
-        if have_range and self.opts.reget and type(self.filename) == type(''):
+        if have_range and self.opts.reget and type(self.filename) in types.StringTypes:
             # we have reget turned on and we're dumping to a file
             try:
                 s = os.stat(self.filename)
@@ -1450,9 +1464,11 @@ class PyCurlFileObject():
         self.scheme = urlparse.urlsplit(self.url)[0]
         self.filename = filename
         self.append = False
+        self.reget_time = None
         self.opts = opts
+        if self.opts.reget == 'check_timestamp':
+            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
         self._complete = False
-        self.reget_time = None
         self._rbuf = ''
         self._rbufsize = 1024*8
         self._ttime = time.time()
@@ -1474,39 +1490,45 @@ class PyCurlFileObject():
         raise AttributeError, name

     def _retrieve(self, buf):
-        if not self._prog_running:
-            if self.opts.progress_obj:
-                size  = self.size + self._reget_length
-                self.opts.progress_obj.start(self._prog_reportname,
-                                             urllib.unquote(self.url),
-                                             self._prog_basename,
-                                             size=size,
-                                             text=self.opts.text)
-                self._prog_running = True
-                self.opts.progress_obj.update(self._amount_read)
-
-        self._amount_read += len(buf)
-        self.fo.write(buf)
-        return len(buf)
-
+        try:
+            meter = self.opts.progress_obj
+            if meter and not self._prog_running:
+                meter.start(self._prog_reportname,
+                            urllib.unquote(self.url),
+                            self._prog_basename,
+                            size=self.size + self._reget_length,
+                            text=self.opts.text)
+                self._prog_running = True
+                meter.update(self._amount_read)
+            # account for this chunk, then hand it to the output file object
+            nbytes = len(buf)
+            self._amount_read += nbytes
+            self.fo.write(buf)
+            return nbytes
+        except KeyboardInterrupt:
+            return pycurl.READFUNC_ABORT
+
     def _hdr_retrieve(self, buf):
-        self._hdr_dump += buf
-        # we have to get the size before we do the progress obj start
-        # but we can't do that w/o making it do 2 connects, which sucks
-        # so we cheat and stuff it in here in the hdr_retrieve
-        if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
-            length = buf.split(':')[1]
-            self.size = int(length)
-        elif self.scheme in ['ftp']:
-            s = None
-            if buf.startswith('213 '):
-                s = buf[3:].strip()
-            elif buf.startswith('150 '):
-                s = parse150(buf)
-            if s:
-                self.size = s
-
-        return len(buf)
+        try:
+            self._hdr_dump += buf
+            # the progress meter needs the size before it is started, and a
+            # second connect just to ask for it would be wasteful, so the
+            # size is scraped out of the headers as they come through here
+            if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+                length = buf.split(':')[1]
+                self.size = int(length)
+            elif self.scheme in ['ftp']:
+                s = None
+                if buf.startswith('213 '):
+                    s = buf[3:].strip()
+                elif buf.startswith('150 '):
+                    s = parse150(buf)
+                # 213 replies come in as text; only accept plain integers
+                if s and str(s).isdigit():
+                    self.size = int(s) # _retrieve adds this to _reget_length
+
+            return len(buf)
+        except KeyboardInterrupt:
+            return pycurl.READFUNC_ABORT

     def _return_hdr_obj(self):
         if self._parsed_hdr:
@@ -1528,11 +1550,13 @@ class PyCurlFileObject():


         # defaults we're always going to set
-        self.curl_obj.setopt(pycurl.NOPROGRESS, 0)
+        self.curl_obj.setopt(pycurl.NOPROGRESS, False)
+        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
         self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
         self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
-        self.curl_obj.setopt(pycurl.FAILONERROR, 1)
+        self.curl_obj.setopt(pycurl.FAILONERROR, True)
+        self.curl_obj.setopt(pycurl.OPT_FILETIME, True)

         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
@@ -1540,19 +1564,32 @@ class PyCurlFileObject():
             self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)

         # maybe to be options later
-        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, 1)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
         self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
-        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, 30)

         # timeouts
         timeout = 300
         if opts.timeout:
             timeout = int(opts.timeout)
-        self.curl_obj.setopt(pycurl.TIMEOUT, timeout)
+            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+
         # ssl options
         if self.scheme == 'https':
             if opts.ssl_ca_cert: # this may do ZERO with nss  according to curl docs
                 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
+                self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
+            self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
+            self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+            if opts.ssl_key:
+                self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
+            if opts.ssl_key_type:
+                self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
+            if opts.ssl_cert:
+                self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+            if opts.ssl_cert_type:
+                self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
+            if opts.ssl_key_pass:
+                self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)

         #headers:
         if opts.http_headers and self.scheme in ('http', 'https'):
@@ -1590,7 +1627,7 @@ class PyCurlFileObject():
         #posts - simple - expects the fields as they are
         if opts.data:
             self.curl_obj.setopt(pycurl.POST, True)
-            self.curl_obj.setopt(pycurl.POSTFIELDS, opts.data)
+            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))

         # our url
         self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1607,18 +1644,51 @@ class PyCurlFileObject():
             # to other URLGrabErrors from
             # http://curl.haxx.se/libcurl/c/libcurl-errors.html
             # this covers e.args[0] == 22 pretty well - which will be common
-            if str(e.args[1]) == '': # fake it until you make it
+            code = self.http_code
+            if e.args[0] == 23 and code >= 200 and code < 299:
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
+                err.url = self.url
+                # this is probably wrong but ultimately this is what happens
+                # we have a legit http code and a pycurl 'writer failed' code
+                # which almost always means something aborted it from outside
+                # since we cannot know what it is -I'm banking on it being
+                # a ctrl-c. XXXX - if there's a way of going back two raises to
+                # figure out what aborted the pycurl process FIXME
+                raise KeyboardInterrupt
+
+            elif e.args[0] == 28:
+                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+            elif e.args[0] == 35:
+                msg = _("problem making ssl connection")
+                err = URLGrabError(14, msg)
+                err.url = self.url
+                raise err
+
+            elif e.args[0] == 58:
+                msg = _("problem with the local client certificate")
+                err = URLGrabError(14, msg)
+                err.url = self.url
+                raise err
+
+            elif e.args[0] == 60:
+                msg = _("client cert cannot be verified or client cert incorrect")
+                err = URLGrabError(14, msg)
+                err.url = self.url
+                raise err
+
+            elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
                 msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
             else:
-                msg = str(e.args[1])
+                msg = 'PYCURL ERROR %s - "%s"' % (e.args[0], str(e.args[1]))
+                code = e.args[0]
             err = URLGrabError(14, msg)
-            err.code = self.http_code
+            err.code = code
             err.exception = e
             raise err
-
+
     def _do_open(self):
-        self.append = False
-        self.reget_time = None
         self.curl_obj = _curl_cache
         self.curl_obj.reset() # reset all old settings away, just in case
         # setup any ranges
@@ -1630,11 +1700,9 @@ class PyCurlFileObject():
         pass

     def _build_range(self):
-        self.reget_time = None
-        self.append = False
         reget_length = 0
         rt = None
-        if self.opts.reget and type(self.filename) == type(''):
+        if self.opts.reget and type(self.filename) in types.StringTypes:
             # we have reget turned on and we're dumping to a file
             try:
                 s = os.stat(self.filename)
@@ -1726,10 +1794,10 @@ class PyCurlFileObject():
         if self._complete:
             return

-        if self.filename:
+        if self.filename is not None:
             self._prog_reportname = str(self.filename)
             self._prog_basename = os.path.basename(self.filename)
-
+
             if self.append: mode = 'ab'
             else: mode = 'wb'

@@ -1746,19 +1814,23 @@ class PyCurlFileObject():
         else:
             self._prog_reportname = 'MEMORY'
             self._prog_basename = 'MEMORY'
-            fh, self._temp_name = mkstemp()
+

-            self.fo = open(self._temp_name, 'wb')
+            self.fo = StringIO()
+            # if this is to be a tempfile instead....
+            # it just makes crap in the tempdir
+            #fh, self._temp_name = mkstemp()
+            #self.fo = open(self._temp_name, 'wb')


         self._do_perform()


-        # close it up
-        self.fo.flush()
-        self.fo.close()

         if self.filename:
+            # close it up
+            self.fo.flush()
+            self.fo.close()
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
@@ -1766,7 +1838,8 @@ class PyCurlFileObject():
             # re open it
             self.fo = open(self.filename, 'r')
         else:
-            self.fo = open(self._temp_name, 'r')
+            #self.fo = open(self._temp_name, 'r')
+            self.fo.seek(0)

         self._complete = True

@@ -1838,6 +1911,13 @@ class PyCurlFileObject():
                 downloaded += self._reget_length
                 self.opts.progress_obj.update(downloaded)

+    def _to_utf8(self, obj, errors='replace'):
+        '''Return obj encoded as a utf-8 byte string when it is unicode.'''
+        # adapted from yum.i18n; non-unicode input is handed back untouched
+        if not isinstance(obj, unicode):
+            return obj
+        return obj.encode('utf-8', errors)
+
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None: