latest urlgrabber HEAD

branch: epel9
Seth Vidal 16 years ago
parent 891a468499
commit b78a1772ac

python-urlgrabber.spec

@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.0
-Release: 6%{?dist}
+Release: 7%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch0: urlgrabber-HEAD.patch
@@ -44,6 +44,9 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 
 %changelog
+* Mon Aug 10 2009 Seth Vidal <skvidal at fedoraproject.org> - 3.9.0-6
+- reget fixes, tmpfiles no longer made for urlopen() calls.
+
 * Wed Aug 5 2009 Seth Vidal <skvidal at fedoraproject.org> - 3.9.0-5
 - apply complete patch to head fixes: timeouts, regets, improves exception raising

urlgrabber-HEAD.patch

@@ -1,8 +1,8 @@
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index cf51dff..3758799 100644
+index cf51dff..cea47e3 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
-@@ -402,6 +402,7 @@ import urllib
+@@ -402,11 +402,11 @@ import urllib
  import urllib2
  import mimetools
  import thread
@@ -10,7 +10,26 @@ index cf51dff..3758799 100644
  from stat import * # S_* and ST_*
  import pycurl
  from ftplib import parse150
-@@ -1219,7 +1220,7 @@ class URLGrabberFileObject:
++from StringIO import StringIO
+-from tempfile import mkstemp
+ 
+ ########################################################################
+ # MODULE INITIALIZATION
+@@ -467,6 +467,13 @@ except AttributeError:
+     TimeoutError = None
+     have_socket_timeout = False
+ 
++try:
++    import signal
++    from signal import SIGPIPE, SIG_IGN
++    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
++except ImportError:
++    pass
++
+ ########################################################################
+ # functions for debugging output. These functions are here because they
+ # are also part of the module initialization.
+@@ -1219,7 +1226,7 @@ class URLGrabberFileObject:
          self.append = 0
          reget_length = 0
          rt = None
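A note on the SIGPIPE hunk added above (a hedged reading, not authoritative): when a server drops a keepalive connection, a later write can raise SIGPIPE, whose default action kills the whole process; ignoring it at import time lets the failure surface as an ordinary write error instead. Standalone, the same idea looks like:

    try:
        # turn fatal SIGPIPE into a plain EPIPE write error
        import signal
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass   # no signal module on this platform (mirrors the patch)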
@@ -19,20 +38,104 @@ index cf51dff..3758799 100644
          # we have reget turned on and we're dumping to a file
          try:
              s = os.stat(self.filename)
-@@ -1450,6 +1451,7 @@ class PyCurlFileObject():
+@@ -1450,9 +1457,11 @@ class PyCurlFileObject():
          self.scheme = urlparse.urlsplit(self.url)[0]
          self.filename = filename
          self.append = False
 +        self.reget_time = None
          self.opts = opts
++        if self.opts.reget == 'check_timestamp':
++            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
          self._complete = False
-         self.reget_time = None
+-        self.reget_time = None
-@@ -1528,11 +1530,12 @@ class PyCurlFileObject():
+         self._rbuf = ''
+         self._rbufsize = 1024*8
+         self._ttime = time.time()
+@@ -1474,39 +1483,45 @@ class PyCurlFileObject():
+         raise AttributeError, name
+ 
+     def _retrieve(self, buf):
+-        if not self._prog_running:
+-            if self.opts.progress_obj:
+-                size = self.size + self._reget_length
+-                self.opts.progress_obj.start(self._prog_reportname,
+-                                             urllib.unquote(self.url),
+-                                             self._prog_basename,
+-                                             size=size,
+-                                             text=self.opts.text)
+-                self._prog_running = True
+-                self.opts.progress_obj.update(self._amount_read)
+-
+-        self._amount_read += len(buf)
+-        self.fo.write(buf)
+-        return len(buf)
+-
++        try:
++            if not self._prog_running:
++                if self.opts.progress_obj:
++                    size = self.size + self._reget_length
++                    self.opts.progress_obj.start(self._prog_reportname,
++                                                 urllib.unquote(self.url),
++                                                 self._prog_basename,
++                                                 size=size,
++                                                 text=self.opts.text)
++                    self._prog_running = True
++                    self.opts.progress_obj.update(self._amount_read)
++
++            self._amount_read += len(buf)
++            self.fo.write(buf)
++            return len(buf)
++        except KeyboardInterrupt:
++            return pycurl.READFUNC_ABORT
++
+     def _hdr_retrieve(self, buf):
+-        self._hdr_dump += buf
+-        # we have to get the size before we do the progress obj start
+-        # but we can't do that w/o making it do 2 connects, which sucks
+-        # so we cheat and stuff it in here in the hdr_retrieve
+-        if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+-            length = buf.split(':')[1]
+-            self.size = int(length)
+-        elif self.scheme in ['ftp']:
+-            s = None
+-            if buf.startswith('213 '):
+-                s = buf[3:].strip()
+-            elif buf.startswith('150 '):
+-                s = parse150(buf)
+-            if s:
+-                self.size = s
+-
+-        return len(buf)
++        try:
++            self._hdr_dump += buf
++            # we have to get the size before we do the progress obj start
++            # but we can't do that w/o making it do 2 connects, which sucks
++            # so we cheat and stuff it in here in the hdr_retrieve
++            if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
++                length = buf.split(':')[1]
++                self.size = int(length)
++            elif self.scheme in ['ftp']:
++                s = None
++                if buf.startswith('213 '):
++                    s = buf[3:].strip()
++                elif buf.startswith('150 '):
++                    s = parse150(buf)
++                if s:
++                    self.size = s
++
++            return len(buf)
++        except KeyboardInterrupt:
++            return pycurl.READFUNC_ABORT
+ 
+     def _return_hdr_obj(self):
+         if self._parsed_hdr:
+@@ -1528,11 +1543,13 @@ class PyCurlFileObject():
          # defaults we're always going to set
 -        self.curl_obj.setopt(pycurl.NOPROGRESS, 0)
 +        self.curl_obj.setopt(pycurl.NOPROGRESS, False)
++        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
          self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
          self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
          self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
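Why the callbacks now swallow KeyboardInterrupt (a hedged reading of the hunk above): pycurl invokes WRITEFUNCTION and HEADERFUNCTION from inside libcurl's C loop, where a propagating Python exception cannot unwind safely; returning something other than the number of bytes consumed is the supported way to abort the transfer, after which libcurl fails with error 23, 'write error'. A minimal standalone sketch of the pattern (the sink object and URL are placeholders):

    import pycurl
    from StringIO import StringIO

    sink = StringIO()

    def write_cb(buf):
        try:
            sink.write(buf)
            return len(buf)   # chunk fully consumed, keep going
        except KeyboardInterrupt:
            # a short return aborts the transfer; libcurl reports
            # CURLE_WRITE_ERROR (23), which the caller can map back
            return pycurl.READFUNC_ABORT

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/')
    c.setopt(pycurl.WRITEFUNCTION, write_cb)
    c.setopt(pycurl.NOSIGNAL, True)   # also set by this patch; avoids
                                      # signal-based timeouts in threads
    c.perform()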
@@ -42,7 +145,7 @@ index cf51dff..3758799 100644
          if DEBUG:
              self.curl_obj.setopt(pycurl.VERBOSE, True)
-@@ -1540,15 +1543,15 @@ class PyCurlFileObject():
+@@ -1540,15 +1557,15 @@ class PyCurlFileObject():
          self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
          # maybe to be options later
@@ -61,15 +164,27 @@ index cf51dff..3758799 100644
          # ssl options
          if self.scheme == 'https':
              if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
-@@ -1607,18 +1610,21 @@ class PyCurlFileObject():
+@@ -1607,18 +1624,33 @@ class PyCurlFileObject():
              # to other URLGrabErrors from
              # http://curl.haxx.se/libcurl/c/libcurl-errors.html
              # this covers e.args[0] == 22 pretty well - which will be common
++            code = self.http_code
 +            if e.args[0] == 28:
 +                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
 +                err.url = self.url
 +                raise err
-+            code = self.http_code
++
++            elif e.args[0] == 23 and code >= 200 and code < 299:
++                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
++                err.url = self.url
++                # this is probably wrong but ultimately this is what happens
++                # we have a legit http code and a pycurl 'writer failed' code
++                # which almost always means something aborted it from outside
++                # since we cannot know what it is -I'm banking on it being
++                # a ctrl-c. XXXX - if there's a way of going back two raises to
++                # figure out what aborted the pycurl process FIXME
++                raise KeyboardInterrupt
++
              if str(e.args[1]) == '': # fake it until you make it
                  msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
              else:
@@ -79,14 +194,15 @@ index cf51dff..3758799 100644
 +            err.code = code
              err.exception = e
              raise err
--
++
      def _do_open(self):
 -        self.append = False
 -        self.reget_time = None
          self.curl_obj = _curl_cache
          self.curl_obj.reset() # reset all old settings away, just in case
          # setup any ranges
-@@ -1630,11 +1636,9 @@ class PyCurlFileObject():
+@@ -1630,11 +1662,9 @@ class PyCurlFileObject():
              pass
 
      def _build_range(self):
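The error-mapping hunk above distinguishes two libcurl failures (codes per the libcurl-errors page cited in the patch): 28, CURLE_OPERATION_TIMEDOUT, becomes URLGrabError errno 12, while 23, CURLE_WRITE_ERROR, combined with a 2xx HTTP status is re-raised as KeyboardInterrupt, on the theory that a writer failing mid-successful-response was aborted by our own callback after a ctrl-c. From the caller's side (a hedged sketch; URL and filename are placeholders):

    from urlgrabber.grabber import URLGrabber, URLGrabError

    try:
        URLGrabber(timeout=5.0).urlgrab('http://example.com/slow.bin', 'slow.bin')
    except URLGrabError, e:
        if e.errno == 12:
            print 'timed out:', e   # curl error 28, mapped by the patch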
@@ -99,7 +215,7 @@ index cf51dff..3758799 100644
          # we have reget turned on and we're dumping to a file
          try:
              s = os.stat(self.filename)
-@@ -1729,7 +1733,7 @@ class PyCurlFileObject():
+@@ -1729,7 +1759,7 @@ class PyCurlFileObject():
          if self.filename:
              self._prog_reportname = str(self.filename)
              self._prog_basename = os.path.basename(self.filename)
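On the reget side of the changelog (a hedged usage sketch; URL and filename are placeholders): reget_time is now initialized once in __init__, and 'check_timestamp' regets raise NotImplementedError up front instead of failing obscurely; only 'simple' regets, which resume from the current size of the partial file, work in this version:

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(reget='simple')
    # opens the local file in append mode and fetches only the missing tail
    g.urlgrab('http://example.com/big.iso', 'big.iso')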
@@ -108,3 +224,50 @@ index cf51dff..3758799 100644
          if self.append: mode = 'ab'
          else: mode = 'wb'
+@@ -1746,19 +1776,23 @@ class PyCurlFileObject():
+         else:
+             self._prog_reportname = 'MEMORY'
+             self._prog_basename = 'MEMORY'
+-            fh, self._temp_name = mkstemp()
++
+-            self.fo = open(self._temp_name, 'wb')
++            self.fo = StringIO()
++            # if this is to be a tempfile instead....
++            # it just makes crap in the tempdir
++            #fh, self._temp_name = mkstemp()
++            #self.fo = open(self._temp_name, 'wb')
+         self._do_perform()
+-        # close it up
+-        self.fo.flush()
+-        self.fo.close()
+         if self.filename:
++            # close it up
++            self.fo.flush()
++            self.fo.close()
+             # set the time
+             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+             if mod_time != -1:
+@@ -1766,7 +1800,8 @@ class PyCurlFileObject():
+             # re open it
+             self.fo = open(self.filename, 'r')
+         else:
+-            self.fo = open(self._temp_name, 'r')
++            #self.fo = open(self._temp_name, 'r')
++            self.fo.seek(0)
+ 
+         self._complete = True
+@@ -1838,6 +1873,7 @@ class PyCurlFileObject():
+             downloaded += self._reget_length
+             self.opts.progress_obj.update(downloaded)
++
+     def read(self, amt=None):
+         self._fill_buffer(amt)
+         if amt is None:
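The final hunks carry the tmpfile fix itself (a hedged reading): for filename-less grabs the download is written into a StringIO buffer, the mkstemp() path survives only as a comment, flush/close now happen only when a real file was opened, and the in-memory case simply rewinds so read() serves from the buffer. The caller-visible file-like contract is unchanged:

    from StringIO import StringIO

    fo = StringIO()
    fo.write('chunk one ')   # what each _retrieve() callback does
    fo.write('chunk two')
    fo.seek(0)               # rewind in place of re-opening a temp file
    assert fo.read() == 'chunk one chunk two'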
