- Update to upstream HEAD.

- LOWSPEEDLIMIT and hdrs
epel9
James Antill 15 years ago
parent ce8535ddf5
commit c4bd7d23d9

@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 5%{?dist}
+Release: 6%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
@@ -43,10 +43,13 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 
 %changelog
-* Tue Apr 13 2010 James Antill <james@fedoraproject.org> 3.9.1-5
+* Tue Apr 13 2010 James Antill <james@fedoraproject.org> 3.9.1-6
 - Update to upstream HEAD.
 - LOWSPEEDLIMIT and hdrs
 
+* Fri Feb 19 2010 Seth Vidal <skvidal at fedoraproject.org> - 3.9.1-5
+- add patch to allow reset_curl_obj() to close and reload the cached curl obj
+
 * Thu Nov 12 2009 Seth Vidal <skvidal at fedoraproject.org> - 3.9.1-4
 - reset header values when we redirect and make sure debug output will work
 
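For context on the two new changelog items, the timeout option in this build is wired both to pycurl's connect timeout and to a minimum-transfer-rate check (see the grabber.py hunks below). A minimal, hypothetical usage sketch against the public urlgrabber API; the URL, filename and timeout value are placeholders, not anything mandated by this commit:

    # Hypothetical caller-side sketch: 'timeout' is documented below as the
    # connect timeout (CURLOPT_CONNECTTIMEOUT); with this update it also bounds
    # how long a transfer may crawl below 1 byte/s (LOW_SPEED_LIMIT/LOW_SPEED_TIME).
    import urlgrabber
    from urlgrabber.grabber import URLGrabError

    try:
        urlgrabber.urlgrab('http://example.com/some/file', 'some-file', timeout=30)
    except URLGrabError, e:
        print 'download failed: %s' % e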

@@ -1,28 +0,0 @@
diff -up urlgrabber-3.0.0/urlgrabber/grabber.py.cleanup urlgrabber-3.0.0/urlgrabber/grabber.py
--- urlgrabber-3.0.0/urlgrabber/grabber.py.cleanup 2007-11-29 10:25:13.000000000 +0000
+++ urlgrabber-3.0.0/urlgrabber/grabber.py 2007-11-29 10:26:15.000000000 +0000
@@ -1204,16 +1204,18 @@ class URLGrabberFileObject:
bs = 1024*8
size = 0
- if amount is not None: bs = min(bs, amount - size)
- block = self.read(bs)
- size = size + len(block)
- while block:
- new_fo.write(block)
+ try:
if amount is not None: bs = min(bs, amount - size)
block = self.read(bs)
size = size + len(block)
+ while block:
+ new_fo.write(block)
+ if amount is not None: bs = min(bs, amount - size)
+ block = self.read(bs)
+ size = size + len(block)
+ finally:
+ new_fo.close()
- new_fo.close()
try:
modified_tuple = self.hdr.getdate_tz('last-modified')
modified_stamp = rfc822.mktime_tz(modified_tuple)
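The patch file deleted above (now superseded by the regenerated urlgrabber-HEAD.patch) wrapped the block-copy loop in try/finally so the destination file object is closed even if a read fails part way through. A standalone sketch of that pattern; the function name and signature here are illustrative, not urlgrabber's own:

    def copy_in_blocks(src_fo, dst_fo, amount=None, bs=8192):
        """Copy up to 'amount' bytes from src_fo to dst_fo in bs-sized blocks,
        closing dst_fo even if a read or write fails mid-copy."""
        size = 0
        try:
            while True:
                if amount is not None:
                    bs = min(bs, amount - size)
                    if bs <= 0:
                        break
                block = src_fo.read(bs)
                if not block:
                    break
                dst_fo.write(block)
                size += len(block)
        finally:
            dst_fo.close()
        return size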

@@ -1,7 +1,54 @@
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
from munittest import *
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
base_ftp = 'ftp://localhost/test/'
# set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..a26880c 100644
index e090e90..4797436 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
- timeout = None
+ timeout = 300
- a positive float expressing the number of seconds to wait for socket
- operations. If the value is None or 0.0, socket operations will block
- forever. Setting this option causes urlgrabber to call the settimeout
- method on the Socket object used for the request. See the Python
- documentation on settimeout for more information.
- http://www.python.org/doc/current/lib/socket-objects.html
+ a positive integer expressing the number of seconds to wait before
+ timing out attempts to connect to a server. If the value is None
+ or 0, connection attempts will not time out. The timeout is passed
+ to the underlying pycurl object as its CONNECTTIMEOUT option, see
+ the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
bandwidth = 0
@@ -439,6 +439,12 @@ try:
except:
__version__ = '???'
@@ -15,7 +62,16 @@ index e090e90..a26880c 100644
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -1052,7 +1058,8 @@ class PyCurlFileObject():
@@ -808,7 +814,7 @@ class URLGrabberOptions:
self.prefix = None
self.opener = None
self.cache_openers = True
- self.timeout = None
+ self.timeout = 300
self.text = None
self.http_headers = None
self.ftp_headers = None
@@ -1052,9 +1058,15 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
@@ -24,8 +80,15 @@ index e090e90..a26880c 100644
+ self._hdr_ended = False
self._do_open()
+
+ def geturl(self):
+ """ Provide the geturl() method, used to be got from
+ urllib.addinfourl, via. urllib.URLopener.* """
+ return self.url
@@ -1085,9 +1092,14 @@ class PyCurlFileObject():
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
@@ -1085,9 +1097,14 @@ class PyCurlFileObject():
return -1
def _hdr_retrieve(self, buf):
@@ -41,7 +104,7 @@ index e090e90..a26880c 100644
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
@@ -1104,7 +1116,17 @@ class PyCurlFileObject():
@@ -1104,7 +1121,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
@@ -60,7 +123,18 @@ index e090e90..a26880c 100644
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1136,6 +1158,7 @@ class PyCurlFileObject():
@@ -1113,8 +1140,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
+ statusend += 1 # ridiculous as it may seem.
hdrfp = StringIO()
hdrfp.write(self._hdr_dump[statusend:])
+ hdrfp.seek(0)
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
@@ -1136,6 +1165,7 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -68,7 +142,31 @@ index e090e90..a26880c 100644
if DEBUG:
self.curl_obj.setopt(pycurl.VERBOSE, True)
@@ -1291,7 +1314,12 @@ class PyCurlFileObject():
@@ -1148,9 +1178,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
- if opts.timeout:
- timeout = int(opts.timeout)
- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ if hasattr(opts, 'timeout'):
+ timeout = int(opts.timeout or 0)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
if self.scheme == 'https':
@@ -1276,7 +1308,7 @@ class PyCurlFileObject():
raise err
elif errcode == 60:
- msg = _("client cert cannot be verified or client cert incorrect")
+ msg = _("Peer cert cannot be verified or peer cert invalid")
err = URLGrabError(14, msg)
err.url = self.url
raise err
@@ -1291,7 +1323,12 @@ class PyCurlFileObject():
raise err
elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
@@ -82,7 +180,7 @@ index e090e90..a26880c 100644
else:
msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
code = errcode
@@ -1299,6 +1327,12 @@ class PyCurlFileObject():
@@ -1299,6 +1336,12 @@ class PyCurlFileObject():
err.code = code
err.exception = e
raise err
@@ -95,7 +193,33 @@ index e090e90..a26880c 100644
def _do_open(self):
self.curl_obj = _curl_cache
@@ -1532,11 +1566,14 @@ class PyCurlFileObject():
@@ -1446,9 +1489,23 @@ class PyCurlFileObject():
# set the time
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
# re open it
- self.fo = open(self.filename, 'r')
+ try:
+ self.fo = open(self.filename, 'r')
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
@@ -1532,11 +1589,14 @@ class PyCurlFileObject():
def _over_max_size(self, cur, max_size=None):
if not max_size:
@@ -114,7 +238,7 @@ index e090e90..a26880c 100644
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
@@ -1582,7 +1619,11 @@ class PyCurlFileObject():
@@ -1582,9 +1642,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
@ -126,7 +250,17 @@ index e090e90..a26880c 100644
+ +
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+def reset_curl_obj():
+ """To make sure curl has reread the network/dns info we force a reload"""
+ global _curl_cache
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
+
+
+
#####################################################################
# DEPRECATED FUNCTIONS
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644
--- a/urlgrabber/progress.py
@@ -140,43 +274,3 @@ index dd07c6a..45eb248 100644
else:
seconds = int(seconds)
minutes = seconds / 60
commit e85f27c43f991469db38bad97735ce2c0f7d075d
Author: Seth Vidal <skvidal@fedoraproject.org>
Date: Mon Mar 15 22:50:21 2010 -0400
make sure we're properly reading the hdrs and returning them
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index 16bb1d2..ac5ae18 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -1135,8 +1135,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
+ statusend += 1 # ridiculous as it may seem.
hdrfp = StringIO()
hdrfp.write(self._hdr_dump[statusend:])
+ hdrfp.seek(0)
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
commit 8e57ad3fbf14c55434eab5c04c4e00ba4f5986f9
Author: James Antill <james@and.org>
Date: Mon Mar 1 11:48:00 2010 -0500
Implement connection established timeout using, LOW_SPEED_LIMIT
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e63d4bb..bd4da75 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -1179,6 +1179,8 @@ class PyCurlFileObject():
if hasattr(opts, 'timeout'):
timeout = int(opts.timeout or 0)
self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
if self.scheme == 'https':
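Taken together, the timeout hunks above make one value drive two libcurl knobs: CURLOPT_CONNECTTIMEOUT for connection setup, and LOW_SPEED_LIMIT/LOW_SPEED_TIME to abort transfers that stay below 1 byte/s for that many seconds. A minimal standalone pycurl sketch of the same idea; the URL and the 30-second value are placeholders (grabber.py itself defaults to 300):

    import pycurl
    from StringIO import StringIO   # Python 2, matching the code above

    buf = StringIO()
    timeout = 30                    # placeholder value

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/some/file')
    c.setopt(pycurl.WRITEFUNCTION, buf.write)
    # fail if the connection is not established within 'timeout' seconds
    c.setopt(pycurl.CONNECTTIMEOUT, timeout)
    # abort if the transfer runs slower than 1 byte/s for 'timeout' seconds
    c.setopt(pycurl.LOW_SPEED_LIMIT, 1)
    c.setopt(pycurl.LOW_SPEED_TIME, timeout)
    try:
        c.perform()
    finally:
        c.close()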

@@ -1,15 +0,0 @@
--- a/urlgrabber/grabber.py 2010-02-19 14:50:45.000000000 -0500
+++ b/urlgrabber/grabber.py 2010-02-19 14:51:28.000000000 -0500
@@ -1626,6 +1626,12 @@
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+def reset_curl_obj():
+ """To make sure curl has reread the network/dns info we force a reload"""
+ global _curl_cache
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
+
#####################################################################
# DEPRECATED FUNCTIONS
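This standalone patch is dropped because the regenerated urlgrabber-HEAD.patch above carries the same reset_curl_obj() helper. Since grabber.py reuses one cached pycurl.Curl handle, a long-running caller that changes networks or resolv.conf can force fresh connection/DNS state roughly like this; the URL is a placeholder and the calling sequence is only a hypothetical sketch:

    from urlgrabber import grabber

    # ... network configuration changed (VPN up/down, new resolv.conf, etc.) ...

    # drop the cached pycurl handle so the next download starts from a clean slate
    grabber.reset_curl_obj()
    fo = grabber.urlopen('http://example.com/repodata/repomd.xml')  # placeholder URL
    try:
        data = fo.read()
    finally:
        fo.close()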