- Update to upstream HEAD.

- LOWSPEEDLIMIT and hdrs
15 years ago · c4bd7d23d9
parent ce8535ddf5
commit c4bd7d23d9
4 changed files with 148 additions and 94 deletions
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 5%{?dist}
+Release: 6%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch

@@ -43,10 +43,13 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber

 %changelog
-* Tue Apr 13 2010 James Antill <james@fedoraproject.org> 3.9.1-5
+* Tue Apr 13 2010 James Antill <james@fedoraproject.org> 3.9.1-6
 - Update to upstream HEAD.
 - LOWSPEEDLIMIT and hdrs

+* Fri Feb 19 2010 Seth Vidal <skvidal at fedoraproject.org> - 3.9.1-5
+- add patch to allow reset_curl_obj() to close and reload the cached curl obj
+
 * Thu Nov 12 2009 Seth Vidal <skvidal at fedoraproject.org> - 3.9.1-4
 - reset header values when we redirect and make sure debug output will work

--- a/urlgrabber-3.0.0-cleanup.patch
+++ b/urlgrabber-3.0.0-cleanup.patch
@@ -1,28 +0,0 @@
-diff -up urlgrabber-3.0.0/urlgrabber/grabber.py.cleanup urlgrabber-3.0.0/urlgrabber/grabber.py
---- urlgrabber-3.0.0/urlgrabber/grabber.py.cleanup	2007-11-29 10:25:13.000000000 +0000
-+++ urlgrabber-3.0.0/urlgrabber/grabber.py	2007-11-29 10:26:15.000000000 +0000
-@@ -1204,16 +1204,18 @@ class URLGrabberFileObject:
-         bs = 1024*8
-         size = 0
- 
--        if amount is not None: bs = min(bs, amount - size)
--        block = self.read(bs)
--        size = size + len(block)
--        while block:
--            new_fo.write(block)
-+        try:
-             if amount is not None: bs = min(bs, amount - size)
-             block = self.read(bs)
-             size = size + len(block)
-+            while block:
-+                new_fo.write(block)
-+                if amount is not None: bs = min(bs, amount - size)
-+                block = self.read(bs)
-+                size = size + len(block)
-+        finally:
-+            new_fo.close()
- 
--        new_fo.close()
-         try:
-             modified_tuple  = self.hdr.getdate_tz('last-modified')
-             modified_stamp  = rfc822.mktime_tz(modified_tuple)
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@@ -1,7 +1,54 @@
+diff --git a/.gitignore b/.gitignore
+new file mode 100644
+index 0000000..1ffe416
+--- /dev/null
++++ b/.gitignore
+@@ -0,0 +1,7 @@
++*.py[co]
++MANIFEST
++dist
++build
++*.kdev*
++*.kateproject
++ipython.log*
+diff --git a/test/base_test_code.py b/test/base_test_code.py
+index 50c6348..5fb43f9 100644
+--- a/test/base_test_code.py
++++ b/test/base_test_code.py
+@@ -1,6 +1,6 @@
+ from munittest import *
+ 
+-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
++base_http = 'http://urlgrabber.baseurl.org/test/'
+ base_ftp  = 'ftp://localhost/test/'
+ 
+ # set to a proftp server only. we're working around a couple of
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..a26880c 100644
+index e090e90..4797436 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
+@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
+     (which can be set on default_grabber.throttle) is used. See
+     BANDWIDTH THROTTLING for more information.
+ 
+-  timeout = None
++  timeout = 300
+ 
+-    a positive float expressing the number of seconds to wait for socket
+-    operations. If the value is None or 0.0, socket operations will block
+-    forever. Setting this option causes urlgrabber to call the settimeout
+-    method on the Socket object used for the request. See the Python
+-    documentation on settimeout for more information.
+-    http://www.python.org/doc/current/lib/socket-objects.html
++    a positive integer expressing the number of seconds to wait before
++    timing out attempts to connect to a server. If the value is None
++    or 0, connection attempts will not time out. The timeout is passed
++    to the underlying pycurl object as its CONNECTTIMEOUT option, see
++    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
++    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
+ 
+   bandwidth = 0
+ 
@@ -439,6 +439,12 @@ try:
 except:
     __version__ = '???'
@@ -15,7 +62,16 @@ index e090e90..a26880c 100644
 ########################################################################
 # functions for debugging output.  These functions are here because they
 # are also part of the module initialization.
-@@ -1052,7 +1058,8 @@ class PyCurlFileObject():
+@@ -808,7 +814,7 @@ class URLGrabberOptions:
+         self.prefix = None
+         self.opener = None
+         self.cache_openers = True
+-        self.timeout = None
++        self.timeout = 300
+         self.text = None
+         self.http_headers = None
+         self.ftp_headers = None
+@@ -1052,9 +1058,15 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
@@ -24,8 +80,15 @@ index e090e90..a26880c 100644
 +        self._hdr_ended = False
         self._do_open()
         
++
++    def geturl(self):
++        """ Provide the geturl() method, used to be got from
++            urllib.addinfourl, via. urllib.URLopener.* """
++        return self.url
         
-@@ -1085,9 +1092,14 @@ class PyCurlFileObject():
+     def __getattr__(self, name):
+         """This effectively allows us to wrap at the instance level.
+@@ -1085,9 +1097,14 @@ class PyCurlFileObject():
             return -1
             
     def _hdr_retrieve(self, buf):
@@ -41,7 +104,7 @@ index e090e90..a26880c 100644
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
-@@ -1104,7 +1116,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1121,17 @@ class PyCurlFileObject():
                     s = parse150(buf)
                 if s:
                     self.size = int(s)
@@ -60,7 +123,18 @@ index e090e90..a26880c 100644
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
-@@ -1136,6 +1158,7 @@ class PyCurlFileObject():
+@@ -1113,8 +1140,10 @@ class PyCurlFileObject():
+         if self._parsed_hdr:
+             return self._parsed_hdr
+         statusend = self._hdr_dump.find('\n')
++        statusend += 1 # ridiculous as it may seem.
+         hdrfp = StringIO()
+         hdrfp.write(self._hdr_dump[statusend:])
++        hdrfp.seek(0)
+         self._parsed_hdr =  mimetools.Message(hdrfp)
+         return self._parsed_hdr
+     
+@@ -1136,6 +1165,7 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -68,7 +142,31 @@ index e090e90..a26880c 100644
         
         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
-@@ -1291,7 +1314,12 @@ class PyCurlFileObject():
+@@ -1148,9 +1178,11 @@ class PyCurlFileObject():
+         
+         # timeouts
+         timeout = 300
+-        if opts.timeout:
+-            timeout = int(opts.timeout)
+-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
++        if hasattr(opts, 'timeout'):
++            timeout = int(opts.timeout or 0)
++        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
++        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
++        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
+ 
+         # ssl options
+         if self.scheme == 'https':
+@@ -1276,7 +1308,7 @@ class PyCurlFileObject():
+                 raise err
+ 
+             elif errcode == 60:
+-                msg = _("client cert cannot be verified or client cert incorrect")
++                msg = _("Peer cert cannot be verified or peer cert invalid")
+                 err = URLGrabError(14, msg)
+                 err.url = self.url
+                 raise err
+@@ -1291,7 +1323,12 @@ class PyCurlFileObject():
                 raise err
                     
             elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
@@ -82,7 +180,7 @@ index e090e90..a26880c 100644
             else:
                 msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
                 code = errcode
-@@ -1299,6 +1327,12 @@ class PyCurlFileObject():
+@@ -1299,6 +1336,12 @@ class PyCurlFileObject():
             err.code = code
             err.exception = e
             raise err
@@ -95,7 +193,33 @@ index e090e90..a26880c 100644
 
     def _do_open(self):
         self.curl_obj = _curl_cache
-@@ -1532,11 +1566,14 @@ class PyCurlFileObject():
+@@ -1446,9 +1489,23 @@ class PyCurlFileObject():
+             # set the time
+             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+             if mod_time != -1:
+-                os.utime(self.filename, (mod_time, mod_time))
++                try:
++                    os.utime(self.filename, (mod_time, mod_time))
++                except OSError, e:
++                    err = URLGrabError(16, _(\
++                      'error setting timestamp on file %s from %s, OSError: %s') 
++                              % (self.filenameself.url, e))
++                    err.url = self.url
++                    raise err
+             # re open it
+-            self.fo = open(self.filename, 'r')
++            try:
++                self.fo = open(self.filename, 'r')
++            except IOError, e:
++                err = URLGrabError(16, _(\
++                  'error opening file from %s, IOError: %s') % (self.url, e))
++                err.url = self.url
++                raise err
++                
+         else:
+             #self.fo = open(self._temp_name, 'r')
+             self.fo.seek(0)
+@@ -1532,11 +1589,14 @@ class PyCurlFileObject():
     def _over_max_size(self, cur, max_size=None):
 
         if not max_size:
@@ -114,7 +238,7 @@ index e090e90..a26880c 100644
 
             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
-@@ -1582,7 +1619,11 @@ class PyCurlFileObject():
+@@ -1582,9 +1642,21 @@ class PyCurlFileObject():
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()
         
@@ -126,7 +250,17 @@ index e090e90..a26880c 100644
 +        
 _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
 
++def reset_curl_obj():
++    """To make sure curl has reread the network/dns info we force a reload"""
++    global _curl_cache
++    _curl_cache.close()
++    _curl_cache = pycurl.Curl()
++
++
++    
 
+ #####################################################################
+ # DEPRECATED FUNCTIONS
 diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
 index dd07c6a..45eb248 100644
 --- a/urlgrabber/progress.py
@@ -140,43 +274,3 @@ index dd07c6a..45eb248 100644
     else:
         seconds = int(seconds)
         minutes = seconds / 60
-commit e85f27c43f991469db38bad97735ce2c0f7d075d
-Author: Seth Vidal <skvidal@fedoraproject.org>
-Date:   Mon Mar 15 22:50:21 2010 -0400
-
-    make sure we're properly reading the hdrs and returning them
-
-diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index 16bb1d2..ac5ae18 100644
---- a/urlgrabber/grabber.py
-+++ b/urlgrabber/grabber.py
-@@ -1135,8 +1135,10 @@ class PyCurlFileObject():
-         if self._parsed_hdr:
-             return self._parsed_hdr
-         statusend = self._hdr_dump.find('\n')
-+        statusend += 1 # ridiculous as it may seem.
-         hdrfp = StringIO()
-         hdrfp.write(self._hdr_dump[statusend:])
-+        hdrfp.seek(0)
-         self._parsed_hdr =  mimetools.Message(hdrfp)
-         return self._parsed_hdr
-     
-commit 8e57ad3fbf14c55434eab5c04c4e00ba4f5986f9
-Author: James Antill <james@and.org>
-Date:   Mon Mar 1 11:48:00 2010 -0500
-
-    Implement connection established timeout using, LOW_SPEED_LIMIT
-
-diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e63d4bb..bd4da75 100644
---- a/urlgrabber/grabber.py
-+++ b/urlgrabber/grabber.py
-@@ -1179,6 +1179,8 @@ class PyCurlFileObject():
-         if hasattr(opts, 'timeout'):
-             timeout = int(opts.timeout or 0)
-         self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
-+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
-+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
- 
-         # ssl options
-         if self.scheme == 'https':
--- a/urlgrabber-reset.patch
+++ b/urlgrabber-reset.patch
@@ -1,15 +0,0 @@
---- a/urlgrabber/grabber.py	2010-02-19 14:50:45.000000000 -0500
-+++ b/urlgrabber/grabber.py	2010-02-19 14:51:28.000000000 -0500
-@@ -1626,6 +1626,12 @@
-         
- _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
- 
-+def reset_curl_obj():
-+    """To make sure curl has reread the network/dns info we force a reload"""
-+    global _curl_cache
-+    _curl_cache.close()
-+    _curl_cache = pycurl.Curl()
-+
- 
- #####################################################################
- # DEPRECATED FUNCTIONS