python-urlgrabber/urlgrabber-HEAD.patch

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
 from munittest import *
 
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
 base_ftp  = 'ftp://localhost/test/'
 
 # set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..4797436 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
     (which can be set on default_grabber.throttle) is used. See
     BANDWIDTH THROTTLING for more information.
 
-  timeout = None
+  timeout = 300
 
-    a positive float expressing the number of seconds to wait for socket
-    operations. If the value is None or 0.0, socket operations will block
-    forever. Setting this option causes urlgrabber to call the settimeout
-    method on the Socket object used for the request. See the Python
-    documentation on settimeout for more information.
-    http://www.python.org/doc/current/lib/socket-objects.html
+    a positive integer expressing the number of seconds to wait before
+    timing out attempts to connect to a server. If the value is None
+    or 0, connection attempts will not time out. The timeout is passed
+    to the underlying pycurl object as its CONNECTTIMEOUT option, see
+    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
 
   bandwidth = 0
 
@@ -439,6 +439,12 @@ try:
 except:
     __version__ = '???'
 
+try:
+    # this part isn't going to do much - need to talk to gettext
+    from i18n import _
+except ImportError, msg:
+    def _(st): return st
+    
 ########################################################################
 # functions for debugging output.  These functions are here because they
 # are also part of the module initialization.
@@ -808,7 +814,7 @@ class URLGrabberOptions:
         self.prefix = None
         self.opener = None
         self.cache_openers = True
-        self.timeout = None
+        self.timeout = 300
         self.text = None
         self.http_headers = None
         self.ftp_headers = None
@@ -1052,9 +1058,15 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
-        self.size = None
+        self.size = 0
+        self._hdr_ended = False
         self._do_open()
         
+
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+            urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url
         
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
@@ -1085,9 +1097,14 @@ class PyCurlFileObject():
             return -1
             
     def _hdr_retrieve(self, buf):
+        if self._hdr_ended:
+            self._hdr_dump = ''
+            self.size = 0
+            self._hdr_ended = False
+
         if self._over_max_size(cur=len(self._hdr_dump), 
                                max_size=self.opts.max_header_size):
-            return -1            
+            return -1
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
@@ -1104,7 +1121,17 @@ class PyCurlFileObject():
                     s = parse150(buf)
                 if s:
                     self.size = int(s)
-            
+                    
+            if buf.lower().find('location') != -1:
+                location = ':'.join(buf.split(':')[1:])
+                location = location.strip()
+                self.scheme = urlparse.urlsplit(location)[0]
+                self.url = location
+                
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                self._hdr_ended = True
+                if DEBUG: DEBUG.info('header ended:')
+                
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
@@ -1113,8 +1140,10 @@ class PyCurlFileObject():
         if self._parsed_hdr:
             return self._parsed_hdr
         statusend = self._hdr_dump.find('\n')
+        statusend += 1 # ridiculous as it may seem.
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
+        hdrfp.seek(0)
         self._parsed_hdr =  mimetools.Message(hdrfp)
         return self._parsed_hdr
     
@@ -1136,6 +1165,7 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
         
         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
@@ -1148,9 +1178,11 @@ class PyCurlFileObject():
         
         # timeouts
         timeout = 300
-        if opts.timeout:
-            timeout = int(opts.timeout)
-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        if hasattr(opts, 'timeout'):
+            timeout = int(opts.timeout or 0)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
 
         # ssl options
         if self.scheme == 'https':
@@ -1276,7 +1308,7 @@ class PyCurlFileObject():
                 raise err
 
             elif errcode == 60:
-                msg = _("client cert cannot be verified or client cert incorrect")
+                msg = _("Peer cert cannot be verified or peer cert invalid")
                 err = URLGrabError(14, msg)
                 err.url = self.url
                 raise err
@@ -1291,7 +1323,12 @@ class PyCurlFileObject():
                 raise err
                     
             elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+                if self.scheme in ['http', 'https']:
+                    msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+                elif self.scheme in ['ftp']:
+                    msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
+                else:
+                    msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
             else:
                 msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
                 code = errcode
@@ -1299,6 +1336,12 @@ class PyCurlFileObject():
             err.code = code
             err.exception = e
             raise err
+        else:
+            if self._error[1]:
+                msg = self._error[1]
+                err = URLGrabError(14, msg)
+                err.url = self.url
+                raise err
 
     def _do_open(self):
         self.curl_obj = _curl_cache
@@ -1446,9 +1489,23 @@ class PyCurlFileObject():
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
-                os.utime(self.filename, (mod_time, mod_time))
+                try:
+                    os.utime(self.filename, (mod_time, mod_time))
+                except OSError, e:
+                    err = URLGrabError(16, _(\
+                      'error setting timestamp on file %s from %s, OSError: %s') 
+                              % (self.filename, self.url, e))
+                    err.url = self.url
+                    raise err
             # re open it
-            self.fo = open(self.filename, 'r')
+            try:
+                self.fo = open(self.filename, 'r')
+            except IOError, e:
+                err = URLGrabError(16, _(\
+                  'error opening file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+                
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
@@ -1532,11 +1589,14 @@ class PyCurlFileObject():
     def _over_max_size(self, cur, max_size=None):
 
         if not max_size:
-            max_size = self.size
-        if self.opts.size: # if we set an opts size use that, no matter what
-            max_size = self.opts.size
+            if not self.opts.size:
+                max_size = self.size
+            else:
+                max_size = self.opts.size
+
         if not max_size: return False # if we have None for all of the Max then this is dumb
-        if cur > max_size + max_size*.10:
+
+        if cur > int(float(max_size) * 1.10):
 
             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
@@ -1582,9 +1642,21 @@ class PyCurlFileObject():
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()
         
-
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+            urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url
+        
 _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
 
+def reset_curl_obj():
+    """To make sure curl has reread the network/dns info we force a reload"""
+    global _curl_cache
+    _curl_cache.close()
+    _curl_cache = pycurl.Curl()
+
+
+    
 
 #####################################################################
 # DEPRECATED FUNCTIONS
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):
     if seconds is None or seconds < 0:
         if use_hours: return '--:--:--'
         else:         return '--:--'
+    elif seconds == float('inf'):
+        return 'Infinite'
     else:
         seconds = int(seconds)
         minutes = seconds / 60
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`diff --git a/.gitignore b/.gitignore`
			`new file mode 100644`
			`index 0000000..1ffe416`
			`--- /dev/null`
			`+++ b/.gitignore`
			`@@ -0,0 +1,7 @@`
			`+*.py[co]`
			`+MANIFEST`
			`+dist`
			`+build`
			`+*.kdev*`
			`+*.kateproject`
			`+ipython.log*`
			`diff --git a/test/base_test_code.py b/test/base_test_code.py`
			`index 50c6348..5fb43f9 100644`
			`--- a/test/base_test_code.py`
			`+++ b/test/base_test_code.py`
			`@@ -1,6 +1,6 @@`
			`from munittest import *`

			`-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'`
			`+base_http = 'http://urlgrabber.baseurl.org/test/'`
			`base_ftp = 'ftp://localhost/test/'`

			`# set to a proftp server only. we're working around a couple of`
<sigh> actually check in the patch :( 16 years ago			`diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`index e090e90..4797436 100644`
<sigh> actually check in the patch :( 16 years ago			`--- a/urlgrabber/grabber.py`
			`+++ b/urlgrabber/grabber.py`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)`
			`(which can be set on default_grabber.throttle) is used. See`
			`BANDWIDTH THROTTLING for more information.`

			`- timeout = None`
			`+ timeout = 300`

			`- a positive float expressing the number of seconds to wait for socket`
			`- operations. If the value is None or 0.0, socket operations will block`
			`- forever. Setting this option causes urlgrabber to call the settimeout`
			`- method on the Socket object used for the request. See the Python`
			`- documentation on settimeout for more information.`
			`- http://www.python.org/doc/current/lib/socket-objects.html`
			`+ a positive integer expressing the number of seconds to wait before`
			`+ timing out attempts to connect to a server. If the value is None`
			`+ or 0, connection attempts will not time out. The timeout is passed`
			`+ to the underlying pycurl object as its CONNECTTIMEOUT option, see`
			`+ the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.`
			`+ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT`

			`bandwidth = 0`

pull latest HEAD patch for rawhide 15 years ago			`@@ -439,6 +439,12 @@ try:`
			`except:`
			`__version__ = '???'`

			`+try:`
			`+ # this part isn't going to do much - need to talk to gettext`
			`+ from i18n import _`
			`+except ImportError, msg:`
			`+ def _(st): return st`
			`+`
			`########################################################################`
			`# functions for debugging output. These functions are here because they`
			`# are also part of the module initialization.`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`@@ -808,7 +814,7 @@ class URLGrabberOptions:`
			`self.prefix = None`
			`self.opener = None`
			`self.cache_openers = True`
			`- self.timeout = None`
			`+ self.timeout = 300`
			`self.text = None`
			`self.http_headers = None`
			`self.ftp_headers = None`
			`@@ -1052,9 +1058,15 @@ class PyCurlFileObject():`
int/float multiplication CRAP! 16 years ago			`self._reget_length = 0`
			`self._prog_running = False`
			`self._error = (None, None)`
			`- self.size = None`
			`+ self.size = 0`
pull latest HEAD patch for rawhide 15 years ago			`+ self._hdr_ended = False`
int/float multiplication CRAP! 16 years ago			`self._do_open()`
<sigh> actually check in the patch :( 16 years ago
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`+`
			`+ def geturl(self):`
			`+ """ Provide the geturl() method, used to be got from`
			`+ urllib.addinfourl, via. urllib.URLopener.* """`
			`+ return self.url`
<sigh> actually check in the patch :( 16 years ago
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`def __getattr__(self, name):`
			`"""This effectively allows us to wrap at the instance level.`
			`@@ -1085,9 +1097,14 @@ class PyCurlFileObject():`
pull latest HEAD patch for rawhide 15 years ago			`return -1`

			`def _hdr_retrieve(self, buf):`
			`+ if self._hdr_ended:`
			`+ self._hdr_dump = ''`
			`+ self.size = 0`
			`+ self._hdr_ended = False`
			`+`
			`if self._over_max_size(cur=len(self._hdr_dump),`
			`max_size=self.opts.max_header_size):`
			`- return -1`
			`+ return -1`
			`try:`
			`self._hdr_dump += buf`
			`# we have to get the size before we do the progress obj start`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`@@ -1104,7 +1121,17 @@ class PyCurlFileObject():`
pull latest HEAD patch for rawhide 15 years ago			`s = parse150(buf)`
			`if s:`
			`self.size = int(s)`
			`-`
			`+`
			`+ if buf.lower().find('location') != -1:`
			`+ location = ':'.join(buf.split(':')[1:])`
			`+ location = location.strip()`
			`+ self.scheme = urlparse.urlsplit(location)[0]`
			`+ self.url = location`
			`+`
			`+ if len(self._hdr_dump) != 0 and buf == '\r\n':`
			`+ self._hdr_ended = True`
			`+ if DEBUG: DEBUG.info('header ended:')`
			`+`
			`return len(buf)`
			`except KeyboardInterrupt:`
			`return pycurl.READFUNC_ABORT`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`@@ -1113,8 +1140,10 @@ class PyCurlFileObject():`
			`if self._parsed_hdr:`
			`return self._parsed_hdr`
			`statusend = self._hdr_dump.find('\n')`
			`+ statusend += 1 # ridiculous as it may seem.`
			`hdrfp = StringIO()`
			`hdrfp.write(self._hdr_dump[statusend:])`
			`+ hdrfp.seek(0)`
			`self._parsed_hdr = mimetools.Message(hdrfp)`
			`return self._parsed_hdr`

			`@@ -1136,6 +1165,7 @@ class PyCurlFileObject():`
pull latest HEAD patch for rawhide 15 years ago			`self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)`
			`self.curl_obj.setopt(pycurl.FAILONERROR, True)`
			`self.curl_obj.setopt(pycurl.OPT_FILETIME, True)`
			`+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)`

			`if DEBUG:`
			`self.curl_obj.setopt(pycurl.VERBOSE, True)`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`@@ -1148,9 +1178,11 @@ class PyCurlFileObject():`

			`# timeouts`
			`timeout = 300`
			`- if opts.timeout:`
			`- timeout = int(opts.timeout)`
			`- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)`
			`+ if hasattr(opts, 'timeout'):`
			`+ timeout = int(opts.timeout or 0)`
			`+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)`
			`+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)`
			`+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)`

			`# ssl options`
			`if self.scheme == 'https':`
			`@@ -1276,7 +1308,7 @@ class PyCurlFileObject():`
			`raise err`

			`elif errcode == 60:`
			`- msg = _("client cert cannot be verified or client cert incorrect")`
			`+ msg = _("Peer cert cannot be verified or peer cert invalid")`
			`err = URLGrabError(14, msg)`
			`err.url = self.url`
			`raise err`
			`@@ -1291,7 +1323,12 @@ class PyCurlFileObject():`
pull latest HEAD patch for rawhide 15 years ago			`raise err`

			`elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it`
			`- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)`
			`+ if self.scheme in ['http', 'https']:`
			`+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)`
			`+ elif self.scheme in ['ftp']:`
			`+ msg = 'FTP Error %s : %s ' % (self.http_code, self.url)`
			`+ else:`
			`+ msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)`
			`else:`
			`msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))`
			`code = errcode`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`@@ -1299,6 +1336,12 @@ class PyCurlFileObject():`
int/float multiplication CRAP! 16 years ago			`err.code = code`
<sigh> actually check in the patch :( 16 years ago			`err.exception = e`
			`raise err`
int/float multiplication CRAP! 16 years ago			`+ else:`
			`+ if self._error[1]:`
			`+ msg = self._error[1]`
			`+ err = URLGrabError(14, msg)`
			`+ err.url = self.url`
			`+ raise err`

<sigh> actually check in the patch :( 16 years ago			`def _do_open(self):`
			`self.curl_obj = _curl_cache`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`@@ -1446,9 +1489,23 @@ class PyCurlFileObject():`
			`# set the time`
			`mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)`
			`if mod_time != -1:`
			`- os.utime(self.filename, (mod_time, mod_time))`
			`+ try:`
			`+ os.utime(self.filename, (mod_time, mod_time))`
			`+ except OSError, e:`
			`+ err = URLGrabError(16, _(\`
			`+ 'error setting timestamp on file %s from %s, OSError: %s')`
			`+ % (self.filename, self.url, e))`
			`+ err.url = self.url`
			`+ raise err`
			`# re open it`
			`- self.fo = open(self.filename, 'r')`
			`+ try:`
			`+ self.fo = open(self.filename, 'r')`
			`+ except IOError, e:`
			`+ err = URLGrabError(16, _(\`
			`+ 'error opening file from %s, IOError: %s') % (self.url, e))`
			`+ err.url = self.url`
			`+ raise err`
			`+`
			`else:`
			`#self.fo = open(self._temp_name, 'r')`
			`self.fo.seek(0)`
			`@@ -1532,11 +1589,14 @@ class PyCurlFileObject():`
pull latest HEAD patch for rawhide 15 years ago			`def _over_max_size(self, cur, max_size=None):`

			`if not max_size:`
			`- max_size = self.size`
			`- if self.opts.size: # if we set an opts size use that, no matter what`
			`- max_size = self.opts.size`
			`+ if not self.opts.size:`
			`+ max_size = self.size`
			`+ else:`
			`+ max_size = self.opts.size`
			`+`
int/float multiplication CRAP! 16 years ago			`if not max_size: return False # if we have None for all of the Max then this is dumb`
			`- if cur > max_size + max_size*.10:`
latest urlgrabber HEAD 16 years ago			`+`
int/float multiplication CRAP! 16 years ago			`+ if cur > int(float(max_size) * 1.10):`
latest urlgrabber HEAD 16 years ago
int/float multiplication CRAP! 16 years ago			`msg = _("Downloaded more than max size for %s: %s > %s") \`
			`% (self.url, cur, max_size)`
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`@@ -1582,9 +1642,21 @@ class PyCurlFileObject():`
pull latest HEAD patch for rawhide 15 years ago			`self.opts.progress_obj.end(self._amount_read)`
			`self.fo.close()`

			`-`
			`+ def geturl(self):`
			`+ """ Provide the geturl() method, used to be got from`
			`+ urllib.addinfourl, via. urllib.URLopener.* """`
			`+ return self.url`
			`+`
			`_curl_cache = pycurl.Curl() # make one and reuse it over and over and over`

- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`+def reset_curl_obj():`
			`+ """To make sure curl has reread the network/dns info we force a reload"""`
			`+ global _curl_cache`
			`+ _curl_cache.close()`
			`+ _curl_cache = pycurl.Curl()`
			`+`
			`+`
			`+`
pull latest HEAD patch for rawhide 15 years ago
- Update to upstream HEAD. - LOWSPEEDLIMIT and hdrs 15 years ago			`#####################################################################`
			`# DEPRECATED FUNCTIONS`
pull latest HEAD patch for rawhide 15 years ago			`diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py`
			`index dd07c6a..45eb248 100644`
			`--- a/urlgrabber/progress.py`
			`+++ b/urlgrabber/progress.py`
			`@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):`
			`if seconds is None or seconds < 0:`
			`if use_hours: return '--:--:--'`
			`else: return '--:--'`
			`+ elif seconds == float('inf'):`
			`+ return 'Infinite'`
			`else:`
			`seconds = int(seconds)`
			`minutes = seconds / 60`