diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec
index 592872e..7750e2f 100644
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 5%{?dist}
+Release: 6%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
@@ -43,10 +43,13 @@
 rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 
 %changelog
-* Tue Apr 13 2010 James Antill 3.9.1-5
+* Tue Apr 13 2010 James Antill 3.9.1-6
 - Update to upstream HEAD.
 - LOWSPEEDLIMIT and hdrs
 
+* Fri Feb 19 2010 Seth Vidal - 3.9.1-5
+- add patch to allow reset_curl_obj() to close and reload the cached curl obj
+
 * Thu Nov 12 2009 Seth Vidal - 3.9.1-4
 - reset header values when we redirect and make sure debug output will work
diff --git a/urlgrabber-3.0.0-cleanup.patch b/urlgrabber-3.0.0-cleanup.patch
deleted file mode 100644
index 7a1ee05..0000000
--- a/urlgrabber-3.0.0-cleanup.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-diff -up urlgrabber-3.0.0/urlgrabber/grabber.py.cleanup urlgrabber-3.0.0/urlgrabber/grabber.py
---- urlgrabber-3.0.0/urlgrabber/grabber.py.cleanup	2007-11-29 10:25:13.000000000 +0000
-+++ urlgrabber-3.0.0/urlgrabber/grabber.py	2007-11-29 10:26:15.000000000 +0000
-@@ -1204,16 +1204,18 @@ class URLGrabberFileObject:
-         bs = 1024*8
-         size = 0
- 
--        if amount is not None: bs = min(bs, amount - size)
--        block = self.read(bs)
--        size = size + len(block)
--        while block:
--            new_fo.write(block)
-+        try:
-             if amount is not None: bs = min(bs, amount - size)
-             block = self.read(bs)
-             size = size + len(block)
-+            while block:
-+                new_fo.write(block)
-+                if amount is not None: bs = min(bs, amount - size)
-+                block = self.read(bs)
-+                size = size + len(block)
-+        finally:
-+            new_fo.close()
- 
--        new_fo.close()
-         try:
-             modified_tuple  = self.hdr.getdate_tz('last-modified')
-             modified_stamp  = rfc822.mktime_tz(modified_tuple)
diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch
index d2c6486..885f3a1 100644
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@@ -1,7 +1,54 @@
+diff --git a/.gitignore b/.gitignore
+new file mode 100644
+index 0000000..1ffe416
+--- /dev/null
++++ b/.gitignore
+@@ -0,0 +1,7 @@
++*.py[co]
++MANIFEST
++dist
++build
++*.kdev*
++*.kateproject
++ipython.log*
+diff --git a/test/base_test_code.py b/test/base_test_code.py
+index 50c6348..5fb43f9 100644
+--- a/test/base_test_code.py
++++ b/test/base_test_code.py
+@@ -1,6 +1,6 @@
+ from munittest import *
+ 
+-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
++base_http = 'http://urlgrabber.baseurl.org/test/'
+ base_ftp = 'ftp://localhost/test/'
+ 
+ # set to a proftp server only. we're working around a couple of
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..a26880c 100644
+index e090e90..4797436 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
+@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
+   (which can be set on default_grabber.throttle) is used. See
+   BANDWIDTH THROTTLING for more information.
+ 
+-  timeout = None
++  timeout = 300
+ 
+-    a positive float expressing the number of seconds to wait for socket
+-    operations. If the value is None or 0.0, socket operations will block
+-    forever. Setting this option causes urlgrabber to call the settimeout
+-    method on the Socket object used for the request. See the Python
+-    documentation on settimeout for more information.
+-    http://www.python.org/doc/current/lib/socket-objects.html
++    a positive integer expressing the number of seconds to wait before
++    timing out attempts to connect to a server. If the value is None
++    or 0, connection attempts will not time out. The timeout is passed
++    to the underlying pycurl object as its CONNECTTIMEOUT option, see
++    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
++    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
+ 
+   bandwidth = 0
+ 
 @@ -439,6 +439,12 @@ try:
  except:
      __version__ = '???'
@@ -15,7 +62,16 @@ index e090e90..a26880c 100644
 ########################################################################
 # functions for debugging output. These functions are here because they
 # are also part of the module initialization.
+@@ -808,7 +814,7 @@ class URLGrabberOptions:
+         self.prefix = None
+         self.opener = None
+         self.cache_openers = True
+-        self.timeout = None
++        self.timeout = 300
+         self.text = None
+         self.http_headers = None
+         self.ftp_headers = None
-@@ -1052,7 +1058,8 @@ class PyCurlFileObject():
+@@ -1052,9 +1058,15 @@ class PyCurlFileObject():
          self._reget_length = 0
          self._prog_running = False
          self._error = (None, None)
 +        self.size = 0
++        self._hdr_ended = False
          self._do_open()
++
++    def geturl(self):
++        """ Provide the geturl() method, used to be got from
++            urllib.addinfourl, via. urllib.URLopener.* """
++        return self.url
 
      def __getattr__(self, name):
          """This effectively allows us to wrap at the instance level.
-@@ -1085,9 +1092,14 @@ class PyCurlFileObject():
+@@ -1085,9 +1097,14 @@ class PyCurlFileObject():
              return -1
 
      def _hdr_retrieve(self, buf):
 +        if self._hdr_ended:
 +            self._hdr_dump = ''
 +            self.size = None
 +            self._hdr_ended = False
 +
          try:
              self._hdr_dump += buf
              # we have to get the size before we do the progress obj start
-@@ -1104,7 +1116,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1121,17 @@ class PyCurlFileObject():
                  s = parse150(buf)
              if s:
                  self.size = int(s)
 +
 +        if buf.lower().find('location') != -1:
 +            location = ':'.join(buf.split(':')[1:])
 +            if location.startswith('http'):
 +                self.scheme = urlparse.urlsplit(location)[0]
 +                self.url = location
 +
 +        if len(self._hdr_dump) != 0 and buf == '\r\n':
 +            self._hdr_ended = True
 +            if DEBUG: DEBUG.info('header ended:')
 +
          return len(buf)
      except KeyboardInterrupt:
          return pycurl.READFUNC_ABORT
+@@ -1113,8 +1140,10 @@ class PyCurlFileObject():
+         if self._parsed_hdr:
+             return self._parsed_hdr
+         statusend = self._hdr_dump.find('\n')
++        statusend += 1 # ridiculous as it may seem.
+         hdrfp = StringIO()
+         hdrfp.write(self._hdr_dump[statusend:])
++        hdrfp.seek(0)
+         self._parsed_hdr = mimetools.Message(hdrfp)
+         return self._parsed_hdr
 @@ -1136,6 +1158,7 @@ class PyCurlFileObject():
          self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
          self.curl_obj.setopt(pycurl.FAILONERROR, True)
          self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
 +
          if DEBUG:
              self.curl_obj.setopt(pycurl.VERBOSE, True)
+@@ -1148,9 +1178,11 @@ class PyCurlFileObject():
+ 
+         # timeouts
+         timeout = 300
+-        if opts.timeout:
+-            timeout = int(opts.timeout)
+-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
++        if hasattr(opts, 'timeout'):
++            timeout = int(opts.timeout or 0)
++            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
++            self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
++            self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
+ 
+         # ssl options
+         if self.scheme == 'https':
+@@ -1276,7 +1308,7 @@ class PyCurlFileObject():
+             raise err
+ 
+         elif errcode == 60:
+-            msg = _("client cert cannot be verified or client cert incorrect")
++            msg = _("Peer cert cannot be verified or peer cert invalid")
+             err = URLGrabError(14, msg)
+             err.url = self.url
+             raise err
+@@ -1291,7 +1323,12 @@ class PyCurlFileObject():
+             raise err
+ 
+         elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
++            msg = 'HTTP ERROR %s : %s ' % (self.http_code, self.url)
++            err = URLGrabError(14, msg)
++            err.code = self.http_code
++            err.exception = e
++            raise err
+         else:
+             msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+             code = errcode
+@@ -1299,6 +1336,12 @@ class PyCurlFileObject():
+             err.code = code
+             err.exception = e
+             raise err
++        else:
++            if self._error[1]:
++                msg = self._error[1]
++                err = URLGrabError(14, msg)
++                err.url = self.url
++                raise err
+ 
+     def _do_open(self):
+         self.curl_obj = _curl_cache
+@@ -1446,9 +1489,23 @@ class PyCurlFileObject():
+             # set the time
+             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+             if mod_time != -1:
+-                os.utime(self.filename, (mod_time, mod_time))
++                try:
++                    os.utime(self.filename, (mod_time, mod_time))
++                except OSError, e:
++                    err = URLGrabError(16, _(\
++                      'error setting timestamp on file %s from %s, OSError: %s')
++                              % (self.filename, self.url, e))
++                    err.url = self.url
++                    raise err
+             # re open it
+-            self.fo = open(self.filename, 'r')
++            try:
++                self.fo = open(self.filename, 'r')
++            except IOError, e:
++                err = URLGrabError(16, _(\
++                  'error opening file from %s, IOError: %s') % (self.url, e))
++                err.url = self.url
++                raise err
++
+         else:
+             #self.fo = open(self._temp_name, 'r')
+             self.fo.seek(0)
+@@ -1532,11 +1589,14 @@ class PyCurlFileObject():
+     def _over_max_size(self, cur, max_size=None):
+ 
+         if not max_size:
+-            max_size = self.size
++            if not self.opts.size: # if we set an opts size use that, no matter what
++                max_size = self.size
++            else:
++                max_size = self.opts.size
++
+         if not max_size: return False # if we have None for all of the Max then this is dumb
+-        if cur > max_size + max_size*.10:
+ 
++        if cur > int(float(max_size) * 1.10):
+             msg = _("Downloaded more than max size for %s: %s > %s") \
+                         % (self.url, cur, max_size)
+@@ -1582,9 +1642,21 @@ class PyCurlFileObject():
+             self.opts.progress_obj.end(self._amount_read)
+         self.fo.close()
+ 
+ _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+ 
++def reset_curl_obj():
++    """To make sure curl has reread the network/dns info we force a reload"""
++    global _curl_cache
++    _curl_cache.close()
++    _curl_cache = pycurl.Curl()
++
++
++
+ #####################################################################
+ # DEPRECATED FUNCTIONS
 diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
 index dd07c6a..45eb248 100644
 --- a/urlgrabber/progress.py
 +++ b/urlgrabber/progress.py
 @@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):
      if seconds is None or seconds < 0:
          if use_hours: return '--:--:--'
          else:         return '--:--'
 +    elif seconds == float('inf'):
 +        return 'Infinite'
      else:
          seconds = int(seconds)
          minutes = seconds / 60
-commit e85f27c43f991469db38bad97735ce2c0f7d075d
-Author: Seth Vidal
-Date:   Mon Mar 15 22:50:21 2010 -0400
-
-    make sure we're properly reading the hdrs and returning them
-
-diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index 16bb1d2..ac5ae18 100644
---- a/urlgrabber/grabber.py
-+++ b/urlgrabber/grabber.py
-@@ -1135,8 +1135,10 @@ class PyCurlFileObject():
-         if self._parsed_hdr:
-             return self._parsed_hdr
-         statusend = self._hdr_dump.find('\n')
-+        statusend += 1 # ridiculous as it may seem.
-         hdrfp = StringIO()
-         hdrfp.write(self._hdr_dump[statusend:])
-+        hdrfp.seek(0)
-         self._parsed_hdr = mimetools.Message(hdrfp)
-         return self._parsed_hdr
-
-commit 8e57ad3fbf14c55434eab5c04c4e00ba4f5986f9
-Author: James Antill
-Date:   Mon Mar 1 11:48:00 2010 -0500
-
-    Implement connection established timeout using, LOW_SPEED_LIMIT
-
-diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e63d4bb..bd4da75 100644
---- a/urlgrabber/grabber.py
-+++ b/urlgrabber/grabber.py
-@@ -1179,6 +1179,8 @@ class PyCurlFileObject():
-         if hasattr(opts, 'timeout'):
-             timeout = int(opts.timeout or 0)
-             self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
-+            self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
-+            self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
-
-         # ssl options
-         if self.scheme == 'https':
diff --git a/urlgrabber-reset.patch b/urlgrabber-reset.patch
deleted file mode 100644
index b63e7c3..0000000
--- a/urlgrabber-reset.patch
+++ /dev/null
@@ -1,15 +0,0 @@
---- a/urlgrabber/grabber.py	2010-02-19 14:50:45.000000000 -0500
-+++ b/urlgrabber/grabber.py	2010-02-19 14:51:28.000000000 -0500
-@@ -1626,6 +1626,12 @@
- 
- _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
- 
-+def reset_curl_obj():
-+    """To make sure curl has reread the network/dns info we force a reload"""
-+    global _curl_cache
-+    _curl_cache.close()
-+    _curl_cache = pycurl.Curl()
-+
- 
- #####################################################################
- # DEPRECATED FUNCTIONS
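
Editorial note, not part of the patch: the timeout hunks above move urlgrabber's
`timeout` option from a socket-level settimeout onto libcurl's two-part model.
CONNECTTIMEOUT only bounds connection establishment, so the folded-in
LOW_SPEED_LIMIT/LOW_SPEED_TIME pair is what aborts a transfer that averages
under 1 byte/sec for `timeout` seconds. A minimal pycurl sketch of the same
setup, assuming an illustrative URL and the patch's 300-second default:

    import pycurl
    from StringIO import StringIO   # Python 2, as in the patched code

    buf = StringIO()
    timeout = 300                                # the patch's new default
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, 'http://example.com/some/file')
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    curl.setopt(pycurl.CONNECTTIMEOUT, timeout)  # cap time to establish the connection
    curl.setopt(pycurl.LOW_SPEED_LIMIT, 1)       # abort if we average < 1 byte/sec ...
    curl.setopt(pycurl.LOW_SPEED_TIME, timeout)  # ... for `timeout` seconds straight
    curl.perform()
    curl.close()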
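
The statusend/seek(0) lines (folded in from the Mar 15 commit removed at the
end of this diff) fix header parsing: find('\n') points at the newline itself,
so without the += 1 the status line's '\n' is left at the front of the buffer,
and a freshly written StringIO is positioned at EOF, so mimetools.Message would
otherwise parse an empty stream. A self-contained sketch of that parsing step,
with a made-up header dump:

    import mimetools
    from StringIO import StringIO

    hdr_dump = 'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 4\r\n\r\n'
    statusend = hdr_dump.find('\n')
    statusend += 1                  # step past the status line's own newline
    hdrfp = StringIO()
    hdrfp.write(hdr_dump[statusend:])
    hdrfp.seek(0)                   # rewind: write() left the position at EOF
    hdrs = mimetools.Message(hdrfp)
    print hdrs.getheader('content-length')   # prints '4'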
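
The hunk around os.utime() wraps local filesystem failures in URLGrabError
with error code 16, so they surface to callers the same way network errors do;
note its message format has three %s placeholders and therefore needs the
filename, the URL and the exception as three separate arguments. A sketch of
the same wrapping pattern; the helper name here is hypothetical:

    import os
    from urlgrabber.grabber import URLGrabError

    def _set_mtime_or_raise(filename, url, mod_time):
        # mirrors the patch: convert OSError into URLGrabError(16)
        try:
            os.utime(filename, (mod_time, mod_time))
        except OSError, e:
            err = URLGrabError(16, 'error setting timestamp on file %s from %s, OSError: %s'
                                   % (filename, url, e))
            err.url = url
            raise err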
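
reset_curl_obj(), previously shipped as the standalone urlgrabber-reset.patch
deleted above and now carried in urlgrabber-HEAD.patch itself, exists because
the module reuses one cached pycurl handle for its whole lifetime, and libcurl
caches DNS and connection state inside that handle. A hypothetical use in a
long-running caller after a network change:

    from urlgrabber import grabber

    # e.g. after resolv.conf or interface changes in a daemon that keeps running
    grabber.reset_curl_obj()   # close the cached handle; the next grab builds a fresh one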