- Update to upstream HEAD.

- LOWSPEEDLIMIT and hdrs
epel9
James Antill 15 years ago
parent ce8535ddf5
commit c4bd7d23d9

@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 5%{?dist}
+Release: 6%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
@@ -43,10 +43,13 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 
 %changelog
-* Tue Apr 13 2010 James Antill <james@fedoraproject.org> 3.9.1-5
+* Tue Apr 13 2010 James Antill <james@fedoraproject.org> 3.9.1-6
 - Update to upstream HEAD.
 - LOWSPEEDLIMIT and hdrs
 
+* Fri Feb 19 2010 Seth Vidal <skvidal at fedoraproject.org> - 3.9.1-5
+- add patch to allow reset_curl_obj() to close and reload the cached curl obj
+
 * Thu Nov 12 2009 Seth Vidal <skvidal at fedoraproject.org> - 3.9.1-4
 - reset header values when we redirect and make sure debug output will work
 
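For context on the two new changelog items, the timeout option in this build is wired both to pycurl's connect timeout and to a minimum-transfer-rate check (see the grabber.py hunks below). A minimal, hypothetical usage sketch against the public urlgrabber API; the URL, filename and timeout value are placeholders, not anything mandated by this commit:

    # Hypothetical caller-side sketch: 'timeout' is documented below as the
    # connect timeout (CURLOPT_CONNECTTIMEOUT); with this update it also bounds
    # how long a transfer may crawl below 1 byte/s (LOW_SPEED_LIMIT/LOW_SPEED_TIME).
    import urlgrabber
    from urlgrabber.grabber import URLGrabError

    try:
        urlgrabber.urlgrab('http://example.com/some/file', 'some-file', timeout=30)
    except URLGrabError, e:
        print 'download failed: %s' % e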

@@ -1,28 +0,0 @@
diff -up urlgrabber-3.0.0/urlgrabber/grabber.py.cleanup urlgrabber-3.0.0/urlgrabber/grabber.py
--- urlgrabber-3.0.0/urlgrabber/grabber.py.cleanup 2007-11-29 10:25:13.000000000 +0000
+++ urlgrabber-3.0.0/urlgrabber/grabber.py 2007-11-29 10:26:15.000000000 +0000
@@ -1204,16 +1204,18 @@ class URLGrabberFileObject:
bs = 1024*8
size = 0
- if amount is not None: bs = min(bs, amount - size)
- block = self.read(bs)
- size = size + len(block)
- while block:
- new_fo.write(block)
+ try:
if amount is not None: bs = min(bs, amount - size)
block = self.read(bs)
size = size + len(block)
+ while block:
+ new_fo.write(block)
+ if amount is not None: bs = min(bs, amount - size)
+ block = self.read(bs)
+ size = size + len(block)
+ finally:
+ new_fo.close()
- new_fo.close()
try:
modified_tuple = self.hdr.getdate_tz('last-modified')
modified_stamp = rfc822.mktime_tz(modified_tuple)
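The patch file deleted above (now superseded by the regenerated urlgrabber-HEAD.patch) wrapped the block-copy loop in try/finally so the destination file object is closed even if a read fails part way through. A standalone sketch of that pattern; the function name and signature here are illustrative, not urlgrabber's own:

    def copy_in_blocks(src_fo, dst_fo, amount=None, bs=8192):
        """Copy up to 'amount' bytes from src_fo to dst_fo in bs-sized blocks,
        closing dst_fo even if a read or write fails mid-copy."""
        size = 0
        try:
            while True:
                if amount is not None:
                    bs = min(bs, amount - size)
                    if bs <= 0:
                        break
                block = src_fo.read(bs)
                if not block:
                    break
                dst_fo.write(block)
                size += len(block)
        finally:
            dst_fo.close()
        return size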

@@ -1,7 +1,54 @@
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
from munittest import *
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
base_ftp = 'ftp://localhost/test/'
# set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..a26880c 100644
index e090e90..4797436 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
- timeout = None
+ timeout = 300
- a positive float expressing the number of seconds to wait for socket
- operations. If the value is None or 0.0, socket operations will block
- forever. Setting this option causes urlgrabber to call the settimeout
- method on the Socket object used for the request. See the Python
- documentation on settimeout for more information.
- http://www.python.org/doc/current/lib/socket-objects.html
+ a positive integer expressing the number of seconds to wait before
+ timing out attempts to connect to a server. If the value is None
+ or 0, connection attempts will not time out. The timeout is passed
+ to the underlying pycurl object as its CONNECTTIMEOUT option, see
+ the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
bandwidth = 0
@@ -439,6 +439,12 @@ try:
except:
__version__ = '???'
@@ -15,7 +62,16 @@ index e090e90..a26880c 100644
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -1052,7 +1058,8 @@ class PyCurlFileObject():
@@ -808,7 +814,7 @@ class URLGrabberOptions:
self.prefix = None
self.opener = None
self.cache_openers = True
- self.timeout = None
+ self.timeout = 300
self.text = None
self.http_headers = None
self.ftp_headers = None
@@ -1052,9 +1058,15 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
@@ -24,8 +80,15 @@ index e090e90..a26880c 100644
+ self._hdr_ended = False
self._do_open()
+
+ def geturl(self):
+ """ Provide the geturl() method, used to be got from
+ urllib.addinfourl, via. urllib.URLopener.* """
+ return self.url
@@ -1085,9 +1092,14 @@ class PyCurlFileObject():
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
@@ -1085,9 +1097,14 @@ class PyCurlFileObject():
return -1
def _hdr_retrieve(self, buf):
@@ -41,7 +104,7 @@ index e090e90..a26880c 100644
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
@@ -1104,7 +1116,17 @@ class PyCurlFileObject():
@@ -1104,7 +1121,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
@@ -60,7 +123,18 @@ index e090e90..a26880c 100644
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1136,6 +1158,7 @@ class PyCurlFileObject():
@@ -1113,8 +1140,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
+ statusend += 1 # ridiculous as it may seem.
hdrfp = StringIO()
hdrfp.write(self._hdr_dump[statusend:])
+ hdrfp.seek(0)
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
@@ -1136,6 +1165,7 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -68,7 +142,31 @@ index e090e90..a26880c 100644
if DEBUG:
self.curl_obj.setopt(pycurl.VERBOSE, True)
@@ -1291,7 +1314,12 @@ class PyCurlFileObject():
@@ -1148,9 +1178,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
- if opts.timeout:
- timeout = int(opts.timeout)
- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ if hasattr(opts, 'timeout'):
+ timeout = int(opts.timeout or 0)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
if self.scheme == 'https':
@@ -1276,7 +1308,7 @@ class PyCurlFileObject():
raise err
elif errcode == 60:
- msg = _("client cert cannot be verified or client cert incorrect")
+ msg = _("Peer cert cannot be verified or peer cert invalid")
err = URLGrabError(14, msg)
err.url = self.url
raise err
@@ -1291,7 +1323,12 @@ class PyCurlFileObject():
raise err
elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
@@ -82,7 +180,7 @@ index e090e90..a26880c 100644
else:
msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
code = errcode
@@ -1299,6 +1327,12 @@ class PyCurlFileObject():
@@ -1299,6 +1336,12 @@ class PyCurlFileObject():
err.code = code
err.exception = e
raise err
@@ -95,7 +193,33 @@ index e090e90..a26880c 100644
def _do_open(self):
self.curl_obj = _curl_cache
@@ -1532,11 +1566,14 @@ class PyCurlFileObject():
@@ -1446,9 +1489,23 @@ class PyCurlFileObject():
# set the time
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError, e:
+ err = URLGrabError(16, _(\
+ 'error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
# re open it
- self.fo = open(self.filename, 'r')
+ try:
+ self.fo = open(self.filename, 'r')
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
@@ -1532,11 +1589,14 @@ class PyCurlFileObject():
def _over_max_size(self, cur, max_size=None):
if not max_size:
@@ -114,7 +238,7 @@ index e090e90..a26880c 100644
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
@@ -1582,7 +1619,11 @@ class PyCurlFileObject():
@@ -1582,9 +1642,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
@ -126,7 +250,17 @@ index e090e90..a26880c 100644
+ +
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+def reset_curl_obj():
+ """To make sure curl has reread the network/dns info we force a reload"""
+ global _curl_cache
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
+
+
+
#####################################################################
# DEPRECATED FUNCTIONS
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644
--- a/urlgrabber/progress.py
@@ -140,43 +274,3 @@ index dd07c6a..45eb248 100644
else:
seconds = int(seconds)
minutes = seconds / 60
commit e85f27c43f991469db38bad97735ce2c0f7d075d
Author: Seth Vidal <skvidal@fedoraproject.org>
Date: Mon Mar 15 22:50:21 2010 -0400
make sure we're properly reading the hdrs and returning them
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index 16bb1d2..ac5ae18 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -1135,8 +1135,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
+ statusend += 1 # ridiculous as it may seem.
hdrfp = StringIO()
hdrfp.write(self._hdr_dump[statusend:])
+ hdrfp.seek(0)
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
commit 8e57ad3fbf14c55434eab5c04c4e00ba4f5986f9
Author: James Antill <james@and.org>
Date: Mon Mar 1 11:48:00 2010 -0500
Implement connection established timeout using, LOW_SPEED_LIMIT
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e63d4bb..bd4da75 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -1179,6 +1179,8 @@ class PyCurlFileObject():
if hasattr(opts, 'timeout'):
timeout = int(opts.timeout or 0)
self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
if self.scheme == 'https':
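Taken together, the timeout hunks above make one value drive two libcurl knobs: CURLOPT_CONNECTTIMEOUT for connection setup, and LOW_SPEED_LIMIT/LOW_SPEED_TIME to abort transfers that stay below 1 byte/s for that many seconds. A minimal standalone pycurl sketch of the same idea; the URL and the 30-second value are placeholders (grabber.py itself defaults to 300):

    import pycurl
    from StringIO import StringIO   # Python 2, matching the code above

    buf = StringIO()
    timeout = 30                    # placeholder value

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/some/file')
    c.setopt(pycurl.WRITEFUNCTION, buf.write)
    # fail if the connection is not established within 'timeout' seconds
    c.setopt(pycurl.CONNECTTIMEOUT, timeout)
    # abort if the transfer runs slower than 1 byte/s for 'timeout' seconds
    c.setopt(pycurl.LOW_SPEED_LIMIT, 1)
    c.setopt(pycurl.LOW_SPEED_TIME, timeout)
    try:
        c.perform()
    finally:
        c.close()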

@@ -1,15 +0,0 @@
--- a/urlgrabber/grabber.py 2010-02-19 14:50:45.000000000 -0500
+++ b/urlgrabber/grabber.py 2010-02-19 14:51:28.000000000 -0500
@@ -1626,6 +1626,12 @@
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+def reset_curl_obj():
+ """To make sure curl has reread the network/dns info we force a reload"""
+ global _curl_cache
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
+
#####################################################################
# DEPRECATED FUNCTIONS
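This standalone patch is dropped because the regenerated urlgrabber-HEAD.patch above carries the same reset_curl_obj() helper. Since grabber.py reuses one cached pycurl.Curl handle, a long-running caller that changes networks or resolv.conf can force fresh connection/DNS state roughly like this; the URL is a placeholder and the calling sequence is only a hypothetical sketch:

    from urlgrabber import grabber

    # ... network configuration changed (VPN up/down, new resolv.conf, etc.) ...

    # drop the cached pycurl handle so the next download starts from a clean slate
    grabber.reset_curl_obj()
    fo = grabber.urlopen('http://example.com/repodata/repomd.xml')  # placeholder URL
    try:
        data = fo.read()
    finally:
        fo.close()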