latest urlgrabber HEAD

branch: epel9
Seth Vidal 16 years ago
parent 891a468499
commit b78a1772ac

python-urlgrabber.spec

@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.0
-Release: 6%{?dist}
+Release: 7%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch0: urlgrabber-HEAD.patch
@@ -44,6 +44,9 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/urlgrabber
 
 %changelog
+* Mon Aug 10 2009 Seth Vidal <skvidal at fedoraproject.org> - 3.9.0-6
+- reget fixes, tmpfiles no longer made for urlopen() calls.
+
 * Wed Aug 5 2009 Seth Vidal <skvidal at fedoraproject.org> - 3.9.0-5
 - apply complete patch to head fixes: timeouts, regets, improves exception raising

urlgrabber-HEAD.patch

@@ -1,8 +1,8 @@
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index cf51dff..3758799 100644
+index cf51dff..cea47e3 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
-@@ -402,6 +402,7 @@ import urllib
+@@ -402,11 +402,11 @@ import urllib
  import urllib2
  import mimetools
  import thread
@@ -10,7 +10,26 @@ index cf51dff..3758799 100644
  from stat import * # S_* and ST_*
  import pycurl
  from ftplib import parse150
-@@ -1219,7 +1220,7 @@ class URLGrabberFileObject:
++from StringIO import StringIO
+-from tempfile import mkstemp
+ 
+ ########################################################################
+ # MODULE INITIALIZATION
+@@ -467,6 +467,13 @@ except AttributeError:
+     TimeoutError = None
+     have_socket_timeout = False
+ 
++try:
++    import signal
++    from signal import SIGPIPE, SIG_IGN
++    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
++except ImportError:
++    pass
++
+ ########################################################################
+ # functions for debugging output. These functions are here because they
+ # are also part of the module initialization.
+@@ -1219,7 +1226,7 @@ class URLGrabberFileObject:
          self.append = 0
          reget_length = 0
          rt = None
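A note on the SIGPIPE hunk added above (a hedged reading, not authoritative): when a server drops a keepalive connection, a later write can raise SIGPIPE, whose default action kills the whole process; ignoring it at import time lets the failure surface as an ordinary write error instead. Standalone, the same idea looks like:

    try:
        # turn fatal SIGPIPE into a plain EPIPE write error
        import signal
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass   # no signal module on this platform (mirrors the patch)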
@@ -19,20 +38,104 @@ index cf51dff..3758799 100644
          # we have reget turned on and we're dumping to a file
          try:
              s = os.stat(self.filename)
-@@ -1450,6 +1451,7 @@ class PyCurlFileObject():
+@@ -1450,9 +1457,11 @@ class PyCurlFileObject():
          self.scheme = urlparse.urlsplit(self.url)[0]
          self.filename = filename
          self.append = False
 +        self.reget_time = None
          self.opts = opts
++        if self.opts.reget == 'check_timestamp':
++            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
          self._complete = False
-         self.reget_time = None
+-        self.reget_time = None
-@@ -1528,11 +1530,12 @@ class PyCurlFileObject():
+         self._rbuf = ''
+         self._rbufsize = 1024*8
+         self._ttime = time.time()
+@@ -1474,39 +1483,45 @@ class PyCurlFileObject():
+         raise AttributeError, name
+ 
+     def _retrieve(self, buf):
+-        if not self._prog_running:
+-            if self.opts.progress_obj:
+-                size = self.size + self._reget_length
+-                self.opts.progress_obj.start(self._prog_reportname,
+-                                             urllib.unquote(self.url),
+-                                             self._prog_basename,
+-                                             size=size,
+-                                             text=self.opts.text)
+-                self._prog_running = True
+-                self.opts.progress_obj.update(self._amount_read)
+-
+-        self._amount_read += len(buf)
+-        self.fo.write(buf)
+-        return len(buf)
+-
++        try:
++            if not self._prog_running:
++                if self.opts.progress_obj:
++                    size = self.size + self._reget_length
++                    self.opts.progress_obj.start(self._prog_reportname,
++                                                 urllib.unquote(self.url),
++                                                 self._prog_basename,
++                                                 size=size,
++                                                 text=self.opts.text)
++                    self._prog_running = True
++                    self.opts.progress_obj.update(self._amount_read)
++
++            self._amount_read += len(buf)
++            self.fo.write(buf)
++            return len(buf)
++        except KeyboardInterrupt:
++            return pycurl.READFUNC_ABORT
++
+     def _hdr_retrieve(self, buf):
+-        self._hdr_dump += buf
+-        # we have to get the size before we do the progress obj start
+-        # but we can't do that w/o making it do 2 connects, which sucks
+-        # so we cheat and stuff it in here in the hdr_retrieve
+-        if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+-            length = buf.split(':')[1]
+-            self.size = int(length)
+-        elif self.scheme in ['ftp']:
+-            s = None
+-            if buf.startswith('213 '):
+-                s = buf[3:].strip()
+-            elif buf.startswith('150 '):
+-                s = parse150(buf)
+-            if s:
+-                self.size = s
+-
+-        return len(buf)
++        try:
++            self._hdr_dump += buf
++            # we have to get the size before we do the progress obj start
++            # but we can't do that w/o making it do 2 connects, which sucks
++            # so we cheat and stuff it in here in the hdr_retrieve
++            if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
++                length = buf.split(':')[1]
++                self.size = int(length)
++            elif self.scheme in ['ftp']:
++                s = None
++                if buf.startswith('213 '):
++                    s = buf[3:].strip()
++                elif buf.startswith('150 '):
++                    s = parse150(buf)
++                if s:
++                    self.size = s
++
++            return len(buf)
++        except KeyboardInterrupt:
++            return pycurl.READFUNC_ABORT
+ 
+     def _return_hdr_obj(self):
+         if self._parsed_hdr:
+@@ -1528,11 +1543,13 @@ class PyCurlFileObject():
          # defaults we're always going to set
 -        self.curl_obj.setopt(pycurl.NOPROGRESS, 0)
 +        self.curl_obj.setopt(pycurl.NOPROGRESS, False)
++        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
          self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
          self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
          self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
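Why the callbacks now swallow KeyboardInterrupt (a hedged reading of the hunk above): pycurl invokes WRITEFUNCTION and HEADERFUNCTION from inside libcurl's C loop, where a propagating Python exception cannot unwind safely; returning something other than the number of bytes consumed is the supported way to abort the transfer, after which libcurl fails with error 23, 'write error'. A minimal standalone sketch of the pattern (the sink object and URL are placeholders):

    import pycurl
    from StringIO import StringIO

    sink = StringIO()

    def write_cb(buf):
        try:
            sink.write(buf)
            return len(buf)   # chunk fully consumed, keep going
        except KeyboardInterrupt:
            # a short return aborts the transfer; libcurl reports
            # CURLE_WRITE_ERROR (23), which the caller can map back
            return pycurl.READFUNC_ABORT

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/')
    c.setopt(pycurl.WRITEFUNCTION, write_cb)
    c.setopt(pycurl.NOSIGNAL, True)   # also set by this patch; avoids
                                      # signal-based timeouts in threads
    c.perform()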
@@ -42,7 +145,7 @@ index cf51dff..3758799 100644
          if DEBUG:
              self.curl_obj.setopt(pycurl.VERBOSE, True)
-@@ -1540,15 +1543,15 @@ class PyCurlFileObject():
+@@ -1540,15 +1557,15 @@ class PyCurlFileObject():
          self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
          # maybe to be options later
@@ -61,15 +164,27 @@ index cf51dff..3758799 100644
          # ssl options
          if self.scheme == 'https':
              if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
-@@ -1607,18 +1610,21 @@ class PyCurlFileObject():
+@@ -1607,18 +1624,33 @@ class PyCurlFileObject():
              # to other URLGrabErrors from
              # http://curl.haxx.se/libcurl/c/libcurl-errors.html
              # this covers e.args[0] == 22 pretty well - which will be common
++            code = self.http_code
 +            if e.args[0] == 28:
 +                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
 +                err.url = self.url
 +                raise err
-+            code = self.http_code
++
++            elif e.args[0] == 23 and code >= 200 and code < 299:
++                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
++                err.url = self.url
++                # this is probably wrong but ultimately this is what happens
++                # we have a legit http code and a pycurl 'writer failed' code
++                # which almost always means something aborted it from outside
++                # since we cannot know what it is -I'm banking on it being
++                # a ctrl-c. XXXX - if there's a way of going back two raises to
++                # figure out what aborted the pycurl process FIXME
++                raise KeyboardInterrupt
++
              if str(e.args[1]) == '': # fake it until you make it
                  msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
              else:
@@ -79,14 +194,15 @@ index cf51dff..3758799 100644
 +            err.code = code
              err.exception = e
              raise err
--
++
      def _do_open(self):
 -        self.append = False
 -        self.reget_time = None
          self.curl_obj = _curl_cache
          self.curl_obj.reset() # reset all old settings away, just in case
          # setup any ranges
-@@ -1630,11 +1636,9 @@ class PyCurlFileObject():
+@@ -1630,11 +1662,9 @@ class PyCurlFileObject():
              pass
 
      def _build_range(self):
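The error-mapping hunk above distinguishes two libcurl failures (codes per the libcurl-errors page cited in the patch): 28, CURLE_OPERATION_TIMEDOUT, becomes URLGrabError errno 12, while 23, CURLE_WRITE_ERROR, combined with a 2xx HTTP status is re-raised as KeyboardInterrupt, on the theory that a writer failing mid-successful-response was aborted by our own callback after a ctrl-c. From the caller's side (a hedged sketch; URL and filename are placeholders):

    from urlgrabber.grabber import URLGrabber, URLGrabError

    try:
        URLGrabber(timeout=5.0).urlgrab('http://example.com/slow.bin', 'slow.bin')
    except URLGrabError, e:
        if e.errno == 12:
            print 'timed out:', e   # curl error 28, mapped by the patch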
@@ -99,7 +215,7 @@ index cf51dff..3758799 100644
          # we have reget turned on and we're dumping to a file
          try:
              s = os.stat(self.filename)
-@@ -1729,7 +1733,7 @@ class PyCurlFileObject():
+@@ -1729,7 +1759,7 @@ class PyCurlFileObject():
          if self.filename:
              self._prog_reportname = str(self.filename)
              self._prog_basename = os.path.basename(self.filename)
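On the reget side of the changelog (a hedged usage sketch; URL and filename are placeholders): reget_time is now initialized once in __init__, and 'check_timestamp' regets raise NotImplementedError up front instead of failing obscurely; only 'simple' regets, which resume from the current size of the partial file, work in this version:

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(reget='simple')
    # opens the local file in append mode and fetches only the missing tail
    g.urlgrab('http://example.com/big.iso', 'big.iso')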
@@ -108,3 +224,50 @@ index cf51dff..3758799 100644
          if self.append: mode = 'ab'
          else: mode = 'wb'
+@@ -1746,19 +1776,23 @@ class PyCurlFileObject():
+         else:
+             self._prog_reportname = 'MEMORY'
+             self._prog_basename = 'MEMORY'
+-            fh, self._temp_name = mkstemp()
++
+-            self.fo = open(self._temp_name, 'wb')
++            self.fo = StringIO()
++            # if this is to be a tempfile instead....
++            # it just makes crap in the tempdir
++            #fh, self._temp_name = mkstemp()
++            #self.fo = open(self._temp_name, 'wb')
+         self._do_perform()
+-        # close it up
+-        self.fo.flush()
+-        self.fo.close()
+         if self.filename:
++            # close it up
++            self.fo.flush()
++            self.fo.close()
+             # set the time
+             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+             if mod_time != -1:
+@@ -1766,7 +1800,8 @@ class PyCurlFileObject():
+             # re open it
+             self.fo = open(self.filename, 'r')
+         else:
+-            self.fo = open(self._temp_name, 'r')
++            #self.fo = open(self._temp_name, 'r')
++            self.fo.seek(0)
+ 
+         self._complete = True
+@@ -1838,6 +1873,7 @@ class PyCurlFileObject():
+             downloaded += self._reget_length
+             self.opts.progress_obj.update(downloaded)
++
+     def read(self, amt=None):
+         self._fill_buffer(amt)
+         if amt is None:
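The final hunks carry the tmpfile fix itself (a hedged reading): for filename-less grabs the download is written into a StringIO buffer, the mkstemp() path survives only as a comment, flush/close now happen only when a real file was opened, and the in-memory case simply rewinds so read() serves from the buffer. The caller-visible file-like contract is unchanged:

    from StringIO import StringIO

    fo = StringIO()
    fo.write('chunk one ')   # what each _retrieve() callback does
    fo.write('chunk two')
    fo.seek(0)               # rewind in place of re-opening a temp file
    assert fo.read() == 'chunk one chunk two'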
