diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
 from munittest import *
 
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
 base_ftp = 'ftp://localhost/test/'
 
 # set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..4797436 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
     (which can be set on default_grabber.throttle) is used. See
     BANDWIDTH THROTTLING for more information.
 
-  timeout = None
+  timeout = 300
 
-    a positive float expressing the number of seconds to wait for socket
-    operations. If the value is None or 0.0, socket operations will block
-    forever. Setting this option causes urlgrabber to call the settimeout
-    method on the Socket object used for the request. See the Python
-    documentation on settimeout for more information.
-    http://www.python.org/doc/current/lib/socket-objects.html
+    a positive integer expressing the number of seconds to wait before
+    timing out attempts to connect to a server. If the value is None
+    or 0, connection attempts will not time out. The timeout is passed
+    to the underlying pycurl object as its CONNECTTIMEOUT option; see
+    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
 
   bandwidth = 0
 
@@ -439,6 +439,12 @@ try:
 except:
     __version__ = '???'
 
+try:
+    # this part isn't going to do much - need to talk to gettext
+    from i18n import _
+except ImportError, msg:
+    def _(st): return st
+
 ########################################################################
 # functions for debugging output. These functions are here because they
 # are also part of the module initialization.
@@ -808,7 +814,7 @@ class URLGrabberOptions:
         self.prefix = None
         self.opener = None
         self.cache_openers = True
-        self.timeout = None
+        self.timeout = 300
         self.text = None
         self.http_headers = None
         self.ftp_headers = None
@@ -1052,9 +1058,15 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
-        self.size = None
+        self.size = 0
+        self._hdr_ended = False
         self._do_open()
 
+    def geturl(self):
+        """Provide the geturl() method, which used to come from
+        urllib.addinfourl via urllib.URLopener."""
+        return self.url
+
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
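The hunks above change the default timeout from None (block forever) to 300
seconds and hand it to curl instead of socket.settimeout(). A minimal sketch of
the new behavior from the caller's side; urlgrab() and the timeout keyword are
the documented urlgrabber API, but the URL and local filename are placeholders:

    # Sketch: timeout now defaults to 300s; passing None or 0 disables
    # the connect timeout (CONNECTTIMEOUT is set to 0 in that case).
    from urlgrabber.grabber import urlgrab, URLGrabError

    try:
        urlgrab('http://example.com/some/file', 'some/file', timeout=30)
    except URLGrabError, e:
        print 'grab failed: %s' % e
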
@@ -1085,9 +1097,14 @@ class PyCurlFileObject():
             return -1
 
     def _hdr_retrieve(self, buf):
+        if self._hdr_ended:
+            # a redirect means a second header block follows; start over
+            self._hdr_dump = ''
+            self.size = 0
+            self._hdr_ended = False
+
         if self._over_max_size(cur=len(self._hdr_dump),
                                max_size=self.opts.max_header_size):
-            return -1 
+            return -1
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
@@ -1104,7 +1121,17 @@ class PyCurlFileObject():
                 s = parse150(buf)
             if s:
                 self.size = int(s)
-            
+
+            if buf.lower().find('location') != -1:
+                # the value may itself contain ':' (e.g. a port), so rejoin
+                location = ':'.join(buf.split(':')[1:])
+                location = location.strip()
+                self.scheme = urlparse.urlsplit(location)[0]
+                self.url = location
+
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                self._hdr_ended = True
+                if DEBUG: DEBUG.info('header ended:')
+
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
@@ -1113,8 +1140,10 @@ class PyCurlFileObject():
             if self._parsed_hdr:
                 return self._parsed_hdr
             statusend = self._hdr_dump.find('\n')
+            statusend += 1 # ridiculous as it may seem - skip past the '\n' itself
             hdrfp = StringIO()
             hdrfp.write(self._hdr_dump[statusend:])
+            hdrfp.seek(0)
             self._parsed_hdr = mimetools.Message(hdrfp)
             return self._parsed_hdr
 
@@ -1136,6 +1165,7 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
 
         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
@@ -1148,9 +1178,11 @@ class PyCurlFileObject():
 
         # timeouts
         timeout = 300
-        if opts.timeout:
-            timeout = int(opts.timeout)
-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        if hasattr(opts, 'timeout'):
+            timeout = int(opts.timeout or 0)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
 
         # ssl options
         if self.scheme == 'https':
@@ -1276,7 +1308,7 @@ class PyCurlFileObject():
                 raise err
 
             elif errcode == 60:
-                msg = _("client cert cannot be verified or client cert incorrect")
+                msg = _("Peer cert cannot be verified or peer cert invalid")
                 err = URLGrabError(14, msg)
                 err.url = self.url
                 raise err
@@ -1291,7 +1323,12 @@ class PyCurlFileObject():
                 raise err
 
             elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+                if self.scheme in ['http', 'https']:
+                    msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+                elif self.scheme in ['ftp']:
+                    msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
+                else:
+                    msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
             else:
                 msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
                 code = errcode
@@ -1299,6 +1336,12 @@ class PyCurlFileObject():
             err.code = code
             err.exception = e
             raise err
+        else:
+            if self._error[1]:
+                msg = self._error[1]
+                err = URLGrabError(14, msg)
+                err.url = self.url
+                raise err
 
     def _do_open(self):
         self.curl_obj = _curl_cache
@@ -1446,9 +1489,23 @@ class PyCurlFileObject():
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
-                os.utime(self.filename, (mod_time, mod_time))
+                try:
+                    os.utime(self.filename, (mod_time, mod_time))
+                except OSError, e:
+                    err = URLGrabError(16, _(\
+                        'error setting timestamp on file %s from %s, OSError: %s')
+                        % (self.filename, self.url, e))
+                    err.url = self.url
+                    raise err
             # re open it
-            self.fo = open(self.filename, 'r')
+            try:
+                self.fo = open(self.filename, 'r')
+            except IOError, e:
+                err = URLGrabError(16, _(\
+                    'error opening file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
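One behavioral note on the curl options set above: CONNECTTIMEOUT only bounds
connection setup, while the LOW_SPEED_LIMIT/LOW_SPEED_TIME pair aborts an
already-established transfer that stays below 1 byte/sec for `timeout` seconds,
so a stalled mirror no longer hangs forever. A standalone pycurl sketch of the
same settings (the URL is a placeholder):

    import pycurl

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/')
    c.setopt(pycurl.CONNECTTIMEOUT, 300)   # cap connection setup at 300s
    c.setopt(pycurl.LOW_SPEED_LIMIT, 1)    # below 1 byte/sec counts as stalled
    c.setopt(pycurl.LOW_SPEED_TIME, 300)   # ... for 300s straight -> abort
    c.setopt(pycurl.FOLLOWLOCATION, True)  # follow HTTP redirects, as above
    c.setopt(pycurl.WRITEFUNCTION, lambda buf: None)  # discard the body
    try:
        c.perform()
    except pycurl.error, e:
        print 'transfer failed: %s' % (e,)
    c.close()
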
@@ -1532,11 +1589,14 @@ class PyCurlFileObject():
     def _over_max_size(self, cur, max_size=None):
 
         if not max_size:
-            max_size = self.size
-        if self.opts.size: # if we set an opts size use that, no matter what
-            max_size = self.opts.size
+            if not self.opts.size: # if we set an opts size use that, no matter what
+                max_size = self.size
+            else:
+                max_size = self.opts.size
+
         if not max_size: return False # if we have None for all of the Max then this is dumb
-        if cur > max_size + max_size*.10:
-
+
+        if cur > int(float(max_size) * 1.10):
             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
@@ -1582,9 +1642,21 @@ class PyCurlFileObject():
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()
-
+
+    def geturl(self):
+        """Provide the geturl() method, which used to come from
+        urllib.addinfourl via urllib.URLopener."""
+        return self.url
+
 _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
 
+def reset_curl_obj():
+    """To make sure curl has reread the network/dns info we force a reload"""
+    global _curl_cache
+    _curl_cache.close()
+    _curl_cache = pycurl.Curl()
+
+
 
 #####################################################################
 # DEPRECATED FUNCTIONS
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):
     if seconds is None or seconds < 0:
         if use_hours: return '--:--:--'
         else:         return '--:--'
+    elif seconds == float('inf'):
+        return 'Infinite'
     else:
         seconds = int(seconds)
         minutes = seconds / 60
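A quick sanity check of the format_time() change above; the expected strings
follow from the function body (minutes and seconds are zero-padded, and the new
branch catches an infinite ETA before int() is applied):

    from urlgrabber.progress import format_time

    print format_time(90)            # '01:30'
    print format_time(3661, 1)       # '01:01:01'
    print format_time(float('inf'))  # 'Infinite' (the new branch)
    print format_time(None)          # '--:--'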