From 36ff3aaff05bd6eddd257e6e6d8657e7de5e05bc Mon Sep 17 00:00:00 2001 From: Seth Vidal Date: Mon, 30 Aug 2010 11:53:16 -0400 Subject: [PATCH] update to latest urlgrabber head --- python-urlgrabber.spec | 5 +- urlgrabber-HEAD.patch | 201 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 188 insertions(+), 18 deletions(-) diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec index 8231d53..9b49ca5 100644 --- a/python-urlgrabber.spec +++ b/python-urlgrabber.spec @@ -3,7 +3,7 @@ Summary: A high-level cross-protocol url-grabber Name: python-urlgrabber Version: 3.9.1 -Release: 7%{?dist} +Release: 8%{?dist} Source0: urlgrabber-%{version}.tar.gz Patch1: urlgrabber-HEAD.patch @@ -43,6 +43,9 @@ rm -rf $RPM_BUILD_ROOT %{_bindir}/urlgrabber %changelog +* Mon Aug 30 2010 Seth Vidal - 3.9.1-8 +- update to latest head patches + * Thu Jul 22 2010 David Malcolm - 3.9.1-7 - Rebuilt for https://fedoraproject.org/wiki/Features/Python_2.7/MassRebuild diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch index 885f3a1..6b97585 100644 --- a/urlgrabber-HEAD.patch +++ b/urlgrabber-HEAD.patch @@ -11,6 +11,66 @@ index 0000000..1ffe416 +*.kdev* +*.kateproject +ipython.log* +diff --git a/scripts/urlgrabber b/scripts/urlgrabber +index 518e512..09cd896 100644 +--- a/scripts/urlgrabber ++++ b/scripts/urlgrabber +@@ -115,6 +115,7 @@ options: + including quotes in the case of strings. + e.g. --user_agent='"foobar/2.0"' + ++ --output FILE + -o FILE write output to FILE, otherwise the basename of the + url will be used + -O print the names of saved files to STDOUT +@@ -170,12 +171,17 @@ class client_options: + return ug_options, ug_defaults + + def process_command_line(self): +- short_options = 'vd:hoOpD' ++ short_options = 'vd:ho:OpD' + long_options = ['profile', 'repeat=', 'verbose=', +- 'debug=', 'help', 'progress'] ++ 'debug=', 'help', 'progress', 'output='] + ug_long = [ o + '=' for o in self.ug_options ] +- optlist, args = getopt.getopt(sys.argv[1:], short_options, +- long_options + ug_long) ++ try: ++ optlist, args = getopt.getopt(sys.argv[1:], short_options, ++ long_options + ug_long) ++ except getopt.GetoptError, e: ++ print >>sys.stderr, "Error:", e ++ self.help([], ret=1) ++ + self.verbose = 0 + self.debug = None + self.outputfile = None +@@ -193,6 +199,7 @@ class client_options: + if o == '--verbose': self.verbose = v + if o == '-v': self.verbose += 1 + if o == '-o': self.outputfile = v ++ if o == '--output': self.outputfile = v + if o == '-p' or o == '--progress': self.progress = 1 + if o == '-d' or o == '--debug': self.debug = v + if o == '--profile': self.profile = 1 +@@ -222,7 +229,7 @@ class client_options: + print "ERROR: cannot use -o when grabbing multiple files" + sys.exit(1) + +- def help(self, args): ++ def help(self, args, ret=0): + if not args: + print MAINHELP + else: +@@ -234,7 +241,7 @@ class client_options: + self.help_ug_option(a) + else: + print 'ERROR: no help on command "%s"' % a +- sys.exit(0) ++ sys.exit(ret) + + def help_doc(self): + print __doc__ diff --git a/test/base_test_code.py b/test/base_test_code.py index 50c6348..5fb43f9 100644 --- a/test/base_test_code.py @@ -24,7 +84,7 @@ index 50c6348..5fb43f9 100644 # set to a proftp server only. 
we're working around a couple of diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py -index e090e90..4797436 100644 +index e090e90..0c78857 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs) @@ -49,7 +109,19 @@ index e090e90..4797436 100644 bandwidth = 0 -@@ -439,6 +439,12 @@ try: +@@ -248,6 +248,11 @@ GENERAL ARGUMENTS (kwargs) + + Maximum size (in bytes) of the headers. + ++ self.ip_resolve = 'whatever' ++ ++ What type of name to IP resolving to use, default is to do both IPV4 and ++ IPV6. ++ + + RETRY RELATED ARGUMENTS + +@@ -439,6 +444,12 @@ try: except: __version__ = '???' @@ -62,7 +134,15 @@ index e090e90..4797436 100644 ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. -@@ -808,7 +814,7 @@ class URLGrabberOptions: +@@ -800,6 +811,7 @@ class URLGrabberOptions: + self.close_connection = 0 + self.range = None + self.user_agent = 'urlgrabber/%s' % __version__ ++ self.ip_resolve = None + self.keepalive = 1 + self.proxies = None + self.reget = None +@@ -808,7 +820,7 @@ class URLGrabberOptions: self.prefix = None self.opener = None self.cache_openers = True @@ -71,7 +151,17 @@ index e090e90..4797436 100644 self.text = None self.http_headers = None self.ftp_headers = None -@@ -1052,9 +1058,15 @@ class PyCurlFileObject(): +@@ -931,6 +943,9 @@ class URLGrabber: + (scheme, host, path, parm, query, frag) = parts + if filename is None: + filename = os.path.basename( urllib.unquote(path) ) ++ if not filename: ++ # This is better than nothing. ++ filename = 'index.html' + if scheme == 'file' and not opts.copy_local: + # just return the name of the local file - don't make a + # copy currently +@@ -1052,9 +1067,15 @@ class PyCurlFileObject(): self._reget_length = 0 self._prog_running = False self._error = (None, None) @@ -88,7 +178,7 @@ index e090e90..4797436 100644 def __getattr__(self, name): """This effectively allows us to wrap at the instance level. 
-@@ -1085,9 +1097,14 @@ class PyCurlFileObject(): +@@ -1085,9 +1106,14 @@ class PyCurlFileObject(): return -1 def _hdr_retrieve(self, buf): @@ -104,7 +194,7 @@ index e090e90..4797436 100644 try: self._hdr_dump += buf # we have to get the size before we do the progress obj start -@@ -1104,7 +1121,17 @@ class PyCurlFileObject(): +@@ -1104,7 +1130,17 @@ class PyCurlFileObject(): s = parse150(buf) if s: self.size = int(s) @@ -123,7 +213,7 @@ index e090e90..4797436 100644 return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT -@@ -1113,8 +1140,10 @@ class PyCurlFileObject(): +@@ -1113,8 +1149,10 @@ class PyCurlFileObject(): if self._parsed_hdr: return self._parsed_hdr statusend = self._hdr_dump.find('\n') @@ -134,7 +224,7 @@ index e090e90..4797436 100644 self._parsed_hdr = mimetools.Message(hdrfp) return self._parsed_hdr -@@ -1136,6 +1165,7 @@ class PyCurlFileObject(): +@@ -1136,11 +1174,21 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) @@ -142,7 +232,21 @@ index e090e90..4797436 100644 if DEBUG: self.curl_obj.setopt(pycurl.VERBOSE, True) -@@ -1148,9 +1178,11 @@ class PyCurlFileObject(): + if opts.user_agent: + self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent) ++ if opts.ip_resolve: ++ # Default is: IPRESOLVE_WHATEVER ++ ipr = opts.ip_resolve.lower() ++ if ipr == 'whatever': # Do we need this? ++ self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER) ++ if ipr == 'ipv4': ++ self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) ++ if ipr == 'ipv6': ++ self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6) + + # maybe to be options later + self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) +@@ -1148,9 +1196,11 @@ class PyCurlFileObject(): # timeouts timeout = 300 @@ -157,7 +261,7 @@ index e090e90..4797436 100644 # ssl options if self.scheme == 'https': -@@ -1276,7 +1308,7 @@ class PyCurlFileObject(): +@@ -1276,7 +1326,7 @@ class PyCurlFileObject(): raise err elif errcode == 60: @@ -166,7 +270,7 @@ index e090e90..4797436 100644 err = URLGrabError(14, msg) err.url = self.url raise err -@@ -1291,7 +1323,12 @@ class PyCurlFileObject(): +@@ -1291,14 +1341,70 @@ class PyCurlFileObject(): raise err elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it @@ -178,9 +282,55 @@ index e090e90..4797436 100644 + else: + msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme) else: - msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) +- msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) ++ pyerr2str = { 5 : _("Couldn't resolve proxy"), ++ 6 : _("Couldn't resolve host"), ++ 7 : _("Couldn't connect"), ++ 8 : _("Bad reply to FTP server"), ++ 9 : _("Access denied"), ++ 11 : _("Bad reply to FTP pass"), ++ 13 : _("Bad reply to FTP pasv"), ++ 14 : _("Bad reply to FTP 227"), ++ 15 : _("Couldn't get FTP host"), ++ 17 : _("Couldn't set FTP type"), ++ 18 : _("Partial file"), ++ 19 : _("FTP RETR command failed"), ++ 22 : _("HTTP returned error"), ++ 23 : _("Write error"), ++ 25 : _("Upload failed"), ++ 26 : _("Read error"), ++ 27 : _("Out of Memory"), ++ 28 : _("Operation timed out"), ++ 30 : _("FTP PORT command failed"), ++ 31 : _("FTP REST command failed"), ++ 33 : _("Range failed"), ++ 34 : _("HTTP POST failed"), ++ 35 : _("SSL CONNECT failed"), ++ 36 : _("Couldn't resume download"), ++ 37 : _("Couldn't read file"), ++ 42 : _("Aborted by callback"), ++ 47 : _("Too many redirects"), ++ 
51 : _("Peer certificate failed verification"), ++ 53 : _("SSL engine not found"), ++ 54 : _("SSL engine set failed"), ++ 55 : _("Network error send()"), ++ 56 : _("Network error recv()"), ++ 58 : _("Local certificate failed"), ++ 59 : _("SSL set cipher failed"), ++ 60 : _("Local CA certificate failed"), ++ 61 : _("HTTP bad transfer encoding"), ++ 63 : _("Maximum file size exceeded"), ++ 64 : _("FTP SSL failed"), ++ 67 : _("Authentication failure"), ++ 70 : _("Out of disk space on server"), ++ 73 : _("Remove file exists"), ++ } ++ errstr = str(e.args[1]) ++ if not errstr: ++ errstr = pyerr2str.get(errcode, '') ++ msg = 'curl#%s - "%s"' % (errcode, errstr) code = errcode -@@ -1299,6 +1336,12 @@ class PyCurlFileObject(): + err = URLGrabError(14, msg) err.code = code err.exception = e raise err @@ -193,7 +343,24 @@ index e090e90..4797436 100644 def _do_open(self): self.curl_obj = _curl_cache -@@ -1446,9 +1489,23 @@ class PyCurlFileObject(): +@@ -1434,9 +1540,13 @@ class PyCurlFileObject(): + #fh, self._temp_name = mkstemp() + #self.fo = open(self._temp_name, 'wb') + +- +- self._do_perform() +- ++ try: ++ self._do_perform() ++ except URLGrabError, e: ++ self.fo.flush() ++ self.fo.close() ++ raise e ++ + + + if _was_filename: +@@ -1446,9 +1556,23 @@ class PyCurlFileObject(): # set the time mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME) if mod_time != -1: @@ -203,7 +370,7 @@ index e090e90..4797436 100644 + except OSError, e: + err = URLGrabError(16, _(\ + 'error setting timestamp on file %s from %s, OSError: %s') -+ % (self.filenameself.url, e)) ++ % (self.filename, self.url, e)) + err.url = self.url + raise err # re open it @@ -219,7 +386,7 @@ index e090e90..4797436 100644 else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) -@@ -1532,11 +1589,14 @@ class PyCurlFileObject(): +@@ -1532,11 +1656,14 @@ class PyCurlFileObject(): def _over_max_size(self, cur, max_size=None): if not max_size: @@ -238,7 +405,7 @@ index e090e90..4797436 100644 msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) -@@ -1582,9 +1642,21 @@ class PyCurlFileObject(): +@@ -1582,9 +1709,21 @@ class PyCurlFileObject(): self.opts.progress_obj.end(self._amount_read) self.fo.close()