diff --git a/youtube-dl b/youtube-dl
index e6b7be1..6a60334 100755
--- a/youtube-dl
+++ b/youtube-dl
@@ -14,10 +14,11 @@ __author__ = (
'Sören Schulze',
'Kevin Ngo',
'Ori Avtalion',
+ 'shizeeg',
)
__license__ = 'Public Domain'
-__version__ = '2011.11.23'
+__version__ = '2011.12.08'
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
@@ -281,6 +282,14 @@ def _simplify_title(title):
expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
return expr.sub(u'_', title).strip(u'_')
+def _orderedSet(iterable):
+ """ Remove all duplicates from the input iterable """
+ res = []
+ for el in iterable:
+ if el not in res:
+ res.append(el)
+ return res
+
class DownloadError(Exception):
"""Download Error exception.
@@ -308,6 +317,10 @@ class PostProcessingError(Exception):
"""
pass
+class MaxDownloadsReached(Exception):
+ """ --max-downloads limit has been reached. """
+ pass
+
class UnavailableVideoError(Exception):
"""Unavailable Format exception.
@@ -698,8 +711,31 @@ class FileDownloader(object):
self.trouble(u'ERROR: invalid system charset or erroneous output template')
return None
+ def _match_entry(self, info_dict):
+ """ Returns None iff the file should be downloaded """
+
+ title = info_dict['title']
+ matchtitle = self.params.get('matchtitle', False)
+ if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
+ return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+ rejecttitle = self.params.get('rejecttitle', False)
+ if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
+ return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+ return None
+
def process_info(self, info_dict):
"""Process a single dictionary returned by an InfoExtractor."""
+
+ reason = self._match_entry(info_dict)
+ if reason is not None:
+ self.to_screen(u'[download] ' + reason)
+ return
+
+ max_downloads = self.params.get('max_downloads')
+ if max_downloads is not None:
+ if self._num_downloads > int(max_downloads):
+ raise MaxDownloadsReached()
+
filename = self.prepare_filename(info_dict)
# Forced printings
@@ -723,16 +759,6 @@ class FileDownloader(object):
if filename is None:
return
- matchtitle=self.params.get('matchtitle',False)
- rejecttitle=self.params.get('rejecttitle',False)
- title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
- if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
- self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
- return
- if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
- self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
- return
-
if self.params.get('nooverwrites', False) and os.path.exists(filename):
self.to_stderr(u'WARNING: file exists and will be skipped')
return
@@ -1095,6 +1121,7 @@ class YoutubeIE(InfoExtractor):
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
+ _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
_video_extensions = {
'13': '3gp',
'17': 'mp4',
@@ -1344,10 +1371,11 @@ class YoutubeIE(InfoExtractor):
url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
format_limit = self._downloader.params.get('format_limit', None)
- if format_limit is not None and format_limit in self._available_formats:
- format_list = self._available_formats[self._available_formats.index(format_limit):]
+ available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
+ if format_limit is not None and format_limit in available_formats:
+ format_list = available_formats[available_formats.index(format_limit):]
else:
- format_list = self._available_formats
+ format_list = available_formats
existing_formats = [x for x in format_list if x in url_map]
if len(existing_formats) == 0:
self._downloader.trouble(u'ERROR: no known formats available for video')
@@ -1603,7 +1631,7 @@ class DailymotionIE(InfoExtractor):
video_url = mediaURL
- mobj = re.search(r'(?im)
Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?', webpage)
+ mobj = re.search(r'(?im)\s*(.+)\s*-\s*Video\s+Dailymotion', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
@@ -3608,6 +3636,245 @@ class InfoQIE(InfoExtractor):
except UnavailableVideoError, err:
self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
+class MixcloudIE(InfoExtractor):
+ """Information extractor for www.mixcloud.com"""
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
+ IE_NAME = u'mixcloud'
+
+ def __init__(self, downloader=None):
+ InfoExtractor.__init__(self, downloader)
+
+ def report_download_json(self, file_id):
+ """Report JSON download."""
+ self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
+
+ def report_extraction(self, file_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
+
+ def get_urls(self, jsonData, fmt, bitrate='best'):
+ """Get urls from 'audio_formats' section in json"""
+ file_url = None
+ try:
+ bitrate_list = jsonData[fmt]
+ if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
+ bitrate = max(bitrate_list) # select highest
+
+ url_list = jsonData[fmt][bitrate]
+ except TypeError: # we have no bitrate info.
+ url_list = jsonData[fmt]
+
+ return url_list
+
+ def check_urls(self, url_list):
+ """Returns 1st active url from list"""
+ for url in url_list:
+ try:
+ urllib2.urlopen(url)
+ return url
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ url = None
+
+ return None
+
+ def _print_formats(self, formats):
+ print 'Available formats:'
+ for fmt in formats.keys():
+ for b in formats[fmt]:
+ try:
+ ext = formats[fmt][b][0]
+ print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
+ except TypeError: # we have no bitrate info
+ ext = formats[fmt][0]
+ print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
+ break
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+ # extract uploader & filename from url
+ uploader = mobj.group(1).decode('utf-8')
+ file_id = uploader + "-" + mobj.group(2).decode('utf-8')
+
+ # construct API request
+ file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
+ # retrieve .json file with links to files
+ request = urllib2.Request(file_url)
+ try:
+ self.report_download_json(file_url)
+ jsonData = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
+ return
+
+ # parse JSON
+ json_data = json.loads(jsonData)
+ player_url = json_data['player_swf_url']
+ formats = dict(json_data['audio_formats'])
+
+ req_format = self._downloader.params.get('format', None)
+ bitrate = None
+
+ if self._downloader.params.get('listformats', None):
+ self._print_formats(formats)
+ return
+
+ if req_format is None or req_format == 'best':
+ for format_param in formats.keys():
+ url_list = self.get_urls(formats, format_param)
+ # check urls
+ file_url = self.check_urls(url_list)
+ if file_url is not None:
+ break # got it!
+ else:
+ if req_format not in formats.keys():
+ self._downloader.trouble(u'ERROR: format is not available')
+ return
+
+ url_list = self.get_urls(formats, req_format)
+ file_url = self.check_urls(url_list)
+ format_param = req_format
+
+ # We have audio
+ self._downloader.increment_downloads()
+ try:
+ # Process file information
+ self._downloader.process_info({
+ 'id': file_id.decode('utf-8'),
+ 'url': file_url.decode('utf-8'),
+ 'uploader': uploader.decode('utf-8'),
+ 'upload_date': u'NA',
+ 'title': json_data['name'],
+ 'stitle': _simplify_title(json_data['name']),
+ 'ext': file_url.split('.')[-1].decode('utf-8'),
+ 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
+ 'thumbnail': json_data['thumbnail_url'],
+ 'description': json_data['description'],
+ 'player_url': player_url.decode('utf-8'),
+ })
+ except UnavailableVideoError, err:
+ self._downloader.trouble(u'ERROR: unable to download file')
+
+class StanfordOpenClassroomIE(InfoExtractor):
+ """Information extractor for Stanford's Open ClassRoom"""
+
+ _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P[^&]+)(&video=(?P