[youtube] Extract additional metadata from video description on YouTube music videos
YouTube music videos often have auto-generated video descriptions that can be used to extract additional information about the video. This is desirable in order to provide the user with as much metadata as possible.

This commit adds extraction methods for the following fields for YouTube music videos:
- artist (fallback extraction methods added)
- track (fallback extraction methods added)
- album (new in this commit)
- release_date (new in this commit)
- release_year (new in this commit)

Four tests have been added to cover this new functionality:
- YoutubeIE tests 27, 28, 29, and 30

Resolves: #20599
This commit is contained in:
parent
aa05a093bb
commit
5caabd3c70
|
@ -1086,7 +1086,95 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
'skip_download': True,
|
'skip_download': True,
|
||||||
'youtube_include_dash_manifest': False,
|
'youtube_include_dash_manifest': False,
|
||||||
},
|
},
|
||||||
}
|
},
|
||||||
|
{
|
||||||
|
# artist and track fields should return non-null, per issue #20599
|
||||||
|
'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'MgNrAu2pzNs',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Voyeur Girl',
|
||||||
|
'description': 'md5:7ae382a65843d6df2685993e90a8628f',
|
||||||
|
'upload_date': '20190312',
|
||||||
|
'uploader': 'Various Artists - Topic',
|
||||||
|
'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw',
|
||||||
|
'artist': 'Stephen',
|
||||||
|
'track': 'Voyeur Girl',
|
||||||
|
'album': 'it\'s too much love to know my dear',
|
||||||
|
'release_date': '20190313',
|
||||||
|
'release_year': 2019,
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# Retrieve 'artist' field from 'Artist:' in video description
|
||||||
|
# when it is present on youtube music video
|
||||||
|
# Some videos have release_date and no release_year -
|
||||||
|
# (release_year should be extracted from release_date)
|
||||||
|
# https://github.com/ytdl-org/youtube-dl/pull/20742#issuecomment-485740932
|
||||||
|
'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'k0jLE7tTwjY',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Latch Feat. Sam Smith',
|
||||||
|
'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
|
||||||
|
'upload_date': '20150110',
|
||||||
|
'uploader': 'Various Artists - Topic',
|
||||||
|
'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
|
||||||
|
'artist': 'Disclosure',
|
||||||
|
'track': 'Latch Feat. Sam Smith',
|
||||||
|
'album': 'Latch Featuring Sam Smith',
|
||||||
|
'release_date': '20121008',
|
||||||
|
'release_year': 2012,
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# handle multiple artists on youtube music video
|
||||||
|
'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '74qn0eJSjpA',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Eastside',
|
||||||
|
'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
|
||||||
|
'upload_date': '20180710',
|
||||||
|
'uploader': 'Benny Blanco - Topic',
|
||||||
|
'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
|
||||||
|
'artist': 'benny blanco, Halsey, Khalid',
|
||||||
|
'track': 'Eastside',
|
||||||
|
'album': 'Eastside',
|
||||||
|
'release_date': '20180713',
|
||||||
|
'release_year': 2018,
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# handle youtube music video with release_year and no release_date
|
||||||
|
'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '-hcAI0g-f5M',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Put It On Me',
|
||||||
|
'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e',
|
||||||
|
'upload_date': '20180426',
|
||||||
|
'uploader': 'Matt Maeson - Topic',
|
||||||
|
'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
|
||||||
|
'artist': 'Matt Maeson',
|
||||||
|
'track': 'Put It On Me',
|
||||||
|
'album': 'The Hearse',
|
||||||
|
'release_date': None,
|
||||||
|
'release_year': 2018,
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
@ -2073,6 +2161,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
|
|
||||||
track = extract_meta('Song')
|
track = extract_meta('Song')
|
||||||
artist = extract_meta('Artist')
|
artist = extract_meta('Artist')
|
||||||
|
album = None
|
||||||
|
release_date = None
|
||||||
|
release_year = None
|
||||||
|
|
||||||
|
description_info = video_description.split('\n\n')
|
||||||
|
# If the description of the video has the youtube music auto-generated format, extract additional info
|
||||||
|
if len(description_info) >= 5 and description_info[-1] == 'Auto-generated by YouTube.':
|
||||||
|
track_artist = description_info[1].split(' · ')
|
||||||
|
if len(track_artist) >= 2:
|
||||||
|
if track is None:
|
||||||
|
track = track_artist[0]
|
||||||
|
if artist is None:
|
||||||
|
artist = re.search(r'Artist: ([^\n]+)', description_info[-2])
|
||||||
|
if artist:
|
||||||
|
artist = artist.group(1)
|
||||||
|
if artist is None:
|
||||||
|
artist = track_artist[1]
|
||||||
|
# handle multiple artists
|
||||||
|
if len(track_artist) > 2:
|
||||||
|
for i in range(2, len(track_artist)):
|
||||||
|
artist += ', %s' % track_artist[i]
|
||||||
|
release_year = re.search(r'℗ ([0-9]+)', video_description)
|
||||||
|
if release_year:
|
||||||
|
release_year = int_or_none(release_year.group(1))
|
||||||
|
album = description_info[2]
|
||||||
|
if description_info[4].startswith('Released on: '):
|
||||||
|
release_date = description_info[4].split(': ')[1].replace('-', '')
|
||||||
|
# extract release_year from release_date if necessary
|
||||||
|
if release_year is None:
|
||||||
|
release_year = int_or_none(release_date[0:4])
|
||||||
|
|
||||||
m_episode = re.search(
|
m_episode = re.search(
|
||||||
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
|
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
|
||||||
|
@ -2226,6 +2344,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
'episode_number': episode_number,
|
'episode_number': episode_number,
|
||||||
'track': track,
|
'track': track,
|
||||||
'artist': artist,
|
'artist': artist,
|
||||||
|
'album': album,
|
||||||
|
'release_date': release_date,
|
||||||
|
'release_year': release_year,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue