This repository has been archived by the owner on Sep 24, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 265
/
youku.py
executable file
·203 lines (174 loc) · 7.46 KB
/
youku.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env python
# encoding: utf-8
__all__ = ['youku_download', 'youku_download_playlist', 'youku_download_by_id']
import urllib2
import json
from random import randint
from time import time
import re
import sys
from common import *
def find_video_id_from_url(url):
    """Try to pull a youku video id out of *url*.

    *url* may be a ``v_show`` page URL, a ``player.php`` SWF URL, a
    ``loader.swf?VideoIDS=...`` reference, or a bare id made only of word
    characters and ``=``.

    The patterns are handed to ``r1_of`` (imported from ``common``) and its
    result is returned unchanged -- presumably the first capture group of
    the first matching pattern, or a false value when nothing matches
    (confirm against ``common.r1_of``).
    """
    # Literal dots are escaped so that e.g. "v.youku.com" cannot match
    # "vXyoukuYcom"; an unescaped "." matches any character.
    patterns = [
        r'^http://v\.youku\.com/v_show/id_([\w=]+)\.html',
        r'^http://player\.youku\.com/player\.php/sid/([\w=]+)/v\.swf',
        r'^loader\.swf\?VideoIDS=([\w=]+)',
        r'^([\w=]+)$',
    ]
    return r1_of(patterns, url)
def find_video_id_from_show_page(url):
    """Return the link target of the play button on a show page.

    The page at *url* is fetched with ``get_html`` and searched for an
    ``href="..."`` attribute following ``<div class="btnplay">``.  Despite
    the function name, the value returned is whatever that ``href``
    contains, not a bare id.

    Raises:
        Exception: if the page contains no play-button link.
    """
    match = re.search(r'<div class="btnplay">.*href="([^"]+)"', get_html(url))
    if not match:
        # Fail with a readable message instead of an AttributeError on None.
        raise Exception('Cannot find play button on show page: ' + url)
    return match.group(1)
def youku_url(url):
    """Turn a supported youku reference into a URL that can be fetched.

    * If ``find_video_id_from_url`` yields an id, the canonical ``v_show``
      URL for that id is returned.
    * A ``show_page`` URL is resolved through its play-button link.
    * A ``v_playlist`` URL is returned as is.

    Raises:
        Exception: if *url* matches none of the above.
    """
    video_id = find_video_id_from_url(url)
    if video_id:
        return 'http://v.youku.com/v_show/id_%s.html' % video_id

    show_page = re.match(r'http://www.youku.com/show_page/id_\w+.html', url)
    if show_page:
        return find_video_id_from_show_page(url)

    playlist_page = re.match(r'http://v.youku.com/v_playlist/\w+.html', url)
    if playlist_page:
        return url

    raise Exception('Invalid youku URL: '+url)
def trim_title(title):
    """Strip youku's site boilerplate from a page title.

    Two fixed suffixes are removed by plain substring replacement, then a
    trailing "—<something>—优酷网,视频高清在线观看" section is removed with
    a regular expression.  Returns the shortened title.
    """
    fixed_suffixes = (
        u' - 视频 - 优酷视频 - 在线观看',
        u' - 专辑 - 优酷视频',
    )
    for suffix in fixed_suffixes:
        title = title.replace(suffix, '')
    # The pattern contains no backslashes, so a plain unicode literal has
    # exactly the same value as the raw-unicode form.
    return re.sub(u'—([^—]+)—优酷网,视频高清在线观看', '', title)
def parse_video_title(url, page):
    """Extract a display title for a video from its HTML page.

    Args:
        url: URL the page was fetched from; only inspected for the
            substring ``v_playlist``.
        page: HTML of the page as a UTF-8 encoded byte string (it is
            decoded with ``.decode('utf-8')``).

    Returns:
        The cleaned-up title as unicode, with the page's subtitle appended
        after a ``-`` when a subtitle exists and differs from the title.

    Raises:
        ValueError: if no title can be located in *page*.
    """
    from_playlist = bool(re.search(r'v_playlist', url))
    if from_playlist:
        # When a video is played from a playlist the meta title might be
        # incorrect, so fall back to the <title> element instead.
        fallback = r'<title>([^<>]*)</title>'
    else:
        fallback = r'<meta name="title" content="([^"]*)"'
    raw_title = r1_of([r'<div class="show_title" title="([^"]+)">[^<]', fallback], page)
    # Validate before decoding: calling .decode() on a missing result would
    # fail with an AttributeError before any check could run.
    if not raw_title:
        raise ValueError('Cannot find video title for ' + url)

    title = trim_title(raw_title.decode('utf-8'))
    if from_playlist and re.search(r'-.*\S+', title):
        # Remove the special name from the title of a playlist video.
        title = re.sub(r'^[^-]+-\s*', '', title)
    # Remove the trailing album section from the title.
    title = re.sub(u'—专辑:.*', u'', title)
    title = unescape_html(title)

    subtitle = re.search(r'<span class="subtitle" id="subtitle">([^<>]*)</span>', page)
    if subtitle:
        subtitle = subtitle.group(1).decode('utf-8').strip()
    if subtitle == title:
        # A subtitle that merely repeats the title adds nothing.
        subtitle = None
    if subtitle:
        title += '-' + subtitle
    return title
def parse_playlist_title(url, page):
    """Extract the title of a playlist/album from its HTML page.

    For ``v_playlist`` URLs the ``<title>`` element is used, because the
    meta title might be incorrect while a video from a playlist is playing;
    otherwise the ``<meta name="title">`` content is used.  The text is
    decoded from UTF-8, trimmed with ``trim_title``, reduced to the album
    name where the expected markers are present, and passed through
    ``unescape_html``.
    """
    in_playlist = re.search(r'v_playlist', url)
    if in_playlist:
        source_pattern = r'<title>([^<>]*)</title>'
    else:
        source_pattern = r'<meta name="title" content="([^"]*)"'
    raw_title = re.search(source_pattern, page).group(1)
    title = trim_title(raw_title.decode('utf-8'))

    if in_playlist and re.search(r'-.*\S+', title):
        # Drop everything up to and including the first "-" separator.
        title = re.sub(u'^[^-]+-\\s*', u'', title)
    # Keep only the name enclosed in the album brackets, if present.
    title = re.sub(u'^.*—专辑:《(.+)》', u'\\1', title)
    return unescape_html(title)
def parse_page(url):
    """Fetch a video page and return ``(videoId2, title)``.

    *url* is first normalised with ``youku_url``.  The id is read from the
    ``var videoId2 = '...'`` script assignment in the page, and the title
    comes from ``parse_video_title``.
    """
    page_url = youku_url(url)
    html = get_html(page_url)
    id_match = re.search(r"var\s+videoId2\s*=\s*'(\S+)'", html)
    # The id is extracted before the title, as tuple elements evaluate
    # left to right.
    return id_match.group(1), parse_video_title(page_url, html)
def get_info(videoId2):
    """Download and JSON-decode the ``getPlayList`` document for *videoId2*."""
    playlist_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + videoId2
    response = get_html(playlist_url)
    return json.loads(response)
def find_video(info, stream_type=None):
    """Build the download URL of every segment of a video.

    Args:
        info: decoded ``getPlayList`` document; only ``info['data'][0]`` is
            read (keys ``segs``, ``seed`` and ``streamfileids``).
        stream_type: one of ``'hd3'``, ``'hd2'``, ``'mp4'``, ``'flv'``.
            When falsy, the first of those (in that order) present in
            ``segs`` is used.

    Returns:
        A list of ``(url, size)`` tuples, one per segment, ``size`` being
        an int.

    Raises:
        NotImplementedError: no stream type was given and none of the
            known ones is available.
    """
    known_types = ('hd3', 'hd2', 'mp4', 'flv')
    entry = info['data'][0]
    segments = entry['segs']
    available = segments.keys()

    if not stream_type:
        candidates = [name for name in known_types if name in available]
        if not candidates:
            raise NotImplementedError()
        stream_type = candidates[0]
    assert stream_type in ('hd3', 'hd2', 'mp4', 'flv')
    # Only the 'mp4' stream is served as mp4; every other stream is flv.
    container = 'mp4' if stream_type == 'mp4' else 'flv'

    # Shuffle the alphabet with a 16-bit linear congruential generator
    # seeded from the document; streamfileids index into the result.
    seed = entry['seed']
    alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
    shuffled = []
    while alphabet:
        seed = (seed * 211 + 30031) & 0xFFFF
        shuffled.append(alphabet.pop(seed * len(alphabet) >> 16))
    mixed = ''.join(shuffled)

    # The id list ends with a trailing '*', hence the dropped last element.
    indices = entry['streamfileids'][stream_type].split('*')[:-1]
    vid = ''.join(mixed[int(index)] for index in indices)

    # One session id is shared by all segments of this call.
    sid = '%s%s%s' % (int(time()*1000), randint(1000, 1999), randint(1000, 9999))

    template = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s'
    result = []
    for segment in segments[stream_type]:
        number = '%02x' % int(segment['no'])
        # Characters 8-9 of the file id are replaced by the segment number.
        fields = (sid, number, container, vid[:8], number.upper(), vid[10:],
                  segment['k'], segment['seconds'])
        result.append((template % fields, int(segment['size'])))
    return result
def file_type_of_url(url):
    """Return the path component that follows ``/st/`` in *url* as ``str``."""
    match = re.search(r'/st/([^/]+)/', url)
    file_type = match.group(1)
    return str(file_type)
def youku_download_by_id(id2, title, output_dir='.', stream_type=None, merge=True):
    """Download every segment of the video identified by *id2*.

    The segment list comes from ``find_video(get_info(id2), stream_type)``;
    the URLs, the file type taken from the first URL and the summed sizes
    are handed to ``download_urls`` together with *title*, *output_dir* and
    *merge*.
    """
    segments = find_video(get_info(id2), stream_type)
    # Split [(url, size), ...] into parallel tuples of urls and sizes.
    urls, sizes = zip(*segments)
    extension = file_type_of_url(urls[0])
    download_urls(urls, title, extension, sum(sizes), output_dir, merge=merge)
def youku_download(url, output_dir='', stream_type=None, merge=True):
    """Download the video behind a youku page URL (or bare video id).

    Args:
        url: anything accepted by ``parse_page``.
        output_dir: directory passed on to ``youku_download_by_id``.
        stream_type: requested stream quality, or ``None`` to let
            ``find_video`` pick one.
        merge: passed through to ``youku_download_by_id``.
    """
    id2, title = parse_page(url)
    if isinstance(title, unicode):
        title = title.encode(default_encoding)
    # presumably '?' is replaced because the title ends up in a file name
    # -- confirm against download_urls in common.
    title = title.replace('?', '-')
    # Forward stream_type: it used to be accepted here but silently dropped,
    # so an explicit quality request never reached find_video.
    youku_download_by_id(id2, title, output_dir, stream_type=stream_type, merge=merge)
def parse_playlist_videos(html):
    """Return, in document order, the ``X`` of every ``id="A_X"`` in *html*."""
    return [found.group(1) for found in re.finditer(r'id="A_(\w+)"', html)]
def parse_playlist_pages(html):
    """List the URLs of playlist listing pages 2..N.

    The last link inside the ``<ul class="pages">`` element is taken to
    carry the highest page number N (``...page_N_...``).  Page 1 is not
    included.  Returns an empty list when *html* has no pager.
    """
    pager = re.search(r'<ul class="pages">.*?</ul>', html, flags=re.S)
    if not pager:
        return []

    links = re.findall(r'href="([^"]+)"', pager.group())
    prefix, last_page, suffix = re.match(r'^(.*page_)(\d+)(_.*)$', links[-1]).groups()

    page_urls = []
    for number in range(2, int(last_page) + 1):
        page_urls.append('http://v.youku.com%s%s%s?__rt=1&__ro=listShow' % (prefix, number, suffix))
    return page_urls
def parse_playlist(url):
    """Collect the video ids of the official list a ``v_show`` page belongs to.

    The ``videoId`` and ``showid`` script variables of the page select the
    first listing page; ids are gathered from it and from every further
    page reported by ``parse_playlist_pages``.
    """
    video_page = get_html(url)
    video_id = re.search(r"var\s+videoId\s*=\s*'(\d+)'", video_page).group(1)
    show_id = re.search(r'var\s+showid\s*=\s*"(\d+)"', video_page).group(1)

    first_listing_url = 'http://v.youku.com/v_vpofficiallist/page_1_showid_%s_id_%s.html?__rt=1&__ro=listShow' % (show_id, video_id)
    first_listing = get_html(first_listing_url)

    collected = parse_playlist_videos(first_listing)
    for listing_url in parse_playlist_pages(first_listing):
        collected.extend(parse_playlist_videos(get_html(listing_url)))
    return collected
def parse_vplaylist(url):
    """Expand a user playlist URL into one ``v_playlist`` URL per video.

    The numeric playlist id is taken from *url* (``playlist_show``,
    ``v_playlist`` or ``user_playlist`` form), the playlist's show page is
    fetched, and the count in its ``<span class="num">`` element
    determines how many URLs are generated (positions 0..count-1).
    """
    accepted_forms = [
        r'^http://www.youku.com/playlist_show/id_(\d+)(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html',
        r'^http://v.youku.com/v_playlist/f(\d+)o[01]p\d+.html',
        r'^http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html',
    ]
    playlist_id = r1_of(accepted_forms, url)
    assert playlist_id, 'not valid vplaylist url: '+url

    show_url = 'http://www.youku.com/playlist_show/id_%s.html' % playlist_id
    count_match = re.search(r'<span class="num">(\d+)</span>', get_html(show_url))
    video_count = int(count_match.group(1))

    video_urls = []
    for position in range(video_count):
        video_urls.append('http://v.youku.com/v_playlist/f%so0p%s.html' % (playlist_id, position))
    return video_urls
def youku_download_playlist(url, create_dir=False, merge=True):
    """Download every video of a playlist, album or show.

    Args:
        url: a ``show_page``, ``playlist_show``, ``v_playlist``,
            ``user_playlist`` or ``v_show`` URL.
        create_dir: when true, a directory named after the playlist title
            is created (if missing) and used as the output directory;
            otherwise videos go to ``'.'``.
        merge: passed through to ``youku_download`` for each video.
    """
    if re.match(r'http://www.youku.com/show_page/id_\w+.html', url):
        # A show page is first resolved to its play-button link.
        url = find_video_id_from_show_page(url)

    # All three of these URL shapes are expanded by parse_vplaylist.
    vplaylist_forms = (
        r'http://www.youku.com/playlist_show/id_\d+(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html',
        r'http://v.youku.com/v_playlist/f\d+o[01]p\d+.html',
        r'http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html',
    )
    if any(re.match(form, url) for form in vplaylist_forms):
        ids = parse_vplaylist(url)
    else:
        assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist'
        ids = parse_playlist(url)

    output_dir = '.'
    if create_dir:
        import os
        title = parse_playlist_title(url, get_html(url))
        title = title.encode(default_encoding).replace('?', '-')
        if not os.path.exists(title):
            os.makedirs(title)
        output_dir = title

    total = len(ids)
    for position, video in enumerate(ids, 1):
        print('Downloading %s of %s videos...' % (position, total))
        youku_download(video, output_dir=output_dir, merge=merge)
# Aliases: expose the two downloaders under site-neutral names as well.
download = youku_download
download_playlist = youku_download_playlist
def main():
    """Run the command-line entry point.

    Delegates to ``script_main`` (imported from ``common``), passing the
    script name ``'youku'`` and this module's single-video and playlist
    downloaders.
    """
    script_main('youku', youku_download, youku_download_playlist)

# Only run when executed as a script, not when imported.
if __name__ == '__main__':
    main()