-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
88 lines (81 loc) · 2.71 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import urllib.request
import untangle
import sys
def allindices(string, sub):
listindex=[]
i = string.find(sub)
while i >= 0:
listindex.append(i)
i = string.find(sub, i + 1)
return listindex
def search(query):
indices = allindices(complete_caption.upper(),query.upper())
if (len(indices)==0):
return "Keyword not found"
else:
newset = set()
for index in indices:
if (index in keyword.keys()):
newset.add(keyword[index])
else:
curr = 0
for i in list(keyword.keys()):
if i < index:
curr = i
newset.add(keyword[curr])
return "Matching times: "+str(sorted(list(newset)))
#uses google provided captions
orig = input("Please paste the entire url of the youtube video: ")
url = "http://video.google.com/timedtext?lang=en&v="+orig[len("https://www.youtube.com/watch?v="):]
try:
obj = untangle.parse(url)
except Exception:
print("This video does not contain captions provided by google.")
#print("Following service provided by IBM Watson")
#use ibm watson here
#import youtube_dl
#options = {
#'format': 'bestaudio/best', # choice of quality
#'extractaudio' : True, # only keep the audio
#'audioformat' : "wav", # convert to mp3
#'outtmpl': 'rawaudio.wav', # name the file the ID of the video
#'noplaylist' : True, # only download single song, not playlist
#}
#with youtube_dl.YoutubeDL(options) as ydl:
#ydl.download([orig])
#print("File audio extracted")
# import json
# from os.path import join, dirname
# from watson_developer_cloud import SpeechToTextV1
# speech_to_text = SpeechToTextV1(
# username='9e54b45c-6560-47e3-b798-aff222850fbf',
# password='iydT2ZkN3ftT',
# x_watson_learning_opt_out=False
# )
#
# print(json.dumps(speech_to_text.models(), indent=2))
#
# print(json.dumps(speech_to_text.get_model('en-US_BroadbandModel'), indent=2))
#
# with open('rawaudio.wav', 'rb') as audio_file:
# print(json.dumps(speech_to_text.recognize(
# audio_file, content_type='audio/wav', timestamps=True, word_confidence=True), indent=2))
#end ibm watson translation
exit()
obj = obj.transcript.text
complete_caption = ""
keyword = {}
index = 0
for text in obj:
complete_caption += text.cdata
for item in text.cdata.split():
m, s = divmod(float(text['start']), 60)
h, m = divmod(m, 60)
keyword[index]= "%d:%02d:%02d" % (h, m, s)
index += len(item)+1
index-=1
while True:
x = input("Find: ")
if (x=="exit"):
break
print(search(x))