-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_gpt4o.py
174 lines (143 loc) · 6.31 KB
/
run_gpt4o.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import time
import json
from argparse import ArgumentParser
import cv2
import base64
import requests
import numpy as np
def parse_args(argv=None):
    """Parse the command-line options of the GPT-4o evaluation script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour existing callers rely on.

    Returns:
        argparse.Namespace with the attributes ``video_path``,
        ``output_path`` and ``suffix``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--video_path",
        type=str,
        default=None,
        help="Root directory of the videos (one sub-directory per question split).",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="./output",
        help="Directory in which the predictions JSON file is written.",
    )
    parser.add_argument(
        "--suffix",
        type=str,
        default="gpt4o_predictions",
        help="Base name (without extension) of the predictions JSON file.",
    )
    return parser.parse_args(argv)
def sample_frames(array, num_samples=8):
    """Pick at most ``num_samples`` evenly spaced items from ``array``.

    Args:
        array: Sequence supporting ``len()`` and integer indexing.
        num_samples: Maximum number of items to keep.

    Returns:
        A new list. When ``array`` holds ``num_samples`` items or fewer, all
        of them are returned in order; otherwise the items at
        ``num_samples`` evenly spaced positions, always including the first
        and the last item.
    """
    length = len(array)
    if length <= num_samples:
        # Copy so the result never aliases the caller's sequence and the
        # return type is a list on both code paths.
        return list(array)
    # dtype=int truncates the evenly spaced positions to valid indices;
    # tolist() turns them into plain Python ints.
    indices = np.linspace(0, length - 1, num_samples, dtype=int).tolist()
    return [array[i] for i in indices]
def get_chat_gpt_response(prompt, base64Frames, api_key, max_retries=5, retry_delay=2, timeout=120):
    """Send a text prompt plus sampled video frames to the chat completions API.

    The frames are thinned out with ``sample_frames`` before being attached
    to the user message as base64 JPEG data URLs.

    Args:
        prompt: Text of the user message.
        base64Frames: List of base64-encoded JPEG frames.
        api_key: API key, with or without a leading ``"Bearer "``.
        max_retries: Maximum number of POST attempts.
        retry_delay: Seconds to sleep between two failed attempts.
        timeout: Seconds to wait for the server before an attempt is
            treated as failed.

    Returns:
        The decoded JSON response on success, otherwise a dict of the form
        ``{"error": "<message>"}``. A dict is always returned.
    """
    url = "https://api.openai.com/v1/chat/completions"
    # The API expects "Authorization: Bearer <key>"; add the scheme unless
    # the caller already supplied it.
    if api_key.lower().startswith("bearer "):
        authorization = api_key
    else:
        authorization = f"Bearer {api_key}"
    headers = {
        "Authorization": authorization,
        "Content-Type": "application/json"
    }

    image_parts = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{frame}",
            },
            # NOTE(review): "resize" is kept from the original payload; it
            # does not look like a documented field of an image_url content
            # part - confirm the API accepts or ignores it.
            "resize": 768
        }
        for frame in sample_frames(base64Frames)
    ]
    data = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "system",
                "content": "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture. You are chatting with the user via the ChatGPT iOS app. This means most of the time your lines should be a sentence or two, unless the user's request requires reasoning or long-form outputs. Never use emojis, unless explicitly asked to. Knowledge cutoff: 2023-10 Current date: 2024-08-15. Image input capabilities: Enabled Personality: v2"
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    *image_parts,
                ]
            }
        ]
    }

    # Reported when the loop body never runs (max_retries <= 0), so callers
    # always receive a dict rather than None.
    last_error = "no request attempted (max_retries <= 0)"
    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=data, timeout=timeout)
            response.raise_for_status()  # raise HTTPError for 4xx/5xx status codes
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers an undecodable JSON body on requests versions
            # whose JSON error is not a RequestException subclass.
            print(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
            last_error = str(e)
            if attempt < max_retries - 1:  # no point sleeping after the final attempt
                time.sleep(retry_delay)
    return {"error": last_error}
def process_description(video_key, base64Frames, api_key, prompt):
    """Query the model about one video and return the text of its answer.

    Args:
        video_key: Identifier of the video, used only in the status message.
        base64Frames: List of base64-encoded JPEG frames of the video.
        api_key: API key forwarded to ``get_chat_gpt_response``.
        prompt: Text prompt sent together with the frames.

    Returns:
        The ``content`` of the first choice's message, or ``None`` when the
        request failed or the response carries no such content.
    """
    response = get_chat_gpt_response(prompt, base64Frames, api_key)

    pred = None
    if isinstance(response, dict) and 'error' not in response:
        # "or" fallbacks also cover an empty "choices" list and explicit
        # nulls, which would otherwise raise IndexError / AttributeError.
        choices = response.get('choices') or [{}]
        message = choices[0].get('message') or {}
        pred = message.get('content', None)

    if pred is None:
        print(f"video processing: {video_key} fail.")
        return None
    print(f"video processing: {video_key} succeed.")
    return pred
def load_video_base64(video_path):
    """Decode every frame of a video into a base64-encoded JPEG string.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        List of base64 strings, one per successfully read and encoded frame,
        in playback order. The list is empty when the capture cannot be
        opened or yields no frame.
    """
    video = cv2.VideoCapture(video_path)
    base64Frames = []
    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            encoded, buffer = cv2.imencode(".jpg", frame)
            if not encoded:
                # Skip a frame that could not be JPEG-encoded instead of
                # base64-encoding an invalid buffer.
                continue
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    finally:
        # Release the capture even if reading or encoding raises.
        video.release()
    return base64Frames
if __name__ == "__main__":
    # Script entry point: for every question split, describe each video and
    # answer its yes/no questions with GPT-4o, checkpointing all predictions
    # to a single JSON file after each processed video.
    args = parse_args()
    if args.video_path is None:
        # os.path.join(None, ...) below would otherwise fail with a TypeError.
        raise SystemExit("--video_path is required.")

    # Read the key from the environment instead of hard-coding it in the file.
    api_key = os.environ.get("OPENAI_API_KEY", "")

    # Loading questions
    question_paths = {
        "entire": "./questions/entire_questions.json",
        "interleave": "./questions/interleave_questions.json",
        "misleading": "./questions/misleading_questions.json"
    }
    answer_prompt = "\nPlease answer yes or no:"
    predictions = {}

    os.makedirs(args.output_path, exist_ok=True)
    pred_file = f"{args.output_path}/{args.suffix}.json"

    for split, question_filepath in question_paths.items():
        ### split: question category
        with open(question_filepath, 'r', encoding="utf-8") as f:
            input_datas = json.load(f)
        predictions[split] = {}

        for video_info in input_datas:
            vid = video_info['id']
            # Duplicate ids must be looked up in the current split's results;
            # the top-level dict is keyed by split name, not by video id.
            if vid in predictions[split]:
                print (f"{vid} collapse.")
                continue

            video_info_with_predictions = video_info.copy()
            video_info_with_predictions["qa"] = []
            video_path = os.path.join(args.video_path, split, f"{vid}.mp4")
            base64Frames = load_video_base64(video_path)

            ### detailed description
            try:
                desc_inp = "Please describe this video in detail."
                pred_description = process_description(vid, base64Frames, api_key, desc_inp)
            except Exception as e:
                # Best effort: one failing request must not abort the whole run.
                print (f"Inference error: {video_path}, Error Detail: {e}")
                pred_description = ''
            video_info_with_predictions['desc'] = pred_description

            ### binary classification
            questions = video_info.get('questions')
            if questions is None:
                print (f"No questions: {video_path}")
                questions = []
            for question in questions:
                inp = question['question'] + answer_prompt
                try:
                    pred_output = process_description(vid, base64Frames, api_key, inp)
                except Exception as e:
                    print (f"Inference error: {video_path}, Error Detail: {e}")
                    pred_output = ''
                video_info_with_predictions["qa"].append({
                    'question': question['question'],
                    'answer': question['answer'],
                    'prediction': pred_output,
                })

            predictions[split][vid] = video_info_with_predictions
            # Rewrite the file after every video so partial results survive
            # an interrupted run.
            with open(pred_file, 'w', encoding="utf-8") as f:
                json.dump(predictions, f, indent=4)