[Release] LMMs-Eval v0.3.0, Audio Evaluation #426

Merged · 48 commits · Nov 27, 2024
Commits (48)
3e53152
[Feat] Add qwen2_audio model support and Automatic speech recognition…
Prophet-C Oct 1, 2024
6c17c18
add clotho_aqa task
pbcong Oct 4, 2024
9b32985
Apply black formatting
pbcong Oct 4, 2024
396c89a
formatting
pbcong Oct 4, 2024
8f03304
excluding xl due to downloading issue.
Luodian Oct 16, 2024
7fa93c9
[Feat] add audiobench version of clothoaqa (#302)
pbcong Oct 7, 2024
e1deb38
Add AIR_bench task (#315)
pbcong Oct 12, 2024
763d768
add common_voice_15 and people_speech tasks (#316)
Prophet-C Oct 12, 2024
e8cc550
add indent to yaml
Yingluo-momo Oct 16, 2024
6634344
Add openhermes task (#323)
pbcong Oct 16, 2024
6d3392a
[Refactor] Fixing doc to audio return type, qwen_audio revise (#329)
kcz358 Oct 18, 2024
d7d5c69
add muchomusic and vocalsound task (#331)
pbcong Oct 18, 2024
016eeef
add alpaca audio task (#333)
pbcong Oct 19, 2024
c0996fa
[feat] added gigaspeech config (#334)
Yingluo-momo Oct 20, 2024
b154837
add tedlium_long_form and tedlium_dev_test tasks (#345)
Prophet-C Oct 24, 2024
8c1618c
[Feat] add-wavcaps (#349)
Yingluo-momo Oct 25, 2024
9e4fc32
Update dep and fix log samples for audio (#355)
kcz358 Oct 27, 2024
0f94b16
fix vocalsound (#362)
pbcong Oct 28, 2024
b422011
Add using simple prompt for Qwen2 Audio to align (#360)
kcz358 Oct 28, 2024
a7f339c
Add retry for gpt api call and improve air_bench aggregation function…
pbcong Oct 30, 2024
f2cbf0c
[Feat] Add mix_evals audio2text (#420)
kcz358 Nov 23, 2024
aabb021
Gemini Audio (#421)
pufanyi Nov 24, 2024
30a47b3
Revise prompt
kcz358 Nov 25, 2024
8647c9c
delete redundant tasks in gigaspeech
KairuiHu Nov 26, 2024
fcb9464
Fix wavcaps bugs
kcz358 Nov 26, 2024
5593321
Add lmms-eval-0.3 docs
kcz358 Nov 27, 2024
f285d85
Update lmms-eval-0.3.md
kcz358 Nov 27, 2024
141496b
fix errors in markdown and add hyperlinks
KairuiHu Nov 27, 2024
0012f11
proofread markdown and fix errors
KairuiHu Nov 27, 2024
9762609
rewrite some parts to fix errors
KairuiHu Nov 27, 2024
fba53c8
rewrite some parts to fix errors
KairuiHu Nov 27, 2024
41c3c20
rewrite some parts to fix errors
KairuiHu Nov 27, 2024
2687589
try optimize the table format using html
KairuiHu Nov 27, 2024
baca83a
try optimize the table format using html
KairuiHu Nov 27, 2024
f9ff4de
try optimize the table 2 format
KairuiHu Nov 27, 2024
f7398c9
final proofread
KairuiHu Nov 27, 2024
d143eb6
final proofread
KairuiHu Nov 27, 2024
eedce48
final proofread
KairuiHu Nov 27, 2024
687a92b
add explanation for AIF and ASR
KairuiHu Nov 27, 2024
711aea4
standardize WER to WER(↓)
KairuiHu Nov 27, 2024
f999459
final proofread
KairuiHu Nov 27, 2024
3556ca4
final proofread
KairuiHu Nov 27, 2024
63ab6ba
final proofread
KairuiHu Nov 27, 2024
6bc6a21
final proofread
KairuiHu Nov 27, 2024
639db91
correct hyperlink errors
KairuiHu Nov 27, 2024
f2f5873
modify readme to support lmms-eval0.3.0 release
KairuiHu Nov 27, 2024
7cb99a0
modify icon
KairuiHu Nov 27, 2024
fb11759
fix typos
KairuiHu Nov 27, 2024
260 changes: 260 additions & 0 deletions docs/lmms-eval-0.3.md

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion lmms_eval/evaluator.py
@@ -492,7 +492,15 @@ def evaluate(
metrics = task.process_results(doc, [req.filtered_resps[filter_key] for req in requests])
if log_samples:
target = task.doc_to_target(doc)
saved_doc = {key: value for key, value in doc.items() if "image" not in key}
saved_doc = {}
for key, value in doc.items():
    # Drop image fields from the logged sample
    if "image" not in key:
        # Drop decoded audio fields as well (dicts carrying a raw "array" payload)
        if isinstance(value, dict) and "array" in value:
            continue
        else:
            saved_doc[key] = value
filtered_arguments = []
for req in requests:
# check if req.args is a list of tuples, and each item in the list is a serializable object
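For context, the new logic keeps only fields that are safe to log: any key containing "image" is dropped, as is any value that looks like a decoded audio clip (a dict with an "array" field). A minimal standalone sketch of that behavior on a made-up document:

```python
# Illustrative only: a toy row mimicking a Hugging Face audio dataset sample.
doc = {
    "id": 42,
    "question": "What sound is this?",
    "image": "<PIL.Image object>",                            # dropped: key contains "image"
    "audio": {"array": [0.1, -0.2], "sampling_rate": 16000},  # dropped: decoded audio dict
    "answer": "a dog barking",
}

saved_doc = {}
for key, value in doc.items():
    if "image" in key:
        continue
    if isinstance(value, dict) and "array" in value:
        continue
    saved_doc[key] = value

print(saved_doc)  # {'id': 42, 'question': 'What sound is this?', 'answer': 'a dog barking'}
```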
2 changes: 1 addition & 1 deletion lmms_eval/evaluator_utils.py
@@ -346,7 +346,7 @@ def consolidate_group_results(
task_root=None,
show_group_table=False,
task_aggregation_list=None,
) -> Tuple[dict, dict, bool, Union[None,]]:
) -> Tuple[dict, dict, bool, Union[None, dict]]:
"""
(Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.

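The fix above simply completes the second member of the return annotation; `Union[None, dict]` is equivalent to `Optional[dict]`:

```python
from typing import Optional, Union

# Equivalent annotations for "a dict or None"
x: Union[None, dict] = None
y: Optional[dict] = None
```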
1 change: 1 addition & 0 deletions lmms_eval/models/__init__.py
@@ -40,6 +40,7 @@
"phi3v": "Phi3v",
"qwen_vl": "Qwen_VL",
"qwen2_vl": "Qwen2_VL",
"qwen2_audio": "Qwen2_Audio",
"qwen_vl_api": "Qwen_VL_API",
"reka": "Reka",
"srt_api": "SRT_API",
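The registration above pairs the CLI model name `qwen2_audio` with the class name `Qwen2_Audio`. As a rough sketch (the helper below and the exact module layout are assumptions, not the actual lmms-eval internals), a registry like this is typically resolved with a lazy import:

```python
import importlib

# Hypothetical excerpt of the registry shown in the diff above.
AVAILABLE_MODELS = {
    "qwen2_vl": "Qwen2_VL",
    "qwen2_audio": "Qwen2_Audio",
}

def resolve_model(name: str):
    """Import lmms_eval.models.<name> and return the class named in the registry."""
    class_name = AVAILABLE_MODELS[name]
    module = importlib.import_module(f"lmms_eval.models.{name}")
    return getattr(module, class_name)

# resolve_model("qwen2_audio") would return the Qwen2_Audio wrapper class,
# assuming the implementation lives at lmms_eval/models/qwen2_audio.py.
```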
40 changes: 28 additions & 12 deletions lmms_eval/models/gemini_api.py
@@ -4,6 +4,7 @@
import time
from typing import List, Tuple

import datasets
from accelerate import Accelerator, DistributedType
from loguru import logger as eval_logger
from PIL import Image
@@ -25,29 +26,36 @@
eval_logger.error(f"Error importing generativeai: {str(e)}")
genai = None

try:
import soundfile as sf
except Exception as e:
eval_logger.warning(f"Error importing soundfile, audio generation will not work: {str(e)}")


@register_model("gemini_api")
class GeminiAPI(lmms):
def __init__(
self,
model_version: str = "gemini-1.5-pro",
modality: str = "image",
# modality: str = "image",
timeout: int = 120,
continual_mode: bool = False,
response_persistent_folder: str = None, # We will cache the Gemini API response in this path and use it for future requests
response_persistent_folder: str = "./logs/gemini_persistent_folder",
# We will cache the Gemini API response in this path and use it for future requests
**kwargs,
) -> None:
super().__init__()
self.model_version = model_version
self.timeout = timeout
self.model = genai.GenerativeModel(model_version)
self.continual_mode = continual_mode
if self.continual_mode and response_persistent_folder is None:
raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
self.response_persistent_folder = response_persistent_folder
if not os.path.exists(self.response_persistent_folder):
os.makedirs(self.response_persistent_folder)
self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
# if self.continual_mode and response_persistent_folder is None:
# raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
if self.continual_mode:
self.response_persistent_folder = response_persistent_folder
if not os.path.exists(self.response_persistent_folder):
os.makedirs(self.response_persistent_folder)
self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")

if os.path.exists(self.response_persistent_file):
with open(self.response_persistent_file, "r") as f:
Expand All @@ -73,7 +81,7 @@ def __init__(

self.device = self.accelerator.device

self.modality = modality
# self.modality = modality

self.video_pool = []

@@ -107,9 +115,17 @@ def encode_video(self, video_path):
self.video_pool.append(uploaded_obj)
return uploaded_obj

def convert_video(self, images):
def encode_audio(self, audio):
audio_io = io.BytesIO()
sf.write(audio_io, audio["array"], audio["sampling_rate"], format="WAV")
return genai.upload_file(audio_io, mime_type="audio/wav")

def convert_modality(self, images):
for idx, img in enumerate(images):
if self.modality == "video" and isinstance(img, str):
if isinstance(img, dict) and "sampling_rate" in img: # audio
audio = self.encode_audio(img)
images[idx] = audio
elif isinstance(img, str): # video
try:
images[idx] = self.encode_video(img)
except Exception as e:
@@ -145,7 +161,7 @@ def get_uuid(task, split, doc_id):

visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
visuals = self.flatten(visuals)
visuals = self.convert_video(visuals)
visuals = self.convert_modality(visuals)

message = [contexts] + visuals

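The key step in the new `encode_audio` is serializing a Hugging Face-style audio dict (`{"array": ..., "sampling_rate": ...}`) into an in-memory WAV buffer before uploading it to Gemini. A self-contained sketch of just that step, with a synthetic sine wave standing in for real dataset audio:

```python
import io

import numpy as np
import soundfile as sf

# Synthetic stand-in for a Hugging Face audio feature: 1 second of a 440 Hz tone at 16 kHz.
sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
audio = {"array": 0.5 * np.sin(2 * np.pi * 440 * t), "sampling_rate": sr}

# Write the array into an in-memory WAV file, as encode_audio does before
# handing the buffer to genai.upload_file(..., mime_type="audio/wav").
audio_io = io.BytesIO()
sf.write(audio_io, audio["array"], audio["sampling_rate"], format="WAV")
audio_io.seek(0)
print(f"{len(audio_io.getvalue())} bytes of WAV data ready to upload")
```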
7 changes: 7 additions & 0 deletions lmms_eval/models/model_utils/audio_processing.py
@@ -0,0 +1,7 @@
import numpy as np
from librosa import resample


def downsample_audio(audio_array: np.ndarray, original_sr: int, target_sr: int) -> np.ndarray:
    audio_resample_array = resample(audio_array, orig_sr=original_sr, target_sr=target_sr)
    return audio_resample_array
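A quick usage sketch for the new helper (synthetic input; resampling 48 kHz audio down to the 16 kHz rate that many audio encoders expect is a typical use):

```python
import numpy as np

from lmms_eval.models.model_utils.audio_processing import downsample_audio

# Synthetic 2-second clip at 48 kHz, resampled to 16 kHz.
clip_48k = np.random.randn(2 * 48000).astype(np.float32)
clip_16k = downsample_audio(clip_48k, original_sr=48000, target_sr=16000)
print(clip_48k.shape, "->", clip_16k.shape)  # (96000,) -> (32000,)
```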