[Release] LMMs-Eval v0.3.0, Audio Evaluation #426

Merged · 48 commits · Nov 27, 2024
Commits (48)
3e53152
[Feat] Add qwen2_audio model support and Automatic speech recognition…
Prophet-C Oct 1, 2024
6c17c18
add clotho_aqa task
pbcong Oct 4, 2024
9b32985
Apply black formatting
pbcong Oct 4, 2024
396c89a
formatting
pbcong Oct 4, 2024
8f03304
excluding xl due to downloading issue.
Luodian Oct 16, 2024
7fa93c9
[Feat] add audiobench version of clothoaqa (#302)
pbcong Oct 7, 2024
e1deb38
Add AIR_bench task (#315)
pbcong Oct 12, 2024
763d768
add common_voice_15 and people_speech tasks (#316)
Prophet-C Oct 12, 2024
e8cc550
add indent to yaml
Yingluo-momo Oct 16, 2024
6634344
Add openhermes task (#323)
pbcong Oct 16, 2024
6d3392a
[Refactor] Fixing doc to audio return type, qwen_audio revise (#329)
kcz358 Oct 18, 2024
d7d5c69
add muchomusic and vocalsound task (#331)
pbcong Oct 18, 2024
016eeef
add alpaca audio task (#333)
pbcong Oct 19, 2024
c0996fa
[feat] added gigaspeech config (#334)
Yingluo-momo Oct 20, 2024
b154837
add tedlium_long_form and tedlium_dev_test tasks (#345)
Prophet-C Oct 24, 2024
8c1618c
[Feat] add-wavcaps (#349)
Yingluo-momo Oct 25, 2024
9e4fc32
Update dep and fix log samples for audio (#355)
kcz358 Oct 27, 2024
0f94b16
fix vocalsound (#362)
pbcong Oct 28, 2024
b422011
Add using simple prompt for Qwen2 Audio to align (#360)
kcz358 Oct 28, 2024
a7f339c
Add retry for gpt api call and improve air_bench aggregation function…
pbcong Oct 30, 2024
f2cbf0c
[Feat] Add mix_evals audio2text (#420)
kcz358 Nov 23, 2024
aabb021
Gemini Audio (#421)
pufanyi Nov 24, 2024
30a47b3
Revise prompt
kcz358 Nov 25, 2024
8647c9c
delete redundant tasks in gigaspeech
KairuiHu Nov 26, 2024
fcb9464
Fix wavcaps bugs
kcz358 Nov 26, 2024
5593321
Add lmms-eval-0.3 docs
kcz358 Nov 27, 2024
f285d85
Update lmms-eval-0.3.md
kcz358 Nov 27, 2024
141496b
fix errors in markdown and add hyperlinks
KairuiHu Nov 27, 2024
0012f11
proofread markdown and fix errors
KairuiHu Nov 27, 2024
9762609
rewrite some parts to fix errors
KairuiHu Nov 27, 2024
fba53c8
rewrite some parts to fix errors
KairuiHu Nov 27, 2024
41c3c20
rewrite some parts to fix errors
KairuiHu Nov 27, 2024
2687589
try optimize the table format using html
KairuiHu Nov 27, 2024
baca83a
try optimize the table format using html
KairuiHu Nov 27, 2024
f9ff4de
try optimize the table 2 format
KairuiHu Nov 27, 2024
f7398c9
final proofread
KairuiHu Nov 27, 2024
d143eb6
final proofread
KairuiHu Nov 27, 2024
eedce48
final proofread
KairuiHu Nov 27, 2024
687a92b
add explanation for AIF and ASR
KairuiHu Nov 27, 2024
711aea4
standardize WER to WER(↓)
KairuiHu Nov 27, 2024
f999459
final proofread
KairuiHu Nov 27, 2024
3556ca4
final proofread
KairuiHu Nov 27, 2024
63ab6ba
final proofread
KairuiHu Nov 27, 2024
6bc6a21
final proofread
KairuiHu Nov 27, 2024
639db91
correct hyperlink errors
KairuiHu Nov 27, 2024
f2f5873
modify readme to support lmms-eval0.3.0 release
KairuiHu Nov 27, 2024
7cb99a0
modify icon
KairuiHu Nov 27, 2024
fb11759
fix typos
KairuiHu Nov 27, 2024
260 changes: 260 additions & 0 deletions docs/lmms-eval-0.3.md

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion lmms_eval/evaluator.py
@@ -492,7 +492,15 @@ def evaluate(
metrics = task.process_results(doc, [req.filtered_resps[filter_key] for req in requests])
if log_samples:
target = task.doc_to_target(doc)
saved_doc = {key: value for key, value in doc.items() if "image" not in key}
saved_doc = {}
for key, value in doc.items():
    # Drop image fields from the logged sample
    if "image" not in key:
        # Drop decoded audio fields as well (dicts carrying a raw "array" payload)
        if isinstance(value, dict) and "array" in value:
            continue
        else:
            saved_doc[key] = value
filtered_arguments = []
for req in requests:
# check if req.args is a list of tuples, and each item in the list is a serializable object
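For context, the new logic keeps only fields that are safe to log: any key containing "image" is dropped, as is any value that looks like a decoded audio clip (a dict with an "array" field). A minimal standalone sketch of that behavior on a made-up document:

```python
# Illustrative only: a toy row mimicking a Hugging Face audio dataset sample.
doc = {
    "id": 42,
    "question": "What sound is this?",
    "image": "<PIL.Image object>",                            # dropped: key contains "image"
    "audio": {"array": [0.1, -0.2], "sampling_rate": 16000},  # dropped: decoded audio dict
    "answer": "a dog barking",
}

saved_doc = {}
for key, value in doc.items():
    if "image" in key:
        continue
    if isinstance(value, dict) and "array" in value:
        continue
    saved_doc[key] = value

print(saved_doc)  # {'id': 42, 'question': 'What sound is this?', 'answer': 'a dog barking'}
```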
2 changes: 1 addition & 1 deletion lmms_eval/evaluator_utils.py
@@ -346,7 +346,7 @@ def consolidate_group_results(
task_root=None,
show_group_table=False,
task_aggregation_list=None,
) -> Tuple[dict, dict, bool, Union[None,]]:
) -> Tuple[dict, dict, bool, Union[None, dict]]:
"""
(Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.

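The fix above simply completes the second member of the return annotation; `Union[None, dict]` is equivalent to `Optional[dict]`:

```python
from typing import Optional, Union

# Equivalent annotations for "a dict or None"
x: Union[None, dict] = None
y: Optional[dict] = None
```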
1 change: 1 addition & 0 deletions lmms_eval/models/__init__.py
@@ -40,6 +40,7 @@
"phi3v": "Phi3v",
"qwen_vl": "Qwen_VL",
"qwen2_vl": "Qwen2_VL",
"qwen2_audio": "Qwen2_Audio",
"qwen_vl_api": "Qwen_VL_API",
"reka": "Reka",
"srt_api": "SRT_API",
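The registration above pairs the CLI model name `qwen2_audio` with the class name `Qwen2_Audio`. As a rough sketch (the helper below and the exact module layout are assumptions, not the actual lmms-eval internals), a registry like this is typically resolved with a lazy import:

```python
import importlib

# Hypothetical excerpt of the registry shown in the diff above.
AVAILABLE_MODELS = {
    "qwen2_vl": "Qwen2_VL",
    "qwen2_audio": "Qwen2_Audio",
}

def resolve_model(name: str):
    """Import lmms_eval.models.<name> and return the class named in the registry."""
    class_name = AVAILABLE_MODELS[name]
    module = importlib.import_module(f"lmms_eval.models.{name}")
    return getattr(module, class_name)

# resolve_model("qwen2_audio") would return the Qwen2_Audio wrapper class,
# assuming the implementation lives at lmms_eval/models/qwen2_audio.py.
```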
40 changes: 28 additions & 12 deletions lmms_eval/models/gemini_api.py
@@ -4,6 +4,7 @@
import time
from typing import List, Tuple

import datasets
from accelerate import Accelerator, DistributedType
from loguru import logger as eval_logger
from PIL import Image
@@ -25,29 +26,36 @@
eval_logger.error(f"Error importing generativeai: {str(e)}")
genai = None

try:
import soundfile as sf
except Exception as e:
eval_logger.warning(f"Error importing soundfile, audio generation will not work: {str(e)}")


@register_model("gemini_api")
class GeminiAPI(lmms):
def __init__(
self,
model_version: str = "gemini-1.5-pro",
modality: str = "image",
# modality: str = "image",
timeout: int = 120,
continual_mode: bool = False,
response_persistent_folder: str = None, # We will cache the Gemini API response in this path and use it for future requests
response_persistent_folder: str = "./logs/gemini_persistent_folder",
# We will cache the Gemini API response in this path and use it for future requests
**kwargs,
) -> None:
super().__init__()
self.model_version = model_version
self.timeout = timeout
self.model = genai.GenerativeModel(model_version)
self.continual_mode = continual_mode
if self.continual_mode and response_persistent_folder is None:
raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
self.response_persistent_folder = response_persistent_folder
if not os.path.exists(self.response_persistent_folder):
os.makedirs(self.response_persistent_folder)
self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
# if self.continual_mode and response_persistent_folder is None:
# raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
if self.continual_mode:
self.response_persistent_folder = response_persistent_folder
if not os.path.exists(self.response_persistent_folder):
os.makedirs(self.response_persistent_folder)
self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")

if os.path.exists(self.response_persistent_file):
with open(self.response_persistent_file, "r") as f:
Expand All @@ -73,7 +81,7 @@ def __init__(

self.device = self.accelerator.device

self.modality = modality
# self.modality = modality

self.video_pool = []

@@ -107,9 +115,17 @@ def encode_video(self, video_path):
self.video_pool.append(uploaded_obj)
return uploaded_obj

def convert_video(self, images):
def encode_audio(self, audio):
audio_io = io.BytesIO()
sf.write(audio_io, audio["array"], audio["sampling_rate"], format="WAV")
return genai.upload_file(audio_io, mime_type="audio/wav")

def convert_modality(self, images):
for idx, img in enumerate(images):
if self.modality == "video" and isinstance(img, str):
if isinstance(img, dict) and "sampling_rate" in img: # audio
audio = self.encode_audio(img)
images[idx] = audio
elif isinstance(img, str): # video
try:
images[idx] = self.encode_video(img)
except Exception as e:
@@ -145,7 +161,7 @@ def get_uuid(task, split, doc_id):

visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
visuals = self.flatten(visuals)
visuals = self.convert_video(visuals)
visuals = self.convert_modality(visuals)

message = [contexts] + visuals

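The key step in the new `encode_audio` is serializing a Hugging Face-style audio dict (`{"array": ..., "sampling_rate": ...}`) into an in-memory WAV buffer before uploading it to Gemini. A self-contained sketch of just that step, with a synthetic sine wave standing in for real dataset audio:

```python
import io

import numpy as np
import soundfile as sf

# Synthetic stand-in for a Hugging Face audio feature: 1 second of a 440 Hz tone at 16 kHz.
sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
audio = {"array": 0.5 * np.sin(2 * np.pi * 440 * t), "sampling_rate": sr}

# Write the array into an in-memory WAV file, as encode_audio does before
# handing the buffer to genai.upload_file(..., mime_type="audio/wav").
audio_io = io.BytesIO()
sf.write(audio_io, audio["array"], audio["sampling_rate"], format="WAV")
audio_io.seek(0)
print(f"{len(audio_io.getvalue())} bytes of WAV data ready to upload")
```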
7 changes: 7 additions & 0 deletions lmms_eval/models/model_utils/audio_processing.py
@@ -0,0 +1,7 @@
import numpy as np
from librosa import resample


def downsample_audio(audio_array: np.ndarray, original_sr: int, target_sr: int) -> np.ndarray:
    audio_resample_array = resample(audio_array, orig_sr=original_sr, target_sr=target_sr)
    return audio_resample_array
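A quick usage sketch for the new helper (synthetic input; resampling 48 kHz audio down to the 16 kHz rate that many audio encoders expect is a typical use):

```python
import numpy as np

from lmms_eval.models.model_utils.audio_processing import downsample_audio

# Synthetic 2-second clip at 48 kHz, resampled to 16 kHz.
clip_48k = np.random.randn(2 * 48000).astype(np.float32)
clip_16k = downsample_audio(clip_48k, original_sr=48000, target_sr=16000)
print(clip_48k.shape, "->", clip_16k.shape)  # (96000,) -> (32000,)
```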