Merge pull request #234 from eye-on-surveillance/AI/first-ord-extraction
AI/first-ord-extraction
Showing 16 changed files with 805 additions and 281 deletions.
packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc (2 changes: 1 addition & 1 deletion)
packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc (2 changes: 1 addition & 1 deletion)
packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc (4 changes: 2 additions & 2 deletions)
@@ -1,6 +1,6 @@
 outs:
-- md5: 89036cfaca111f3d2fa33ed70003aa46.dir
-  size: 242119
+- md5: d08e6dfb04e5afcff49028b2bffed7ad.dir
+  size: 526418
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_news
packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc (4 changes: 2 additions & 2 deletions)
@@ -1,6 +1,6 @@
 outs:
-- md5: 7645c2b96dcaf4936ee31103ac0c0968.dir
-  size: 2028257
+- md5: 75d23bd607ac91d67d994f624fb2e4c2.dir
+  size: 2097501
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_pc
packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc (4 changes: 2 additions & 2 deletions)
@@ -1,6 +1,6 @@
 outs:
-- md5: 47ce778b85c5553106c9f4da5f45ac28.dir
-  size: 54406163
+- md5: 28bfb00c7d3ebfbb4de620e9915e75b3.dir
+  size: 53418928
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_pdf
@@ -0,0 +1,389 @@
import os
import yaml
import argparse
import face_recognition
import cv2
import pandas as pd
from pydub import AudioSegment
from pytube import YouTube

from transformers import pipeline
from pyannote.audio import Pipeline
import torch
from moviepy.editor import VideoFileClip


def load_config(config_file):
    try:
        with open(config_file, "r") as stream:
            config = yaml.safe_load(stream)
            return config
    except FileNotFoundError:
        print(f"Config file '{config_file}' not found.")
        return None
    except yaml.YAMLError as e:
        print(f"Error parsing config file: {e}")
        return None


def load_face_labels(csv_file):
    df = pd.read_csv(csv_file)

    df.loc[:, "label"] = df.label.str.lower()
    df.loc[:, "filepath"] = df.filepath.str.lower()
    base_path = os.path.join(os.getcwd(), "training_data")  # Base path for images
    adjusted_filepaths = [os.path.join(base_path, path) for path in df["filepath"]]
    return dict(zip(adjusted_filepaths, df["label"]))


def preprocess_audio_for_diarization(file_path):
    audio = AudioSegment.from_file(file_path)
    audio = audio.set_channels(1).set_frame_rate(16000)
    preprocessed_path = file_path.replace(".mp3", "_preprocessed.wav")
    audio.export(preprocessed_path, format="wav")
    return preprocessed_path


def perform_diarization(file_path, access_token):
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization", use_auth_token=access_token
    )
    diarization = diarization_pipeline(file_path)
    return diarization


def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir):
    audio = AudioSegment.from_file(file_path)
    audio_segment = audio[start_time_ms:end_time_ms]
    segments = []

    for i in range(0, len(audio_segment), segment_length_ms):
        segment = audio_segment[i : i + segment_length_ms]
        segment_name = f"segment_{i // segment_length_ms}.mp3"
        segment_file = os.path.join(output_dir, segment_name)
        segment.export(segment_file, format="mp3")
        segments.append(segment_file)

    return segments


def process_segment_with_whisper_and_diarization(
    segment_path, diarization_results, pipe, model_batch_size
):
    transcript = pipe(
        segment_path, batch_size=model_batch_size, return_timestamps=True
    )["chunks"]
    print(f"TRANSCRIPT: {transcript}")

    diarized_transcript = []
    for chunk in transcript:
        start, end = chunk["timestamp"]
        speaker_label = get_speaker_label(diarization_results, start, end)
        diarized_transcript.append(
            {
                "start": start,
                "end": end,
                "speaker": speaker_label,
                "text": chunk["text"],
            }
        )
    return diarized_transcript


def get_speaker_label(diarization_results, start, end):
    # Handle None values for start and end
    if start is None or end is None:
        return None

    # Pick the diarization speaker whose turns overlap this chunk the most.
    overlap = {}
    for turn, _, speaker in diarization_results.itertracks(yield_label=True):
        if turn.end < start or turn.start > end:
            continue
        overlap[speaker] = (
            overlap.get(speaker, 0) + min(end, turn.end) - max(start, turn.start)
        )
    if overlap:
        return max(overlap, key=overlap.get)
    return None


def download_youtube_audio(url, save_path):
    try:
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        out_file = video.download(output_path=save_path)
        base, ext = os.path.splitext(out_file)
        new_file = base + ".mp3"
        os.rename(out_file, new_file)
        return new_file
    except Exception as e:
        print(f"Error downloading audio: {e}")
        return None


def encode_faces(face_labels):
    face_encodings = {}
    for filepath, label in face_labels.items():
        image = face_recognition.load_image_file(filepath)
        encodings = face_recognition.face_encodings(image)
        if not encodings:
            print(f"No face detected in image: {filepath}, skipping...")
            continue
        face_encodings[label] = encodings[0]
    return face_encodings


def extract_frames(video_path, timestamps_ms):
    cap = cv2.VideoCapture(video_path)
    frames = {}

    for timestamp_ms in timestamps_ms:
        cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_ms)
        ret, frame = cap.read()
        if ret:
            frames[timestamp_ms] = frame
    cap.release()
    return frames


def recognize_faces(frames, face_encodings):
    recognized_faces = {}
    for timestamp, frame in frames.items():
        face_locations = face_recognition.face_locations(frame, model="cnn")
        face_encs = face_recognition.face_encodings(frame, face_locations)
        for face_enc in face_encs:
            matches = face_recognition.compare_faces(
                list(face_encodings.values()), face_enc
            )
            if True in matches:
                first_match_index = matches.index(True)
                name = list(face_encodings.keys())[first_match_index]
                recognized_faces[timestamp] = name
                print(f"Timestamp (ms): {timestamp}, Recognized face: {name}")
    return recognized_faces


def map_faces_to_speakers(diarized_transcript, recognized_faces, tolerance_ms=1000):
    # For each diarized turn, vote among the faces recognized within the turn's
    # time window (plus/minus tolerance_ms); fall back to the diarization label
    # when no face matched.
    speaker_mapping = {}
    labelname_to_speaker_mapping = {}

    print("Received Recognized Faces (ms):", recognized_faces)

    for segment_tuple in diarized_transcript:
        segment, speaker = segment_tuple
        segment_start_ms = segment.start * 1000
        segment_end_ms = segment.end * 1000
        segment_speakers = []

        print(
            f"\nProcessing Segment: [{segment_start_ms} - {segment_end_ms}], Speaker Label: {speaker}"
        )

        for face_time_ms, name in recognized_faces.items():
            print(f"Checking Face Time: {face_time_ms}, Name: {name}")

            if (
                (segment_start_ms - tolerance_ms)
                <= face_time_ms
                <= (segment_end_ms + tolerance_ms)
            ):
                segment_speakers.append(name)
                print(
                    f"Matching Face Detected - Time (ms): {face_time_ms}, Name: {name}"
                )

        recognized_speaker = (
            max(set(segment_speakers), key=segment_speakers.count)
            if segment_speakers
            else None
        )
        identified_speaker = recognized_speaker if recognized_speaker else speaker

        speaker_mapping[(segment_start_ms, segment_end_ms)] = identified_speaker

        if recognized_speaker:
            labelname_to_speaker_mapping[speaker] = recognized_speaker

        print(
            f"Finalized Mapping for Segment: {segment_start_ms} - {segment_end_ms}, Speaker: {identified_speaker}"
        )

    return speaker_mapping, labelname_to_speaker_mapping


def export_diarized_transcript_with_names(
    diarized_transcript, labelname_to_speaker_mapping, save_loc
):
    with open(save_loc, "w") as file:
        for segment in diarized_transcript:
            print(f"DIARIZED SEGMENT IN EXPORT FUNC: {segment}")
            start, end, original_speaker_label, text = (
                segment["start"],
                segment["end"],
                segment["speaker"],
                segment["text"],
            )

            updated_speaker_label = labelname_to_speaker_mapping.get(
                original_speaker_label, original_speaker_label
            )

            file.write(f"{start}-{end} {updated_speaker_label}: {text}\n")


def extract_audio_from_mp4(video_file_path, output_audio_path):
    """
    Extracts the audio from an MP4 file and saves it as an MP3 file.
    """
    with VideoFileClip(video_file_path) as video:
        audio = video.audio
        audio.write_audiofile(output_audio_path, codec="mp3")
    return output_audio_path


def get_video_duration(video_path):
    with VideoFileClip(video_path) as video:
        return video.duration * 1000


def main():
    parser = argparse.ArgumentParser(
        description="Read configuration from transcribe_config YAML file"
    )
    parser.add_argument("config_file", help="Path to YAML config file")

    args = parser.parse_args()
    config_file = args.config_file
    config = load_config(config_file)

    if config:
        model_size = config["model"]["size"]
        model_device = "cuda" if torch.cuda.is_available() else "mps"
        model_chunk_length = int(config["model"]["chunk_length"])
        model_batch_size = int(config["model"]["batch_size"])

        video_path = "input/Regular Council Mtg 1-4-2024.mp4"
        audio_output_path = "output_audio/test_audio.mp3"
        if not os.path.exists(audio_output_path):
            audio_path = extract_audio_from_mp4(video_path, audio_output_path)
        else:
            print(f"Audio file already exists at {audio_output_path}")
            audio_path = audio_output_path

        config["audio"]["path"] = audio_path

        save_loc = config["transcript"]["save_loc"]

        print("Model Size:", model_size)
        print("Model Device:", model_device)
        print("Chunk Length:", model_chunk_length)
        print("Batch Size:", model_batch_size)
        print("Audio Path:", audio_path)
        print("---------------")

        model_names = {
            "tiny": "openai/whisper-tiny.en",
            "base": "openai/whisper-base.en",
            "small": "openai/whisper-small.en",
            "medium": "openai/whisper-medium.en",
            "large": "openai/whisper-large",
            "large-v2": "openai/whisper-large-v2",
        }

        model = model_names.get(model_size, "openai/whisper-tiny")

        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            chunk_length_s=model_chunk_length,
            device=model_device,
        )

        total_duration_ms = get_video_duration(video_path)
        last_15_minutes_ms = 15 * 60 * 1000

        start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0)
        end_time_ms = total_duration_ms

        segment_length_ms = 60000

        segments = split_audio(
            audio_path, start_time_ms, end_time_ms, segment_length_ms, "output_audio"
        )

        access_token = ""

        full_diarized_transcript = []
        combined_diarization_data = []
        if segments:
            segment = segments[0]
            print(f"Processing segment: {segment}")

            preprocessed_segment_path = preprocess_audio_for_diarization(segment)
            diarization = perform_diarization(preprocessed_segment_path, access_token)

            for turn, _, speaker in diarization.itertracks(yield_label=True):
                combined_diarization_data.append((turn, speaker))

            segment_transcript = process_segment_with_whisper_and_diarization(
                segment, diarization, pipe, model_batch_size
            )
            full_diarized_transcript.extend(segment_transcript)
            os.remove(segment)

        # Load face labels from CSV and prepare face encodings
        face_labels = load_face_labels("training_data/training_data.csv")
        face_encodings = encode_faces(face_labels)
        video_path = "input/Regular Council Mtg 1-4-2024.mp4"

        offset_ms = start_time_ms

        # Adjust timestamps for frame extraction
        adjusted_timestamps = []
        for segment in full_diarized_transcript:
            if segment["end"] is None or segment["speaker"] is None:
                continue

            print("Start ms", segment)
            start_ms = int(segment["start"] * 1000) + offset_ms
            adjusted_timestamps.append(start_ms)
        print(f"Adjusted timestamps {adjusted_timestamps}")

        # Extract and save frames
        frames = extract_frames(video_path, adjusted_timestamps)
        for timestamp in adjusted_timestamps:
            print(f"TIMESTAMP FOR FRAME: {timestamp}")
            if timestamp in frames:
                frame = frames[timestamp]
                filename = os.path.join("output_frames", f"frame_{timestamp}.jpg")
                cv2.imwrite(filename, frame)

        recognized_faces = recognize_faces(frames, face_encodings)
        print("Recognized Faces Original", recognized_faces)

        recognized_faces_adjusted = {
            timestamp - offset_ms: name for timestamp, name in recognized_faces.items()
        }
        print("Adjusted Recognized Faces:", recognized_faces_adjusted)

        print("Combined Diarization Data:", combined_diarization_data)

        speaker_mapping, labelname_to_speaker_mapping = map_faces_to_speakers(
            combined_diarization_data, recognized_faces_adjusted, tolerance_ms=1
        )

        print("Speaker Mapping:", speaker_mapping)

        export_diarized_transcript_with_names(
            full_diarized_transcript, labelname_to_speaker_mapping, save_loc
        )

        print(
            "Transcription and Diarization with Speaker Names Complete. Saved to",
            save_loc,
        )


if __name__ == "__main__":
    main()
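Two inputs are assumed by the script above but not produced by it: a Hugging Face access token for the gated pyannote/speaker-diarization pipeline (access_token is left as an empty string in main()), and training_data/training_data.csv with filepath and label columns for building the face encodings. A minimal sketch of supplying the token from an environment variable instead of hard-coding it inside main(); the variable name HF_TOKEN is an illustrative choice, not something this diff defines:

import os

# HF_TOKEN is a hypothetical environment variable holding a Hugging Face token
# that has been granted access to pyannote/speaker-diarization.
access_token = os.environ.get("HF_TOKEN", "")
if not access_token:
    raise SystemExit("Set HF_TOKEN before running the diarization step.")
diarization = perform_diarization(preprocessed_segment_path, access_token)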
@@ -1,12 +1,12 @@
 model:
-  size: "tiny"
+  size: "base"
   device: "mps" # or your specific GPU device
-  chunk_length: "30"
-  batch_size: 12
+  chunk_length: "60"
+  batch_size: 4
 audio:
   path: "output_audio/City Council Meeting 142024.mp3"
   offset_ms: 0 # Adjust the offset based on your requirements
 transcript:
-  save_loc: "output_transcript/test_transcript.txt"
+  save_loc: "output_transcript/test-transcript-chunk-60-base.txt"
 diarization:
   path: "diarization" # Adjust the path to your diarization data
@@ -0,0 +1,316 @@
import os
import yaml
import argparse
import face_recognition
import pandas as pd
from pydub import AudioSegment
from pytube import YouTube
from transformers import pipeline
import torch
from moviepy.editor import VideoFileClip

import numpy as np
import librosa
import soundfile as sf

from pydub.effects import normalize, compress_dynamic_range
from pydub import silence
from scipy.signal import butter, lfilter


def load_config(config_file):
    try:
        with open(config_file, "r") as stream:
            config = yaml.safe_load(stream)
            return config
    except FileNotFoundError:
        print(f"Config file '{config_file}' not found.")
        return None
    except yaml.YAMLError as e:
        print(f"Error parsing config file: {e}")
        return None


def process_segment_with_whisper(segment_path, pipe, model_batch_size):
    transcript = pipe(
        segment_path, batch_size=model_batch_size, return_timestamps=True
    )["chunks"]
    print(f"TRANSCRIPT: {transcript}")

    processed_transcript = []
    for chunk in transcript:
        start, end = chunk["timestamp"]
        processed_transcript.append(
            {
                "start": start,
                "end": end,
                "text": chunk["text"],
            }
        )
    return processed_transcript


def download_youtube_audio(url, save_path):
    try:
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        out_file = video.download(output_path=save_path)
        base, ext = os.path.splitext(out_file)
        new_file = base + ".mp3"
        os.rename(out_file, new_file)
        return new_file
    except Exception as e:
        print(f"Error downloading audio: {e}")
        return None


def recognize_faces(frames, face_encodings):
    recognized_faces = {}
    for timestamp, frame in frames.items():
        face_locations = face_recognition.face_locations(frame, model="cnn")
        face_encs = face_recognition.face_encodings(frame, face_locations)
        for face_enc in face_encs:
            matches = face_recognition.compare_faces(
                list(face_encodings.values()), face_enc
            )
            if True in matches:
                first_match_index = matches.index(True)
                name = list(face_encodings.keys())[first_match_index]
                recognized_faces[timestamp] = name
                print(f"Timestamp (ms): {timestamp}, Recognized face: {name}")
    return recognized_faces


def export_transcript(transcript, save_loc):
    with open(save_loc, "w") as file:
        for segment in transcript:
            print(f"TRANSCRIPT SEGMENT IN EXPORT FUNC: {segment}")
            start, end, text = (
                segment["start"],
                segment["end"],
                segment["text"],
            )

            file.write(f"{start}-{end}: {text}\n")


def get_video_duration(video_path):
    with VideoFileClip(video_path) as video:
        return video.duration * 1000


def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir):
    audio = AudioSegment.from_file(file_path)
    audio_segment = audio[start_time_ms:end_time_ms]
    segments = []

    for i in range(0, len(audio_segment), segment_length_ms):
        segment = audio_segment[i : i + segment_length_ms]
        segment_name = f"segment_{i // segment_length_ms}.mp3"
        segment_file = os.path.join(output_dir, segment_name)
        segment.export(segment_file, format="mp3")
        segments.append(segment_file)

    return segments


def preprocess_and_extract_audio(
    video_path, output_audio_path, target_sample_rate=16000
):
    with VideoFileClip(video_path) as video:
        audio = video.audio
        temp_audio_path = "temp_audio.wav"
        audio.write_audiofile(temp_audio_path, codec="pcm_s16le")

    y, sr = librosa.load(temp_audio_path, sr=target_sample_rate)

    sf.write(output_audio_path, y, target_sample_rate)

    os.remove(temp_audio_path)


def apply_dynamic_range_compression(audio_path):
    audio = AudioSegment.from_file(audio_path)
    compressed_audio = compress_dynamic_range(audio)
    normalized_audio = normalize(compressed_audio)
    compressed_audio_path = "compressed_" + os.path.basename(audio_path)
    normalized_audio.export(compressed_audio_path, format="mp3")
    return compressed_audio_path


def remove_silence(audio_path, silence_thresh=-40, min_silence_len=1000):
    # Drop long pauses but keep 500 ms of padding around each non-silent chunk.
    audio = AudioSegment.from_file(audio_path)
    non_silent_chunks = silence.split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=500,
    )

    processed_audio = AudioSegment.empty()
    for chunk in non_silent_chunks:
        processed_audio += chunk

    processed_audio_path = "processed_" + os.path.basename(audio_path)
    processed_audio.export(processed_audio_path, format="mp3")
    return processed_audio_path


def butter_bandpass(lowcut, highcut, fs, order=5):
    # Normalize the cutoff frequencies by the Nyquist frequency before designing the filter.
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype="band")
    return b, a


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y


def bandpass_filter(audio_path, low=300, high=3400, sr=16000):
    # 300-3400 Hz roughly matches the traditional telephony speech band.
    y, sr = librosa.load(audio_path, sr=sr)

    y_filtered = butter_bandpass_filter(y, low, high, sr)

    filtered_audio_path = "filtered_" + os.path.basename(audio_path)
    sf.write(filtered_audio_path, y_filtered, sr)

    return filtered_audio_path


def change_speed(audio_path, speed=1.0):
    audio = AudioSegment.from_file(audio_path)
    playback_speed_audio = audio.speedup(playback_speed=speed)
    speed_changed_audio_path = "speed_changed_" + os.path.basename(audio_path)
    playback_speed_audio.export(speed_changed_audio_path, format="mp3")
    return speed_changed_audio_path


def audio_preprocessing_pipeline(audio_segment_path):
    # Chain: dynamic-range compression -> silence removal -> band-pass filtering.
    compressed_audio_path = apply_dynamic_range_compression(audio_segment_path)

    no_silence_audio_path = remove_silence(compressed_audio_path)

    filtered_audio_path = bandpass_filter(no_silence_audio_path)

    return filtered_audio_path


def process_segment_with_whisper(segment_path, pipe, model_batch_size):
    transcript = pipe(
        segment_path, batch_size=model_batch_size, return_timestamps=True
    )["chunks"]
    print(f"TRANSCRIPT: {transcript}")

    processed_transcript = []
    for chunk in transcript:
        start, end = chunk["timestamp"]
        processed_transcript.append(
            {
                "start": start,
                "end": end,
                "text": chunk["text"],
            }
        )
    return processed_transcript


def extract_audio_from_mp4(video_file_path, output_audio_path):
    """
    Extracts the audio from an MP4 file and saves it as an MP3 file.
    """
    with VideoFileClip(video_file_path) as video:
        audio = video.audio
        audio.write_audiofile(output_audio_path, codec="mp3")
    return output_audio_path


def main():
    parser = argparse.ArgumentParser(
        description="Read configuration from transcribe_config YAML file"
    )
    parser.add_argument("config_file", help="Path to YAML config file")

    args = parser.parse_args()
    config_file = args.config_file
    config = load_config(config_file)

    if config:
        model_size = config["model"]["size"]
        model_device = "cuda" if torch.cuda.is_available() else "mps"
        model_chunk_length = int(config["model"]["chunk_length"])
        model_batch_size = int(config["model"]["batch_size"])

        video_path = "input/Regular Council Mtg 1-4-2024.mp4"
        audio_output_path = "output_audio/test_audio.mp3"
        if not os.path.exists(audio_output_path):
            audio_path = extract_audio_from_mp4(video_path, audio_output_path)
        else:
            print(f"Audio file already exists at {audio_output_path}")
            audio_path = audio_output_path

        config["audio"]["path"] = audio_path

        save_loc = config["transcript"]["save_loc"]

        print("Model Size:", model_size)
        print("Model Device:", model_device)
        print("Chunk Length:", model_chunk_length)
        print("Batch Size:", model_batch_size)
        print("Audio Path:", audio_path)
        print("---------------")

        model_names = {
            "tiny": "openai/whisper-tiny.en",
            "base": "openai/whisper-base.en",
            "small": "openai/whisper-small.en",
            "medium": "openai/whisper-medium.en",
            "large": "openai/whisper-large",
            "large-v2": "openai/whisper-large-v2",
            "large-v3": "openai/whisper-large-v3",
        }

        model = model_names.get(model_size, "openai/whisper-large")

        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            chunk_length_s=model_chunk_length,
            device=model_device,
        )

        total_duration_ms = get_video_duration(video_path)
        last_15_minutes_ms = 15 * 60 * 1000
        start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0)
        end_time_ms = total_duration_ms

        audio = AudioSegment.from_file(audio_path)
        last_15_min_audio = audio[start_time_ms:end_time_ms]
        last_15_min_audio_path = os.path.join("output_audio", "last_15_min.mp3")
        last_15_min_audio.export(last_15_min_audio_path, format="mp3")

        preprocessed_audio_path = audio_preprocessing_pipeline(last_15_min_audio_path)

        print(f"Processing audio: {preprocessed_audio_path}")
        full_transcript = process_segment_with_whisper(
            preprocessed_audio_path, pipe, model_batch_size
        )

        export_transcript(full_transcript, save_loc)

        print("Transcription Complete. Saved to", save_loc)


if __name__ == "__main__":
    main()
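For quick experiments, the audio cleanup chain can be exercised on its own, independent of the video handling in main(). A minimal sketch, assuming it runs in the same file as the functions above (the input path is illustrative):

# Run compression -> silence removal -> band-pass filtering on an existing mp3.
cleaned_path = audio_preprocessing_pipeline("output_audio/last_15_min.mp3")
print("Preprocessed audio written to:", cleaned_path)

Each stage writes an intermediate mp3 into the current working directory ("compressed_", "processed_", "filtered_" prefixes on the input's basename), and the returned path points at the final band-pass-filtered file.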