diff --git a/packages/backend/src/cache/faiss_index_in_depth_cj.dvc b/packages/backend/src/cache/faiss_index_in_depth_cj.dvc index 465e570a..86b714b9 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_cj.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_cj.dvc @@ -1,5 +1,5 @@ outs: -- md5: ccd643270cd24af47d96dfb198d468ae.dir +- md5: 9c2090ec6ee84ddf3792866364b35b0f.dir size: 23116757 nfiles: 2 hash: md5 diff --git a/packages/backend/src/cache/faiss_index_in_depth_fc.dvc b/packages/backend/src/cache/faiss_index_in_depth_fc.dvc index 584bd5a5..e67fea50 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_fc.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_fc.dvc @@ -1,5 +1,5 @@ outs: -- md5: a06190b8eee319f9083f642bb844d6a4.dir +- md5: ad7c688f8fff83d65f8728a403c864dd.dir size: 63799934 nfiles: 2 hash: md5 diff --git a/packages/backend/src/cache/faiss_index_in_depth_news.dvc b/packages/backend/src/cache/faiss_index_in_depth_news.dvc index f66d927c..625cf616 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_news.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_news.dvc @@ -1,6 +1,6 @@ outs: -- md5: 89036cfaca111f3d2fa33ed70003aa46.dir - size: 242119 +- md5: d08e6dfb04e5afcff49028b2bffed7ad.dir + size: 526418 nfiles: 2 hash: md5 path: faiss_index_in_depth_news diff --git a/packages/backend/src/cache/faiss_index_in_depth_pc.dvc b/packages/backend/src/cache/faiss_index_in_depth_pc.dvc index 8627ee16..171e7ff6 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_pc.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_pc.dvc @@ -1,6 +1,6 @@ outs: -- md5: 7645c2b96dcaf4936ee31103ac0c0968.dir - size: 2028257 +- md5: 75d23bd607ac91d67d994f624fb2e4c2.dir + size: 2097501 nfiles: 2 hash: md5 path: faiss_index_in_depth_pc diff --git a/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc b/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc index 953f6df4..5913b763 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc @@ -1,6 +1,6 @@ outs: -- md5: 47ce778b85c5553106c9f4da5f45ac28.dir - size: 54406163 +- md5: 28bfb00c7d3ebfbb4de620e9915e75b3.dir + size: 53418928 nfiles: 2 hash: md5 path: faiss_index_in_depth_pdf diff --git a/packages/backend/src/preprocessor.py b/packages/backend/src/preprocessor.py index ae894cc7..b2b276ea 100644 --- a/packages/backend/src/preprocessor.py +++ b/packages/backend/src/preprocessor.py @@ -6,7 +6,7 @@ from langchain.chains import LLMChain, HypotheticalDocumentEmbedder from langchain.prompts import PromptTemplate -from langchain.vectorstores.faiss import FAISS +from langchain_community.vectorstores import FAISS from langchain_openai import OpenAI from pathlib import Path import shutil @@ -73,7 +73,7 @@ def create_db_from_minutes_and_agendas(doc_directory): data = loader.load() text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=1000 + chunk_size=2000, chunk_overlap=100 ) docs = text_splitter.split_documents(data) all_docs.extend(docs) @@ -103,7 +103,7 @@ def create_db_from_news_transcripts(news_json_directory): data = loader.load() text_splitter = RecursiveCharacterTextSplitter( - chunk_size=10000, chunk_overlap=5000 + chunk_size=2000, chunk_overlap=100 ) docs = text_splitter.split_documents(data) all_docs.extend(docs) @@ -136,7 +136,7 @@ def create_db_from_cj_transcripts(cj_json_directory): data = loader.load() text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=1000 + 
chunk_size=2000, chunk_overlap=100 ) docs = text_splitter.split_documents(data) @@ -169,7 +169,7 @@ def create_db_from_fc_transcripts(fc_json_directory): data = loader.load() text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=1000 + chunk_size=2000, chunk_overlap=100 ) docs = text_splitter.split_documents(data) # Append the publish date to the end of page_content @@ -199,7 +199,7 @@ def create_db_from_public_comments(pc_json_directory): data = loader.load() text_splitter = RecursiveCharacterTextSplitter( - chunk_size=10000, chunk_overlap=5000 + chunk_size=2000, chunk_overlap=100 ) docs = text_splitter.split_documents(data) all_docs.extend(docs) diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc index 465e570a..86b714b9 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc @@ -1,5 +1,5 @@ outs: -- md5: ccd643270cd24af47d96dfb198d468ae.dir +- md5: 9c2090ec6ee84ddf3792866364b35b0f.dir size: 23116757 nfiles: 2 hash: md5 diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc index 584bd5a5..e67fea50 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc @@ -1,5 +1,5 @@ outs: -- md5: a06190b8eee319f9083f642bb844d6a4.dir +- md5: ad7c688f8fff83d65f8728a403c864dd.dir size: 63799934 nfiles: 2 hash: md5 diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc index f66d927c..625cf616 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc @@ -1,6 +1,6 @@ outs: -- md5: 89036cfaca111f3d2fa33ed70003aa46.dir - size: 242119 +- md5: d08e6dfb04e5afcff49028b2bffed7ad.dir + size: 526418 nfiles: 2 hash: md5 path: faiss_index_in_depth_news diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc index 8627ee16..171e7ff6 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc @@ -1,6 +1,6 @@ outs: -- md5: 7645c2b96dcaf4936ee31103ac0c0968.dir - size: 2028257 +- md5: 75d23bd607ac91d67d994f624fb2e4c2.dir + size: 2097501 nfiles: 2 hash: md5 path: faiss_index_in_depth_pc diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc index 953f6df4..5913b763 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc @@ -1,6 +1,6 @@ outs: -- md5: 47ce778b85c5553106c9f4da5f45ac28.dir - size: 54406163 +- md5: 28bfb00c7d3ebfbb4de620e9915e75b3.dir + size: 53418928 nfiles: 2 hash: md5 path: faiss_index_in_depth_pdf diff --git a/packages/whisper/archive/src-diarization.py b/packages/whisper/archive/src-diarization.py new file mode 100644 index 00000000..d29b9485 --- /dev/null +++ 
b/packages/whisper/archive/src-diarization.py @@ -0,0 +1,389 @@ +import os +import yaml +import argparse +import face_recognition +import cv2 +import pandas as pd +from pydub import AudioSegment +from pytube import YouTube + +from transformers import pipeline +from pyannote.audio import Pipeline +import torch +from moviepy.editor import VideoFileClip + + +def load_config(config_file): + try: + with open(config_file, "r") as stream: + config = yaml.safe_load(stream) + return config + except FileNotFoundError: + print(f"Config file '{config_file}' not found.") + return None + except yaml.YAMLError as e: + print(f"Error parsing config file: {e}") + return None + + +def load_face_labels(csv_file): + df = pd.read_csv(csv_file) + + df.loc[:, "label"] = df.label.str.lower() + df.loc[:, "filepath"] = df.filepath.str.lower() + base_path = os.path.join(os.getcwd(), "training_data") # Base path for images + adjusted_filepaths = [os.path.join(base_path, path) for path in df["filepath"]] + return dict(zip(adjusted_filepaths, df["label"])) + + +def preprocess_audio_for_diarization(file_path): + audio = AudioSegment.from_file(file_path) + audio = audio.set_channels(1).set_frame_rate(16000) + preprocessed_path = file_path.replace(".mp3", "_preprocessed.wav") + audio.export(preprocessed_path, format="wav") + return preprocessed_path + + +def perform_diarization(file_path, access_token): + diarization_pipeline = Pipeline.from_pretrained( + "pyannote/speaker-diarization", use_auth_token=access_token + ) + diarization = diarization_pipeline(file_path) + return diarization + + +def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir): + audio = AudioSegment.from_file(file_path) + audio_segment = audio[start_time_ms:end_time_ms] + segments = [] + + for i in range(0, len(audio_segment), segment_length_ms): + segment = audio_segment[i : i + segment_length_ms] + segment_name = f"segment_{i // segment_length_ms}.mp3" + segment_file = os.path.join(output_dir, segment_name) + segment.export(segment_file, format="mp3") + segments.append(segment_file) + + return segments + + +def process_segment_with_whisper_and_diarization( + segment_path, diarization_results, pipe, model_batch_size +): + transcript = pipe( + segment_path, batch_size=model_batch_size, return_timestamps=True + )["chunks"] + print(f"TRANSCRIPT: {transcript}") + + diarized_transcript = [] + for chunk in transcript: + start, end = chunk["timestamp"] + speaker_label = get_speaker_label(diarization_results, start, end) + diarized_transcript.append( + { + "start": start, + "end": end, + "speaker": speaker_label, + "text": chunk["text"], + } + ) + return diarized_transcript + + +def get_speaker_label(diarization_results, start, end): + # Handle None values for start and end + if start is None or end is None: + return None + + overlap = {} + for turn, _, speaker in diarization_results.itertracks(yield_label=True): + if turn.end < start or turn.start > end: + continue + overlap[speaker] = ( + overlap.get(speaker, 0) + min(end, turn.end) - max(start, turn.start) + ) + if overlap: + return max(overlap, key=overlap.get) + return None + + +def download_youtube_audio(url, save_path): + try: + yt = YouTube(url) + video = yt.streams.filter(only_audio=True).first() + out_file = video.download(output_path=save_path) + base, ext = os.path.splitext(out_file) + new_file = base + ".mp3" + os.rename(out_file, new_file) + return new_file + except Exception as e: + print(f"Error downloading audio: {e}") + return None + + +def 
encode_faces(face_labels): + face_encodings = {} + for filepath, label in face_labels.items(): + image = face_recognition.load_image_file(filepath) + encodings = face_recognition.face_encodings(image) + if not encodings: + print(f"No face detected in image: {filepath}, skipping...") + continue + face_encodings[label] = encodings[0] + return face_encodings + + +def extract_frames(video_path, timestamps_ms): + cap = cv2.VideoCapture(video_path) + frames = {} + + for timestamp_ms in timestamps_ms: + cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_ms) + ret, frame = cap.read() + if ret: + frames[timestamp_ms] = frame + cap.release() + return frames + + +def recognize_faces(frames, face_encodings): + recognized_faces = {} + for timestamp, frame in frames.items(): + face_locations = face_recognition.face_locations(frame, model="cnn") + face_encs = face_recognition.face_encodings(frame, face_locations) + for face_enc in face_encs: + matches = face_recognition.compare_faces( + list(face_encodings.values()), face_enc + ) + if True in matches: + first_match_index = matches.index(True) + name = list(face_encodings.keys())[first_match_index] + recognized_faces[timestamp] = name + print(f"Timestamp (ms): {timestamp}, Recognized face: {name}") + return recognized_faces + + +def map_faces_to_speakers(diarized_transcript, recognized_faces, tolerance_ms=1000): + speaker_mapping = {} + labelname_to_speaker_mapping = {} + + print("Received Recognized Faces (ms):", recognized_faces) + + for segment_tuple in diarized_transcript: + segment, speaker = segment_tuple + segment_start_ms = segment.start * 1000 + segment_end_ms = segment.end * 1000 + segment_speakers = [] + + print( + f"\nProcessing Segment: [{segment_start_ms} - {segment_end_ms}], Speaker Label: {speaker}" + ) + + for face_time_ms, name in recognized_faces.items(): + print(f" Checking Face Time: {face_time_ms}, Name: {name}") + + if ( + (segment_start_ms - tolerance_ms) + <= face_time_ms + <= (segment_end_ms + tolerance_ms) + ): + segment_speakers.append(name) + print( + f" Matching Face Detected - Time (ms): {face_time_ms}, Name: {name}" + ) + + recognized_speaker = ( + max(set(segment_speakers), key=segment_speakers.count) + if segment_speakers + else None + ) + identified_speaker = recognized_speaker if recognized_speaker else speaker + + speaker_mapping[(segment_start_ms, segment_end_ms)] = identified_speaker + + if recognized_speaker: + labelname_to_speaker_mapping[speaker] = recognized_speaker + + print( + f" Finalized Mapping for Segment: {segment_start_ms} - {segment_end_ms}, Speaker: {identified_speaker}" + ) + + return speaker_mapping, labelname_to_speaker_mapping + + +def export_diarized_transcript_with_names( + diarized_transcript, labelname_to_speaker_mapping, save_loc +): + with open(save_loc, "w") as file: + for segment in diarized_transcript: + print(f"DIARIZED SEGMENT IN EXPORT FUNC: {segment}") + start, end, original_speaker_label, text = ( + segment["start"], + segment["end"], + segment["speaker"], + segment["text"], + ) + + updated_speaker_label = labelname_to_speaker_mapping.get( + original_speaker_label, original_speaker_label + ) + + file.write(f"{start}-{end} {updated_speaker_label}: {text}\n") + + +def extract_audio_from_mp4(video_file_path, output_audio_path): + """ + Extracts the audio from an MP4 file and saves it as an MP3 file. 
+ """ + with VideoFileClip(video_file_path) as video: + audio = video.audio + audio.write_audiofile(output_audio_path, codec="mp3") + return output_audio_path + + +def get_video_duration(video_path): + with VideoFileClip(video_path) as video: + return video.duration * 1000 + + +def main(): + parser = argparse.ArgumentParser( + description="Read configuration from transcribe_config YAML file" + ) + parser.add_argument("config_file", help="Path to YAML config file") + + args = parser.parse_args() + config_file = args.config_file + config = load_config(config_file) + + if config: + model_size = config["model"]["size"] + model_device = "cuda" if torch.cuda.is_available() else "mps" + model_chunk_length = int(config["model"]["chunk_length"]) + model_batch_size = int(config["model"]["batch_size"]) + + video_path = "input/Regular Council Mtg 1-4-2024.mp4" + audio_output_path = "output_audio/test_audio.mp3" + if not os.path.exists(audio_output_path): + audio_path = extract_audio_from_mp4(video_path, audio_output_path) + else: + print(f"Audio file already exists at {audio_output_path}") + audio_path = audio_output_path + + config["audio"]["path"] = audio_path + + save_loc = config["transcript"]["save_loc"] + + print("Model Size:", model_size) + print("Model Device:", model_device) + print("Chunk Length:", model_chunk_length) + print("Batch Size", model_batch_size) + print("Audio Path:", audio_path) + print("---------------") + + model_names = { + "tiny": "openai/whisper-tiny.en", + "base": "openai/whisper-base.en", + "small": "openai/whisper-small.en", + "medium": "openai/whisper-medium.en", + "large": "openai/whisper-large", + "large-v2": "openai/whisper-large-v2", + } + + model = model_names.get(model_size, "openai/whisper-tiny") + + pipe = pipeline( + "automatic-speech-recognition", + model=model, + chunk_length_s=model_chunk_length, + device=model_device, + ) + + total_duration_ms = get_video_duration(video_path) + last_15_minutes_ms = 15 * 60 * 1000 + + start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0) + end_time_ms = total_duration_ms + + segment_length_ms = 60000 + + segments = split_audio( + audio_path, start_time_ms, end_time_ms, segment_length_ms, "output_audio" + ) + + access_token = "" + + full_diarized_transcript = [] + combined_diarization_data = [] + if segments: + segment = segments[0] + print(f"Processing segment: {segment}") + + preprocessed_segment_path = preprocess_audio_for_diarization(segment) + diarization = perform_diarization(preprocessed_segment_path, access_token) + + for turn, _, speaker in diarization.itertracks(yield_label=True): + combined_diarization_data.append((turn, speaker)) + + segment_transcript = process_segment_with_whisper_and_diarization( + segment, diarization, pipe, model_batch_size + ) + full_diarized_transcript.extend(segment_transcript) + os.remove(segment) + + # Load face labels from CSV and prepare fr + face_labels = load_face_labels("training_data/training_data.csv") + face_encodings = encode_faces(face_labels) + video_path = "input/Regular Council Mtg 1-4-2024.mp4" + + offset_ms = start_time_ms + + # Adjust timestamps for frame extraction + adjusted_timestamps = [] + for segment in full_diarized_transcript: + if segment["end"] is None or segment["speaker"] is None: + continue + + print("Start ms", segment) + start_ms = int(segment["start"] * 1000) + offset_ms + adjusted_timestamps.append(start_ms) + print(f"Adjusted timestamps {adjusted_timestamps}") + + # Extract and save frames + frames = extract_frames(video_path, 
adjusted_timestamps) + for timestamp in adjusted_timestamps: + print("TIMESTAMP FOR FRAME: {timestamp}") + if timestamp in frames: + frame = frames[timestamp] + filename = os.path.join("output_frames", f"frame_{timestamp}.jpg") + cv2.imwrite(filename, frame) + + recognized_faces = recognize_faces(frames, face_encodings) + print("Recognized Faces Original", recognized_faces) + + recognized_faces_adjusted = { + timestamp - offset_ms: name + for timestamp, name in recognized_faces.items() + } + print("Adjusted Recognized Faces:", recognized_faces_adjusted) + + print("Combined Diarization Data:", combined_diarization_data) + + speaker_mapping, labelname_to_speaker_mapping = map_faces_to_speakers( + combined_diarization_data, recognized_faces_adjusted, tolerance_ms=1 + ) + + print("Speaker Mapping:", speaker_mapping) + + export_diarized_transcript_with_names( + full_diarized_transcript, labelname_to_speaker_mapping, save_loc + ) + + print( + "Transcription and Diarization with Speaker Names Complete. Saved to", + save_loc, + ) + + +if __name__ == "__main__": + main() diff --git a/packages/whisper/archive/transcribe.py b/packages/whisper/archive/transcribe.py index d7216197..c62e6bbf 100644 --- a/packages/whisper/archive/transcribe.py +++ b/packages/whisper/archive/transcribe.py @@ -8,7 +8,7 @@ def load_config(config_file): try: - with open(config_file, 'r') as stream: + with open(config_file, "r") as stream: config = yaml.safe_load(stream) return config except FileNotFoundError: @@ -20,13 +20,15 @@ def load_config(config_file): def main(): - parser = argparse.ArgumentParser(description = "Read configuration from transcribe_config YAML file") - parser.add_argument("config_file", help = "Path to YAML config file") - + parser = argparse.ArgumentParser( + description="Read configuration from transcribe_config YAML file" + ) + parser.add_argument("config_file", help="Path to YAML config file") + args = parser.parse_args() config_file = args.config_file config = load_config(config_file) - + if config: model_size = config["model"]["size"] model_device = config["model"]["device"] @@ -34,42 +36,46 @@ def main(): model_batch_size = int(config["model"]["batch_size"]) audio_path = config["audio"]["path"] save_loc = config["transcript"]["save_loc"] - + print("Model Size:", model_size) print("Model Device:", model_device) print("Chunk Length:", model_chunk_length) print("Batch Size", model_batch_size) print("Audio Path:", audio_path) print("---------------") - - - model_names = {"tiny":"openai/whisper-tiny.en", - "base":"openai/whisper-base.en", - "small":"openai/whisper-small.en", - "medium":"openai/whisper-medium.en", - "large":"openai/whisper-large", - "large_v2":"openai/whisper-large-v2"} - + + model_names = { + "tiny": "openai/whisper-tiny.en", + "base": "openai/whisper-base.en", + "small": "openai/whisper-small.en", + "medium": "openai/whisper-medium.en", + "large": "openai/whisper-large", + "large_v2": "openai/whisper-large-v2", + } + model = model_names[model_size] - + pipe = pipeline( "automatic-speech-recognition", - model = model, - chunk_length_s = model_chunk_length, - device = model_device + model=model, + chunk_length_s=model_chunk_length, + device=model_device, ) start_time = time.time() - - transcript = pipe(audio_path, batch_size = model_batch_size, return_timestamps = True)["chunks"] - + + transcript = pipe(audio_path, batch_size=model_batch_size, return_timestamps=True)[ + "chunks" + ] + end_time = time.time() total_time = end_time - start_time print("Generation Complete. 
Time to Generate:", str(total_time)) print("Saving Transcript to", save_loc) - with open(save_loc, 'w') as f: + with open(save_loc, "w") as f: for chunk in transcript: f.write("%s\n" % chunk) print("Save Complete") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/packages/whisper/config.yml b/packages/whisper/config.yml index 0acef5a5..90fc3a41 100644 --- a/packages/whisper/config.yml +++ b/packages/whisper/config.yml @@ -1,12 +1,12 @@ model: - size: "tiny" + size: "base" device: "mps" # or your specific GPU device - chunk_length: "30" - batch_size: 12 + chunk_length: "60" + batch_size: 4 audio: path: "output_audio/City Council Meeting 142024.mp3" offset_ms: 0 # Adjust the offset based on your requirements transcript: - save_loc: "output_transcript/test_transcript.txt" + save_loc: "output_transcript/test-transcript-chunk-60-base.txt" diarization: path: "diarization" # Adjust the path to your diarization data diff --git a/packages/whisper/src-preprocess.py b/packages/whisper/src-preprocess.py new file mode 100644 index 00000000..abe9147b --- /dev/null +++ b/packages/whisper/src-preprocess.py @@ -0,0 +1,316 @@ +import os +import yaml +import argparse +import face_recognition +import pandas as pd +from pydub import AudioSegment +from pytube import YouTube +from transformers import pipeline +import torch +from moviepy.editor import VideoFileClip + +from pydub import AudioSegment + +import numpy as np +import librosa +import soundfile as sf + + +from pydub import AudioSegment +from pydub.effects import normalize, compress_dynamic_range +from pydub import silence +from scipy.signal import butter, lfilter + + +def load_config(config_file): + try: + with open(config_file, "r") as stream: + config = yaml.safe_load(stream) + return config + except FileNotFoundError: + print(f"Config file '{config_file}' not found.") + return None + except yaml.YAMLError as e: + print(f"Error parsing config file: {e}") + return None + + +def process_segment_with_whisper(segment_path, pipe, model_batch_size): + transcript = pipe( + segment_path, batch_size=model_batch_size, return_timestamps=True + )["chunks"] + print(f"TRANSCRIPT: {transcript}") + + processed_transcript = [] + for chunk in transcript: + start, end = chunk["timestamp"] + processed_transcript.append( + { + "start": start, + "end": end, + "text": chunk["text"], + } + ) + return processed_transcript + + +def download_youtube_audio(url, save_path): + try: + yt = YouTube(url) + video = yt.streams.filter(only_audio=True).first() + out_file = video.download(output_path=save_path) + base, ext = os.path.splitext(out_file) + new_file = base + ".mp3" + os.rename(out_file, new_file) + return new_file + except Exception as e: + print(f"Error downloading audio: {e}") + return None + + +def recognize_faces(frames, face_encodings): + recognized_faces = {} + for timestamp, frame in frames.items(): + face_locations = face_recognition.face_locations(frame, model="cnn") + face_encs = face_recognition.face_encodings(frame, face_locations) + for face_enc in face_encs: + matches = face_recognition.compare_faces( + list(face_encodings.values()), face_enc + ) + if True in matches: + first_match_index = matches.index(True) + name = list(face_encodings.keys())[first_match_index] + recognized_faces[timestamp] = name + print(f"Timestamp (ms): {timestamp}, Recognized face: {name}") + return recognized_faces + + +def export_transcript(transcript, save_loc): + with open(save_loc, "w") as file: + for segment in transcript: + print(f"TRANSCRIPT 
SEGMENT IN EXPORT FUNC: {segment}") + start, end, text = ( + segment["start"], + segment["end"], + segment["text"], + ) + + file.write(f"{start}-{end}: {text}\n") + + +def get_video_duration(video_path): + with VideoFileClip(video_path) as video: + return video.duration * 1000 + + +def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir): + audio = AudioSegment.from_file(file_path) + audio_segment = audio[start_time_ms:end_time_ms] + segments = [] + + for i in range(0, len(audio_segment), segment_length_ms): + segment = audio_segment[i : i + segment_length_ms] + segment_name = f"segment_{i // segment_length_ms}.mp3" + segment_file = os.path.join(output_dir, segment_name) + segment.export(segment_file, format="mp3") + segments.append(segment_file) + + return segments + + +def preprocess_and_extract_audio( + video_path, output_audio_path, target_sample_rate=16000 +): + with VideoFileClip(video_path) as video: + audio = video.audio + temp_audio_path = "temp_audio.wav" + audio.write_audiofile( + temp_audio_path, codec="pcm_s16le" + ) + + y, sr = librosa.load(temp_audio_path, sr=target_sample_rate) + + sf.write(output_audio_path, y, target_sample_rate) + + os.remove(temp_audio_path) + + +def apply_dynamic_range_compression(audio_path): + audio = AudioSegment.from_file(audio_path) + compressed_audio = compress_dynamic_range(audio) + normalized_audio = normalize(compressed_audio) + compressed_audio_path = "compressed_" + audio_path + normalized_audio.export(compressed_audio_path, format="mp3") + return compressed_audio_path + + +def remove_silence(audio_path, silence_thresh=-40, min_silence_len=1000): + audio = AudioSegment.from_file(audio_path) + non_silent_chunks = silence.split_on_silence( + audio, + min_silence_len=min_silence_len, + silence_thresh=silence_thresh, + keep_silence=500, + ) + + processed_audio = AudioSegment.empty() + for chunk in non_silent_chunks: + processed_audio += chunk + + processed_audio_path = "processed_" + os.path.basename(audio_path) + processed_audio.export(processed_audio_path, format="mp3") + return processed_audio_path + + +def butter_bandpass(lowcut, highcut, fs, order=5): + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + b, a = butter(order, [low, high], btype="band") + return b, a + + +def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): + b, a = butter_bandpass(lowcut, highcut, fs, order=order) + y = lfilter(b, a, data) + return y + + +def bandpass_filter(audio_path, low=300, high=3400, sr=16000): + y, sr = librosa.load(audio_path, sr=sr) + + y_filtered = butter_bandpass_filter(y, low, high, sr) + + filtered_audio_path = "filtered_" + os.path.basename(audio_path) + sf.write(filtered_audio_path, y_filtered, sr) + + return filtered_audio_path + + +def change_speed(audio_path, speed=1.0): + audio = AudioSegment.from_file(audio_path) + playback_speed_audio = audio.speedup(playback_speed=speed) + speed_changed_audio_path = "speed_changed_" + audio_path + playback_speed_audio.export(speed_changed_audio_path, format="mp3") + return speed_changed_audio_path + + +def audio_preprocessing_pipeline(audio_segment_path): + compressed_audio_path = apply_dynamic_range_compression(audio_segment_path) + + no_silence_audio_path = remove_silence(compressed_audio_path) + + filtered_audio_path = bandpass_filter(no_silence_audio_path) + + return filtered_audio_path + + +def process_segment_with_whisper(segment_path, pipe, model_batch_size): + transcript = pipe( + segment_path, batch_size=model_batch_size, return_timestamps=True + 
)["chunks"] + print(f"TRANSCRIPT: {transcript}") + + processed_transcript = [] + for chunk in transcript: + start, end = chunk["timestamp"] + processed_transcript.append( + { + "start": start, + "end": end, + "text": chunk["text"], + } + ) + return processed_transcript + + +def extract_audio_from_mp4(video_file_path, output_audio_path): + """ + Extracts the audio from an MP4 file and saves it as an MP3 file. + """ + with VideoFileClip(video_file_path) as video: + audio = video.audio + audio.write_audiofile(output_audio_path, codec="mp3") + return output_audio_path + + +def main(): + parser = argparse.ArgumentParser( + description="Read configuration from transcribe_config YAML file" + ) + parser.add_argument("config_file", help="Path to YAML config file") + + args = parser.parse_args() + config_file = args.config_file + config = load_config(config_file) + + if config: + model_size = config["model"]["size"] + model_device = "cuda" if torch.cuda.is_available() else "mps" + model_chunk_length = int(config["model"]["chunk_length"]) + model_batch_size = int(config["model"]["batch_size"]) + + video_path = "input/Regular Council Mtg 1-4-2024.mp4" + audio_output_path = "output_audio/test_audio.mp3" + if not os.path.exists(audio_output_path): + audio_path = extract_audio_from_mp4(video_path, audio_output_path) + else: + print(f"Audio file already exists at {audio_output_path}") + audio_path = audio_output_path + + config["audio"]["path"] = audio_path + + save_loc = config["transcript"]["save_loc"] + + print("Model Size:", model_size) + print("Model Device:", model_device) + print("Chunk Length:", model_chunk_length) + print("Batch Size", model_batch_size) + print("Audio Path:", audio_path) + print("---------------") + + model_names = { + "tiny": "openai/whisper-tiny.en", + "base": "openai/whisper-base.en", + "small": "openai/whisper-small.en", + "medium": "openai/whisper-medium.en", + "large": "openai/whisper-large", + "large-v2": "openai/whisper-large-v2", + "large-v3": "openai/whisper-large-v3", + } + + model = model_names.get(model_size, "openai/whisper-large") + + pipe = pipeline( + "automatic-speech-recognition", + model=model, + chunk_length_s=model_chunk_length, + device=model_device, + ) + + total_duration_ms = get_video_duration(video_path) + last_15_minutes_ms = 15 * 60 * 1000 + start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0) + end_time_ms = total_duration_ms + + audio = AudioSegment.from_file(audio_path) + last_15_min_audio = audio[start_time_ms:end_time_ms] + last_15_min_audio_path = os.path.join("output_audio", "last_15_min.mp3") + last_15_min_audio.export(last_15_min_audio_path, format="mp3") + + preprocessed_audio_path = audio_preprocessing_pipeline( + last_15_min_audio_path + ) + + print(f"Processing audio: {preprocessed_audio_path}") + full_transcript = process_segment_with_whisper( + preprocessed_audio_path, pipe, model_batch_size + ) + + export_transcript(full_transcript, save_loc) + + print("Transcription Complete. 
Saved to", save_loc) + + +if __name__ == "__main__": + main() diff --git a/packages/whisper/src.py b/packages/whisper/src.py index ba569a1b..ed720ec9 100644 --- a/packages/whisper/src.py +++ b/packages/whisper/src.py @@ -2,13 +2,10 @@ import yaml import argparse import face_recognition -import cv2 import pandas as pd from pydub import AudioSegment from pytube import YouTube - from transformers import pipeline -from pyannote.audio import Pipeline import torch from moviepy.editor import VideoFileClip @@ -26,85 +23,23 @@ def load_config(config_file): return None -def load_face_labels(csv_file): - df = pd.read_csv(csv_file) - - df.loc[:, "label"] = df.label.str.lower() - df.loc[:, "filepath"] = df.filepath.str.lower() - base_path = os.path.join(os.getcwd(), "training_data") # Base path for images - adjusted_filepaths = [os.path.join(base_path, path) for path in df["filepath"]] - return dict(zip(adjusted_filepaths, df["label"])) - - -def preprocess_audio_for_diarization(file_path): - audio = AudioSegment.from_file(file_path) - audio = audio.set_channels(1).set_frame_rate(16000) - preprocessed_path = file_path.replace(".mp3", "_preprocessed.wav") - audio.export(preprocessed_path, format="wav") - return preprocessed_path - - -def perform_diarization(file_path, access_token): - diarization_pipeline = Pipeline.from_pretrained( - "pyannote/speaker-diarization", use_auth_token=access_token - ) - diarization = diarization_pipeline(file_path) - return diarization - - -def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir): - audio = AudioSegment.from_file(file_path) - audio_segment = audio[start_time_ms:end_time_ms] - segments = [] - - for i in range(0, len(audio_segment), segment_length_ms): - segment = audio_segment[i : i + segment_length_ms] - segment_name = f"segment_{i // segment_length_ms}.mp3" - segment_file = os.path.join(output_dir, segment_name) - segment.export(segment_file, format="mp3") - segments.append(segment_file) - - return segments - - -def process_segment_with_whisper_and_diarization( - segment_path, diarization_results, pipe, model_batch_size -): +def process_segment_with_whisper(segment_path, pipe, model_batch_size): transcript = pipe( segment_path, batch_size=model_batch_size, return_timestamps=True )["chunks"] print(f"TRANSCRIPT: {transcript}") - diarized_transcript = [] + processed_transcript = [] for chunk in transcript: start, end = chunk["timestamp"] - speaker_label = get_speaker_label(diarization_results, start, end) - diarized_transcript.append( + processed_transcript.append( { "start": start, "end": end, - "speaker": speaker_label, "text": chunk["text"], } ) - return diarized_transcript - - -def get_speaker_label(diarization_results, start, end): - # Handle None values for start and end - if start is None or end is None: - return None - - overlap = {} - for turn, _, speaker in diarization_results.itertracks(yield_label=True): - if turn.end < start or turn.start > end: - continue - overlap[speaker] = ( - overlap.get(speaker, 0) + min(end, turn.end) - max(start, turn.start) - ) - if overlap: - return max(overlap, key=overlap.get) - return None + return processed_transcript def download_youtube_audio(url, save_path): @@ -121,31 +56,6 @@ def download_youtube_audio(url, save_path): return None -def encode_faces(face_labels): - face_encodings = {} - for filepath, label in face_labels.items(): - image = face_recognition.load_image_file(filepath) - encodings = face_recognition.face_encodings(image) - if not encodings: - print(f"No face detected in 
image: {filepath}, skipping...") - continue - face_encodings[label] = encodings[0] - return face_encodings - - -def extract_frames(video_path, timestamps_ms): - cap = cv2.VideoCapture(video_path) - frames = {} - - for timestamp_ms in timestamps_ms: - cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_ms) - ret, frame = cap.read() - if ret: - frames[timestamp_ms] = frame - cap.release() - return frames - - def recognize_faces(frames, face_encodings): recognized_faces = {} for timestamp, frame in frames.items(): @@ -163,72 +73,17 @@ def recognize_faces(frames, face_encodings): return recognized_faces -def map_faces_to_speakers(diarized_transcript, recognized_faces, tolerance_ms=1000): - speaker_mapping = {} - labelname_to_speaker_mapping = {} - - print("Received Recognized Faces (ms):", recognized_faces) - - for segment_tuple in diarized_transcript: - segment, speaker = segment_tuple - segment_start_ms = segment.start * 1000 - segment_end_ms = segment.end * 1000 - segment_speakers = [] - - print( - f"\nProcessing Segment: [{segment_start_ms} - {segment_end_ms}], Speaker Label: {speaker}" - ) - - for face_time_ms, name in recognized_faces.items(): - print(f" Checking Face Time: {face_time_ms}, Name: {name}") - - if ( - (segment_start_ms - tolerance_ms) - <= face_time_ms - <= (segment_end_ms + tolerance_ms) - ): - segment_speakers.append(name) - print( - f" Matching Face Detected - Time (ms): {face_time_ms}, Name: {name}" - ) - - recognized_speaker = ( - max(set(segment_speakers), key=segment_speakers.count) - if segment_speakers - else None - ) - identified_speaker = recognized_speaker if recognized_speaker else speaker - - speaker_mapping[(segment_start_ms, segment_end_ms)] = identified_speaker - - if recognized_speaker: - labelname_to_speaker_mapping[speaker] = recognized_speaker - - print( - f" Finalized Mapping for Segment: {segment_start_ms} - {segment_end_ms}, Speaker: {identified_speaker}" - ) - - return speaker_mapping, labelname_to_speaker_mapping - - -def export_diarized_transcript_with_names( - diarized_transcript, labelname_to_speaker_mapping, save_loc -): +def export_transcript(transcript, save_loc): with open(save_loc, "w") as file: - for segment in diarized_transcript: - print(f"DIARIZED SEGMENT IN EXPORT FUNC: {segment}") - start, end, original_speaker_label, text = ( + for segment in transcript: + print(f"TRANSCRIPT SEGMENT IN EXPORT FUNC: {segment}") + start, end, text = ( segment["start"], segment["end"], - segment["speaker"], segment["text"], ) - updated_speaker_label = labelname_to_speaker_mapping.get( - original_speaker_label, original_speaker_label - ) - - file.write(f"{start}-{end} {updated_speaker_label}: {text}\n") + file.write(f"{start}-{end}: {text}\n") def extract_audio_from_mp4(video_file_path, output_audio_path): @@ -241,6 +96,26 @@ def extract_audio_from_mp4(video_file_path, output_audio_path): return output_audio_path +def get_video_duration(video_path): + with VideoFileClip(video_path) as video: + return video.duration * 1000 + + +def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir): + audio = AudioSegment.from_file(file_path) + audio_segment = audio[start_time_ms:end_time_ms] + segments = [] + + for i in range(0, len(audio_segment), segment_length_ms): + segment = audio_segment[i : i + segment_length_ms] + segment_name = f"segment_{i // segment_length_ms}.mp3" + segment_file = os.path.join(output_dir, segment_name) + segment.export(segment_file, format="mp3") + segments.append(segment_file) + + return segments + + def main(): parser = 
argparse.ArgumentParser( description="Read configuration from transcribe_config YAML file" @@ -283,9 +158,10 @@ def main(): "medium": "openai/whisper-medium.en", "large": "openai/whisper-large", "large-v2": "openai/whisper-large-v2", + "large-v3": "openai/whisper-large-v3", } - model = model_names.get(model_size, "openai/whisper-tiny") + model = model_names.get(model_size, "openai/whisper-base") pipe = pipeline( "automatic-speech-recognition", @@ -294,87 +170,24 @@ def main(): device=model_device, ) - start_time_ms = (1 * 60 * 60 + 10 * 60) * 1000 - end_time_ms = (1 * 60 * 60 + 18 * 60) * 1000 + total_duration_ms = get_video_duration(video_path) + last_15_minutes_ms = 15 * 60 * 1000 + start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0) + end_time_ms = total_duration_ms - segment_length_ms = 60000 + audio = AudioSegment.from_file(audio_path) + last_15_min_audio = audio[start_time_ms:end_time_ms] + last_15_min_audio_path = os.path.join("output_audio", "last_15_min.mp3") + last_15_min_audio.export(last_15_min_audio_path, format="mp3") - segments = split_audio( - audio_path, start_time_ms, end_time_ms, segment_length_ms, "output_audio" + print(f"Processing audio: {last_15_min_audio_path}") + full_transcript = process_segment_with_whisper( + last_15_min_audio_path, pipe, model_batch_size ) - access_token = "" - - full_diarized_transcript = [] - combined_diarization_data = [] - if segments: - segment = segments[0] - print(f"Processing segment: {segment}") - - preprocessed_segment_path = preprocess_audio_for_diarization(segment) - diarization = perform_diarization(preprocessed_segment_path, access_token) - - for turn, _, speaker in diarization.itertracks(yield_label=True): - combined_diarization_data.append((turn, speaker)) + export_transcript(full_transcript, save_loc) - segment_transcript = process_segment_with_whisper_and_diarization( - segment, diarization, pipe, model_batch_size - ) - full_diarized_transcript.extend(segment_transcript) - os.remove(segment) - - # Load face labels from CSV and prepare fr - face_labels = load_face_labels("training_data/training_data.csv") - face_encodings = encode_faces(face_labels) - video_path = "input/Regular Council Mtg 1-4-2024.mp4" - - offset_ms = start_time_ms - - # Adjust timestamps for frame extraction - adjusted_timestamps = [] - for segment in full_diarized_transcript: - if segment["end"] is None or segment["speaker"] is None: - continue - - print("Start ms", segment) - start_ms = int(segment["start"] * 1000) + offset_ms - adjusted_timestamps.append(start_ms) - print(f"Adjusted timestamps {adjusted_timestamps}") - - # Extract and save frames - frames = extract_frames(video_path, adjusted_timestamps) - for timestamp in adjusted_timestamps: - print("TIMESTAMP FOR FRAME: {timestamp}") - if timestamp in frames: - frame = frames[timestamp] - filename = os.path.join("output_frames", f"frame_{timestamp}.jpg") - cv2.imwrite(filename, frame) - - recognized_faces = recognize_faces(frames, face_encodings) - print("Recognized Faces Original", recognized_faces) - - recognized_faces_adjusted = { - timestamp - offset_ms: name - for timestamp, name in recognized_faces.items() - } - print("Adjusted Recognized Faces:", recognized_faces_adjusted) - - print("Combined Diarization Data:", combined_diarization_data) - - speaker_mapping, labelname_to_speaker_mapping = map_faces_to_speakers( - combined_diarization_data, recognized_faces_adjusted, tolerance_ms=1 - ) - - print("Speaker Mapping:", speaker_mapping) - - export_diarized_transcript_with_names( - 
full_diarized_transcript, labelname_to_speaker_mapping, save_loc - ) - - print( - "Transcription and Diarization with Speaker Names Complete. Saved to", - save_loc, - ) + print("Transcription Complete. Saved to", save_loc) if __name__ == "__main__":
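
The preprocessor.py hunks above make two changes: the FAISS import moves from the deprecated langchain.vectorstores.faiss path to langchain_community.vectorstores, and every RecursiveCharacterTextSplitter is standardized on chunk_size=2000 / chunk_overlap=100 (previously the overlap was half the chunk size, 1000 or 5000), which presumably accounts for the new sizes and md5 hashes in the .dvc index files. A minimal sketch of that indexing path, assuming OpenAIEmbeddings and a placeholder minutes.txt input; preprocessor.py imports HypotheticalDocumentEmbedder, so the production embedder is presumably HyDE-based rather than this plain one:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings  # illustrative; the real pipeline likely wraps a HyDE embedder

# Splitter settings now shared by every document source in preprocessor.py.
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
docs = splitter.create_documents([open("minutes.txt").read()])  # "minutes.txt" is a placeholder input

db = FAISS.from_documents(docs, OpenAIEmbeddings())
db.save_local("cache/faiss_index_in_depth_news")  # the directory DVC tracks via the .dvc files above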
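
The rewritten packages/whisper/src.py drops the diarization, face-recognition, and frame-extraction code (now archived as src-diarization.py): it extracts audio from the council-meeting MP4, keeps only the last 15 minutes, and transcribes that single segment. A sketch of the slicing step with pydub, using len(audio) for the total duration rather than the moviepy-based get_video_duration() in the diff (an assumption that the audio and video runtimes match):

import os
from pydub import AudioSegment

audio = AudioSegment.from_file("output_audio/test_audio.mp3")  # produced by extract_audio_from_mp4()
total_ms = len(audio)                         # pydub measures and slices audio in milliseconds
start_ms = max(total_ms - 15 * 60 * 1000, 0)  # last 15 minutes, clamped for shorter recordings
tail = audio[start_ms:total_ms]

out_path = os.path.join("output_audio", "last_15_min.mp3")
tail.export(out_path, format="mp3")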
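
config.yml now selects the base English Whisper checkpoint with 60-second chunks and a batch size of 4, and src.py picks the device at runtime (CUDA if available, otherwise MPS). A minimal sketch of that transformers pipeline call against the sliced file above, with the model name and settings taken from config.yml and the output format mirroring export_transcript():

import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "mps"  # runtime fallback used in src.py's main()

pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",  # config.yml: size "base"
    chunk_length_s=60,               # config.yml: chunk_length "60"
    device=device,
)

chunks = pipe(
    "output_audio/last_15_min.mp3",
    batch_size=4,                    # config.yml: batch_size 4
    return_timestamps=True,
)["chunks"]

with open("output_transcript/test-transcript-chunk-60-base.txt", "w") as f:
    for chunk in chunks:
        start, end = chunk["timestamp"]
        f.write(f"{start}-{end}: {chunk['text']}\n")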