Merge pull request #234 from eye-on-surveillance/AI/first-ord-extraction
AI/first-ord-extraction
ayyubibrahimi authored Feb 4, 2024

2 parents 9b6f4b1 + 5d24f06 commit 7719699
Showing 16 changed files with 805 additions and 281 deletions.
2 changes: 1 addition & 1 deletion packages/backend/src/cache/faiss_index_in_depth_cj.dvc
@@ -1,5 +1,5 @@
outs:
- md5: ccd643270cd24af47d96dfb198d468ae.dir
- md5: 9c2090ec6ee84ddf3792866364b35b0f.dir
size: 23116757
nfiles: 2
hash: md5
2 changes: 1 addition & 1 deletion packages/backend/src/cache/faiss_index_in_depth_fc.dvc
@@ -1,5 +1,5 @@
outs:
- md5: a06190b8eee319f9083f642bb844d6a4.dir
- md5: ad7c688f8fff83d65f8728a403c864dd.dir
size: 63799934
nfiles: 2
hash: md5
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_news.dvc
@@ -1,6 +1,6 @@
outs:
- md5: 89036cfaca111f3d2fa33ed70003aa46.dir
size: 242119
- md5: d08e6dfb04e5afcff49028b2bffed7ad.dir
size: 526418
nfiles: 2
hash: md5
path: faiss_index_in_depth_news
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_pc.dvc
@@ -1,6 +1,6 @@
outs:
- md5: 7645c2b96dcaf4936ee31103ac0c0968.dir
size: 2028257
- md5: 75d23bd607ac91d67d994f624fb2e4c2.dir
size: 2097501
nfiles: 2
hash: md5
path: faiss_index_in_depth_pc
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_pdf.dvc
@@ -1,6 +1,6 @@
outs:
- md5: 47ce778b85c5553106c9f4da5f45ac28.dir
size: 54406163
- md5: 28bfb00c7d3ebfbb4de620e9915e75b3.dir
size: 53418928
nfiles: 2
hash: md5
path: faiss_index_in_depth_pdf
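These .dvc pointer files record only the md5, size, and file count of each FAISS index directory; the index data itself lives in DVC storage. A minimal sketch, not part of this commit, of syncing a working copy to the updated pointers after pulling this change, assuming DVC is installed and a remote is configured (the target file below is one of the pointers changed here):

import subprocess

# Fetch the rebuilt index directory that matches the new md5 recorded in the .dvc file.
subprocess.run(
    ["dvc", "pull", "packages/backend/src/cache/faiss_index_in_depth_news.dvc"],
    check=True,
)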
12 changes: 6 additions & 6 deletions packages/backend/src/preprocessor.py
@@ -6,7 +6,7 @@

from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate
from langchain.vectorstores.faiss import FAISS
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAI
from pathlib import Path
import shutil
@@ -73,7 +73,7 @@ def create_db_from_minutes_and_agendas(doc_directory):

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=2000, chunk_overlap=1000
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)
all_docs.extend(docs)
@@ -103,7 +103,7 @@ def create_db_from_news_transcripts(news_json_directory):

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=10000, chunk_overlap=5000
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)
all_docs.extend(docs)
@@ -136,7 +136,7 @@ def create_db_from_cj_transcripts(cj_json_directory):

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=2000, chunk_overlap=1000
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)

@@ -169,7 +169,7 @@ def create_db_from_fc_transcripts(fc_json_directory):

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=2000, chunk_overlap=1000
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)
# Append the publish date to the end of page_content
@@ -199,7 +199,7 @@ def create_db_from_public_comments(pc_json_directory):

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=10000, chunk_overlap=5000
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)
all_docs.extend(docs)
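Across these hunks the splitter settings converge on chunk_size=2000 with chunk_overlap=100 for every document type, and the FAISS import moves to the langchain_community package. A minimal sketch of that combination outside the preprocessor, assuming OpenAIEmbeddings and two toy texts (both are illustrative, not from the repo):

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

texts = [
    "Minutes of the regular council meeting ...",
    "Transcript of public comment ...",
]

# Same splitter parameters the diff standardizes on.
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
docs = splitter.create_documents(texts)

# New import path for FAISS; requires OPENAI_API_KEY in the environment.
db = FAISS.from_documents(docs, OpenAIEmbeddings())
db.save_local("faiss_index_in_depth_example")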
@@ -1,5 +1,5 @@
outs:
- md5: ccd643270cd24af47d96dfb198d468ae.dir
- md5: 9c2090ec6ee84ddf3792866364b35b0f.dir
size: 23116757
nfiles: 2
hash: md5
@@ -1,5 +1,5 @@
outs:
- md5: a06190b8eee319f9083f642bb844d6a4.dir
- md5: ad7c688f8fff83d65f8728a403c864dd.dir
size: 63799934
nfiles: 2
hash: md5
@@ -1,6 +1,6 @@
outs:
- md5: 89036cfaca111f3d2fa33ed70003aa46.dir
size: 242119
- md5: d08e6dfb04e5afcff49028b2bffed7ad.dir
size: 526418
nfiles: 2
hash: md5
path: faiss_index_in_depth_news
@@ -1,6 +1,6 @@
outs:
- md5: 7645c2b96dcaf4936ee31103ac0c0968.dir
size: 2028257
- md5: 75d23bd607ac91d67d994f624fb2e4c2.dir
size: 2097501
nfiles: 2
hash: md5
path: faiss_index_in_depth_pc
@@ -1,6 +1,6 @@
outs:
- md5: 47ce778b85c5553106c9f4da5f45ac28.dir
size: 54406163
- md5: 28bfb00c7d3ebfbb4de620e9915e75b3.dir
size: 53418928
nfiles: 2
hash: md5
path: faiss_index_in_depth_pdf
389 changes: 389 additions & 0 deletions packages/whisper/archive/src-diarization.py
@@ -0,0 +1,389 @@
import os
import yaml
import argparse
import face_recognition
import cv2
import pandas as pd
from pydub import AudioSegment
from pytube import YouTube

from transformers import pipeline
from pyannote.audio import Pipeline
import torch
from moviepy.editor import VideoFileClip


def load_config(config_file):
try:
with open(config_file, "r") as stream:
config = yaml.safe_load(stream)
return config
except FileNotFoundError:
print(f"Config file '{config_file}' not found.")
return None
except yaml.YAMLError as e:
print(f"Error parsing config file: {e}")
return None


def load_face_labels(csv_file):
df = pd.read_csv(csv_file)

df.loc[:, "label"] = df.label.str.lower()
df.loc[:, "filepath"] = df.filepath.str.lower()
base_path = os.path.join(os.getcwd(), "training_data") # Base path for images
adjusted_filepaths = [os.path.join(base_path, path) for path in df["filepath"]]
return dict(zip(adjusted_filepaths, df["label"]))


def preprocess_audio_for_diarization(file_path):
audio = AudioSegment.from_file(file_path)
audio = audio.set_channels(1).set_frame_rate(16000)
preprocessed_path = file_path.replace(".mp3", "_preprocessed.wav")
audio.export(preprocessed_path, format="wav")
return preprocessed_path


def perform_diarization(file_path, access_token):
diarization_pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization", use_auth_token=access_token
)
diarization = diarization_pipeline(file_path)
return diarization


def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir):
audio = AudioSegment.from_file(file_path)
audio_segment = audio[start_time_ms:end_time_ms]
segments = []

for i in range(0, len(audio_segment), segment_length_ms):
segment = audio_segment[i : i + segment_length_ms]
segment_name = f"segment_{i // segment_length_ms}.mp3"
segment_file = os.path.join(output_dir, segment_name)
segment.export(segment_file, format="mp3")
segments.append(segment_file)

return segments


def process_segment_with_whisper_and_diarization(
segment_path, diarization_results, pipe, model_batch_size
):
transcript = pipe(
segment_path, batch_size=model_batch_size, return_timestamps=True
)["chunks"]
print(f"TRANSCRIPT: {transcript}")

diarized_transcript = []
for chunk in transcript:
start, end = chunk["timestamp"]
speaker_label = get_speaker_label(diarization_results, start, end)
diarized_transcript.append(
{
"start": start,
"end": end,
"speaker": speaker_label,
"text": chunk["text"],
}
)
return diarized_transcript


def get_speaker_label(diarization_results, start, end):
# Handle None values for start and end
if start is None or end is None:
return None

overlap = {}
for turn, _, speaker in diarization_results.itertracks(yield_label=True):
if turn.end < start or turn.start > end:
continue
overlap[speaker] = (
overlap.get(speaker, 0) + min(end, turn.end) - max(start, turn.start)
)
if overlap:
return max(overlap, key=overlap.get)
return None


def download_youtube_audio(url, save_path):
try:
yt = YouTube(url)
video = yt.streams.filter(only_audio=True).first()
out_file = video.download(output_path=save_path)
base, ext = os.path.splitext(out_file)
new_file = base + ".mp3"
os.rename(out_file, new_file)
return new_file
except Exception as e:
print(f"Error downloading audio: {e}")
return None


def encode_faces(face_labels):
face_encodings = {}
for filepath, label in face_labels.items():
image = face_recognition.load_image_file(filepath)
encodings = face_recognition.face_encodings(image)
if not encodings:
print(f"No face detected in image: {filepath}, skipping...")
continue
face_encodings[label] = encodings[0]
return face_encodings


def extract_frames(video_path, timestamps_ms):
cap = cv2.VideoCapture(video_path)
frames = {}

for timestamp_ms in timestamps_ms:
cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_ms)
ret, frame = cap.read()
if ret:
frames[timestamp_ms] = frame
cap.release()
return frames


def recognize_faces(frames, face_encodings):
recognized_faces = {}
for timestamp, frame in frames.items():
face_locations = face_recognition.face_locations(frame, model="cnn")
face_encs = face_recognition.face_encodings(frame, face_locations)
for face_enc in face_encs:
matches = face_recognition.compare_faces(
list(face_encodings.values()), face_enc
)
if True in matches:
first_match_index = matches.index(True)
name = list(face_encodings.keys())[first_match_index]
recognized_faces[timestamp] = name
print(f"Timestamp (ms): {timestamp}, Recognized face: {name}")
return recognized_faces


def map_faces_to_speakers(diarized_transcript, recognized_faces, tolerance_ms=1000):
speaker_mapping = {}
labelname_to_speaker_mapping = {}

print("Received Recognized Faces (ms):", recognized_faces)

for segment_tuple in diarized_transcript:
segment, speaker = segment_tuple
segment_start_ms = segment.start * 1000
segment_end_ms = segment.end * 1000
segment_speakers = []

print(
f"\nProcessing Segment: [{segment_start_ms} - {segment_end_ms}], Speaker Label: {speaker}"
)

for face_time_ms, name in recognized_faces.items():
print(f" Checking Face Time: {face_time_ms}, Name: {name}")

if (
(segment_start_ms - tolerance_ms)
<= face_time_ms
<= (segment_end_ms + tolerance_ms)
):
segment_speakers.append(name)
print(
f" Matching Face Detected - Time (ms): {face_time_ms}, Name: {name}"
)

recognized_speaker = (
max(set(segment_speakers), key=segment_speakers.count)
if segment_speakers
else None
)
identified_speaker = recognized_speaker if recognized_speaker else speaker

speaker_mapping[(segment_start_ms, segment_end_ms)] = identified_speaker

if recognized_speaker:
labelname_to_speaker_mapping[speaker] = recognized_speaker

print(
f" Finalized Mapping for Segment: {segment_start_ms} - {segment_end_ms}, Speaker: {identified_speaker}"
)

return speaker_mapping, labelname_to_speaker_mapping


def export_diarized_transcript_with_names(
diarized_transcript, labelname_to_speaker_mapping, save_loc
):
with open(save_loc, "w") as file:
for segment in diarized_transcript:
print(f"DIARIZED SEGMENT IN EXPORT FUNC: {segment}")
start, end, original_speaker_label, text = (
segment["start"],
segment["end"],
segment["speaker"],
segment["text"],
)

updated_speaker_label = labelname_to_speaker_mapping.get(
original_speaker_label, original_speaker_label
)

file.write(f"{start}-{end} {updated_speaker_label}: {text}\n")


def extract_audio_from_mp4(video_file_path, output_audio_path):
"""
Extracts the audio from an MP4 file and saves it as an MP3 file.
"""
with VideoFileClip(video_file_path) as video:
audio = video.audio
audio.write_audiofile(output_audio_path, codec="mp3")
return output_audio_path


def get_video_duration(video_path):
with VideoFileClip(video_path) as video:
return video.duration * 1000


def main():
parser = argparse.ArgumentParser(
description="Read configuration from transcribe_config YAML file"
)
parser.add_argument("config_file", help="Path to YAML config file")

args = parser.parse_args()
config_file = args.config_file
config = load_config(config_file)

if config:
model_size = config["model"]["size"]
model_device = "cuda" if torch.cuda.is_available() else "mps"
model_chunk_length = int(config["model"]["chunk_length"])
model_batch_size = int(config["model"]["batch_size"])

video_path = "input/Regular Council Mtg 1-4-2024.mp4"
audio_output_path = "output_audio/test_audio.mp3"
if not os.path.exists(audio_output_path):
audio_path = extract_audio_from_mp4(video_path, audio_output_path)
else:
print(f"Audio file already exists at {audio_output_path}")
audio_path = audio_output_path

config["audio"]["path"] = audio_path

save_loc = config["transcript"]["save_loc"]

print("Model Size:", model_size)
print("Model Device:", model_device)
print("Chunk Length:", model_chunk_length)
print("Batch Size", model_batch_size)
print("Audio Path:", audio_path)
print("---------------")

model_names = {
"tiny": "openai/whisper-tiny.en",
"base": "openai/whisper-base.en",
"small": "openai/whisper-small.en",
"medium": "openai/whisper-medium.en",
"large": "openai/whisper-large",
"large-v2": "openai/whisper-large-v2",
}

model = model_names.get(model_size, "openai/whisper-tiny")

pipe = pipeline(
"automatic-speech-recognition",
model=model,
chunk_length_s=model_chunk_length,
device=model_device,
)

total_duration_ms = get_video_duration(video_path)
last_15_minutes_ms = 15 * 60 * 1000

start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0)
end_time_ms = total_duration_ms

segment_length_ms = 60000

segments = split_audio(
audio_path, start_time_ms, end_time_ms, segment_length_ms, "output_audio"
)

access_token = ""

full_diarized_transcript = []
combined_diarization_data = []
if segments:
segment = segments[0]
print(f"Processing segment: {segment}")

preprocessed_segment_path = preprocess_audio_for_diarization(segment)
diarization = perform_diarization(preprocessed_segment_path, access_token)

for turn, _, speaker in diarization.itertracks(yield_label=True):
combined_diarization_data.append((turn, speaker))

segment_transcript = process_segment_with_whisper_and_diarization(
segment, diarization, pipe, model_batch_size
)
full_diarized_transcript.extend(segment_transcript)
os.remove(segment)

# Load face labels from CSV and prepare face encodings for recognition
face_labels = load_face_labels("training_data/training_data.csv")
face_encodings = encode_faces(face_labels)
video_path = "input/Regular Council Mtg 1-4-2024.mp4"

offset_ms = start_time_ms

# Adjust timestamps for frame extraction
adjusted_timestamps = []
for segment in full_diarized_transcript:
if segment["end"] is None or segment["speaker"] is None:
continue

print("Start ms", segment)
start_ms = int(segment["start"] * 1000) + offset_ms
adjusted_timestamps.append(start_ms)
print(f"Adjusted timestamps {adjusted_timestamps}")

# Extract and save frames
frames = extract_frames(video_path, adjusted_timestamps)
for timestamp in adjusted_timestamps:
print("TIMESTAMP FOR FRAME: {timestamp}")
if timestamp in frames:
frame = frames[timestamp]
filename = os.path.join("output_frames", f"frame_{timestamp}.jpg")
cv2.imwrite(filename, frame)

recognized_faces = recognize_faces(frames, face_encodings)
print("Recognized Faces Original", recognized_faces)

recognized_faces_adjusted = {
timestamp - offset_ms: name
for timestamp, name in recognized_faces.items()
}
print("Adjusted Recognized Faces:", recognized_faces_adjusted)

print("Combined Diarization Data:", combined_diarization_data)

speaker_mapping, labelname_to_speaker_mapping = map_faces_to_speakers(
combined_diarization_data, recognized_faces_adjusted, tolerance_ms=1
)

print("Speaker Mapping:", speaker_mapping)

export_diarized_transcript_with_names(
full_diarized_transcript, labelname_to_speaker_mapping, save_loc
)

print(
"Transcription and Diarization with Speaker Names Complete. Saved to",
save_loc,
)


if __name__ == "__main__":
main()
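For reference, get_speaker_label above assigns a Whisper chunk to the diarization speaker with the largest time overlap. A minimal sketch of that behaviour with a stand-in diarization object; Turn and FakeDiarization are illustrative substitutes for pyannote types, and get_speaker_label is assumed to be in scope (for example, pasted into the same module):

from collections import namedtuple

Turn = namedtuple("Turn", ["start", "end"])

class FakeDiarization:
    def __init__(self, tracks):
        self._tracks = tracks  # list of (turn, track_id, speaker) tuples

    def itertracks(self, yield_label=True):
        return iter(self._tracks)

diarization = FakeDiarization([
    (Turn(0.0, 4.0), "A", "SPEAKER_00"),
    (Turn(4.0, 9.0), "B", "SPEAKER_01"),
])

# A chunk spanning 3.0-7.0 s overlaps SPEAKER_00 for 1 s and SPEAKER_01 for 3 s,
# so the larger overlap wins.
print(get_speaker_label(diarization, 3.0, 7.0))  # -> "SPEAKER_01"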
54 changes: 30 additions & 24 deletions packages/whisper/archive/transcribe.py
@@ -8,7 +8,7 @@

def load_config(config_file):
try:
with open(config_file, 'r') as stream:
with open(config_file, "r") as stream:
config = yaml.safe_load(stream)
return config
except FileNotFoundError:
@@ -20,56 +20,62 @@ def load_config(config_file):


def main():
parser = argparse.ArgumentParser(description = "Read configuration from transcribe_config YAML file")
parser.add_argument("config_file", help = "Path to YAML config file")

parser = argparse.ArgumentParser(
description="Read configuration from transcribe_config YAML file"
)
parser.add_argument("config_file", help="Path to YAML config file")

args = parser.parse_args()
config_file = args.config_file
config = load_config(config_file)

if config:
model_size = config["model"]["size"]
model_device = config["model"]["device"]
model_chunk_length = int(config["model"]["chunk_length"])
model_batch_size = int(config["model"]["batch_size"])
audio_path = config["audio"]["path"]
save_loc = config["transcript"]["save_loc"]

print("Model Size:", model_size)
print("Model Device:", model_device)
print("Chunk Length:", model_chunk_length)
print("Batch Size", model_batch_size)
print("Audio Path:", audio_path)
print("---------------")


model_names = {"tiny":"openai/whisper-tiny.en",
"base":"openai/whisper-base.en",
"small":"openai/whisper-small.en",
"medium":"openai/whisper-medium.en",
"large":"openai/whisper-large",
"large_v2":"openai/whisper-large-v2"}


model_names = {
"tiny": "openai/whisper-tiny.en",
"base": "openai/whisper-base.en",
"small": "openai/whisper-small.en",
"medium": "openai/whisper-medium.en",
"large": "openai/whisper-large",
"large_v2": "openai/whisper-large-v2",
}

model = model_names[model_size]

pipe = pipeline(
"automatic-speech-recognition",
model = model,
chunk_length_s = model_chunk_length,
device = model_device
model=model,
chunk_length_s=model_chunk_length,
device=model_device,
)
start_time = time.time()

transcript = pipe(audio_path, batch_size = model_batch_size, return_timestamps = True)["chunks"]


transcript = pipe(audio_path, batch_size=model_batch_size, return_timestamps=True)[
"chunks"
]

end_time = time.time()
total_time = end_time - start_time
print("Generation Complete. Time to Generate:", str(total_time))
print("Saving Transcript to", save_loc)
with open(save_loc, 'w') as f:
with open(save_loc, "w") as f:
for chunk in transcript:
f.write("%s\n" % chunk)
print("Save Complete")


if __name__ == "__main__":
main()
main()
8 changes: 4 additions & 4 deletions packages/whisper/config.yml
@@ -1,12 +1,12 @@
model:
size: "tiny"
size: "base"
device: "mps" # or your specific GPU device
chunk_length: "30"
batch_size: 12
chunk_length: "60"
batch_size: 4
audio:
path: "output_audio/City Council Meeting 142024.mp3"
offset_ms: 0 # Adjust the offset based on your requirements
transcript:
save_loc: "output_transcript/test_transcript.txt"
save_loc: "output_transcript/test-transcript-chunk-60-base.txt"
diarization:
path: "diarization" # Adjust the path to your diarization data
316 changes: 316 additions & 0 deletions packages/whisper/src-preprocess.py
@@ -0,0 +1,316 @@
import os
import yaml
import argparse
import face_recognition
import pandas as pd
from pydub import AudioSegment
from pytube import YouTube
from transformers import pipeline
import torch
from moviepy.editor import VideoFileClip

import numpy as np
import librosa
import soundfile as sf

from pydub.effects import normalize, compress_dynamic_range
from pydub import silence
from scipy.signal import butter, lfilter


def load_config(config_file):
try:
with open(config_file, "r") as stream:
config = yaml.safe_load(stream)
return config
except FileNotFoundError:
print(f"Config file '{config_file}' not found.")
return None
except yaml.YAMLError as e:
print(f"Error parsing config file: {e}")
return None


def process_segment_with_whisper(segment_path, pipe, model_batch_size):
transcript = pipe(
segment_path, batch_size=model_batch_size, return_timestamps=True
)["chunks"]
print(f"TRANSCRIPT: {transcript}")

processed_transcript = []
for chunk in transcript:
start, end = chunk["timestamp"]
processed_transcript.append(
{
"start": start,
"end": end,
"text": chunk["text"],
}
)
return processed_transcript


def download_youtube_audio(url, save_path):
try:
yt = YouTube(url)
video = yt.streams.filter(only_audio=True).first()
out_file = video.download(output_path=save_path)
base, ext = os.path.splitext(out_file)
new_file = base + ".mp3"
os.rename(out_file, new_file)
return new_file
except Exception as e:
print(f"Error downloading audio: {e}")
return None


def recognize_faces(frames, face_encodings):
recognized_faces = {}
for timestamp, frame in frames.items():
face_locations = face_recognition.face_locations(frame, model="cnn")
face_encs = face_recognition.face_encodings(frame, face_locations)
for face_enc in face_encs:
matches = face_recognition.compare_faces(
list(face_encodings.values()), face_enc
)
if True in matches:
first_match_index = matches.index(True)
name = list(face_encodings.keys())[first_match_index]
recognized_faces[timestamp] = name
print(f"Timestamp (ms): {timestamp}, Recognized face: {name}")
return recognized_faces


def export_transcript(transcript, save_loc):
with open(save_loc, "w") as file:
for segment in transcript:
print(f"TRANSCRIPT SEGMENT IN EXPORT FUNC: {segment}")
start, end, text = (
segment["start"],
segment["end"],
segment["text"],
)

file.write(f"{start}-{end}: {text}\n")


def get_video_duration(video_path):
with VideoFileClip(video_path) as video:
return video.duration * 1000


def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir):
audio = AudioSegment.from_file(file_path)
audio_segment = audio[start_time_ms:end_time_ms]
segments = []

for i in range(0, len(audio_segment), segment_length_ms):
segment = audio_segment[i : i + segment_length_ms]
segment_name = f"segment_{i // segment_length_ms}.mp3"
segment_file = os.path.join(output_dir, segment_name)
segment.export(segment_file, format="mp3")
segments.append(segment_file)

return segments


def preprocess_and_extract_audio(
video_path, output_audio_path, target_sample_rate=16000
):
with VideoFileClip(video_path) as video:
audio = video.audio
temp_audio_path = "temp_audio.wav"
audio.write_audiofile(
temp_audio_path, codec="pcm_s16le"
)

y, sr = librosa.load(temp_audio_path, sr=target_sample_rate)

sf.write(output_audio_path, y, target_sample_rate)

os.remove(temp_audio_path)


def apply_dynamic_range_compression(audio_path):
audio = AudioSegment.from_file(audio_path)
compressed_audio = compress_dynamic_range(audio)
normalized_audio = normalize(compressed_audio)
compressed_audio_path = "compressed_" + audio_path
normalized_audio.export(compressed_audio_path, format="mp3")
return compressed_audio_path


def remove_silence(audio_path, silence_thresh=-40, min_silence_len=1000):
audio = AudioSegment.from_file(audio_path)
non_silent_chunks = silence.split_on_silence(
audio,
min_silence_len=min_silence_len,
silence_thresh=silence_thresh,
keep_silence=500,
)

processed_audio = AudioSegment.empty()
for chunk in non_silent_chunks:
processed_audio += chunk

processed_audio_path = "processed_" + os.path.basename(audio_path)
processed_audio.export(processed_audio_path, format="mp3")
return processed_audio_path


def butter_bandpass(lowcut, highcut, fs, order=5):
nyq = 0.5 * fs
low = lowcut / nyq
high = highcut / nyq
b, a = butter(order, [low, high], btype="band")
return b, a


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
b, a = butter_bandpass(lowcut, highcut, fs, order=order)
y = lfilter(b, a, data)
return y


def bandpass_filter(audio_path, low=300, high=3400, sr=16000):
y, sr = librosa.load(audio_path, sr=sr)

y_filtered = butter_bandpass_filter(y, low, high, sr)

filtered_audio_path = "filtered_" + os.path.basename(audio_path)
sf.write(filtered_audio_path, y_filtered, sr)

return filtered_audio_path


def change_speed(audio_path, speed=1.0):
audio = AudioSegment.from_file(audio_path)
playback_speed_audio = audio.speedup(playback_speed=speed)
speed_changed_audio_path = "speed_changed_" + audio_path
playback_speed_audio.export(speed_changed_audio_path, format="mp3")
return speed_changed_audio_path


def audio_preprocessing_pipeline(audio_segment_path):
compressed_audio_path = apply_dynamic_range_compression(audio_segment_path)

no_silence_audio_path = remove_silence(compressed_audio_path)

filtered_audio_path = bandpass_filter(no_silence_audio_path)

return filtered_audio_path


def process_segment_with_whisper(segment_path, pipe, model_batch_size):
transcript = pipe(
segment_path, batch_size=model_batch_size, return_timestamps=True
)["chunks"]
print(f"TRANSCRIPT: {transcript}")

processed_transcript = []
for chunk in transcript:
start, end = chunk["timestamp"]
processed_transcript.append(
{
"start": start,
"end": end,
"text": chunk["text"],
}
)
return processed_transcript


def extract_audio_from_mp4(video_file_path, output_audio_path):
"""
Extracts the audio from an MP4 file and saves it as an MP3 file.
"""
with VideoFileClip(video_file_path) as video:
audio = video.audio
audio.write_audiofile(output_audio_path, codec="mp3")
return output_audio_path


def main():
parser = argparse.ArgumentParser(
description="Read configuration from transcribe_config YAML file"
)
parser.add_argument("config_file", help="Path to YAML config file")

args = parser.parse_args()
config_file = args.config_file
config = load_config(config_file)

if config:
model_size = config["model"]["size"]
model_device = "cuda" if torch.cuda.is_available() else "mps"
model_chunk_length = int(config["model"]["chunk_length"])
model_batch_size = int(config["model"]["batch_size"])

video_path = "input/Regular Council Mtg 1-4-2024.mp4"
audio_output_path = "output_audio/test_audio.mp3"
if not os.path.exists(audio_output_path):
audio_path = extract_audio_from_mp4(video_path, audio_output_path)
else:
print(f"Audio file already exists at {audio_output_path}")
audio_path = audio_output_path

config["audio"]["path"] = audio_path

save_loc = config["transcript"]["save_loc"]

print("Model Size:", model_size)
print("Model Device:", model_device)
print("Chunk Length:", model_chunk_length)
print("Batch Size", model_batch_size)
print("Audio Path:", audio_path)
print("---------------")

model_names = {
"tiny": "openai/whisper-tiny.en",
"base": "openai/whisper-base.en",
"small": "openai/whisper-small.en",
"medium": "openai/whisper-medium.en",
"large": "openai/whisper-large",
"large-v2": "openai/whisper-large-v2",
"large-v3": "openai/whisper-large-v3",
}

model = model_names.get(model_size, "openai/whisper-large")

pipe = pipeline(
"automatic-speech-recognition",
model=model,
chunk_length_s=model_chunk_length,
device=model_device,
)

total_duration_ms = get_video_duration(video_path)
last_15_minutes_ms = 15 * 60 * 1000
start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0)
end_time_ms = total_duration_ms

audio = AudioSegment.from_file(audio_path)
last_15_min_audio = audio[start_time_ms:end_time_ms]
last_15_min_audio_path = os.path.join("output_audio", "last_15_min.mp3")
last_15_min_audio.export(last_15_min_audio_path, format="mp3")

preprocessed_audio_path = audio_preprocessing_pipeline(
last_15_min_audio_path
)

print(f"Processing audio: {preprocessed_audio_path}")
full_transcript = process_segment_with_whisper(
preprocessed_audio_path, pipe, model_batch_size
)

export_transcript(full_transcript, save_loc)

print("Transcription Complete. Saved to", save_loc)


if __name__ == "__main__":
main()
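The band-pass stage above keeps roughly the 300-3400 Hz voice band before transcription. A minimal sketch of butter_bandpass_filter on a synthetic 16 kHz signal; the test tones are illustrative, and the helper is assumed to be in scope from the file above:

import numpy as np

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
# 50 Hz hum plus a 1 kHz tone: the filter should attenuate the hum and keep the tone.
signal = 0.5 * np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 1000 * t)

filtered = butter_bandpass_filter(signal, 300, 3400, sr, order=5)
print(filtered.shape)  # (16000,)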
275 changes: 44 additions & 231 deletions packages/whisper/src.py
@@ -2,13 +2,10 @@
import yaml
import argparse
import face_recognition
import cv2
import pandas as pd
from pydub import AudioSegment
from pytube import YouTube

from transformers import pipeline
from pyannote.audio import Pipeline
import torch
from moviepy.editor import VideoFileClip

@@ -26,85 +23,23 @@ def load_config(config_file):
return None


def load_face_labels(csv_file):
df = pd.read_csv(csv_file)

df.loc[:, "label"] = df.label.str.lower()
df.loc[:, "filepath"] = df.filepath.str.lower()
base_path = os.path.join(os.getcwd(), "training_data") # Base path for images
adjusted_filepaths = [os.path.join(base_path, path) for path in df["filepath"]]
return dict(zip(adjusted_filepaths, df["label"]))


def preprocess_audio_for_diarization(file_path):
audio = AudioSegment.from_file(file_path)
audio = audio.set_channels(1).set_frame_rate(16000)
preprocessed_path = file_path.replace(".mp3", "_preprocessed.wav")
audio.export(preprocessed_path, format="wav")
return preprocessed_path


def perform_diarization(file_path, access_token):
diarization_pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization", use_auth_token=access_token
)
diarization = diarization_pipeline(file_path)
return diarization


def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir):
audio = AudioSegment.from_file(file_path)
audio_segment = audio[start_time_ms:end_time_ms]
segments = []

for i in range(0, len(audio_segment), segment_length_ms):
segment = audio_segment[i : i + segment_length_ms]
segment_name = f"segment_{i // segment_length_ms}.mp3"
segment_file = os.path.join(output_dir, segment_name)
segment.export(segment_file, format="mp3")
segments.append(segment_file)

return segments


def process_segment_with_whisper_and_diarization(
segment_path, diarization_results, pipe, model_batch_size
):
def process_segment_with_whisper(segment_path, pipe, model_batch_size):
transcript = pipe(
segment_path, batch_size=model_batch_size, return_timestamps=True
)["chunks"]
print(f"TRANSCRIPT: {transcript}")

diarized_transcript = []
processed_transcript = []
for chunk in transcript:
start, end = chunk["timestamp"]
speaker_label = get_speaker_label(diarization_results, start, end)
diarized_transcript.append(
processed_transcript.append(
{
"start": start,
"end": end,
"speaker": speaker_label,
"text": chunk["text"],
}
)
return diarized_transcript


def get_speaker_label(diarization_results, start, end):
# Handle None values for start and end
if start is None or end is None:
return None

overlap = {}
for turn, _, speaker in diarization_results.itertracks(yield_label=True):
if turn.end < start or turn.start > end:
continue
overlap[speaker] = (
overlap.get(speaker, 0) + min(end, turn.end) - max(start, turn.start)
)
if overlap:
return max(overlap, key=overlap.get)
return None
return processed_transcript


def download_youtube_audio(url, save_path):
@@ -121,31 +56,6 @@ def download_youtube_audio(url, save_path):
return None


def encode_faces(face_labels):
face_encodings = {}
for filepath, label in face_labels.items():
image = face_recognition.load_image_file(filepath)
encodings = face_recognition.face_encodings(image)
if not encodings:
print(f"No face detected in image: {filepath}, skipping...")
continue
face_encodings[label] = encodings[0]
return face_encodings


def extract_frames(video_path, timestamps_ms):
cap = cv2.VideoCapture(video_path)
frames = {}

for timestamp_ms in timestamps_ms:
cap.set(cv2.CAP_PROP_POS_MSEC, timestamp_ms)
ret, frame = cap.read()
if ret:
frames[timestamp_ms] = frame
cap.release()
return frames


def recognize_faces(frames, face_encodings):
recognized_faces = {}
for timestamp, frame in frames.items():
@@ -163,72 +73,17 @@ def recognize_faces(frames, face_encodings):
return recognized_faces


def map_faces_to_speakers(diarized_transcript, recognized_faces, tolerance_ms=1000):
speaker_mapping = {}
labelname_to_speaker_mapping = {}

print("Received Recognized Faces (ms):", recognized_faces)

for segment_tuple in diarized_transcript:
segment, speaker = segment_tuple
segment_start_ms = segment.start * 1000
segment_end_ms = segment.end * 1000
segment_speakers = []

print(
f"\nProcessing Segment: [{segment_start_ms} - {segment_end_ms}], Speaker Label: {speaker}"
)

for face_time_ms, name in recognized_faces.items():
print(f" Checking Face Time: {face_time_ms}, Name: {name}")

if (
(segment_start_ms - tolerance_ms)
<= face_time_ms
<= (segment_end_ms + tolerance_ms)
):
segment_speakers.append(name)
print(
f" Matching Face Detected - Time (ms): {face_time_ms}, Name: {name}"
)

recognized_speaker = (
max(set(segment_speakers), key=segment_speakers.count)
if segment_speakers
else None
)
identified_speaker = recognized_speaker if recognized_speaker else speaker

speaker_mapping[(segment_start_ms, segment_end_ms)] = identified_speaker

if recognized_speaker:
labelname_to_speaker_mapping[speaker] = recognized_speaker

print(
f" Finalized Mapping for Segment: {segment_start_ms} - {segment_end_ms}, Speaker: {identified_speaker}"
)

return speaker_mapping, labelname_to_speaker_mapping


def export_diarized_transcript_with_names(
diarized_transcript, labelname_to_speaker_mapping, save_loc
):
def export_transcript(transcript, save_loc):
with open(save_loc, "w") as file:
for segment in diarized_transcript:
print(f"DIARIZED SEGMENT IN EXPORT FUNC: {segment}")
start, end, original_speaker_label, text = (
for segment in transcript:
print(f"TRANSCRIPT SEGMENT IN EXPORT FUNC: {segment}")
start, end, text = (
segment["start"],
segment["end"],
segment["speaker"],
segment["text"],
)

updated_speaker_label = labelname_to_speaker_mapping.get(
original_speaker_label, original_speaker_label
)

file.write(f"{start}-{end} {updated_speaker_label}: {text}\n")
file.write(f"{start}-{end}: {text}\n")


def extract_audio_from_mp4(video_file_path, output_audio_path):
@@ -241,6 +96,26 @@ def extract_audio_from_mp4(video_file_path, output_audio_path):
return output_audio_path


def get_video_duration(video_path):
with VideoFileClip(video_path) as video:
return video.duration * 1000


def split_audio(file_path, start_time_ms, end_time_ms, segment_length_ms, output_dir):
audio = AudioSegment.from_file(file_path)
audio_segment = audio[start_time_ms:end_time_ms]
segments = []

for i in range(0, len(audio_segment), segment_length_ms):
segment = audio_segment[i : i + segment_length_ms]
segment_name = f"segment_{i // segment_length_ms}.mp3"
segment_file = os.path.join(output_dir, segment_name)
segment.export(segment_file, format="mp3")
segments.append(segment_file)

return segments


def main():
parser = argparse.ArgumentParser(
description="Read configuration from transcribe_config YAML file"
@@ -283,9 +158,10 @@ def main():
"medium": "openai/whisper-medium.en",
"large": "openai/whisper-large",
"large-v2": "openai/whisper-large-v2",
"large-v3": "openai/whisper-large-v3",
}

model = model_names.get(model_size, "openai/whisper-tiny")
model = model_names.get(model_size, "openai/whisper-base")

pipe = pipeline(
"automatic-speech-recognition",
@@ -294,87 +170,24 @@ def main():
device=model_device,
)

start_time_ms = (1 * 60 * 60 + 10 * 60) * 1000
end_time_ms = (1 * 60 * 60 + 18 * 60) * 1000
total_duration_ms = get_video_duration(video_path)
last_15_minutes_ms = 15 * 60 * 1000
start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0)
end_time_ms = total_duration_ms

segment_length_ms = 60000
audio = AudioSegment.from_file(audio_path)
last_15_min_audio = audio[start_time_ms:end_time_ms]
last_15_min_audio_path = os.path.join("output_audio", "last_15_min.mp3")
last_15_min_audio.export(last_15_min_audio_path, format="mp3")

segments = split_audio(
audio_path, start_time_ms, end_time_ms, segment_length_ms, "output_audio"
print(f"Processing audio: {last_15_min_audio_path}")
full_transcript = process_segment_with_whisper(
last_15_min_audio_path, pipe, model_batch_size
)

access_token = ""

full_diarized_transcript = []
combined_diarization_data = []
if segments:
segment = segments[0]
print(f"Processing segment: {segment}")

preprocessed_segment_path = preprocess_audio_for_diarization(segment)
diarization = perform_diarization(preprocessed_segment_path, access_token)

for turn, _, speaker in diarization.itertracks(yield_label=True):
combined_diarization_data.append((turn, speaker))
export_transcript(full_transcript, save_loc)

segment_transcript = process_segment_with_whisper_and_diarization(
segment, diarization, pipe, model_batch_size
)
full_diarized_transcript.extend(segment_transcript)
os.remove(segment)

# Load face labels from CSV and prepare fr
face_labels = load_face_labels("training_data/training_data.csv")
face_encodings = encode_faces(face_labels)
video_path = "input/Regular Council Mtg 1-4-2024.mp4"

offset_ms = start_time_ms

# Adjust timestamps for frame extraction
adjusted_timestamps = []
for segment in full_diarized_transcript:
if segment["end"] is None or segment["speaker"] is None:
continue

print("Start ms", segment)
start_ms = int(segment["start"] * 1000) + offset_ms
adjusted_timestamps.append(start_ms)
print(f"Adjusted timestamps {adjusted_timestamps}")

# Extract and save frames
frames = extract_frames(video_path, adjusted_timestamps)
for timestamp in adjusted_timestamps:
print("TIMESTAMP FOR FRAME: {timestamp}")
if timestamp in frames:
frame = frames[timestamp]
filename = os.path.join("output_frames", f"frame_{timestamp}.jpg")
cv2.imwrite(filename, frame)

recognized_faces = recognize_faces(frames, face_encodings)
print("Recognized Faces Original", recognized_faces)

recognized_faces_adjusted = {
timestamp - offset_ms: name
for timestamp, name in recognized_faces.items()
}
print("Adjusted Recognized Faces:", recognized_faces_adjusted)

print("Combined Diarization Data:", combined_diarization_data)

speaker_mapping, labelname_to_speaker_mapping = map_faces_to_speakers(
combined_diarization_data, recognized_faces_adjusted, tolerance_ms=1
)

print("Speaker Mapping:", speaker_mapping)

export_diarized_transcript_with_names(
full_diarized_transcript, labelname_to_speaker_mapping, save_loc
)

print(
"Transcription and Diarization with Speaker Names Complete. Saved to",
save_loc,
)
print("Transcription Complete. Saved to", save_loc)


if __name__ == "__main__":

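The reworked src.py trades the hard-coded 1:10-1:18 window for the last 15 minutes of the recording. A minimal sketch, not from the repo, of that window computation and the pydub slice on its own; the two-hour duration and file paths are assumptions:

from pydub import AudioSegment

total_duration_ms = 2 * 60 * 60 * 1000            # e.g. a two-hour meeting recording
last_15_minutes_ms = 15 * 60 * 1000               # 900,000 ms
start_time_ms = max(total_duration_ms - last_15_minutes_ms, 0)

audio = AudioSegment.from_file("output_audio/test_audio.mp3")
clip = audio[start_time_ms:total_duration_ms]     # pydub slices by milliseconds
clip.export("output_audio/last_15_min.mp3", format="mp3")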