release: focus group 2 #183

Merged · 27 commits · Dec 7, 2023

Commits
9d87c50
git branch test
outlawhayden Dec 5, 2023
f966245
adding transcription packages. should theoretically not conflict with…
outlawhayden Dec 5, 2023
3111274
web components
outlawhayden Dec 5, 2023
f274ce5
updated with most recent UI changes
mikafur32 Dec 5, 2023
ada131c
commenting out db client calls for local deployment
outlawhayden Dec 5, 2023
6137269
external script for varying answers
outlawhayden Dec 6, 2023
2b465db
typescript cleanup
mikafur32 Dec 6, 2023
637724c
Merge branch 'tulane-staging' of https://github.com/outlawhayden/sawt…
mikafur32 Dec 6, 2023
e1c49d9
key error
mikafur32 Dec 6, 2023
6a7eddf
remedy error
mikafur32 Dec 6, 2023
18764ed
Merge pull request #173 from outlawhayden/tulane-staging
outlawhayden Dec 6, 2023
c039ef9
performance Improvements
mikafur32 Dec 6, 2023
1488600
Merge pull request #175 from outlawhayden/tulane-staging
marvinmarnold Dec 6, 2023
4443b39
Merge pull request #176 from eye-on-surveillance/response-rating
marvinmarnold Dec 7, 2023
2cf56f7
feat: migrations and feedback UI
marvinmarnold Dec 7, 2023
909cf79
Merge pull request #177 from eye-on-surveillance/ma/tulane-migrations
marvinmarnold Dec 7, 2023
b0e869e
fix: reset index
marvinmarnold Dec 7, 2023
1f5896b
Merge pull request #178 from eye-on-surveillance/ma/fix-1
marvinmarnold Dec 7, 2023
994b5c9
fix: real index reset
marvinmarnold Dec 7, 2023
07c647f
Merge pull request #179 from eye-on-surveillance/ma/fix-2
marvinmarnold Dec 7, 2023
69793de
feat: improve tabs
marvinmarnold Dec 7, 2023
b7dcf87
edit: main func now sends processed queries from the gcloud to supabase
ayyubibrahimi Dec 7, 2023
61757bc
Merge pull request #181 from eye-on-surveillance/AI/supabase
ayyubibrahimi Dec 7, 2023
75b920c
Shuffles questions and makes citations a drop down
Dec 7, 2023
22bfe58
Merge pull request #182 from outlawhayden/ma/focus-group-2
marvinmarnold Dec 7, 2023
fb7c85d
fix: random ordering and feedback vids
marvinmarnold Dec 7, 2023
ccf57b2
Merge pull request #180 from eye-on-surveillance/ma/focus-group-2
marvinmarnold Dec 7, 2023
2 changes: 2 additions & 0 deletions README.md
@@ -2,6 +2,8 @@

Work in Progress

## Tulane Dev Integration Branch

Sawt is a tool designed to bridge the communication gap between New Orleanians and their city council representatives.

## Prerequisites
3 changes: 3 additions & 0 deletions packages/googlecloud/functions/getanswer/inquirer.py
@@ -171,7 +171,9 @@ def get_indepth_response_from_query(df, db, query, k):
    query = transform_query_for_date(query)

    doc_list = db.similarity_search_with_score(query, k=k)

    docs = sort_retrived_documents(doc_list)

    docs_page_content = append_metadata_to_content(doc_list)

    template = """
@@ -245,3 +247,4 @@ def answer_query(
    final_response = route_question(df, db_general, db_in_depth, query, response_type)

    return final_response

9 changes: 8 additions & 1 deletion packages/googlecloud/functions/getanswer/main.py
@@ -5,7 +5,7 @@
import google.cloud.logging
import functions_framework
from supabase import create_client

from dotenv import find_dotenv, load_dotenv
from helper import parse_field, get_dbs
from inquirer import answer_query
import os
@@ -19,6 +19,9 @@
db_general, db_in_depth, voting_roll_df = get_dbs()

# Setup Supabase client
load_dotenv(find_dotenv())


try:
    supabase_url = os.environ["SUPABASE_URL_PRODUCTION"]
    supabase_key = os.environ["SUPABASE_SERVICE_KEY_PRODUCTION"]
@@ -115,8 +118,12 @@ def getanswer(request):

    end = time.time()
    elapsed = int((end - start) * 1000)  # elapsed time in milliseconds

    update_supabase(responses_data, citations_data, card_id, elapsed)
    logging.info(f"Completed getanswer in {elapsed} ms")
    print(f"\n\t--------- Completed getanswer in {elapsed} ms --------\n")

    return ("Answer successfully submitted to Supabase", 200, headers)
50 changes: 50 additions & 0 deletions packages/googlecloud/functions/getanswer/process_public_queries.py
@@ -0,0 +1,50 @@
import os
import pandas as pd
import numpy as np
import requests
import csv
import json
from tqdm import tqdm

# Input CSV file with 'title' column
input_csv = "/Users/haydenoutlaw/Desktop/card_rows_export_2023-11-29.csv"
output_csv = "/Users/haydenoutlaw/Desktop/gpt4-varied-11-29.csv"

# point to getanswer server
api_endpoint = "http://localhost:8080"

# list of k values
k_list = [5, 10, 15]

# get response from local getanswer server, store answers
def make_api_call(title, k_inp):
    payload = {"query": title, "response_type": "in_depth", "card_id": 1, "k": k_inp}
    response = requests.post(api_endpoint, json=payload)
    rdict = json.loads(response.text)
    card_type_out = rdict["card_type"]
    citations_out = rdict["citations"]
    responses_out = rdict["responses"]
    return card_type_out, citations_out, responses_out, k_inp

# Write the header only if the output file is new or empty, so reruns in
# append mode do not duplicate it
write_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0

# Open CSV file in append mode
with open(output_csv, 'a', newline='', encoding='utf-8') as csv_file:
    # define csv out file
    csv_writer = csv.writer(csv_file)
    if write_header:
        csv_writer.writerow(["query", "response_id", "card_type", "citations", "responses", "k"])

    # read inputs
    df = pd.read_csv(input_csv)

    print("Connected to getanswer at", api_endpoint)
    print("K Values", k_list)
    print("Generating Responses....")

    # for all queries, get answers and write out one at a time
    tqiter = enumerate(tqdm(df["title"]))
    for i, query in tqiter:
        for k_val in k_list:
            card_type, citations, responses, k = make_api_call(query, k_val)
            csv_writer.writerow([query, i, card_type, citations, responses, k])

print(f"Results saved to '{output_csv}'.")
8 changes: 8 additions & 0 deletions packages/transcription/.gitignore
@@ -0,0 +1,8 @@
.env
.log
__pycache__/
transcripts-data/
audio/
cred/
.vscode/

19 changes: 19 additions & 0 deletions packages/transcription/transcribe/README.md
@@ -0,0 +1,19 @@
## TU Capstone - Transcription

A generic API for fetching YouTube audio and transcripts.

#### Required Credentials
- YOUTUBE_API_KEY
- GOOGLE_APPLICATION_CREDENTIALS

Create a `cred` folder containing a `cred.env` file that defines these variables (loaded via dotenv); see the example below.
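
A minimal `cred/cred.env`, covering the variables `oauth.py` reads (all values below are placeholders, not real credentials):

```env
YOUTUBE_API_KEY=your-youtube-data-api-key
CLIENT_ID=your-oauth-client-id
CLIENT_SECRET=your-oauth-client-secret
GOOGLE_APPLICATION_CREDENTIALS=cred/service-account.json
```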

### transcripts.py
Retrieves & downloads the transcripts of the x most recent videos from a YouTube channel.

### monitor.py
Retrieves & downloads audio (mp4) for the x most recent videos from a YouTube channel. A future implementation could use Windows Task Scheduler to periodically monitor the channel for new videos; a sketch follows this section.
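
For illustration only, a scheduled task along these lines could run the monitor daily (the task name and start time are hypothetical):

```bash
schtasks /Create /SC DAILY /ST 09:00 /TN "SawtMonitor" /TR "python monitor.py"
```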

### oauth.py
Helper that loads API credentials from `cred/cred.env` and exposes them to the other scripts.


71 changes: 71 additions & 0 deletions packages/transcription/transcribe/monitor.py
@@ -0,0 +1,71 @@
from googleapiclient.discovery import build
# import youtube_dl  # no longer used; pytube handles audio downloads instead
import os
from dotenv import load_dotenv
from pytube import YouTube
import oauth
# Initialize the YouTube Data API client

env_vars = oauth.import_env_vars()
YOUTUBE_API_KEY = env_vars.get('YOUTUBE_API_KEY')
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

# Specify the YouTube channel ID
channel_id = 'UC8oPEsQe9a0v6TdJ4K_QXoA'  # New Orleans City Council

def get_latest_videos(channel_id, max_results=5):
    """
    Fetches the latest x-number of videos from a YouTube channel.

    Args:
        channel_id (str): The ID of the YouTube channel to monitor.
        max_results (int): The maximum number of latest videos to fetch. Default is 5.

    Returns:
        list: A list of video IDs for the latest videos.
    """
    # Fetch channel details to get the ID of the uploads playlist
    request = youtube.channels().list(
        part='contentDetails',
        id=channel_id
    )
    response = request.execute()

    if not response.get('items'):
        raise ValueError(f"No channel found with ID {channel_id}")

    playlist_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

    request = youtube.playlistItems().list(
        part='snippet',
        playlistId=playlist_id,
        maxResults=max_results
    )
    response = request.execute()

    video_ids = [item['snippet']['resourceId']['videoId'] for item in response['items']]

    return video_ids

def download_audio(video_ids):
    """
    Downloads the audio of a list of YouTube videos using pytube.

    Args:
        video_ids (list): A list of YouTube video IDs to download the audio for.

    Downloads: mp4 audio files of the desired YouTube videos.
    """
    for video_id in video_ids:
        yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
        ys = yt.streams.filter(only_audio=True).first()

        # Download the audio stream to the specified output path
        print(f'Downloading audio for {video_id}...')
        ys.download(output_path=r'transcripts-data\audio', filename=video_id + ".mp4")

# Get the latest videos
video_ids = get_latest_videos(channel_id, 10)

# Download the audio of the new videos
download_audio(video_ids)
21 changes: 21 additions & 0 deletions packages/transcription/transcribe/oauth.py
@@ -0,0 +1,21 @@

import os
from dotenv import load_dotenv

def import_env_vars():
    # Assumes the script is launched from the repository root
    os.chdir(r"packages\transcription")
    load_dotenv(r"cred\cred.env")

    # Get credentials from environment variables
    YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
    CLIENT_ID = os.getenv("CLIENT_ID")
    CLIENT_SECRET = os.getenv("CLIENT_SECRET")
    GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

    # Guard against a missing value: assigning None to os.environ raises TypeError
    if GOOGLE_APPLICATION_CREDENTIALS:
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS

    return {
        "YOUTUBE_API_KEY": YOUTUBE_API_KEY,
        "CLIENT_ID": CLIENT_ID,
        "CLIENT_SECRET": CLIENT_SECRET,
        "GOOGLE_APPLICATION_CREDENTIALS": GOOGLE_APPLICATION_CREDENTIALS,
    }
67 changes: 67 additions & 0 deletions packages/transcription/transcribe/transcripts.py
@@ -0,0 +1,67 @@
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
import oauth
import json
import os

# Get credentials from environment variables
env_vars = oauth.import_env_vars()
YOUTUBE_API_KEY = env_vars.get("YOUTUBE_API_KEY")
CLIENT_ID = env_vars.get("CLIENT_ID")
CLIENT_SECRET = env_vars.get("CLIENT_SECRET")
GOOGLE_APPLICATION_CREDENTIALS = env_vars.get("GOOGLE_APPLICATION_CREDENTIALS")

def get_latest_videos(channel_id, max_results=5):
    """
    Fetches the latest x-number of videos from a YouTube channel.

    Args:
        channel_id (str): The ID of the YouTube channel to monitor.
        max_results (int): The maximum number of latest videos to fetch. Default is 5.

    Returns:
        list: A list of video IDs for the latest videos.
    """
    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

    # Fetch channel details to get the ID of the uploads playlist
    request = youtube.channels().list(
        part='contentDetails',
        id=channel_id
    )
    response = request.execute()

    if not response.get('items'):
        raise ValueError(f"No channel found with ID {channel_id}")

    playlist_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

    request = youtube.playlistItems().list(
        part='snippet',
        playlistId=playlist_id,
        maxResults=max_results
    )
    response = request.execute()

    video_ids = [item['snippet']['resourceId']['videoId'] for item in response['items']]

    return video_ids

def download_transcripts(video_ids):
    """Fetches the transcript for each video ID and saves it as JSON."""
    for video_id in video_ids:
        try:
            # Grabs transcript for the video
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            with open(f'transcripts-data\\YT_transcripts\\{video_id}_transcript.json', 'w+', encoding='utf-8') as file:
                json.dump(transcript, file)

            print(f'Transcript for {video_id} saved successfully.')

        except Exception as e:
            print(f'An error occurred while fetching the transcript for {video_id}: {e}')

channel_id = "UC8oPEsQe9a0v6TdJ4K_QXoA"
video_ids = get_latest_videos(channel_id, 10)
download_transcripts(video_ids)
28 changes: 28 additions & 0 deletions packages/transcription/whisper-model/README.md
@@ -0,0 +1,28 @@
# HF Whisper Transcript App
Application of [OpenAI Whisper-V2](https://huggingface.co/openai/whisper-large-v2) for audio file transcription.


## To Run
Configure `transcribe_config.yml`:
```yml
model:
#model size
#tiny, base, small, medium, large, large_v2
size: "tiny"
# device for pytorch processing
device: "cpu"
# chunk length for audio processing
chunk_length: "10"
# batch size
batch_size: 1
audio:
# path to audio file to process
path: "audio/trial_meeting.mp3"
transcript:
# location to save transcript
save_loc: "transcripts/trial_meeting_transcript.txt"
```
Execute from the command line:
```bash
python transcribe.py transcribe_config.yml
```
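
For orientation, here is a minimal sketch of what a `transcribe.py` driven by this config might look like, built on the Hugging Face `pipeline` API; the actual script in this package may differ.

```python
import sys

import yaml
from transformers import pipeline

# Load settings from the YAML config passed on the command line
with open(sys.argv[1]) as f:
    cfg = yaml.safe_load(f)

# Map the config's size to a Hugging Face model id, e.g. "tiny" -> "openai/whisper-tiny"
size = cfg["model"]["size"]
model_id = "openai/whisper-large-v2" if size == "large_v2" else f"openai/whisper-{size}"

asr = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    device=cfg["model"]["device"],              # "cpu" or e.g. "cuda:0"
    chunk_length_s=int(cfg["model"]["chunk_length"]),
    batch_size=int(cfg["model"]["batch_size"]),
)

# Transcribe the configured audio file and save the text
result = asr(cfg["audio"]["path"])
with open(cfg["transcript"]["save_loc"], "w", encoding="utf-8") as out:
    out.write(result["text"])
```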