-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
AI/historical-data
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,4 +7,5 @@ raw/ | |
txt/ | ||
*.csv | ||
package-lock.json | ||
package.json | ||
package.json | ||
*.DS_Store |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
outs: | ||
- md5: 3878f1ecec272cc887ded3697602a653.dir | ||
size: 89147118 | ||
- md5: bfa4f1431d21d6972594243fae97f037.dir | ||
size: 119112746 | ||
nfiles: 2 | ||
hash: md5 | ||
path: faiss_index_general |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
outs: | ||
- md5: f3f1f964db26296ca3f600c2c092bea8.dir | ||
size: 89147118 | ||
- md5: 67c1f4c02de13c682a8136fe6f54beea.dir | ||
size: 119112746 | ||
nfiles: 2 | ||
hash: md5 | ||
path: faiss_index_in_depth |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
{ | ||
"messages": [ | ||
{ | ||
"page_content": "- Topic: Special City Council Meeting\n- Summary: The special city council meeting was held on Monday, January 24, 2022, with five members present and constituting a quorum.\n- Ordinance Number: N/A\n- Votes Summary:\n - Vote 1: N/A - (5 present, 2 absent)\n- Decision/Key Actions: Quorum established for the special meeting\n- Tags/Keywords: City Council, Quorum, Special Meeting\n- UID: f883c863-6bc8-4a13-9241-85f66d8d4111", | ||
"uid": "f883c863-6bc8-4a13-9241-85f66d8d4111", | ||
"publish_date": "1-24-2022" | ||
}, | ||
{ | ||
"page_content": "- Topic: Presentation - A Working Dialogue with Criminal Justice System Stakeholders\n- Summary: The New Orleans City Council held a presentation where several stakeholders from the criminal justice system, including Gary Sells, Tanyaka B. Cline, Lisa Tennenbaum, and others, provided input and engaged in dialogue.\n- Ordinance Number: N/A\n- Votes Summary: N/A\n- Decision/Key Actions: The presentation was for informational purposes and did not involve a vote.\n- Tags/Keywords: Criminal Justice, Stakeholders, Dialogue\n- UID: 5c777703-d0c5-4ebc-87aa-3cca9b9c1cba\n\nPlease note that the provided link is not accessible and may need to be verified for accurate information.", | ||
"uid": "5c777703-d0c5-4ebc-87aa-3cca9b9c1cba", | ||
"publish_date": "1-24-2022" | ||
}, | ||
{ | ||
"page_content": "- Topic: Motion to Suspend Rule 30\n- Summary: The motion to suspend Rule 30 was introduced by King and seconded by Harris. The motion to suspend the rules passed with 7 YEAS and 0 NAYS.\n- Ordinance Number: N/A\n- Votes Summary:\n Vote 1: Passed - (7 YEAS, 0 NAYS, 0 ABSTAIN, 0 ABSENT)\n- Decision/Key Actions: The motion to suspend Rule 30 was approved.\n- UID: b88bc882-aa3d-4479-8d02-1238120dfcac", | ||
"uid": "b88bc882-aa3d-4479-8d02-1238120dfcac", | ||
"publish_date": "1-24-2022" | ||
}, | ||
{ | ||
"page_content": "- Topic: Conditional use permit for a neighborhood commercial establishment\n- Summary: The ordinance aims to establish a conditional use to permit a neighborhood commercial establishment in an HU-RM1 Historic Urban Multi-Family Residential District. The specific location is Square 486, Lot 5, in the First Municipal District, bounded by Thalia Street, South Roman Street, South Prieur Street, and Martin Luther King, Jr. Boulevard. \n- Ordinance Number: CAL. NO. 33,608\n- Votes Summary:\n - Motion to Suspend the Rules: Passed - (7 YEAS, 0 NAYS, 0 ABSTAIN, 0 ABSENT)\n- Decision/Key Actions: The motion to suspend the rules to introduce the ordinance on first reading passed. The ordinance was introduced and laid over as required by law, with a 90-day deadline of 4/6/22.\n- UID: ec959e9e-59b1-45ab-87ea-4d9d4957df2f", | ||
"uid": "ec959e9e-59b1-45ab-87ea-4d9d4957df2f", | ||
"publish_date": "1-24-2022" | ||
}, | ||
{ | ||
"page_content": "- Topic: Adjournment Motion\n- Summary: Council member Harris seconded the motion to adjourn the meeting.\n- Ordinance Number: N/A\n- Votes Summary:\n Vote 1: Adjourn - 7 YEAS, 0 NAYS, 0 ABSTAIN, 0 ABSENT\n- Decision/Key Actions: The motion to adjourn the meeting passed unanimously.\n- Tags/Keywords: Adjournment, Motion, Meeting\n- UID: e4203df0-3dfb-4934-af9a-63de216225d0", | ||
"uid": "e4203df0-3dfb-4934-af9a-63de216225d0", | ||
"publish_date": "1-24-2022" | ||
} | ||
] | ||
} |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
outs: | ||
- md5: 3878f1ecec272cc887ded3697602a653.dir | ||
size: 89147118 | ||
- md5: bfa4f1431d21d6972594243fae97f037.dir | ||
size: 119112746 | ||
nfiles: 2 | ||
hash: md5 | ||
path: faiss_index_general |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
outs: | ||
- md5: f3f1f964db26296ca3f600c2c092bea8.dir | ||
size: 89147118 | ||
- md5: 67c1f4c02de13c682a8136fe6f54beea.dir | ||
size: 119112746 | ||
nfiles: 2 | ||
hash: md5 | ||
path: faiss_index_in_depth |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import os | ||
from summary_model import ( | ||
pdf_to_images, | ||
extract_text_from_image, | ||
save_ocr_to_json, | ||
load_and_split, | ||
extract_date_from_filename, | ||
summarize_text, | ||
save_summaries_to_json, | ||
concatenate_jsons, | ||
) | ||
|
||
|
||
def main():
    """Run OCR + summarization over every PDF in the 2021 minutes directory.

    For each PDF that does not already have a summary JSON: OCR the PDF to a
    scratch JSON, split the text into chunks, summarize each chunk, and write
    the summaries to a per-PDF JSON file. Finally concatenate all per-PDF
    JSONs into a single yearly "Minutes 2021.json".
    """
    documents_directory = "../../backend/src/minutes_agendas_directory/2021/pdfs"
    output_json_dir = "../../backend/src/minutes_agendas_directory/2021/json"

    os.makedirs(output_json_dir, exist_ok=True)

    for pdf_filename in os.listdir(documents_directory):
        if not pdf_filename.endswith(".pdf"):
            continue

        output_json_path = os.path.join(
            output_json_dir, f"{os.path.splitext(pdf_filename)[0]}.json"
        )

        # Idempotency: skip PDFs whose summaries were already generated.
        if os.path.exists(output_json_path):
            print(f"Skipping {pdf_filename}, output already exists.")
            continue

        pdf_path = os.path.join(documents_directory, pdf_filename)
        publish_date = extract_date_from_filename(pdf_filename)
        # BUGFIX: this path previously pointed at the 2022 json directory
        # (copy-paste from the 2022 run) even though this script processes
        # 2021 PDFs — keep the scratch OCR file next to this year's output.
        ocr_json_path = os.path.join(output_json_dir, "ocr_text.json")

        save_ocr_to_json(pdf_path, ocr_json_path, publish_date)
        chunks = load_and_split(ocr_json_path)
        summaries = summarize_text(chunks, publish_date)

        save_summaries_to_json(summaries, output_json_dir, pdf_filename)
        # The OCR file is a scratch artifact; remove it so it cannot be
        # swept up by the concatenation step below.
        os.remove(ocr_json_path)

    input_json_directory = "../../backend/src/minutes_agendas_directory/2021/json"
    output_json_concat_path = (
        "../../backend/src/minutes_agendas_directory/Minutes 2021.json"
    )
    concatenate_jsons(input_json_directory, output_json_concat_path)
    print(f"Summaries saved in directory: {output_json_dir}")


if __name__ == "__main__":
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
import pytesseract | ||
from pdf2image import convert_from_path | ||
from langchain.chat_models import ChatOpenAI | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from langchain.document_loaders import JSONLoader | ||
import json | ||
import os | ||
from langchain.chains import LLMChain | ||
from langchain.prompts import PromptTemplate | ||
import uuid | ||
import re | ||
|
||
|
||
def pdf_to_images(pdf_path):
    """Render every page of the PDF at *pdf_path* to an image via pdf2image."""
    pages = convert_from_path(pdf_path)
    return pages
|
||
|
||
def extract_text_from_image(image):
    """Run Tesseract OCR over a single page image and return the raw text."""
    ocr_text = pytesseract.image_to_string(image)
    return ocr_text
|
||
|
||
def save_ocr_to_json(pdf_path, ocr_json_path, publish_date):
    """OCR every page of *pdf_path* and dump the text to *ocr_json_path*.

    Output shape is ``{"messages": [{"page_content": <page text>}, ...]}``,
    one entry per page, matching what ``load_and_split`` expects.

    NOTE(review): *publish_date* is accepted but never written to the JSON;
    it is kept only for interface compatibility — confirm whether it was
    meant to be stored alongside each page.
    """
    messages = []
    for page_image in pdf_to_images(pdf_path):
        messages.append({"page_content": extract_text_from_image(page_image)})

    with open(ocr_json_path, "w") as out_file:
        json.dump({"messages": messages}, out_file, indent=4)
|
||
|
||
def load_and_split(json_path, chunk_size=4000, chunk_overlap=1000):
    """Load OCR page text from *json_path* and split it into overlapping chunks.

    The defaults (4000 chars, 1000 overlap) approximate two pages per chunk.
    Returns a list of langchain documents ready for summarization.
    """
    documents = JSONLoader(
        file_path=json_path,
        jq_schema=".messages[]",
        content_key="page_content",
    ).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
|
||
|
||
def extract_date_from_filename(filename):
    """Return the first M-D-YYYY style date found in *filename*, or None."""
    found = re.search(r"\d{1,2}-\d{1,2}-\d{4}", filename)
    if found is None:
        return None
    return found.group(0)
|
||
|
||
def summarize_text(chunks, publish_date):
    """Summarize each OCR text chunk with an LLM.

    Args:
        chunks: sequence of langchain documents (each exposing .page_content).
        publish_date: date string attached verbatim to every summary record.

    Returns:
        List of dicts of the form
        ``{"page_content": <summary>, "uid": <uuid4 str>, "publish_date": ...}``.
    """
    # SECURITY FIX: the OpenAI API key was previously hardcoded in source
    # (a leaked credential — it must be revoked). Read it from the
    # environment instead; ChatOpenAI also falls back to OPENAI_API_KEY
    # on its own when api_key is unset.
    chat = ChatOpenAI(
        model="gpt-3.5-turbo-1106",
        api_key=os.environ.get("OPENAI_API_KEY"),
    )

    # The prompt and chain are loop-invariant (uid is a template input),
    # so build them once instead of per chunk.
    prompt = PromptTemplate(
        input_variables=["text_content", "uid"],
        template="""
        ## Council Meeting Ordinance Summary
        ### Ordinance Details and Voting Outcomes:
        {text_content}
        ### Summary Guidelines:
        - **Objective**: Clearly summarize each ordinance that was up for vote, including its brief description and the outcome of the vote (whether it passed or not).
        - **Structure**: Present each ordinance separately, starting with its calendar number and title, followed by a brief description, the voting results, and any noteworthy amendments or discussions.
        - **Detail**: Highlight important aspects of each ordinance, such as the purpose of the ordinance, key amendments, and the final decision (passed, amended, withdrawn, etc.).
        - **Formatting**: Use a structured format, listing each ordinance as a separate bullet point for clarity.
        - **Tone**: Maintain a neutral and factual tone, focusing on delivering information as presented in the chunk.
        ### Additional Instructions:
        - **Specificity**: Ensure the summary is specific to the content of each ordinance, avoiding general statements.
        - **Contextual Clarity**: Where necessary, provide context to clarify the purpose of the ordinance or the implications of the vote.
        - **Coherence**: Each summary should provide a complete understanding of the ordinance's discussion and outcome within the council meeting.
        - For each ordinance, summarize the content, identify the ordinance number, which council member introduced it, identify the topic, and include the generated UID: {uid}.
        ### Example Format:
        - Topic: [Primary topic or focus of this chunk]]
        - Summary: [Your summary here]
        - Ordinance Number: [Ordinance number here]
        - Votes Summary:
        Vote 1: Passed or Failed or N/A - (Number of YEAS, Number of NAYS, Number of ABSTAIN, Number of ABSENT)
        Vote 2: [Summary of the second vote, if applicable]
        ...(Continue for additional votes)
        - Decision/Key Actions: [Key decisions or actions]
        - Tags/Keywords: [Relevant tags or keywords]
        - UID: {uid}
        ### Role Emphasis:
        As an AI assistant, your task is to distill key information from the meeting's minutes, offering clear and concise summaries of each ordinance and motion, and their respective outcomes, to enable quick understanding and retrieval of crucial details.
        """,
    )
    chain = LLMChain(llm=chat, prompt=prompt)

    summaries = []
    for chunk in chunks:
        text_content = chunk.page_content
        # A fresh UID per chunk ties each stored summary back to its source.
        uid = str(uuid.uuid4())

        # NOTE(review): `temperature` passed to run() is treated as a chain
        # input rather than a model setting — confirm it has any effect;
        # it is kept to preserve the original call exactly.
        summary = chain.run(text_content=text_content, uid=uid, temperature=1)
        print(summary)

        summaries.append(
            {"page_content": summary, "uid": uid, "publish_date": publish_date}
        )
    return summaries
|
||
|
||
def save_summaries_to_json(summaries, output_dir, pdf_filename):
    """Write *summaries* to ``<output_dir>/<pdf stem>.json`` under key 'messages'."""
    stem = os.path.splitext(pdf_filename)[0]
    destination = os.path.join(output_dir, f"{stem}.json")
    with open(destination, "w") as out_file:
        json.dump({"messages": summaries}, out_file, indent=4)
|
||
|
||
def concatenate_jsons(input_dir, output_file):
    """Merge the 'messages' lists of every JSON file in *input_dir* into one file.

    Files are processed in sorted name order so the combined output is
    deterministic (os.listdir returns entries in arbitrary order). If
    *output_file* lives inside *input_dir*, it is skipped, so re-running
    does not fold a previous combined output back into itself.
    """
    output_abs = os.path.abspath(output_file)
    all_messages = []

    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(input_dir, file_name)

        # Guard: never re-ingest the combined output from a previous run.
        if os.path.abspath(file_path) == output_abs:
            continue

        with open(file_path, "r") as file:
            data = json.load(file)
        # Missing 'messages' key is treated as an empty contribution.
        all_messages.extend(data.get("messages", []))

    with open(output_file, "w") as file:
        json.dump({"messages": all_messages}, file, indent=4)
|
||
|
||
# if __name__ == "__main__": | ||
# documents_directory = "../input" | ||
# output_json_dir = "../output" | ||
|
||
# os.makedirs(output_json_dir, exist_ok=True) # | ||
|
||
# for pdf_filename in os.listdir(documents_directory): | ||
# if pdf_filename.endswith(".pdf"): | ||
# output_json_path = os.path.join( | ||
# output_json_dir, f"{os.path.splitext(pdf_filename)[0]}.json" | ||
# ) | ||
|
||
# if os.path.exists(output_json_path): | ||
# print(f"Skipping {pdf_filename}, output already exists.") | ||
# continue | ||
|
||
# pdf_path = os.path.join(documents_directory, pdf_filename) | ||
# publish_date = extract_date_from_filename(pdf_filename) | ||
# ocr_json_path = "../output/ocr_text.json" | ||
|
||
# save_ocr_to_json(pdf_path, ocr_json_path, publish_date) | ||
# chunks = load_and_split(ocr_json_path) | ||
# summaries = summarize_text(chunks, publish_date) | ||
|
||
# save_summaries_to_json(summaries, output_json_dir, pdf_filename) | ||
# os.remove(ocr_json_path) | ||
|
||
# input_directory = "../output" | ||
# output_json_path = "../output/Minutes 2022.json" | ||
# concatenate_jsons(input_directory, output_json_path) | ||
# print(f"Summaries saved in directory: {output_json_dir}") |