scripts.py (forked from gkamradt/LLMTest_NeedleInAHaystack)
import glob

import tiktoken
from dotenv import load_dotenv
from langchain.evaluation import load_evaluator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

load_dotenv()

def read_files(directory):
    """Concatenate the contents of every file matching the glob pattern."""
    context = ""
    for file in glob.glob(directory):
        with open(file, 'r') as f:
            context += f.read()
    return context

def encode_and_trim(context, context_length, enc):
    """Truncate the context to at most context_length tokens."""
    tokens = enc.encode(context)
    if len(tokens) > context_length:
        context = enc.decode(tokens[:context_length])
    return context

def insert_needle(needle, context, depth_percent, context_length, enc):
    tokens_needle = enc.encode(needle)
    tokens_context = enc.encode(context)

    # Reduce the context length by a 150-token buffer to account for the
    # system message, the user question, and the response.
    context_length -= 150

    # If the context plus the needle exceeds the target context length
    # (which it usually will), trim the context by the needle's length.
    if len(tokens_context) + len(tokens_needle) > context_length:
        tokens_context = tokens_context[:context_length - len(tokens_needle)]

    if depth_percent == 100:
        # A depth of 100% means the needle is the last thing in the document,
        # so append it at the end.
        tokens_new_context = tokens_context + tokens_needle
    else:
        # Find the position (in tokens) at which to insert the needle.
        insertion_point = int(len(tokens_context) * (depth_percent / 100))

        # tokens_new_context represents the tokens before the needle.
        tokens_new_context = tokens_context[:insertion_point]

        # We want the needle to land at a sentence break, so first find
        # the token(s) that encode a period...
        period_tokens = enc.encode('.')

        # ...then iterate backwards until the preceding token is a period.
        while tokens_new_context and tokens_new_context[-1] not in period_tokens:
            insertion_point -= 1
            tokens_new_context = tokens_context[:insertion_point]

        # Add the needle, then stick the rest of the context on the other end.
        # Now we have a needle in a haystack.
        tokens_new_context += tokens_needle + tokens_context[insertion_point:]

    # Convert back to a string and return it.
    new_context = enc.decode(tokens_new_context)
    return new_context

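# Example (illustrative, not part of the original script): inserting a needle
# halfway into a synthetic context. The values below are assumptions for the demo.
#
#   enc = tiktoken.encoding_for_model("gpt-4")
#   haystack = "First sentence. Second sentence. Third sentence. " * 50
#   out = insert_needle("NEEDLE.", haystack, depth_percent=50,
#                       context_length=400, enc=enc)
#   # "NEEDLE." lands just after a period, roughly halfway through the tokens.
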
def generate_context(needle, context_length, depth_percent):
    # Load tiktoken so we can work with the text at the token level.
    enc = tiktoken.encoding_for_model("gpt-4")

    # Load the essay files into a single string.
    context = read_files("PaulGrahamEssays/*.txt")

    # Truncate the text to the desired context length.
    context = encode_and_trim(context, context_length, enc)

    # Insert the needle at the requested depth percentage.
    context = insert_needle(needle, context, depth_percent, context_length, enc)

    return context

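# Usage sketch (illustrative): build a 2,000-token haystack with the needle
# buried at 50% depth. Assumes the PaulGrahamEssays/*.txt files are present;
# the needle text below is just an example value.
#
#   needle = "The best thing to do in San Francisco is eat a sandwich."
#   context = generate_context(needle, context_length=2000, depth_percent=50)
#   assert needle in context
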
def evaluate_response(response, needle, question_to_ask, evaluation_model):
    accuracy_criteria = {
        "accuracy": """
        Score 1: The answer is completely unrelated to the reference.
        Score 3: The answer has minor relevance but does not align with the reference.
        Score 5: The answer has moderate relevance but contains inaccuracies.
        Score 7: The answer aligns with the reference but has minor omissions.
        Score 10: The answer is completely accurate and aligns perfectly with the reference.
        Only respond with a numerical score.
        """
    }

    # Use the supplied evaluation model (e.g. GPT-4) as the judge.
    evaluator = load_evaluator(
        "labeled_score_string",
        criteria=accuracy_criteria,
        llm=evaluation_model,
    )

    eval_result = evaluator.evaluate_strings(
        # The model's response
        prediction=response,
        # The actual answer
        reference=needle,
        # The question asked
        input=question_to_ask,
    )

    return int(eval_result['score'])

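# Example (illustrative): grading a response against the needle. The judge
# model below is an assumption; any LangChain chat model should work here.
#
#   from langchain.chat_models import ChatOpenAI
#   judge = ChatOpenAI(model="gpt-4", temperature=0)
#   score = evaluate_response(
#       response="You should eat a sandwich in Dolores Park.",
#       needle="The best thing to do in San Francisco is eat a sandwich.",
#       question_to_ask="What is the best thing to do in San Francisco?",
#       evaluation_model=judge,
#   )  # -> an int from 1 to 10
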
def result_exists(results, context_length, depth_percent, version, model):
    """
    Checks whether a result has already been evaluated.
    """
    conditions_met = []
    for result in results:
        context_length_met = result['context_length'] == context_length
        depth_percent_met = result['depth_percent'] == depth_percent
        version_met = result.get('version', 1) == version
        model_met = result['model'] == model
        conditions_met.append(context_length_met and depth_percent_met and version_met and model_met)
    return any(conditions_met)

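# Example (illustrative): records without a 'version' key default to version 1
# via result.get('version', 1), so the two calls below behave as shown.
#
#   results = [{'context_length': 2000, 'depth_percent': 50, 'model': 'gpt-4'}]
#   result_exists(results, 2000, 50, version=1, model='gpt-4')  # True
#   result_exists(results, 2000, 50, version=2, model='gpt-4')  # False
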
def retrieve_relevant_excerpts(long_text, question, embedding, chunk_size=500, top_k=6):
    """
    Retrieves relevant excerpts from a long text using a question and an embedding model.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
        length_function=len,
        add_start_index=True,
    )
    texts = text_splitter.create_documents([long_text])

    # Index the chunks with FAISS and pull back the top_k nearest matches.
    db = FAISS.from_texts([text.page_content for text in texts], embedding)
    retriever = db.as_retriever(search_kwargs={'k': top_k})
    retrieved_docs = retriever.get_relevant_documents(question)

    return 'DOCUMENT:\n' + '\nDOCUMENT:\n'.join([doc.page_content for doc in retrieved_docs])
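
if __name__ == '__main__':
    # Minimal end-to-end sketch (illustrative, not part of the original script):
    # build a haystack, retrieve excerpts, and grade them against the needle.
    # The chat model and embedding choices are assumptions; swap in whatever
    # your .env credentials support.
    from langchain.chat_models import ChatOpenAI
    from langchain.embeddings import OpenAIEmbeddings

    needle = "The best thing to do in San Francisco is eat a sandwich."
    question = "What is the best thing to do in San Francisco?"

    context = generate_context(needle, context_length=2000, depth_percent=50)
    excerpts = retrieve_relevant_excerpts(context, question, OpenAIEmbeddings())
    score = evaluate_response(
        response=excerpts,
        needle=needle,
        question_to_ask=question,
        evaluation_model=ChatOpenAI(model="gpt-4", temperature=0),
    )
    print(f"Accuracy score: {score}")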