# ExtractorTest.py - builds a quiz JSON file from the text of a questions PDF and its solutions PDF.
import fitz # PyMuPDF to extract text from PDFs
import re
import json
def extract_text_from_pdf(pdf_path, start_page=6):
    """Extract the text of a PDF from a given page through the last page.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.
        start_page: 1-based number of the first page to include. Values
            below 1 are treated as 1.

    Returns:
        The text of the selected pages concatenated in page order. The
        result is an empty string when ``start_page`` is past the last page.
    """
    # Clamp the 0-based index: a start_page below 1 would otherwise produce a
    # negative index instead of starting at the first page.
    first_index = max(start_page - 1, 0)
    with fitz.open(pdf_path) as pdf:
        # Join once instead of growing a string with += for every page.
        return "".join(
            pdf[page_num].get_text()
            for page_num in range(first_index, len(pdf))
        )
def extract_answers_and_explanations(text):
    """Extract the correct answers and explanations from the solutions text.

    An entry is recognised when a line starts with a question id (digits, or
    ``A`` followed by digits), followed by whitespace, then the answer
    letters (``a``-``e``, optionally separated by commas/spaces), then the
    explanation text, which runs up to (not including) the next line that
    starts with ``FL``.

    Args:
        text: Text extracted from the solutions PDF.

    Returns:
        A dict keyed by question id (e.g. ``"1"``, ``"A1"``). Each value is
        ``{"correct": <answer letters>, "explanation": {"full": <text>}}``.
        When an id occurs more than once, the last occurrence wins.
    """
    # NOTE: the answer group is greedy, so it consumes every leading
    # a-e / comma / space character that follows the question id.
    entry_pattern = re.compile(
        r"(\n)(\d+|A\d+)\s+([a-e, ]+)(.*?)(?=\nFL)", re.DOTALL
    )

    answers = {}
    # A single pass yields both the answer and its explanation; the pattern
    # is identical for both, so scanning the text twice is unnecessary.
    for match in entry_pattern.finditer(text):
        answer_id = match.group(2).strip()
        answers[answer_id] = {
            # Strip the padding the character class lets through ("a " -> "a").
            "correct": match.group(3).strip(),
            "explanation": {"full": match.group(4).strip()},
        }

    # Debugging: show which question ids were recognised.
    print(f"Debug - Extracted answers and explanations: {answers.keys()}")
    return answers
def extract_questions_and_options(text):
    """Extract questions and their answer options from the questions text.

    A question starts with ``Domanda <id>`` (``<id>`` is digits, optionally
    prefixed with ``A``) and its text runs up to the first line that starts
    with ``a)`` .. ``e)``. Options are the lines starting with ``<letter>)``
    found between the end of that question text and the start of the next
    question (or the end of the text for the last question). Only the first
    line of each option is captured.

    At most five options are kept per question; a fifth one is kept only if
    its letter is ``e``.

    Args:
        text: Text extracted from the questions PDF.

    Returns:
        A list of dicts, one per question in document order, with keys
        ``"title"`` (e.g. ``"Domanda 1"``), ``"question"`` (title followed by
        the question text) and ``"options"`` (strings such as ``"a) ..."``).
    """
    question_pattern = re.compile(
        r"(Domanda\s(?:A?\d+))\s*(.*?)(?=\n[a-e]\))", re.DOTALL
    )
    # "." does not match a newline here, so the text group stops at the end
    # of the option's line, whatever follows it (another option, other text,
    # the next "Domanda", or the end of the input).
    option_pattern = re.compile(r"\n([a-e])\)\s*(.*)")

    question_matches = list(question_pattern.finditer(text))
    questions = []

    for index, match in enumerate(question_matches):
        title = match.group(1).strip()
        question_text = match.group(2).strip()
        full_question = f"{title} {question_text}"

        # Search for options only up to where the next question begins, so
        # options of later questions can never be attributed to this one and
        # the rest of the document is not re-scanned for every question.
        region_start = match.end()
        if index + 1 < len(question_matches):
            region_end = question_matches[index + 1].start()
        else:
            region_end = len(text)
        option_matches = option_pattern.findall(text, region_start, region_end)

        # Debugging: display the option matches for this question.
        print(f"Debug - Extracted option matches for {title}: {option_matches}")

        options = []
        for option_letter, option_text in option_matches[:5]:
            full_option = f"{option_letter}) {option_text.strip()}"
            print(f"Debug - Checking option: {full_option}")
            options.append(full_option)

            # A fifth option is only valid when it is option "e)".
            if len(options) == 5:
                if option_letter != "e":
                    print(f"Debug - Discarding 5th option for {title} because it doesn't start with 'e)'.")
                    options = options[:4]
                break

        # Debugging: final options for the current question.
        print(f"Debug - Final Options for {title}: {options}")
        questions.append(
            {"title": title, "question": full_question, "options": options}
        )

    return questions
def generate_quiz_json(questions, answers, output_path):
    """Write a quiz JSON file combining questions, options and answers.

    Args:
        questions: Iterable of dicts with keys ``"title"``, ``"question"``
            and ``"options"``. The question id is taken from the title,
            which must contain ``Domanda <id>``; questions without such an
            id are skipped with a warning.
        answers: Dict keyed by question id; each value provides
            ``"correct"`` and, optionally, ``"explanation"``.
        output_path: Path of the JSON file to write (UTF-8).

    The file contains a one-element list holding the quiz object. Questions
    with no entry in ``answers`` get ``"correct": []`` and
    ``"explanation": {}``.
    """
    quiz_data = {
        "id": "1",
        "title": "ISTQB Sample Exam",
        "time": "60 minutes",
        "questionList": [],
    }
    id_pattern = re.compile(r"Domanda\s([A-Z]?\d+)")

    for question in questions:
        # Normalize the question id so it can be looked up in ``answers``.
        id_match = id_pattern.search(question["title"])
        if not id_match:
            print(f"Warning: Unable to extract ID for question: {question['title']}")
            continue
        question_id = id_match.group(1).strip()

        question_data = {
            "question": question["question"],
            "options": question["options"],
        }

        if question_id in answers:
            answer = answers[question_id]
            correct = answer["correct"]
            # Tolerate answer entries that carry no explanation.
            explanation = answer.get("explanation", {})
            question_data["correct"] = correct
            question_data["explanation"] = explanation
            print(f"Debug - Found match for question {question_id}: Correct Answer: {correct}, Explanation: {explanation}")
        else:
            print(f"Warning: No answers found for Domanda {question_id}, adding empty fields.")
            question_data["correct"] = []
            question_data["explanation"] = {}

        quiz_data["questionList"].append(question_data)

    # ensure_ascii=False emits non-ASCII characters as-is, so the file must
    # be opened with an explicit encoding rather than the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump([quiz_data], f, indent=4, ensure_ascii=False)
    print(f"Quiz JSON saved at: {output_path}")
# Input PDFs (questions and their solutions) and the JSON file to produce.
questions_pdf_path = "D:/isqtb/ITASTQB-QTEST-FL-2023-A.pdf"
answers_pdf_path = "D:/isqtb/ITASTQB-QTEST-FL-2023-A-SOL.pdf"
output_json_path = "D:/isqtb/JSONS/ITASTQB-QTEST-FL-2023-A.json"

# Read the raw text: questions from page 8 onward, answers from page 6 onward.
questions_text = extract_text_from_pdf(questions_pdf_path, 8)
answers_text = extract_text_from_pdf(answers_pdf_path, 6)

# Turn the raw text into structured questions and answers.
parsed_questions = extract_questions_and_options(questions_text)
parsed_answers = extract_answers_and_explanations(answers_text)

# Combine both and write the quiz file.
generate_quiz_json(parsed_questions, parsed_answers, output_json_path)