# convert_word_to_excel.py
import zipfile
from io import StringIO
from xml.etree import ElementTree

import openai
import pandas as pd
# XML namespace prefix used by every WordprocessingML element in a .docx body.
_WORD_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"


def _paragraph_text(paragraph):
    """Concatenate the visible text of one ``w:p`` element."""
    pieces = []
    for node in paragraph.iter():
        if node.tag == _WORD_NS + "t":
            pieces.append(node.text or "")
        elif node.tag == _WORD_NS + "tab":
            pieces.append("\t")
        elif node.tag in (_WORD_NS + "br", _WORD_NS + "cr"):
            pieces.append("\n")
    return "".join(pieces)


def _block_lines(container):
    """Yield one text line per paragraph and one ``a | b | c`` line per table row."""
    for child in container:
        if child.tag == _WORD_NS + "p":
            yield _paragraph_text(child)
        elif child.tag == _WORD_NS + "tbl":
            for row in child.findall(_WORD_NS + "tr"):
                yield " | ".join(
                    " ".join(_block_lines(cell))
                    for cell in row.findall(_WORD_NS + "tc")
                )
        else:
            # Wrapper elements (e.g. content controls) may hold paragraphs/tables.
            yield from _block_lines(child)


def _read_docx_text(file_path):
    """Return the body of the .docx file at *file_path* as plain text."""
    try:
        with zipfile.ZipFile(file_path) as archive:
            root = ElementTree.fromstring(archive.read("word/document.xml"))
    except (zipfile.BadZipFile, KeyError, ElementTree.ParseError) as err:
        raise ValueError(f"{file_path!r} is not a valid .docx document") from err
    body = root.find(_WORD_NS + "body")
    if body is None:
        return ""
    return "\n".join(_block_lines(body))


def extract_text_with_gpt(api_key, file_path, prompt):
    """Ask an OpenAI chat model to extract content from a Word document.

    The Chat Completions endpoint has no ``file`` parameter, so the document
    is not uploaded. Its text is read locally from the .docx archive
    (paragraphs line by line, table rows as ``cell | cell | ...``) and sent
    inline after *prompt* in a single user message.

    Args:
        api_key: OpenAI API key used for the request.
        file_path: Path to the ``.docx`` file to read.
        prompt: Instruction telling the model what to extract.

    Returns:
        The text content of the model's first reply choice.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        ValueError: If *file_path* is not a readable .docx document.
    """
    # Read the document first so a bad path fails before any API call is made.
    document_text = _read_docx_text(file_path)

    # Set your OpenAI API key
    openai.api_key = api_key

    # NOTE: ``openai.ChatCompletion`` is the pre-1.0 interface of the openai
    # package, kept here to match the rest of this file.
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Use the appropriate model
        messages=[
            {
                "role": "user",
                # The customizable prompt, followed by the document's text
                "content": f"{prompt}\n\nDocument content:\n{document_text}",
            }
        ],
    )

    # Extract the content returned by GPT
    return response.choices[0].message['content']
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    if not isinstance(text, str):
        return text
    stripped = text.strip()
    if not stripped.startswith("```"):
        return text
    # Drop the opening fence line, which may carry a language tag such as "csv".
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines.pop()
    return "\n".join(lines)


def save_text_to_excel(text_content, excel_file_path):
    """Parse CSV-formatted text and write it to an Excel file.

    Chat models often wrap CSV output in a Markdown code fence
    (```` ```csv ... ``` ````). A fence surrounding the whole text is removed
    before parsing so the fence lines are not read as a header or data row.
    Text without a leading fence is parsed unchanged.

    Args:
        text_content: CSV-formatted text, optionally wrapped in a code fence.
        excel_file_path: Destination path of the Excel file.

    Raises:
        pandas.errors.EmptyDataError: If there is no CSV content to parse.
    """
    # Convert the text content to a pandas DataFrame
    # Assuming text content is CSV formatted; adjust based on your actual prompt's output format
    df = pd.read_csv(StringIO(_strip_code_fence(text_content)))

    # Save the DataFrame to an Excel file
    df.to_excel(excel_file_path, index=False)
    print(f"Data successfully saved to {excel_file_path}.")
def main(api_key, word_file_path, excel_file_path, prompt):
    """Run the full conversion: query the model, then write its reply to Excel.

    Args:
        api_key: OpenAI API key forwarded to ``extract_text_with_gpt``.
        word_file_path: Path to the Word document to process.
        excel_file_path: Path of the Excel file to create.
        prompt: Instruction forwarded to the model.
    """
    # The model's reply is handed straight to the Excel writer.
    save_text_to_excel(
        extract_text_with_gpt(api_key, word_file_path, prompt),
        excel_file_path,
    )
# Example usage: running this file as a script converts a single document.
if __name__ == "__main__":
    main(
        api_key="your_openai_api_key",  # Replace with your actual API key
        word_file_path='example.docx',  # Path to the Word document
        excel_file_path='output.xlsx',  # Desired output path for the Excel file
        prompt="Extract all table data in CSV format from the uploaded Word document.",
    )