-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_all_data.py
31 lines (22 loc) · 1020 Bytes
/
read_all_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
import glob
import os
# Directory that is searched for the input .xlsx workbooks.
path = r'/media/pauloricardo/basement/commodities_usecase/sheets/'
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the row order of the combined DataFrame is reproducible between runs.
all_files = sorted(glob.glob(os.path.join(path, "*.xlsx")))
def _read_excel(path: str) -> pd.DataFrame:
    """Load one Excel workbook into a DataFrame.

    Args:
        path: Location of the ``.xlsx`` file to read.

    Returns:
        The result of ``pd.read_excel`` for ``path``, parsed with the
        ``openpyxl`` engine.
    """
    # The engine is pinned explicitly rather than left to pandas' default.
    return pd.read_excel(path, engine='openpyxl')
# pd.concat raises an opaque "No objects to concatenate" ValueError on empty
# input, so report the real problem (no workbooks found) up front.
if not all_files:
    raise FileNotFoundError(f"No .xlsx files found in {path!r}")
# Generator: each workbook is read only when pd.concat asks for it.
df_from_each_file = (_read_excel(f) for f in all_files)
# Stack all workbooks into one frame with a single running index, then drop
# every row that has a missing value in any column.
df = pd.concat(df_from_each_file, ignore_index=True).dropna()
from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging
# Console output settings.
# NumPy: arrays with more than 100 elements are printed in summarised form.
np.set_printoptions(threshold=100)
# Logging: timestamped records at INFO level and above, emitted through the
# LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
# Instantiate the sentence-embedding model identified by its model name.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Encode all headlines in one call, then store the result as a list so each
# row of the frame receives its own entry in the 'embedding' column.
headlines = df['Headlines'].to_list()
headline_embeddings = model.encode(headlines)
df['embedding'] = list(headline_embeddings)

# Write the frame, including the new embedding column, to parquet.
df.to_parquet('/media/pauloricardo/basement/commodities_usecase/soybean_corn.parquet')