convert_h5_to_pyarrow.py
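Streams `doc_ids` and `doc_vectors` from an HDF5 file into a single Parquet file in fixed-size chunks, so the whole embedding matrix never has to fit in memory at once.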
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import h5py
from tqdm import tqdm

# Define your file paths
hdf5_file = 'pubmed_embeddings.h5'
parquet_file = 'pubmed_embeddings.parquet'

# Open the HDF5 file
with h5py.File(hdf5_file, 'r') as hdf:
    # Get the datasets you want to convert
    doc_ids = hdf['doc_ids']
    doc_vectors = hdf['doc_vectors']

    # Check that the datasets have compatible shapes
    assert doc_ids.shape[0] == doc_vectors.shape[0], "The number of doc_ids must match the number of doc_vectors"

    # Define the chunk size
    chunk_size = 100_000  # Adjust the chunk size based on your memory constraints

    # Initialize the Parquet writer
    parquet_writer = None

    # Initialize the progress bar
    num_chunks = (doc_ids.shape[0] + chunk_size - 1) // chunk_size  # Calculate the total number of chunks
    pbar = tqdm(total=num_chunks, desc="Processing chunks", unit="chunk")

    # Read the data in chunks and write to Parquet
    for i in range(0, doc_ids.shape[0], chunk_size):
        # Read a chunk of data
        chunk_ids = doc_ids[i:i + chunk_size]
        chunk_vectors = doc_vectors[i:i + chunk_size]

        # Combine the chunks into a single DataFrame
        chunk_df = pd.DataFrame({
            'doc_id': chunk_ids,
            'doc_vector': list(chunk_vectors)
        })

        # Convert the DataFrame to a PyArrow Table
        chunk_table = pa.Table.from_pandas(chunk_df)

        # Write the chunk to Parquet, creating the writer from the first chunk's schema
        if parquet_writer is None:
            parquet_writer = pq.ParquetWriter(parquet_file, chunk_table.schema)
        parquet_writer.write_table(chunk_table)

        # Update the progress bar
        pbar.update(1)

# Close the Parquet writer
if parquet_writer:
    parquet_writer.close()

# Close the progress bar
pbar.close()
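
To verify the output, you can read the Parquet file back afterwards. This is a minimal sanity-check sketch (not part of the original script), using only the pyarrow.parquet module already imported above:

# Optional sanity check: confirm the row count and schema of the written file.
table = pq.read_table(parquet_file, columns=['doc_id'])  # read one column to stay light on memory
print(f"{table.num_rows} rows written")
print(pq.read_schema(parquet_file))  # expect doc_id plus a list-typed doc_vector column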