# DGGS_funcs.py (forked from ChocopieKewpie/dggsBenchmarks)
from functools import partial
from math import sqrt
from pathlib import Path
import geopandas as gpd
import h3pandas
from itertools import product
import time
import timeit
import csv
import matplotlib.pyplot as plt  # matplotlib is needed to plot the classified H3 cells
def benchmark_gpkg_files(input_dir, output_dir, num_files_to_open, bench_runs, h3_res=14):
    # Get a list of all GPKG files in the folder
    v_files = list(sorted(input_dir.glob('*.gpkg')))

    # Initialize lists to store benchmarking times
    indexing_time = []
    joining_time = []

    def run_loop(num_files_to_open, v_files, h3_res):
        index_time = []
        join_time = []
        start_time = time.time()
        for i, file in enumerate(v_files[:num_files_to_open]):
            col_name = file.stem.split('_')[1]  # Extract the column name from the file name
            df = gpd.read_file(file).to_crs(4326)

            # Indexing benchmark: polyfill each polygon with H3 cells at the target resolution
            index_Stime = time.time()
            df = df.h3.polyfill(h3_res, explode=True).drop(columns=['geometry'])
            index_Etime = time.time()
            index_Ftime = index_Etime - index_Stime
            index_time.append(index_Ftime)

            # Joining benchmark: rename the attribute column after the source file and join on the H3 index
            join_Stime = time.time()
            df.columns = [col_name, *df.columns[1:]]
            df = df.set_index('h3_polyfill')
            if i == 0:
                combined_df = df.copy()
            else:
                combined_df = combined_df.join(df)
            join_Etime = time.time()
            join_Ftime = join_Etime - join_Stime
            join_time.append(join_Ftime)

        # Calculate and print the time taken for each run
        end_time = time.time()
        time_taken = end_time - start_time
        indexing_time.append(sum(index_time))
        joining_time.append(sum(join_time))
        print(f'Run: index time - {sum(index_time)} seconds, joining time - {sum(join_time)} seconds, total time - {time_taken} seconds')

    # Run the benchmark
    timeit.timeit(lambda: run_loop(num_files_to_open, v_files, h3_res), number=bench_runs)
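
# Example usage (illustrative sketch; the paths and counts below are assumptions,
# not part of the original benchmark):
#
#   from pathlib import Path
#   benchmark_gpkg_files(
#       input_dir=Path('data/vector'),   # assumed folder containing *.gpkg layers
#       output_dir=Path('data/output'),  # accepted but not used by this function
#       num_files_to_open=5,
#       bench_runs=3,
#       h3_res=14,
#   )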

def load_and_prepare_data(input_dir, num_files_to_open, h3_res):
    # Get a list of all GPKG files in the folder
    v_files = list(sorted(input_dir.glob('*.gpkg')))
    combined_df = None
    for i, file in enumerate(v_files[:num_files_to_open]):
        col_name = file.stem.split('_')[1]  # Extract the column name from the file name
        df = gpd.read_file(file).to_crs(4326)
        df = df.h3.polyfill(h3_res, explode=True).drop(columns=['geometry'])
        df.columns = [col_name, *df.columns[1:]]  # Rename the attribute column after the source file
        df = df.set_index('h3_polyfill')
        if i == 0:
            combined_df = df.copy()
        else:
            combined_df = combined_df.join(df)
    return combined_df

def define_classes():
    # Number-theoretic predicates used to classify the per-cell sums
    def is_prime(n):
        if n <= 1 or not isinstance(n, int):
            return False
        for i in range(2, int(sqrt(n)) + 1):
            if n % i == 0:
                return False
        return True

    def is_polygonal(s, x):
        # True if x is an s-gonal number (s = 3 triangular, 4 square, ...)
        assert s > 2 and isinstance(s, int) and isinstance(x, int)
        n = (sqrt(8 * (s - 2) * x + (s - 4) ** 2) + (s - 4)) / (2 * (s - 2))
        return n.is_integer()

    def is_fibonacci(n):
        if not isinstance(n, int) or n < 0:
            return False
        a, b = 0, 1
        while a < n:
            a, b = b, a + b
        return a == n

    def is_perfect(n):
        # True if n equals the sum of its proper divisors
        if n < 2 or not isinstance(n, int):
            return False
        total = 1
        i = 2
        while i * i <= n:
            if n % i == 0:
                total += i + n // i
            i += 1
        return total == n

    classes = [
        ('is_prime', is_prime),
        ('is_triangular', partial(is_polygonal, 3)),
        ('is_rectangular', partial(is_polygonal, 4)),
        ('is_pentagonal', partial(is_polygonal, 5)),
        ('is_hexagonal', partial(is_polygonal, 6)),
        ('is_fibonacci', is_fibonacci),
        ('is_perfect', is_perfect),
    ]
    return classes
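
# Quick illustrative checks of the predicates above (assumed examples, shown as
# comments so they do not run on import):
#
#   checks = dict(define_classes())
#   checks['is_prime'](7)         # True
#   checks['is_triangular'](10)   # True  (10 = 1 + 2 + 3 + 4)
#   checks['is_rectangular'](9)   # True  (s = 4 tests square numbers)
#   checks['is_fibonacci'](8)     # True
#   checks['is_perfect'](28)      # True  (1 + 2 + 4 + 7 + 14 = 28)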

def apply_classifications(combined_df, classes):
    combined_df['sum'] = combined_df.select_dtypes(include='number').sum(axis=1)
    # Ensure 'sum' column is filled and converted to integers
    combined_df['sum'] = combined_df['sum'].fillna(0).astype(int)
    for name, func in classes:
        combined_df[name] = combined_df['sum'].apply(func)
    numerics = list(combined_df.select_dtypes(include='number'))
    combined_df = combined_df.drop(columns=numerics)
    return combined_df
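
# Illustration of the transformation (assumed toy values, not benchmark output):
# after apply_classifications, the numeric attribute columns and 'sum' are
# dropped and only one boolean column per predicate remains. A cell whose
# attributes sum to 6 would carry:
#   is_prime=False, is_triangular=True, is_rectangular=False,
#   is_pentagonal=False, is_hexagonal=True, is_fibonacci=False, is_perfect=True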

def benchmark_classification(combined_df, bench_runs):
    classify_time = []
    # Enumerate every possible combination of the boolean classification columns
    combinations = list(product([0, 1], repeat=len(combined_df.columns)))
    class_mapping = {tuple(row): f'Class_{i+1:03}' for i, row in enumerate(combinations)}

    def classify_loop():
        start_time = time.time()
        # Classification process: map each row of booleans to its class label
        bench_class = combined_df.apply(lambda row: class_mapping[tuple(row)], axis=1)
        end_time = time.time()
        # Calculate and print the time taken for each run
        time_taken = end_time - start_time
        classify_time.append(time_taken)
        print(f'Run: Time taken - {time_taken} seconds')

    timeit.timeit(lambda: classify_loop(), number=bench_runs)

def final_processing_and_plotting(combined_df, class_mapping):
    # Apply the final classification to create the 'class' column
    combined_df['class'] = combined_df.apply(lambda row: class_mapping[tuple(row.astype(int))], axis=1)
    # Create a GeoDataFrame of H3 cell boundary geometries
    h3_df = combined_df[['class']]
    h3_df = h3_df.h3.h3_to_geo_boundary()
    # Dissolve by 'class' and reproject to CRS 2193
    h3_df = h3_df.dissolve(by='class').reset_index().to_crs(2193)
    # Plot the results
    h3_df.plot('class', cmap='viridis')
    plt.show()
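
# End-to-end sketch of how these functions fit together (an illustrative
# assumption, not part of the original module): the paths, file count, run
# count, and resolution are placeholders. benchmark_classification keeps its
# class mapping local, so the caller rebuilds the same mapping before plotting.
if __name__ == '__main__':
    input_dir = Path('data/vector')  # assumed folder of *.gpkg layers

    combined_df = load_and_prepare_data(input_dir, num_files_to_open=5, h3_res=14)
    classes = define_classes()
    classified_df = apply_classifications(combined_df, classes)

    benchmark_classification(classified_df, bench_runs=3)

    # Rebuild the boolean-combination -> class-label mapping used for plotting
    combinations = list(product([0, 1], repeat=len(classified_df.columns)))
    class_mapping = {tuple(row): f'Class_{i+1:03}' for i, row in enumerate(combinations)}
    final_processing_and_plotting(classified_df, class_mapping)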