! Brandon Cook's ELPA driver, published in the article: Cook, Kurth, Deslippe, Carrier, Hill, Wichmann, "Eigensolver performance comparison on Cray XC systems," Concurrency Computat Pract Exper. 2019, 31, https://doi.org/10.1002/cpe.4997
! Updated by Paul Lin July 2020 for ELPA 2020.05.001
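!
! Usage (illustrative; the launcher and binary name are assumptions):
!   mpirun -np <nprow*npcol> ./benchmark_elpa <n> <nprow> <npcol> <nb>
! where n is the global matrix order, nprow x npcol is the BLACS process
! grid, and nb is the square block size of the block-cyclic distribution
! (the column block size is set equal to the row block size below).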
module helper
  implicit none
contains

  ! Build a full Hermitian-structured matrix a_sym from the random matrix
  ! a_ref: first form the conjugate transpose of a_ref with pztranc, then
  ! overwrite the uplo triangle of a_sym with the same triangle of a_ref.
  subroutine transpose_matrix(uplo, ln, a_ref, a_sym, desca)
    character, intent(in) :: uplo
    integer, intent(in) :: ln
    complex(kind=8), intent(in) :: a_ref(:,:)
    complex(kind=8), intent(out) :: a_sym(:,:)
    integer, intent(in) :: desca(:)
    complex(kind=8) :: alpha, beta
    logical, external :: lsame ! LAPACK character-comparison function
    alpha = (1.0d0, 0.0d0)
    beta = (0.0d0, 0.0d0)
    call pztranc(ln, ln, alpha, a_ref, 1, 1, desca, beta, a_sym, 1, 1, desca)
    if (lsame(uplo, "L")) then
      call pztrmr2d("L", "N", ln, ln, a_ref, 1, 1, desca, a_sym, 1, 1, desca, desca(2))
    else
      call pztrmr2d("U", "N", ln, ln, a_ref, 1, 1, desca, a_sym, 1, 1, desca, desca(2))
    end if
  end subroutine transpose_matrix

  ! Parse an integer from a command-line argument string.
  elemental subroutine convert_to_int(str, i, stat)
    character(len=*), intent(in) :: str
    integer, intent(out) :: i, stat
    read(str, *, iostat=stat) i
  end subroutine convert_to_int

end module helper
program benchmark_elpa
  use elpa
  ! use elpa_driver ! not in ELPA 2020.05.001
  use helper
  use, intrinsic :: iso_c_binding, only: c_int
  implicit none
  include 'mpif.h'
  integer :: ctxt_sys, my_blacs_ctxt
  integer :: rank, size, i, j
  integer :: ln, lnprow, lnpcol, lnbrow, lnbcol
  integer :: llda, ml, nl, lsize, numEle
  integer :: myrow, mycol, info, err
  real(kind=8) :: t1, t2
  class(elpa_t), pointer :: e
  ! integer, allocatable :: desca(:)
  integer :: desca(9)
  complex(kind=8), allocatable :: a_ref(:,:), a_sym(:,:), z(:,:)
  real(kind=8), allocatable :: w(:)
  integer, dimension(4) :: iseed
  character(len=10), dimension(4) :: argc
  integer, external :: numroc ! ScaLAPACK local-dimension function
  integer :: mpi_comm_rows, mpi_comm_cols
  integer, parameter :: ELS_TO_PRINT = 6 ! arbitrary number of elements to print out
  integer(kind=c_int) :: error_elpa
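
  ! Overall flow: parse the command line, set up the BLACS process grid,
  ! fill a distributed random matrix and symmetrize it, then solve the
  ! full Hermitian eigenproblem with ELPA's 2-stage solver and report
  ! the wall time plus a few eigenvalues/eigenvector entries.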
  ! Initialize MPI and BLACS
  call mpi_init(err)
  call mpi_comm_rank(MPI_COMM_WORLD, rank, err)
  call mpi_comm_size(MPI_COMM_WORLD, size, err)
  ! Command-line arguments: matrix order, process-grid rows/cols, block size
  do i = 1, 4
    call get_command_argument(i, argc(i))
  end do
  call convert_to_int(argc(1), ln, err)
  call convert_to_int(argc(2), lnprow, err)
  call convert_to_int(argc(3), lnpcol, err)
  call convert_to_int(argc(4), lnbrow, err)
  lnbcol = lnbrow
  if (rank == 0) then
    print *, "ln =", ln
    print *, "lnprow =", lnprow
    print *, "lnpcol =", lnpcol
    print *, "lnbrow =", lnbrow
  end if
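
  ! Illustrative sanity check (an addition, not in the published driver):
  ! the benchmark assumes the process grid exactly covers the MPI job.
  if (lnprow*lnpcol /= size) then
    if (rank == 0) print *, "Error: lnprow*lnpcol must equal the number of MPI ranks"
    call mpi_finalize(err)
    stop
  end if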
  ! call blacs_get(0, 0, ctxt_sys)
  ! call blacs_gridinit(ctxt_sys, "C", lnprow, lnpcol)
  ! call blacs_gridinit(ctxt_sys, "R", lnprow, lnpcol)
  ! call blacs_gridinfo(ctxt_sys, lnprow, lnpcol, myrow, mycol)
  my_blacs_ctxt = MPI_COMM_WORLD
  call blacs_gridinit(my_blacs_ctxt, 'R', lnprow, lnpcol)
  call blacs_gridinfo(my_blacs_ctxt, lnprow, lnpcol, myrow, mycol)
  if (myrow .eq. -1) then
    print *, "Failed to properly initialize MPI and/or BLACS!"
    call MPI_FINALIZE(err)
    stop
  end if
  ! Explicitly get and set the row and column communicators, as the API seems to be
  ! failing to initialize them as they should
  ! NOTE: elpa_get_communicators not in ELPA 2020.05.001
  ! err = elpa_get_communicators(MPI_COMM_WORLD, myrow, mycol, mpi_comm_rows, mpi_comm_cols)
  ! Allocate my matrices now
  ml = numroc(ln, lnbrow, myrow, 0, lnprow)
  nl = numroc(ln, lnbcol, mycol, 0, lnpcol)
  llda = ml
  allocate(a_ref(ml,nl))
  allocate(a_sym(ml,nl))
  allocate(z(ml,nl))
  allocate(w(ln))
  ! allocate(desca(9))
  ! Create my blacs descriptor for transposing the matrix
  ! call descinit(desca, ln, ln, lnbrow, lnbcol, 0, 0, ctxt_sys, llda, info)
  call descinit(desca, ln, ln, lnbrow, lnbcol, 0, 0, my_blacs_ctxt, llda, info)
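
  ! numroc returns how many rows/columns of the global ln x ln matrix this
  ! rank owns under the block-cyclic layout, so each local array is ml x nl
  ! (roughly ln/lnprow x ln/lnpcol). desca(2) holds the BLACS context, which
  ! transpose_matrix passes through to pztrmr2d.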
  ! Seed the LAPACK random generator; zlarnv requires iseed(4) to be odd
  iseed(1) = myrow
  iseed(2) = mycol
  iseed(3) = mycol + myrow*lnpcol
  iseed(4) = 1
  if (iand(iseed(4), 1) == 0) then ! test the low bit for oddness
    iseed(4) = iseed(4) + 1
  end if
  ! Initialize ELPA (requesting the 20170403 API level) and allocate a solver instance
  if (elpa_init(20170403) /= ELPA_OK) then
    print *, "ELPA API not supported"
    stop
  end if
  e => elpa_allocate(err)
  if (err /= ELPA_OK) then
    print *, "elpa_allocate failed"
    stop
  end if
  numEle = ml*nl
  ! write(6,*) iseed, numEle
  ! Fill the local block of a_ref with random complex numbers
  call zlarnv(1, iseed, numEle, a_ref)
  call transpose_matrix("L", ln, a_ref, a_sym, desca)
  ! do i=1, ml
  !   do j=1, nl
  !     write(6,*) "rank=", rank, "a_ref(",i,j,")=", a_ref(i,j)
  !   end do
  ! end do
  deallocate(a_ref) ! not needed anymore; used only for constructing a_sym
  t1 = MPI_WTIME()
  ! Describe the matrix and its distribution to ELPA
  call e%set("na", ln, err)
  call e%set("nev", ln, err)
  call e%set("nblk", lnbrow, err)
  call e%set("local_nrows", ml, err)
  call e%set("local_ncols", nl, err)
  call e%set("process_row", myrow, err)
  call e%set("process_col", mycol, err)
  call e%set("mpi_comm_parent", MPI_COMM_WORLD, err)
  ! Select the 2-stage solver with the GPU kernel
  call e%set("solver", ELPA_SOLVER_2STAGE, err)
  ! call e%set("gpu", 0, err) ! no gpu offload
  call e%set("gpu", 1, err)
  ! call e%set("complex_kernel", ELPA_2STAGE_COMPLEX_AVX2_BLOCK1, err)
  ! call e%set("complex_kernel", ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, err)
  call e%set("complex_kernel", ELPA_2STAGE_COMPLEX_GPU, err)
  err = e%setup()
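  ! Illustrative error check (an addition, not in the published driver)
  if (err /= ELPA_OK) then
    print *, "ELPA setup failed, error code:", err
    stop
  end if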
  ! Solve the full eigenproblem: eigenvalues into w, eigenvectors into z
  call e%eigenvectors(a_sym, w, z, err)
  call elpa_deallocate(e)
  call elpa_uninit()
  t2 = MPI_WTIME()
  if (rank == 0) then
    print *, "ELPA time: ", t2 - t1
  end if
  ! Print a few results as a sanity check
  if (rank == 0) then
    do j = 1, ELS_TO_PRINT
      write(6,*) "w(", j, ")=", w(j), "z(", j, ")=", z(j,1)
    end do
  end if
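
  ! Note: w holds the global eigenvalues, but z is block-cyclic distributed;
  ! rank 0's local entries z(j,1) coincide with the global eigenvector
  ! components only while j stays within the first row block (j <= lnbrow).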
  ! Deallocate the matrices
  deallocate(a_sym)
  deallocate(z)
  deallocate(w)
  ! deallocate(desca)
  ! Finish: blacs_exit(0) also finalizes MPI, so no explicit MPI_FINALIZE is needed
  ! call MPI_FINALIZE(err)
  call blacs_gridexit(my_blacs_ctxt)
  call blacs_exit(0)
end program benchmark_elpa