-
Notifications
You must be signed in to change notification settings - Fork 0
/
pinned_dt.f90
119 lines (100 loc) · 3.85 KB
/
pinned_dt.f90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
!!
!! This code is based entirely on the following NVIDIA blog post:
!! https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-fortran/
!!
program pinned_dt
  ! Compares host<->device transfer bandwidth for pageable and pinned host
  ! arrays. Each direction is timed with CUDA events, the result is printed
  ! in GB/s, and the round-tripped data is compared against the source array.
  ! Any failing CUDA runtime call is reported and terminates the program,
  ! so a bandwidth figure is never computed from an invalid timing.
  use cudafor
  implicit none

  integer, parameter :: nElements = 40*1024*1024

  ! host arrays
  ! pageable
  real :: a_pag(nElements), b_pag(nElements)
  ! pinned
  real, allocatable, pinned :: a_pin(:), b_pin(:)

  ! device arrays
  real, device :: a_d(nElements)

  ! cuda events for timing
  type(cudaEvent) :: startEvent, stopEvent

  ! other variables
  type(cudaDeviceProp) :: prop
  real :: time
  integer :: istat, i
  integer :: nBytes        ! size in bytes of one transferred array
  logical :: pinnedFlag

  ! Derive the transfer size from the element type of the arrays instead of
  ! hard-coding 4 bytes per element (storage_size returns bits).
  nBytes = nElements * (storage_size(a_pag) / 8)

  ! initialize the pageable arrays
  do i = 1, nElements
    a_pag(i) = real(i)
  enddo
  b_pag = 0.0

  ! Allocate the pinned arrays. pinned= reports whether the memory really
  ! was page-locked; if not, the pinned benchmark below is skipped.
  allocate(a_pin(nElements), b_pin(nElements), &
           stat=istat, pinned=pinnedFlag)
  if (istat /= 0) then
    write(*,*) 'Allocation of pinned arrays failed'
    pinnedFlag = .false.
  else
    if (.not. pinnedFlag) write(*,*) 'Pinned allocation failed'
  endif

  if (pinnedFlag) then
    a_pin = a_pag
    b_pin = 0.0
  endif

  call checkCuda(cudaEventCreate(startEvent), 'cudaEventCreate(startEvent)')
  call checkCuda(cudaEventCreate(stopEvent), 'cudaEventCreate(stopEvent)')

  ! get some device props
  call checkCuda(cudaGetDeviceProperties(prop, 0), 'cudaGetDeviceProperties')

  write(*,*)
  write(*,*) 'Device: ', trim(prop%name)
  write(*,*) 'Transfer size (MB): ', nBytes / 1024. / 1024.

  ! do pageable data transfer
  write(*,*)
  write(*,*) 'Pageable transfers'

  ! host to device
  call startTimer()
  a_d = a_pag
  call stopTimerAndReport(' Host to Device bandwidth (GB/s): ')

  ! device to host
  call startTimer()
  b_pag = a_d
  call stopTimerAndReport(' Device to Host bandwidth (GB/s): ')

  ! check errors: the round trip must reproduce the source exactly
  if (any(a_pag /= b_pag)) then
    write(*,*) '*** Pageable transfers failed ***'
  endif

  ! do pinned data transfers
  if (pinnedFlag) then
    write(*,*)
    write(*,*) 'Pinned transfers'

    ! host to device
    call startTimer()
    a_d = a_pin
    call stopTimerAndReport(' Host to Device bandwidth (GB/s): ')

    ! device to host
    call startTimer()
    b_pin = a_d
    call stopTimerAndReport(' Device to Host bandwidth (GB/s): ')

    ! check errors
    if (any(a_pin /= b_pin)) then
      write(*,*) '*** Pinned transfers failed ***'
    endif
  endif

  ! cleanup
  if (allocated(a_pin)) deallocate(a_pin)
  if (allocated(b_pin)) deallocate(b_pin)
  call checkCuda(cudaEventDestroy(startEvent), 'cudaEventDestroy(startEvent)')
  call checkCuda(cudaEventDestroy(stopEvent), 'cudaEventDestroy(stopEvent)')

contains

  ! Stop with a diagnostic if a CUDA runtime call did not return cudaSuccess.
  !   status : return code of the CUDA runtime call
  !   what   : description of the call, used in the error message
  subroutine checkCuda(status, what)
    integer, intent(in) :: status
    character(len=*), intent(in) :: what

    if (status /= cudaSuccess) then
      write(*,*) 'CUDA error in ', what, ': ', trim(cudaGetErrorString(status))
      stop 1
    endif
  end subroutine checkCuda

  ! Record the start event on stream 0, marking the start of a timed region.
  subroutine startTimer()
    call checkCuda(cudaEventRecord(startEvent, 0), 'cudaEventRecord(startEvent)')
  end subroutine startTimer

  ! Record the stop event on stream 0, wait for it, and print the bandwidth
  ! of the region opened by the preceding startTimer call.
  !   label : text printed in front of the bandwidth value
  subroutine stopTimerAndReport(label)
    character(len=*), intent(in) :: label

    call checkCuda(cudaEventRecord(stopEvent, 0), 'cudaEventRecord(stopEvent)')
    call checkCuda(cudaEventSynchronize(stopEvent), 'cudaEventSynchronize')
    call checkCuda(cudaEventElapsedTime(time, startEvent, stopEvent), &
                   'cudaEventElapsedTime')
    ! the code treats the elapsed time as milliseconds, so
    ! bytes * 1e-6 / time is GB/s
    write(*,*) label, nBytes*1e-6/time
  end subroutine stopTimerAndReport

end program pinned_dt