; x86-methods2.asm (forked from travisdowns/uarch-bench)
%include "x86-helpers.asm"
nasm_util_assert_boilerplate
thunk_boilerplate
; segregate a particular benchmark here if you want to repeatedly compile different versions of it quickly
%ifndef UNROLLB
;%warning UNROLLB defaulting to 1
%define UNROLLB 1
%endif
%ifndef UNROLLX
;%warning UNROLLX defaulting to 1
%define UNROLLX 1
%else
;%warning UNROLLX defined externally
%endif
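; both can be overridden at assembly time, e.g. (a sketch for standalone
; experiments; the exact flags and include paths depend on your build setup):
;   nasm -f elf64 -DUNROLLB=8 -DUNROLLX=4 x86-methods2.asm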
;; do a loop over the first half of all the cache lines, then loop
;; over the second half
;; performance is crap
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
define_bench bandwidth_test256_2loops
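; rdi = outer iteration count, rsi = pointer to a region descriptor whose
; .start/.size members give the buffer base and its length in bytes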
mov rdx, [rsi + region.size]
mov rsi, [rsi + region.start]
.top:
mov rax, rdx
mov rcx, rsi
lfence
vpxor ymm0, ymm0, ymm0
vpxor ymm1, ymm1, ymm1
vpxor ymm2, ymm2, ymm2
.inner:
mov r9, rcx
lea r8, [rcx + UNROLLB * 64]
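; first pass: read the low 32 bytes of each of the next UNROLLB lines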
.firsttop:
vpaddb ymm0, ymm0, [rcx]
add rcx, 64
cmp rcx, r8
jb .firsttop
lea rcx, [r9 + 32]
lea r8, [rcx + UNROLLB * 64]
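; second pass: read the high 32 bytes of the same UNROLLB lines, two lines per iteration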
.secondtop:
vpaddb ymm1, ymm1, [rcx]
vpaddb ymm2, ymm2, [rcx + 64]
add rcx, 128
cmp rcx, r8
jb .secondtop
mov rcx, r9
add rcx, UNROLLB * 64
sub rax, UNROLLB * 64
jge .inner
dec rdi
jnz .top
ret
;; Interleaved 2-pass
;; the main loop does UNROLLB first half reads
;; then does UNROLLB second half reads hopefully finding the line in L1
;; OK but very jaggy performance
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
define_bench bandwidth_test256_2pass
mov rdx, [rsi + region.size]
mov rsi, [rsi + region.start]
.top:
mov rax, rdx
mov rcx, rsi
lfence
vpxor ymm0, ymm0, ymm0
vpxor ymm1, ymm1, ymm1
.inner:
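; first pass: low 32 bytes of each of the next UNROLLB lines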
%assign offset 0
%rep UNROLLB
vpaddb ymm0, ymm0, [rcx + offset]
%assign offset (offset + 64)
%endrep
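; second pass: high 32 bytes of the same lines, two per %rep step (odd tail handled below)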
%assign offset 32
%rep UNROLLB/2
vpaddb ymm0, ymm0, [rcx + offset]
vpaddb ymm1, ymm1, [rcx + offset + 64]
%assign offset (offset + 128)
%endrep
%if (UNROLLB % 2 == 1)
vpaddb ymm0, ymm0, [rcx + offset]
%endif
add rcx, UNROLLB * 64
sub rax, UNROLLB * 64
jge .inner
dec rdi
jnz .top
ret
;; the main loop interleaves in a fine-grained way an initial access
;; to a cache line, and then the second access to the cache line with
;; the former running UNROLLB lines ahead of the latter (once UNROLLB
;; gets to about 5 or 6 it seems the second access hits in L1 and max
;; speed is achieved) - good and very flat performance approaching
;; exactly 1.5 cycles/line
;;
;; UNROLLX must be even for it to work properly (to "pair up" the reads hitting L1)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
define_bench bandwidth_test256i
mov rdx, [rsi + region.size]
mov rsi, [rsi + region.start]
.top:
mov rax, rdx
sub rax, UNROLLB * 64 ; reduce main loop iterations since the intro/outro parts handle this
mov rcx, rsi
lfence
vpxor ymm0, ymm0, ymm0
vpxor ymm1, ymm1, ymm1
vpxor ymm2, ymm2, ymm2
; lead-in loop which reads the first half of the first UNROLLB cache lines
%assign offset 0
%rep UNROLLB
vpaddb ymm0, ymm0, [rcx + offset]
%assign offset (offset + 64)
%endrep
.inner:
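; lookahead pass: low halves of the UNROLLX lines sitting UNROLLB lines ahead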
%assign offset 0
%rep UNROLLX
vpaddb ymm0, ymm0, [rcx + offset + UNROLLB * 64]
%assign offset (offset + 64)
%endrep
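; trailing pass: high halves of the current UNROLLX lines, whose low halves
; were read UNROLLB lines earlier (so they should now be in L1)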
%assign offset 0
%rep UNROLLX
vpaddb ymm1, ymm1, [rcx + offset + 32]
%assign offset (offset + 64)
%endrep
add rcx, UNROLLX * 64
sub rax, UNROLLX * 64
jge .inner
; lead-out loop to read the remaining second halves of the last UNROLLB lines
%assign offset 32
%rep UNROLLB
vpaddb ymm0, ymm0, [rcx + offset]
%assign offset (offset + 64)
%endrep
dec rdi
jnz .top
ret
define_bench movd_xmm
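; dependent GP<->XMM round trip: vpor keeps the chain in the vector domain,
; then movd bounces the low dword out to eax and back, with a clean upper
; state thanks to vzeroall; presumably a latency test of the movd pair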
vzeroall
.top:
%rep 100
vpor xmm0, xmm0, xmm0
movd eax, xmm0
movd xmm0, eax
%endrep
dec rdi
jnz .top
ret
define_bench movd_ymm
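; same movd round trip, but the 256-bit vpor dirties the upper lanes, so the
; legacy-encoded movd runs with a dirty upper state; presumably intended to
; expose any SSE/AVX transition or merge cost relative to movd_xmm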
vzeroupper
vpor ymm0, ymm0, ymm0
.top:
%rep 100
vpor ymm0, ymm0, ymm0
movd eax, xmm0
movd xmm0, eax
%endrep
dec rdi
jnz .top
ret
define_bench rep_movsb
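; NB: despite the name, the body below uses rep stosb to fill a 1024-byte
; stack buffer on every inner iteration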
sub rsp, 1024
mov r8, rdi
.top:
%rep 100
mov ecx, 1024
mov rdi, rsp
rep stosb
%endrep
dec r8
jnz .top
add rsp, 1024
ret