-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.c
150 lines (138 loc) · 4.08 KB
/
test.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#include <string.h>
#include <time.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <immintrin.h>
#define VEC_SIZE 32
/*
 * Compute the length of the NUL-terminated string `s` with SSE2.
 *
 * Returns the number of bytes that precede the first '\0' byte, or 0 when
 * `s` is NULL.
 *
 * All loads are 16-byte aligned. An aligned 16-byte load can never straddle
 * a page boundary, so scanning a chunk that contains at least one byte of the
 * string cannot fault even though it may read bytes outside the string. An
 * unaligned load starting at `s` does not have that guarantee: it can run
 * past the terminator into an unmapped page.
 */
size_t _strlen_sse(const char *s)
{
    if (__builtin_expect(s == NULL, 0))
        return 0;

    const __m128i zero = _mm_setzero_si128();
    const uintptr_t addr = (uintptr_t)s;
    const unsigned misalignment = (unsigned)(addr & 15);

    /* Start of the 16-byte aligned chunk that contains the first byte of s. */
    const char *chunk = (const char *)(addr - misalignment);

    /*
     * First chunk: the low `misalignment` bits of the mask describe bytes
     * located before `s`. Shifting them out both discards those matches and
     * rebases the remaining bit positions so that bit 0 corresponds to s[0].
     */
    unsigned mask = (unsigned)_mm_movemask_epi8(
        _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)chunk), zero));
    mask >>= misalignment;
    if (mask)
        return (size_t)__builtin_ctz(mask);

    /* Remaining chunks all lie entirely after `s`, so no masking is needed. */
    for (;;) {
        chunk += 16;
        mask = (unsigned)_mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)chunk), zero));
        if (mask)
            return (size_t)(chunk - s) + (size_t)__builtin_ctz(mask);
    }
}
#define SIMPLV_USE_SSE
#include "includes/simpl.h"
/*
 * Compute the length of the NUL-terminated string `s` using the VEC_SIZE-byte
 * vector helpers pulled in from includes/simpl.h.
 *
 * Returns the number of bytes that precede the first '\0' byte, or 0 when
 * `s` is NULL (same convention as _strlen_sse).
 *
 * The first vector is loaded directly from `s`; afterwards the cursor is
 * advanced to the next VEC_SIZE boundary and then moves in VEC_SIZE steps.
 *
 * NOTE(review): the first load starts at a possibly unaligned address, so it
 * may read past the terminator across a page boundary. Fixing that needs an
 * aligned-load helper from simpl.h, which is not visible here -- confirm and
 * follow up.
 * NOTE(review): the previous version had an unreachable `_mm256_zeroupper()`
 * and `return 0` after the infinite loop; they never executed and were
 * dropped. If an upper-half reset is actually required, it has to be issued
 * before the return inside the loop -- confirm against simpl.h.
 */
size_t _strlen_vec(const char *s)
{
    if (__builtin_expect(s == NULL, 0))
        return 0;

    vec zero = v256b_set1_char(0);
    const char *ptr = s;

    /*
     * Distance from s to the next VEC_SIZE boundary. When s is already
     * aligned this equals VEC_SIZE, i.e. a regular full step.
     */
    size_t step = VEC_SIZE - ((uintptr_t)s & (VEC_SIZE - 1));

    for (;;) {
        vec data = v256b_loadu((const uvec *)ptr);
        vec cmp = v32c_cmpeq(data, zero);
        /* Used below as one bit per byte, lowest bit = lowest address. */
        int mask = v32c_movemask(cmp);

        if (mask != 0)
            return (size_t)(ptr - s) + (size_t)__builtin_ctz((unsigned)mask);

        ptr += step;
        step = VEC_SIZE; /* every step after the first is a full vector */
    }
}
/* size_t _strlen_avx(const char *s) */
/* { */
/* const char *ptr = s; */
/* const __m256i zero = _mm256_setzero_si256(); */
/* size_t offset = (uintptr_t)ptr & (VEC_SIZE - 1); */
/* */
/* if (offset != 0) */
/* { */
/* size_t align_size = VEC_SIZE - offset; */
/* __m256i data = _mm256_loadu_si256((const __m256i *)ptr); */
/* __m256i cmp = _mm256_cmpeq_epi8(data, zero); */
/* int mask = _mm256_movemask_epi8(cmp); */
/* if (mask != 0) */
/* { */
/* return (size_t)__builtin_ctz(mask); */
/* } */
/* ptr += align_size; */
/* } */
/* */
/* while (1) */
/* { */
/* __m256i data = _mm256_loadu_si256((const __m256i *)ptr); */
/* __m256i cmp = _mm256_cmpeq_epi8(data, zero); */
/* int mask = _mm256_movemask_epi8(cmp); */
/* if (mask != 0) */
/* { */
/* return (size_t)(ptr - s) + (size_t)__builtin_ctz(mask); */
/* } */
/* ptr += VEC_SIZE; */
/* } */
/* */
/* return 0; */
/* } */
/* Seconds elapsed between two clock_gettime() samples. */
static double elapsed_seconds(const struct timespec *start,
                              const struct timespec *end)
{
    return (double)(end->tv_sec - start->tv_sec) +
           (double)(end->tv_nsec - start->tv_nsec) / 1e9;
}

/*
 * Benchmark driver: times `iterations` calls of _strlen_vec and _strlen_sse
 * on a 1 GiB string and prints the elapsed time and the measured length for
 * each implementation.
 *
 * Returns EXIT_FAILURE when the buffer cannot be allocated, EXIT_SUCCESS
 * otherwise.
 */
int main(void)
{
    const size_t iterations = 100;
    const size_t buf_size = (size_t)1024 * 1024 * 1024;
    struct timespec start, end;

    char *str = malloc(buf_size);
    if (str == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    /*
     * Fill with a non-zero byte and terminate explicitly. Without the
     * terminator both strlen implementations would scan past the end of the
     * allocation.
     */
    memset(str, 1, buf_size - 1);
    str[buf_size - 1] = '\0';

    size_t size = 0;
    size_t size_real = 0;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (size_t i = 0; i < iterations; i++) {
        size = _strlen_vec(str);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    printf("vec: %f seconds\n", elapsed_seconds(&start, &end));

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (size_t i = 0; i < iterations; i++) {
        size_real = _strlen_sse(str);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    printf("sse: %f seconds\n", elapsed_seconds(&start, &end));

    /* Lengths are size_t, so print them with %zu rather than %d. */
    printf("%zu\n", size);
    printf("%zu\n", size_real);

    free(str);
    return EXIT_SUCCESS;
}