-
Notifications
You must be signed in to change notification settings - Fork 12
/
ctfs_cpy.c
executable file
·130 lines (119 loc) · 2.62 KB
/
ctfs_cpy.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#include "ctfs_runtime.h"
/* Granularity (in bytes) used for alignment and write-back in this file. */
#define FLUSH_ALIGN ((uint64_t)64)
/* Mask selecting the offset of an address within one FLUSH_ALIGN unit. */
#define ALIGN_MASK (FLUSH_ALIGN - 1)
/*
 * avx_cpy - copy `size` bytes from `src` to `dest`.
 *
 * The copy runs forward in three phases:
 *   1. head: bytes are copied one at a time until `dest` reaches a
 *      FLUSH_ALIGN (64-byte) boundary, then cache_wb_one() is called on
 *      the line that received them;
 *   2. body: whole 64-byte blocks are moved with an unaligned 512-bit
 *      load and a non-temporal 512-bit store;
 *   3. tail: the remaining (<64) bytes are copied one at a time and
 *      cache_wb_one() is called on the line that received them.
 *
 * NOTE(review): cache_wb_one() is defined outside this file; it is
 * presumed to write back the cache line containing the given address.
 * This function issues no store fence itself, so callers that need
 * ordering of the non-temporal stores must fence -- confirm at call sites.
 *
 * @dest: destination buffer (any alignment)
 * @src:  source buffer (any alignment)
 * @size: number of bytes to copy
 */
inline void avx_cpy(void *dest, const void *src, size_t size)
{
    uint8_t *d = dest;
    const uint8_t *s = src;

    /* Phase 1: copy up to the next FLUSH_ALIGN boundary. */
    size_t head = (uint64_t)d & ALIGN_MASK;
    if (unlikely(head > 0))
    {
        /*
         * Remember where the head starts: every head byte lives in this
         * cache line, whereas `d` will point at the NEXT line once the
         * loop below has reached the boundary.
         */
        void *head_line = d;

        head = FLUSH_ALIGN - head;
        /* never copy more than `size` bytes */
        if (head > size) {
            head = size;
        }
        size -= head;

        while (head > 0) {
            *d++ = *s++;
            head--;
        }
        cache_wb_one(head_line);

        if (size == 0) {
            return;
        }
    }
    assert((uint64_t)d % FLUSH_ALIGN == 0);

    /* Phase 2: 64-byte blocks via non-temporal stores (dest is aligned). */
    while (size >= FLUSH_ALIGN) {
        __m512i block = _mm512_loadu_si512((const void *)s);
        _mm512_stream_si512((void *)d, block);
        d += FLUSH_ALIGN;
        s += FLUSH_ALIGN;
        size -= FLUSH_ALIGN;
    }

    /* Phase 3: fewer than 64 bytes remain; they all fall in one line. */
    if (unlikely(size != 0))
    {
        while (size > 0) {
            *d++ = *s++;
            size--;
        }
        /* d is one past the last byte written; step back into its line. */
        cache_wb_one((void *)(d - 1));
    }
}
/*
 * avx_cpyt - copy `size` bytes from `src` to `dest` using unaligned
 * 512-bit loads and ordinary unaligned 512-bit stores.
 *
 * The copy runs forward: first in 512-byte chunks (eight 64-byte moves
 * each), then in single 64-byte blocks, and finally byte by byte for the
 * remaining (<64) bytes. No alignment is required of either pointer, and
 * no cache write-back is performed here.
 *
 * @dest: destination buffer
 * @src:  source buffer
 * @size: number of bytes to copy
 */
inline void avx_cpyt(void *dest, void *src, size_t size)
{
    uint8_t *out = dest;
    uint8_t *in = src;
    __m512i chunk;

    /* 512 bytes per iteration, moved as eight consecutive 64-byte blocks. */
    for (; size >= 512; size -= 512, out += 512, in += 512) {
        for (size_t off = 0; off < 512; off += 64) {
            chunk = _mm512_loadu_si512((void *)(in + off));
            _mm512_storeu_si512((void *)(out + off), chunk);
        }
    }

    /* Whatever whole 64-byte blocks are left. */
    for (; size >= 64; size -= 64, out += 64, in += 64) {
        chunk = _mm512_loadu_si512((void *)in);
        _mm512_storeu_si512((void *)out, chunk);
    }

    /* Byte-wise tail: at this point fewer than 64 bytes remain. */
    if (unlikely(size != 0))
    {
        do {
            *out++ = *in++;
        } while (--size != 0);
    }
}