forked from rordenlab/MRIcroGL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscale2uint8.cpp
127 lines (119 loc) · 4 KB
/
scale2uint8.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
// g++ -c -O3 scale2uint8.cpp -o scale2uint8.o
#include <stdio.h>
#ifdef __x86_64__
#include <immintrin.h>
#else
#include "sse2neon.h"
#endif
#include <math.h>
#ifndef MAX
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#endif
#ifndef MIN
#define MIN(A,B) ((A) > (B) ? (B) : (A))
#endif
#define kSSE 4 //128-bit SSE handles 4 32-bit floats per instruction
void f32_i8sse(float *in32, uint8_t *out8, int64_t n, float mn, float mx) {
//bankers' rounding
float mn0 = mn;
if (mn == mx) mn0 = mn - 1.0E-4;
float slope1 = 255.0 / fabs(mx - mn0);
float intercept1 = -MIN(mx, mn0) * slope1;
float * vin = (float *)in32;
__m128i * vout = (__m128i *)out8;
__m128 intercept = _mm_set1_ps(intercept1);
__m128 slope = _mm_set1_ps(slope1);
#define kStep 16
if (n >= kStep) {
for (int64_t i = 0; i <= (n-kStep); i+=kStep) {
//we will save 16 bytes (128 bits) per iteration
__m128 v4, s0, s1, s2, s3;
__m128i i0, i1, i01, i23;
v4 = _mm_load_ps(vin); //read 4*float32
s0 = _mm_fmadd_ps (v4, slope, intercept);
vin += kSSE; //scaled 4 floats
v4 = _mm_load_ps(vin); //read 4*float32
s1 = _mm_fmadd_ps (v4, slope, intercept);
vin += kSSE; //scaled 4 floats
i0 = _mm_cvtps_epi32(s0); //4*float32 -> 4*int32
i1 = _mm_cvtps_epi32(s1); //4*float32 -> 4*int32
i01 = _mm_packs_epi32 (i0, i1); //4*int32,4xint32 -> 8*int16
v4 = _mm_load_ps(vin); //read 4*float32
s0 = _mm_fmadd_ps (v4, slope, intercept);
vin += kSSE; //scaled 4 floats
v4 = _mm_load_ps(vin); //read 4*float32
s1 = _mm_fmadd_ps (v4, slope, intercept);
vin += kSSE; //scaled 4 floats
i0 = _mm_cvtps_epi32(s0); //4*float32 -> 4*int32
i1 = _mm_cvtps_epi32(s1); //4*float32 -> 4*int32
i23 = _mm_packs_epi32 (i0, i1); //4*int32,4xint32 -> 8*int16
__m128i i01234 = _mm_packus_epi16 (i01, i23); // 8*int16,8*int16 -> 16*uint8
_mm_store_si128(vout, i01234);
vout += 1; //store 128bits (16*uint8)
}
}
int tail = (n % kStep);
while (tail > 0) {
int i = n - tail;
float v = (in32[i] * slope1) + intercept1;
v = MIN(255.0,v);
v = MAX(0.0, v);
out8[i] = lrint(v);
tail --;
}
}
void i16_i8sse(int16_t *in16, uint8_t *out8, int64_t n, float mn, float mx) {
//bankers' rounding
float mn0 = mn;
if (mn == mx) mn0 = mn - 1.0E-4;
float slope1 = 255.0 / fabs(mx - mn0);
float intercept1 = -MIN(mx, mn0) * slope1;
__m128i * vin = (__m128i *)in16;
__m128i * vout = (__m128i *)out8;
__m128 intercept = _mm_set1_ps(intercept1);
__m128 slope = _mm_set1_ps(slope1);
#define kStep 16
if (n >= kStep) {
for (int64_t i = 0; i <= (n-kStep); i+=kStep) {
//we will save 16 bytes (128 bits) per iteration
__m128 v4, s0, s1, s2, s3;
__m128i i0, i1, i01, i23, i16x8, i16x8s, i16x4;
i16x8 = _mm_load_si128(vin);
i16x8s = _mm_shuffle_epi_1032(i16x8);
i16x4 = _mm_cvtepi16_epi32(i16x8);
v4 = _mm_cvtepi32_ps(i16x4);
s0 = _mm_fmadd_ps (v4, slope, intercept);
i16x4 = _mm_cvtepi16_epi32(i16x8s);
v4 = _mm_cvtepi32_ps(i16x4);
s1 = _mm_fmadd_ps (v4, slope, intercept);
i0 = _mm_cvtps_epi32(s0); //4*float32 -> 4*int32
i1 = _mm_cvtps_epi32(s1); //4*float32 -> 4*int32
i01 = _mm_packs_epi32 (i0, i1); //4*int32,4xint32 -> 8*int16
vin += 1; //done reading 8xint16
i16x8 = _mm_load_si128(vin);
i16x8s = _mm_shuffle_epi_1032(i16x8);
i16x4 = _mm_cvtepi16_epi32(i16x8);
v4 = _mm_cvtepi32_ps(i16x4);
s0 = _mm_fmadd_ps (v4, slope, intercept);
i16x4 = _mm_cvtepi16_epi32(i16x8s);
v4 = _mm_cvtepi32_ps(i16x4);
s1 = _mm_fmadd_ps (v4, slope, intercept);
i0 = _mm_cvtps_epi32(s0); //4*float32 -> 4*int32
i1 = _mm_cvtps_epi32(s1); //4*float32 -> 4*int32
i23 = _mm_packs_epi32 (i0, i1); //4*int32,4xint32 -> 8*int16
vin += 1; //done reading 8xint16
__m128i i01234 = _mm_packus_epi16 (i01, i23); // 8*int16,8*int16 -> 16*uint8
_mm_store_si128(vout, i01234);
vout += 1; //store 128bits (16*uint8)
}
}
int tail = (n % kStep);
while (tail > 0) {
int i = n - tail;
float v = (in16[i] * slope1) + intercept1;
v = MIN(255.0,v);
v = MAX(0.0, v);
out8[i] = lrint(v);
tail --;
}
}