forked from microsoft/go-mssqldb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ucs22str.go
151 lines (121 loc) · 4.84 KB
/
ucs22str.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
//go:build !386 && !arm && !mips && !mipsle
// +build !386,!arm,!mips,!mipsle
package mssql
import (
"fmt"
"reflect"
"unicode/utf16"
"unsafe"
)
// ucs22str decodes s, a buffer of 16 bit UCS-2/UTF-16 code units, into a Go
// string. It returns an error when len(s) is odd.
//
// Pure ASCII input is handled on a fast path that validates 8, then 4, then 2
// input bytes at a time while dropping the zero high byte of every code unit.
// As soon as a non ASCII code unit is found the whole input is decoded with
// utf16.Decode instead.
//
// NOTE(review): multi-byte values are loaded straight from s in native byte
// order and the low byte of each code unit is taken as the character. This
// looks correct on little-endian targets only — confirm for any big-endian
// target the build constraints allow.
func ucs22str(s []byte) (string, error) {
	if len(s)%2 != 0 {
		return "", fmt.Errorf("illegal UCS2 string length: %d", len(s))
	}

	// Allocate a buffer which we optimistically copy ascii into as we validate.
	buf := make([]byte, len(s)/2)
	useFastPath := true

	// Number of input bytes covered by whole 8 byte chunks. Only the low three
	// bits are cleared: masking with 0xFFFFFFF8 would also drop bits 32 and up,
	// so on platforms with a 64 bit int an input of 4 GiB or more would be only
	// partially converted.
	nlen8 := len(s) &^ 7

	// Our read and write offsets into the source and destination buffers.
	var (
		readIndex  int
		writeIndex int
	)

	// Step through the input in 8 byte chunks.
	for readIndex = 0; readIndex < nlen8; readIndex += 8 {
		// Load 8 bytes (4 code units) at once. readIndex+8 <= len(s) holds here.
		ui64 := *(*uint64)(unsafe.Pointer(&s[readIndex]))

		// Mask the entire 64 bit region and check for
		//  1) high bytes of a code unit that are non zero
		//  2) low bytes of a code unit with their high bit set
		// mask64 is declared elsewhere in the package; the pattern for this
		// check is FF80 repeated for every code unit.
		if ui64&mask64 > 0 {
			// Not an ascii string: take the slow path for the whole input.
			useFastPath = false
			break
		}

		// Pack the 4 character bytes together, removing the empty high bytes.
		var ui32 uint32
		ui32 |= uint32(byte(ui64))
		ui64 >>= 8
		ui32 |= uint32(uint16(ui64))
		ui64 >>= 8
		ui32 |= uint32(ui64 & 0xFF0000)
		ui64 >>= 8
		ui32 |= uint32(ui64 & 0xFF000000)

		// Write the packed value and step forward four bytes in the
		// destination buffer.
		*(*uint32)(unsafe.Pointer(&buf[writeIndex])) = ui32
		writeIndex += 4
	}

	// All available 8 byte chunks are done, so at most 6 bytes remain.
	// Handle a remaining 32 bit region (2 code units) the same way.
	if useFastPath && len(s)-readIndex >= 4 {
		ui32 := *(*uint32)(unsafe.Pointer(&s[readIndex]))
		if ui32&mask32 > 0 {
			// Non ascii text found: fall back to the slow path.
			useFastPath = false
		} else {
			// Pack the two character bytes into a single 16 bit value.
			var ui16 uint16
			ui16 |= uint16(byte(ui32))
			ui32 >>= 8
			ui16 |= uint16(ui32)
			*(*uint16)(unsafe.Pointer(&buf[writeIndex])) = ui16

			// Step forward the read and write positions.
			readIndex += 4
			writeIndex += 2
		}
	}

	if useFastPath {
		// The input length is even, so either nothing or exactly one code
		// unit (2 bytes) is left at this point.
		if len(s)-readIndex < 2 {
			// Everything was ascii. Reinterpret buf's slice header as a string
			// header so the result shares buf's memory without another copy.
			return *(*string)(unsafe.Pointer(&buf)), nil
		}

		ui16 := *(*uint16)(unsafe.Pointer(&s[readIndex]))
		if ui16&mask16 == 0 {
			// Pull out the low byte; the entire input has now been validated
			// and copied.
			buf[writeIndex] = byte(ui16 & 0xFF)
			return *(*string)(unsafe.Pointer(&buf)), nil
		}
	}

	// One of the checks above found a non ascii value: either a high bit set
	// in a character byte or a non zero high byte. Fall back to a slower
	// conversion.
	//
	// The underlying array of s is viewed as a []uint16 without copying, which
	// is fine because utf16.Decode allocates a new buffer and only reads its
	// input. A real uint16 slice variable is declared so that the compiler can
	// keep track of the underlying memory and the GC does not collect it
	// prematurely.
	var uint16slice []uint16
	uint16Header := (*reflect.SliceHeader)(unsafe.Pointer(&uint16slice))
	sourceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&s))
	uint16Header.Data = sourceHeader.Data

	// It is important to reference s after the assignment of the Data pointer
	// so that s stays alive until uint16slice references the same data.
	uint16Header.Len = len(s) / 2       // half the length in bytes
	uint16Header.Cap = uint16Header.Len // capacity matches the length

	// Decode the uint16s as utf-16 and return a string.
	return string(utf16.Decode(uint16slice)), nil
}