forked from gurnec/HashCheck
-
Notifications
You must be signed in to change notification settings - Fork 2
/
UnicodeHelpers.c
105 lines (85 loc) · 2.74 KB
/
UnicodeHelpers.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/**
* UTF-16/UTF-8 Helper Functions
* Copyright (C) Kai Liu. All rights reserved.
*
* Please refer to readme.txt for information about this source code.
* Please refer to license.txt for details about distribution and modification.
**/
#include "UnicodeHelpers.h"
#include "libs/BitwiseIntrinsics.h"
// Load a WORD- or DWORD-sized value directly from the address p by casting p
// to UPWORD / UPDWORD and dereferencing it. Both pointer types are declared
// outside this file (presumably unaligned-access pointer typedefs -- TODO
// confirm in the project headers). Because this is a plain memory load, the
// bytes are combined in the host's byte order, and the macro reads 2 or 4
// bytes regardless of how many valid bytes actually remain at p.
#define GETWORD(p) (*((UPWORD) (p)))
#define GETDWORD(p) (*((UPDWORD)(p)))
/**
 * IsTextUTF8: Sniffs a NUL-terminated byte string to decide whether it can be
 * treated as UTF-8.
 *
 * pbData - pointer to the first byte of a NUL-terminated buffer
 *
 * Returns:
 *  - pbData + 3 if the buffer begins with the UTF-8 BOM (EF BB BF); the rest
 *    of the buffer is NOT validated in this case
 *  - pbData if there is no BOM and every byte up to the terminator is either
 *    plain ASCII or part of a structurally valid 2-, 3- or 4-byte sequence
 *    (this includes the empty string and pure-ASCII text)
 *  - NULL if a malformed or truncated sequence is encountered
 *
 * Every byte is examined individually and the comparisons short-circuit, so
 * a failed check on a terminating NUL (which can never look like a
 * continuation byte) stops the inspection of that sequence; nothing past the
 * terminator is ever read, and the result does not depend on host byte order.
 **/
PBYTE __fastcall IsTextUTF8( PBYTE pbData )
{
    PBYTE pbOriginal = pbData;

    // Step 1: Check for the BOM (signature)
    if (pbData[0] == 0xEF && pbData[1] == 0xBB && pbData[2] == 0xBF)
        return(pbData + 3);

    // Step 2: If there is no BOM, then manually walk the string and sniff
    // We check for validity by masking out the data bits and testing if the
    // remaining bits match a valid control sequence... this is not a perfect
    // check, as it will pass overlong encodings (as well as encoded surrogates
    // and 4-byte values beyond U+10FFFF), since the cost of being pedantic is
    // not worth it given the lack of substantial benefit.
    while (*pbData)
    {
        if (pbData[0] < 0x80)
        {
            pbData += 1; // Plain ASCII
        }
        else if ( (pbData[0] & 0xE0) == 0xC0 &&
                  (pbData[1] & 0xC0) == 0x80 )
        {
            pbData += 2; // Valid 2-byte UTF-8 sequence
        }
        else if ( (pbData[0] & 0xF0) == 0xE0 &&
                  (pbData[1] & 0xC0) == 0x80 &&
                  (pbData[2] & 0xC0) == 0x80 )
        {
            pbData += 3; // Valid 3-byte UTF-8 sequence
        }
        else if ( (pbData[0] & 0xF8) == 0xF0 &&
                  (pbData[1] & 0xC0) == 0x80 &&
                  (pbData[2] & 0xC0) == 0x80 &&
                  (pbData[3] & 0xC0) == 0x80 )
        {
            pbData += 4; // Valid 4-byte UTF-8 sequence (outside of UTF-16 range)
        }
        else
        {
            // Stray continuation byte, invalid lead byte, or a sequence that
            // was cut short (possibly by the terminator)
            return(NULL);
        }
    }

    return(pbOriginal);
}
/**
 * BufferToWStr: Produces a wide-character view of a text buffer whose encoding
 * is not known in advance.
 *
 * ppbData - in/out: address of the pointer to the buffer to examine
 * cbData  - size of that buffer, in bytes
 *
 * Outcomes:
 *  - Buffer detected as UTF-16: the data is byte-swapped in place when the
 *    reverse-order flags are set, and the returned pointer addresses the
 *    ORIGINAL buffer (one WCHAR further in if a BOM was flagged). *ppbData is
 *    left unchanged.
 *  - Otherwise a new buffer of (cbData + 1) WCHARs is allocated and the text
 *    is converted into it, as UTF-8 when IsTextUTF8 accepts it and with the
 *    ANSI code page when it does not. On success the original buffer is
 *    freed, *ppbData is replaced by the new buffer, and the new buffer is
 *    returned.
 *  - Allocation or conversion failure: NULL is returned and *ppbData and the
 *    original buffer are left as they were.
 *
 * NOTE(review): IS_TEXT_UNICODE, IS_TEXT_BOM, SwapA16I and StrToWStrEx are
 * defined outside this file; their exact semantics should be confirmed there.
 **/
PWSTR __fastcall BufferToWStr( PBYTE *ppbData, DWORD cbData )
{
    PBYTE pbInput = *ppbData;
    INT iDetected = IS_TEXT_UNICODE;
    PWSTR pszWide;
    PBYTE pbSource;
    UINT uCodePage;

    // Ask the system which of the requested UTF-16 tests the buffer passes
    IsTextUnicode(pbInput, cbData, &iDetected);

    if (iDetected & IS_TEXT_UNICODE)
    {
        // Already UTF-16: fix the byte order in place if it is reversed...
        if (iDetected & IS_TEXT_UNICODE_REVERSE_MASK)
            SwapA16I((PWSTR)pbInput, cbData / sizeof(WCHAR));

        // ...and hand back the caller's own buffer, stepping over any BOM
        return((PWSTR)((iDetected & IS_TEXT_BOM) ? pbInput + sizeof(WCHAR) : pbInput));
    }

    // Not UTF-16, so a conversion is needed; one WCHAR per input byte plus
    // one extra is the capacity given to the converter
    pszWide = malloc((cbData + 1) * sizeof(WCHAR));

    if (pszWide == NULL)
        return(NULL);

    // IsTextUTF8 yields the start of the UTF-8 text (after any BOM), or NULL
    // when the data should be treated as ANSI instead
    pbSource = IsTextUTF8(pbInput);

    if (pbSource != NULL)
    {
        uCodePage = CP_UTF8;
    }
    else
    {
        pbSource = pbInput;
        uCodePage = CP_ACP;
    }

    if (StrToWStrEx(pbSource, pszWide, cbData + 1, uCodePage))
    {
        // Success: the converted buffer takes the place of the original
        free(pbInput);
        *ppbData = (PBYTE)pszWide;
        return(pszWide);
    }

    // Conversion failed: discard the scratch buffer, keep the original intact
    free(pszWide);
    return(NULL);
}