// test.c: tokenizer encoding tests (forked from karpathy/llama2.c)
// run.c provides the Tokenizer plus the encode/decode machinery; defining
// TESTING before the include is what lets run.c compile without its own
// main(), so the test driver below can supply one instead.
#define TESTING
#include "run.c"
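
// One plausible way to build and run this, assuming run.c and
// tokenizer.bin sit next to this file (illustrative flags, not
// necessarily the repo's official make target):
//
//   gcc -O2 -DVERBOSITY=1 -o testc test.c -lm
//   ./testc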
void assert_eq(int a, int b) {
    if (a != b) {
        printf("Assertion failed: %d != %d\n", a, b);
        exit(EXIT_FAILURE);
    }
}
void test_prompt_encoding(Tokenizer* tokenizer, char* prompt, int* expected_tokens, int num_expected_tokens) {
    // encode; +3 leaves room for a possible BOS, EOS, and '\0',
    // mirroring the allocation convention in run.c
    int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int));
    int num_prompt_tokens = 0; // the total number of prompt tokens
    encode(tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens); // bos=1, eos=0
#if VERBOSITY == 1
    // print maybe
    printf("expected tokens:\n");
    for (int i = 0; i < num_expected_tokens; i++) printf("%d ", expected_tokens[i]);
    printf("\n");
    printf("actual tokens:\n");
    for (int i = 0; i < num_prompt_tokens; i++) printf("%d ", prompt_tokens[i]);
    printf("\n");
#endif
    // verify
    assert_eq(num_prompt_tokens, num_expected_tokens);
    for (int i = 0; i < num_prompt_tokens; i++) {
        assert_eq(prompt_tokens[i], expected_tokens[i]);
    }
#if VERBOSITY == 1
    printf("OK\n");
    printf("---\n");
#endif
    free(prompt_tokens);
}
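
// Hedged sketch of a debugging helper that could complement the checks
// above. It assumes run.c exposes decode(Tokenizer*, int prev_token,
// int token) as upstream karpathy/llama2.c does; if this fork changed
// that signature, adjust accordingly.
void print_decoded(Tokenizer* tokenizer, int* tokens, int num_tokens) {
    // print each decoded piece; start at index 1 so the BOS token at
    // index 0 serves only as the previous-token context
    for (int i = 1; i < num_tokens; i++) {
        printf("%s", decode(tokenizer, tokens[i - 1], tokens[i]));
    }
    printf("\n");
}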
void test_prompt_encodings() {
    // let's verify that the Tokenizer works as expected
    char *tokenizer_path = "tokenizer.bin";
    int vocab_size = 128256;
    Tokenizer tokenizer;
    build_tokenizer(&tokenizer, tokenizer_path, vocab_size);

    // test 0: the empty string (I added this as a simple base case)
    char *prompt0 = "";
    int expected_tokens0[] = {128000};
    test_prompt_encoding(&tokenizer, prompt0, expected_tokens0, sizeof(expected_tokens0) / sizeof(int));

    // the tests below are taken from the Meta Llama 2 repo example code
    // https://github.com/facebookresearch/llama/blob/main/example_text_completion.py
    // with the token ids updated to match the llama3.2 tokenizer

    // test 1
    char *prompt = "I believe the meaning of life is";
    int expected_tokens[] = {128000, 40, 4510, 279, 7438, 315, 2324, 374};
    test_prompt_encoding(&tokenizer, prompt, expected_tokens, sizeof(expected_tokens) / sizeof(int));

    // test 2
    char* prompt2 = "Simply put, the theory of relativity states that ";
    int expected_tokens2[] = {128000, 61346, 2231, 11, 279, 10334, 315, 1375, 44515, 5415, 430, 220};
    test_prompt_encoding(&tokenizer, prompt2, expected_tokens2, sizeof(expected_tokens2) / sizeof(int));

    // test 3
    char* prompt3 = "A brief message congratulating the team on the launch:\n\n        Hi everyone,\n\n        I just ";
    int expected_tokens3[] = {128000, 32, 10015, 1984, 40588, 15853, 279, 2128, 389, 279, 7195, 1473, 286, 21694, 5127, 3638, 286, 358, 1120, 220};
    test_prompt_encoding(&tokenizer, prompt3, expected_tokens3, sizeof(expected_tokens3) / sizeof(int));

    // test 4
    char* prompt4 = "Translate English to French:\n\n        sea otter => loutre de mer\n        peppermint => menthe poivrée\n        plush girafe => girafe peluche\n        cheese =>";
    int expected_tokens4[] = {128000, 28573, 6498, 311, 8753, 1473, 286, 9581, 14479, 466, 591, 326, 412, 265, 409, 4809, 198, 286, 83804, 94932, 591, 11540, 383, 3273, 58866, 8047, 198, 286, 72779, 41389, 5763, 591, 41389, 5763, 12077, 34927, 198, 286, 17604, 591};
    test_prompt_encoding(&tokenizer, prompt4, expected_tokens4, sizeof(expected_tokens4) / sizeof(int));

    // more tests, generated by ChatGPT
    char* prompt5 = "Let's test with numbers: 123, 456.";
    int expected_tokens5[] = {128000, 10267, 596, 1296, 449, 5219, 25, 220, 4513, 11, 220, 10961, 13};
    test_prompt_encoding(&tokenizer, prompt5, expected_tokens5, sizeof(expected_tokens5) / sizeof(int));

    char* prompt6 = "Newline\nand special chars: @#!.";
    int expected_tokens6[] = {128000, 3648, 1074, 198, 438, 3361, 23861, 25, 571, 8169, 13};
    test_prompt_encoding(&tokenizer, prompt6, expected_tokens6, sizeof(expected_tokens6) / sizeof(int));

    char* prompt7 = "Whitespace and tabs.";
    int expected_tokens7[] = {128000, 74904, 257, 323, 257, 23204, 13};
    test_prompt_encoding(&tokenizer, prompt7, expected_tokens7, sizeof(expected_tokens7) / sizeof(int));

    char* prompt8 = "'ve got contractions, haven't we?";
    int expected_tokens8[] = {128000, 3077, 2751, 6155, 4109, 11, 9167, 956, 584, 30};
    test_prompt_encoding(&tokenizer, prompt8, expected_tokens8, sizeof(expected_tokens8) / sizeof(int));

    // known special tokens should encode to their single reserved ids
    char* prompt9 = "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant for travel tips and recommendations<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat can you help me with?<|eot_id|><|start_header_id|>assistant<|end_header_id|>";
    int expected_tokens9[] = {128000, 128006, 9125, 128007, 271, 2675, 527, 264, 11190, 15592, 18328, 369, 5944, 10631, 323, 19075, 128009, 128006, 882, 128007, 271, 3923, 649, 499, 1520, 757, 449, 30, 128009, 128006, 78191, 128007};
    test_prompt_encoding(&tokenizer, prompt9, expected_tokens9, sizeof(expected_tokens9) / sizeof(int));

    // strings that merely look like special tokens should split as ordinary text
    char* prompt10 = "<|not_a_special_token|><|this_should_split_normally|>";
    int expected_tokens10[] = {128000, 27, 91, 1962, 4404, 42729, 6594, 91, 1822, 91, 576, 44478, 17489, 19731, 750, 91, 29};
    test_prompt_encoding(&tokenizer, prompt10, expected_tokens10, sizeof(expected_tokens10) / sizeof(int));

    // in-range reserved specials map to single ids; the out-of-range _777 splits
    char* prompt11 = "<|reserved_special_token_0|><|reserved_special_token_125|><|reserved_special_token_247|><|reserved_special_token_777|>";
    int expected_tokens11[] = {128000, 128002, 128130, 128252, 27, 91, 52202, 42729, 6594, 62, 15831, 91, 29};
    test_prompt_encoding(&tokenizer, prompt11, expected_tokens11, sizeof(expected_tokens11) / sizeof(int));

    // memory and file handles cleanup
    free_tokenizer(&tokenizer);
}
int main(int argc, char *argv[]) {
    (void)argc; (void)argv; // unused
    test_prompt_encodings();
    printf("ALL OK\n");
    return EXIT_SUCCESS;
}