tests : add option to tokenize text files
ggml-ci
ggerganov committed Aug 26, 2023
1 parent 70005bd commit e4324cb
Showing 2 changed files with 61 additions and 1 deletion.
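
Both tests now accept an optional input text file. The C++ test tokenizes the file's contents and writes the token IDs to <text-file>.tokcpp; the Python script gains a --fname-tok flag and writes its token IDs to <text-file>.tok, so the two outputs can be diffed directly. A hypothetical invocation (the file paths are illustrative, not from the commit):

    ./test-tokenizer-0 models/ggml-vocab-llama.gguf wiki.test.raw                # -> wiki.test.raw.tokcpp
    python3 tests/test-tokenizer-0.py models/llama --fname-tok wiki.test.raw     # -> wiki.test.raw.tok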
44 changes: 43 additions & 1 deletion tests/test-tokenizer-0.cpp
@@ -5,6 +5,7 @@
 #include <string>
 #include <map>
 #include <vector>
+#include <fstream>
 
 // generate using test-tokenizer-0.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
@@ -41,12 +42,17 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 
 int main(int argc, char **argv) {
     if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
 
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
     llama_model * model;
@@ -131,6 +137,42 @@ int main(int argc, char **argv) {
         }
     }
 
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+    }
+
     llama_free_model(model);
     llama_free(ctx);
 
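On the C++ side the whole file is read into a single string via std::istreambuf_iterator, tokenized with add_bos set to true (the third argument to the llama_tokenize helper), and written out as space-separated token IDs followed by a newline. A minimal sketch of reading that output back, assuming plain Python; the helper name is hypothetical, not part of the commit:

    # hypothetical helper for consuming the .tokcpp output; not part of the commit
    def read_tok_ids(path):
        # split() tolerates the trailing "id " separator and the final newline
        with open(path) as f:
            return [int(tok) for tok in f.read().split()]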
18 changes: 18 additions & 0 deletions tests/test-tokenizer-0.py
@@ -6,6 +6,7 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
 args = parser.parse_args()
 
 dir_tokenizer = args.dir_tokenizer
@@ -68,3 +69,20 @@
 for x in res:
     print("%7d," % x, end='')
 print(" }, },")
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+        s = ''.join(lines)
+        res = tokenizer.encode(s, add_bos=True)
+        # write to file
+        with open(fname_out, 'w') as f:
+            for x in res:
+                f.write(str(x) + ' ')
+            f.write('\n')
+        print('len(res): ', len(res))
+        print('len(lines): ', len(lines))
+        print('results written to: ', fname_out)
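
Since both programs now emit the same space-separated format for the same input (the Python side also passes add_bos=True), the natural follow-up, and presumably why the commit message carries the ggml-ci tag, is to compare the two files. A minimal sketch of such a check, assuming both output files already exist; the file names and the script itself are illustrative, not part of the commit:

    # compare_tok.py -- hypothetical consistency check, not part of the commit
    import sys

    def read_ids(path):
        with open(path) as f:
            return [int(t) for t in f.read().split()]

    cpp = read_ids('wiki.test.raw.tokcpp')  # written by test-tokenizer-0.cpp
    ref = read_ids('wiki.test.raw.tok')     # written by test-tokenizer-0.py
    if cpp != ref:
        print('tokenization mismatch: %d vs %d tokens' % (len(cpp), len(ref)))
        sys.exit(1)
    print('OK: %d tokens' % len(cpp))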
