From d51f85cc80f75eee4a8bef8b22bcda9d33fc6c42 Mon Sep 17 00:00:00 2001 From: Ma Mana ma Manama <78492008+Manamama@users.noreply.github.com> Date: Tue, 19 Dec 2023 01:20:31 +0100 Subject: [PATCH] Create SentencePiece_a_file.py See https://github.com/google/sentencepiece/issues/950 --- SentencePiece_a_file.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 SentencePiece_a_file.py diff --git a/SentencePiece_a_file.py b/SentencePiece_a_file.py new file mode 100644 index 0000000..c7ed308 --- /dev/null +++ b/SentencePiece_a_file.py @@ -0,0 +1,35 @@ +# Import the SentencePiece library +import sentencepiece as spm + +# Open the text file +text_file = "input.txt" # Change this to your text file name +with open(text_file, "r", encoding="utf-8") as f: + text = f.read() + +# Write the text to a new file with one sentence per line +with open('sentences.txt', 'w', encoding="utf-8") as f: + f.write(text.replace('. ', '.\n')) # Replace periods followed by a space with a period and a newline + +# Train SentencePiece model with Unigram +spm.SentencePieceTrainer.train('--input=sentences.txt --model_prefix=m --vocab_size=800') +#Unigram Language Model: This model tends to break down words into smaller subword units. For example, in your Unigram result, the word “All” is broken down into two tokens: ‘▁A’ and ‘ll’. This is because the Unigram model calculates the probability of each possible subword and chooses the most likely segmentation. +# +# Or: Train SentencePiece model with BPE +#spm.SentencePieceTrainer.train('--input=sentences.txt --model_prefix=m --vocab_size=800 --model_type=bpe') +#Byte Pair Encoding (BPE): This model, on the other hand, prefers to keep frequent words or subwords intact as much as possible. In your BPE result, the word “All” is kept as a single token: ‘▁All’. BPE starts with a base vocabulary of individual characters and iteratively merges the most frequent pair of symbols to create a new symbol, adding it to the vocabulary. + +# Load the trained SentencePiece model +sp = spm.SentencePieceProcessor() +sp.load('m.model') + +# Encode the text into subword pieces +pieces = sp.encode(text, out_type=str) + +# Print the pieces +print(pieces) + +# Decode the pieces back to the original text +text = sp.decode(pieces) + +# Print the text +print(text)