From d51f85cc80f75eee4a8bef8b22bcda9d33fc6c42 Mon Sep 17 00:00:00 2001
From: Ma Mana ma Manama <78492008+Manamama@users.noreply.github.com>
Date: Tue, 19 Dec 2023 01:20:31 +0100
Subject: [PATCH] Create SentencePiece_a_file.py

See
https://github.com/google/sentencepiece/issues/950
---
 SentencePiece_a_file.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 SentencePiece_a_file.py

diff --git a/SentencePiece_a_file.py b/SentencePiece_a_file.py
new file mode 100644
index 0000000..c7ed308
--- /dev/null
+++ b/SentencePiece_a_file.py
@@ -0,0 +1,35 @@
+# Demo script: train a SentencePiece model on a text file, then encode and decode that same text.
+import sentencepiece as spm
+
+# Read the whole input file into memory
+text_file = "input.txt" # Change this to your text file name
+with open(text_file, "r", encoding="utf-8") as f:
+    text = f.read()
+
+# Write the training corpus: the same text, with a line break inserted after every ". "
+with open('sentences.txt', 'w', encoding="utf-8") as f:
+    f.write(text.replace('. ', '.\n'))  # Naive split: only '. ' (period + space) is replaced; '?', '!' and other sentence ends are left as-is
+
+# Train SentencePiece model with Unigram (no --model_type is passed). NOTE(review): vocab_size=800 may be too large for a very small input file - confirm.
+spm.SentencePieceTrainer.train('--input=sentences.txt --model_prefix=m --vocab_size=800')
+# Unigram language model: tends to break words into smaller subword units. For example, in a sample Unigram run the word “All” was split into two tokens, ‘▁A’ and ‘ll’, because the Unigram model scores the probability of each candidate subword and picks the most likely segmentation.
+#
+# Alternative: train with BPE instead. Uncomment the call below and comment out the Unigram call above (both use --model_prefix=m).
+#spm.SentencePieceTrainer.train('--input=sentences.txt --model_prefix=m --vocab_size=800 --model_type=bpe')
+# Byte Pair Encoding (BPE): prefers to keep frequent words or subwords intact. In the corresponding sample BPE run the word “All” stayed a single token, ‘▁All’. BPE starts from a base vocabulary of individual characters and iteratively merges the most frequent pair of symbols into a new symbol, adding it to the vocabulary.
+
+# Load the trained model; 'm.model' is presumably the file written by the training call above (--model_prefix=m)
+sp = spm.SentencePieceProcessor()
+sp.load('m.model')
+
+# Encode the text as read from input.txt (not sentences.txt) into subword pieces, requested as strings via out_type=str
+pieces = sp.encode(text, out_type=str)
+
+# Print the pieces
+print(pieces)
+
+# Decode the pieces back to text. NOTE: this rebinds `text`, so the raw file contents are no longer held in that name afterwards.
+text = sp.decode(pieces)
+
+# Print the decoded text
+print(text)