Skip to content

Commit

Permalink
CV2-5412: no newlines in text for FastText (#458)
Browse files Browse the repository at this point in the history
* no newlines in text for FastText

* quick test with newlines

---------

Co-authored-by: computermacgyver <computermacgyver>
Co-authored-by: Devin Gaffney <[email protected]>
  • Loading branch information
computermacgyver and DGaffney authored Oct 3, 2024
1 parent c7fcdb2 commit c4db7ca
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 1 deletion.
3 changes: 2 additions & 1 deletion app/main/lib/langid.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# 3rd party langid providers
from flask import current_app as app
import json
import re

from google.cloud import translate_v2 as translate
# import requests # Used for MicrosoftLangidProvider
Expand Down Expand Up @@ -87,7 +88,7 @@ class FastTextLangidProvider:
fasttext_model = fasttext.load_model("extra/fasttext_language_id/lid.176.ftz")
@staticmethod
def langid(text):
prediction = list(FastTextLangidProvider.fasttext_model.predict(text))
# fastText's predict() handles one line at a time, so replace every CR/LF with a space first.
# NOTE: the 4th positional argument of re.sub is `count`, not `flags`; passing re.MULTILINE (== 8)
# there capped the substitution at the first 8 line breaks, leaving later newlines in the text.
prediction = list(FastTextLangidProvider.fasttext_model.predict(re.sub(r"[\n\r]", " ", text)))
# prediction is a list of tuples, e.g., [('__label__en',), array([0.22517213])]

language = prediction[0][0].split("__")[-1]
Expand Down
1 change: 1 addition & 0 deletions app/test/test_langid.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
class TestLangidBlueprint(BaseTestCase):
TESTS = [
{ 'fasttext': 'hi', 'cld3': 'hi', 'microsoft': 'hi', 'google': 'hi', 'text': 'नमस्ते मेरा नाम करीम है' },
{ 'fasttext': 'hi', 'cld3': 'hi', 'microsoft': 'hi', 'google': 'hi', 'text': 'नमस्ते मेरा नाम\n\n करीम है' },
{ 'fasttext': None, 'cld3': 'hi-Latn', 'microsoft': 'en', 'google': ['hi', 'hi-Latn'], 'text': 'namaste mera naam Karim hai' },
{ 'fasttext': 'mr', 'cld3': 'mr', 'microsoft': 'hi', 'google': 'mr', 'text': 'हॅलो माझे नाव करीम आहे' },
{ 'fasttext': 'bn', 'cld3': 'bn', 'microsoft': 'bn', 'google': 'bn', 'text': 'হ্যালো আমার নাম কারিম' },
Expand Down

0 comments on commit c4db7ca

Please sign in to comment.