Merge branch 'develop'

meedan · Oct 2, 2024 · 8126264 · 8126264
2 parents 11711c2 + c7fcdb2
commit 8126264
Show file tree

Hide file tree

Showing 22 changed files with 511 additions and 99 deletions.
diff --git a/app/main/controller/healthcheck_controller.py b/app/main/controller/healthcheck_controller.py
@@ -19,7 +19,7 @@ def get(self):
       'ELASTICSEARCH_SIMILARITY': False,
       'REDIS': False,
       'DATABASE': False,
-      'LANGID': False
+      # 'LANGID': False
     }
 
     # Elasticsearch
@@ -50,11 +50,11 @@ def get(self):
     except Exception as e:
       result['DATABASE'] = str(e)
 
-    # Langid
-    try:
-      class_ = getattr(importlib.import_module('app.main.lib.langid'), app.config['PROVIDER_LANGID'].title() + 'LangidProvider')
-      result['LANGID'] = class_.test()
-    except Exception as e:
-      result['LANGID'] = '%s: %s' % (app.config['PROVIDER_LANGID'].title() + 'LangidProvider', str(e))
+    # # Langid
+    # try:
+    #   class_ = getattr(importlib.import_module('app.main.lib.langid'), app.config['PROVIDER_LANGID'].title() + 'LangidProvider')
+    #   result['LANGID'] = class_.test()
+    # except Exception as e:
+    #   result['LANGID'] = '%s: %s' % (app.config['PROVIDER_LANGID'].title() + 'LangidProvider', str(e))
 
     return { 'result': result }, 200 if all(x and type(x) == type(True) for x in result.values()) else 500
diff --git a/app/main/controller/image_ocr_controller.py b/app/main/controller/image_ocr_controller.py
@@ -1,10 +1,11 @@
+import json
 from flask import request, current_app as app
 from urllib3 import Retry
 from flask_restplus import Resource, Namespace, fields
 from google.cloud import vision
 import tenacity
 
-from app.main.lib.google_client import get_credentialed_google_client
+from app.main.lib.google_client import get_credentialed_google_client, convert_text_annotation_to_json
 
 api = Namespace('ocr', description='ocr operations')
 ocr_request = api.model('ocr_request', {
@@ -36,6 +37,9 @@ def post(self):
         if not texts:
             return
 
+        app.logger.info(
+            f"[Alegre OCR] [image_uri {image.source.image_uri}] Image OCR response package looks like {convert_text_annotation_to_json(texts[0])}")
+
         return {
             'text': texts[0].description
         }
diff --git a/app/main/controller/presto_controller.py b/app/main/controller/presto_controller.py
@@ -30,16 +30,21 @@ def post(self, action, model_type):
         if action == "add_item":
             app.logger.info(f"Data looks like {data}")
             result = similarity.callback_add_item(data.get("body"), model_type)
-            if data.get("body", {}).get("raw", {}).get("final_task") == "search":
-                result = similarity.callback_search_item(data.get("body"), model_type)
-                result["is_search_result_callback"] = True
-            callback_url = data.get("body", {}).get("raw", {}).get("callback_url", app.config['CHECK_API_HOST']) or app.config['CHECK_API_HOST']
-            if data.get("body", {}).get("raw", {}).get("requires_callback"):
-                app.logger.info(f"Sending callback to {callback_url} for {action} for model of {model_type} with body of {result}")
-                Webhook.return_webhook(callback_url, action, model_type, result)
-            output = {"action": action, "model_type": model_type, "data": result}
-            app.logger.info(f"PrestoResource value is {output}")
-            return_value = {"action": action, "model_type": model_type, "data": result}
+            if data.get("body", {}).get("raw", {}).get("suppress_response"):
+                # requested not to reply to caller with similarity response, so suppress it
+                return_value = {"action": action, "model_type": model_type, "data": result}
+            else:
+                if data.get("body", {}).get("raw", {}).get("final_task") == "search":
+                # compute a set of items that are similar to the just-stored item and respond to caller with them
+                    result = similarity.callback_search_item(data.get("body"), model_type)
+                    if result:
+                        result["is_search_result_callback"] = True
+                callback_url = data.get("body", {}).get("raw", {}).get("callback_url", app.config['CHECK_API_HOST']) or app.config['CHECK_API_HOST']
+                if result and data.get("body", {}).get("raw", {}).get("requires_callback"):
+                    app.logger.info(f"Sending callback to {callback_url} for {action} for model of {model_type} with body of {result}")
+                    Webhook.return_webhook(callback_url, action, model_type, result)
+                return_value = {"action": action, "model_type": model_type, "data": result}
+                app.logger.info(f"PrestoResource value is {return_value}")
         r = redis_client.get_client()
         r.lpush(f"{model_type}_{item_id}", json.dumps(data))
         r.expire(f"{model_type}_{item_id}", 60*60*24)

diff --git a/app/main/controller/similarity_async_controller.py b/app/main/controller/similarity_async_controller.py
@@ -33,6 +33,7 @@ def post(self, similarity_type):
         else:
             package = similarity.get_body_for_media_document(args, 'query')
         #Default to true for this endpoint instead of false in most other cases
+        package["suppress_response"] = args.get("suppress_response", False)
         package["requires_callback"] = args.get("requires_callback", True)
         response, waiting_for_callback = similarity.async_get_similar_items(package, similarity_type)
         if not waiting_for_callback:

diff --git a/app/main/lib/elastic_crud.py b/app/main/lib/elastic_crud.py
@@ -0,0 +1,101 @@
+import copy
+import uuid
+import json
+from flask import current_app as app
+from app.main.lib.presto import Presto, PRESTO_MODEL_MAP
+from app.main.lib.elasticsearch import store_document, get_by_doc_id
+
+def _after_log(retry_state):
+    """Emit a debug-level note that a retry is taking place.
+
+    :param retry_state: accepted for the callback signature; never read here.
+    """
+    retry_message = "Retrying image similarity..."
+    app.logger.debug(retry_message)
+
+def get_object_by_doc_id(doc_id):
+    """Return whatever ``get_by_doc_id`` yields for ``doc_id``."""
+    stored_object = get_by_doc_id(doc_id)
+    return stored_object
+
+def get_object(task, _):
+    """Normalize ``task`` in place, store it, and hand it back.
+
+    ``task["contexts"]`` is coerced to a list (a non-list value is wrapped,
+    a missing key becomes ``[]``), a truthy ``task["context"]`` is appended
+    to it, the task is passed to ``store_document``, and ``text`` is
+    back-filled from ``content`` when ``text`` is empty.
+
+    :param task: dict describing the item; mutated in place.
+    :param _: unused second argument.
+    :return: ``(task, False)``.
+    """
+    identifier = task.get("doc_id")
+    lang = task.get("language")
+    single_context = task.get("context", {})
+    if "contexts" not in task:
+        task["contexts"] = []
+    elif not isinstance(task["contexts"], list):
+        task["contexts"] = [task["contexts"]]
+    if single_context:
+        task["contexts"].append(single_context)
+    store_document(task, identifier, lang)
+    if not task.get("text") and task.get("content"):
+        task["text"] = task["content"]
+    return task, False
+
+def get_context_for_search(task):
+    """Build the context dict to search with, without touching ``task``.
+
+    Works on a deep copy of ``task``. Returns the copy's ``context`` when it
+    is truthy, otherwise a fresh empty dict. When the task sets
+    ``match_across_content_types``, the ``content_type`` key is dropped from
+    the returned context.
+    """
+    snapshot = copy.deepcopy(task)
+    search_context = snapshot.get('context') or {}
+    if snapshot.get("match_across_content_types"):
+        search_context.pop("content_type", None)
+    return search_context
+
+def get_presto_request_response(modality, callback_url, task):
+    """Send ``task`` to Presto and return the decoded, sanity-checked reply.
+
+    :param modality: key into ``PRESTO_MODEL_MAP`` selecting the target model.
+    :param callback_url: URL passed through to ``Presto.send_request``.
+    :param task: payload passed through to ``Presto.send_request``.
+    :return: the JSON-decoded response body as a dict.
+    :raises AssertionError: if the reply's ``message``, ``queue`` or ``body``
+        does not look like a successful push.
+    :raises KeyError: if one of those keys is missing from the reply.
+    """
+    raw_reply = Presto.send_request(app.config['PRESTO_HOST'], PRESTO_MODEL_MAP[modality], callback_url, task, False)
+    response = json.loads(raw_reply.text)
+    # Explicit raises rather than ``assert`` statements so the validation is
+    # not stripped when Python runs with -O. AssertionError is kept so that
+    # existing callers see the same exception type and message.
+    if response["message"] != "Message pushed successfully":
+        raise AssertionError(f"Bad response message for {modality}, {callback_url}, {task} - response was {response}")
+    if response["queue"] not in PRESTO_MODEL_MAP.values():
+        raise AssertionError(f"Unknown queue for {modality}, {callback_url}, {task} - response was {response}")
+    if not isinstance(response["body"], dict):
+        raise AssertionError(f"Bad body for {modality}, {callback_url}, {task} - response was {response}")
+    return response
+
+def requires_encoding(obj):
+    """Tell whether any model listed in ``obj["models"]`` lacks a result.
+
+    A model key counts as missing when ``obj["model_<key>"]`` is absent or
+    falsy. With no ``models`` entry (or an empty one) the answer is False.
+    """
+    return any(
+        not obj.get('model_' + key)
+        for key in obj.get("models", [])
+    )
+
+def get_blocked_presto_response(task, model, modality):
+    """Store ``task`` and, if needed, wait on Presto for its encodings.
+
+    A ``doc_id`` is generated when the task has none. The task is then
+    normalized and stored via ``get_object``. For every model in
+    ``obj["models"]`` other than ``"elasticsearch"`` that has no
+    ``model_<key>`` value yet, a Presto request is sent and its blocked
+    response collected.
+
+    :param task: dict describing the item; mutated in place.
+    :param model: passed through to ``get_object``.
+    :param modality: used to build the callback URL and passed to
+        ``Presto.blocked_response``.
+    :return: ``(obj, temporary, search_context, result)`` where ``result`` is
+        the last blocked Presto response, or ``{"body": obj}`` when no
+        Presto request had to be made.
+    """
+    if task.get("doc_id") is None:
+        task["doc_id"] = str(uuid.uuid4())
+    obj, temporary = get_object(task, model)
+    doc_id = obj["doc_id"]
+    callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality)
+    needs_encoding = requires_encoding(obj)
+    app.logger.info(f"Object for {task} of model {model} with id of {doc_id} has requires_encoding value of {needs_encoding}")
+    blocked_results = []
+    if needs_encoding:
+        for model_key in obj.pop("models", []):
+            if model_key != "elasticsearch" and not obj.get('model_'+model_key):
+                response = get_presto_request_response(model_key, callback_url, obj)
+                # Warning: this is a blocking hold to wait until we get a response in
+                # a redis key that we've received something from presto.
+                blocked_results.append(Presto.blocked_response(response, modality))
+    search_context = get_context_for_search(task)
+    if blocked_results:
+        return obj, temporary, search_context, blocked_results[-1]
+    # Nothing was sent to Presto: either the object was already encoded, or
+    # the only outstanding model was "elasticsearch", which the loop skips.
+    # Indexing blocked_results[-1] in that second case would raise IndexError.
+    return obj, temporary, search_context, {"body": obj}
+
+def get_async_presto_response(task, model, modality):
+    """Store ``task`` and fire off Presto encoding requests without waiting.
+
+    :param task: dict describing the item; mutated in place (``doc_id``,
+        ``final_task`` and, per request, ``model`` are set on it).
+    :param model: passed through to ``get_object``.
+    :param modality: used to build the callback URL.
+    :return: ``(responses, True)`` with the list of Presto replies when
+        encoding is required, otherwise
+        ``({"message": "Already encoded - passing on to search"}, False)``.
+    """
+    # Routine trace, not a failure, so it is logged at info level.
+    app.logger.info(f"get_async_presto_response: {task} {model} {modality}")
+    # Assign the id before get_object() stores the task, matching
+    # get_blocked_presto_response; otherwise the document is stored with a
+    # doc_id of None and only afterwards given an id.
+    if task.get("doc_id") is None:
+        task["doc_id"] = str(uuid.uuid4())
+    obj, _ = get_object(task, model)
+    callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality)
+    task["final_task"] = "search"
+    if not requires_encoding(obj):
+        return {"message": "Already encoded - passing on to search"}, False
+    responses = []
+    for model_key in obj.get("models", []):
+        if model_key != "elasticsearch" and not obj.get('model_'+model_key):
+            task["model"] = model_key
+            responses.append(get_presto_request_response(model_key, callback_url, task))
+    # NOTE(review): if the only outstanding model is "elasticsearch" this
+    # returns ([], True) with no request in flight - confirm callers cope.
+    return responses, True
+
+def parse_task_search(task):
+    """Split a search task into ``(body, threshold, limit)``.
+
+    The task contents may be embedded under a ``body`` key (when coming from
+    a presto callback); otherwise the "body" is just the entire dictionary.
+
+    With a ``body`` key, ``threshold``/``limit`` are read from ``body["raw"]``
+    and the body is updated in place: a missing, ``None`` or empty ``raw``
+    becomes ``{}``, ``hash_value`` is copied from ``body["result"]``, and
+    ``context`` falls back to ``raw["context"]`` when the body has no
+    ``context`` key.
+
+    :param task: dict, either a callback envelope or the body itself.
+    :return: ``(body, threshold, limit)``; ``threshold`` defaults to ``0.0``
+        and ``limit`` to ``None``.
+    """
+    if "body" not in task:
+        return task, task.get('threshold', 0.0), task.get("limit")
+    body = task.get("body", {})
+    # Normalize "raw" before reading from it: a present-but-None "raw" would
+    # otherwise fail on ``None.get(...)``.
+    if not body.get("raw"):
+        body["raw"] = {}
+    raw = body["raw"]
+    threshold = raw.get('threshold', 0.0)
+    limit = raw.get("limit")
+    # Same guard for a present-but-None "result".
+    body["hash_value"] = (body.get("result") or {}).get("hash_value")
+    body["context"] = body.get("context", raw.get("context"))
+    return body, threshold, limit
diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/elasticsearch.py
@@ -8,8 +8,7 @@
 
 from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES
 from app.main.lib.error_log import ErrorLog
-#from app.main.lib.langid import Cld3LangidProvider as LangidProvider
-from app.main.lib.langid import GoogleLangidProvider as LangidProvider
+from app.main.lib.langid import HybridLangidProvider as LangidProvider
 
 def get_all_documents_matching_context(context):
   matches, clause_count = generate_matches(context)
@@ -106,7 +105,14 @@ def update_or_create_document(body, doc_id, index):
       )
   return result
 
+def get_by_doc_id(doc_id):
+    """Fetch a document by id from the similarity index and return its ``_source``."""
+    client = OpenSearch(app.config['ELASTICSEARCH_URL'])
+    similarity_index = app.config['ELASTICSEARCH_SIMILARITY']
+    hit = client.get(index=similarity_index, id=doc_id)
+    return hit['_source']
+
 def store_document(body, doc_id, language=None):
+    for field in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]:
+        body.pop(field, None)
     indices = [app.config['ELASTICSEARCH_SIMILARITY']]
     # 'auto' indicates we should try to guess the appropriate language
     if language == 'auto':
@@ -124,7 +130,7 @@ def store_document(body, doc_id, language=None):
     for index in indices:
       index_result = update_or_create_document(body, doc_id, index)
       results.append(index_result)
-      if index_result['result'] not in ['created', 'updated']:
+      if index_result['result'] not in ['created', 'updated', 'noop']:
           app.logger.warning('Problem adding document to ES index for language {0}: {1}'.format(language, index_result))
     result = results[0]
     success = False

diff --git a/app/main/lib/google_client.py b/app/main/lib/google_client.py
@@ -1,6 +1,7 @@
 import os
 import json
 from google.oauth2 import service_account
+from flask import current_app as app
 
 def get_credentialed_google_client(client):
     default_values = {}
@@ -26,3 +27,18 @@ def get_credentialed_google_client(client):
     except ValueError as e:
       print(f"Couldn't authenticate to google client: {str(e)}")
       return None
+def convert_text_annotation_to_json(text_annotation):
+    """Serialize a text annotation to a JSON string.
+
+    The output carries the annotation's ``description``, ``locale`` and a
+    ``bounding_poly`` list of ``{"x", "y"}`` vertex dicts. Any exception is
+    logged through ``app.logger.exception`` and ``None`` is returned instead.
+    """
+    try:
+        payload = {
+            'description': text_annotation.description,
+            'locale': text_annotation.locale,
+            'bounding_poly': [
+                {'x': vertex.x, 'y': vertex.y}
+                for vertex in text_annotation.bounding_poly.vertices
+            ],
+        }
+        return json.dumps(payload)
+    except Exception as e:
+        app.logger.exception(e)
+        return None
diff --git a/app/main/lib/helpers.py b/app/main/lib/helpers.py
@@ -1,3 +1,19 @@
+def merge_dict_lists(list1, list2):
+    """
+    Merge two lists of dictionaries, ensuring all unique dictionaries are present in the final result.
+
+    Dictionaries are compared by content. Lists and tuples inside values are
+    treated as equal when their elements match, and nested dicts/lists are
+    supported. The result keeps first-seen order (``list1`` before ``list2``).
+    Each returned dict is a new dict whose top-level list values are copied;
+    tuple values stay tuples.
+
+    :param list1: First list of dictionaries.
+    :param list2: Second list of dictionaries.
+    :return: Merged list of unique dictionaries.
+    """
+    def freeze(value):
+        # Recursively turn unhashable containers into hashable tuples.
+        if isinstance(value, dict):
+            return ("__dict__", tuple(sorted((k, freeze(v)) for k, v in value.items())))
+        if isinstance(value, (list, tuple)):
+            return tuple(freeze(v) for v in value)
+        return value
+
+    def to_hashable(d):
+        return tuple((k, freeze(v)) for k, v in sorted(d.items()))
+
+    seen = set()
+    merged = []
+    for d in list1 + list2:
+        fingerprint = to_hashable(d)
+        if fingerprint in seen:
+            continue
+        seen.add(fingerprint)
+        # Copy so callers mutating the result do not alter the inputs.
+        merged.append({k: list(v) if isinstance(v, list) else v for k, v in d.items()})
+    return merged
+
+
 def context_matches(query_context, item_context):
   """
     Check a pair of contexts to determine if they match - first pass is

diff --git a/app/main/lib/image_similarity.py b/app/main/lib/image_similarity.py
@@ -62,7 +62,7 @@ def add_image(save_params):
     db.session.rollback()
     raise e
 
-def callback_add(task):
+def callback_add_image(task):
     return media_crud.add(task, ImageModel, ["pdq", "phash"])[0]
 
 def search_image(image, model, limit, threshold, task, hash_value, context, temporary):

diff --git a/app/main/lib/langid.py b/app/main/lib/langid.py
@@ -1,8 +1,12 @@
 # 3rd party langid providers
-from google.cloud import translate_v2 as translate
 from flask import current_app as app
-import requests
+import json
+
+from google.cloud import translate_v2 as translate
+# import requests # Used for MicrosoftLangidProvider
 import cld3
+import fasttext
+
 
 from app.main.lib.google_client import get_credentialed_google_client
 
@@ -17,7 +21,8 @@ def langid(text):
         'language': response[0]['language'],
         'confidence': response[0]['confidence']
       },
-      'raw': response
+      'raw': response,
+      'model': 'Google',
     }
 
   @staticmethod
@@ -68,10 +73,79 @@ def langid(text):
         'language': prediction.language,
         'confidence': prediction.probability
       },
-      'raw': prediction
+      'raw': prediction,
+      'model': 'CLD3',
     }
 
   @staticmethod
   def test():
     cld3.get_language("Some text to check")
     return True
+
+class FastTextLangidProvider:
+  """Language identification backed by a fastText language-id model.
+
+  See https://fasttext.cc/docs/en/language-identification.html
+  """
+  # Loaded when the class body executes, i.e. once at import time.
+  fasttext_model = fasttext.load_model("extra/fasttext_language_id/lid.176.ftz")
+
+  @staticmethod
+  def langid(text):
+    """Identify the language of ``text``.
+
+    :param text: string to classify.
+    :return: dict with ``result`` (``language`` and ``confidence``), the
+      ``raw`` prediction as ``[labels, probabilities]`` and ``model`` set to
+      ``'FastText'``.
+    """
+    # fastText's predict() handles one line at a time and raises ValueError
+    # when the input contains a newline, so flatten newlines first.
+    single_line = text.replace("\n", " ")
+    labels, probabilities = FastTextLangidProvider.fasttext_model.predict(single_line)
+    # predict() returns a (labels, probabilities) pair, e.g.
+    # (('__label__en',), array([0.22517213])); convert the array to a list.
+    prediction = [labels, probabilities.tolist()]
+
+    language = labels[0].split("__")[-1]
+
+    # Use 'fil' for Filipino rather than tl for Tagalog
+    if language == "tl":
+      language = "fil"
+
+    return {
+      'result': {
+        'language': language,
+        'confidence': prediction[1][0]
+      },
+      'raw': prediction,
+      'model': 'FastText',
+    }
+
+  @staticmethod
+  def test():
+    """Smoke-test the provider; returns True if a prediction can be made."""
+    # The fastText model object has no get_language() (that is cld3's API);
+    # exercise the real prediction path instead.
+    FastTextLangidProvider.langid("Some text to check")
+    return True
+
+class HybridLangidProvider:
+  """Run FastText and CLD3 locally and defer to Google when they are inconclusive."""
+
+  @staticmethod
+  def langid(text):
+    """Identify the language of ``text``.
+
+    A local result is returned only when FastText and CLD3 agree on the
+    language AND both report at least 0.9 confidence; the more confident of
+    the two is returned (CLD3 on a tie). In every other case Google is
+    called, the three results are logged, and Google's result is returned.
+    An earlier rule (agree OR one model at 0.8 or more) was replaced by this
+    stricter one.
+    """
+    local_fasttext = FastTextLangidProvider.langid(text)
+    local_cld = Cld3LangidProvider.langid(text)
+    fasttext_guess = local_fasttext['result']
+    cld_guess = local_cld['result']
+    lowest_confidence = min(fasttext_guess['confidence'], cld_guess['confidence'])
+    conclusive = fasttext_guess['language'] == cld_guess['language'] and lowest_confidence >= 0.9
+
+    if not conclusive:
+      # Local models disagree, or at least one is below 0.9 confidence.
+      google_result = GoogleLangidProvider.langid(text)
+      log_entry = {
+        'service':'LangId',
+        'message': 'Called Google after inconclusive local results',
+        'parameters':{'text':text, 'fasttext':local_fasttext, 'cld':local_cld, 'google':google_result},
+      }
+      app.logger.info(json.dumps(log_entry))
+      return google_result
+
+    if fasttext_guess['confidence'] > cld_guess['confidence']:
+      return local_fasttext
+    return local_cld
+
+  @staticmethod
+  def test():
+    """Smoke-test the provider by running one identification; returns True."""
+    HybridLangidProvider.langid("Some text to check")
+    return True