Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
caiosba committed Oct 2, 2024
2 parents 11711c2 + c7fcdb2 commit 8126264
Show file tree
Hide file tree
Showing 22 changed files with 511 additions and 99 deletions.
14 changes: 7 additions & 7 deletions app/main/controller/healthcheck_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get(self):
'ELASTICSEARCH_SIMILARITY': False,
'REDIS': False,
'DATABASE': False,
'LANGID': False
# 'LANGID': False
}

# Elasticsearch
Expand Down Expand Up @@ -50,11 +50,11 @@ def get(self):
except Exception as e:
result['DATABASE'] = str(e)

# Langid
try:
class_ = getattr(importlib.import_module('app.main.lib.langid'), app.config['PROVIDER_LANGID'].title() + 'LangidProvider')
result['LANGID'] = class_.test()
except Exception as e:
result['LANGID'] = '%s: %s' % (app.config['PROVIDER_LANGID'].title() + 'LangidProvider', str(e))
# # Langid
# try:
# class_ = getattr(importlib.import_module('app.main.lib.langid'), app.config['PROVIDER_LANGID'].title() + 'LangidProvider')
# result['LANGID'] = class_.test()
# except Exception as e:
# result['LANGID'] = '%s: %s' % (app.config['PROVIDER_LANGID'].title() + 'LangidProvider', str(e))

return { 'result': result }, 200 if all(x and type(x) == type(True) for x in result.values()) else 500
6 changes: 5 additions & 1 deletion app/main/controller/image_ocr_controller.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import json
from flask import request, current_app as app
from urllib3 import Retry
from flask_restplus import Resource, Namespace, fields
from google.cloud import vision
import tenacity

from app.main.lib.google_client import get_credentialed_google_client
from app.main.lib.google_client import get_credentialed_google_client, convert_text_annotation_to_json

api = Namespace('ocr', description='ocr operations')
ocr_request = api.model('ocr_request', {
Expand Down Expand Up @@ -36,6 +37,9 @@ def post(self):
if not texts:
return

app.logger.info(
f"[Alegre OCR] [image_uri {image.source.image_uri}] Image OCR response package looks like {convert_text_annotation_to_json(texts[0])}")

return {
'text': texts[0].description
}
25 changes: 15 additions & 10 deletions app/main/controller/presto_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,21 @@ def post(self, action, model_type):
if action == "add_item":
app.logger.info(f"Data looks like {data}")
result = similarity.callback_add_item(data.get("body"), model_type)
if data.get("body", {}).get("raw", {}).get("final_task") == "search":
result = similarity.callback_search_item(data.get("body"), model_type)
result["is_search_result_callback"] = True
callback_url = data.get("body", {}).get("raw", {}).get("callback_url", app.config['CHECK_API_HOST']) or app.config['CHECK_API_HOST']
if data.get("body", {}).get("raw", {}).get("requires_callback"):
app.logger.info(f"Sending callback to {callback_url} for {action} for model of {model_type} with body of {result}")
Webhook.return_webhook(callback_url, action, model_type, result)
output = {"action": action, "model_type": model_type, "data": result}
app.logger.info(f"PrestoResource value is {output}")
return_value = {"action": action, "model_type": model_type, "data": result}
if data.get("body", {}).get("raw", {}).get("suppress_response"):
# requested not to reply to caller with similarity response, so suppress it
return_value = {"action": action, "model_type": model_type, "data": result}
else:
if data.get("body", {}).get("raw", {}).get("final_task") == "search":
# compute a set of items that are similar to the just-stored item and respond to caller with them
result = similarity.callback_search_item(data.get("body"), model_type)
if result:
result["is_search_result_callback"] = True
callback_url = data.get("body", {}).get("raw", {}).get("callback_url", app.config['CHECK_API_HOST']) or app.config['CHECK_API_HOST']
if result and data.get("body", {}).get("raw", {}).get("requires_callback"):
app.logger.info(f"Sending callback to {callback_url} for {action} for model of {model_type} with body of {result}")
Webhook.return_webhook(callback_url, action, model_type, result)
return_value = {"action": action, "model_type": model_type, "data": result}
app.logger.info(f"PrestoResource value is {return_value}")
r = redis_client.get_client()
r.lpush(f"{model_type}_{item_id}", json.dumps(data))
r.expire(f"{model_type}_{item_id}", 60*60*24)
Expand Down
1 change: 1 addition & 0 deletions app/main/controller/similarity_async_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def post(self, similarity_type):
else:
package = similarity.get_body_for_media_document(args, 'query')
#Default to true for this endpoint instead of false in most other cases
package["suppress_response"] = args.get("suppress_response", False)
package["requires_callback"] = args.get("requires_callback", True)
response, waiting_for_callback = similarity.async_get_similar_items(package, similarity_type)
if not waiting_for_callback:
Expand Down
101 changes: 101 additions & 0 deletions app/main/lib/elastic_crud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import copy
import uuid
import json
from flask import current_app as app
from app.main.lib.presto import Presto, PRESTO_MODEL_MAP
from app.main.lib.elasticsearch import store_document, get_by_doc_id

def _after_log(retry_state):
    """Emit a debug-level log line announcing that an operation is being retried.

    :param retry_state: Accepted for signature compatibility; not used here.
    """
    message = "Retrying image similarity..."
    app.logger.debug(message)

def get_object_by_doc_id(doc_id):
    """Look up a stored document by its id.

    Thin pass-through to ``get_by_doc_id`` from the elasticsearch helper module.

    :param doc_id: Identifier of the document to fetch.
    :return: Whatever ``get_by_doc_id`` returns for that id.
    """
    document = get_by_doc_id(doc_id)
    return document

def get_object(task, _):
    """Normalise a task's contexts, store it as a document, and hand it back.

    The task dict is mutated in place:
    * ``contexts`` is coerced to a list (a missing key becomes ``[]``, a
      non-list value is wrapped in a one-element list);
    * a truthy ``context`` is appended to ``contexts``;
    * after storing, ``text`` is filled from ``content`` when ``text`` is empty.

    :param task: Dict describing the item to store.
    :param _: Ignored; present so the signature lines up with sibling helpers.
    :return: ``(task, False)`` - the same dict object and a constant ``False``
        (callers in this file unpack the second value as ``temporary``).
    """
    single_context = task.get("context", {})

    # Coerce "contexts" into a list without losing an existing scalar value.
    if "contexts" not in task:
        task["contexts"] = []
    elif not isinstance(task["contexts"], list):
        task["contexts"] = [task["contexts"]]

    if single_context:
        task["contexts"].append(single_context)

    store_document(task, task.get("doc_id", None), task.get("language", None))

    # Only backfill "text" once the document has been stored.
    has_content = task.get("content")
    if has_content and not task.get("text"):
        task["text"] = task["content"]

    return task, False

def get_context_for_search(task):
    """Build the context dict to use when searching for items similar to ``task``.

    Works on a deep copy, so the caller's task (and its nested context) is
    never modified.

    :param task: Task dict; ``context`` and ``match_across_content_types`` are read.
    :return: The copied context (``{}`` when the task has no truthy context),
        with ``content_type`` removed when ``match_across_content_types`` is truthy.
    """
    snapshot = copy.deepcopy(task)
    search_context = snapshot.get('context') or {}
    if snapshot.get("match_across_content_types"):
        # Dropping content_type lets the search span every content type.
        search_context.pop("content_type", None)
    return search_context

def get_presto_request_response(modality, callback_url, task):
    """Send ``task`` to Presto for the given modality and sanity-check the reply.

    :param modality: Key into ``PRESTO_MODEL_MAP`` selecting the Presto model.
    :param callback_url: URL handed to Presto with the request.
    :param task: Payload forwarded to Presto.
    :return: The decoded JSON reply as a dict.
    :raises AssertionError: If the reply's ``message`` is not the success
        message, its ``queue`` is not a known Presto model, or its ``body`` is
        not a dict.
    """
    # NOTE(review): the meaning of the trailing False argument is defined by
    # Presto.send_request, which is not visible here.
    reply = Presto.send_request(
        app.config['PRESTO_HOST'],
        PRESTO_MODEL_MAP[modality],
        callback_url,
        task,
        False,
    )
    response = json.loads(reply.text)
    known_queues = PRESTO_MODEL_MAP.values()
    assert response["message"] == "Message pushed successfully", f"Bad response message for {modality}, {callback_url}, {task} - response was {response}"
    assert response["queue"] in known_queues, f"Unknown queue for {modality}, {callback_url}, {task} - response was {response}"
    assert isinstance(response["body"], dict), f"Bad body for {modality}, {callback_url}, {task} - response was {response}"
    return response

def requires_encoding(obj):
    """Report whether any requested model still lacks a stored result on ``obj``.

    A model counts as pending when it is listed under ``models`` and the
    matching ``model_<name>`` entry is missing or falsy. The ``"elasticsearch"``
    model is ignored: the callers in this module never send it out for
    encoding, so treating it as pending would leave them with nothing
    dispatched.

    :param obj: Document dict with an optional ``models`` list.
    :return: ``True`` if at least one non-elasticsearch model needs encoding,
        otherwise ``False`` (including when ``models`` is missing, ``None`` or empty).
    """
    return any(
        model_key != "elasticsearch" and not obj.get('model_' + model_key)
        for model_key in (obj.get("models") or [])
    )

def get_blocked_presto_response(task, model, modality):
    """Store ``task`` and, if needed, block until Presto has encoded it.

    A ``doc_id`` is generated when the task has none. Every listed model other
    than ``"elasticsearch"`` that lacks a ``model_<name>`` value is sent to
    Presto, and the call blocks on each response in turn.

    :param task: Task dict; mutated in place (``doc_id`` set, ``models`` popped
        when encoding is required, plus whatever ``get_object`` changes).
    :param model: Forwarded to ``get_object``.
    :param modality: Used to build the callback URL and passed to
        ``Presto.blocked_response``.
    :return: ``(obj, temporary, search_context, result)`` where ``result`` is the
        last blocked Presto response, or ``{"body": obj}`` when nothing had to
        be sent to Presto.
    """
    if task.get("doc_id") is None:
        task["doc_id"] = str(uuid.uuid4())
    obj, temporary = get_object(task, model)
    doc_id = obj["doc_id"]
    callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality)
    needs_encoding = requires_encoding(obj)
    app.logger.info(f"Object for {task} of model {model} with id of {doc_id} has requires_encoding value of {needs_encoding}")
    if not needs_encoding:
        return obj, temporary, get_context_for_search(task), {"body": obj}

    blocked_results = []
    for model_key in obj.pop("models", []):
        if model_key != "elasticsearch" and not obj.get('model_'+model_key):
            response = get_presto_request_response(model_key, callback_url, obj)
            # Warning: this is a blocking hold to wait until we get a response in
            # a redis key that we've received something from presto.
            blocked_results.append(Presto.blocked_response(response, modality))

    if not blocked_results:
        # Nothing was actually dispatched (e.g. only "elasticsearch" was listed),
        # so there is no Presto response to hand back; indexing [-1] would raise.
        return obj, temporary, get_context_for_search(task), {"body": obj}
    return obj, temporary, get_context_for_search(task), blocked_results[-1]

def get_async_presto_response(task, model, modality):
    """Store ``task`` and queue any outstanding encodings with Presto, without blocking.

    A ``doc_id`` is generated *before* the task is stored, matching
    ``get_blocked_presto_response``, so the stored document and the Presto
    requests share the same id. ``final_task`` is set to ``"search"`` on the
    task after it has been stored.

    :param task: Task dict; mutated in place (``doc_id``, ``final_task``,
        ``model``, plus whatever ``get_object`` changes).
    :param model: Forwarded to ``get_object``.
    :param modality: Used to build the callback URL.
    :return: ``(responses, True)`` with one Presto reply per dispatched model
        when a callback is pending, or
        ``({"message": "Already encoded - passing on to search"}, False)`` when
        nothing had to be sent to Presto.
    """
    # Routine tracing: logged at info level like the blocking sibling.
    app.logger.info(f"get_async_presto_response: {task} {model} {modality}")
    if task.get("doc_id") is None:
        task["doc_id"] = str(uuid.uuid4())
    obj, _ = get_object(task, model)
    callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality)
    task["final_task"] = "search"

    responses = []
    if requires_encoding(obj):
        for model_key in obj.get("models", []):
            if model_key != "elasticsearch" and not obj.get('model_'+model_key):
                task["model"] = model_key
                responses.append(get_presto_request_response(model_key, callback_url, task))

    if responses:
        return responses, True
    # Nothing was dispatched, so no callback will ever arrive; tell the caller
    # to proceed straight to search instead of waiting.
    return {"message": "Already encoded - passing on to search"}, False

def parse_task_search(task):
    """Extract the search body, threshold and limit from a task.

    Two shapes are accepted:

    * a Presto-callback shape, where the payload sits under ``task["body"]``
      and the search parameters under ``body["raw"]``; the body is mutated in
      place so that ``raw`` is always a dict, ``hash_value`` mirrors
      ``body["result"]["hash_value"]`` and ``context`` falls back to
      ``raw["context"]``;
    * a flat shape, where the task itself is the body.

    A ``raw`` or ``result`` value of ``None`` is treated the same as a missing
    key rather than raising.

    :param task: Task dict in either shape.
    :return: ``(body, threshold, limit)``; ``threshold`` defaults to ``0.0`` and
        ``limit`` to ``None``.
    """
    if "body" not in task:
        return task, task.get('threshold', 0.0), task.get("limit")

    body = task.get("body", {})
    # Normalise "raw" first so every later read can rely on it being a dict.
    raw = body.get("raw") or {}
    body["raw"] = raw
    threshold = raw.get('threshold', 0.0)
    limit = raw.get("limit")
    body["hash_value"] = (body.get("result") or {}).get("hash_value")
    body["context"] = body.get("context", raw.get("context"))
    return body, threshold, limit
12 changes: 9 additions & 3 deletions app/main/lib/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@

from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES
from app.main.lib.error_log import ErrorLog
#from app.main.lib.langid import Cld3LangidProvider as LangidProvider
from app.main.lib.langid import GoogleLangidProvider as LangidProvider
from app.main.lib.langid import HybridLangidProvider as LangidProvider

def get_all_documents_matching_context(context):
matches, clause_count = generate_matches(context)
Expand Down Expand Up @@ -106,7 +105,14 @@ def update_or_create_document(body, doc_id, index):
)
return result

def get_by_doc_id(doc_id):
    """Fetch one document from the similarity index and return its source.

    :param doc_id: Id of the document to fetch.
    :return: The ``_source`` field of the client's ``get`` response.
    """
    client = OpenSearch(app.config['ELASTICSEARCH_URL'])
    similarity_index = app.config['ELASTICSEARCH_SIMILARITY']
    hit = client.get(index=similarity_index, id=doc_id)
    return hit['_source']

def store_document(body, doc_id, language=None):
for field in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]:
body.pop(field, None)
indices = [app.config['ELASTICSEARCH_SIMILARITY']]
# 'auto' indicates we should try to guess the appropriate language
if language == 'auto':
Expand All @@ -124,7 +130,7 @@ def store_document(body, doc_id, language=None):
for index in indices:
index_result = update_or_create_document(body, doc_id, index)
results.append(index_result)
if index_result['result'] not in ['created', 'updated']:
if index_result['result'] not in ['created', 'updated', 'noop']:
app.logger.warning('Problem adding document to ES index for language {0}: {1}'.format(language, index_result))
result = results[0]
success = False
Expand Down
16 changes: 16 additions & 0 deletions app/main/lib/google_client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import json
from google.oauth2 import service_account
from flask import current_app as app

def get_credentialed_google_client(client):
default_values = {}
Expand All @@ -26,3 +27,18 @@ def get_credentialed_google_client(client):
except ValueError as e:
print(f"Couldn't authenticate to google client: {str(e)}")
return None
def convert_text_annotation_to_json(text_annotation):
    """Serialise a text annotation to a JSON string for logging.

    :param text_annotation: Object exposing ``description``, ``locale`` and
        ``bounding_poly.vertices`` (each vertex having ``x`` and ``y``).
    :return: JSON string with ``description``, ``locale`` and ``bounding_poly``
        (a list of ``{"x": ..., "y": ...}`` dicts), or ``None`` if anything
        goes wrong - the exception is logged, not raised.
    """
    try:
        payload = {
            'description': text_annotation.description,
            'locale': text_annotation.locale,
            'bounding_poly': [
                {'x': vertex.x, 'y': vertex.y}
                for vertex in text_annotation.bounding_poly.vertices
            ],
        }
        return json.dumps(payload)
    except Exception as e:
        # Best-effort helper: a serialisation failure must not break the caller.
        app.logger.exception(e)
        return None
16 changes: 16 additions & 0 deletions app/main/lib/helpers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
def merge_dict_lists(list1, list2):
    """
    Merge two lists of dictionaries, ensuring all unique dictionaries are present in the final result.

    Duplicates are dropped and the result keeps first-seen order (entries of
    ``list1`` first, then new entries from ``list2``), so the output is
    deterministic across runs.

    Limitations: dictionary keys must be mutually sortable, values must be
    hashable or lists of hashable items, and because lists are round-tripped
    through tuples, a tuple value comes back as a list.

    :param list1: First list of dictionaries.
    :param list2: Second list of dictionaries.
    :return: Merged list of unique dictionaries (new dict objects).
    """
    def to_hashable(d):
        # Sort the items so key order does not affect equality; freeze lists into tuples.
        return tuple((k, tuple(v) if isinstance(v, list) else v) for k, v in sorted(d.items()))

    def to_dict(t):
        return {k: list(v) if isinstance(v, tuple) else v for k, v in t}

    # dict.fromkeys de-duplicates while preserving insertion order, unlike set().
    unique = dict.fromkeys(to_hashable(d) for d in list1 + list2)
    return [to_dict(t) for t in unique]


def context_matches(query_context, item_context):
"""
Check a pair of contexts to determine if they match - first pass is
Expand Down
2 changes: 1 addition & 1 deletion app/main/lib/image_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def add_image(save_params):
db.session.rollback()
raise e

def callback_add(task):
def callback_add_image(task):
return media_crud.add(task, ImageModel, ["pdq", "phash"])[0]

def search_image(image, model, limit, threshold, task, hash_value, context, temporary):
Expand Down
82 changes: 78 additions & 4 deletions app/main/lib/langid.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# 3rd party langid providers
from google.cloud import translate_v2 as translate
from flask import current_app as app
import requests
import json

from google.cloud import translate_v2 as translate
# import requests # Used for MicrosoftLangidProvider
import cld3
import fasttext


from app.main.lib.google_client import get_credentialed_google_client

Expand All @@ -17,7 +21,8 @@ def langid(text):
'language': response[0]['language'],
'confidence': response[0]['confidence']
},
'raw': response
'raw': response,
'model': 'Google',
}

@staticmethod
Expand Down Expand Up @@ -68,10 +73,79 @@ def langid(text):
'language': prediction.language,
'confidence': prediction.probability
},
'raw': prediction
'raw': prediction,
'model': 'CLD3',
}

@staticmethod
def test():
cld3.get_language("Some text to check")
return True

class FastTextLangidProvider:
    """Language identification backed by the fastText ``lid.176`` model."""

    # https://fasttext.cc/docs/en/language-identification.html
    # Loaded once, when the class body is executed.
    fasttext_model = fasttext.load_model("extra/fasttext_language_id/lid.176.ftz")

    @staticmethod
    def langid(text):
        """Identify the language of ``text``.

        :param text: Text to classify. Newlines are replaced with spaces before
            prediction, because fastText's ``predict`` only accepts a single
            line and raises ``ValueError`` otherwise.
        :return: Dict with ``result`` (``language`` code and ``confidence``),
            ``raw`` (the prediction as a two-element list, probabilities
            converted to a plain list) and ``model`` (``'FastText'``).
        """
        single_line = text.replace("\n", " ")
        prediction = list(FastTextLangidProvider.fasttext_model.predict(single_line))
        # prediction is a list of tuples, e.g., [('__label__en',), array([0.22517213])]

        language = prediction[0][0].split("__")[-1]
        prediction[1] = prediction[1].tolist()

        # Use 'fil' for Filipino rather than tl for Tagalog
        if language == "tl":
            language = "fil"

        return {
            'result': {
                'language': language,
                'confidence': prediction[1][0]
            },
            'raw': prediction,
            'model': 'FastText',
        }

    @staticmethod
    def test():
        """Health check: run one prediction end to end; returns ``True`` on success."""
        # fastText models have no get_language() (that is the cld3 API), so
        # exercise the provider's own langid() path instead.
        FastTextLangidProvider.langid("Some text to check")
        return True

class HybridLangidProvider:
    """Language identification that tries the local models first, then Google.

    The FastText and CLD3 results are trusted only when both name the same
    language AND both report a confidence of at least 0.9. In every other case
    the text is sent to the Google provider.
    """

    @staticmethod
    def langid(text):
        """Identify the language of ``text``.

        :param text: Text to classify.
        :return: The result dict of whichever provider was chosen: the more
            confident of FastText/CLD3 when they agree with high confidence
            (CLD3 on a tie), otherwise the Google result.
        """
        fasttext_result = FastTextLangidProvider.langid(text)
        cld_result = Cld3LangidProvider.langid(text)

        fasttext_guess = fasttext_result['result']
        cld_guess = cld_result['result']
        min_confidence = min(fasttext_guess['confidence'], cld_guess['confidence'])
        same_language = fasttext_guess['language'] == cld_guess['language']

        if not (same_language and min_confidence >= 0.9):
            # Local models disagree, or at least one is below 0.9: defer to
            # Google and log all three results for later inspection.
            google_result = GoogleLangidProvider.langid(text)
            app.logger.info(json.dumps({
                'service':'LangId',
                'message': 'Called Google after inconclusive local results',
                'parameters':{'text':text, 'fasttext':fasttext_result, 'cld':cld_result, 'google':google_result},
            }))
            return google_result

        # Both local models agree with high confidence: return the more confident one.
        if fasttext_guess['confidence'] > cld_guess['confidence']:
            return fasttext_result
        return cld_result

    @staticmethod
    def test():
        """Health check: run one identification end to end; returns ``True`` on success."""
        HybridLangidProvider.langid("Some text to check")
        return True
Loading

0 comments on commit 8126264

Please sign in to comment.