diff --git a/.env_file.enc b/.env_file.enc index eaad44e2..5eea5307 100644 Binary files a/.env_file.enc and b/.env_file.enc differ diff --git a/.env_file.example b/.env_file.example index b63810a3..46d48f51 100644 --- a/.env_file.example +++ b/.env_file.example @@ -59,6 +59,10 @@ PROVIDER_IMAGE_CLASSIFICATION=google # AWS_ACCESS_KEY_ID= # AWS_SECRET_ACCESS_KEY= # AWS_SESSION_TOKEN= +S3_ENDPOINT=http://minio:9000 +AWS_DEFAULT_REGION=us-east-1 +AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE +AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY # Service host URLs diff --git a/.env_file.test b/.env_file.test index 1bc83068..291d6d46 100644 --- a/.env_file.test +++ b/.env_file.test @@ -7,7 +7,7 @@ DATABASE_HOST=postgres DATABASE_USER=postgres DATABASE_PASS=postgres S3_ENDPOINT=http://minio:9000 -AWS_DEFAULT_REGION=eu-west-1 +AWS_DEFAULT_REGION=us-east-1 AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY diff --git a/.github/workflows/ci-tests.yaml b/.github/workflows/ci-tests.yaml new file mode 100644 index 00000000..d6ed86e6 --- /dev/null +++ b/.github/workflows/ci-tests.yaml @@ -0,0 +1,177 @@ +name: Build and Run Alegre Tests + +on: + schedule: + - cron: '0 9 * * *' #Run daily at 9 UTC + push: + branches: + - master + - develop + + pull_request: + branches: + - develop + +env: + CC_TEST_REPORTER_ID: "${{ secrets.CC_TEST_REPORTER_ID }}" + +jobs: + unit-tests: + runs-on: + labels: alegre + steps: + - name: Set permissions for _work directory + run: | + sudo chown -R $USER:$USER $GITHUB_WORKSPACE + sudo chmod 755 $GITHUB_WORKSPACE + - uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: eu-west-1 + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Decrypt env + env: + DECRYPTION_PASSWORD: ${{ secrets.DECRYPTION_PASSWORD }} + run: | + openssl enc -aes-256-cbc -d -in .env_file.enc -out .env_file -k $DECRYPTION_PASSWORD + + - name: Decrypt Alegre credentials + env: + DECRYPTION_PASSWORD: ${{ secrets.DECRYPTION_PASSWORD }} + run: | + openssl aes-256-cbc -d -in google_credentials.json.enc -out google_credentials.json -k $DECRYPTION_PASSWORD + + - name: Install redis tools + run: | + sudo apt-get -y install redis-tools + + - name: Set up reporter + run: | + curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter + chmod +x ./cc-test-reporter + + - name: Before script + run: | + mkdir -p ~/.docker/cli-plugins/ && curl -SL https://github.com/docker/compose/releases/download/v2.30.1/docker-compose-linux-x86_64 -o ~/.docker/cli-plugins/docker-compose && chmod +x ~/.docker/cli-plugins/docker-compose && docker compose version + ./cc-test-reporter before-build + docker compose build + docker compose -f docker-compose.yml -f docker-test.yml up -d + docker compose logs -t -f & + echo "Waiting for Elasticsearch indexes..." && until curl --silent --fail -I "http://localhost:9200/alegre_similarity_test"; do sleep 1; done + until curl --silent --fail -I "http://localhost:3100"; do sleep 1; done + echo "Waiting for model servers..." + + - name: Run Unit Tests + id: unit-tests + run: | + docker compose exec alegre make test + + - name: Generate Coverage Report + if: ${{ github.event_name != 'pull_request' }} + run: | + docker compose exec alegre coverage xml + + - name: Upload Coverage Report + if: ${{ github.event_name != 'pull_request' }} + env: + CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }} + run: ./cc-test-reporter after-build -t coverage.py --exit-code $? + + - name: Cleanup Docker Resources + if: always() + run: | + echo "Cleaning up Docker resources..." + docker stop $(docker ps -q) + docker rm $(docker ps -aq) + docker rmi $(docker images -q) + docker volume rm $(docker volume ls -q) + + contract-testing: + needs: unit-tests + runs-on: + labels: alegre + steps: + - name: Set permissions for _work directory + run: | + sudo chown -R $USER:$USER $GITHUB_WORKSPACE + sudo chmod 755 $GITHUB_WORKSPACE + - uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: eu-west-1 + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Decrypt env + env: + DECRYPTION_PASSWORD: ${{ secrets.DECRYPTION_PASSWORD }} + run: | + openssl enc -aes-256-cbc -d -in .env_file.enc -out .env_file -k $DECRYPTION_PASSWORD + + - name: Decrypt Alegre credentials + env: + DECRYPTION_PASSWORD: ${{ secrets.DECRYPTION_PASSWORD }} + run: | + openssl aes-256-cbc -d -in google_credentials.json.enc -out google_credentials.json -k $DECRYPTION_PASSWORD + + - name: Install redis tools + run: | + sudo apt-get -y install redis-tools + + - name: Set up reporter + run: | + curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter + chmod +x ./cc-test-reporter + + - name: Before script + run: | + mkdir -p ~/.docker/cli-plugins/ && curl -SL https://github.com/docker/compose/releases/download/v2.30.1/docker-compose-linux-x86_64 -o ~/.docker/cli-plugins/docker-compose && chmod +x ~/.docker/cli-plugins/docker-compose && docker compose version + ./cc-test-reporter before-build + docker compose build + docker compose -f docker-compose.yml -f docker-test.yml up -d + docker compose logs -t -f & + echo "Waiting for Elasticsearch indexes..." && until curl --silent --fail -I "http://localhost:9200/alegre_similarity_test"; do sleep 1; done + until curl --silent --fail -I "http://localhost:3100"; do sleep 1; done + echo "Waiting for model servers..." + + - name: Run contract Tests + id: contract-tests + run: | + docker compose exec alegre make contract_testing + + - name: Cleanup Docker Resources + if: always() + run: | + echo "Cleaning up Docker resources..." + docker stop $(docker ps -q) + docker rm $(docker ps -aq) + docker rmi $(docker images -q) + docker volume rm $(docker volume ls -q) diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 54058018..00000000 --- a/.travis.yml +++ /dev/null @@ -1,40 +0,0 @@ -dist: jammy -before_install: -- openssl aes-256-cbc -K $encrypted_e34ab48306dd_key -iv $encrypted_e34ab48306dd_iv - -in .env_file.enc -out .env_file -d -- openssl aes-256-cbc -K $encrypted_126f44c7828e_key -iv $encrypted_126f44c7828e_iv - -in google_credentials.json.enc -out google_credentials.json -d -- sudo apt-get -y install redis-tools -- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter -- chmod +x ./cc-test-reporter -before_script: -- ./cc-test-reporter before-build -- echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin -- docker-compose build --pull -- docker-compose -f docker-compose.yml -f docker-test.yml up -d -- docker-compose logs -t -f & -- echo "Waiting for Elasticsearch indexes..." && until curl --silent --fail -I "http://localhost:9200/alegre_similarity_test"; do sleep 1; done -- until curl --silent --fail -I "http://localhost:3100"; do sleep 1; done -- echo "Waiting for model servers..." && while [[ ! '2' =~ $(redis-cli -n 1 SCARD 'SharedModel') ]]; do sleep 1; done -#comment until fix timeout curl: (28) Operation timed out -# - docker-compose exec alegre bash -c "curl --max-time 600.0 -OL https://raw.githubusercontent.com/meedan/check-api/develop/spec/pacts/check_api-alegre.json" -jobs: - include: - - stage: tests - name: unit-tests - script: docker-compose exec alegre make test - - stage: tests - name: contract-testing - script: docker-compose exec alegre make contract_testing -after_script: -- docker-compose exec alegre coverage xml -- if [[ "$TRAVIS_PULL_REQUEST" == "false" && "$TRAVIS_JOB_NAME" != "contract-testing" ]]; then ./cc-test-reporter after-build -t coverage.py -r $CC_TEST_REPORTER_ID --exit-code $TRAVIS_TEST_RESULT; fi -notifications: - slack: - secure: PIYgKOnKjbWv7inSq4Gu0BZS24Ipte3DVZid71dD1iTSHiRtwWUV8jmhva7kVfhoyNst7Hck5v3rp6nY//W0a/frT0mdn1f4j41NA8VaAeKa7MOWQFyvaXBNTnjJJ+FMkpHhbQWGAE7tTKBMWeJDWCisgtvPNQvCd3GtnMMyeTeuQTZarJojTyGxJ00ubLCpKQICkCVEwapnZpONJLJV9h9XuScVo+69h7vpc3jL79/zSWHgH5YPmUEYqLoXArZUtZtsIxunp3ftBqRKoxRHJuUuVDfJ8skNradWoXLEA5Qf/sxxekh13FcYEXMIY37VOQgPJpF8cL0MWwANGrphTdrH2J4vsnoiJK0lZ8NMOIbnBXKhaMtF+PoBjWNgI+8y5xp5jYFV/sQkeqz1ZS3/3KsYFfUeXsy3gPjsASnV6WSk2EstWF4LZopjwqeTssF7nfpOUEf+KW1nCZrnvEt7Mc+Rat+TtTYXFjea5OXednBhAZf0woiRSMw7rlxp9KuxTZDgsbNvs/FoFpJMi7Rmw5+yVrshxxu1lO7hzdWw/8LzRvi6wWWF9mLzbxq3wPKOq/NHIjQrnp8M0o0ESgibQ/pyAU25mcjqNFgCyRLwPjV4s2Q4D/eESn3Vi6A4cvs/Oy4yHWDijm8QilqyrUR67M8NOip92X9FZT/5/BOfEoM= -env: - global: - - secure: RXJQ6+ke7GYXOCF8REbUOElUTY6ZsNqxuyfSCkFXNw5RE/XqTHp4QfOBzLWM7CdpXtfHY0LN+1DemufxdY3O71dMDWraIxIMssd5LXnZ+jF0HLq2lwFxAOCjDYh4xAkoA6l2VnlKVDSLhB78M/fjrLOcbTf8uEWMiQN/ZzB7v/GukS65Bp/Xn53/V77JPAKIrez9BulKDogoMi9NYnDDPzKxK0oOW8UV74vIfe2UcVajfMIxUDloaMcdjffVwYkbd/gPE2+Kkuj8LLclJvDiuMCBby2vWpOzZp+I+D7WSd088rJq80yhbBAaKAiOKF6g70nh7VgmSERTvsBn0AOW6oV9hD0Jrnq0sKsKIohmjcKYtJ39IDYIe0tWnWw8mOG6JVFflUQ1rNXRMSxFolrYNCWjUQxBVEap5gln35JTP/oSJ/OxkBwnc5AdFRAlqzayY5tjHh5nPAHME9NHZZoCD1yVuHLw44uFI3Azse+UMH1mauzNde/PVw3jQxObSXtrlnahqBel7Fq0/MHlyMPUjScyULuhntLlYKiTNxBZ7mlUTGbT8Ch7TFQL7LEy832TyFr3b4CjPCBEb0LyJ0HlSfNo50Jv9tmVHGxAC6NbRTnBzop8q0WJPg6mm4iUAxBsBWLbAfJmC5GALoL1S5W3z0Wxu3i+10F7I/3kewB4/4s= - - secure: rRike8e8nlVup6LtIqaKJLMtoyv2lRbY+x5nN5qoGWEND1kmnioNtkLmWonLqXw9PrBviCZN1fedjCXHRjkvlAYakkw1Hb7fYvp57L/2kJ7jZj5xyMHxhL2kwL7BzYi22KFLBvjJEFPMSJbbeJ5NPo31SOqmCSTjGvp3xkMQ71Wl+XB3RUsMCbkrHde2FW6plR0yL2AWnhrChXRbsAbOg/riVGyWonkFwBl/47L0C8mIxWnGn9UySvxGbJbsJXMia+1MRMewWWMoWhbtTZLsAt+ETH872H8zZy7Hj4/2V66ETdp1tzLAxmkXA0JLHpCKmoyU5htEBMLN3LXZegpXinYI4+pmPkbbg0x/1xUBEi8yh8u7fChkgeVRi9mEJvwCHBQ731LWuFxfGVnAhECMWFkzhuU76cjobea+c2eya1XBtPojwJbZGzUqJKHgzCVpGsoUEQvgbVhKvhlm2TI6NJhXMEpyzODXYauIEDSg+Ka6kXFhsovCoRBPfGSXXs3mox6FyqyJxIlsOTUwxqNE40/DfKgT2eFbG3kzTS0PFipojRreeK5MHE62BPkx1qup0d5+Wb3ZNgw4LnZcf2GlavRvB/0BKbdJzhDHZC14snL7I4K3FWZ6qPzfdiqSHiNY3smnWQHsyQTb4lHBOyvsaJVRpLAaF5ULB2peLDVbOxQ= - - secure: OAoROQYcmsxfKxbN1q0kSYOEp2ZLSvQa+lBzPWPClflV6f4xvuzRY1Nc54HL3NZz7rdNC2oKFu8oihbCpx+B4c3YViXizPHOn2wUueKZZBRGwlVnpxE+dDgIBGEL7mtwY9fj7mCF7K2alAw2rN6esQwOMujA8W8//qmfm+GcJKhOEAen2w3cD51H2L6aYqxyjQnHFa+oDDrpo710psvoJ92ectY4L+vLYCDHaHx9XwDC4M8IZCojSworHJjMOrM9eKCFjDoLaNmJ/07L6b5/vCOnBw77chaP+OXNwnv0zyE8mAkfl2E+tMRyydSdv/r2uwZBfOAtA1QiMKI1YMO1U319S4CTQE90VPAx8pT6efOpCRNxntoQsuYygRzXqI2gEYBArRPTCw+uyqHsO5U/QB6MZPaPqG1vLpHomTzMLOJM5OAQ3rxsBeE0PokGWIl3JNSnoxFrMHPAwXP9N9eKH7Bq8SslAIhwiAXwp9Lrhy3in+aBjLmrQi0wJi8mOPPjabVktB1fSYkedJ3vC9LNmtfQ/ITk4zfVDrDEnWjemEjZp3cAib0CYscTuU2cbu4JNf2edPdNMXI59W619vll/k6Gurmg/MSVv56X+h05XasxG0yUEhx0TIthVoBV71TTr1nRd8oFTYphOc+FFE08elFlfCyKSa+fJub01hZNGpM= - - secure: qu8gJdRfW9GX0tKhNiAhoG3gGSVTRA6TBnnBw1DyW9ejkLBhUI85Mq+WXyuLpCz/o4vf9a7dq89XztKjj7NPJi/R6Hyjx/FsitRVfdZWfEMmIN2cJkGzjg1N5k4E+Xn5Zw9MvcBl14YRBKQ1ePErsnTo18QeasFylpGr8cxyilmnDmqv3oTEs3Ms1pUHt6XkMj1AH6lvAnbHUcdUpjsVdwIDTLXKkpZu8GeaAn6nfFGtn9xCcCkE6/hLwlHMSmx7AcD8rO/ocSe6v82iX4MvU1ZJOc2yDLcIJH4ijNlpKx2UX7IYw14bjvrUZPF6ADP9vyY1w+UF6OMLzohjTJHCxGfwRhLgsky6o2xeGbRLpF1dWj0WTBX4LSRV5vTZFApIJsnps2hU0P9gdjcQOY3qWCOpmPu4cuUtI+s/SC5Ot93CmD9Jmlb7rL++KUtBnMnPsFifsFc+TfFdhPZjjsDcXlyYn15oCXrZlbd6h+NZwFARRti1981vMSQRvi9h+3nfgyehnVN2hVt3SnVcEJulKHOn2Bmnt8OjSZyADuS9as7+8xs8fRTNuej1h5agU5Ay7HhsYr28AIAMCy1hJW0Q7ln4wHUuqYNck+fpdFnPj33GfutP8uwOrHRlhhORhsscOkq1bacwcx0VCg/vYvXBjQM2s6ia/2vkvGnqbJRe0dI= diff --git a/app/main/controller/audio_similarity_controller.py b/app/main/controller/audio_similarity_controller.py index 1eb5abc9..dcd0a1a2 100644 --- a/app/main/controller/audio_similarity_controller.py +++ b/app/main/controller/audio_similarity_controller.py @@ -33,5 +33,4 @@ class AudioSimilaritySearchResource(Resource): @api.doc(params={'url': 'audio URL to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'context': 'context'} ) def post(self): args = request.json - app.logger.debug(f"Args are {args}") return jsonify({"message": "This endpoint is not implemented."}), 501 diff --git a/app/main/controller/image_ocr_controller.py b/app/main/controller/image_ocr_controller.py index 83f0cfdf..63d6e97a 100644 --- a/app/main/controller/image_ocr_controller.py +++ b/app/main/controller/image_ocr_controller.py @@ -18,6 +18,32 @@ def _after_log(retry_state): CLIENT = get_credentialed_google_client(vision.ImageAnnotatorClient) @api.route('/') class ImageOcrResource(Resource): + @staticmethod + def polygon_area(vertices): + area = 0 + for i in range(len(vertices)): + x1, y1 = vertices[i] + x2, y2 = vertices[(i + 1) % len(vertices)] + area += (x1 * y2 - x2 * y1) + return abs(area) / 2 + + @staticmethod + def calculate_text_percentage(response): + bounds = [] + for page in response.full_text_annotation.pages: + for block in page.blocks: + bounds.append(block.bounding_box) + total_text_area = 0 + for annotation in bounds: + vertices = [(v.x, v.y) for v in annotation.vertices] + area = ImageOcrResource.polygon_area(vertices) + total_text_area += area + # response object contains the whole image width and height in response.full_text_annotation.pages[0] + # as we are sending images, response.full_text_annotation.pages is always 1 page only + image_area = response.full_text_annotation.pages[0].width * response.full_text_annotation.pages[0].height + text_percentage = (total_text_area / image_area) * 100 + return text_percentage + @api.response(200, 'text successfully extracted.') @api.doc('Perform text extraction from an image') @api.doc(params={'url': 'url of image to extract text from'}) @@ -37,8 +63,13 @@ def post(self): if not texts: return - app.logger.info( - f"[Alegre OCR] [image_uri {image.source.image_uri}] Image OCR response package looks like {convert_text_annotation_to_json(texts[0])}") + #### calculate bounding boxes areas. + try: + text_percentage = ImageOcrResource.calculate_text_percentage(response) + app.logger.info( + f"[Alegre OCR] [image_uri {image.source.image_uri}] [percentage of image area covered by text {text_percentage}%] Image OCR response package looks like {convert_text_annotation_to_json(texts[0])}") + except Exception as caught_exception: + app.logger.error(f"[image_uri {image.source.image_uri}] Error calculating percentage of image area covered by text. Error was {caught_exception}. Image OCR response package looks like {convert_text_annotation_to_json(texts[0])}") return { 'text': texts[0].description diff --git a/app/main/controller/presto_controller.py b/app/main/controller/presto_controller.py index eeaa0800..b5fb2c61 100644 --- a/app/main/controller/presto_controller.py +++ b/app/main/controller/presto_controller.py @@ -25,10 +25,9 @@ class PrestoResource(Resource): def post(self, action, model_type): data = request.json item_id = data.get("body", {}).get("id") - app.logger.info(f"PrestoResource {action}/{model_type}") + app.logger.info(f"PrestoResource {action}/{model_type}, data is {data}") return_value = None if action == "add_item": - app.logger.info(f"Data looks like {data}") result = similarity.callback_add_item(data.get("body"), model_type) if data.get("body", {}).get("raw", {}).get("suppress_response"): # requested not to reply to caller with similarity response, so suppress it @@ -40,11 +39,11 @@ def post(self, action, model_type): if result: result["is_search_result_callback"] = True callback_url = data.get("body", {}).get("raw", {}).get("callback_url", app.config['CHECK_API_HOST']) or app.config['CHECK_API_HOST'] - if result and data.get("body", {}).get("raw", {}).get("requires_callback"): + if result and result.get("results") is not None and data.get("body", {}).get("raw", {}).get("requires_callback"): app.logger.info(f"Sending callback to {callback_url} for {action} for model of {model_type} with body of {result}") Webhook.return_webhook(callback_url, action, model_type, result) return_value = {"action": action, "model_type": model_type, "data": result} - app.logger.info(f"PrestoResource value is {return_value}") + app.logger.info(f"PrestoResource {action}/{model_type}, data is {data}, return_value is {return_value}") r = redis_client.get_client() r.lpush(f"{model_type}_{item_id}", json.dumps(data)) r.expire(f"{model_type}_{item_id}", 60*60*24) diff --git a/app/main/controller/similarity_async_controller.py b/app/main/controller/similarity_async_controller.py index c9f21dd7..a0cfdd5c 100644 --- a/app/main/controller/similarity_async_controller.py +++ b/app/main/controller/similarity_async_controller.py @@ -27,7 +27,7 @@ class AsyncSimilarityResource(Resource): @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) def post(self, similarity_type): args = request.json - app.logger.debug(f"Args are {args}") + app.logger.info(f"[AsyncSimilarityResource] Starting Request - args are {args}, similarity_type is {similarity_type}") if similarity_type == "text": package = similarity.get_body_for_text_document(args, 'query') else: @@ -42,4 +42,5 @@ def post(self, similarity_type): result["is_shortcircuited_search_result_callback"] = True callback_url = args.get("callback_url", app.config['CHECK_API_HOST']) or app.config['CHECK_API_HOST'] Webhook.return_webhook(callback_url, "search", similarity_type, result) + app.logger.info(f"[AsyncSimilarityResource] Completing Request - args are {args}, similarity_type is {similarity_type}, reponse is {response}") return response diff --git a/app/main/controller/similarity_controller.py b/app/main/controller/similarity_controller.py index c4872911..35a369b5 100644 --- a/app/main/controller/similarity_controller.py +++ b/app/main/controller/similarity_controller.py @@ -46,5 +46,7 @@ class SimilaritySearchResource(Resource): @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) def post(self): args = request.json - app.logger.debug(f"Args are {args}") - return similarity.get_similar_items(similarity.get_body_for_text_document(args, mode='query'), "text") + app.logger.info(f"[SimilaritySearchResource] Args are {args}") + response = similarity.get_similar_items(similarity.get_body_for_text_document(args, mode='query'), "text") + app.logger.info(f"[SimilaritySearchResource] Args are {args}, response is {response}") + return response diff --git a/app/main/controller/similarity_sync_controller.py b/app/main/controller/similarity_sync_controller.py index f691f76c..2c622c49 100644 --- a/app/main/controller/similarity_sync_controller.py +++ b/app/main/controller/similarity_sync_controller.py @@ -23,10 +23,12 @@ class SyncSimilarityResource(Resource): @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) def post(self, similarity_type): args = request.json - app.logger.debug(f"Args are {args}") + app.logger.info(f"[SyncSimilarityResource] Starting Request - args are {args}, similarity_type is {similarity_type}") if similarity_type == "text": package = similarity.get_body_for_text_document(args, 'query') - return similarity.get_similar_items(package, similarity_type) + response = similarity.blocking_get_similar_items(package, similarity_type) else: package = similarity.get_body_for_media_document(args, 'query') - return similarity.blocking_get_similar_items(package, similarity_type) + response = similarity.blocking_get_similar_items(package, similarity_type) + app.logger.info(f"[SyncSimilarityResource] Completing Request - args are {args}, similarity_type is {similarity_type}, reponse is {response}") + return response diff --git a/app/main/controller/video_similarity_controller.py b/app/main/controller/video_similarity_controller.py index c1baa757..1504a2c9 100644 --- a/app/main/controller/video_similarity_controller.py +++ b/app/main/controller/video_similarity_controller.py @@ -33,5 +33,4 @@ class VideoSimilaritySearchResource(Resource): @api.doc(params={'url': 'video URL to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'context': 'context'} ) def post(self): args = request.json - app.logger.debug(f"Args are {args}") return jsonify({"message": "This endpoint is not implemented."}), 501 diff --git a/app/main/lib/elastic_crud.py b/app/main/lib/elastic_crud.py index fef79a05..e8732d3b 100644 --- a/app/main/lib/elastic_crud.py +++ b/app/main/lib/elastic_crud.py @@ -42,7 +42,7 @@ def get_presto_request_response(modality, callback_url, task): def requires_encoding(obj): for model_key in obj.get("models", []): - if not obj.get('model_'+model_key): + if model_key != "elasticsearch" and not obj.get('model_'+model_key): return True return False @@ -52,21 +52,19 @@ def get_blocked_presto_response(task, model, modality): obj, temporary = get_object(task, model) doc_id = obj["doc_id"] callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality) - app.logger.info(f"Object for {task} of model {model} with id of {doc_id} has requires_encoding value of {requires_encoding(obj)}") if requires_encoding(obj): blocked_results = [] - for model_key in obj.pop("models", []): + for model_key in obj.get("models", []): if model_key != "elasticsearch" and not obj.get('model_'+model_key): response = get_presto_request_response(model_key, callback_url, obj) - blocked_results.append(Presto.blocked_response(response, modality)) + blocked_results.append({"model": model_key, "response": Presto.blocked_response(response, modality)}) # Warning: this is a blocking hold to wait until we get a response in # a redis key that we've received something from presto. - return obj, temporary, get_context_for_search(task), blocked_results[-1] + return obj, temporary, get_context_for_search(task), blocked_results else: return obj, temporary, get_context_for_search(task), {"body": obj} def get_async_presto_response(task, model, modality): - app.logger.error(f"get_async_presto_response: {task} {model} {modality}") obj, _ = get_object(task, model) callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality) if task.get("doc_id") is None: diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/elasticsearch.py index 6ea51f9c..c35847af 100644 --- a/app/main/lib/elasticsearch.py +++ b/app/main/lib/elasticsearch.py @@ -44,19 +44,24 @@ def get_all_documents_matching_context(context): return [] def generate_matches(context): + """ + If the keys are not project_media_id, has_custom_id, or field, return ANDs for each field, + with ORs for intra-key values (e.g. foo = bar AND baz = (blah|bat)) + """ matches = [] clause_count = 0 for key in context: - if isinstance(context[key], list): - clause_count += len(context[key]) - matches.append({ - 'query_string': { 'query': str.join(" OR ", [f"context.{key}: {v}" for v in context[key]])} - }) - else: - clause_count += 1 - matches.append({ - 'match': { 'context.' + key: context[key] } - }) + if key not in ["project_media_id", "has_custom_id", "field"]: + if isinstance(context[key], list): + clause_count += len(context[key]) + matches.append({ + 'query_string': { 'query': str.join(" OR ", [f"context.{key}: {v}" for v in context[key]])} + }) + else: + clause_count += 1 + matches.append({ + 'match': { 'context.' + key: context[key] } + }) return matches, clause_count def truncate_query(query, clause_count): @@ -83,7 +88,6 @@ def update_or_create_document(body, doc_id, index): found_doc = None if found_doc: body = {"doc": merge_contexts(body, found_doc)} - app.logger.info(f"Sending OpenSearch update: {body}") result = es.update( id=doc_id, body=body, @@ -91,14 +95,12 @@ def update_or_create_document(body, doc_id, index): retry_on_conflict=3 ) else: - app.logger.info(f"Sending OpenSearch store: {body}") result = es.index( id=doc_id, body=body, index=index ) else: - app.logger.info(f"Sending OpenSearch store without id: {body}") result = es.index( body=body, index=index @@ -111,12 +113,14 @@ def get_by_doc_id(doc_id): return response['_source'] def store_document(body, doc_id, language=None): - for field in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]: - body.pop(field, None) + storable_doc = {} + for k, v in body.items(): + if k not in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]: + storable_doc[k] = v indices = [app.config['ELASTICSEARCH_SIMILARITY']] # 'auto' indicates we should try to guess the appropriate language if language == 'auto': - text = body['content'] + text = storable_doc['content'] language = LangidProvider.langid(text)['result']['language'] if language not in SUPPORTED_LANGUAGES: app.logger.warning('Detected language {} is not supported'.format(language)) @@ -128,7 +132,7 @@ def store_document(body, doc_id, language=None): results = [] for index in indices: - index_result = update_or_create_document(body, doc_id, index) + index_result = update_or_create_document(storable_doc, doc_id, index) results.append(index_result) if index_result['result'] not in ['created', 'updated', 'noop']: app.logger.warning('Problem adding document to ES index for language {0}: {1}'.format(language, index_result)) diff --git a/app/main/lib/image_similarity.py b/app/main/lib/image_similarity.py index fc756ff6..a5050c3d 100644 --- a/app/main/lib/image_similarity.py +++ b/app/main/lib/image_similarity.py @@ -68,11 +68,9 @@ def callback_add_image(task): def search_image(image, model, limit, threshold, task, hash_value, context, temporary): if image: if model and model.lower() == "pdq": - app.logger.info(f"Searching with PDQ.") image.pdq = hash_value result = search_by_pdq(image.pdq, threshold, context, limit) else: - app.logger.info(f"Searching with phash.") image.phash = hash_value result = search_by_phash(image.phash, threshold, context, limit) if temporary: diff --git a/app/main/lib/langid.py b/app/main/lib/langid.py index dcb7dc8c..b6f5417e 100644 --- a/app/main/lib/langid.py +++ b/app/main/lib/langid.py @@ -70,8 +70,8 @@ def langid(text): prediction = cld3.get_language(text) return { 'result': { - 'language': prediction.language, - 'confidence': prediction.probability + 'language': prediction and prediction.language, + 'confidence': prediction and prediction.probability }, 'raw': prediction, 'model': 'CLD3', diff --git a/app/main/lib/media_crud.py b/app/main/lib/media_crud.py index 3f409e46..74360ccb 100644 --- a/app/main/lib/media_crud.py +++ b/app/main/lib/media_crud.py @@ -144,7 +144,6 @@ def get_blocked_presto_response(task, model, modality): callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality) if task.get("doc_id") is None: task["doc_id"] = str(uuid.uuid4()) - app.logger.error(f"Object for {task} of model {model} with id of {obj.id} has requires_encoding value of {obj.requires_encoding}") if obj.requires_encoding: response = get_presto_request_response(modality, callback_url, task) # Warning: this is a blocking hold to wait until we get a response in diff --git a/app/main/lib/openai.py b/app/main/lib/openai.py index a0e03e1d..84e168bf 100644 --- a/app/main/lib/openai.py +++ b/app/main/lib/openai.py @@ -16,8 +16,8 @@ def retrieve_openai_embeddings(text, model_key): if val_from_cache is not None: return pickle.loads(val_from_cache) openai.api_key = app.config['OPENAI_API_KEY'] - app.logger.info(f"Calling OpenAI API") model_key_without_openai_prefix = model_key[len(PREFIX_OPENAI):] + app.logger.info(f"Calling OpenAI API with '{text}' and engine of {model_key_without_openai_prefix}") try: embeddings = openai.embeddings_utils.get_embedding(text, engine=model_key_without_openai_prefix) r_cache.set(key, pickle.dumps(embeddings)) diff --git a/app/main/lib/presto.py b/app/main/lib/presto.py index cef3ba88..db26840b 100644 --- a/app/main/lib/presto.py +++ b/app/main/lib/presto.py @@ -15,6 +15,7 @@ "xlm-r-bert-base-nli-stsb-mean-tokens": "mean_tokens__Model", "indian-sbert": "indian_sbert__Model", "paraphrase-filipino-mpnet-base-v2": "fptg__Model", + "paraphrase-multilingual-mpnet-base-v2": "paraphrase_multilingual__Model" } PRESTO_RESPONSE_TIMEOUT = os.getenv('PRESTO_RESPONSE_TIMEOUT', 120) diff --git a/app/main/lib/shared_models/video_model.py b/app/main/lib/shared_models/video_model.py index 983c31a4..ffeb6b8d 100644 --- a/app/main/lib/shared_models/video_model.py +++ b/app/main/lib/shared_models/video_model.py @@ -79,7 +79,6 @@ def load(self): pathlib.Path(self.directory).mkdir(parents=True, exist_ok=True) def respond(self, task): - app.logger.info('Received task that looks like: '+str(json.dumps(task))) if task["command"] == "delete": return self.delete(task) elif task["command"] == "add": diff --git a/app/main/lib/similarity.py b/app/main/lib/similarity.py index 473f9e1c..65e009ad 100644 --- a/app/main/lib/similarity.py +++ b/app/main/lib/similarity.py @@ -6,7 +6,7 @@ from app.main.lib.shared_models.video_model import VideoModel from app.main.lib.presto import Presto, PRESTO_MODEL_MAP from app.main.lib.image_similarity import add_image, callback_add_image, delete_image, blocking_search_image, async_search_image, async_search_image_on_callback -from app.main.lib.text_similarity import add_text, async_search_text, async_search_text_on_callback, callback_add_text, delete_text, search_text +from app.main.lib.text_similarity import add_text, async_search_text, async_search_text_on_callback, callback_add_text, delete_text, search_text, sync_search_text DEFAULT_SEARCH_LIMIT = 200 logging.basicConfig(level=logging.INFO) def get_body_for_media_document(params, mode): @@ -17,7 +17,7 @@ def get_body_for_media_document(params, mode): with some reformating. If we are storing, we remove unexpected items in `params` in order to avoid things being stored in OpenSearch unintentionally """ - app.logger.info( + app.logger.debug( f"[Alegre Similarity] get_body_for_text_document (mode={mode}):params (start) {params}") if 'created_at' not in params: params['created_at'] = datetime.now() @@ -34,7 +34,7 @@ def get_body_for_text_document(params, mode): with some reformating. If we are storing, we remove unexpected items in `params` in order to avoid things being stored in OpenSearch unintentionally """ - app.logger.info( + app.logger.debug( f"[Alegre Similarity] get_body_for_text_document (mode={mode}):params (start) {params}") # Combine model and models @@ -66,11 +66,11 @@ def get_body_for_text_document(params, mode): if mode == 'store': allow_list = set(['language', 'content', 'created_at', 'models', 'context', 'callback_url', 'content_hash']) keys_to_remove = params.keys() - allow_list - app.logger.info( + app.logger.debug( f"[Alegre Similarity] get_body_for_text_document:running in `store' mode. Removing {keys_to_remove}") for key in keys_to_remove: del params[key] - app.logger.info( + app.logger.debug( f"[Alegre Similarity] get_body_for_text_document (mode={mode}):params (end) {params}") return params @@ -99,11 +99,11 @@ def model_response_package(item, command): for optional_key in ["folder", "filepath"]: if optional_key in item.keys(): response_package[optional_key] = item[optional_key] - app.logger.info(f"[Alegre Similarity] [Item {item}, Command {command}] Response package looks like {response_package}") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Command {command}] Response package looks like {response_package}") return response_package def add_item(item, similarity_type): - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] Adding item") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] Adding item") callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], similarity_type) if similarity_type == "audio": response = Presto.send_request(app.config['PRESTO_HOST'], PRESTO_MODEL_MAP[similarity_type], callback_url, model_response_package(item, "add")).text @@ -117,7 +117,7 @@ def add_item(item, similarity_type): doc_id = item.pop("doc_id", None) language = item.pop("language", None) response = add_text(item, doc_id, language) - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for add was {response}") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for add was {response}") return response def callback_add_item(item, similarity_type): @@ -131,8 +131,8 @@ def callback_add_item(item, similarity_type): elif similarity_type == "text": function = callback_add_text if function: - response = function(item) - app.logger.info(f"[Alegre Similarity] CallbackAddItem: [Item {item}, Similarity type: {similarity_type}] Response looks like {response}") + response = {"item": function(item)} + app.logger.debug(f"[Alegre Similarity] CallbackAddItem: [Item {item}, Similarity type: {similarity_type}] Response looks like {response}") return response else: app.logger.warning(f"[Alegre Similarity] InvalidCallbackAddItem: [Item {item}, Similarity type: {similarity_type}] No response") @@ -148,25 +148,22 @@ def merge_audio_and_video_responses(video_response, audio_response): def callback_search_item(item, similarity_type): if similarity_type == "audio": response = audio_model().search(model_response_package(item.get("raw"), "search")) - app.logger.info(f"[Alegre Similarity] CallbackSearchItem: [Item {item}, Similarity type: {similarity_type}] Response looks like {response}") elif similarity_type == "video": response = video_model().search(model_response_package(item.get("raw"), "search")) # When we search for a video, we need to also search for the audio track of the video against our audio library in case it matches other audio clips. # audio_response = audio_model().search(video_model().overload_context_to_denote_content_type(model_response_package(item.get("raw"), "search"))) # response = merge_audio_and_video_responses(video_response, audio_response) - app.logger.info(f"[Alegre Similarity] CallbackSearchItem: [Item {item}, Similarity type: {similarity_type}] Response looks like {response}") elif similarity_type == "image": response = async_search_image_on_callback(item) - app.logger.info(f"[Alegre Similarity] CallbackSearchItem: [Item {item}, Similarity type: {similarity_type}] Response looks like {response}") elif similarity_type == "text": response = async_search_text_on_callback(item) - app.logger.info(f"[Alegre Similarity] CallbackSearchItem: [Item {item}, Similarity type: {similarity_type}] Response looks like {response}") else: app.logger.warning(f"[Alegre Similarity] InvalidCallbackSearchItem: [Item {item}, Similarity type: {similarity_type}] No response") + app.logger.debug(f"[Alegre Similarity] CallbackSearchItem: [Item {item}, Similarity type: {similarity_type}] Response looks like {response}") return {"item": item, "results": response} def delete_item(item, similarity_type): - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] Deleting item") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] Deleting item") if similarity_type == "audio": response = audio_model().delete(model_response_package(item, "delete")) elif similarity_type == "video": @@ -175,55 +172,47 @@ def delete_item(item, similarity_type): response = delete_image(item) elif similarity_type == "text": response = delete_text(item.get("doc_id"), item.get("context", {}), item.get("quiet", False)) - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for delete was {response}") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for delete was {response}") return response def get_similar_items(item, similarity_type): - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] searching on item") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] searching on item") response = None if similarity_type == "text": response = search_text(item) - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") return response def blocking_get_similar_items(item, similarity_type): - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] searching on item") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] searching on item") if similarity_type == "audio": response = audio_model().blocking_search(model_response_package(item, "search"), "audio") - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") - return response elif similarity_type == "image": response = blocking_search_image(item) - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") - return response elif similarity_type == "video": response = video_model().blocking_search(model_response_package(item, "search"), "video") - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") - return response + elif similarity_type == "text": + response = sync_search_text(item, "text") else: raise Exception(f"{similarity_type} modality not implemented for blocking requests!") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") + return response def async_get_similar_items(item, similarity_type): - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] searching on item") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] searching on item") if similarity_type == "audio": response, waiting_for_callback = audio_model().async_search(model_response_package(item, "search"), "audio") - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") - return response, waiting_for_callback elif similarity_type == "video": response, waiting_for_callback = video_model().async_search(model_response_package(item, "search"), "video") # Searching with an audio_model() call here is intentional - we need to encode the audio # track for all videos to see if we can match them across modes (i.e. this MP3 matches # this video's audio track, so they are able to be matched) # _, waiting_for_audio_callback = audio_model().async_search(video_model().overload_context_to_denote_content_type(model_response_package(item, "search")), "audio") - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") - return response, waiting_for_callback# or waiting_for_audio_callback elif similarity_type == "image": response, waiting_for_callback = async_search_image(item, "image") - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") - return response, waiting_for_callback elif similarity_type == "text": response, waiting_for_callback = async_search_text(item, "text") - app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") - return response, waiting_for_callback else: - raise Exception(f"{similarity_type} modality not implemented for async requests!") \ No newline at end of file + raise Exception(f"{similarity_type} modality not implemented for async requests!") + app.logger.debug(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") + return response, waiting_for_callback# or waiting_for_audio_callback diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index 0185840b..ff37f6ce 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -34,18 +34,28 @@ def get_document_body(body): def async_search_text(task, modality): return elastic_crud.get_async_presto_response(task, "text", modality) +def sync_search_text(task, modality): + obj, temporary, context, presto_result = elastic_crud.get_blocked_presto_response(task, "text", modality) + obj["models"] = ["elasticsearch"] + if isinstance(presto_result, list): + for presto_vector_result in presto_result: + obj['vector_'+presto_vector_result["model"]] = presto_vector_result["response"]["body"]["result"] + obj['model_'+presto_vector_result["model"]] = 1 + obj["models"].append(presto_vector_result["model"]) + document, _ = elastic_crud.get_object(obj, "text") + return search_text(document, True) + def fill_in_openai_embeddings(document): - for model_key in document.pop("models", []): + for model_key in document.get("models", []): if model_key != "elasticsearch" and model_key[:len(PREFIX_OPENAI)] == PREFIX_OPENAI: document['vector_'+model_key] = retrieve_openai_embeddings(document['content'], model_key) document['model_'+model_key] = 1 store_document(document, document["doc_id"], document["language"]) def async_search_text_on_callback(task): - app.logger.info(f"async_search_text_on_callback(task) is {task}") - document = elastic_crud.get_object_by_doc_id(task["id"]) + doc_id = task.get("raw", {}).get("doc_id") + document = elastic_crud.get_object_by_doc_id(doc_id) fill_in_openai_embeddings(document) - app.logger.info(f"async_search_text_on_callback(task) document is {document}") if not elastic_crud.requires_encoding(document): return search_text(document, True) return None @@ -69,13 +79,12 @@ def add_text(body, doc_id, language=None): def search_text(search_params, use_document_vectors=False): vector_for_search = None - app.logger.info(f"[Alegre Similarity]search_params are {search_params}") results = {"result": []} for model_key in search_params.pop("models", []): if model_key != "elasticsearch": search_params.pop("model", None) if use_document_vectors: - vector_for_search = search_params[model_key+"-tokens"] + vector_for_search = search_params["vector_"+model_key] else: vector_for_search = None result = search_text_by_model(dict(**search_params, **{'model': model_key}), vector_for_search) @@ -97,7 +106,7 @@ def get_model_and_threshold(search_params): if 'per_model_threshold' in search_params and isinstance(search_params['per_model_threshold'], list) and [e for e in search_params['per_model_threshold'] if e["model"] == model_key]: threshold = [e for e in search_params['per_model_threshold'] if e["model"] == model_key][0]["value"] if threshold is None: - app.logger.error( + app.logger.warn( f"[Alegre Similarity] get_model_and_threshold - no threshold was specified, backing down to default of 0.9 - search_params is {search_params}") threshold = 0.9 return model_key, threshold @@ -174,6 +183,16 @@ def insert_model_into_response(hits, model_key): hit["_source"]["model"] = model_key return hits +def return_sources(results): + """ + Results come back as embedded responses raw from elasticsearch - Other services expect the + _source value to be the root dict, and also needs index and score to be persisted as well. + May throw an error if source has index and score keys some day, but easy to fix for that, + and should noisily break since it would have other downstream consequences. + """ + #TODO: remove underscore version after dependencies updated https://meedan.atlassian.net/browse/CV2-5546 + return [dict(**r["_source"], **{"_id": r["_id"], "id": r["_id"], "index": r["_index"], "_score": r["_score"], "score": r["_score"]}) for r in results] + def strip_vectors(results): for result in results: vector_keys = [key for key in result["_source"].keys() if key[:7] == "vector_"] @@ -196,14 +215,10 @@ def restrict_results(results, search_params, model_key): return results def search_text_by_model(search_params, vector_for_search): - app.logger.info( - f"[Alegre Similarity] search_text_by_model:search_params {search_params}") language = None if not search_params.get("content"): return {"result": []} model_key, threshold = get_model_and_threshold(search_params) - app.logger.info( - f"[Alegre Similarity] search_text_by_model:model_key {model_key}, threshold:{threshold}") es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) conditions = [] matches = [] @@ -253,17 +268,18 @@ def search_text_by_model(search_params, vector_for_search): conditions['query']['script_score']['query']['bool']['must'].append(context) limit = search_params.get("limit") body = get_body_from_conditions(conditions) - app.logger.info(f"Sending OpenSearch query: {body}") result = es.search( size=limit or ELASTICSEARCH_DEFAULT_LIMIT, #NOTE a default limit is given in similarity.py body=body, index=search_indices ) - response = strip_vectors( - restrict_results( - insert_model_into_response(result['hits']['hits'], model_key), - search_params, - model_key + response = return_sources( + strip_vectors( + restrict_results( + insert_model_into_response(result['hits']['hits'], model_key), + search_params, + model_key + ) ) ) return { diff --git a/app/test/test_langid.py b/app/test/test_langid.py index 395f2031..94b45df6 100644 --- a/app/test/test_langid.py +++ b/app/test/test_langid.py @@ -120,6 +120,19 @@ def test_langid_api_get_without_text(self): self.assertEqual('application/json', response.content_type) self.assertEqual(200, response.status_code) + def test_null_prediction_cld(self): + with patch('cld3.get_language', ) as mock_cld3_get_language: + mock_cld3_get_language.return_value = None + expected = { + 'result': { + 'language': None, + 'confidence': None + }, + 'raw': None, + 'model': 'CLD3', + } + self.assertEqual(Cld3LangidProvider.langid("foo bar"), {'model': 'CLD3', 'raw': None, 'result': {'confidence': None, 'language': None}}) + def test_langid_api_post(self): response = self.client.post( '/text/langid/', diff --git a/app/test/test_similarity.py b/app/test/test_similarity.py index 28d406cb..b8502f4e 100644 --- a/app/test/test_similarity.py +++ b/app/test/test_similarity.py @@ -306,10 +306,10 @@ def test_elasticsearch_performs_correct_fuzzy_search(self): post_response = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json') lookup["fuzzy"] = True post_response_fuzzy = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json') - self.assertGreater(json.loads(post_response_fuzzy.data.decode())["result"][0]["_score"], json.loads(post_response.data.decode())["result"][0]["_score"]) + self.assertGreater(json.loads(post_response_fuzzy.data.decode())["result"][0]["score"], json.loads(post_response.data.decode())["result"][0]["score"]) lookup["fuzzy"] = False post_response_fuzzy = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json') - self.assertEqual(json.loads(post_response_fuzzy.data.decode())["result"][0]["_score"], json.loads(post_response.data.decode())["result"][0]["_score"]) + self.assertEqual(json.loads(post_response_fuzzy.data.decode())["result"][0]["score"], json.loads(post_response.data.decode())["result"][0]["score"]) def test_elasticsearch_update_text(self): with self.client: @@ -455,7 +455,7 @@ def test_model_similarity(self): ) result = json.loads(response.data.decode()) self.assertEqual(1, len(result['result'])) - similarity = result['result'][0]['_score'] + similarity = result['result'][0]['score'] self.assertGreater(similarity, 0.7) response = self.client.post( @@ -487,7 +487,7 @@ def test_model_similarity(self): ) result = json.loads(response.data.decode()) self.assertEqual(1, len(result['result'])) - similarity = result['result'][0]['_score'] + similarity = result['result'][0]['score'] self.assertGreater(similarity, 0.7) response = self.client.post( @@ -501,7 +501,7 @@ def test_model_similarity(self): ) result = json.loads(response.data.decode()) self.assertEqual(1, len(result['result'])) - similarity = result['result'][0]['_score'] + similarity = result['result'][0]['score'] self.assertGreater(similarity, 0.7) def test_wrong_model_key(self): @@ -599,7 +599,7 @@ def test_min_es_search(self): result = json.loads(response.data.decode()) self.assertEqual(1, len(result['result'])) - data['min_es_score']=10+result['result'][0]['_score'] + data['min_es_score']=10+result['result'][0]['score'] response = self.client.post( '/text/similarity/search/', diff --git a/app/test/test_similarity_lang_analyzers.py b/app/test/test_similarity_lang_analyzers.py index b6817750..4378b5c0 100644 --- a/app/test/test_similarity_lang_analyzers.py +++ b/app/test/test_similarity_lang_analyzers.py @@ -48,7 +48,7 @@ def test_all_analyzers(self): content_type='application/json' ) result = json.loads(response.data.decode()) - self.assertTrue(app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language'] in [e['_index'] for e in result['result']]) + self.assertTrue(app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language'] in [e['index'] for e in result['result']]) def test_auto_language_id(self): # language examples as input to language classifier @@ -86,7 +86,7 @@ def test_auto_language_id(self): index_alias = app.config['ELASTICSEARCH_SIMILARITY'] if expected_lang is not None: index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang - self.assertTrue(index_alias in [e['_index'] for e in result['result']]) + self.assertTrue(index_alias in [e['index'] for e in result['result']]) def test_auto_language_query(self): # language examples as input to language classifier @@ -124,7 +124,7 @@ def test_auto_language_query(self): index_alias = app.config['ELASTICSEARCH_SIMILARITY'] if expected_lang is not None: index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang - self.assertTrue(index_alias in [e['_index'] for e in result['result']]) + self.assertTrue(index_alias in [e['index'] for e in result['result']]) if __name__ == '__main__': diff --git a/cc-test-reporter b/cc-test-reporter new file mode 100755 index 00000000..23a0f531 Binary files /dev/null and b/cc-test-reporter differ diff --git a/docker-compose.yml b/docker-compose.yml index 22083971..b59188e8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,3 @@ -version: '2' volumes: elasticsearch: redis: diff --git a/docker-test.yml b/docker-test.yml index d6185b9d..67890adb 100644 --- a/docker-test.yml +++ b/docker-test.yml @@ -1,4 +1,3 @@ -version: '2' services: postgres: build: ./postgres diff --git a/google_credentials.json.enc b/google_credentials.json.enc index 5f591be1..a5ffa9f6 100644 Binary files a/google_credentials.json.enc and b/google_credentials.json.enc differ diff --git a/requirements.txt b/requirements.txt index 8d054506..f296eeb6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -137,7 +137,7 @@ threadpoolctl==2.2.0 tinysegmenter==0.3 tldextract==3.1.0 tmkpy==0.1.1 -tokenizers +tokenizers==0.10.3 toolz==0.9.0 torch==1.9.0 tqdm==4.66.3