Skip to content

Commit

Permalink
Merge pull request #97 from meedan/CV2-4789
Browse files Browse the repository at this point in the history
ClassyCat in Presto: CV2 4789
  • Loading branch information
ashkankzme authored Jul 25, 2024
2 parents e4b7cd7 + 1e675bf commit 589d144
Show file tree
Hide file tree
Showing 19 changed files with 1,673 additions and 32 deletions.
9 changes: 8 additions & 1 deletion .env_file.example
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,11 @@ OTEL_EXPORTER_OTLP_ENDPOINT="https://api.honeycomb.io"
OTEL_EXPORTER_OTLP_HEADERS="x-honeycomb-team=XXX"
HONEYCOMB_API_ENDPOINT="https://api.honeycomb.io"
REDIS_URL="redis://redis:6379/0"
CACHE_DEFAULT_TTL=86400
CACHE_DEFAULT_TTL=86400

CLASSYCAT_OUTPUT_BUCKET="classycat-qa"
CLASSYCAT_BATCH_SIZE_LIMIT=25
OPENROUTER_API_KEY=""
ANTHROPIC_API_KEY=""
CLASSYCAT_LLM_CLIENT_TYPE="openrouter"
CLASSYCAT_LLM_MODEL_NAME="anthropic/claude-3-sonnet"
9 changes: 8 additions & 1 deletion .env_file.test
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,11 @@ OTEL_EXPORTER_OTLP_ENDPOINT="https://api.honeycomb.io"
OTEL_EXPORTER_OTLP_HEADERS="x-honeycomb-team=XXX"
HONEYCOMB_API_ENDPOINT="https://api.honeycomb.io"
REDIS_URL="redis://redis:6379/0"
CACHE_DEFAULT_TTL=86400
CACHE_DEFAULT_TTL=86400

CLASSYCAT_OUTPUT_BUCKET="classycat-qa"
CLASSYCAT_BATCH_SIZE_LIMIT=25
OPENROUTER_API_KEY=""
ANTHROPIC_API_KEY=""
CLASSYCAT_LLM_CLIENT_TYPE="openrouter"
CLASSYCAT_LLM_MODEL_NAME="anthropic/claude-3-sonnet"
13 changes: 12 additions & 1 deletion .github/workflows/ci-test-branches.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,24 @@ jobs:
--file ./Dockerfile ./
echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
- name: Retrieve Presto test parameters from SSM
id: get-ssm-params
run: |
parameters=$(aws ssm get-parameters-by-path --path "/test/presto/" --recursive --query "Parameters[*].[Name,Value]" --output text)
env_content=""
while IFS=$'\t' read -r name value; do
key=$(echo "$name" | awk -F'/' '{print $NF}')
env_content="${env_content}${key}=${value}\n"
done <<< "$parameters"
echo -e "\n$env_content" >> .env_file
- name: Run Unit Tests
id: run-unit-test
env:
GIT_SHA: ${{ github.sha }}
BUILD_NUMBER: ${{ github.run_number }}
run: |
docker run -e APP=presto -e DEPLOY_ENV=test --rm ${{ steps.build-image.outputs.image }} make run_test
docker run -e APP=presto -e DEPLOY_ENV=test --env-file=.env_file --rm ${{ steps.build-image.outputs.image }} make run_test
- name: Send GitHub Action trigger data to Slack workflow on success
id: slack-api-notify-success
Expand Down
19 changes: 18 additions & 1 deletion .github/workflows/ci-test-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,23 @@ jobs:
--tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG \
--file ./Dockerfile ./
echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
- name: Install AWS CLI and jq
run: |
sudo apt-get update
sudo apt-get install -y awscli
sudo apt-get install -y jq
- name: Retrieve Presto test parameters from SSM
id: get-ssm-params
run: |
parameters=$(aws ssm get-parameters-by-path --path "/test/presto/" --recursive --query "Parameters[*].[Name,Value]" --output text)
env_content=""
while IFS=$'\t' read -r name value; do
key=$(echo "$name" | awk -F'/' '{print $NF}')
env_content="${env_content}${key}=${value}\n"
done <<< "$parameters"
echo -e "\n$env_content" >> .env_file
- name: Run PR Tests
id: run-tests
Expand All @@ -63,7 +80,7 @@ jobs:
# docker run --rm ${{ steps.build-image.outputs.image }} -e APP=presto -e DEPLOY_ENV=test -e GITHUB_TOKEN=none make run_test
echo "test not enabled for forks!"
else
docker run -e APP=presto -e DEPLOY_ENV=test -e GITHUB_TOKEN=none --rm ${{ steps.build-image.outputs.image }} make run_test
docker run -e APP=presto -e DEPLOY_ENV=test --env-file=.env_file -e GITHUB_TOKEN=none --rm ${{ steps.build-image.outputs.image }} make run_test
fi
- name: Reset cache
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.cpython-39.pyc
*.pyc
.env_file
.env
26 changes: 21 additions & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ services:
- HONEYCOMB_API_ENDPOINT=${HONEYCOMB_API_ENDPOINT}
- REDIS_URL=${REDIS_URL}
- CACHE_DEFAULT_TTL=${CACHE_DEFAULT_TTL}

env_file:
- ./.env_file
depends_on:
Expand Down Expand Up @@ -107,7 +108,7 @@ services:
- "./:/app"
env_file:
- ./.env_file
environment:
environment:
ROLE: worker
MODEL_NAME: video.Model
depends_on:
Expand All @@ -122,7 +123,7 @@ services:
- "./:/app"
env_file:
- ./.env_file
environment:
environment:
ROLE: worker
MODEL_NAME: mean_tokens.Model
depends_on:
Expand All @@ -137,7 +138,7 @@ services:
- "./:/app"
env_file:
- ./.env_file
environment:
environment:
ROLE: worker
MODEL_NAME: fasttext.Model
depends_on:
Expand All @@ -152,7 +153,7 @@ services:
- "./:/app"
env_file:
- ./.env_file
environment:
environment:
ROLE: worker
MODEL_NAME: fptg.Model
depends_on:
Expand All @@ -167,11 +168,26 @@ services:
- "./:/app"
env_file:
- ./.env_file
environment:
environment:
ROLE: worker
MODEL_NAME: indian_sbert.Model
depends_on:
elasticmq:
condition: service_healthy
redis:
condition: service_healthy
classycat:
build: .
platform: linux/amd64
volumes:
- "./:/app"
env_file:
- ./.env_file
environment:
ROLE: worker
MODEL_NAME: classycat.Model
depends_on:
elasticmq:
condition: service_healthy
redis:
condition: service_healthy
229 changes: 229 additions & 0 deletions docs/classycat.design.decisions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# ClassyCat design decisions and notes

## ClassyCat in Presto (initial version)
ClassyCat is implemented in Presto and stores its classification results in S3 for future use.

```mermaid
sequenceDiagram
participant Check/Timpani
participant ClassyCat in Presto
participant ExternalLLM
participant S3
Check/Timpani->>ClassyCat in Presto: New item to label
ClassyCat in Presto->>ExternalLLM: Call external LLM<br>where needed
ExternalLLM-->>ClassyCat in Presto: Label(s)
ClassyCat in Presto->>S3: Store labelled points
ClassyCat in Presto-->>Check/Timpani: Label(s) for new item(s)
```


## Request and response JSON examples:
### Create Schema
- Input:

```json
{
"model_name": "classycat__Model",
"body": {
"id": 1200,
"parameters": {
"event_type": "schema_create",
"schema_name": "2024 Indian Election Test",
"topics": [
{
"topic": "Politics",
"description": "This topic includes political claims, attacks on leaders and parties, and general political commentary."
},
{
"topic": "Communalism",
"description": "This topic covers attack on religious minorities, statements on religious freedom and polarization."
}
],
"examples": [
{
"text": "Congress Manifesto is horrible. Never seen such a dangerous manifesto in my life. It's like vision 2047 document of PFI\n\nCheck these points of manifesto\n\n1. Will bring back triple talak (Muslim personal law)\n2. Reservation to Muslim in govt n private jobs (Implement Sachchar committee report)\n3. Support Love Jihad (right to love)\n4. Support Burqa in school (right to dress)\n5. End majoritarianism (Hinduism)\n6. Ban bulldozer action\n7. Support Gaza (Hamas)\n8. Legalise Same Sex Marriage, gender fluidity, trans movement\n9. Increase Muslim judges in judiciary\n10. Communal violence bill (will stop mob lynching)\n11. Legalise beef (right to eat everything)\n12. Separate loan intrest for Muslims\n13. Allow treason (No sedition)\n\nAll those Hindu who are thinking to vote Indi Alliance, NOTA or independent. Read this and think.\n",
"labels": [
"Politics",
"Communalism"
]
}
],
"languages": [
"English",
"Hindi",
"Telugu",
"Malayalam"
]
},
"callback_url": "http://example.com?callback"
}
}
```

- Output (callback):
```json
{
"body": {
"id": 1200,
"content_hash": null,
"callback_url": "http://host.docker.internal:9888",
"url": null,
"text": null,
"raw": {},
"parameters": {
"event_type": "schema_create",
"schema_name": "2024 Indian Election Test 2",
"topics": [
{
"topic": "Politics",
"description": "This topic includes political claims, attacks on leaders and parties, and general political commentary."
},
{
"topic": "Communalism",
"description": "This topic covers attack on religious minorities, statements on religious freedom and polarization."
}
],
"examples": [
{
"text": "Congress Manifesto is horrible. Never seen such a dangerous manifesto in my life. It's like vision 2047 document of PFI\n\nCheck these points of manifesto\n\n1. Will bring back triple talak (Muslim personal law)\n2. Reservation to Muslim in govt n private jobs (Implement Sachchar committee report)\n3. Support Love Jihad (right to love)\n4. Support Burqa in school (right to dress)\n5. End majoritarianism (Hinduism)\n6. Ban bulldozer action\n7. Support Gaza (Hamas)\n8. Legalise Same Sex Marriage, gender fluidity, trans movement\n9. Increase Muslim judges in judiciary\n10. Communal violence bill (will stop mob lynching)\n11. Legalise beef (right to eat everything)\n12. Separate loan intrest for Muslims\n13. Allow treason (No sedition)\n\nAll those Hindu who are thinking to vote Indi Alliance, NOTA or independent. Read this and think.\n",
"labels": [
"Politics",
"Communalism"
]
}
],
"languages": [
"English",
"Hindi",
"Telugu",
"Malayalam"
]
},
"result": {
"responseMessage": "success",
"schema_id": "e6729bb9-2491-47dc-824d-828d929ebcd2"
}
},
"model_name": "classycat.Model",
"retry_count": 0
}
```

### Schema Look Up:
- Input:
```json
{
"model_name": "classycat__Model",
"body": {
"callback_url": "http://example.com?callback",
"id": 1200,
"parameters": {
"event_type": "schema_lookup",
"schema_name": "2024 Indian Election Test"
}
}
}
```

- Output (callback):
```json
{
"body": {
"id": 1200,
"content_hash": null,
"callback_url": "http://host.docker.internal:9888",
"url": null,
"text": null,
"raw": {},
"parameters": {
"event_type": "schema_lookup",
"schema_name": "2024 Indian Election Test"
},
"result": {
"responseMessage": "success",
"schema_id": "12589852-4fff-430b-bf77-adad202d03ca"
}
},
"model_name": "classycat.Model",
"retry_count": 0
}
```

### Classify
- Input:
```json
{
"model_name": "classycat__Model",
"body": {
"id": 1200,
"parameters": {
"event_type": "classify",
"schema_id": "4a026b82-4a16-440d-aed7-bec07af12205",
"items": [
{
"id": "11",
"text": "modi and bjp want to rule india by dividing people against each other"
}
]
},
"callback_url": "http://example.com?callback"
}
}
```

- Output (callback):
```json
{
"body": {
"id": 1200,
"content_hash": null,
"callback_url": "http://host.docker.internal:9888",
"url": null,
"text": null,
"raw": {},
"parameters": {
"event_type": "classify",
"schema_id": "12589852-4fff-430b-bf77-adad202d03ca",
"items": [
{
"id": "11",
"text": "modi and bjp want to rule india by dividing people against each other"
}
]
},
"result": {
"responseMessage": "success",
"classification_results": [
{
"id": "11",
"text": "modi and bjp want to rule india by dividing people against each other",
"labels": [
"Politics",
"Communalism"
]
}
]
}
},
"model_name": "classycat.Model",
"retry_count": 0
}
```


## Notes

- We will implement Classy Cat inside Presto, which supports asynchronous callbacks
- We will respond with the appropriate error codes and messages in case of failures
- ClassyCat accepts classify requests in batches. To classify a single item, submit a batch of size 1.
- The maximum batch size for ClassyCat is 25 (the `CLASSYCAT_BATCH_SIZE_LIMIT` value in the example env files).
- For now, it is up to users of ClassyCat to decide how to handle requests that are not serviced because of a failure.
- For now, the scope of ClassyCat remains limited to classification through the use of LLMs. In the future, we will consider implementing KNN classification based on previous classifications. Currently, we only store the LLM classification results, without making any local decisions.
- It is acceptable for Presto to handle the storage and state needs of ClassyCat.
- ClassyCat will call only one callback URL per request.
- Every ClassyCat response includes a `responseMessage` field (see the examples above). On success, its value is `"success"`; otherwise it contains the corresponding error message.
- The classification flowchart is as follows:

![Classification Diagram](../img/classycat_classification_flowchart.png)
Binary file added img/classycat_classification_flowchart.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 589d144

Please sign in to comment.