Skip to content

Commit

Permalink
Merge pull request #31 from anevjes/openai-pii-stripping
Browse files Browse the repository at this point in the history
Openai-pii-stripping
  • Loading branch information
anevjes authored Aug 1, 2024
2 parents 4b2dbe3 + 744657b commit ed14a21
Show file tree
Hide file tree
Showing 12 changed files with 189 additions and 42 deletions.
14 changes: 4 additions & 10 deletions aisentry/facade/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,6 @@

load_dotenv(".env", override=True)


# os.environ["AZURE_CLIENT_ID"] = "your_client_id"
# os.environ["AZURE_TENANT_ID"] = "your_tenant_id"
# os.environ["AZURE_FEDERATED_TOKEN_FILE"] = "/var/run/secrets/tokens/azure-identity-token"


logger.info("Starting Ai-Sentry Facade app")
app = Quart(__name__)

Expand Down Expand Up @@ -132,10 +126,10 @@ async def catch_all(path):
client = endpoint_info["client"]


if openAI_request_headers.get('Api-Key') is not None:
logger.info("detected use of api-key header - will use this for authentication")
logger.debug(f"Swapping out api-key inside header with {endpoint_info['api-key']} value")
openAI_request_headers['Api-Key'] = endpoint_info['api-key']
# if openAI_request_headers.get('Api-Key') is not None:
# logger.info("detected use of api-key header - will use this for authentication")
# logger.debug(f"Swapping out api-key inside header with {endpoint_info['api-key']} value")
# openAI_request_headers['Api-Key'] = endpoint_info['api-key']

if endpoint_info['api-key'] is not None:
logger.info("No api-key header detected - will use the default api-key for authentication")
Expand Down
15 changes: 5 additions & 10 deletions aisentry/utils/analyze_pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ async def analyze_pii_async(input_text: List[str]) -> None:
async for page in pages:
document_results.append(page)

# doc=""

for doc, action_results in zip(chunk, document_results):
for result in action_results:

Expand All @@ -71,21 +69,18 @@ async def analyze_pii_async(input_text: List[str]) -> None:
logger.debug(f".........Confidence Score: {pii_entity.confidence_score}")
if pii_entity.confidence_score >= 0.8 and pii_entity.category != "DateTime":
logger.debug(f"Removing PII entity: {pii_entity.text}, category: {pii_entity.category} from the logged payload")
# if pii_entity.text in "\},]":
# doc = doc.replace(pii_entity.text, "*PII*\"},")
# logger.info(f"PII-Processing: Replacing PII entity: {pii_entity.text} with extra escaping")
doc = doc.replace(pii_entity.text, "*PII*")
doc = doc.replace(pii_entity.text, "PII_REDACTED")

elif result.is_error is True:
logger.error(f'PII-Processing: An error with code {result.error.code} and message {result.error.message}')


#UNTOCHED
if ": *PII*," in doc:
doc = doc.replace(": *PII*,", ":\"*PII*\",")
if ": PII_REDACTED," in doc:
doc = doc.replace(": PII_REDACTED,", ":\"PII_REDACTED\",")

if "*PII*," in doc:
doc = doc.replace("*PII*,", "*PII*\"")
if "PII_REDACTED," in doc:
doc = doc.replace("PII_REDACTED,", "PII_REDACTED\"")

logger.info(f"PII stripping completed")
return doc
Expand Down
91 changes: 91 additions & 0 deletions aisentry/utils/analyze_pii_openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import os
import logging
import asyncio
from dotenv import load_dotenv
from openai import AsyncAzureOpenAI



# Initial setup: environment loading, logging, and the Azure OpenAI client.

# Load .env BEFORE reading any settings so that values defined there
# (including LOG-LEVEL) are visible to the configuration below.
load_dotenv(".env", override=True)

# Log level name comes from the LOG-LEVEL environment variable (default INFO).
log_level = os.getenv('LOG-LEVEL', 'INFO').upper()

# Set up the logger. An unrecognised level name falls back to INFO instead of
# raising AttributeError while the module is being imported.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=getattr(logging, log_level, logging.INFO),
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%d-%m-%Y %H:%M:%S',
)

# Initialize the Azure OpenAI client with the key and endpoint dedicated to
# PII stripping. Both values are None when the variables are not set.
openai_key = os.environ.get("PII_STRIPPING_OPENAI_API_KEY")
openai_endpoint = os.environ.get("PII_STRIPPING_OPENAI_ENDPOINT")

client = AsyncAzureOpenAI(
    api_key=openai_key,
    api_version="2023-12-01-preview",
    azure_endpoint=openai_endpoint,
)

# System prompt sent as the "system" message of every PII-stripping chat
# completion request. It lists the categories of data treated as PII and asks
# the model to keep the text structure while replacing PII with the
# placeholder [PII-Redacted]. The text is consumed by the model at runtime, so
# any edit to it changes behaviour.
pii_stripping_system_prompt = """Objective: Identify and flag any Personally Identifiable Information (PII) within text data to ensure data privacy and compliance with regulations such as GDPR, CCPA, etc.
PII includes but is not limited to:
Full Names: First and last names
Addresses: Street address, city, state, zip code
Phone Numbers: Any format of telephone numbers
Email Addresses: Any format of email addresses
Social Security Numbers (SSNs): XXX-XX-XXXX or similar formats
Credit Card Numbers: Any format of credit/debit card numbers
Bank Account Numbers: Any format of bank account numbers
Driver's License Numbers: Any format of driver's license numbers
Passport Numbers: Any format of passport numbers
Date of Birth: Full date of birth (MM/DD/YYYY or similar formats)
IP Addresses: Any format of IPv4 or IPv6 addresses
API-KEY or Token: Any format of API keys or tokens
Medical Information: Any health-related information that can identify an individual
Biometric Data: Fingerprints, facial recognition data, etc.
Instructions for the System:
Input: Accept text data for analysis.
Processing:
Use pattern matching, regular expressions, and machine learning algorithms to identify potential PII.
Cross-reference detected patterns with known PII formats.
Output:
Flag detected PII and categorize it.
Provide a confidence score for each detected PII item.
Highlight the specific text containing PII.
Example:
Input Text:
John Doe lives at 123 Maple Street, Springfield, IL 62704. His email is [email protected], and his phone number is (555) 123-4567. He was born on 01/15/1985 and his SSN is 123-45-6789.
Output:
Keep the same text structure but replace the PII with placeholders: [PII-Redacted]
Compliance Note: The system must handle all detected PII with strict confidentiality and in accordance with applicable data protection regulations."""


async def get_chat_pii_stripped_completion(prompt):
    """Ask the Azure OpenAI chat model to rewrite *prompt* with PII redacted.

    Args:
        prompt: The text to sanitise. It is interpolated into the user
            message with an f-string, so any object with a useful ``str()``
            form is accepted.

    Returns:
        The ``content`` of the first choice's message in the completion
        response, i.e. the model's rewritten text.
    """
    # Send the request to Azure OpenAI
    response = await client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": pii_stripping_system_prompt},
            {"role": "user", "content": f"Rewrite the input and Strip out PII information as per the system message from following input: {prompt}"},
        ],
    )

    # The openai>=1.0 client returns a typed ChatCompletion object, not a
    # dict: it must be read via attributes. Subscripting it
    # (response['choices']...) raises TypeError.
    message_content = response.choices[0].message.content

    logger.info(f"PII Stripped Completion Text: {message_content}")
    return message_content


41 changes: 35 additions & 6 deletions aisentry/worker/cosmos_logger/cosmos_logger.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from flask import Flask, request, jsonify
from cloudevents.http import from_http
from requests.exceptions import HTTPError
from azure.identity import DefaultAzureCredential
from azure.core.exceptions import AzureError
from dapr.clients import DaprClient
from utils.analyze_pii import analyze_pii_async
from utils.analyze_pii_openai import get_chat_pii_stripped_completion
import logging
import datetime
import asyncio
Expand All @@ -16,6 +16,9 @@
import uuid
from typing import List


load_dotenv(".env", override=True)

# Get log level from environment variable
log_level = os.getenv('LOG-LEVEL', 'INFO').upper()
logger = logging.getLogger(__name__)
Expand All @@ -29,10 +32,11 @@



load_dotenv(".env", override=True)

app_port = os.getenv('APP_PORT', '7000')

# This can be either OPENAI or TEXTANALYTICS
pii_stripping_service = os.getenv('PII_STRIPPING_SERVICE', 'OPENAI')

# Register Dapr pub/sub subscriptions
@app.route('/dapr/subscribe', methods=['GET'])
def subscribe():
Expand Down Expand Up @@ -82,9 +86,34 @@ async def oairequests_subscriber():

input_data: List[str] = []
input_data.append(output_binding_data)
output_binding_data = await analyze_pii_async(input_data)
logger.debug(f"PII stripped data: {output_binding_data}")

if pii_stripping_service == 'TEXTANALYTICS':
logger.debug(f"PII stripping service: {pii_stripping_service}")
output_binding_data = await analyze_pii_async(input_data)

#OPENAI BASED PII Stripping
else:
logger.debug(f"PII stripping service: {pii_stripping_service}")

# Ensure a new event loop is created if the current one is closed
try:
loop = asyncio.get_event_loop()
if loop.is_closed():
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
except RuntimeError:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

if loop.is_running():
task = loop.create_task(get_chat_pii_stripped_completion(input_data))
result = loop.run_until_complete(task)
else:
result = loop.run_until_complete(get_chat_pii_stripped_completion(input_data))

print(result)
# output_binding_data = await get_chat_pii_stripped_completion(input_data)

logger.debug(f"PII stripped data: {output_binding_data}")


elif headers['ai-sentry-log-level'] == 'COMPLETE':
Expand Down
1 change: 1 addition & 0 deletions aisentry/worker/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ msal==1.28.0
msal-extensions==1.1.0
multidict==6.0.5
openai==1.14.2
nest_asyncio==1.6.0
packaging==24.0
portalocker==2.8.2
priority==2.0.0
Expand Down
4 changes: 2 additions & 2 deletions build/build-ai-sentry-containers.ps1
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
param(
[string]$version = "0.1.1",
[string]$containerRegistry = "ariantestacr001.azurecr.io"
[string]$containerRegistry = "anevjesacrdevtest.azurecr.io"
)
#Uncomment first line for very first time to set to the right acr context
# az acr login --name ariantestacr001
# az acr login --name anevjesacrdevtest
Write-Host "Building AI-Sentry Facade:$version"
docker build --platform linux/amd64 -t ai-sentry-facadeapp:$version -f Dockerfile.facade ../aisentry/
docker tag ai-sentry-facadeapp:$version $containerRegistry/ai-sentry-facadeapp:$version
Expand Down
4 changes: 3 additions & 1 deletion content/documentation/AI-Sentry-config-settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ Each pool is represented as an object within the "pools" array. Each pool object
Each endpoint object has the following properties:

- url: The URL of the OpenAI instance. You're expected to replace <your-open-ai-instance> with the actual instance name. You need to append
- api-key: The API key for accessing the OpenAI instance. You're expected to replace your-api-key with the actual API key.
- api-key: The API key for accessing the OpenAI instance. You're expected to replace your-api-key with the actual API key.

Please note: We also support JWT auth to backend OpenAI instances. If you set "api-key": null in an endpoint's properties within the facade layer config, the facade will use AKS workload identity to connect to the OpenAI backends. However, you will need workload identity federated with a managed identity set up on your AKS cluster, and of course you must grant that managed identity the required RBAC role on all of the backend OpenAI instances.
3 changes: 2 additions & 1 deletion content/documentation/AKSDeployment.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ helm repo add bitnami https://charts.bitnami.com/bitnami
helm install sentry-redis bitnami/redis-cluster
export REDIS_PASSWORD=$(kubectl get secret --namespace "default" sentry-redis-redis-cluster -o jsonpath="{.data.redis-password}" | base64 --decode)
kubectl create secret generic redis --from-literal=redis-password=$REDIS_PASSWORD -n sentry-ai
kubectl create namespace ai-sentry
kubectl create secret generic redis --from-literal=redis-password=$REDIS_PASSWORD -n ai-sentry
```

Expand Down
7 changes: 6 additions & 1 deletion content/documentation/Workload-identity-config.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# AKS Workload Identity setup

## Enable Workload identity against existing AKS cluster
```powershell
az aks update --resource-group "aks-devtest-rg" --name "anevjes-aks-dev" --enable-oidc-issuer --enable-workload-identity
```

## MI creation
```powershell
az account set --subscription "subscriptionID"
Expand Down Expand Up @@ -31,7 +36,7 @@ export SERVICE_ACCOUNT_NAMESPACE="ai-sentry"
## OIDC Issuer url

```bash
export AKS_OIDC_ISSUER="$(az aks show --name anevjes-aks --resource-group aks --query "oidcIssuerProfile.issuerUrl" -o tsv)"
export AKS_OIDC_ISSUER="$(az aks show --name anevjes-aks-dev --resource-group aks-devtest-rg --query "oidcIssuerProfile.issuerUrl" -o tsv)"
```

## Create AKS Service Account
Expand Down
8 changes: 4 additions & 4 deletions content/documentation/ai-sentry-config.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
"endpoints": [
{
"url": "https://<your-open-ai-instance>.openai.azure.com/openai",
"api-key": "yourapi-key"
"api-key": "yourapi-key" //If you simply set "api-Key": null the facade layer will leverage aks workload identity to connect to openAi backends.
},
{
"url": "https://<your-open-ai-instance>.openai.azure.com/openai",
"api-key": "your-api-key"
"api-key": "your-api-key" //If you simply set "api-Key": null the facade layer will leverage aks workload identity to connect to openAi backends.
}
]
},
Expand All @@ -20,11 +20,11 @@
"endpoints": [
{
"url": "https://<your-open-ai-instance>.openai.azure.com/openai",
"api-key": "your-api-key"
"api-key": "your-api-key" //If you simply set "api-Key": null the facade layer will leverage aks workload identity to connect to openAi backends.
},
{
"url": "https://<your-open-ai-instance>.openai.azure.com/openai",
"api-key": "your-api-key"
"api-key": "your-api-key" //If you simply set "api-Key": null the facade layer will leverage aks workload identity to connect to openAi backends.
}
]
}
Expand Down
11 changes: 10 additions & 1 deletion deploy/aks/ai-sentry-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ spec:
metadata:
labels:
app: facadeapp
azure.workload.identity/use: "true"
annotations:
dapr.io/enabled: "true"
dapr.io/app-id: "facadeapp"
Expand All @@ -75,6 +76,8 @@ spec:
dapr.io/app-health-probe-interval: "3"
dapr.io/app-health-probe-timeout: "200"
dapr.io/app-health-threshold: "2"
azure.workload.identity/inject-proxy-sidecar: "true"
azure.workload.identity/proxy-sidecar-port: "8000"

spec:
containers:
Expand All @@ -94,7 +97,7 @@ spec:
failureThreshold: 3
env:
- name: "AI-SENTRY-ENDPOINT-CONFIG"
value: "{\"pools\":[{\"name\":\"pool1\",\"description\":\"pool1 description\",\"endpoints\":[{\"url\":\"https://youropenaiendpoint.openai.azure.com/openai\",\"api-key\":\"yourkey\"}]},{\"name\":\"pool2\",\"description\":\"pool2 description\",\"endpoints\":[{\"url\":\"https://youropenai.openai.azure.com/openai\",\"api-key\":\"yourkey\"},{\"url\":\"https://youropenai.openai.azure.com/openai\",\"api-key\":\"yourkey\"}]}]}"
value: "{\"pools\":[{\"name\":\"pool1\",\"description\":\"pool1 description\",\"endpoints\":[{\"url\":\"https://youropenaiendpoint.openai.azure.com/openai\",\"api-key\":\"yourkey or simply null\"}]},{\"name\":\"pool2\",\"description\":\"pool2 description\",\"endpoints\":[{\"url\":\"https://youropenai.openai.azure.com/openai\",\"api-key or simply null\":\"yourkey or simply null\"},{\"url\":\"https://youropenai.openai.azure.com/openai\",\"api-key or simply null\":\"yourkey\"}]}]}"
- name: "LOG-LEVEL"
value: "INFO"
---
Expand Down Expand Up @@ -141,6 +144,12 @@ spec:
value: "your-key"
- name: "LOG-LEVEL"
value: "DEBUG"
- name: "PII_STRIPPING_SERVICE"
value: "OPENAI"
- name: PII_STRIPPING_OPENAI_ENDPOINT
value: "https://ptuopendeployment.openai.azure.com/"
- name: "PII_STRIPPING_OPENAI_API_KEY"
value: "yourapikeytoopenai / apim subscription key"
---
#Summary Logger Logger
kind: StatefulSet
Expand Down
Loading

0 comments on commit ed14a21

Please sign in to comment.