Open AI LLM redacting support #8

Merged 2 commits on Aug 9, 2024
Cargo.toml: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ ci-gcp = [] # For testing on CI/GCP
ci-aws = [] # For testing on CI/AWS
ci-ms-presidio = [] # For testing on CI/MS Presidio
ci-gcp-llm = [] # For testing on CI/GCP with LLM models
ci = ["ci-gcp", "ci-aws", "ci-ms-presidio", "ci-gcp-llm"]
ci-open-ai = [] # For testing on CI/OpenAI
ci = ["ci-gcp", "ci-aws", "ci-ms-presidio", "ci-gcp-llm", "ci-open-ai"]


[dependencies]
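
As a side note on how such a feature flag is typically exercised, here is a minimal sketch; the command is an assumption based on standard Cargo usage, not taken from this PR or its CI configuration:

```sh
# Run the test suite with only the new OpenAI CI feature enabled.
# The feature name comes from the Cargo.toml change above; any API
# credentials the gated tests need must be provided separately.
cargo test --features ci-open-ai
```
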
README.md: 12 additions & 1 deletion
@@ -34,6 +34,8 @@ Google Cloud Platform's DLP API.
* images
* [Gemini LLM](https://ai.google.dev/gemini-api/docs) based redaction
* text, html, csv, json files
* [OpenAI LLM](https://openai.com/) based redaction
* text, html, csv, json files
* ... more DLP providers can be added in the future.
* **CLI:** Easy-to-use command-line interface for streamlined workflows.
* Built with Rust to ensure speed, safety, and reliability.
@@ -67,7 +69,7 @@ Options:
-f, --filename-filter <FILENAME_FILTER>
Filter by name using glob patterns such as *.txt
-d, --redact <REDACT>
Redacter type [possible values: gcp-dlp, aws-comprehend, ms-presidio, gemini-llm]
Redacter type [possible values: gcp-dlp, aws-comprehend, ms-presidio, gemini-llm, open-ai-llm]
--gcp-project-id <GCP_PROJECT_ID>
GCP project id that will be used to redact and bill API calls
--allow-unsupported-copies
@@ -86,6 +88,10 @@ Options:
Gemini model name for Gemini LLM redacter. Default is 'models/gemini-1.5-flash'
--sampling-size <SAMPLING_SIZE>
Sampling size in bytes before redacting files. Disabled by default
--open-ai-api-key <OPEN_AI_API_KEY>
API key for OpenAI LLM redacter
--open-ai-model <OPEN_AI_MODEL>
OpenAI model name for OpenAI LLM redacter. Default is 'gpt-4o-mini'
-h, --help
Print help
```
@@ -135,6 +141,11 @@ To be able to use GCP DLP you need to:
official [instructions](https://ai.google.dev/gemini-api/docs/oauth#set-cloud).
- provide a GCP project id using `--gcp-project-id` option.

### OpenAI LLM

To be able to use OpenAI LLM you need to provide an API key using the `--open-ai-api-key` command line option.
Optionally, you can provide a model name using the `--open-ai-model` option. Default is `gpt-4o-mini`.

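As a quick illustration of the new options, a hypothetical invocation could look like the sketch below; the `redacter` binary name, the `cp` subcommand, and the source/destination paths are illustrative assumptions, while the flags and the `open-ai-llm` value come from the help output above:

```sh
# Copy files from a local directory to a GCS bucket, redacting them
# with the OpenAI LLM redacter on the way (binary name, subcommand,
# and paths are illustrative assumptions).
redacter cp ./sensitive-docs gs://my-redacted-bucket \
  --redact open-ai-llm \
  --open-ai-api-key "$OPENAI_API_KEY" \
  --open-ai-model gpt-4o-mini
```
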
## Examples:

```sh
src/args.rs: 24 additions & 1 deletion
@@ -1,7 +1,8 @@
use crate::common_types::GcpProjectId;
use crate::errors::AppError;
use crate::redacters::{
GcpDlpRedacterOptions, GeminiLlmModelName, RedacterOptions, RedacterProviderOptions,
GcpDlpRedacterOptions, GeminiLlmModelName, OpenAiLlmApiKey, OpenAiModelName, RedacterOptions,
RedacterProviderOptions,
};
use clap::*;
use std::fmt::Display;
@@ -62,6 +63,7 @@ pub enum RedacterType {
AwsComprehend,
MsPresidio,
GeminiLlm,
OpenAiLlm,
}

impl std::str::FromStr for RedacterType {
@@ -85,6 +87,7 @@ impl Display for RedacterType {
RedacterType::AwsComprehend => write!(f, "aws-comprehend"),
RedacterType::MsPresidio => write!(f, "ms-presidio"),
RedacterType::GeminiLlm => write!(f, "gemini-llm"),
RedacterType::OpenAiLlm => write!(f, "openai-llm"),
}
}
}
@@ -138,6 +141,15 @@ pub struct RedacterArgs {
help = "Sampling size in bytes before redacting files. Disabled by default"
)]
pub sampling_size: Option<usize>,

#[arg(long, help = "API key for OpenAI LLM redacter")]
pub open_ai_api_key: Option<OpenAiLlmApiKey>,

#[arg(
long,
help = "Open AI model name for OpenAI LLM redacter. Default is 'gpt-4o-mini'"
)]
pub open_ai_model: Option<OpenAiModelName>,
}

impl TryInto<RedacterOptions> for RedacterArgs {
@@ -186,6 +198,17 @@ impl TryInto<RedacterOptions> for RedacterArgs {
gemini_model: self.gemini_model,
},
)),
Some(RedacterType::OpenAiLlm) => Ok(RedacterProviderOptions::OpenAiLlm(
crate::redacters::OpenAiLlmRedacterOptions {
api_key: self
.open_ai_api_key
.ok_or_else(|| AppError::RedacterConfigError {
message: "OpenAI API key is required for OpenAI LLM redacter"
.to_string(),
})?,
model: self.open_ai_model,
},
)),
None => Err(AppError::RedacterConfigError {
message: "Redacter type is required".to_string(),
}),
src/commands/copy_command.rs: 2 additions & 1 deletion
@@ -193,8 +193,9 @@ async fn transfer_and_redact_file<
};
bar.println(
format!(
"Copying {} ({}) to {}. Size: {}",
"Copying {} ({},{}) to {}. Size: {}",
bold_style.apply_to(&base_resolved_file_ref.file_path),
base_resolved_file_ref.scheme,
file_ref
.media_type
.as_ref()
src/filesystems/mod.rs: 0 additions & 12 deletions
@@ -38,18 +38,6 @@ pub struct AbsoluteFilePath {
pub scheme: String,
}

impl AbsoluteFilePath {
pub fn value(&self) -> String {
format!("{}://{}", self.scheme, self.file_path)
}
}

impl RelativeFilePath {
pub fn is_dir(&self) -> bool {
self.value().ends_with('/')
}
}

#[derive(Debug, Clone)]
pub struct FileSystemRef {
pub relative_path: RelativeFilePath,
src/redacters/mod.rs: 13 additions & 1 deletion
@@ -1,3 +1,4 @@
use crate::errors::AppError;
use crate::filesystems::FileSystemRef;
use crate::reporter::AppReporter;
use crate::AppResult;
@@ -16,9 +17,11 @@ mod ms_presidio;
pub use ms_presidio::*;

mod gemini_llm;
use crate::errors::AppError;
pub use gemini_llm::*;

mod open_ai_llm;
pub use open_ai_llm::*;

#[derive(Debug, Clone)]
pub struct RedacterDataItem {
pub content: RedacterDataItemContent,
@@ -44,6 +47,7 @@ pub enum Redacters<'a> {
AwsComprehendDlp(AwsComprehendRedacter<'a>),
MsPresidio(MsPresidioRedacter<'a>),
GeminiLlm(GeminiLlmRedacter<'a>),
OpenAiLlm(OpenAiLlmRedacter<'a>),
}

#[derive(Debug, Clone)]
@@ -61,6 +65,7 @@ pub enum RedacterProviderOptions {
AwsComprehend(AwsComprehendRedacterOptions),
MsPresidio(MsPresidioRedacterOptions),
GeminiLlm(GeminiLlmRedacterOptions),
OpenAiLlm(OpenAiLlmRedacterOptions),
}

impl Display for RedacterOptions {
@@ -70,6 +75,7 @@ impl Display for RedacterOptions {
RedacterProviderOptions::AwsComprehend(_) => write!(f, "aws-comprehend-dlp"),
RedacterProviderOptions::MsPresidio(_) => write!(f, "ms-presidio"),
RedacterProviderOptions::GeminiLlm(_) => write!(f, "gemini-llm"),
RedacterProviderOptions::OpenAiLlm(_) => write!(f, "openai-llm"),
}
}
}
@@ -94,6 +100,9 @@ impl<'a> Redacters<'a> {
RedacterProviderOptions::GeminiLlm(ref options) => Ok(Redacters::GeminiLlm(
GeminiLlmRedacter::new(redacter_options.clone(), options.clone(), reporter).await?,
)),
RedacterProviderOptions::OpenAiLlm(ref options) => Ok(Redacters::OpenAiLlm(
OpenAiLlmRedacter::new(redacter_options.clone(), options.clone(), reporter).await?,
)),
}
}

@@ -147,6 +156,7 @@ impl<'a> Redacter for Redacters<'a> {
Redacters::AwsComprehendDlp(redacter) => redacter.redact(input).await,
Redacters::MsPresidio(redacter) => redacter.redact(input).await,
Redacters::GeminiLlm(redacter) => redacter.redact(input).await,
Redacters::OpenAiLlm(redacter) => redacter.redact(input).await,
}
}

@@ -161,6 +171,7 @@ impl<'a> Redacter for Redacters<'a> {
}
Redacters::MsPresidio(redacter) => redacter.redact_supported_options(file_ref).await,
Redacters::GeminiLlm(redacter) => redacter.redact_supported_options(file_ref).await,
Redacters::OpenAiLlm(redacter) => redacter.redact_supported_options(file_ref).await,
}
}

@@ -170,6 +181,7 @@ impl<'a> Redacter for Redacters<'a> {
Redacters::AwsComprehendDlp(redacter) => redacter.options(),
Redacters::MsPresidio(redacter) => redacter.options(),
Redacters::GeminiLlm(redacter) => redacter.options(),
Redacters::OpenAiLlm(redacter) => redacter.options(),
}
}
}