GCP Vertex AI support as DLP

abdolence · Aug 22, 2024 · fe6be26 · fe6be26
1 parent 9c2f7f7
commit fe6be26
Show file tree

Hide file tree

Showing 8 changed files with 668 additions and 19 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -23,6 +23,7 @@ ci-gcp-llm = [] # For testing on CI/GCP with LLM models
 ci-open-ai = [] # For testing on CI/OpenAIP
 ci-clibpoard = [] # For testing on CI/Clipboard
 ci-ocr = [] # For testing on CI/OCR
+ci-gcp-vertex-ai = [] # For testing on CI/GCP with Vertex AI
 ci = ["ci-gcp", "ci-aws", "ci-ms-presidio", "ci-gcp-llm", "ci-open-ai", "ci-clibpoard"]
 pdf-render = ["pdfium-render"]
 clipboard = ["arboard"]
@@ -39,7 +40,7 @@ indicatif = { version = "0.17" }
 clap = { version = "4.1", features = ["derive"] }
 tokio = { version = "1.14", features = ["fs", "rt-multi-thread", "sync", "rt", "macros"] }
 tokio-util = { version = "0.7", features = ["compat"] }
-gcloud-sdk = { version = "0.25.5", features = ["google-privacy-dlp-v2", "google-rest-storage-v1", "google-ai-generativelanguage-v1beta"] }
+gcloud-sdk = { version = "0.25.5", features = ["google-privacy-dlp-v2", "google-rest-storage-v1", "google-ai-generativelanguage-v1beta", "google-cloud-aiplatform-v1beta1"] }
 futures = "0.3"
 sha2 = "0.10"
 async-trait = "0.1"

diff --git a/README.md b/README.md
@@ -28,15 +28,15 @@ Google Cloud Platform's DLP API.
         * structured data table files (csv)
         * images (jpeg, png, bmp, gif)
         * PDF files (rendering as images)
-    * [AWS Comprehend](https://aws.amazon.com/comprehend/) PII redaction:
-        * text, html, csv, json files
-        * images through text extraction using OCR
-        * PDF files (rendering as images from OCR)
     * [Microsoft Presidio](https://microsoft.github.io/presidio/) for PII redaction (open source project that you can
       install on-prem).
         * text, html, csv, json files
         * images
         * PDF files (rendering as images)
+    * [GCP Vertex AI](https://cloud.google.com/vertex-ai/docs) based redaction
+        * text, html, csv, json files
+        * images that are supported by the models
+        * PDF files (rendering as images)
     * [Gemini LLM](https://ai.google.dev/gemini-api/docs) based redaction
         * text, html, csv, json files
         * images that are supported by the models
@@ -45,6 +45,10 @@ Google Cloud Platform's DLP API.
         * text, html, csv, json files
         * images that are supported by the models
         * PDF files (rendering as images)
+    * [AWS Comprehend](https://aws.amazon.com/comprehend/) PII redaction:
+        * text, html, csv, json files
+        * images through text extraction using OCR
+        * PDF files (rendering as images from OCR)
     * ... more DLP providers can be added in the future.
 * **CLI:**  Easy-to-use command-line interface for streamlined workflows.
 * Built with Rust to ensure speed, safety, and reliability.
@@ -80,10 +84,12 @@ Arguments:
 Options:
   -m, --max-size-limit <MAX_SIZE_LIMIT>
           Maximum size of files to copy in bytes
+  -n, --max-files-limit <MAX_FILES_LIMIT>
+          Maximum number of files to copy. Sort order is not guaranteed and depends on the provider
   -f, --filename-filter <FILENAME_FILTER>
           Filter by name using glob patterns such as *.txt
   -d, --redact <REDACT>
-          List of redacters to use [possible values: gcp-dlp, aws-comprehend, ms-presidio, gemini-llm, open-ai-llm]
+          List of redacters to use [possible values: gcp-dlp, aws-comprehend, ms-presidio, gemini-llm, open-ai-llm, gcp-vertex-ai]
       --allow-unsupported-copies
           Allow unsupported types to be copied without redaction
       --gcp-project-id <GCP_PROJECT_ID>
@@ -92,6 +98,14 @@ Options:
           Additional GCP DLP built in info types for redaction
       --gcp-dlp-stored-info-type <GCP_DLP_STORED_INFO_TYPE>
           Additional GCP DLP user defined stored info types for redaction
+      --gcp-region <GCP_REGION>
+          GCP region that will be used to redact and bill API calls for Vertex AI
+      --gcp-vertex-ai-native-image-support
+          Vertex AI model supports image editing natively. Default is false.
+      --gcp-vertex-ai-text-model <GCP_VERTEX_AI_TEXT_MODEL>
+          Model name for text redaction in Vertex AI. Default is 'publishers/google/models/gemini-1.5-flash-001'
+      --gcp-vertex-ai-image-model <GCP_VERTEX_AI_IMAGE_MODEL>
+          Model name for image redaction in Vertex AI. Default is 'publishers/google/models/gemini-1.5-pro-001'
       --csv-headers-disable
           Disable CSV headers (if they are not present)
       --csv-delimiter <CSV_DELIMITER>
@@ -142,21 +156,36 @@ To be able to use GCP DLP you need to:
 
 Additionally you can provide the list of user defined info types using `--gcp-dlp-stored-info-type` option.
 
-### AWS Comprehend
-
-To be able to use AWS Comprehend DLP you need to authenticate using `aws configure` or provide a service account.
-To provide an AWS region use `--aws-region` option since AWS Comprehend may not be available in all regions.
-AWS Comprehend DLP is only available for unstructured text files.
-
 ### Microsoft Presidio
 
 To be able to use Microsoft Presidio DLP you need to have a running instance of the Presidio API.
 You can use Docker to run it locally or deploy it to your infrastructure.
 You need to provide the URLs for text analysis and image redaction endpoints using `--ms-presidio-text-analyze-url` and
 `--ms-presidio-image-redact-url` options.
 
+### GCP Vertex AI
+
+To be able to use GCP Vertex AI you need to:
+
+- authenticate using `gcloud auth application-default login` or provide a service account key
+  using `GOOGLE_APPLICATION_CREDENTIALS` environment variable.
+- provide a GCP project id using `--gcp-project-id` option.
+- provide a GCP region using `--gcp-region` option.
+
+You can specify different models using `--gcp-vertex-ai-text-model` and `--gcp-vertex-ai-image-model` options.
+By default, they are set to:
+
+- `publishers/google/models/gemini-1.5-flash-001` for text model
+- `publishers/google/models/gemini-1.5-pro-001` for image model
+
+In case you have access to native image editing models such as Google Imagen 3, you can enable those capabilities using
+`--gcp-vertex-ai-native-image-support` option.
+Without native image support, the tool will use LLM output and edit images by coordinates.
+
 ### Gemini LLM
 
+Consider using the Vertex AI redacter instead of Gemini LLM for more flexibility.
+
 To be able to use Gemini as DLP/redacter you need to:
 
 - authenticate using `gcloud auth application-default login --client-id-file=<client_secret-file>.json` or provide a
@@ -171,6 +200,12 @@ To be able to use Gemini as DLP/redacter you need to:
 To be able to use Open AI LLM you need to provide an API key using `--open-ai-api-key` command line option.
 Optionally, you can provide a model name using `--open-ai-model` option. Default is `gpt-4o-mini`.
 
+### AWS Comprehend
+
+To be able to use AWS Comprehend DLP you need to authenticate using `aws configure` or provide a service account.
+To provide an AWS region use `--aws-region` option since AWS Comprehend may not be available in all regions.
+AWS Comprehend DLP is only available for unstructured text files.
+
 ## Multiple redacters
 
 You can specify multiple redacters using `--redact` option multiple times.

diff --git a/src/args.rs b/src/args.rs
@@ -1,8 +1,8 @@
-use crate::common_types::GcpProjectId;
+use crate::common_types::{GcpProjectId, GcpRegion};
 use crate::errors::AppError;
 use crate::redacters::{
-    GcpDlpRedacterOptions, GeminiLlmModelName, OpenAiLlmApiKey, OpenAiModelName,
-    RedacterBaseOptions, RedacterOptions, RedacterProviderOptions,
+    GcpDlpRedacterOptions, GcpVertexAiModelName, GeminiLlmModelName, OpenAiLlmApiKey,
+    OpenAiModelName, RedacterBaseOptions, RedacterOptions, RedacterProviderOptions,
 };
 use clap::*;
 use std::fmt::Display;
@@ -93,6 +93,7 @@ pub enum RedacterType {
     MsPresidio,
     GeminiLlm,
     OpenAiLlm,
+    GcpVertexAi,
 }
 
 impl std::str::FromStr for RedacterType {
@@ -117,6 +118,7 @@ impl Display for RedacterType {
             RedacterType::MsPresidio => write!(f, "ms-presidio"),
             RedacterType::GeminiLlm => write!(f, "gemini-llm"),
             RedacterType::OpenAiLlm => write!(f, "openai-llm"),
+            RedacterType::GcpVertexAi => write!(f, "gcp-vertex-ai"),
         }
     }
 }
@@ -149,6 +151,30 @@ pub struct RedacterArgs {
     )]
     pub gcp_dlp_stored_info_type: Option<Vec<String>>,
 
+    #[arg(
+        long,
+        help = "GCP region that will be used to redact and bill API calls for Vertex AI"
+    )]
+    pub gcp_region: Option<GcpRegion>,
+
+    #[arg(
+        long,
+        help = "Vertex AI model supports image editing natively. Default is false."
+    )]
+    pub gcp_vertex_ai_native_image_support: bool,
+
+    #[arg(
+        long,
+        help = "Model name for text redaction in Vertex AI. Default is 'publishers/google/models/gemini-1.5-flash-001'"
+    )]
+    pub gcp_vertex_ai_text_model: Option<GcpVertexAiModelName>,
+
+    #[arg(
+        long,
+        help = "Model name for image redaction in Vertex AI. Default is 'publishers/google/models/gemini-1.5-pro-001'"
+    )]
+    pub gcp_vertex_ai_image_model: Option<GcpVertexAiModelName>,
+
     #[arg(
         long,
         help = "Disable CSV headers (if they are not present)",
@@ -260,6 +286,25 @@ impl TryInto<RedacterOptions> for RedacterArgs {
                         model: self.open_ai_model.clone(),
                     },
                 )),
+                RedacterType::GcpVertexAi => Ok(RedacterProviderOptions::GcpVertexAi(
+                    crate::redacters::GcpVertexAiRedacterOptions {
+                        project_id: self.gcp_project_id.clone().ok_or_else(|| {
+                            AppError::RedacterConfigError {
+                                message: "GCP project id is required for GCP Vertex AI redacter"
+                                    .to_string(),
+                            }
+                        })?,
+                        gcp_region: self.gcp_region.clone().ok_or_else(|| {
+                            AppError::RedacterConfigError {
+                                message: "GCP region is required for GCP Vertex AI redacter"
+                                    .to_string(),
+                            }
+                        })?,
+                        native_image_support: self.gcp_vertex_ai_native_image_support,
+                        text_model: self.gcp_vertex_ai_text_model.clone(),
+                        image_model: self.gcp_vertex_ai_image_model.clone(),
+                    },
+                )),
             }?;
             provider_options.push(redacter_options);
         }

diff --git a/src/common_types.rs b/src/common_types.rs
@@ -4,6 +4,9 @@ use serde::{Deserialize, Serialize};
 #[derive(Debug, Clone, ValueStruct)]
 pub struct GcpProjectId(String);
 
+#[derive(Debug, Clone, ValueStruct)]
+pub struct GcpRegion(String);
+
 #[derive(Debug, Clone, ValueStruct)]
 pub struct AwsAccountId(String);
 

diff --git a/src/main.rs b/src/main.rs
@@ -1,12 +1,11 @@
 use std::error::Error;
 
+use crate::commands::*;
+use crate::errors::AppError;
 use args::*;
 use clap::Parser;
 use console::{Style, Term};
 
-use crate::commands::*;
-use crate::errors::AppError;
-
 mod args;
 mod reporter;