diff --git a/Cargo.lock b/Cargo.lock index 574b2f1..bbc00a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,29 @@ dependencies = [ "uuid", ] +[[package]] +name = "aws-sdk-comprehend" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b660cee83c4a583a8cc025d7a656d77df44c5874c8d42cd4f9bec7dde7182db5" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + [[package]] name = "aws-sdk-s3" version = "1.42.0" @@ -2197,11 +2220,12 @@ dependencies = [ [[package]] name = "redacter" -version = "0.1.2" +version = "0.2.0" dependencies = [ "async-recursion", "async-trait", "aws-config", + "aws-sdk-comprehend", "aws-sdk-s3", "cargo-husky", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 8fec93f..59acf06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "redacter" -version = "0.1.2" +version = "0.2.0" edition = "2021" authors = ["Abdulla Abdurakhmanov "] license = "Apache-2.0" @@ -47,6 +47,7 @@ tempfile = "3" csv-async = { version = "1", default-features = false, features = ["tokio", "tokio-stream"] } aws-config = { version = "1", features = ["behavior-version-latest"] } aws-sdk-s3 = { version = "1" } +aws-sdk-comprehend = { version = "1" } [dev-dependencies] diff --git a/README.md b/README.md index cdaa834..e635fab 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ Google Cloud Platform's DLP API. * text, html, json files * structured data table files (csv) * images (jpeg, png, bpm, gif) + * AWS Comprehend PII redaction for text files. + * ... more DLP providers can be added in the future. * **CLI:** Easy-to-use command-line interface for streamlined workflows. * Built with Rust to ensure speed, safety, and reliability. 
@@ -56,7 +58,7 @@ Options: -f, --filename-filter Filter by name using glob patterns such as *.txt -d, --redact - Redacter type [possible values: gcp-dlp] + Redacter type [possible values: gcp-dlp, aws-comprehend-dlp] --gcp-project-id GCP project id that will be used to redact and bill API calls --allow-unsupported-copies @@ -65,6 +67,8 @@ Options: Disable CSV headers (if they are not present) --csv-delimiter CSV delimiter (default is ',' + --aws-region + AWS region for AWS Comprehend DLP redacter -h, --help Print help ``` @@ -73,9 +77,6 @@ DLP is optional and should be enabled with `--redact` (`-d`) option. Without DLP enabled, the tool will copy all files without redaction. With DLP enabled, the tool will redact files based on the DLP model and skip unsupported files. -To be able to use GCP DLP you need to authenticate using `gcloud auth application-default login` or provide a service -account key using `GOOGLE_APPLICATION_CREDENTIALS` environment variable. - Source/destination can be a local file or directory, or a file in GCS, S3, or a zip archive: - Local file: `/tmp/file.txt` or `/tmp` for whole directory recursive copy @@ -83,7 +84,20 @@ Source/destination can be a local file or directory, or a file in GCS, S3, or a - S3: `s3://bucket/file.txt` or `s3://bucket/test-dir/` for whole directory recursive copy - Zip archive: `zip://tmp/archive.zip` -### Examples: +## DLP redacters + +### Google Cloud Platform DLP + +To be able to use GCP DLP you need to authenticate using `gcloud auth application-default login` or provide a service +account key using `GOOGLE_APPLICATION_CREDENTIALS` environment variable. + +### AWS Comprehend DLP + +To be able to use AWS Comprehend DLP you need to authenticate using `aws configure` or provide a service account. +To specify an AWS region, use the `--aws-region` option, since AWS Comprehend may not be available in all regions. +AWS Comprehend DLP is only available for unstructured text files. 
+ +## Examples: ```sh # Copy and redact a file from local filesystem to GCS @@ -120,6 +134,9 @@ and/or by size: - The accuracy of redaction depends on the DLP model, so don't rely on it as the only security measure. - The tool was mostly design to redact files internally. Not recommended use it in public environments without proper security measures and manual review. +- Integrity of the files is not guaranteed due to DLP implementation specifics. Some of the formats such as + HTML/XML/JSON + may be corrupted after redaction since they are treated as text. - Use it at your own risk. The author is not responsible for any data loss or security breaches. ## Licence diff --git a/src/args.rs b/src/args.rs index 4b34cb8..251dce3 100644 --- a/src/args.rs +++ b/src/args.rs @@ -40,6 +40,7 @@ pub enum CliCommand { #[derive(ValueEnum, Debug, Clone)] pub enum RedacterType { GcpDlp, + AwsComprehendDlp, } impl std::str::FromStr for RedacterType { @@ -48,6 +49,7 @@ impl std::str::FromStr for RedacterType { fn from_str(s: &str) -> Result { match s { "gcp-dlp" => Ok(RedacterType::GcpDlp), + "aws-comprehend-dlp" => Ok(RedacterType::AwsComprehendDlp), _ => Err(format!("Unknown redacter type: {}", s)), } } @@ -57,6 +59,7 @@ impl Display for RedacterType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { RedacterType::GcpDlp => write!(f, "gcp-dlp"), + RedacterType::AwsComprehendDlp => write!(f, "aws-comprehend-dlp"), } } } @@ -89,6 +92,9 @@ pub struct RedacterArgs { #[arg(long, help = "CSV delimiter (default is ','")] pub csv_delimiter: Option, + + #[arg(long, help = "AWS region for AWS Comprehend DLP redacter")] + pub aws_region: Option, } impl TryInto for RedacterArgs { @@ -104,6 +110,11 @@ impl TryInto for RedacterArgs { message: "GCP project id is required for GCP DLP redacter".to_string(), }), }, + Some(RedacterType::AwsComprehendDlp) => Ok(RedacterProviderOptions::AwsComprehendDlp( + crate::redacters::AwsComprehendDlpRedacterOptions { + region: 
self.aws_region.map(aws_config::Region::new), + }, + )), None => Err(AppError::RedacterConfigError { message: "Redacter type is required".to_string(), }), diff --git a/src/commands/copy_command.rs b/src/commands/copy_command.rs index 6ffc1d3..dbbfc78 100644 --- a/src/commands/copy_command.rs +++ b/src/commands/copy_command.rs @@ -3,7 +3,7 @@ use crate::filesystems::{ AbsoluteFilePath, DetectFileSystem, FileMatcher, FileMatcherResult, FileSystemConnection, FileSystemRef, }; -use crate::redacters::{Redacter, RedacterOptions, Redacters}; +use crate::redacters::{RedactSupportedOptions, Redacter, RedacterOptions, Redacters}; use crate::reporter::AppReporter; use crate::AppResult; use console::{Style, Term}; @@ -228,7 +228,9 @@ async fn redact_upload_file<'a, SFS: FileSystemConnection<'a>, DFS: FileSystemCo dest_file_ref: &FileSystemRef, redacter: &impl Redacter, ) -> AppResult { - if redacter.is_redact_supported(dest_file_ref).await? { + if redacter.redact_supported_options(dest_file_ref).await? 
+ != RedactSupportedOptions::Unsupported + { match redacter.redact_stream(source_reader, dest_file_ref).await { Ok(redacted_reader) => { destination_fs diff --git a/src/filesystems/aws_s3.rs b/src/filesystems/aws_s3.rs index 711b0c5..63efa4a 100644 --- a/src/filesystems/aws_s3.rs +++ b/src/filesystems/aws_s3.rs @@ -23,7 +23,6 @@ impl<'a> AwsS3FileSystem<'a> { let shared_config = aws_config::load_from_env().await; let (bucket_name, object_name) = Self::parse_s3_path(path)?; let is_dir = object_name.ends_with('/'); - println!("Bucket: {}, Object: {}", bucket_name, object_name); let client = aws_sdk_s3::Client::new(&shared_config); Ok(AwsS3FileSystem { diff --git a/src/redacters/aws_comprehend.rs b/src/redacters/aws_comprehend.rs new file mode 100644 index 0000000..142d598 --- /dev/null +++ b/src/redacters/aws_comprehend.rs @@ -0,0 +1,175 @@ +use crate::errors::AppError; +use crate::filesystems::FileSystemRef; +use crate::redacters::{ + RedactSupportedOptions, Redacter, RedacterDataItem, RedacterDataItemContent, RedacterOptions, + Redacters, +}; +use crate::reporter::AppReporter; +use crate::AppResult; +use aws_config::Region; +use rvstruct::ValueStruct; + +#[derive(Debug, Clone)] +pub struct AwsComprehendDlpRedacterOptions { + pub region: Option, +} + +#[derive(Clone)] +pub struct AwsComprehendDlpRedacter<'a> { + client: aws_sdk_comprehend::Client, + redacter_options: RedacterOptions, + reporter: &'a AppReporter<'a>, +} + +impl<'a> AwsComprehendDlpRedacter<'a> { + pub async fn new( + redacter_options: RedacterOptions, + aws_dlp_options: AwsComprehendDlpRedacterOptions, + reporter: &'a AppReporter<'a>, + ) -> AppResult { + let region_provider = aws_config::meta::region::RegionProviderChain::first_try( + aws_dlp_options.region.clone(), + ) + .or_default_provider(); + let shared_config = aws_config::from_env().region(region_provider).load().await; + let client = aws_sdk_comprehend::Client::new(&shared_config); + Ok(Self { + client, + redacter_options, + reporter, + }) + 
} + + pub async fn redact_text_file( + &self, + input: RedacterDataItem, + ) -> AppResult { + self.reporter.report(format!( + "Redacting a text file: {} ({:?})", + input.file_ref.relative_path.value(), + input.file_ref.media_type + ))?; + let text_content = match input.content { + RedacterDataItemContent::Value(content) => Ok(content), + _ => Err(AppError::SystemError { + message: "Unsupported item for text redacting".to_string(), + }), + }?; + + let aws_request = self + .client + .detect_pii_entities() + .language_code(aws_sdk_comprehend::types::LanguageCode::En) + .text(text_content.clone()); + + let result = aws_request.send().await?; + let redacted_content = result.entities.iter().fold(text_content, |acc, entity| { + entity.iter().fold(acc, |acc, entity| { + match (entity.begin_offset, entity.end_offset) { + (Some(start), Some(end)) => [ + acc[..start as usize].to_string(), + "X".repeat((end - start) as usize), + acc[end as usize..].to_string(), + ] + .concat(), + (Some(start), None) => { + acc[..start as usize].to_string() + + "X".repeat(acc.len() - start as usize).as_str() + } + (None, Some(end)) => { + ["X".repeat(end as usize), acc[end as usize..].to_string()].concat() + } + _ => acc, + } + }) + }); + Ok(RedacterDataItemContent::Value(redacted_content)) + } +} + +impl<'a> Redacter for AwsComprehendDlpRedacter<'a> { + async fn redact(&self, input: RedacterDataItem) -> AppResult { + match &input.content { + RedacterDataItemContent::Value(_) => self.redact_text_file(input).await, + RedacterDataItemContent::Table { .. } | RedacterDataItemContent::Image { .. 
} => { + Err(AppError::SystemError { + message: "Attempt to redact of unsupported image type".to_string(), + }) + } + } + } + + async fn redact_supported_options( + &self, + file_ref: &FileSystemRef, + ) -> AppResult { + Ok(match file_ref.media_type.as_ref() { + Some(media_type) if Redacters::is_mime_text(media_type) => { + RedactSupportedOptions::Supported + } + Some(media_type) if Redacters::is_mime_table(media_type) => { + RedactSupportedOptions::SupportedAsText + } + _ => RedactSupportedOptions::Unsupported, + }) + } + + fn options(&self) -> &RedacterOptions { + &self.redacter_options + } +} + +#[allow(unused_imports)] +mod tests { + use super::*; + use crate::redacters::RedacterProviderOptions; + use console::Term; + + #[tokio::test] + #[cfg_attr(not(feature = "ci-aws"), ignore)] + async fn redact_text_file_test() -> Result<(), Box> { + let term = Term::stdout(); + let reporter: AppReporter = AppReporter::from(&term); + let test_aws_region = std::env::var("TEST_AWS_REGION").expect("TEST_AWS_REGION required"); + let test_content = "Hello, John"; + + let file_ref = FileSystemRef { + relative_path: "temp_file.txt".into(), + media_type: Some(mime::TEXT_PLAIN), + file_size: Some(test_content.len() as u64), + }; + + let content = RedacterDataItemContent::Value(test_content.to_string()); + let input = RedacterDataItem { file_ref, content }; + + let redacter_options = RedacterOptions { + provider_options: RedacterProviderOptions::AwsComprehendDlp( + AwsComprehendDlpRedacterOptions { + region: Some(Region::new(test_aws_region.clone())), + }, + ), + allow_unsupported_copies: false, + csv_headers_disable: false, + csv_delimiter: None, + }; + + let redacter = AwsComprehendDlpRedacter::new( + redacter_options, + AwsComprehendDlpRedacterOptions { + region: Some(Region::new(test_aws_region)), + }, + &reporter, + ) + .await?; + + let redacted_content = redacter.redact(input).await?; + match redacted_content { + RedacterDataItemContent::Value(value) => { + assert_eq!(value, 
"Hello, XXXX"); + } + _ => panic!("Unexpected redacted content type"), + } + + Ok(()) + } +} diff --git a/src/redacters/gcp_dlp.rs b/src/redacters/gcp_dlp.rs index 16ec547..d22306a 100644 --- a/src/redacters/gcp_dlp.rs +++ b/src/redacters/gcp_dlp.rs @@ -2,7 +2,8 @@ use crate::common_types::GcpProjectId; use crate::errors::AppError; use crate::filesystems::FileSystemRef; use crate::redacters::{ - Redacter, RedacterDataItem, RedacterDataItemContent, RedacterOptions, Redacters, + RedactSupportedOptions, Redacter, RedacterDataItem, RedacterDataItemContent, RedacterOptions, + Redacters, }; use crate::reporter::AppReporter; use crate::AppResult; @@ -14,10 +15,10 @@ use rvstruct::ValueStruct; #[derive(Clone)] pub struct GcpDlpRedacter<'a> { - pub client: GoogleApi>, - pub redacter_options: RedacterOptions, - pub gcp_dlp_options: GcpDlpRedacterOptions, - pub reporter: &'a AppReporter<'a>, + client: GoogleApi>, + redacter_options: RedacterOptions, + gcp_dlp_options: GcpDlpRedacterOptions, + reporter: &'a AppReporter<'a>, } #[derive(Debug, Clone)] @@ -204,12 +205,21 @@ impl<'a> Redacter for GcpDlpRedacter<'a> { } } - async fn is_redact_supported(&self, file_ref: &FileSystemRef) -> AppResult { - Ok(file_ref.media_type.as_ref().iter().all(|media_type| { - Redacters::is_mime_text(media_type) - || Redacters::is_mime_table(media_type) - || Self::check_supported_image_type(media_type) - })) + async fn redact_supported_options( + &self, + file_ref: &FileSystemRef, + ) -> AppResult { + Ok( + if file_ref.media_type.as_ref().iter().all(|media_type| { + Redacters::is_mime_text(media_type) + || Redacters::is_mime_table(media_type) + || Self::check_supported_image_type(media_type) + }) { + RedactSupportedOptions::Supported + } else { + RedactSupportedOptions::Unsupported + }, + ) } fn options(&self) -> &RedacterOptions { @@ -366,7 +376,7 @@ mod tests { use console::Term; #[tokio::test] - #[cfg_attr(not(feature = "ci"), ignore)] + #[cfg_attr(not(feature = "ci-gcp"), ignore)] async fn 
redact_text_file_test() -> Result<(), Box> { let term = Term::stdout(); let reporter: AppReporter = AppReporter::from(&term); diff --git a/src/redacters/mod.rs b/src/redacters/mod.rs index 551bc3e..beae5bd 100644 --- a/src/redacters/mod.rs +++ b/src/redacters/mod.rs @@ -5,12 +5,16 @@ use gcloud_sdk::prost::bytes; use mime::Mime; use std::fmt::Display; -mod gcp_dlp; use crate::errors::AppError; use crate::filesystems::FileSystemRef; use crate::reporter::AppReporter; + +mod gcp_dlp; pub use gcp_dlp::*; +mod aws_comprehend; +pub use aws_comprehend::*; + #[derive(Debug, Clone)] pub struct RedacterDataItem { pub content: RedacterDataItemContent, @@ -33,6 +37,7 @@ pub enum RedacterDataItemContent { #[derive(Clone)] pub enum Redacters<'a> { GcpDlp(GcpDlpRedacter<'a>), + AwsComprehendDlp(AwsComprehendDlpRedacter<'a>), } #[derive(Debug, Clone)] @@ -46,12 +51,14 @@ pub struct RedacterOptions { #[derive(Debug, Clone)] pub enum RedacterProviderOptions { GcpDlp(GcpDlpRedacterOptions), + AwsComprehendDlp(AwsComprehendDlpRedacterOptions), } impl Display for RedacterOptions { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self.provider_options { RedacterProviderOptions::GcpDlp(_) => write!(f, "gcp-dlp"), + RedacterProviderOptions::AwsComprehendDlp(_) => write!(f, "aws-comprehend-dlp"), } } } @@ -65,6 +72,16 @@ impl<'a> Redacters<'a> { RedacterProviderOptions::GcpDlp(ref options) => Ok(Redacters::GcpDlp( GcpDlpRedacter::new(redacter_options.clone(), options.clone(), reporter).await?, )), + RedacterProviderOptions::AwsComprehendDlp(ref options) => { + Ok(Redacters::AwsComprehendDlp( + AwsComprehendDlpRedacter::new( + redacter_options.clone(), + options.clone(), + reporter, + ) + .await?, + )) + } } } @@ -74,7 +91,9 @@ impl<'a> Redacters<'a> { && (mime.subtype() == mime::PLAIN || mime.subtype() == mime::HTML || mime.subtype() == mime::XML - || mime.subtype() == mime::CSS)) + || mime.subtype() == mime::CSS + || mime.subtype() == "x-yaml" + || mime.subtype() 
== "yaml")) || (mime.type_() == mime::APPLICATION && (mime.subtype() == mime::XML || mime.subtype() == mime::JSON @@ -91,10 +110,20 @@ impl<'a> Redacters<'a> { } } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RedactSupportedOptions { + Supported, + SupportedAsText, + Unsupported, +} + pub trait Redacter { async fn redact(&self, input: RedacterDataItem) -> AppResult; - async fn is_redact_supported(&self, file_ref: &FileSystemRef) -> AppResult; + async fn redact_supported_options( + &self, + file_ref: &FileSystemRef, + ) -> AppResult; fn options(&self) -> &RedacterOptions; @@ -106,14 +135,21 @@ pub trait Redacter { file_ref: &FileSystemRef, ) -> AppResult> + Send + Sync + Unpin + 'static>> { + let supported_options = self.redact_supported_options(file_ref).await?; let content_to_redact = match file_ref.media_type { - Some(ref mime) if Redacters::is_mime_text(mime) => { + Some(ref mime) + if Redacters::is_mime_text(mime) + || (Redacters::is_mime_table(mime) + && matches!( + supported_options, + RedactSupportedOptions::SupportedAsText + )) => + { let all_chunks: Vec = input.try_collect().await?; let all_bytes = all_chunks.concat(); - let content = - String::from_utf8(all_bytes).map_err(|e| crate::AppError::SystemError { - message: format!("Failed to convert bytes to string: {}", e), - })?; + let content = String::from_utf8(all_bytes).map_err(|e| AppError::SystemError { + message: format!("Failed to convert bytes to string: {}", e), + })?; Ok(RedacterDataItem { content: RedacterDataItemContent::Value(content), file_ref: file_ref.clone(), @@ -202,18 +238,26 @@ impl<'a> Redacter for Redacters<'a> { async fn redact(&self, input: RedacterDataItem) -> AppResult { match self { Redacters::GcpDlp(redacter) => redacter.redact(input).await, + Redacters::AwsComprehendDlp(redacter) => redacter.redact(input).await, } } - async fn is_redact_supported(&self, file_ref: &FileSystemRef) -> AppResult { + async fn redact_supported_options( + &self, + file_ref: &FileSystemRef, + ) -> 
AppResult { match self { - Redacters::GcpDlp(redacter) => redacter.is_redact_supported(file_ref).await, + Redacters::GcpDlp(redacter) => redacter.redact_supported_options(file_ref).await, + Redacters::AwsComprehendDlp(redacter) => { + redacter.redact_supported_options(file_ref).await + } } } fn options(&self) -> &RedacterOptions { match self { Redacters::GcpDlp(redacter) => redacter.options(), + Redacters::AwsComprehendDlp(redacter) => redacter.options(), } } }