Skip to content

Commit

Permalink
Sampling size argument support for text
Browse files — browse the repository at this point in the history
  • Loading branch information
abdolence committed Aug 8, 2024
1 parent 94dcbdf commit 9255019
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 7 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ Options:
URL for image redact endpoint for MsPresidio redacter
--gemini-model <GEMINI_MODEL>
Gemini model name for Gemini LLM redacter. Default is 'models/gemini-1.5-flash'
--sampling-size <SAMPLING_SIZE>
Sampling size in bytes before redacting files. Disabled by default
-h, --help
Print help
```
Expand Down
7 changes: 7 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ pub struct RedacterArgs {
help = "Gemini model name for Gemini LLM redacter. Default is 'models/gemini-1.5-flash'"
)]
pub gemini_model: Option<GeminiLlmModelName>,

#[arg(
long,
help = "Sampling size in bytes before redacting files. Disabled by default"
)]
pub sampling_size: Option<usize>,
}

impl TryInto<RedacterOptions> for RedacterArgs {
Expand Down Expand Up @@ -189,6 +195,7 @@ impl TryInto<RedacterOptions> for RedacterArgs {
allow_unsupported_copies: self.allow_unsupported_copies,
csv_headers_disable: self.csv_headers_disable,
csv_delimiter: self.csv_delimiter.map(|c| c as u8),
sampling_size: self.sampling_size,
})
}
}
15 changes: 11 additions & 4 deletions src/commands/copy_command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,27 @@ pub async fn command_copy(
redacter_options: Option<RedacterOptions>,
) -> AppResult<CopyCommandResult> {
let bold_style = Style::new().bold();
let redacted_output = if let Some(ref options) = redacter_options {
let redacted_output = if let Some(ref options) = redacter_options.as_ref() {
bold_style
.clone()
.green()
.apply_to(format!("✓ Yes ({})", options))
.apply_to(format!("✓ Yes ({})", &options))
} else {
bold_style.clone().red().apply_to("✗ No".to_string())
};
let sampling_output =
if let Some(ref sampling_size) = redacter_options.as_ref().and_then(|o| o.sampling_size) {
Style::new().apply_to(format!("{} bytes.", sampling_size))
} else {
Style::new().dim().apply_to("-".to_string())
};
term.write_line(
format!(
"Copying from {} to {}.\nRedacting: {}.",
"Copying from {} to {}.\nRedacting: {}.\nSampling: {}\n",
bold_style.clone().white().apply_to(source),
bold_style.clone().yellow().apply_to(destination),
redacted_output
redacted_output,
sampling_output
)
.as_str(),
)?;
Expand Down
1 change: 1 addition & 0 deletions src/redacters/aws_comprehend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ mod tests {
allow_unsupported_copies: false,
csv_headers_disable: false,
csv_delimiter: None,
sampling_size: None,
};

let redacter = AwsComprehendRedacter::new(
Expand Down
1 change: 1 addition & 0 deletions src/redacters/gcp_dlp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,7 @@ mod tests {
allow_unsupported_copies: false,
csv_headers_disable: false,
csv_delimiter: None,
sampling_size: None,
};

let redacter = GcpDlpRedacter::new(
Expand Down
1 change: 1 addition & 0 deletions src/redacters/gemini_llm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ mod tests {
allow_unsupported_copies: false,
csv_headers_disable: false,
csv_delimiter: None,
sampling_size: None,
};

let redacter = GeminiLlmRedacter::new(
Expand Down
17 changes: 14 additions & 3 deletions src/redacters/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ pub struct RedacterOptions {
pub allow_unsupported_copies: bool,
pub csv_headers_disable: bool,
pub csv_delimiter: Option<u8>,
pub sampling_size: Option<usize>,
}

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -189,9 +190,19 @@ pub async fn redact_stream<
{
let all_chunks: Vec<bytes::Bytes> = input.try_collect().await?;
let all_bytes = all_chunks.concat();
let content = String::from_utf8(all_bytes).map_err(|e| AppError::SystemError {
message: format!("Failed to convert bytes to string: {}", e),
})?;
let whole_content =
String::from_utf8(all_bytes).map_err(|e| AppError::SystemError {
message: format!("Failed to convert bytes to string: {}", e),
})?;
let content = if let Some(sampling_size) = redacter.options().sampling_size {
let sampling_size = std::cmp::min(sampling_size, whole_content.len());
whole_content
.chars()
.take(sampling_size)
.collect::<String>()
} else {
whole_content
};
Ok(RedacterDataItem {
content: RedacterDataItemContent::Value(content),
file_ref: file_ref.clone(),
Expand Down
1 change: 1 addition & 0 deletions src/redacters/ms_presidio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ mod tests {
allow_unsupported_copies: false,
csv_headers_disable: false,
csv_delimiter: None,
sampling_size: None,
};

let redacter = MsPresidioRedacter::new(
Expand Down

0 comments on commit 9255019

Please sign in to comment.