Skip to content

Commit

Permalink
fix: unsound use of env::set_var, was breaking stdlib change to make …
Browse files Browse the repository at this point in the history
…unsafe

It is generally not safe to set env variables. The correct way to set a config
value that needs to be overridden is to hold a copy internal to the library and
only read from the environment.
  • Loading branch information
sftse committed Oct 22, 2024
1 parent 98b2cbc commit af7d167
Showing 1 changed file with 27 additions and 7 deletions.
34 changes: 27 additions & 7 deletions tokenizers/src/utils/parallelism.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use rayon::iter::IterBridge;
use rayon::prelude::*;
use rayon_cond::CondIterator;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::Ordering;

// Re-export rayon current_num_threads
Expand All @@ -14,35 +15,54 @@ pub use rayon::current_num_threads;
pub const ENV_VARIABLE: &str = "TOKENIZERS_PARALLELISM";

static USED_PARALLELISM: AtomicBool = AtomicBool::new(false);
static PARALLELISM: AtomicU8 = AtomicU8::new(0);

/// Check if the TOKENIZERS_PARALLELISM env variable has been explicitly set
pub fn is_parallelism_configured() -> bool {
std::env::var(ENV_VARIABLE).is_ok()
std::env::var(ENV_VARIABLE).is_ok() || get_override_parallelism().is_some()
}

/// Check if at some point we used a parallel iterator
pub fn has_parallelism_been_used() -> bool {
USED_PARALLELISM.load(Ordering::SeqCst)
}

/// Get internally set parallelism
fn get_override_parallelism() -> Option<bool> {
match PARALLELISM.load(Ordering::SeqCst) {
0 => None,
1 => Some(false),
2 => Some(true),
_ => unreachable!(),
}
}

/// Get the currently set value for `TOKENIZERS_PARALLELISM` env variable
pub fn get_parallelism() -> bool {
fn get_env_parallelism() -> bool {
match std::env::var(ENV_VARIABLE) {
Ok(mut v) => {
v.make_ascii_lowercase();
!matches!(v.as_ref(), "" | "off" | "false" | "f" | "no" | "n" | "0")
}
#[cfg(not(miri))]
Err(_) => true, // If we couldn't get the variable, we use the default
// FIXME: for now turn parallelism off under miri, otherwise complains about crossbeam-epoch
#[cfg(miri)]
Err(_) => false,
}
}

pub fn get_parallelism() -> bool {
// FIXME: for now turn parallelism off under miri, otherwise complains about crossbeam-epoch
#[cfg(miri)]
return false;
#[cfg(not(miri))]
if let Some(parallel) = get_override_parallelism() {
parallel
} else {
get_env_parallelism()
}
}

/// Set the value for `TOKENIZERS_PARALLELISM` for the current process
pub fn set_parallelism(val: bool) {
std::env::set_var(ENV_VARIABLE, if val { "true" } else { "false" })
PARALLELISM.store(if val { 2 } else { 1 }, Ordering::SeqCst);
}

/// Allows to convert into an iterator that can be executed either parallelly or serially.
Expand Down

0 comments on commit af7d167

Please sign in to comment.