Skip to content

Commit

Permalink
Add regex search
Browse files Browse the repository at this point in the history
  • Loading branch information
sgreenbury committed Jul 9, 2024
1 parent 8736cf8 commit 6ba5ffe
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 15 deletions.
43 changes: 41 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use popgetter::{
},
geo::BBox,
search::{
Country, DataPublisher, GeometryLevel, MetricId, SearchContext, SearchParams,
Country, DataPublisher, GeometryLevel, MetricId, SearchContext, SearchParams, SearchRegex,
SearchResults, SearchText, SourceDataRelease, SourceMetricId, YearRange,
},
Popgetter,
Expand Down Expand Up @@ -214,6 +214,13 @@ struct SearchParamsArgs {
description: Vec<String>,
#[arg(short, long, help="Filter by HXL tag, name, or description", num_args=0..)]
text: Vec<String>,
#[arg(
short,
long,
help="Filter with case-insensitive regex by HXL tag, name, or description",
num_args=0..
)]
regex: Vec<String>,
}

/// Expected behaviour:
Expand Down Expand Up @@ -289,10 +296,42 @@ fn text_searches_from_args(
all_text_searches
}

/// Builds the list of regex searches requested through the command-line arguments.
///
/// Each pattern in `hxl`, `name` and `description` becomes a `SearchRegex` restricted to the
/// `SearchContext::Hxl`, `SearchContext::HumanReadableName` and `SearchContext::Description`
/// context respectively. Each pattern in `regex` becomes a `SearchRegex` over
/// `SearchContext::all()`.
///
/// The returned vector preserves the argument order: `hxl`, `name`, `description`, `regex`.
fn regex_searches_from_args(
    hxl: Vec<String>,
    name: Vec<String>,
    description: Vec<String>,
    regex: Vec<String>,
) -> Vec<SearchRegex> {
    // The output size is known up front, so allocate once.
    let mut all_regex_searches =
        Vec::with_capacity(hxl.len() + name.len() + description.len() + regex.len());
    // The argument vectors are owned: move each pattern into its `SearchRegex` rather than
    // cloning it.
    all_regex_searches.extend(hxl.into_iter().map(|pattern| SearchRegex {
        regex: pattern,
        context: nonempty![SearchContext::Hxl],
    }));
    all_regex_searches.extend(name.into_iter().map(|pattern| SearchRegex {
        regex: pattern,
        context: nonempty![SearchContext::HumanReadableName],
    }));
    all_regex_searches.extend(description.into_iter().map(|pattern| SearchRegex {
        regex: pattern,
        context: nonempty![SearchContext::Description],
    }));
    all_regex_searches.extend(regex.into_iter().map(|pattern| SearchRegex {
        regex: pattern,
        context: SearchContext::all(),
    }));
    all_regex_searches
}

impl From<SearchParamsArgs> for SearchParams {
fn from(args: SearchParamsArgs) -> Self {
SearchParams {
text: text_searches_from_args(args.hxl, args.name, args.description, args.text),
text: text_searches_from_args(
args.hxl.clone(),
args.name.clone(),
args.description.clone(),
args.text.clone(),
),
regex: regex_searches_from_args(args.hxl, args.name, args.description, args.regex),
year_range: args.year_range.clone(),
geometry_level: args.geometry_level.clone().map(GeometryLevel),
source_data_release: args.source_data_release.clone().map(SourceDataRelease),
Expand Down
84 changes: 71 additions & 13 deletions src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ fn _combine_exprs_with_and1(exprs: NonEmpty<Expr>) -> Expr {
query
}

/// Build an expression that tests whether the values of `column` match the regex `pattern`,
/// ignoring case.
///
/// Case-insensitivity is obtained by prefixing the pattern with the inline `(?i)` flag. The
/// `true` passed to `contains` is its second (`strict`) argument.
fn case_insensitive_regex_contains(column: &str, pattern: &str) -> Expr {
    let case_insensitive_pattern = format!("(?i){pattern}");
    let target = col(column);
    target.str().contains(lit(case_insensitive_pattern), true)
}

/// Search in a column case-insensitively for a string literal (i.e. not a regex!). The search
/// parameter can appear anywhere in the column value.
fn case_insensitive_contains(column: &str, value: &str) -> Expr {
Expand Down Expand Up @@ -110,6 +117,23 @@ impl From<SearchText> for Expr {
}
}

/// Implementing conversion from `SearchText` to a polars expression enables a
/// `SearchText` to be passed to polars dataframe for filtering results.
impl From<SearchRegex> for Expr {
fn from(val: SearchRegex) -> Self {
let queries: NonEmpty<Expr> = val.context.map(|field| match field {
SearchContext::Hxl => case_insensitive_regex_contains(COL::METRIC_HXL_TAG, &val.regex),
SearchContext::HumanReadableName => {
case_insensitive_regex_contains(COL::METRIC_HUMAN_READABLE_NAME, &val.regex)
}
SearchContext::Description => {
case_insensitive_regex_contains(COL::METRIC_DESCRIPTION, &val.regex)
}
});
combine_exprs_with_or1(queries)
}
}

impl From<YearRange> for Expr {
fn from(value: YearRange) -> Self {
match value {
Expand Down Expand Up @@ -182,13 +206,10 @@ pub struct SearchText {
pub context: NonEmpty<SearchContext>,
}

impl Default for SearchText {
fn default() -> Self {
Self {
text: "".to_string(),
context: SearchContext::all(),
}
}
/// A regular-expression search together with the metadata fields it should be applied to.
///
/// When converted into an `Expr`, the pattern is matched case-insensitively against the column
/// of each listed context.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct SearchRegex {
/// The regex pattern to search for.
pub regex: String,
/// The contexts (HXL tag, human readable name, description) in which to search.
pub context: NonEmpty<SearchContext>,
}

/// Search over years
Expand Down Expand Up @@ -235,6 +256,7 @@ pub struct SourceMetricId(pub String);
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct SearchParams {
pub text: Vec<SearchText>,
pub regex: Vec<SearchRegex>,
pub year_range: Option<Vec<YearRange>>,
pub metric_id: Vec<MetricId>,
pub geometry_level: Option<GeometryLevel>,
Expand Down Expand Up @@ -270,12 +292,13 @@ fn _to_optqueries_then_or<T: Into<Option<Expr>>>(queries: Vec<T>) -> Option<Expr

impl From<SearchParams> for Option<Expr> {
fn from(value: SearchParams) -> Self {
let mut subexprs: Vec<Option<Expr>> = value
.text
.into_iter()
.map(|text| Some(text.into()))
.collect();
// Any of provided text (combine with OR into simple Expr)
let mut subexprs: Vec<Option<Expr>> = vec![to_queries_then_or(value.text)];
// Any of provided regex (combine with OR into simple Expr)
subexprs.extend([to_queries_then_or(value.regex)]);
// Any of provided metric IDs (combine with OR into simple Expr)
subexprs.extend([to_queries_then_or(value.metric_id)]);
// Any of provided year ranges (combine with OR into simple Expr)
if let Some(year_range) = value.year_range {
subexprs.extend([to_queries_then_or(year_range)]);
}
Expand All @@ -289,6 +312,7 @@ impl From<SearchParams> for Option<Expr> {
subexprs.extend(other_subexprs);
// Remove the Nones and unwrap the Somes
let valid_subexprs: Vec<Expr> = subexprs.into_iter().flatten().collect();
// All expressions are combined with AND
combine_exprs_with_and(valid_subexprs)
}
}
Expand Down Expand Up @@ -374,9 +398,43 @@ impl SearchResults {

#[cfg(test)]
mod tests {
    use polars::prelude::*;

    use super::*;

    /// Three-row fixture. `Fruit` and `FruitDup` differ only in the second row
    /// ("Apple" vs "Applle"), which is what the regex test below relies on.
    fn df() -> DataFrame {
        df!(
            "Fruit" => &["Apple", "Apple", "Pear"],
            "FruitDup" => &["Apple", "Applle", "Pear"],
            "Color" => &["Red", "Yellow", "Green"]
        )
        .unwrap()
    }

    /// Returns the shape of the fixture after keeping only the rows whose `column` matches
    /// `pattern` case-insensitively.
    fn filtered_shape(column: &str, pattern: &str) -> anyhow::Result<(usize, usize)> {
        let filtered = df()
            .lazy()
            .filter(case_insensitive_regex_contains(column, pattern))
            .collect()?;
        Ok(filtered.shape())
    }

    #[test]
    fn test_search_regex() -> anyhow::Result<()> {
        // Matches "Apple" (despite the lowercase `a` in the pattern) but not "Applle" or "Pear".
        let pattern = "^a.*pl[^l]{1}$";
        // `assert_eq!` reports both shapes on failure, unlike `assert!(a == b)`.
        assert_eq!(filtered_shape("Fruit", pattern)?, (2, 3));
        assert_eq!(filtered_shape("FruitDup", pattern)?, (1, 3));
        Ok(())
    }
}

0 comments on commit 6ba5ffe

Please sign in to comment.