diff --git a/src/cli.rs b/src/cli.rs index 90e5fa6..c12f381 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -12,7 +12,7 @@ use popgetter::{ }, geo::BBox, search::{ - Country, DataPublisher, GeometryLevel, MetricId, SearchContext, SearchParams, + Country, DataPublisher, GeometryLevel, MetricId, SearchContext, SearchParams, SearchRegex, SearchResults, SearchText, SourceDataRelease, SourceMetricId, YearRange, }, Popgetter, @@ -214,6 +214,13 @@ struct SearchParamsArgs { description: Vec, #[arg(short, long, help="Filter by HXL tag, name, or description", num_args=0..)] text: Vec, + #[arg( + short, + long, + help="Filter with case-insensitive regex by HXL tag, name, or description", + num_args=0.. + )] + regex: Vec, } /// Expected behaviour: @@ -289,10 +296,42 @@ fn text_searches_from_args( all_text_searches } +fn regex_searches_from_args( + hxl: Vec, + name: Vec, + description: Vec, + regex: Vec, +) -> Vec { + let mut all_regex_searches: Vec = vec![]; + all_regex_searches.extend(hxl.iter().map(|r| SearchRegex { + regex: r.clone(), + context: nonempty![SearchContext::Hxl], + })); + all_regex_searches.extend(name.iter().map(|r| SearchRegex { + regex: r.clone(), + context: nonempty![SearchContext::HumanReadableName], + })); + all_regex_searches.extend(description.iter().map(|r| SearchRegex { + regex: r.clone(), + context: nonempty![SearchContext::Description], + })); + all_regex_searches.extend(regex.iter().map(|r| SearchRegex { + regex: r.clone(), + context: SearchContext::all(), + })); + all_regex_searches +} + impl From for SearchParams { fn from(args: SearchParamsArgs) -> Self { SearchParams { - text: text_searches_from_args(args.hxl, args.name, args.description, args.text), + text: text_searches_from_args( + args.hxl.clone(), + args.name.clone(), + args.description.clone(), + args.text.clone(), + ), + regex: regex_searches_from_args(args.hxl, args.name, args.description, args.regex), year_range: args.year_range.clone(), geometry_level: 
args.geometry_level.clone().map(GeometryLevel), source_data_release: args.source_data_release.clone().map(SourceDataRelease), diff --git a/src/search.rs b/src/search.rs index ac000ba..ea1d761 100644 --- a/src/search.rs +++ b/src/search.rs @@ -64,6 +64,13 @@ fn _combine_exprs_with_and1(exprs: NonEmpty) -> Expr { query } +/// Search in a column case-insensitively for a regex pattern (i.e. not a string literal!). +fn case_insensitive_regex_contains(column: &str, pattern: &str) -> Expr { + // Prepend the inline `(?i)` flag to make the pattern case-insensitive + let regex = format!("(?i){}", pattern); + col(column).str().contains(lit(regex), true) +} + /// Search in a column case-insensitively for a string literal (i.e. not a regex!). The search /// parameter can appear anywhere in the column value. fn case_insensitive_contains(column: &str, value: &str) -> Expr { @@ -110,6 +117,23 @@ impl From for Expr { } } +/// Implementing conversion from `SearchRegex` to a polars expression enables a +/// `SearchRegex` to be passed to polars dataframe for filtering results. +impl From for Expr { + fn from(val: SearchRegex) -> Self { + let queries: NonEmpty = val.context.map(|field| match field { + SearchContext::Hxl => case_insensitive_regex_contains(COL::METRIC_HXL_TAG, &val.regex), + SearchContext::HumanReadableName => { + case_insensitive_regex_contains(COL::METRIC_HUMAN_READABLE_NAME, &val.regex) + } + SearchContext::Description => { + case_insensitive_regex_contains(COL::METRIC_DESCRIPTION, &val.regex) + } + }); + combine_exprs_with_or1(queries) + } +} + impl From for Expr { fn from(value: YearRange) -> Self { match value { @@ -182,13 +206,10 @@ pub struct SearchText { pub context: NonEmpty, } -impl Default for SearchText { - fn default() -> Self { - Self { - text: "".to_string(), - context: SearchContext::all(), - } - } +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchRegex { + pub regex: String, + pub context: NonEmpty, } /// Search over years @@ -235,6 +256,7 @@ pub struct SourceMetricId(pub String); #[derive(Clone, Debug, Deserialize, 
Serialize)] pub struct SearchParams { pub text: Vec, + pub regex: Vec, pub year_range: Option>, pub metric_id: Vec, pub geometry_level: Option, @@ -270,12 +292,13 @@ fn _to_optqueries_then_or>>(queries: Vec) -> Option for Option { fn from(value: SearchParams) -> Self { - let mut subexprs: Vec> = value - .text - .into_iter() - .map(|text| Some(text.into())) - .collect(); + // Any of provided text (combine with OR into simple Expr) + let mut subexprs: Vec> = vec![to_queries_then_or(value.text)]; + // Any of provided regex (combine with OR into simple Expr) + subexprs.extend([to_queries_then_or(value.regex)]); + // Any of provided metric IDs (combine with OR into simple Expr) subexprs.extend([to_queries_then_or(value.metric_id)]); + // Any of provided year ranges (combine with OR into simple Expr) if let Some(year_range) = value.year_range { subexprs.extend([to_queries_then_or(year_range)]); } @@ -289,6 +312,7 @@ impl From for Option { subexprs.extend(other_subexprs); // Remove the Nones and unwrap the Somes let valid_subexprs: Vec = subexprs.into_iter().flatten().collect(); + // All expressions are combined with AND combine_exprs_with_and(valid_subexprs) } } @@ -374,9 +398,43 @@ impl SearchResults { #[cfg(test)] mod tests { - // use super::*; + use polars::prelude::*; + + use super::*; + + fn df() -> DataFrame { + df!( + "Fruit" => &["Apple", "Apple", "Pear"], + "FruitDup" => &["Apple", "Applle", "Pear"], + "Color" => &["Red", "Yellow", "Green"] + ) + .unwrap() + } + // #[test] // fn test_search_request() { // let mut sr = SearchRequest{search_string: None}.with_country("a").with_country("b"); // } + + #[test] + fn test_search_regex() -> anyhow::Result<()> { + assert!( + df().lazy() + .filter(case_insensitive_regex_contains("Fruit", "^a.*pl[^l]{1}$")) + .collect()? + .shape() + == (2, 3) + ); + assert!( + df().lazy() + .filter(case_insensitive_regex_contains( + "FruitDup", + "^a.*pl[^l]{1}$" + )) + .collect()? + .shape() + == (1, 3) + ); + Ok(()) + } }