Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Search with regex #64

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use popgetter::{
},
geo::BBox,
search::{
Country, DataPublisher, GeometryLevel, MetricId, SearchContext, SearchParams,
Country, DataPublisher, GeometryLevel, MetricId, SearchContext, SearchParams, SearchRegex,
SearchResults, SearchText, SourceDataRelease, SourceMetricId, YearRange,
},
Popgetter,
Expand Down Expand Up @@ -214,6 +214,13 @@ struct SearchParamsArgs {
description: Vec<String>,
#[arg(short, long, help="Filter by HXL tag, name, or description", num_args=0..)]
text: Vec<String>,
#[arg(
short,
long,
help="Filter with case-insensitive regex by HXL tag, name, or description",
num_args=0..
)]
regex: Vec<String>,
}

/// Expected behaviour:
Expand Down Expand Up @@ -289,10 +296,42 @@ fn text_searches_from_args(
all_text_searches
}

/// Builds the regex searches requested through the command-line arguments.
///
/// Every pattern in `hxl`, `name` and `description` becomes a [`SearchRegex`] restricted to
/// the single matching [`SearchContext`]. Every pattern in `regex` becomes a [`SearchRegex`]
/// over all contexts (`SearchContext::all()`).
///
/// The returned vector keeps the argument order: HXL patterns first, then name patterns,
/// then description patterns, then the patterns that apply to every context.
fn regex_searches_from_args(
    hxl: Vec<String>,
    name: Vec<String>,
    description: Vec<String>,
    regex: Vec<String>,
) -> Vec<SearchRegex> {
    // The vectors are owned by this function, so each pattern is moved into its
    // `SearchRegex` instead of being cloned.
    let hxl_searches = hxl.into_iter().map(|pattern| SearchRegex {
        regex: pattern,
        context: nonempty![SearchContext::Hxl],
    });
    let name_searches = name.into_iter().map(|pattern| SearchRegex {
        regex: pattern,
        context: nonempty![SearchContext::HumanReadableName],
    });
    let description_searches = description.into_iter().map(|pattern| SearchRegex {
        regex: pattern,
        context: nonempty![SearchContext::Description],
    });
    let all_context_searches = regex.into_iter().map(|pattern| SearchRegex {
        regex: pattern,
        context: SearchContext::all(),
    });

    hxl_searches
        .chain(name_searches)
        .chain(description_searches)
        .chain(all_context_searches)
        .collect()
}

impl From<SearchParamsArgs> for SearchParams {
fn from(args: SearchParamsArgs) -> Self {
SearchParams {
text: text_searches_from_args(args.hxl, args.name, args.description, args.text),
text: text_searches_from_args(
args.hxl.clone(),
args.name.clone(),
args.description.clone(),
args.text.clone(),
),
regex: regex_searches_from_args(args.hxl, args.name, args.description, args.regex),
year_range: args.year_range.clone(),
geometry_level: args.geometry_level.clone().map(GeometryLevel),
source_data_release: args.source_data_release.clone().map(SourceDataRelease),
Expand Down
84 changes: 71 additions & 13 deletions src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ fn _combine_exprs_with_and1(exprs: NonEmpty<Expr>) -> Expr {
query
}

/// Builds a polars expression that tests whether the values of `column` match the regex
/// `pattern`, ignoring case.
///
/// Case-insensitivity comes from prefixing the pattern with the inline `(?i)` flag; the
/// pattern is otherwise passed through unchanged.
fn case_insensitive_regex_contains(column: &str, pattern: &str) -> Expr {
    let mut case_insensitive_pattern = String::from("(?i)");
    case_insensitive_pattern.push_str(pattern);
    let pattern_literal = lit(case_insensitive_pattern);
    // NOTE(review): the `true` is presumably polars' `strict` flag (fail on an invalid
    // pattern instead of yielding nulls) — confirm against the polars version in use.
    col(column).str().contains(pattern_literal, true)
}

/// Search in a column case-insensitively for a string literal (i.e. not a regex!). The search
/// parameter can appear anywhere in the column value.
fn case_insensitive_contains(column: &str, value: &str) -> Expr {
Expand Down Expand Up @@ -110,6 +117,23 @@ impl From<SearchText> for Expr {
}
}

/// Implementing conversion from `SearchText` to a polars expression enables a
/// `SearchText` to be passed to polars dataframe for filtering results.
impl From<SearchRegex> for Expr {
fn from(val: SearchRegex) -> Self {
let queries: NonEmpty<Expr> = val.context.map(|field| match field {
SearchContext::Hxl => case_insensitive_regex_contains(COL::METRIC_HXL_TAG, &val.regex),
SearchContext::HumanReadableName => {
case_insensitive_regex_contains(COL::METRIC_HUMAN_READABLE_NAME, &val.regex)
}
SearchContext::Description => {
case_insensitive_regex_contains(COL::METRIC_DESCRIPTION, &val.regex)
}
});
combine_exprs_with_or1(queries)
}
}

impl From<YearRange> for Expr {
fn from(value: YearRange) -> Self {
match value {
Expand Down Expand Up @@ -182,13 +206,10 @@ pub struct SearchText {
pub context: NonEmpty<SearchContext>,
}

impl Default for SearchText {
fn default() -> Self {
Self {
text: "".to_string(),
context: SearchContext::all(),
}
}
/// A regular-expression search over one or more metadata fields.
///
/// When converted into a polars `Expr`, the pattern is applied case-insensitively to the
/// column backing each entry of `context`.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct SearchRegex {
/// The regex pattern to search for.
pub regex: String,
/// The fields (HXL tag, human-readable name, description) to search; never empty.
pub context: NonEmpty<SearchContext>,
}

/// Search over years
Expand Down Expand Up @@ -235,6 +256,7 @@ pub struct SourceMetricId(pub String);
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct SearchParams {
pub text: Vec<SearchText>,
pub regex: Vec<SearchRegex>,
pub year_range: Option<Vec<YearRange>>,
pub metric_id: Vec<MetricId>,
pub geometry_level: Option<GeometryLevel>,
Expand Down Expand Up @@ -270,12 +292,13 @@ fn _to_optqueries_then_or<T: Into<Option<Expr>>>(queries: Vec<T>) -> Option<Expr

impl From<SearchParams> for Option<Expr> {
fn from(value: SearchParams) -> Self {
let mut subexprs: Vec<Option<Expr>> = value
.text
.into_iter()
.map(|text| Some(text.into()))
.collect();
// Any of provided text (combine with OR into simple Expr)
let mut subexprs: Vec<Option<Expr>> = vec![to_queries_then_or(value.text)];
// Any of provided regex (combine with OR into simple Expr)
subexprs.extend([to_queries_then_or(value.regex)]);
// Any of provided metric IDs (combine with OR into simple Expr)
subexprs.extend([to_queries_then_or(value.metric_id)]);
// Any of provided year ranges (combine with OR into simple Expr)
if let Some(year_range) = value.year_range {
subexprs.extend([to_queries_then_or(year_range)]);
}
Expand All @@ -289,6 +312,7 @@ impl From<SearchParams> for Option<Expr> {
subexprs.extend(other_subexprs);
// Remove the Nones and unwrap the Somes
let valid_subexprs: Vec<Expr> = subexprs.into_iter().flatten().collect();
// All expressions are combined with AND
combine_exprs_with_and(valid_subexprs)
}
}
Expand Down Expand Up @@ -374,9 +398,43 @@ impl SearchResults {

#[cfg(test)]
mod tests {
    use polars::prelude::*;

    use super::*;

    /// Three-row fixture: `Fruit` and `FruitDup` differ only in the second row
    /// ("Apple" vs "Applle"), which is what the regex test relies on.
    fn df() -> DataFrame {
        df!(
            "Fruit" => &["Apple", "Apple", "Pear"],
            "FruitDup" => &["Apple", "Applle", "Pear"],
            "Color" => &["Red", "Yellow", "Green"]
        )
        .unwrap()
    }

    // #[test]
    // fn test_search_request() {
    //     let mut sr = SearchRequest{search_string: None}.with_country("a").with_country("b");
    // }

    /// The anchored pattern must match "Apple" regardless of case but reject "Applle" and
    /// "Pear"; filtering must keep all three columns.
    #[test]
    fn test_search_regex() -> anyhow::Result<()> {
        let pattern = "^a.*pl[^l]{1}$";

        // `assert_eq!` reports the actual shape on failure, unlike `assert!(a == b)`.
        let fruit_matches = df()
            .lazy()
            .filter(case_insensitive_regex_contains("Fruit", pattern))
            .collect()?;
        assert_eq!(fruit_matches.shape(), (2, 3));

        let fruit_dup_matches = df()
            .lazy()
            .filter(case_insensitive_regex_contains("FruitDup", pattern))
            .collect()?;
        assert_eq!(fruit_dup_matches.shape(), (1, 3));

        Ok(())
    }
}
Loading