Skip to content

Commit

Permalink
[Search] Case sensitive across engines (#309)
Browse files Browse the repository at this point in the history
* [Search] Add case-sensitivity option 
* [Search] Reworked search function so that it is readable and documented
* [Tests] Reworked the test so that we check the results content
* [SQLite] Add Case Sensitivity to LIKEs for SQLite

---------

Co-authored-by: Thibault Clérice <[email protected]>
  • Loading branch information
Juliettejns and PonteIneptique authored Mar 14, 2024
1 parent 139b6f2 commit 2ca595d
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 65 deletions.
13 changes: 12 additions & 1 deletion app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@
from flask_sqlalchemy import SQLAlchemy
from flask_wtf import CSRFProtect
from flask_babel import Babel
from sqlalchemy.engine import Engine
from .ext_config import get_locale
from .markdown_ext import Markdown
from sqlite3 import Connection as SQLite3Connection


basedir = os.path.abspath(os.path.dirname(__file__))

Expand Down Expand Up @@ -38,7 +41,15 @@ def create_app(config_name="dev"):
app.config.from_object(config)
else:
app.config.from_object(config[config_name])


# SQLite's LIKE operator ignores case by default, so case-sensitive matching
# has to be switched on for every new SQLite connection.
# `.get(...) or ""` keeps start-up from failing with a KeyError when the URI
# key is missing or None.
if (app.config.get("SQLALCHEMY_DATABASE_URI") or "").startswith("sqlite:"):
    @db.event.listens_for(Engine, "connect")
    def _set_sqlite_case_insensitive_pragma(dbapi_con, connection_record):
        """ Make LIKE comparisons case-sensitive on a new SQLite connection

        Despite its name, this listener turns case sensitivity ON
        (``PRAGMA case_sensitive_like=ON``).

        :param dbapi_con: Raw DBAPI connection that was just opened
        :param connection_record: Pool record of the connection (unused)
        """
        # The listener is registered on the Engine class, so only touch
        # connections that really are sqlite3 ones.
        if isinstance(dbapi_con, SQLite3Connection):
            dbapi_con.execute("PRAGMA case_sensitive_like=ON;")

app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

config[config_name].init_app(app)
Expand Down
44 changes: 28 additions & 16 deletions app/main/views/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from sqlalchemy import func
import math
from csv import DictWriter
from io import StringIO
from itertools import product
from typing import Dict, Optional, List, Tuple

from .utils import render_template_with_nav_info, request_wants_json, requires_corpus_access
from .. import main
Expand All @@ -15,10 +18,7 @@
from ...utils.forms import string_to_none, strip_or_none, column_search_filter, prepare_search_string
from ...utils.pagination import int_or
from ...utils.tsv import TSV_CONFIG, stream_tsv
from ...utils.tsv import TSV_CONFIG, stream_tsv
from ...utils.response import stream_template
from io import StringIO
from itertools import product


@main.route('/corpus/<int:corpus_id>/tokens/correct')
Expand Down Expand Up @@ -271,47 +271,59 @@ def tokens_search_through_fields(corpus_id):
:param corpus_id: Id of the corpus
"""
corpus = Corpus.query.get_or_404(corpus_id)
# test suppression:
if not corpus.has_access(current_user):
abort(403)

# names of the columns available for the corpus (POS, form, etc.)
columns = tuple(["form"] + [
col if col == "POS" else col.lower()
for col in corpus.get_columns_headings()
])

input_values = {}
input_values: Dict[str, Optional[str]] = {}

# make a dict with values splitted for each OR operator
fields = {}
source_dict = request.form if request.method == "POST" else request.args
# make a dict with values split for each OR operator
fields: Dict[str, List[str]] = {}
source_dict: Dict[str, str] = request.form if request.method == "POST" else request.args

for name in columns:
value = strip_or_none(source_dict.get(name))
value: Optional[str] = strip_or_none(source_dict.get(name))
input_values[name] = value

# split values with the '|' OR operator but keep escaped '\|' ones
fields[name] = prepare_search_string(value) if value is not None else ""
if value:
fields[name] = prepare_search_string(value)

# all search combinations
search_branches = [
dict(prod)
for prod in product(*[
flat_fields: List[List[Tuple[str, str]]] = [
[
(field, value)
for value in fields[field]
]
for field in fields
])
]
]
# Build every possible search combination with itertools.product
# If source_dict = {"POS": "NOM|VER", "lemma": "mang*"}
# Then flat_fields = [[("POS", "NOM"), ("POS", "VER")], [("lemma", "mang*")]]
# And search_branches :
# [{"POS": "NOM", "lemma": "mang*"}, {"POS": "VER", "lemma": "mang*"}]
# * => flat_fields = [["a", "b"], ["c"]]
# product(*flat_fields) == product(flat_fields[0], flat_fields[1])
search_branches: List[Dict[str, str]] = [dict(prod) for prod in product(*flat_fields)]

value_filters = []
case_insensitive = True
if 'caseBox' in source_dict:
case_insensitive = False
# for each branch filter (= OR clauses if any)
for search_branch in search_branches:
# minimal filter = token belongs to the right corpus (id)
branch_filters = [WordToken.corpus == corpus_id]

# for each field (lemma, pos, form, morph)
for name, value in search_branch.items():
branch_filters.extend(column_search_filter(getattr(WordToken, name), value))
# turn the key/value pair into SQLAlchemy filters
branch_filters.extend(column_search_filter(getattr(WordToken, name), value, case_sensitive=case_insensitive))

value_filters.append(branch_filters)

Expand Down
2 changes: 1 addition & 1 deletion app/templates/main/tokens_search_through_fields.html
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ <h1 class="mt-3"><i class="fa fa-search mr-2"></i>{{ _('Corpus') }} {{ corpus.na
</tr>
</tbody>
</table>

<input type="checkbox" name="caseBox" id="caseBox"/> Deactivate case sensitivity (by default, uppercase and lowercase letters are distinguished in the search).
<div class="mb-3">
<small class="form-text text-muted">{{ _('* can be used to match partial words, eg.') }} <b>ADV*</b></small>
<small class="form-text text-muted">{{ _('! can be used to negate a match, eg.') }} <b>!PRE</b></small>
Expand Down
115 changes: 77 additions & 38 deletions app/utils/forms.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
from typing import List, Optional
from csv import DictReader
from sqlalchemy import func

from app.utils import StringDictReader
from app.utils.tsv import TSV_CONFIG
from sqlalchemy.sql import ColumnExpressionArgument, ColumnElement


def string_to_none(string):
def string_to_none(string: Optional[str]) -> Optional[str]:
""" Converts a string to None, including a string marked as None
"""
if string is None:
return
elif string.strip() == "None":
Expand All @@ -13,13 +18,14 @@ def string_to_none(string):
return string


def strip_or_none(string: Optional[str]) -> Optional[str]:
    """ Strip surrounding whitespace from a string, passing None through

    :param string: Text to strip, or None
    :return: The stripped text, or None when no string was given
    """
    if string is None:
        return None
    return string.strip()


def prepare_search_string(string: str) -> list:
def prepare_search_string(string: str) -> List[str]:
""" Transform a search string into a list of strings if "|" was used inside the string
Escaped pipes are kept as literal characters and do not split the string.
Expand All @@ -32,52 +38,85 @@ def prepare_search_string(string: str) -> list:
return value


def column_search_filter(
        field: ColumnElement,
        value: str,
        case_sensitive: bool = True) -> List[ColumnExpressionArgument]:
    """ Based on a field and a string value, computes the list of WHERE clauses
    that needs to be applied to a query

    Search syntax understood in `value`:

    - ``*`` is a wildcard; its presence switches the comparison from an
      equality to a LIKE.
    - a leading ``!`` negates the comparison.
    - a backslash-escaped ``*`` or ``!`` is searched as the literal character.
    - spaces are removed before anything else.

    The characters ``%`` and ``_`` have no special meaning for the user: they
    are escaped when (and only when) a LIKE pattern is built, so that they are
    always matched literally.

    :param field: ORM Field Property
    :param value: Search String
    :param case_sensitive: Enable case sensitivity
    :return: List of WHERE clauses (empty when there is nothing to search)
    """
    if not value:
        return []

    # Clean-up the string
    value = value.replace(" ", "")
    # Hide escaped * and ! behind placeholders, so that they are not treated
    # as wildcard or NOT operators below.
    value = value.replace('\\*', '¤$¤')
    value = value.replace('\\!', '¤$$¤')
    value = string_to_none(value)

    # If all operations produced an empty string (or a lonely NOT operator),
    # there is nothing to filter on.
    if not value or value == "!":
        return []

    # A leading unescaped ! turns the comparison into its negation
    negated = value.startswith("!")
    if negated:
        value = value[1:]

    if "*" in value:
        # LIKE comparison. Escape the native LIKE operators first: doing it
        # only here keeps plain equality searches free of stray backslashes.
        value = value.replace('%', '\\%').replace('_', '\\_')
        # Replace unescaped * with the LIKE wildcard
        value = value.replace("*", "%")
        # Re-introduce previously escaped * and ! as literal characters
        value = value.replace('¤$¤', '*').replace('¤$$¤', '!')

        if case_sensitive:
            matcher = field.notlike if negated else field.like
        else:
            matcher = field.notilike if negated else field.ilike
        return [matcher(value, escape="\\")]

    # Equality comparison
    # Re-introduce previously escaped * (as ¤$¤) as the search character *
    value = value.replace('¤$¤', '*')
    if case_sensitive:
        column = field
    else:
        # Compare lower-cased column and lower-cased value
        column = func.lower(field)
        value = value.lower()
    # Re-introduce previously escaped ! (as ¤$$¤) as the search character !
    value = value.replace('¤$$¤', '!')

    return [column != value if negated else column == value]


def read_input_lemma(values: str) -> List[str]:
    """ Split a multi-line text input into its non-blank lines

    Carriage returns are removed from each line. Lines that are empty or made
    of whitespace only are dropped; kept lines are otherwise left unstripped.

    :param values: Raw text, one entry per line
    :return: List of non-blank lines without carriage returns
    """
    # Remove the carriage returns once per line, then filter on the result
    lines = (line.replace('\r', '') for line in values.split("\n"))
    return [line for line in lines if line.strip()]


def read_input_morph(values):
Expand Down Expand Up @@ -124,4 +163,4 @@ def create_input_format_convertion(tokens, allowed_lemma, allowed_morph, allowed
if tokens:
tokens = read_input_tokens(tokens)

return tokens, allowed_lemma, allowed_morph, allowed_POS
return tokens, allowed_lemma, allowed_morph, allowed_POS
7 changes: 3 additions & 4 deletions tests/test_selenium/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,12 +706,11 @@ def setUp(self):
db.session.add(new_token)
db.session.commit()

def search(self, form="", lemma="", pos="", morph=""):

def search(self, form="", lemma="", pos="", morph="", case_insensitivity=False):
self.go_to_search_tokens_page(TokensSearchThroughFieldsBase.CORPUS_ID, as_callback=False)

self.fill_filter_row(form, lemma, pos, morph)

if case_insensitivity:
self.driver_find_element_by_id('caseBox').click()
self.driver_find_element_by_id("submit_search").click()

result = []
Expand Down
39 changes: 34 additions & 5 deletions tests/test_selenium/test_tokens_search_through_fields.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from app.models import WordToken
from app import db
from tests.test_selenium.base import TokensSearchThroughFieldsBase

from typing import List, Dict, Set

class TestTokensSearchThroughFields(TokensSearchThroughFieldsBase):
""" Test searching tokens through fields (Form, Lemma, POS, Morph) within a corpus """
Expand Down Expand Up @@ -158,14 +159,42 @@ def test_search_with_negation_and_like_operator(self):
def test_search_with_or_operator(self):
# search with OR operator
rows = self.search(form="seint|seinz|Seinz|seinte")
rows_wildcard = self.search(form="sein*")
rows_wildcard = self.search(form="sein*", case_insensitivity=True)

rows_lemma = self.search(lemma="saint")
self.assertTrue(rows_lemma == rows and rows == rows_wildcard)
self.assertEqual(rows_lemma, rows)
self.assertEqual(rows_wildcard, rows)

# test combination with an other field
# test combination with another field
rows = self.search(lemma="m*", pos="NOMcom|NOMpro")
self.assertTrue(len(rows) == 9)

# test combination with an other field
# test combination with another field
rows = self.search(form="Martins|mere", lemma="martin|mere")
self.assertTrue(len(rows) == 3)

def test_search_with_case_sensitivy(self):
    """Test that the case-sensitivity checkbox changes which forms are found"""

    def form_only(results: List[Dict[str, str]]) -> Set[str]:
        """Reduce result rows to the set of their `form` values"""
        return {line["form"] for line in results}

    # Exact searches: without the checkbox, only the form written with the
    # same case is expected; with it, both cased variants are expected.
    rows_case_sensitivity_min = self.search(form="de")
    row_case_sensitivity_maj = self.search(form="De")
    rows_case_insensitivity = self.search(form="de", case_insensitivity=True)

    self.assertEqual(
        form_only(rows_case_sensitivity_min), {"de"}, "Min. search should retrieve `de` only")
    self.assertEqual(
        form_only(row_case_sensitivity_maj), {"De"}, "Maj search should retrieve `De` only")
    self.assertEqual(
        form_only(rows_case_insensitivity), {"De", "de"}, "Insentivity should retrieve both forms")

    # Wildcard searches must follow the same rule as exact searches
    seinz_sens = self.search(form="sein*", case_insensitivity=False)
    seinz_insens = self.search(form="sein*", case_insensitivity=True)

    self.assertEqual(
        form_only(seinz_sens), {'seinz', 'seinte', 'seint'},
        "Sensitivity should retrieve only form in minuscules")
    self.assertEqual(
        form_only(seinz_insens), {'seinz', 'seinte', 'seint', 'Seinz'},
        "Insensitivity should retrieve all forms")

0 comments on commit 2ca595d

Please sign in to comment.