From 8188e1472d396fafb39dc7e12db3d67d7c4678c4 Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Wed, 27 Nov 2024 16:08:36 +0800 Subject: [PATCH] test: add custom analyzer testcases (#37781) Signed-off-by: zhuwenxing --- tests/python_client/common/common_func.py | 12 + tests/python_client/requirements.txt | 1 + tests/python_client/testcases/test_query.py | 1250 +++++++++++++++---- 3 files changed, 993 insertions(+), 270 deletions(-) diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py index f8e3722be3747..90b4ca860a4e7 100644 --- a/tests/python_client/common/common_func.py +++ b/tests/python_client/common/common_func.py @@ -222,6 +222,18 @@ def manual_check_text_match(df, word, col): id_list.append(row["id"]) return id_list + +def get_top_english_tokens(counter, n=10): + english_pattern = re.compile(r'^[a-zA-Z]+$') + + english_tokens = { + word: freq + for word, freq in counter.items() + if english_pattern.match(str(word)) + } + english_counter = Counter(english_tokens) + return english_counter.most_common(n) + def analyze_documents(texts, language="en"): tokenizer = custom_tokenizer(language) diff --git a/tests/python_client/requirements.txt b/tests/python_client/requirements.txt index c105c71d24c14..5e7b517f6103c 100644 --- a/tests/python_client/requirements.txt +++ b/tests/python_client/requirements.txt @@ -66,6 +66,7 @@ ml-dtypes==0.2.0 # for full text search bm25s==0.2.0 jieba==0.42.1 +Unidecode==1.3.8 # for perf test diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py index e12a88d924dfb..0ce63bb44ccbb 100644 --- a/tests/python_client/testcases/test_query.py +++ b/tests/python_client/testcases/test_query.py @@ -36,7 +36,6 @@ pd.set_option("expand_frame_repr", False) - prefix = "query" exp_res = "exp_res" count = "count(*)" @@ -381,7 +380,7 @@ def test_query_expr_by_int64(self): expected: verify query output number """ self._connect() - df = 
cf.gen_default_dataframe_data(nb=ct.default_nb*10) + df = cf.gen_default_dataframe_data(nb=ct.default_nb * 10) self.collection_wrap.construct_from_dataframe(cf.gen_unique_str(prefix), df, primary_field=ct.default_int64_field_name) assert self.collection_wrap.num_entities == ct.default_nb * 10 @@ -433,7 +432,8 @@ def test_query_with_expression(self, enable_dynamic_field): # 1. initialize with data nb = 2000 collection_w, _vectors, _, insert_ids = self.init_collection_general(prefix, True, nb, - enable_dynamic_field=enable_dynamic_field)[0:4] + enable_dynamic_field=enable_dynamic_field)[ + 0:4] # filter result with expression in collection _vectors = _vectors[0] @@ -1022,7 +1022,7 @@ def test_query_expr_json_contains_list_in_list(self, expr_prefix, enable_dynamic # 3. query collection_w.load() - _id = random.randint(3, ct.default_nb-3) + _id = random.randint(3, ct.default_nb - 3) ids = [[_id, _id + 1]] expression = f"{expr_prefix}({json_field}['list'], {ids})" res = collection_w.query(expression)[0] @@ -1323,9 +1323,9 @@ def test_query_expr_out_of_range(self, expression): # increase the value to cover the int range _vectors["int16"] = \ - pd.Series(data=[np.int16(i*40) for i in range(start, start + ct.default_nb)], dtype="int16") + pd.Series(data=[np.int16(i * 40) for i in range(start, start + ct.default_nb)], dtype="int16") _vectors["int32"] = \ - pd.Series(data=[np.int32(i*2200000) for i in range(start, start + ct.default_nb)], dtype="int32") + pd.Series(data=[np.int32(i * 2200000) for i in range(start, start + ct.default_nb)], dtype="int32") insert_ids = collection_w.insert(_vectors)[0].primary_keys # filter result with expression in collection @@ -1970,7 +1970,6 @@ def test_query_pagination_with_invalid_limit_value(self, limit): collection_w.query(term_expr, offset=10, limit=limit, check_task=CheckTasks.err_res, check_items=error) - @pytest.mark.tags(CaseLabel.L2) @pytest.mark.parametrize("offset", ["12 s", " ", [0, 1], {2}]) def 
test_query_pagination_with_invalid_offset_type(self, offset): @@ -2075,7 +2074,8 @@ def test_enable_mmap_query_with_expression(self, enable_dynamic_field): # 1. initialize with data nb = 1000 collection_w, _vectors, _, insert_ids = self.init_collection_general(prefix, True, nb, is_index=False, - enable_dynamic_field=enable_dynamic_field)[0:4] + enable_dynamic_field=enable_dynamic_field)[ + 0:4] # enable mmap collection_w.set_properties({'mmap.enabled': True}) collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_index") @@ -2165,10 +2165,11 @@ def test_mmap_query_string_expr_with_prefixes(self): method: specify string is primary field, use prefix string expr expected: verify query successfully """ - collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False, primary_field=ct.default_string_field_name)[0:2] - collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_pre_index") + collection_w.create_index(ct.default_float_vec_field_name, default_index_params, + index_name="query_expr_pre_index") collection_w.set_properties({'mmap.enabled': True}) collection_w.alter_index("query_expr_pre_index", {'mmap.enabled': True}) @@ -2640,7 +2641,7 @@ def test_query_multi_logical_exprs(self): collection_w.load() multi_exprs = " || ".join(f'{default_int_field_name} == {i}' for i in range(60)) _, check_res = collection_w.query(multi_exprs, output_fields=[f'{default_int_field_name}']) - assert(check_res == True) + assert (check_res == True) @pytest.mark.tags(CaseLabel.L0) def test_search_multi_logical_exprs(self): @@ -2664,7 +2665,7 @@ def test_search_multi_logical_exprs(self): limit = 1000 _, check_res = collection_w.search(vectors_s[:ct.default_nq], ct.default_float_vec_field_name, ct.default_search_params, limit, multi_exprs) - assert(check_res == True) + 
assert (check_res == True) class TestQueryString(TestcaseBase): @@ -2768,7 +2769,7 @@ def test_bitmap_alter_offset_cache_param(self): target: test bitmap index with enable offset cache. expected: verify create index and load successfully """ - collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False, primary_field=default_int_field_name)[0:2] collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="test_vec") @@ -2779,7 +2780,7 @@ def test_bitmap_alter_offset_cache_param(self): result, _ = collection_w.query(expression, output_fields=['varchar']) res_len = len(result) collection_w.release() - collection_w.alter_index("bitmap_offset_cache", {'indexoffsetcache.enabled': True}) + collection_w.alter_index("bitmap_offset_cache", {'indexoffsetcache.enabled': True}) collection_w.create_index("varchar", index_name="bitmap_offset_cache", index_params={"index_type": "BITMAP"}) collection_w.load() expression = 'varchar like "0%"' @@ -2787,7 +2788,7 @@ def test_bitmap_alter_offset_cache_param(self): res_len_new = len(result) assert res_len_new == res_len collection_w.release() - collection_w.alter_index("bitmap_offset_cache", {'indexoffsetcache.enabled': False}) + collection_w.alter_index("bitmap_offset_cache", {'indexoffsetcache.enabled': False}) collection_w.create_index("varchar", index_name="bitmap_offset_cache", index_params={"index_type": "BITMAP"}) collection_w.load() expression = 'varchar like "0%"' @@ -2802,20 +2803,21 @@ def test_query_string_expr_with_prefixes_auto_index(self): target: test query with prefix string expression and indexed with auto index expected: verify query successfully """ - collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False, 
primary_field=default_int_field_name)[0:2] - collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_pre_index") + collection_w.create_index(ct.default_float_vec_field_name, default_index_params, + index_name="query_expr_pre_index") collection_w.create_index("varchar", index_name="varchar_auto_index") time.sleep(1) collection_w.load() expression = 'varchar like "0%"' - result , _ = collection_w.query(expression, output_fields=['varchar']) + result, _ = collection_w.query(expression, output_fields=['varchar']) res_len = len(result) collection_w.release() collection_w.drop_index(index_name="varchar_auto_index") collection_w.load() - result , _ = collection_w.query(expression, output_fields=['varchar']) + result, _ = collection_w.query(expression, output_fields=['varchar']) res_len_1 = len(result) assert res_len_1 == res_len @@ -2825,20 +2827,21 @@ def test_query_string_expr_with_prefixes_bitmap(self): target: test query with prefix string expression and indexed with bitmap expected: verify query successfully """ - collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False, primary_field=default_int_field_name)[0:2] - collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_pre_index") + collection_w.create_index(ct.default_float_vec_field_name, default_index_params, + index_name="query_expr_pre_index") collection_w.create_index("varchar", index_name="bitmap_auto_index", index_params={"index_type": "BITMAP"}) time.sleep(1) collection_w.load() expression = 'varchar like "0%"' - result , _ = collection_w.query(expression, output_fields=['varchar']) + result, _ = collection_w.query(expression, output_fields=['varchar']) res_len = len(result) collection_w.release() collection_w.drop_index(index_name="varchar_bitmap_index") collection_w.load() - 
result , _ = collection_w.query(expression, output_fields=['varchar']) + result, _ = collection_w.query(expression, output_fields=['varchar']) res_len_1 = len(result) assert res_len_1 == res_len @@ -2848,20 +2851,21 @@ def test_query_string_expr_with_match_auto_index(self): target: test query with match string expression and indexed with auto index expected: verify query successfully """ - collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False, primary_field=default_int_field_name)[0:2] - collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_pre_index") + collection_w.create_index(ct.default_float_vec_field_name, default_index_params, + index_name="query_expr_pre_index") collection_w.create_index("varchar", index_name="varchar_auto_index") time.sleep(1) collection_w.load() expression = 'varchar like "%0%"' - result , _ = collection_w.query(expression, output_fields=['varchar']) + result, _ = collection_w.query(expression, output_fields=['varchar']) res_len = len(result) collection_w.release() collection_w.drop_index(index_name="varchar_auto_index") collection_w.load() - result , _ = collection_w.query(expression, output_fields=['varchar']) + result, _ = collection_w.query(expression, output_fields=['varchar']) res_len_1 = len(result) assert res_len_1 == res_len @@ -2871,12 +2875,12 @@ def test_query_string_expr_with_match_bitmap(self): target: test query with match string expression and indexed with bitmap expected: verify query successfully """ - collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False, primary_field=default_int_field_name)[0:2] collection_w.create_index(ct.default_float_vec_field_name, default_index_params, 
index_name="query_expr_pre_index") - collection_w.create_index("varchar", index_name="bitmap_auto_index", index_params={"index_type": "BITMAP"}) + collection_w.create_index("varchar", index_name="bitmap_auto_index", index_params={"index_type": "BITMAP"}) time.sleep(1) collection_w.load() expression = 'varchar like "%0%"' @@ -2885,7 +2889,7 @@ def test_query_string_expr_with_match_bitmap(self): collection_w.release() collection_w.drop_index(index_name="varchar_bitmap_index") collection_w.load() - result , _ = collection_w.query(expression, output_fields=['varchar']) + result, _ = collection_w.query(expression, output_fields=['varchar']) res_len_1 = len(result) assert res_len_1 == res_len @@ -2931,8 +2935,8 @@ def test_query_compare_invalid_fields(self): expression = 'varchar == int64' collection_w.query(expression, check_task=CheckTasks.err_res, check_items={ct.err_code: 1100, ct.err_msg: - f"failed to create query plan: cannot parse expression: {expression}, " - f"error: comparisons between VarChar and Int64 are not supported: invalid parameter"}) + f"failed to create query plan: cannot parse expression: {expression}, " + f"error: comparisons between VarChar and Int64 are not supported: invalid parameter"}) @pytest.mark.tags(CaseLabel.L1) @pytest.mark.skip(reason="issue 24637") @@ -3123,12 +3127,14 @@ def test_query_array_with_inverted_index(self, array_element_data_type): additional_params = {"max_length": 1000} if array_element_data_type == DataType.VARCHAR else {} fields = [ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), - FieldSchema(name="contains", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000, **additional_params), + FieldSchema(name="contains", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000, + **additional_params), FieldSchema(name="contains_any", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000, **additional_params), FieldSchema(name="contains_all", 
dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000, **additional_params), - FieldSchema(name="equals", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000, **additional_params), + FieldSchema(name="equals", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000, + **additional_params), FieldSchema(name="array_length_field", dtype=DataType.ARRAY, element_type=array_element_data_type, max_capacity=2000, **additional_params), FieldSchema(name="array_access", dtype=DataType.ARRAY, element_type=array_element_data_type, @@ -3559,7 +3565,7 @@ def test_query_count_expr_json(self): collection_w = self.init_collection_general(prefix, enable_dynamic_field=True, with_json=True)[0] # 2. insert data - array = cf.gen_default_rows_data( with_json=False) + array = cf.gen_default_rows_data(with_json=False) for i in range(ct.default_nb): if i % 2 == 0: array[i][json_field] = {"string": str(i), "bool": bool(i)} @@ -3608,13 +3614,13 @@ def test_json_expr_on_search_n_query(self): for i in range(10): data = [ cf.gen_vectors(nb, dim), - cf.gen_json_data_for_diff_json_types(nb=nb, start=i*nb, json_type=json_int), - cf.gen_json_data_for_diff_json_types(nb=nb, start=i*nb, json_type=json_float), - cf.gen_json_data_for_diff_json_types(nb=nb, start=i*nb, json_type=json_string), - cf.gen_json_data_for_diff_json_types(nb=nb, start=i*nb, json_type=json_bool), - cf.gen_json_data_for_diff_json_types(nb=nb, start=i*nb, json_type=json_array), - cf.gen_json_data_for_diff_json_types(nb=nb, start=i*nb, json_type=json_embedded_object), - cf.gen_json_data_for_diff_json_types(nb=nb, start=i*nb, json_type=json_objects_array) + cf.gen_json_data_for_diff_json_types(nb=nb, start=i * nb, json_type=json_int), + cf.gen_json_data_for_diff_json_types(nb=nb, start=i * nb, json_type=json_float), + cf.gen_json_data_for_diff_json_types(nb=nb, start=i * nb, json_type=json_string), + cf.gen_json_data_for_diff_json_types(nb=nb, start=i * nb, 
json_type=json_bool), + cf.gen_json_data_for_diff_json_types(nb=nb, start=i * nb, json_type=json_array), + cf.gen_json_data_for_diff_json_types(nb=nb, start=i * nb, json_type=json_embedded_object), + cf.gen_json_data_for_diff_json_types(nb=nb, start=i * nb, json_type=json_objects_array) ] collection_w.insert(data) @@ -4059,8 +4065,8 @@ def test_count_query_search_after_release_partition_load(self): # insert data collection_w = self.init_collection_general(prefix, True, 200, partition_num=1, is_index=True)[0] collection_w.query(expr='', output_fields=[ct.default_count_output], - check_task=CheckTasks.check_query_results, - check_items={"exp_res": [{ct.default_count_output: 200}]}) + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{ct.default_count_output: 200}]}) collection_w.release() partition_w1, partition_w2 = collection_w.partitions # load @@ -4118,14 +4124,16 @@ def test_query_by_normal_with_none_data(self, enable_dynamic_field, null_data_pe # create collection, insert default_nb, load collection collection_w, vectors = self.init_collection_general(prefix, insert_data=True, enable_dynamic_field=enable_dynamic_field, - nullable_fields={default_float_field_name: null_data_percent})[0:2] + nullable_fields={ + default_float_field_name: null_data_percent})[0:2] pos = 5 if enable_dynamic_field: int_values, float_values = [], [] for vector in vectors[0]: int_values.append(vector[ct.default_int64_field_name]) float_values.append(vector[default_float_field_name]) - res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] else: int_values = vectors[0][ct.default_int64_field_name].values.tolist() res = vectors[0].iloc[0:pos, :2].to_dict('records') @@ -4144,14 +4152,16 @@ def test_query_by_expr_none_with_none_data(self, enable_dynamic_field, null_data # create collection, 
insert default_nb, load collection collection_w, vectors = self.init_collection_general(prefix, insert_data=True, enable_dynamic_field=enable_dynamic_field, - nullable_fields={default_float_field_name: null_data_percent})[0:2] + nullable_fields={ + default_float_field_name: null_data_percent})[0:2] pos = 5 if enable_dynamic_field: int_values, float_values = [], [] for vector in vectors[0]: int_values.append(vector[ct.default_int64_field_name]) float_values.append(vector[default_float_field_name]) - res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] else: res = vectors[0].iloc[0:pos, :2].to_dict('records') @@ -4174,14 +4184,16 @@ def test_query_by_nullable_field_with_none_data(self): for vector in vectors[0]: int_values.append(vector[ct.default_int64_field_name]) float_values.append(vector[default_float_field_name]) - res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] term_expr = f'{default_float_field_name} < {pos}' collection_w.query(term_expr, output_fields=[ct.default_int64_field_name, default_float_field_name], check_task=CheckTasks.check_query_results, check_items={exp_res: res}) @pytest.mark.tags(CaseLabel.L0) - def test_query_after_none_data_all_field_datatype(self, varchar_scalar_index, numeric_scalar_index, null_data_percent): + def test_query_after_none_data_all_field_datatype(self, varchar_scalar_index, numeric_scalar_index, + null_data_percent): """ target: test query after different index on scalar fields method: query after different index on nullable fields @@ -4225,7 +4237,8 @@ def test_query_after_none_data_all_field_datatype(self, varchar_scalar_index, nu for i in range(pos): 
int64_values.append(scalar_fields[0][i]) float_values.append(scalar_fields[5][i]) - res = [{ct.default_int64_field_name: int64_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int64_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] term_expr = f'0 <= {ct.default_int64_field_name} < {pos}' collection_w.query(term_expr, output_fields=[ct.default_int64_field_name, ct.default_float_field_name], @@ -4240,14 +4253,16 @@ def test_query_default_value_with_insert(self, enable_dynamic_field): """ # 1. initialize with data collection_w, vectors = self.init_collection_general(prefix, True, enable_dynamic_field=enable_dynamic_field, - default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:2] + default_value_fields={ + ct.default_float_field_name: np.float32(10.0)})[0:2] pos = 5 if enable_dynamic_field: int_values, float_values = [], [] for vector in vectors[0]: int_values.append(vector[ct.default_int64_field_name]) float_values.append(vector[default_float_field_name]) - res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] else: int_values = vectors[0][ct.default_int64_field_name].values.tolist() res = vectors[0].iloc[0:pos, :2].to_dict('records') @@ -4266,7 +4281,8 @@ def test_query_default_value_without_insert(self, enable_dynamic_field): """ # 1. initialize with data collection_w, vectors = self.init_collection_general(prefix, False, enable_dynamic_field=enable_dynamic_field, - default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:2] + default_value_fields={ + ct.default_float_field_name: np.float32(10.0)})[0:2] term_expr = f'{ct.default_int64_field_name} > 0' # 2. 
query @@ -4289,7 +4305,8 @@ def test_query_after_default_data_all_field_datatype(self, varchar_scalar_index, ct.default_double_field_name: 10.0, ct.default_string_field_name: "1"} collection_w, vectors = self.init_collection_general(prefix, True, 1000, partition_num=1, is_all_data_type=True, - is_index=False, default_value_fields=default_value_fields)[0:2] + is_index=False, default_value_fields=default_value_fields)[ + 0:2] # 2. create index on vector field and load index = "HNSW" params = cf.get_index_params_params(index) @@ -4317,7 +4334,8 @@ def test_query_after_default_data_all_field_datatype(self, varchar_scalar_index, for i in range(pos): int64_values.append(scalar_fields[0][i]) float_values.append(scalar_fields[5][i]) - res = [{ct.default_int64_field_name: int64_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int64_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] term_expr = f'0 <= {ct.default_int64_field_name} < {pos}' # 5. query @@ -4335,14 +4353,16 @@ def test_query_both_default_value_non_data(self, enable_dynamic_field): # 1. 
initialize with data collection_w, vectors = self.init_collection_general(prefix, True, enable_dynamic_field=enable_dynamic_field, nullable_fields={ct.default_float_field_name: 1}, - default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:2] + default_value_fields={ + ct.default_float_field_name: np.float32(10.0)})[0:2] pos = 5 if enable_dynamic_field: int_values, float_values = [], [] for vector in vectors[0]: int_values.append(vector[ct.default_int64_field_name]) float_values.append(vector[default_float_field_name]) - res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] else: res = vectors[0].iloc[0:pos, :2].to_dict('records') @@ -4362,8 +4382,10 @@ def test_query_after_different_index_with_params_none_default_data(self, varchar # 1. initialize with data collection_w, vectors = self.init_collection_general(prefix, True, 1000, partition_num=1, is_all_data_type=True, is_index=False, - nullable_fields={ct.default_string_field_name: null_data_percent}, - default_value_fields={ct.default_float_field_name: np.float32(10.0)})[0:2] + nullable_fields={ + ct.default_string_field_name: null_data_percent}, + default_value_fields={ + ct.default_float_field_name: np.float32(10.0)})[0:2] # 2. 
create index on vector field and load index = "HNSW" params = cf.get_index_params_params(index) @@ -4385,7 +4407,8 @@ def test_query_after_different_index_with_params_none_default_data(self, varchar for i in range(pos): int64_values.append(scalar_fields[0][i]) float_values.append(scalar_fields[5][i]) - res = [{ct.default_int64_field_name: int64_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int64_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] term_expr = f'{ct.default_int64_field_name} in {int64_values[:pos]}' # 5. query @@ -4403,7 +4426,8 @@ def test_query_iterator_with_none_data(self, null_data_percent): # 1. initialize with data batch_size = 100 collection_w = self.init_collection_general(prefix, True, is_index=False, - nullable_fields={ct.default_string_field_name: null_data_percent})[0] + nullable_fields={ct.default_string_field_name: null_data_percent})[ + 0] collection_w.create_index(ct.default_float_vec_field_name, {"metric_type": "L2"}) collection_w.load() # 2. search iterator @@ -4423,7 +4447,8 @@ def test_query_normal_none_data_partition_key(self, enable_dynamic_field, null_d """ # 1. 
initialize with data collection_w, vectors = self.init_collection_general(prefix, True, enable_dynamic_field=enable_dynamic_field, - nullable_fields={ct.default_float_field_name: null_data_percent}, + nullable_fields={ + ct.default_float_field_name: null_data_percent}, is_partition_key=ct.default_float_field_name)[0:2] pos = 5 if enable_dynamic_field: @@ -4431,7 +4456,8 @@ def test_query_normal_none_data_partition_key(self, enable_dynamic_field, null_d for vector in vectors[0]: int_values.append(vector[ct.default_int64_field_name]) float_values.append(vector[default_float_field_name]) - res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in range(pos)] + res = [{ct.default_int64_field_name: int_values[i], default_float_field_name: float_values[i]} for i in + range(pos)] else: int_values = vectors[0][ct.default_int64_field_name].values.tolist() res = vectors[0].iloc[0:pos, :2].to_dict('records') @@ -4458,8 +4484,8 @@ def test_query_none_count(self, null_data_percent): nullable_fields={ct.default_float_field_name: null_data_percent}, default_value_fields={ct.default_string_field_name: "data"})[0] collection_w.query(expr='', output_fields=[ct.default_count_output], - check_task=CheckTasks.check_query_results, - check_items={"exp_res": [{ct.default_count_output: 200}]}) + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{ct.default_count_output: 200}]}) collection_w.release() partition_w1, partition_w2 = collection_w.partitions # load @@ -4497,7 +4523,7 @@ class TestQueryTextMatch(TestcaseBase): @pytest.mark.parametrize("enable_inverted_index", [True, False]) @pytest.mark.parametrize("tokenizer", ["standard"]) def test_query_text_match_en_normal( - self, tokenizer, enable_inverted_index, enable_partition_key + self, tokenizer, enable_inverted_index, enable_partition_key ): """ target: test text match normal @@ -4517,7 +4543,7 @@ def test_query_text_match_en_normal( dtype=DataType.VARCHAR, 
max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, is_partition_key=enable_partition_key, analyzer_params=analyzer_params, ), @@ -4526,7 +4552,7 @@ def test_query_text_match_en_normal( dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -4534,7 +4560,7 @@ def test_query_text_match_en_normal( dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -4542,7 +4568,7 @@ def test_query_text_match_en_normal( dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), @@ -4575,9 +4601,9 @@ def test_query_text_match_en_normal( batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + batch_size < len(df) - else data[i : len(df)] + else data[i: len(df)] ) # only if the collection is flushed, the inverted index ca be applied. # growing segment may be not applied, although in strong consistency. 
@@ -4730,6 +4756,15 @@ def test_query_text_match_zh_normal( wf_map = {} for field in text_fields: wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) + + # query with blank space and punctuation marks + for field in text_fields: + expr = f"text_match({field}, ' ') or text_match({field}, ',') or text_match({field}, '.')" + log.info(f"expr {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(res)}") + assert len(res) == 0 + # query single field for one token for field in text_fields: token = wf_map[field].most_common()[0][0] @@ -4762,7 +4797,8 @@ def test_query_text_match_zh_normal( res, _ = collection_w.query(expr=expr, output_fields=["id", field]) log.info(f"res len {len(res)}") for r in res: - assert any([token in r[field] for token in top_10_tokens]), f"top 10 tokens {top_10_tokens} not in {r[field]}" + assert any( + [token in r[field] for token in top_10_tokens]), f"top 10 tokens {top_10_tokens} not in {r[field]}" @@ -4899,27 +4935,23 @@ def test_query_text_match_with_growing_segment( for r in res: assert any([token in r[field] for token in top_10_tokens]) - @pytest.mark.skip("unimplemented") + @pytest.mark.tags(CaseLabel.L0) - def test_query_text_match_custom_analyzer(self): + @pytest.mark.parametrize("enable_partition_key", [True, False]) + @pytest.mark.parametrize("enable_inverted_index", [True, False]) + @pytest.mark.parametrize("lang_type", ["chinese"]) + def test_query_text_match_zh_en_mix( + self, lang_type, enable_inverted_index, enable_partition_key + ): """ - target: test text match with custom analyzer - method: 1. enable text match, use custom analyzer and insert data with varchar + target: test text match normal + method: 1. enable text match and insert data with varchar 2. get the most common words and query with text match 3. 
verify the result - expected: get the correct token, text match successfully and result is correct + expected: text match successfully and result is correct """ analyzer_params = { - "tokenizer": "standard", - # "lowercase", "asciifolding", "alphanumonly" was system filter - "filter":["lowercase", "asciifolding", "alphanumonly", - { - "type": "stop", - "stop_words": ["in", "of"], - }, { - "type": "stemmer", - "language": "english", - }], + "type": lang_type, } dim = 128 fields = [ @@ -4929,7 +4961,8 @@ def test_query_text_match_custom_analyzer(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, + is_partition_key=enable_partition_key, analyzer_params=analyzer_params, ), FieldSchema( @@ -4937,7 +4970,7 @@ def test_query_text_match_custom_analyzer(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -4945,7 +4978,7 @@ def test_query_text_match_custom_analyzer(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -4953,25 +4986,30 @@ def test_query_text_match_custom_analyzer(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), ] schema = CollectionSchema(fields=fields, description="test collection") - data_size = 5000 + data_size = 3000 collection_w = self.init_collection_wrap( name=cf.gen_unique_str(prefix), schema=schema ) fake = fake_en - language = "en" + if lang_type == "chinese": + language = "zh" + fake = fake_zh + else: + language = "en" + data = [ { "id": i, - "word": fake.word().lower(), - "sentence": fake.sentence().lower(), - "paragraph": fake.paragraph().lower(), - "text": fake.text().lower(), + "word": 
fake.word().lower() + " " + fake_en.word().lower(), + "sentence": fake.sentence().lower() + " " + fake_en.sentence().lower(), + "paragraph": fake.paragraph().lower() + " " + fake_en.paragraph().lower(), + "text": fake.text().lower() + " " + fake_en.text().lower(), "emb": [random.random() for _ in range(dim)], } for i in range(data_size) @@ -4981,31 +5019,45 @@ def test_query_text_match_custom_analyzer(self): batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + batch_size < len(df) - else data[i : len(df)] + else data[i: len(df)] ) - collection_w.flush() + # only if the collection is flushed, the inverted index ca be applied. + # growing segment may be not applied, although in strong consistency. + collection_w.flush() collection_w.create_index( "emb", {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, ) + if enable_inverted_index: + collection_w.create_index("word", {"index_type": "INVERTED"}) collection_w.load() # analyze the croup text_fields = ["word", "sentence", "paragraph", "text"] wf_map = {} for field in text_fields: wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) - # query single field for one word + # query single field for one token for field in text_fields: - token = list(wf_map[field].keys())[0] + token = wf_map[field].most_common()[0][0] expr = f"text_match({field}, '{token}')" log.info(f"expr: {expr}") res, _ = collection_w.query(expr=expr, output_fields=["id", field]) log.info(f"res len {len(res)}") + assert len(res) > 0 for r in res: assert token in r[field] + # verify inverted index + if enable_inverted_index: + if field == "word": + expr = f"{field} == '{token}'" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(res)}") + for r in res: + assert r[field] == token # query single field for multi-word for field in text_fields: # match top 10 
most common words @@ -5017,55 +5069,54 @@ def test_query_text_match_custom_analyzer(self): log.info(f"expr {expr}") res, _ = collection_w.query(expr=expr, output_fields=["id", field]) log.info(f"res len {len(res)}") + assert len(res) > 0 for r in res: - assert any([token in r[field] for token in top_10_tokens]) + assert any( + [token in r[field] for token in top_10_tokens]), f"top 10 tokens {top_10_tokens} not in {r[field]}" + + # query single field for multi-word + for field in text_fields: + # match latest 10 most common english words + top_10_tokens = [] + for word, count in cf.get_top_english_tokens(wf_map[field], 10): + top_10_tokens.append(word) + string_of_top_10_words = " ".join(top_10_tokens) + expr = f"text_match({field}, '{string_of_top_10_words}')" + log.info(f"expr {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(res)}") + assert len(res) > 0 + for r in res: + assert any( + [token in r[field] for token in top_10_tokens]), f"top 10 tokens {top_10_tokens} not in {r[field]}" @pytest.mark.tags(CaseLabel.L0) - def test_query_text_match_with_combined_expression_for_single_field(self): + def test_query_text_match_custom_analyzer_with_stop_words(self): """ - target: test query text match with combined expression for single field - method: 1. enable text match, and insert data with varchar - 2. get the most common words and form the combined expression with and operator + target: test text match with custom analyzer + method: 1. enable text match, use custom analyzer and insert data with varchar + 2. get the most common words and query with text match 3. verify the result - expected: query successfully and result is correct + expected: get the correct token, text match successfully and result is correct """ + stops_words = ["in", "of"] analyzer_params = { "tokenizer": "standard", + "filter": [ + { + "type": "stop", + "stop_words": stops_words, + }], } - # 1. 
initialize with data dim = 128 fields = [ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), - FieldSchema( - name="word", - dtype=DataType.VARCHAR, - max_length=65535, - enable_analyzer=True, - enable_match=True, - analyzer_params=analyzer_params, - ), FieldSchema( name="sentence", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, - analyzer_params=analyzer_params, - ), - FieldSchema( - name="paragraph", - dtype=DataType.VARCHAR, - max_length=65535, - enable_analyzer=True, - enable_match=True, - analyzer_params=analyzer_params, - ), - FieldSchema( - name="text", - dtype=DataType.VARCHAR, - max_length=65535, - enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), @@ -5080,21 +5131,19 @@ def test_query_text_match_with_combined_expression_for_single_field(self): data = [ { "id": i, - "word": fake.word().lower(), - "sentence": fake.sentence().lower(), - "paragraph": fake.paragraph().lower(), - "text": fake.text().lower(), + "sentence": fake.sentence().lower() + " ".join(stops_words), "emb": [random.random() for _ in range(dim)], } for i in range(data_size) ] df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + batch_size < len(df) - else data[i : len(df)] + else data[i: len(df)] ) collection_w.flush() collection_w.create_index( @@ -5102,84 +5151,42 @@ def test_query_text_match_with_combined_expression_for_single_field(self): {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, ) collection_w.load() - # analyze the croup and get the tf-idf, then base on it to crate expr and ground truth - text_fields = ["word", "sentence", "paragraph", "text"] + # analyze the croup + text_fields = ["sentence"] wf_map = {} for field in text_fields: wf_map[field] = 
cf.analyze_documents(df[field].tolist(), language=language) - - df_new = cf.split_dataframes(df, fields=text_fields) - log.info(f"df \n{df}") - log.info(f"new df \n{df_new}") + # query single field for one word for field in text_fields: - expr_list = [] - wf_counter = Counter(wf_map[field]) - pd_tmp_res_list = [] - for word, count in wf_counter.most_common(2): - tmp = f"text_match({field}, '{word}')" - log.info(f"tmp expr {tmp}") - expr_list.append(tmp) - tmp_res = cf.manual_check_text_match(df_new, word, field) - log.info(f"manual check result for {tmp} {len(tmp_res)}") - pd_tmp_res_list.append(tmp_res) - log.info(f"manual res {len(pd_tmp_res_list)}, {pd_tmp_res_list}") - final_res = set(pd_tmp_res_list[0]) - for i in range(1, len(pd_tmp_res_list)): - final_res = final_res.intersection(set(pd_tmp_res_list[i])) - log.info(f"intersection res {len(final_res)}") - log.info(f"final res {final_res}") - and_expr = " and ".join(expr_list) - log.info(f"expr: {and_expr}") - res, _ = collection_w.query(expr=and_expr, output_fields=text_fields) - log.info(f"res len {len(res)}, final res {len(final_res)}") - assert len(res) == len(final_res) + for token in stops_words: + expr = f"text_match({field}, '{token}')" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(res)}") + assert len(res) == 0 @pytest.mark.tags(CaseLabel.L0) - def test_query_text_match_with_combined_expression_for_multi_field(self): + def test_query_text_match_custom_analyzer_with_lowercase(self): """ - target: test query text match with combined expression for multi field - method: 1. enable text match, and insert data with varchar - 2. create the combined expression with `and`, `or` and `not` operator for multi field + target: test text match with custom analyzer + method: 1. enable text match, use custom analyzer and insert data with varchar + 2. get the most common words and query with text match 3. 
verify the result - expected: query successfully and result is correct + expected: get the correct token, text match successfully and result is correct """ analyzer_params = { "tokenizer": "standard", + "filter": ["lowercase"], } - # 1. initialize with data dim = 128 fields = [ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), - FieldSchema( - name="word", - dtype=DataType.VARCHAR, - max_length=65535, - enable_analyzer=True, - enable_match=True, - analyzer_params=analyzer_params, - ), FieldSchema( name="sentence", dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, - analyzer_params=analyzer_params, - ), - FieldSchema( - name="paragraph", - dtype=DataType.VARCHAR, - max_length=65535, - enable_analyzer=True, - enable_match=True, - analyzer_params=analyzer_params, - ), - FieldSchema( - name="text", - dtype=DataType.VARCHAR, - max_length=65535, - enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), @@ -5194,21 +5201,19 @@ def test_query_text_match_with_combined_expression_for_multi_field(self): data = [ { "id": i, - "word": fake.word().lower(), - "sentence": fake.sentence().lower(), - "paragraph": fake.paragraph().lower(), - "text": fake.text().lower(), + "sentence": fake.sentence(), "emb": [random.random() for _ in range(dim)], } for i in range(data_size) ] df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + batch_size < len(df) - else data[i : len(df)] + else data[i: len(df)] ) collection_w.flush() collection_w.create_index( @@ -5216,44 +5221,753 @@ def test_query_text_match_with_combined_expression_for_multi_field(self): {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, ) collection_w.load() - # analyze the croup and get the tf-idf, then base on it 
to crate expr and ground truth - text_fields = ["word", "sentence", "paragraph", "text"] + # analyze the croup + text_fields = ["sentence"] wf_map = {} for field in text_fields: wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) + # query single field for one word + for field in text_fields: + tokens =[item[0] for item in wf_map[field].most_common(1)] + for token in tokens: + # search with Capital case + token = token.capitalize() + expr = f"text_match({field}, '{token}')" + log.info(f"expr: {expr}") + capital_case_res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(capital_case_res)}") + # search with lower case + token = token.lower() + expr = f"text_match({field}, '{token}')" + log.info(f"expr: {expr}") + lower_case_res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(lower_case_res)}") - df_new = cf.split_dataframes(df, fields=text_fields) - log.info(f"new df \n{df_new}") - for i in range(2): - query, text_match_expr, pandas_expr = ( - cf.generate_random_query_from_freq_dict( - wf_map, min_freq=3, max_terms=5, p_not=0.2 - ) - ) - log.info(f"expr: {text_match_expr}") - res, _ = collection_w.query(expr=text_match_expr, output_fields=text_fields) - onetime_res = res - log.info(f"res len {len(res)}") - step_by_step_results = [] - for expr in query: - if isinstance(expr, dict): - if "not" in expr: - key = expr["not"]["field"] - else: - key = expr["field"] + # search with upper case + token = token.upper() + expr = f"text_match({field}, '{token}')" + log.info(f"expr: {expr}") + upper_case_res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + log.info(f"res len {len(upper_case_res)}") + assert len(capital_case_res) == len(lower_case_res) and len(capital_case_res) == len(upper_case_res) - tmp_expr = cf.generate_text_match_expr(expr) - res, _ = collection_w.query( - expr=tmp_expr, output_fields=text_fields - ) - text_match_df = pd.DataFrame(res) - 
log.info( - f"text match res {len(text_match_df)}\n{text_match_df[key]}" - ) - log.info(f"tmp expr {tmp_expr} {len(res)}") - tmp_idx = [r["id"] for r in res] - step_by_step_results.append(tmp_idx) - pandas_filter_res = cf.generate_pandas_text_match_result( + @pytest.mark.tags(CaseLabel.L0) + def test_query_text_match_custom_analyzer_with_length_filter(self): + """ + target: test text match with custom analyzer + method: 1. enable text match, use custom analyzer and insert data with varchar + 2. get the most common words and query with text match + 3. verify the result + expected: get the correct token, text match successfully and result is correct + """ + analyzer_params = { + "tokenizer": "standard", + "filter": [ + { + "type": "length", # Specifies the filter type as length + "max": 10, # Sets the maximum token length to 10 characters + } + ], + } + + long_word = "a" * 11 + max_length_word = "a" * 10 + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + language = "en" + data = [ + { + "id": i, + "sentence": fake.sentence() + " " + long_word + " " + max_length_word, + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + collection_w.flush() + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", "metric_type": "L2", "params": 
{"nlist": 64}}, + ) + collection_w.load() + # analyze the croup + text_fields = ["sentence"] + wf_map = {} + for field in text_fields: + wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) + # query sentence field with long word + for field in text_fields: + tokens =[long_word] + for token in tokens: + expr = f"text_match({field}, '{token}')" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + assert len(res) == 0 + # query sentence field with max length word + for field in text_fields: + tokens =[max_length_word] + for token in tokens: + expr = f"text_match({field}, '{token}')" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + assert len(res) == data_size + + + @pytest.mark.tags(CaseLabel.L0) + def test_query_text_match_custom_analyzer_with_stemmer_filter(self): + """ + target: test text match with custom analyzer + method: 1. enable text match, use custom analyzer and insert data with varchar + 2. get the most common words and query with text match + 3. 
verify the result + expected: get the correct token, text match successfully and result is correct + """ + analyzer_params = { + "tokenizer": "standard", + "filter": [{ + "type": "stemmer", # Specifies the filter type as stemmer + "language": "english", # Sets the language for stemming to English + }] + } + word_pairs = { + "play": ['play', 'plays', 'played', 'playing'], + "book": ['book', 'books', 'booked', 'booking'], + "study": ['study', 'studies', 'studied', 'studying'], + } + + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + language = "en" + data = [ + { + "id": i, + "sentence": fake.sentence() + " " + " ".join(word_pairs.keys()), + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + collection_w.flush() + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, + ) + collection_w.load() + # analyze the croup + text_fields = ["sentence"] + wf_map = {} + for field in text_fields: + wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) + # query sentence field with variant word + for field in text_fields: + for stem in word_pairs.keys(): + tokens = word_pairs[stem] + for token in tokens: + expr = f"text_match({field}, '{token}')" + log.info(f"expr: {expr}") + 
res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + pytest.assume(len(res) == data_size, f"stem {stem} token {token} not found in {res}") + + + @pytest.mark.tags(CaseLabel.L0) + def test_query_text_match_custom_analyzer_with_ascii_folding_filter(self): + """ + target: test text match with custom analyzer + method: 1. enable text match, use custom analyzer and insert data with varchar + 2. get the most common words and query with text match + 3. verify the result + expected: get the correct token, text match successfully and result is correct + """ + from unidecode import unidecode + analyzer_params = { + "tokenizer": "standard", + "filter": ["asciifolding"], + } + + origin_texts = [ + "Café Möller serves crème brûlée", + "José works at Škoda in São Paulo", + "The œuvre of Łukasz includes æsthetic pieces", + "München's König Street has günstig prices", + "El niño está jugando en el jardín", + "Le système éducatif français" + ] + + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + language = "en" + data = [ + { + "id": i, + "sentence": fake.sentence() + " " + " ".join(origin_texts), + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + collection_w.flush() + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", 
"metric_type": "L2", "params": {"nlist": 64}}, + ) + collection_w.load() + # analyze the croup + text_fields = ["sentence"] + wf_map = {} + for field in text_fields: + wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) + # query sentence field with variant word + for field in text_fields: + for text in origin_texts: + ascii_folding_text = unidecode(text) + expr = f"""text_match({field}, "{ascii_folding_text}")""" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + pytest.assume(len(res) == data_size, f"origin {text} ascii_folding text {ascii_folding_text} not found in {res}") + + @pytest.mark.tags(CaseLabel.L0) + def test_query_text_match_custom_analyzer_with_decompounder_filter(self): + """ + target: test text match with custom analyzer + method: 1. enable text match, use custom analyzer and insert data with varchar + 2. get the most common words and query with text match + 3. verify the result + expected: get the correct token, text match successfully and result is correct + """ + word_list = ["dampf", "schiff", "fahrt", "brot", "backen", "automat"] + analyzer_params = { + "tokenizer": "standard", + "filter": ["lowercase", + { + "type": "decompounder", # Specifies the filter type as decompounder + "word_list": word_list, # Sets the word list for decompounding + }], + } + + origin_texts = [ + "Die tägliche Dampfschifffahrt von Hamburg nach Oslo startet um sechs Uhr morgens.", + "Unser altes Dampfschiff macht eine dreistündige Rundfahrt durch den Hafen.", + "Der erfahrene Dampfschifffahrtskapitän kennt jede Route auf dem Fluss.", + "Die internationale Dampfschifffahrtsgesellschaft erweitert ihre Flotte.", + "Während der Dampfschifffahrt können Sie die Küstenlandschaft bewundern.", + "Der neue Brotbackautomat produziert stündlich frische Brötchen.", + "Im Maschinenraum des Dampfschiffs steht ein moderner Brotbackautomat.", + "Die Brotbackautomatentechnologie wird ständig verbessert.", + "Unser 
Brotbackautomat arbeitet mit traditionellen Rezepten.", + "Der programmierbare Brotbackautomat bietet zwanzig verschiedene Programme.", + ] + + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + language = "en" + data = [ + { + "id": i, + "sentence": fake.sentence() + " " + " ".join(origin_texts), + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + collection_w.flush() + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, + ) + collection_w.load() + # analyze the croup + text_fields = ["sentence"] + # query sentence field with word list + for field in text_fields: + match_text = " ".join(word_list) + expr = f"text_match({field}, '{match_text}')" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + pytest.assume(len(res) == data_size, f"res len {len(res)}, data size {data_size}") + + @pytest.mark.tags(CaseLabel.L0) + def test_query_text_match_custom_analyzer_with_alphanumonly_filter(self): + """ + target: test text match with custom analyzer + method: 1. enable text match, use custom analyzer and insert data with varchar + 2. get the most common words and query with text match + 3. 
verify the result + expected: get the correct token, text match successfully and result is correct + """ + common_non_ascii = [ + 'é', # common in words like café, résumé + '©', # copyright + '™', # trademark + '®', # registered trademark + '°', # degrees, e.g. 20°C + '€', # euro currency + '£', # pound sterling + '±', # plus-minus sign + '→', # right arrow + '•' # bullet point + ] + analyzer_params = { + "tokenizer": "standard", + "filter": ["alphanumonly"], + } + + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + language = "en" + data = [ + { + "id": i, + "sentence": fake.sentence() + " " + " ".join(common_non_ascii), + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + collection_w.flush() + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, + ) + collection_w.load() + # analyze the croup + text_fields = ["sentence"] + # query sentence field with word list + for field in text_fields: + match_text = " ".join(common_non_ascii) + expr = f"text_match({field}, '{match_text}')" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + pytest.assume(len(res) == 0, f"res len {len(res)}, data size {data_size}") + + + 
@pytest.mark.tags(CaseLabel.L0) + def test_query_text_match_custom_analyzer_with_cncharonly_filter(self): + """ + target: test text match with a custom analyzer that uses the cncharonly filter + method: 1. enable text match, use custom analyzer and insert data with varchar + 2. query with text match using the non-Chinese words appended to every sentence + 3. verify the result + expected: the text match query returns no results + """ + non_zh_char_word_list = ["hello", "milvus", "vector", "database", "19530"] + + analyzer_params = { + "tokenizer": "standard", + "filter": ["cncharonly"], + } + + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + data = [ + { + "id": i, + "sentence": fake.sentence() + " " + " ".join(non_zh_char_word_list), + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + log.info(f"dataframe\n{df}") + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + collection_w.flush() + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, + ) + collection_w.load() + # text fields to query + text_fields = ["sentence"] + # query sentence field with the non-Chinese word list + for field in text_fields: + match_text = " ".join(non_zh_char_word_list) + expr = f"text_match({field}, '{match_text}')" + log.info(f"expr: {expr}") + res, _ = collection_w.query(expr=expr, output_fields=["id", field]) + 
pytest.assume(len(res) == 0, f"res len {len(res)}, data size {data_size}") + + + @pytest.mark.tags(CaseLabel.L0) + def test_query_text_match_with_combined_expression_for_single_field(self): + """ + target: test query text match with combined expression for single field + method: 1. enable text match, and insert data with varchar + 2. get the most common words and form the combined expression with the `and` operator + 3. verify the result + expected: query successfully and result is correct + """ + analyzer_params = { + "tokenizer": "standard", + } + # 1. initialize with data + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="word", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema( + name="paragraph", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema( + name="text", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + language = "en" + data = [ + { + "id": i, + "word": fake.word().lower(), + "sentence": fake.sentence().lower(), + "paragraph": fake.paragraph().lower(), + "text": fake.text().lower(), + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + 
batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + collection_w.flush() + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, + ) + collection_w.load() + # analyze the corpus to get per-field word frequencies, then build the expr and ground truth from them + text_fields = ["word", "sentence", "paragraph", "text"] + wf_map = {} + for field in text_fields: + wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) + + df_new = cf.split_dataframes(df, fields=text_fields) + log.info(f"df \n{df}") + log.info(f"new df \n{df_new}") + for field in text_fields: + expr_list = [] + wf_counter = Counter(wf_map[field]) + pd_tmp_res_list = [] + for word, count in wf_counter.most_common(2): + tmp = f"text_match({field}, '{word}')" + log.info(f"tmp expr {tmp}") + expr_list.append(tmp) + tmp_res = cf.manual_check_text_match(df_new, word, field) + log.info(f"manual check result for {tmp} {len(tmp_res)}") + pd_tmp_res_list.append(tmp_res) + log.info(f"manual res {len(pd_tmp_res_list)}, {pd_tmp_res_list}") + final_res = set(pd_tmp_res_list[0]) + for i in range(1, len(pd_tmp_res_list)): + final_res = final_res.intersection(set(pd_tmp_res_list[i])) + log.info(f"intersection res {len(final_res)}") + log.info(f"final res {final_res}") + and_expr = " and ".join(expr_list) + log.info(f"expr: {and_expr}") + res, _ = collection_w.query(expr=and_expr, output_fields=text_fields) + log.info(f"res len {len(res)}, final res {len(final_res)}") + assert len(res) == len(final_res) + + @pytest.mark.tags(CaseLabel.L0) + def test_query_text_match_with_combined_expression_for_multi_field(self): + """ + target: test query text match with combined expression for multi field + method: 1. enable text match, and insert data with varchar + 2. create the combined expression with `and`, `or` and `not` operator for multi field + 3. 
verify the result + expected: query successfully and result is correct + """ + analyzer_params = { + "tokenizer": "standard", + } + # 1. initialize with data + dim = 128 + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="word", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema( + name="sentence", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema( + name="paragraph", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema( + name="text", + dtype=DataType.VARCHAR, + max_length=65535, + enable_analyzer=True, + enable_match=True, + analyzer_params=analyzer_params, + ), + FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), + ] + schema = CollectionSchema(fields=fields, description="test collection") + data_size = 5000 + collection_w = self.init_collection_wrap( + name=cf.gen_unique_str(prefix), schema=schema + ) + fake = fake_en + language = "en" + data = [ + { + "id": i, + "word": fake.word().lower(), + "sentence": fake.sentence().lower(), + "paragraph": fake.paragraph().lower(), + "text": fake.text().lower(), + "emb": [random.random() for _ in range(dim)], + } + for i in range(data_size) + ] + df = pd.DataFrame(data) + batch_size = 5000 + for i in range(0, len(df), batch_size): + collection_w.insert( + data[i: i + batch_size] + if i + batch_size < len(df) + else data[i: len(df)] + ) + collection_w.flush() + collection_w.create_index( + "emb", + {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}, + ) + collection_w.load() + # analyze the corpus to get per-field word frequencies, then build the expr and ground truth from them + text_fields = ["word", "sentence", "paragraph", "text"] + wf_map = {} + for field in text_fields: + 
wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language) + + df_new = cf.split_dataframes(df, fields=text_fields) + log.info(f"new df \n{df_new}") + for i in range(2): + query, text_match_expr, pandas_expr = ( + cf.generate_random_query_from_freq_dict( + wf_map, min_freq=3, max_terms=5, p_not=0.2 + ) + ) + log.info(f"expr: {text_match_expr}") + res, _ = collection_w.query(expr=text_match_expr, output_fields=text_fields) + onetime_res = res + log.info(f"res len {len(res)}") + step_by_step_results = [] + for expr in query: + if isinstance(expr, dict): + if "not" in expr: + key = expr["not"]["field"] + else: + key = expr["field"] + + tmp_expr = cf.generate_text_match_expr(expr) + res, _ = collection_w.query( + expr=tmp_expr, output_fields=text_fields + ) + text_match_df = pd.DataFrame(res) + log.info( + f"text match res {len(text_match_df)}\n{text_match_df[key]}" + ) + log.info(f"tmp expr {tmp_expr} {len(res)}") + tmp_idx = [r["id"] for r in res] + step_by_step_results.append(tmp_idx) + pandas_filter_res = cf.generate_pandas_text_match_result( expr, df_new ) tmp_pd_idx = pandas_filter_res["id"].tolist() @@ -5298,7 +6012,7 @@ def test_query_text_match_with_multi_lang(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5306,7 +6020,7 @@ def test_query_text_match_with_multi_lang(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5314,7 +6028,7 @@ def test_query_text_match_with_multi_lang(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5322,7 +6036,7 @@ def test_query_text_match_with_multi_lang(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, 
analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), @@ -5362,9 +6076,9 @@ def test_query_text_match_with_multi_lang(self): batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + batch_size < len(df) - else data[i : len(df)] + else data[i: len(df)] ) collection_w.flush() collection_w.create_index( @@ -5383,9 +6097,9 @@ def test_query_text_match_with_multi_lang(self): batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + batch_size < len(df) - else data[i : len(df)] + else data[i: len(df)] ) collection_w.flush() collection_w.create_index( @@ -5442,7 +6156,7 @@ def test_query_text_match_with_addition_inverted_index(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5450,7 +6164,7 @@ def test_query_text_match_with_addition_inverted_index(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5458,7 +6172,7 @@ def test_query_text_match_with_addition_inverted_index(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5466,7 +6180,7 @@ def test_query_text_match_with_addition_inverted_index(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), @@ -5493,7 +6207,7 @@ def test_query_text_match_with_addition_inverted_index(self): batch_size = 5000 for i in range(0, data_size, batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + 
batch_size < data_size else data[i:data_size] ) @@ -5614,7 +6328,7 @@ def test_query_text_match_with_non_varchar_fields_expr(self, combine_op): batch_size = 5000 for i in range(0, data_size, batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + batch_size < data_size else data[i:data_size] ) @@ -5645,9 +6359,6 @@ def test_query_text_match_with_non_varchar_fields_expr(self, combine_op): if combine_op == "or": assert token in r[field] or r["age"] > 10 - - - @pytest.mark.tags(CaseLabel.L1) def test_query_text_match_with_some_empty_string(self): """ @@ -5670,7 +6381,7 @@ def test_query_text_match_with_some_empty_string(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5678,7 +6389,7 @@ def test_query_text_match_with_some_empty_string(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5686,7 +6397,7 @@ def test_query_text_match_with_some_empty_string(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5694,7 +6405,7 @@ def test_query_text_match_with_some_empty_string(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), @@ -5733,9 +6444,9 @@ def test_query_text_match_with_some_empty_string(self): batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i : i + batch_size] + data[i: i + batch_size] if i + batch_size < len(df) - else data[i : len(df)] + else data[i: len(df)] ) collection_w.flush() collection_w.create_index( @@ -5928,7 +6639,7 @@ def test_query_text_match_with_unsupported_tokenizer(self): 
dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5936,7 +6647,7 @@ def test_query_text_match_with_unsupported_tokenizer(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5944,7 +6655,7 @@ def test_query_text_match_with_unsupported_tokenizer(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5952,7 +6663,7 @@ def test_query_text_match_with_unsupported_tokenizer(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema( @@ -5960,7 +6671,7 @@ def test_query_text_match_with_unsupported_tokenizer(self): dtype=DataType.VARCHAR, max_length=65535, enable_analyzer=True, - enable_match=True, + enable_match=True, analyzer_params=analyzer_params, ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), @@ -6023,7 +6734,8 @@ def test_query_invalid(self): (f"empty({default_int_field_name})", "function empty(int64_t) not found"), # starts_with (f"starts_with({default_int_field_name})", "function starts_with(int64_t) not found"), - (f"starts_with({default_int_field_name}, {default_int_field_name})", "function starts_with(int64_t, int64_t) not found"), + (f"starts_with({default_int_field_name}, {default_int_field_name})", + "function starts_with(int64_t, int64_t) not found"), ] for call_expr, err_msg in test_cases: error = {ct.err_code: 65535, ct.err_msg: err_msg} @@ -6081,5 +6793,3 @@ def test_query_text_match_with_unsupported_fields(self): check_task=CheckTasks.err_res, check_items=error, ) - -