From fbd67d556f54ec737a073cb75cb9fe8f2430b4d0 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 6 Sep 2023 15:18:01 -0700 Subject: [PATCH] works as discussed --- backend/danswer/search/semantic_search.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/backend/danswer/search/semantic_search.py b/backend/danswer/search/semantic_search.py index 5103d70d222..f4a332387a6 100644 --- a/backend/danswer/search/semantic_search.py +++ b/backend/danswer/search/semantic_search.py @@ -55,19 +55,26 @@ def semantic_reranking( query: str, chunks: list[InferenceChunk], ) -> list[InferenceChunk]: + model_max = 12 # These are just based on observations from model selection + model_min = -12 cross_encoders = get_default_reranking_model_ensemble() sim_scores = [ encoder.predict([(query, chunk.content) for chunk in chunks]) # type: ignore for encoder in cross_encoders ] + cross_models_min = numpy.min(sim_scores) + shifted_sim_scores = sum( - [enc_n_scores - numpy.min(enc_n_scores) for enc_n_scores in sim_scores] + [enc_n_scores - cross_models_min for enc_n_scores in sim_scores] ) / len(sim_scores) boosts = [translate_boost_count_to_multiplier(chunk.boost) for chunk in chunks] boosted_sim_scores = shifted_sim_scores * boosts - scored_results = list(zip(boosted_sim_scores, chunks)) + normalized_b_s_scores = (boosted_sim_scores + cross_models_min - model_min) / ( + model_max - model_min + ) + scored_results = list(zip(normalized_b_s_scores, chunks)) scored_results.sort(key=lambda x: x[0], reverse=True) ranked_sim_scores, ranked_chunks = zip(*scored_results)