Skip to content

Commit

Permalink
Openai encoding temp hotfix (#2094)
Browse files Browse the repository at this point in the history
  • Loading branch information
hagen-danswer authored Aug 9, 2024
1 parent 8cd1eda commit b230082
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 10 deletions.
22 changes: 22 additions & 0 deletions backend/danswer/natural_language_processing/search_nlp_models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import time

import requests
Expand Down Expand Up @@ -32,6 +33,25 @@ def clean_model_name(model_str: str) -> str:
return model_str.replace("/", "_").replace("-", "_").replace(".", "_")


# Characters the OpenAI embedding endpoint is known to handle safely:
# printable ASCII plus newline and tab.
_WHITELIST = set(
    " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\n\t"
)
# Retained for backward compatibility with any other users of this name.
# NOTE: as a cleaning pass this regex is redundant — every character it
# strips (U+0080 and above) is already excluded by the _WHITELIST filter.
_INITIAL_FILTER = re.compile(
    "["
    "\U00000080-\U0000FFFF"  # all non-ASCII characters in the BMP
    "\U00010000-\U0010FFFF"  # all characters in the supplementary planes
    "]+",
    flags=re.UNICODE,
)


def clean_openai_text(text: str) -> str:
    """Strip characters that can break OpenAI embedding requests.

    Keeps only printable ASCII plus newline and tab; everything else —
    including all non-ASCII Unicode — is removed.

    This is behaviorally identical to the original two-pass version
    (regex substitution, then whitelist filter), but drops the regex
    pass: the whitelist already rejects every character the regex
    matched, so one scan over the text suffices instead of two.

    Args:
        text: Arbitrary input text, possibly containing characters the
            embedding API rejects.

    Returns:
        The input with all non-whitelisted characters removed.
    """
    return "".join(char for char in text if char in _WHITELIST)


def build_model_server_url(
model_server_host: str,
model_server_port: int,
Expand Down Expand Up @@ -180,6 +200,8 @@ def encode(
]

if self.provider_type:
if self.provider_type == "openai":
texts = [clean_openai_text(text) for text in texts]
return self._encode_api_model(
texts=texts, text_type=text_type, batch_size=api_embedding_batch_size
)
Expand Down
14 changes: 4 additions & 10 deletions backend/danswer/natural_language_processing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,18 +111,12 @@ def _check_tokenizer_cache(tokenizer_name: str) -> BaseTokenizer:
return _TOKENIZER_CACHE[tokenizer_name]


def get_tokenizer(model_name: str | None, provider_type: str | None) -> BaseTokenizer:
if provider_type:
if provider_type.lower() == "openai":
# Used across ada and text-embedding-3 models
return _check_tokenizer_cache("openai")
# If we are given a cloud provider_type that isn't OpenAI, we default to trying to use the model_name
# this means we are approximating the token count which may leave some performance on the table
_DEFAULT_TOKENIZER: BaseTokenizer = HuggingFaceTokenizer(DOCUMENT_ENCODER_MODEL)

if not model_name:
raise ValueError("Need to provide a model_name or provider_type")

return _check_tokenizer_cache(model_name)
def get_tokenizer(model_name: str | None, provider_type: str | None) -> BaseTokenizer:
    """Return the shared default tokenizer.

    Temporary hotfix: both arguments are accepted for interface
    compatibility with callers but are ignored — every caller receives
    the module-level ``_DEFAULT_TOKENIZER`` regardless of model or
    provider.

    Args:
        model_name: Ignored (kept for interface compatibility).
        provider_type: Ignored (kept for interface compatibility).

    Returns:
        The module-level default tokenizer instance.
    """
    # No `global` statement needed: `global` is only required when
    # rebinding a module-level name, and this function only reads it.
    return _DEFAULT_TOKENIZER


def tokenizer_trim_content(
Expand Down

0 comments on commit b230082

Please sign in to comment.