Commit: check in partial progress (#102)
* check in partial progress

* fix embedding

* Add the helper for embeddings
emrgnt-cmplxty authored Oct 25, 2023 · 1 parent ce15e9f · commit 8b1de19
Showing 13 changed files with 372 additions and 11 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -44,6 +44,7 @@ llama-index = { version = "^0.8.29.post1", optional = true }
 # vllm
 # accelerate = { version = "^0.23.0", optional = true } ## Defined above in 'hf'
 vllm = { version = "0.2.0", optional = true }
+blingfire = "^0.1.8"

 [tool.poetry.extras]
 anthropic_support = ["anthropic"]
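The new blingfire pin supplies the sentence splitter that the embedding helpers added below depend on. A minimal sketch of the one call they use (the sample text is illustrative):

import blingfire as bf

text = "Dr. Smith went to Washington. He arrived at 10 a.m. on Friday."
# Returns the sentence-split text and a list of (start, end) character
# offsets into the original string, one pair per detected sentence.
_, sentence_offsets = bf.text_to_sentences_and_offsets(text)
for start, end in sentence_offsets:
    print(text[start:end])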
28 changes: 20 additions & 8 deletions sciphi/llm/__init__.py
@@ -1,20 +1,28 @@
-from sciphi.llm.anthropic_llm import AnthropicConfig, AnthropicLLM
 from sciphi.llm.base import LLM, LLMConfig, ModelName
 from sciphi.llm.config_manager import LLMConfigManager
-from sciphi.llm.hugging_face_llm import HuggingFaceConfig, HuggingFaceLLM
-from sciphi.llm.lite_llm import LiteLLM, LiteLLMConfig
-from sciphi.llm.llama_index_llm import LLamaIndexConfig, LlamaIndexLLM
-from sciphi.llm.llamacpp_llm import LlamaCPP, LLamaCPPConfig
-from sciphi.llm.openai_llm import OpenAIConfig, OpenAILLM
-from sciphi.llm.vllm_llm import vLLM, vLLMConfig
+from sciphi.llm.embedding_helpers import (
+    process_documents,
+    sectionize_documents,
+    sentencize,
+)
+from sciphi.llm.models.anthropic_llm import AnthropicConfig, AnthropicLLM
+from sciphi.llm.models.hugging_face_llm import (
+    HuggingFaceConfig,
+    HuggingFaceLLM,
+)
+from sciphi.llm.models.lite_llm import LiteLLM, LiteLLMConfig
+from sciphi.llm.models.llama_index_llm import LLamaIndexConfig, LlamaIndexLLM
+from sciphi.llm.models.llamacpp_llm import LlamaCPP, LLamaCPPConfig
+from sciphi.llm.models.openai_llm import OpenAIConfig, OpenAILLM
+from sciphi.llm.models.vllm_llm import vLLM, vLLMConfig

 __all__ = [
     # Base
     "LLM",
     "ModelName",
     "LLMConfig",
     "LLMConfigManager",
-    # Provider Models
+    # Provider LLM Models
     "AnthropicConfig",
     "AnthropicLLM",
     "HuggingFaceConfig",
@@ -29,4 +37,8 @@
"LiteLLM",
"LLamaCPPConfig",
"LlamaCPP",
# Embedding Helpers
"process_documents",
"sectionize_documents",
"sentencize",
]
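With the new models subpackage re-exported here, callers keep importing from the package root. A one-line sketch:

# Provider models and embedding helpers both resolve from sciphi.llm.
from sciphi.llm import OpenAIConfig, OpenAILLM, process_documents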
116 changes: 116 additions & 0 deletions sciphi/llm/embedding_helpers.py
@@ -0,0 +1,116 @@
import blingfire as bf
import pandas as pd
from tqdm import tqdm


def process_documents(
    documents: list[str],
    document_ids: list[int],
    split_sentences: bool = True,
    filter_len: int = 3,
    disable_progress_bar: bool = False,
) -> pd.DataFrame:
    """
    Main helper function to process a collection of documents.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to further split sections into sentences
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    df = sectionize_documents(documents, document_ids, disable_progress_bar)

    if split_sentences:
        df = sentencize(
            df.text.values,
            df.document_id.values,
            df.offset.values,
            filter_len,
            disable_progress_bar,
        )
    return df


def sectionize_documents(
    documents: list[str],
    document_ids: list[int],
    disable_progress_bar: bool = False,
) -> pd.DataFrame:
    """
    Wraps each document as a single section spanning its full text,
    recording the (start, end) character offset alongside the document id.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    processed_documents = []
    for document_id, document in tqdm(
        zip(document_ids, documents),
        total=len(documents),
        disable=disable_progress_bar,
    ):
        text, start, end = (document, 0, len(document))
        row = {
            "document_id": document_id,
            "text": text,
            "offset": (start, end),
        }
        processed_documents.append(row)

    _df = pd.DataFrame(processed_documents)
    if _df.shape[0] > 0:
        return _df.sort_values(["document_id", "offset"]).reset_index(
            drop=True
        )
    else:
        return _df


def sentencize(
    documents: list[str],
    document_ids: list[int],
    offsets: list[tuple[int, int]],
    filter_len: int = 3,
    disable_progress_bar: bool = False,
) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    document_sentences = []
    for document, document_id, offset in tqdm(
        zip(documents, document_ids, offsets),
        total=len(documents),
        disable=disable_progress_bar,
    ):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                if o[1] - o[0] > filter_len:
                    sentence = document[o[0] : o[1]]
                    # Shift sentence offsets by the section's start so they
                    # index into the original document.
                    abs_offsets = (o[0] + offset[0], o[1] + offset[0])
                    row = {
                        "document_id": document_id,
                        "text": sentence,
                        "offset": abs_offsets,
                    }
                    document_sentences.append(row)
        except Exception:
            # Skip documents that blingfire cannot tokenize.
            continue
    return pd.DataFrame(document_sentences)
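A quick usage sketch of the new helper (the documents and ids are illustrative, not from the commit):

from sciphi.llm.embedding_helpers import process_documents

docs = [
    "Transformers rely on attention. They parallelize well.",
    "Sentence embeddings map text to vectors.",
]
df = process_documents(docs, document_ids=[0, 1], disable_progress_bar=True)
# One row per sentence: document_id, text, and the (start, end) span of the
# sentence within its source document.
print(df)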
Empty file added sciphi/llm/models/__init__.py
File renamed without changes.
File renamed without changes.
File renamed without changes.
sciphi/llm/models/llama_index_llm.py (renamed from sciphi/llm/llama_index_llm.py)
@@ -2,19 +2,19 @@
 import os
 from dataclasses import dataclass

-from sciphi.core import LLMProviderName
 from sciphi.core.utils import get_data_dir
+from sciphi.interface.base import LLMProviderName
 from sciphi.llm.base import LLM
 from sciphi.llm.config_manager import model_config
-from sciphi.llm.openai_llm import OpenAIConfig
+from sciphi.llm.models.openai_llm import OpenAIConfig


 @model_config
 @dataclass
 class LLamaIndexConfig(OpenAIConfig):
     """A class to manage the configurations for LlamaIndex."""

-    llm_provider_name: LLMProviderName = LLMProviderName.LLAMA_INDEX
+    llm_provider_name: "LLMProviderName" = LLMProviderName.LLAMA_INDEX

     # LlamaIndex-specific configs
     # Defaults to the library of phi textbook
File renamed without changes.
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions sciphi/scripts/__init__.py
@@ -0,0 +1,3 @@
import dotenv

dotenv.load_dotenv()
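This makes anything under sciphi/scripts pick up environment variables from a local .env file at import time. A sketch of the pattern (the variable name is an assumption, not from the commit):

import os

import dotenv

# load_dotenv() reads KEY=value pairs from a nearby `.env` file into
# os.environ without overriding variables that are already set.
dotenv.load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")  # e.g. `OPENAI_API_KEY=sk-...` in .env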