Commit: check in partial progress (#102)
* check in partial progress

* fix embedding

* Add the helper for embeddings
emrgnt-cmplxty authored Oct 25, 2023 · 1 parent ce15e9f · commit 8b1de19
Showing 13 changed files with 372 additions and 11 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -44,6 +44,7 @@ llama-index = { version = "^0.8.29.post1", optional = true }
 # vllm
 # accelerate = { version = "^0.23.0", optional = true } ## Defined above in 'hf'
 vllm = { version = "0.2.0", optional = true }
+blingfire = "^0.1.8"

 [tool.poetry.extras]
 anthropic_support = ["anthropic"]
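The new blingfire pin supplies the sentence splitter that the embedding helpers added below depend on. A minimal sketch of the one call they use (the sample text is illustrative):

import blingfire as bf

text = "Dr. Smith went to Washington. He arrived at 10 a.m. on Friday."
# Returns the sentence-split text and a list of (start, end) character
# offsets into the original string, one pair per detected sentence.
_, sentence_offsets = bf.text_to_sentences_and_offsets(text)
for start, end in sentence_offsets:
    print(text[start:end])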
28 changes: 20 additions & 8 deletions sciphi/llm/__init__.py
@@ -1,20 +1,28 @@
-from sciphi.llm.anthropic_llm import AnthropicConfig, AnthropicLLM
 from sciphi.llm.base import LLM, LLMConfig, ModelName
 from sciphi.llm.config_manager import LLMConfigManager
-from sciphi.llm.hugging_face_llm import HuggingFaceConfig, HuggingFaceLLM
-from sciphi.llm.lite_llm import LiteLLM, LiteLLMConfig
-from sciphi.llm.llama_index_llm import LLamaIndexConfig, LlamaIndexLLM
-from sciphi.llm.llamacpp_llm import LlamaCPP, LLamaCPPConfig
-from sciphi.llm.openai_llm import OpenAIConfig, OpenAILLM
-from sciphi.llm.vllm_llm import vLLM, vLLMConfig
+from sciphi.llm.embedding_helpers import (
+    process_documents,
+    sectionize_documents,
+    sentencize,
+)
+from sciphi.llm.models.anthropic_llm import AnthropicConfig, AnthropicLLM
+from sciphi.llm.models.hugging_face_llm import (
+    HuggingFaceConfig,
+    HuggingFaceLLM,
+)
+from sciphi.llm.models.lite_llm import LiteLLM, LiteLLMConfig
+from sciphi.llm.models.llama_index_llm import LLamaIndexConfig, LlamaIndexLLM
+from sciphi.llm.models.llamacpp_llm import LlamaCPP, LLamaCPPConfig
+from sciphi.llm.models.openai_llm import OpenAIConfig, OpenAILLM
+from sciphi.llm.models.vllm_llm import vLLM, vLLMConfig

 __all__ = [
     # Base
     "LLM",
     "ModelName",
     "LLMConfig",
     "LLMConfigManager",
-    # Provider Models
+    # Provider LLM Models
     "AnthropicConfig",
     "AnthropicLLM",
     "HuggingFaceConfig",
@@ -29,4 +37,8 @@
"LiteLLM",
"LLamaCPPConfig",
"LlamaCPP",
# Embedding Helpers
"process_documents",
"sectionize_documents",
"sentencize",
]
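With the new models subpackage re-exported here, callers keep importing from the package root. A one-line sketch:

# Provider models and embedding helpers both resolve from sciphi.llm.
from sciphi.llm import OpenAIConfig, OpenAILLM, process_documents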
116 changes: 116 additions & 0 deletions sciphi/llm/embedding_helpers.py
@@ -0,0 +1,116 @@
import blingfire as bf
import pandas as pd
from tqdm import tqdm


def process_documents(
    documents: list[str],
    document_ids: list[int],
    split_sentences: bool = True,
    filter_len: int = 3,
    disable_progress_bar: bool = False,
) -> pd.DataFrame:
    """
    Main helper function to process a collection of documents.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to further split sections into sentences
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    df = sectionize_documents(documents, document_ids, disable_progress_bar)

    if split_sentences:
        df = sentencize(
            df.text.values,
            df.document_id.values,
            df.offset.values,
            filter_len,
            disable_progress_bar,
        )
    return df


def sectionize_documents(
    documents: list[str],
    document_ids: list[int],
    disable_progress_bar: bool = False,
) -> pd.DataFrame:
    """
    Wraps each document as a single section spanning its full text,
    recording the (start, end) character offset alongside the document id.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    processed_documents = []
    for document_id, document in tqdm(
        zip(document_ids, documents),
        total=len(documents),
        disable=disable_progress_bar,
    ):
        text, start, end = (document, 0, len(document))
        row = {
            "document_id": document_id,
            "text": text,
            "offset": (start, end),
        }
        processed_documents.append(row)

    _df = pd.DataFrame(processed_documents)
    if _df.shape[0] > 0:
        return _df.sort_values(["document_id", "offset"]).reset_index(
            drop=True
        )
    else:
        return _df


def sentencize(
    documents: list[str],
    document_ids: list[int],
    offsets: list[tuple[int, int]],
    filter_len: int = 3,
    disable_progress_bar: bool = False,
) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    document_sentences = []
    for document, document_id, offset in tqdm(
        zip(documents, document_ids, offsets),
        total=len(documents),
        disable=disable_progress_bar,
    ):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                if o[1] - o[0] > filter_len:
                    sentence = document[o[0] : o[1]]
                    # Shift sentence offsets by the section's start so they
                    # index into the original document.
                    abs_offsets = (o[0] + offset[0], o[1] + offset[0])
                    row = {
                        "document_id": document_id,
                        "text": sentence,
                        "offset": abs_offsets,
                    }
                    document_sentences.append(row)
        except Exception:
            # Skip documents that blingfire cannot tokenize.
            continue
    return pd.DataFrame(document_sentences)
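A quick usage sketch of the new helper (the documents and ids are illustrative, not from the commit):

from sciphi.llm.embedding_helpers import process_documents

docs = [
    "Transformers rely on attention. They parallelize well.",
    "Sentence embeddings map text to vectors.",
]
df = process_documents(docs, document_ids=[0, 1], disable_progress_bar=True)
# One row per sentence: document_id, text, and the (start, end) span of the
# sentence within its source document.
print(df)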
Empty file added sciphi/llm/models/__init__.py
File renamed without changes.
File renamed without changes.
File renamed without changes.
sciphi/llm/models/llama_index_llm.py (renamed from sciphi/llm/llama_index_llm.py)
@@ -2,19 +2,19 @@
 import os
 from dataclasses import dataclass

-from sciphi.core import LLMProviderName
 from sciphi.core.utils import get_data_dir
+from sciphi.interface.base import LLMProviderName
 from sciphi.llm.base import LLM
 from sciphi.llm.config_manager import model_config
-from sciphi.llm.openai_llm import OpenAIConfig
+from sciphi.llm.models.openai_llm import OpenAIConfig


 @model_config
 @dataclass
 class LLamaIndexConfig(OpenAIConfig):
     """A class to manage the configurations for LlamaIndex."""

-    llm_provider_name: LLMProviderName = LLMProviderName.LLAMA_INDEX
+    llm_provider_name: "LLMProviderName" = LLMProviderName.LLAMA_INDEX

     # LlamaIndex-specific configs
     # Defaults to the library of phi textbook
File renamed without changes.
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions sciphi/scripts/__init__.py
@@ -0,0 +1,3 @@
import dotenv

dotenv.load_dotenv()
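This makes anything under sciphi/scripts pick up environment variables from a local .env file at import time. A sketch of the pattern (the variable name is an assumption, not from the commit):

import os

import dotenv

# load_dotenv() reads KEY=value pairs from a nearby `.env` file into
# os.environ without overriding variables that are already set.
dotenv.load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")  # e.g. `OPENAI_API_KEY=sk-...` in .env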