Aleph-Alpha · volkerstampa · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024
diff --git a/docs/intelligence_layer.core.rst b/docs/intelligence_layer.core.rst
@@ -6,4 +6,4 @@ Module contents
 
 .. automodule:: intelligence_layer.core
 
-    .. autoclass:: Chunk
+    .. autoclass:: TextChunk
diff --git a/src/examples/classification.ipynb b/src/examples/classification.ipynb
@@ -45,12 +45,12 @@
     "from dotenv import load_dotenv\n",
     "\n",
     "from intelligence_layer.connectors import LimitedConcurrencyClient\n",
-    "from intelligence_layer.core import Chunk, InMemoryTracer\n",
+    "from intelligence_layer.core import TextChunk, InMemoryTracer\n",
     "from intelligence_layer.use_cases import ClassifyInput, PromptBasedClassify\n",
     "\n",
     "load_dotenv()\n",
     "\n",
-    "text_to_classify = Chunk(\n",
+    "text_to_classify = TextChunk(\n",
     "    \"In the distant future, a space exploration party embarked on a thrilling journey to the uncharted regions of the galaxy. \\n\\\n",
     "With excitement in their hearts and the cosmos as their canvas, they ventured into the unknown, discovering breathtaking celestial wonders. \\n\\\n",
     "As they gazed upon distant stars and nebulas, they forged unforgettable memories that would forever bind them as pioneers of the cosmos.\"\n",

diff --git a/src/examples/evaluation.ipynb b/src/examples/evaluation.ipynb
@@ -99,13 +99,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from intelligence_layer.core import Chunk, NoOpTracer\n",
+    "from intelligence_layer.core import TextChunk, NoOpTracer\n",
     "from intelligence_layer.use_cases import ClassifyInput\n",
     "from intelligence_layer.evaluation import Example\n",
     "\n",
     "\n",
     "classify_input = ClassifyInput(\n",
-    "    chunk=Chunk(\"This is good\"),\n",
+    "    chunk=TextChunk(\"This is good\"),\n",
     "    labels=frozenset({\"positive\", \"negative\"}),\n",
     ")\n",
     "\n",
@@ -191,7 +191,7 @@
     "dataset_id = dataset_repository.create_dataset(\n",
     "    examples=[\n",
     "        Example(\n",
-    "            input=ClassifyInput(chunk=Chunk(d[\"text\"]), labels=all_labels),\n",
+    "            input=ClassifyInput(chunk=TextChunk(d[\"text\"]), labels=all_labels),\n",
     "            expected_output=d[\"label_name\"],\n",
     "        )\n",
     "        for d in data\n",
@@ -391,7 +391,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,

diff --git a/src/intelligence_layer/core/__init__.py b/src/intelligence_layer/core/__init__.py
@@ -7,15 +7,15 @@
 from .chunk import Chunk as Chunk
 from .chunk import ChunkInput as ChunkInput
 from .chunk import ChunkOutput as ChunkOutput
-from .chunk import ChunkOverlapTask as ChunkOverlapTask
-from .chunk import ChunkTask as ChunkTask
+from .chunk import ChunkOverlap as ChunkOverlap
+from .chunk import TextChunk as TextChunk
 from .detect_language import DetectLanguage as DetectLanguage
 from .detect_language import DetectLanguageInput as DetectLanguageInput
 from .detect_language import DetectLanguageOutput as DetectLanguageOutput
 from .detect_language import Language as Language
+from .echo import Echo as Echo
 from .echo import EchoInput as EchoInput
 from .echo import EchoOutput as EchoOutput
-from .echo import EchoTask as EchoTask
 from .echo import TokenWithLogProb as TokenWithLogProb
 from .instruct import Instruct as Instruct
 from .instruct import InstructInput as InstructInput

diff --git a/src/intelligence_layer/core/chunk.py b/src/intelligence_layer/core/chunk.py
@@ -3,11 +3,11 @@
 from pydantic import BaseModel
 from semantic_text_splitter import HuggingFaceTextSplitter
 
-from intelligence_layer.core.model import ControlModel
+from intelligence_layer.core.model import ControlModel, LuminousControlModel
 from intelligence_layer.core.task import Task
 from intelligence_layer.core.tracer.tracer import TaskSpan
 
-Chunk = NewType("Chunk", str)
+TextChunk = NewType("TextChunk", str)
 """Segment of a larger text.
 
 This type infers that the string is smaller than the context size of the model where it is used.
@@ -35,10 +35,10 @@ class ChunkOutput(BaseModel):
         chunks: A list of smaller sections of the input text.
     """
 
-    chunks: Sequence[Chunk]
+    chunks: Sequence[TextChunk]
 
 
-class ChunkTask(Task[ChunkInput, ChunkOutput]):
+class Chunk(Task[ChunkInput, ChunkOutput]):
     """Splits a longer text into smaller text chunks.
 
     Provide a text of any length and chunk it into smaller pieces using a
@@ -50,20 +50,23 @@ class ChunkTask(Task[ChunkInput, ChunkOutput]):
         max_tokens_per_chunk: The maximum number of tokens to fit into one chunk.
     """
 
-    def __init__(self, model: ControlModel, max_tokens_per_chunk: int):
+    def __init__(
+        self, model: ControlModel | None = None, max_tokens_per_chunk: int = 512
+    ):
         super().__init__()
+        model = model or LuminousControlModel()
         self._splitter = HuggingFaceTextSplitter(model.get_tokenizer())
         self._max_tokens_per_chunk = max_tokens_per_chunk
 
     def do_run(self, input: ChunkInput, task_span: TaskSpan) -> ChunkOutput:
         chunks = [
-            Chunk(t)
+            TextChunk(t)
             for t in self._splitter.chunks(input.text, self._max_tokens_per_chunk)
         ]
         return ChunkOutput(chunks=chunks)
 
 
-class ChunkOverlapTask(Task[ChunkInput, ChunkOutput]):
+class ChunkOverlap(Task[ChunkInput, ChunkOutput]):
     """Splits a longer text into smaller text chunks, where every chunk overlaps
     with the previous chunk by `overlap_length_tokens` number of tokens.
 
@@ -79,9 +82,9 @@ class ChunkOverlapTask(Task[ChunkInput, ChunkOutput]):
 
     def __init__(
         self,
-        model: ControlModel,
-        max_tokens_per_chunk: int,
-        overlap_length_tokens: int,
+        model: ControlModel | None = None,
+        max_tokens_per_chunk: int = 512,
+        overlap_length_tokens: int = 0,
     ):
         super().__init__()
         if overlap_length_tokens >= max_tokens_per_chunk:
@@ -90,8 +93,10 @@ def __init__(
                     overlap_length_tokens, max_tokens_per_chunk
                 )
             )
-        self.chunk_task = ChunkTask(model, overlap_length_tokens // 2)
+
+        model = model or LuminousControlModel()
         self.tokenizer = model.get_tokenizer()
+        self.chunk_task = Chunk(model, overlap_length_tokens // 2)
         self.max_tokens_per_chunk = max_tokens_per_chunk
         self.overlap_length_tokens = overlap_length_tokens
 

diff --git a/src/intelligence_layer/core/echo.py b/src/intelligence_layer/core/echo.py
@@ -4,7 +4,11 @@
 from pydantic import BaseModel
 from tokenizers import Encoding  # type: ignore
 
-from intelligence_layer.core.model import CompleteInput, ControlModel
+from intelligence_layer.core.model import (
+    CompleteInput,
+    ControlModel,
+    LuminousControlModel,
+)
 from intelligence_layer.core.prompt_template import PromptTemplate
 from intelligence_layer.core.task import Task, Token
 from intelligence_layer.core.tracer.tracer import TaskSpan
@@ -18,7 +22,7 @@ class TokenWithLogProb(BaseModel):
 
 
 class EchoInput(BaseModel):
-    """The input for an `EchoTask`.
+    """The input for an `Echo` task.
 
     Attributes:
         prompt: The input text that serves as the starting point for the LLM.
@@ -31,7 +35,7 @@ class EchoInput(BaseModel):
 
 
 class EchoOutput(BaseModel):
-    """The output of an `EchoTask`.
+    """The output of an `Echo` task.
 
     Attributes:
         tokens_with_log_probs: Every token of the `expected_completion` of the
@@ -42,7 +46,7 @@ class EchoOutput(BaseModel):
     tokens_with_log_probs: Sequence[TokenWithLogProb]
 
 
-class EchoTask(Task[EchoInput, EchoOutput]):
+class Echo(Task[EchoInput, EchoOutput]):
     """Task that returns probabilities of a completion given a prompt.
 
     Analyzes the likelihood of generating tokens in the expected completion based on
@@ -53,10 +57,10 @@ class EchoTask(Task[EchoInput, EchoOutput]):
 
     Example:
         >>> from aleph_alpha_client import Prompt
-        >>> from intelligence_layer.core import EchoTask,EchoInput, InMemoryTracer, LuminousControlModel
+        >>> from intelligence_layer.core import Echo, EchoInput, InMemoryTracer, LuminousControlModel
 
         >>> model = LuminousControlModel(name="luminous-base-control")
-        >>> task = EchoTask(model)
+        >>> task = Echo(model)
         >>> input = EchoInput(
         ...     prompt=Prompt.from_text("This is a "),
         ...     expected_completion="happy text",
@@ -67,9 +71,9 @@ class EchoTask(Task[EchoInput, EchoOutput]):
 
     PROMPT_TEMPLATE_STR: str = "{{prompt}}{{expected_completion}}"
 
-    def __init__(self, model: ControlModel) -> None:
+    def __init__(self, model: ControlModel | None = None) -> None:
         super().__init__()
-        self._model = model
+        self._model = model or LuminousControlModel()
 
     def do_run(self, input: EchoInput, task_span: TaskSpan) -> EchoOutput:
         # We tokenize the prompt separately so we don't have an overlap in the tokens.

diff --git a/src/intelligence_layer/core/instruct.py b/src/intelligence_layer/core/instruct.py
@@ -2,7 +2,12 @@
 
 from pydantic import BaseModel
 
-from intelligence_layer.core.model import CompleteInput, CompleteOutput, ControlModel
+from intelligence_layer.core.model import (
+    CompleteInput,
+    CompleteOutput,
+    ControlModel,
+    LuminousControlModel,
+)
 from intelligence_layer.core.task import Task
 from intelligence_layer.core.tracer.tracer import TaskSpan
 
@@ -15,9 +20,10 @@ class InstructInput(BaseModel):
 
 
 class Instruct(Task[InstructInput, CompleteOutput]):
-    def __init__(self, model: ControlModel) -> None:
+
+    def __init__(self, model: ControlModel | None = None) -> None:
         super().__init__()
-        self._model = model
+        self._model = model or LuminousControlModel()
 
     def do_run(self, input: InstructInput, task_span: TaskSpan) -> CompleteOutput:
         prompt = self._model.to_instruct_prompt(

diff --git a/src/intelligence_layer/core/model.py b/src/intelligence_layer/core/model.py
@@ -209,7 +209,7 @@ def __init__(
             "luminous-base-control-20240215",
             "luminous-extended-control-20240215",
             "luminous-supreme-control-20240215",
-        ],
+        ] = "luminous-base-control",
         client: Optional[AlephAlphaClientProtocol] = None,
     ) -> None:
         super().__init__(name, client)

diff --git a/src/intelligence_layer/core/text_highlight.py b/src/intelligence_layer/core/text_highlight.py
@@ -9,7 +9,12 @@
 from aleph_alpha_client.explanation import TextScoreWithRaw
 from pydantic import BaseModel
 
-from intelligence_layer.core.model import ControlModel, ExplainInput, ExplainOutput
+from intelligence_layer.core.model import (
+    ControlModel,
+    ExplainInput,
+    ExplainOutput,
+    LuminousControlModel,
+)
 from intelligence_layer.core.prompt_template import (
     Cursor,
     PromptRange,
@@ -97,11 +102,11 @@ class TextHighlight(Task[TextHighlightInput, TextHighlightOutput]):
 
     def __init__(
         self,
-        model: ControlModel,
+        model: ControlModel | None = None,
         granularity: PromptGranularity = PromptGranularity.Sentence,
     ) -> None:
         super().__init__()
-        self._model = model
+        self._model = model or LuminousControlModel()
         self._granularity = granularity
 
     def do_run(

diff --git a/src/intelligence_layer/core/tracer/composite_tracer.py b/src/intelligence_layer/core/tracer/composite_tracer.py
@@ -21,14 +21,14 @@ class CompositeTracer(Tracer, Generic[TracerVar]):
         tracers: tracers that will be forwarded all subsequent log and span calls.
 
     Example:
-        >>> from intelligence_layer.core import InMemoryTracer, FileTracer, CompositeTracer, Chunk
+        >>> from intelligence_layer.core import InMemoryTracer, FileTracer, CompositeTracer, TextChunk
         >>> from intelligence_layer.use_cases import PromptBasedClassify, ClassifyInput
 
         >>> tracer_1 = InMemoryTracer()
         >>> tracer_2 = InMemoryTracer()
         >>> tracer = CompositeTracer([tracer_1, tracer_2])
         >>> task = PromptBasedClassify()
-        >>> response = task.run(ClassifyInput(chunk=Chunk("Cool"), labels=frozenset({"label", "other label"})), tracer)
+        >>> response = task.run(ClassifyInput(chunk=TextChunk("Cool"), labels=frozenset({"label", "other label"})), tracer)
     """
 
     def __init__(self, tracers: Sequence[TracerVar]) -> None:

diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py
@@ -3,7 +3,7 @@
 
 from pydantic import BaseModel
 
-from intelligence_layer.core import Chunk
+from intelligence_layer.core import TextChunk
 from intelligence_layer.evaluation import Example, MeanAccumulator
 from intelligence_layer.evaluation.base_logic import (
     AggregationLogic,
@@ -21,7 +21,7 @@ class ClassifyInput(BaseModel):
         labels: Possible labels the model will choose a label from
     """
 
-    chunk: Chunk
+    chunk: TextChunk
     labels: frozenset[str]
 
 

diff --git a/src/intelligence_layer/use_cases/classify/embedding_based_classify.py b/src/intelligence_layer/use_cases/classify/embedding_based_classify.py
@@ -12,7 +12,7 @@
     QdrantInMemoryRetriever,
     RetrieverType,
 )
-from intelligence_layer.core import Chunk, Task, TaskSpan
+from intelligence_layer.core import Task, TaskSpan, TextChunk
 from intelligence_layer.use_cases.classify.classify import (
     ClassifyInput,
     MultiLabelClassifyOutput,
@@ -129,7 +129,7 @@ class EmbeddingBasedClassify(Task[ClassifyInput, MultiLabelClassifyOutput]):
         >>> from intelligence_layer.connectors.limited_concurrency_client import (
         ...     LimitedConcurrencyClient,
         ... )
-        >>> from intelligence_layer.core import Chunk, InMemoryTracer
+        >>> from intelligence_layer.core import TextChunk, InMemoryTracer
         >>> from intelligence_layer.use_cases.classify.classify import ClassifyInput
         >>> from intelligence_layer.use_cases.classify.embedding_based_classify import (
         ...     EmbeddingBasedClassify,
@@ -153,7 +153,7 @@ class EmbeddingBasedClassify(Task[ClassifyInput, MultiLabelClassifyOutput]):
         ... ]
         >>> client = LimitedConcurrencyClient.from_env()
         >>> task = EmbeddingBasedClassify(client, labels_with_examples)
-        >>> input = ClassifyInput(chunk=Chunk("This is a happy text."), labels=frozenset({"positive", "negative"}))
+        >>> input = ClassifyInput(chunk=TextChunk("This is a happy text."), labels=frozenset({"positive", "negative"}))
         >>> tracer = InMemoryTracer()
         >>> output = task.run(input, tracer)
     """
@@ -212,7 +212,7 @@ def _validate_input_labels(self, input: ClassifyInput) -> None:
             raise ValueError(f"Got unexpected labels: {', '.join(unknown_labels)}.")
 
     def _label_search(
-        self, chunk: Chunk, label: str, task_span: TaskSpan
+        self, chunk: TextChunk, label: str, task_span: TaskSpan
     ) -> SearchOutput[int]:
         search_input = QdrantSearchInput(
             query=chunk,

diff --git a/src/intelligence_layer/use_cases/classify/keyword_extract.py b/src/intelligence_layer/use_cases/classify/keyword_extract.py
@@ -3,7 +3,7 @@
 from pydantic import BaseModel
 
 from intelligence_layer.core import Task, TaskSpan
-from intelligence_layer.core.chunk import Chunk
+from intelligence_layer.core.chunk import TextChunk
 from intelligence_layer.core.detect_language import Language, language_config
 from intelligence_layer.core.model import (
     CompleteInput,
@@ -31,7 +31,7 @@
 
 
 class KeywordExtractInput(BaseModel):
-    chunk: Chunk
+    chunk: TextChunk
     language: Language
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,4 +6,4 @@ Module contents

		.. automodule:: intelligence_layer.core

		.. autoclass:: Chunk
		.. autoclass:: TextChunk