Skip to content

Commit

Permalink
Lots of documentation updates
Browse files Browse the repository at this point in the history
  • Loading branch information
parkervg committed Oct 22, 2024
1 parent e67217b commit b04fb51
Show file tree
Hide file tree
Showing 12 changed files with 305 additions and 168 deletions.
15 changes: 7 additions & 8 deletions blendsql/ingredients/builtin/map/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import copy
import logging
from typing import Union, Iterable, Any, Dict, Optional, List, Callable, Tuple
from typing import Union, Iterable, Any, Optional, List, Callable, Tuple, Literal
from collections.abc import Collection
from pathlib import Path
import re
import json
Expand Down Expand Up @@ -260,13 +261,12 @@ def run(
question: str,
values: List[str],
few_shot_retriever: Callable[[str], List[AnnotatedMapExample]] = None,
options: List[str] = None,
options: Collection[str] = None,
list_options_in_prompt: bool = None,
value_limit: Union[int, None] = None,
example_outputs: Optional[str] = None,
output_type: Optional[str] = None,
regex: Optional[Callable[[int], str]] = None,
table_to_title: Optional[Dict[str, str]] = None,
output_type: Optional[Literal["integer", "float", "string", "boolean"]] = None,
regex: Optional[str] = None,
batch_size: int = DEFAULT_MAP_BATCH_SIZE,
**kwargs,
) -> Iterable[Any]:
Expand All @@ -277,10 +277,9 @@ def run(
model: The Model (blender) we will make calls to.
values: The list of values to apply question to.
value_limit: Optional limit on the number of values to pass to the Model
example_outputs: If binary == False, this gives the Model an example of the output we expect.
output_type: One of 'numeric', 'string', 'bool'
example_outputs: This gives the Model an example of the output we expect.
output_type: In the absence of example_outputs, give the Model some signal as to what we expect as output.
regex: Optional regex to constrain answer generation.
table_to_title: Mapping from tablename to a title providing some more context.
Returns:
Iterable[Any] containing the output of the Model for each value.
Expand Down
42 changes: 37 additions & 5 deletions blendsql/ingredients/builtin/qa/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import copy
from ast import literal_eval
from pathlib import Path
from typing import Dict, Union, Optional, Set, Tuple, Callable, List, Literal
from typing import Union, Optional, Tuple, Callable, List, Literal
from collections.abc import Collection
import pandas as pd
import json
from colorama import Fore
Expand Down Expand Up @@ -270,17 +271,48 @@ def run(
question: str,
context_formatter: Callable[[pd.DataFrame], str],
few_shot_retriever: Callable[[str], List[AnnotatedQAExample]] = None,
options: Optional[Set[str]] = None,
options: Optional[Collection[str]] = None,
list_options_in_prompt: bool = None,
modifier: Optional[Literal["*", "+"]] = None,
output_type: Optional[str] = None,
output_type: Optional[
Literal[
"integer",
"float",
"string",
"boolean",
"List[integer]",
"List[float]",
"List[string]",
"List[boolean]",
]
] = None,
regex: Optional[str] = None,
context: Optional[pd.DataFrame] = None,
value_limit: Optional[int] = None,
table_to_title: Optional[Dict[str, str]] = None,
long_answer: bool = False,
**kwargs,
) -> Union[str, int, float]:
) -> Union[str, int, float, tuple]:
"""
Args:
question: The question to map onto the values. Will also be the new column name
context: Table subset to use as context in answering question
model: The Model (blender) we will make calls to.
context_formatter: Callable defining how we want to serialize table context.
few_shot_retriever: Callable which takes a string and returns the n most similar few-shot examples
options: Optional collection with which we try to constrain generation.
list_options_in_prompt: Defines whether we include options in the prompt for the current inference example
modifier: If we expect an array of scalars, this defines the regex we want to apply.
Used directly for constrained decoding at inference time if we have a guidance model.
output_type: In the absence of example_outputs, give the Model some signal as to what we expect as output.
regex: Optional regex to constrain answer generation.
value_limit: Optional limit on how many rows from context we use
long_answer: If true, we more closely mimic long-form end-to-end question answering.
If false, we just give the answer with no explanation or context.
Returns:
Union[str, int, float, tuple] containing the response from the model.
Response will only be a tuple if `modifier` is not None.
"""
if model is None:
raise IngredientException(
"LLMQA requires a `Model` object, but nothing was passed!\nMost likely you forgot to set the `default_model` argument in `blend()`"
Expand Down
40 changes: 29 additions & 11 deletions blendsql/ingredients/builtin/rag_qa/main.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,47 @@
from typing import Literal
from typing import Literal, Tuple
from collections.abc import Collection
from textwrap import dedent
from attr import attrs

from blendsql.db.utils import double_quote_escape
from blendsql.ingredients.ingredient import AliasIngredient
from blendsql.ingredients.builtin.qa import LLMQA
from blendsql.ingredients.ingredient import AliasIngredient, Ingredient
from blendsql.ingredients.builtin.web_search import BingWebSearch
from blendsql._exceptions import IngredientException


@attrs
class RAGQA(AliasIngredient):
def run(
self, question: str, source: Literal["bing"] = "bing", *args, **kwargs
) -> str:
) -> Tuple[str, Collection[Ingredient]]:
'''Returns a subquery that first fetches relevant context from a source
and then performs a retrieval-augmented LM generation.
Arguments:
question: The query string to use for both retrieval and generation
source: The source of the retrieved information. Currently only supports 'bing'
Examples:
```python
from blendsql.ingredients import RAGQA
# Among the schools with the average score in Math over 560 in the SAT test, how many schools are in the bay area?
blendsql_query = """
SELECT COUNT(DISTINCT s.CDSCode)
FROM schools s
JOIN satscores sa ON s.CDSCode = sa.cds
WHERE sa.AvgScrMath > 560
AND s.County IN {{RAGQA('Which counties are in the Bay Area?')}}
"""
ingredients = {RAGQA}
...
```
'''
if source == "bing":
rag_ingredient = BingWebSearch
else:
raise IngredientException(
f"RAGQA not setup to handle source '{source}' yet"
)
return (
dedent(
f"""
new_query = dedent(
f"""
{{{{
LLMQA(
"{double_quote_escape(question)}",
Expand All @@ -34,6 +53,5 @@ def run(
)
}}}}
"""
),
{LLMQA, rag_ingredient},
)
return (new_query, {rag_ingredient})
184 changes: 175 additions & 9 deletions blendsql/ingredients/ingredient.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,39 @@ def maybe_get_temp_table(

@attrs
class AliasIngredient(Ingredient):
"""This ingredient performs no other function than to act as a stand-in for
'''This ingredient performs no other function than to act as a stand-in for
complex chainings of other ingredients. This allows us (or our lms) to write less verbose
BlendSQL queries, while maximizing the information we embed.
"""
The `run()` function should return a tuple containing both the query text that should get subbed in,
and any dependent ingredient classes we need to load to execute the aliased query.
Examples:
```python
from textwrap import dedent
from typing import Tuple, Collection
from blendsql.ingredients import AliasIngredient, LLMQA
class FetchDefinition(AliasIngredient):
def run(self, term: str, *args, **kwargs) -> Tuple[str, Collection[Ingredient]]:
new_query = dedent(
f"""
{{{{
LLMQA(
"What does {term} mean?"
)
}}}}
""")
dependent_ingredients = {LLMQA}
return (new_query, dependent_ingredients)
# Now, we can use the ingredient like below
blendsql_query = """
SELECT {{FetchDefinition('delve')}} AS "Definition"
"""
```
'''

ingredient_type: str = IngredientType.ALIAS.value
allowed_output_types: Tuple[Type] = Tuple[str, Collection[Ingredient]]
Expand All @@ -91,8 +120,50 @@ def __call__(self, *args, **kwargs):

@attrs
class MapIngredient(Ingredient):
"""For a given table/column pair, maps an external function
to each of the given values, creating a new column."""
'''For a given table/column pair, maps an external function
to each of the given values, creating a new column.
Examples:
```python
from typing import List
from blendsql.ingredients import MapIngredient
import requests
class GetQRCode(MapIngredient):
"""Calls API to generate QR code for a given URL.
Saves bytes to file in qr_codes/ and returns list of paths.
https://goqr.me/api/doc/create-qr-code/"""
def run(self, values: List[str], **kwargs) -> List[str]:
imgs_as_bytes = []
for value in values:
qr_code_bytes = requests.get(
"https://api.qrserver.com/v1/create-qr-code/?data=https://{}/&size=100x100".format(value)
).content
imgs_as_bytes.append(qr_code_bytes)
return imgs_as_bytes
if __name__ == "__main__":
from blendsql import blend
from blendsql.db import SQLite
from blendsql.utils import fetch_from_hub
blendsql = "SELECT genre, url, {{GetQRCode('QR Code as Bytes:', 'w::url')}} FROM w WHERE genre = 'social'"
smoothie = blend(
query=blendsql,
default_model=None,
db=SQLite(fetch_from_hub("urls.db")),
ingredients={GetQRCode}
)
# | genre | url | QR Code as Bytes: |
# |--------|---------------|-----------------------|
# | social | facebook.com | b'...' |
```
'''

ingredient_type: str = IngredientType.MAP.value
allowed_output_types: Tuple[Type] = (Iterable[Any],)
Expand Down Expand Up @@ -218,11 +289,33 @@ def run(self, *args, **kwargs) -> Iterable[Any]:

@attrs
class JoinIngredient(Ingredient):
"""Executes an `INNER JOIN` using dict mapping.
Example:
'Join on color of food'
{"tomato": "red", "broccoli": "green", "lemon": "yellow"}
"""
'''Executes an `INNER JOIN` using dict mapping.
'Join on color of food'
{"tomato": "red", "broccoli": "green", "lemon": "yellow"}
Examples:
```python
from blendsql.ingredients import JoinIngredient
class do_join(JoinIngredient):
"""A very silly, overcomplicated way to do a traditional SQL join.
But useful for testing.
"""
def run(self, left_values: List[str], right_values: List[str], **kwargs) -> dict:
return {left_value: left_value for left_value in left_values}
blendsql_query = """
SELECT Account, Quantity FROM returns
JOIN {{
do_join(
left_on='account_history::Symbol',
right_on='returns::Symbol'
)
}}
"""
```
'''

use_skrub_joiner: bool = attrib(default=True)

Expand Down Expand Up @@ -376,6 +469,79 @@ def run(self, *args, **kwargs) -> dict:

@attrs
class QAIngredient(Ingredient):
'''
Given a table subset in the form of a pd.DataFrame 'context',
returns a scalar or array of scalars (in the form of a tuple).
Useful for end-to-end question answering tasks.
Examples:
```python
import pandas as pd
import guidance
from blendsql.models import Model, LocalModel, RemoteModel
from blendsql.ingredients import QAIngredient
from blendsql.ingredients.generate import generate
from blendsql._program import Program
class SummaryProgram(Program):
"""Program to call Model and return summary of the passed table.
"""
def __call__(self, model: Model, serialized_db: str):
prompt = f"Summarize the table. {serialized_db}"
if isinstance(model, LocalModel):
# Below we follow the guidance pattern for unconstrained text generation
# https://github.com/guidance-ai/guidance
response = (model.model_obj + guidance.gen(max_tokens=20, name="response"))._variables["response"]
else:
response = generate(
model.model_obj,
messages_list=[[{"role": "user", "content": prompt}]],
max_tokens=20
)[0]
# Finally, return (response, prompt) tuple
# Returning the prompt here allows the underlying BlendSQL classes to track token usage
return (response, prompt)
class TableSummary(QAIngredient):
def run(self, model: Model, context: pd.DataFrame, **kwargs) -> str:
result = model.predict(program=SummaryProgram, serialized_db=context.to_string())
return f"'{result}'"
if __name__ == "__main__":
from blendsql import blend
from blendsql.db import SQLite
from blendsql.utils import fetch_from_hub
from blendsql.models import OpenaiLLM
blendsql = """
SELECT {{
TableSummary(
context=(SELECT * FROM transactions LIMIT 10)
)
}} AS "Summary"
"""
smoothie = blend(
query=blendsql,
default_model=OpenaiLLM("gpt-4o-mini"),
db=SQLite(fetch_from_hub("single_table.db")),
ingredients={TableSummary}
)
# Now, we can get results
print(smoothie.df)
# 'The table summarizes a series of cash flow transactions made through Zelle'
# ...and token usage
print(smoothie.meta.prompt_tokens)
print(smoothie.meta.completion_tokens)
```
'''

ingredient_type: str = IngredientType.QA.value
allowed_output_types: Tuple[Type] = (Union[str, int, float, tuple],)

Expand Down
4 changes: 2 additions & 2 deletions blendsql/parse/_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,10 @@ def set_ingredient_nodes_to_true(node) -> Union[exp.Expression, None]:
Used with node.transform()
"""
# Case 1: we have an exp.Struct in isolation
# Case 1: we have an Ingredient in isolation
if check.is_ingredient_node(node):
return exp.true()
# Case 2: we have an exp.Struct within a predicate (=, <, >, etc.)
# Case 2: we have an Ingredient within a predicate (=, <, >, etc.)
if isinstance(node, exp.Predicate):
if any(
check.is_ingredient_node(x)
Expand Down
Loading

0 comments on commit b04fb51

Please sign in to comment.