Skip to content

Commit

Permalink
Lots of documentation updates
Browse files Browse the repository at this point in the history
  • Loading branch information
parkervg committed Oct 22, 2024
1 parent e67217b commit b04fb51
Show file tree
Hide file tree
Showing 12 changed files with 305 additions and 168 deletions.
15 changes: 7 additions & 8 deletions blendsql/ingredients/builtin/map/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import copy
import logging
from typing import Union, Iterable, Any, Dict, Optional, List, Callable, Tuple
from typing import Union, Iterable, Any, Optional, List, Callable, Tuple, Literal
from collections.abc import Collection
from pathlib import Path
import re
import json
Expand Down Expand Up @@ -260,13 +261,12 @@ def run(
question: str,
values: List[str],
few_shot_retriever: Callable[[str], List[AnnotatedMapExample]] = None,
options: List[str] = None,
options: Collection[str] = None,
list_options_in_prompt: bool = None,
value_limit: Union[int, None] = None,
example_outputs: Optional[str] = None,
output_type: Optional[str] = None,
regex: Optional[Callable[[int], str]] = None,
table_to_title: Optional[Dict[str, str]] = None,
output_type: Optional[Literal["integer", "float", "string", "boolean"]] = None,
regex: Optional[str] = None,
batch_size: int = DEFAULT_MAP_BATCH_SIZE,
**kwargs,
) -> Iterable[Any]:
Expand All @@ -277,10 +277,9 @@ def run(
model: The Model (blender) we will make calls to.
values: The list of values to apply question to.
value_limit: Optional limit on the number of values to pass to the Model
example_outputs: If binary == False, this gives the Model an example of the output we expect.
output_type: One of 'numeric', 'string', 'bool'
example_outputs: This gives the Model an example of the output we expect.
output_type: In the absence of example_outputs, give the Model some signal as to what we expect as output.
regex: Optional regex to constrain answer generation.
table_to_title: Mapping from tablename to a title providing some more context.
Returns:
Iterable[Any] containing the output of the Model for each value.
Expand Down
42 changes: 37 additions & 5 deletions blendsql/ingredients/builtin/qa/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import copy
from ast import literal_eval
from pathlib import Path
from typing import Dict, Union, Optional, Set, Tuple, Callable, List, Literal
from typing import Union, Optional, Tuple, Callable, List, Literal
from collections.abc import Collection
import pandas as pd
import json
from colorama import Fore
Expand Down Expand Up @@ -270,17 +271,48 @@ def run(
question: str,
context_formatter: Callable[[pd.DataFrame], str],
few_shot_retriever: Callable[[str], List[AnnotatedQAExample]] = None,
options: Optional[Set[str]] = None,
options: Optional[Collection[str]] = None,
list_options_in_prompt: bool = None,
modifier: Optional[Literal["*", "+"]] = None,
output_type: Optional[str] = None,
output_type: Optional[
Literal[
"integer",
"float",
"string",
"boolean",
"List[integer]",
"List[float]",
"List[string]",
"List[boolean]",
]
] = None,
regex: Optional[str] = None,
context: Optional[pd.DataFrame] = None,
value_limit: Optional[int] = None,
table_to_title: Optional[Dict[str, str]] = None,
long_answer: bool = False,
**kwargs,
) -> Union[str, int, float]:
) -> Union[str, int, float, tuple]:
"""
Args:
question: The question to map onto the values. Will also be the new column name
context: Table subset to use as context in answering question
model: The Model (blender) we will make calls to.
context_formatter: Callable defining how we want to serialize table context.
few_shot_retriever: Callable which takes a string and returns the n most similar few-shot examples
options: Optional collection with which we try to constrain generation.
list_options_in_prompt: Defines whether we include options in the prompt for the current inference example
modifier: If we expect an array of scalars, this defines the regex we want to apply.
Used directly for constrained decoding at inference time if we have a guidance model.
output_type: In the absence of example_outputs, give the Model some signal as to what we expect as output.
regex: Optional regex to constrain answer generation.
value_limit: Optional limit on how many rows from context we use
long_answer: If true, we more closely mimic long-form end-to-end question answering.
If false, we just give the answer with no explanation or context.
Returns:
Union[str, int, float, tuple] containing the response from the model.
Response will only be a tuple if `modifier` is not None.
"""
if model is None:
raise IngredientException(
"LLMQA requires a `Model` object, but nothing was passed!\nMost likely you forgot to set the `default_model` argument in `blend()`"
Expand Down
40 changes: 29 additions & 11 deletions blendsql/ingredients/builtin/rag_qa/main.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,47 @@
from typing import Literal
from typing import Literal, Tuple
from collections.abc import Collection
from textwrap import dedent
from attr import attrs

from blendsql.db.utils import double_quote_escape
from blendsql.ingredients.ingredient import AliasIngredient
from blendsql.ingredients.builtin.qa import LLMQA
from blendsql.ingredients.ingredient import AliasIngredient, Ingredient
from blendsql.ingredients.builtin.web_search import BingWebSearch
from blendsql._exceptions import IngredientException


@attrs
class RAGQA(AliasIngredient):
def run(
self, question: str, source: Literal["bing"] = "bing", *args, **kwargs
) -> str:
) -> Tuple[str, Collection[Ingredient]]:
'''Returns a subquery that first fetches relevant context from a source
and then performs a retrieval-augmented LM generation.
Arguments:
question: The query string to use for both retrieval and generation
source: The source of the retrieved information. Currently only supports 'bing'
Examples:
```python
from blendsql.ingredients import RAGQA
# Among the schools with the average score in Math over 560 in the SAT test, how many schools are in the bay area?
blendsql_query = """
SELECT COUNT(DISTINCT s.CDSCode)
FROM schools s
JOIN satscores sa ON s.CDSCode = sa.cds
WHERE sa.AvgScrMath > 560
AND s.County IN {{RAGQA('Which counties are in the Bay Area?')}}
"""
ingredients = {RAGQA}
...
```
'''
if source == "bing":
rag_ingredient = BingWebSearch
else:
raise IngredientException(
f"RAGQA not setup to handle source '{source}' yet"
)
return (
dedent(
f"""
new_query = dedent(
f"""
{{{{
LLMQA(
"{double_quote_escape(question)}",
Expand All @@ -34,6 +53,5 @@ def run(
)
}}}}
"""
),
{LLMQA, rag_ingredient},
)
return (new_query, {rag_ingredient})
184 changes: 175 additions & 9 deletions blendsql/ingredients/ingredient.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,39 @@ def maybe_get_temp_table(

@attrs
class AliasIngredient(Ingredient):
"""This ingredient performs no other function than to act as a stand-in for
'''This ingredient performs no other function than to act as a stand-in for
complex chainings of other ingredients. This allows us (or our lms) to write less verbose
BlendSQL queries, while maximizing the information we embed.
"""
The `run()` function should return a tuple containing both the query text that should get subbed in,
and any dependent ingredient classes we need to load to execute the aliased query.
Examples:
```python
from textwrap import dedent
from typing import Tuple, Collection
from blendsql.ingredients import AliasIngredient, LLMQA
class FetchDefinition(AliasIngredient):
def run(self, term: str, *args, **kwargs) -> Tuple[str, Collection[Ingredient]]:
new_query = dedent(
f"""
{{{{
LLMQA(
"What does {term} mean?"
)
}}}}
""")
dependent_ingredients = {LLMQA}
return (new_query, dependent_ingredients)
# Now, we can use the ingredient like below
blendsql_query = """
SELECT {{FetchDefinition('delve')}} AS "Definition"
"""
```
'''

ingredient_type: str = IngredientType.ALIAS.value
allowed_output_types: Tuple[Type] = Tuple[str, Collection[Ingredient]]
Expand All @@ -91,8 +120,50 @@ def __call__(self, *args, **kwargs):

@attrs
class MapIngredient(Ingredient):
"""For a given table/column pair, maps an external function
to each of the given values, creating a new column."""
'''For a given table/column pair, maps an external function
to each of the given values, creating a new column.
Examples:
```python
from typing import List
from blendsql.ingredients import MapIngredient
import requests
class GetQRCode(MapIngredient):
"""Calls API to generate QR code for a given URL.
Saves bytes to file in qr_codes/ and returns list of paths.
https://goqr.me/api/doc/create-qr-code/"""
def run(self, values: List[str], **kwargs) -> List[str]:
imgs_as_bytes = []
for value in values:
qr_code_bytes = requests.get(
"https://api.qrserver.com/v1/create-qr-code/?data=https://{}/&size=100x100".format(value)
).content
imgs_as_bytes.append(qr_code_bytes)
return imgs_as_bytes
if __name__ == "__main__":
from blendsql import blend
from blendsql.db import SQLite
from blendsql.utils import fetch_from_hub
blendsql = "SELECT genre, url, {{GetQRCode('QR Code as Bytes:', 'w::url')}} FROM w WHERE genre = 'social'"
smoothie = blend(
query=blendsql,
default_model=None,
db=SQLite(fetch_from_hub("urls.db")),
ingredients={GetQRCode}
)
# | genre | url | QR Code as Bytes: |
# |--------|---------------|-----------------------|
# | social | facebook.com | b'...' |
```
'''

ingredient_type: str = IngredientType.MAP.value
allowed_output_types: Tuple[Type] = (Iterable[Any],)
Expand Down Expand Up @@ -218,11 +289,33 @@ def run(self, *args, **kwargs) -> Iterable[Any]:

@attrs
class JoinIngredient(Ingredient):
"""Executes an `INNER JOIN` using dict mapping.
Example:
'Join on color of food'
{"tomato": "red", "broccoli": "green", "lemon": "yellow"}
"""
'''Executes an `INNER JOIN` using dict mapping.
'Join on color of food'
{"tomato": "red", "broccoli": "green", "lemon": "yellow"}
Examples:
```python
from blendsql.ingredients import JoinIngredient
class do_join(JoinIngredient):
"""A very silly, overcomplicated way to do a traditional SQL join.
But useful for testing.
"""
def run(self, left_values: List[str], right_values: List[str], **kwargs) -> dict:
return {left_value: left_value for left_value in left_values}
blendsql_query = """
SELECT Account, Quantity FROM returns
JOIN {{
do_join(
left_on='account_history::Symbol',
right_on='returns::Symbol'
)
}}
"""
```
'''

use_skrub_joiner: bool = attrib(default=True)

Expand Down Expand Up @@ -376,6 +469,79 @@ def run(self, *args, **kwargs) -> dict:

@attrs
class QAIngredient(Ingredient):
'''
Given a table subset in the form of a pd.DataFrame 'context',
returns a scalar or array of scalars (in the form of a tuple).
Useful for end-to-end question answering tasks.
Examples:
```python
import pandas as pd
import guidance
from blendsql.models import Model, LocalModel, RemoteModel
from blendsql.ingredients import QAIngredient
from blendsql.ingredients.generate import generate
from blendsql._program import Program
class SummaryProgram(Program):
"""Program to call Model and return summary of the passed table.
"""
def __call__(self, model: Model, serialized_db: str):
prompt = f"Summarize the table. {serialized_db}"
if isinstance(model, LocalModel):
# Below we follow the guidance pattern for unconstrained text generation
# https://github.com/guidance-ai/guidance
response = (model.model_obj + guidance.gen(max_tokens=20, name="response"))._variables["response"]
else:
response = generate(
model.model_obj,
messages_list=[[{"role": "user", "content": prompt}]],
max_tokens=20
)[0]
# Finally, return (response, prompt) tuple
# Returning the prompt here allows the underlying BlendSQL classes to track token usage
return (response, prompt)
class TableSummary(QAIngredient):
def run(self, model: Model, context: pd.DataFrame, **kwargs) -> str:
result = model.predict(program=SummaryProgram, serialized_db=context.to_string())
return f"'{result}'"
if __name__ == "__main__":
from blendsql import blend
from blendsql.db import SQLite
from blendsql.utils import fetch_from_hub
from blendsql.models import OpenaiLLM
blendsql = """
SELECT {{
TableSummary(
context=(SELECT * FROM transactions LIMIT 10)
)
}} AS "Summary"
"""
smoothie = blend(
query=blendsql,
default_model=OpenaiLLM("gpt-4o-mini"),
db=SQLite(fetch_from_hub("single_table.db")),
ingredients={TableSummary}
)
# Now, we can get results
print(smoothie.df)
# 'The table summarizes a series of cash flow transactions made through Zelle'
# ...and token usage
print(smoothie.meta.prompt_tokens)
print(smoothie.meta.completion_tokens)
```
'''

ingredient_type: str = IngredientType.QA.value
allowed_output_types: Tuple[Type] = (Union[str, int, float, tuple],)

Expand Down
4 changes: 2 additions & 2 deletions blendsql/parse/_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,10 @@ def set_ingredient_nodes_to_true(node) -> Union[exp.Expression, None]:
Used with node.transform()
"""
# Case 1: we have an exp.Struct in isolation
# Case 1: we have an Ingredient in isolation
if check.is_ingredient_node(node):
return exp.true()
# Case 2: we have an exp.Struct within a predicate (=, <, >, etc.)
# Case 2: we have an Ingredient within a predicate (=, <, >, etc.)
if isinstance(node, exp.Predicate):
if any(
check.is_ingredient_node(x)
Expand Down
Loading

0 comments on commit b04fb51

Please sign in to comment.