diff --git a/README.md b/README.md
index 168ae1a3..5b2b55a7 100644
--- a/README.md
+++ b/README.md
@@ -155,10 +155,15 @@ For in-depth descriptions of the above queries, check out our [documentation](ht
```python
import pandas as pd
-from blendsql import blend, LLMMap, LLMQA, LLMJoin
+import blendsql
+from blendsql.ingredients import LLMMap, LLMQA, LLMJoin
from blendsql.db import Pandas
from blendsql.models import TransformersLLM, OpenaiLLM
+# Optionally set how many async calls to allow concurrently
+# This depends on your OpenAI/Anthropic/etc. rate limits
+blendsql.config.set_async_limit(10)
+
# Load model
# model = OpenaiLLM("gpt-4o") # If you have a .env present with OpenAI API keys
model = TransformersLLM('Qwen/Qwen1.5-0.5B')
@@ -190,7 +195,7 @@ db = Pandas(
)
# Write BlendSQL query
-blendsql = """
+query = """
SELECT * FROM w
WHERE city = {{
LLMQA(
@@ -200,8 +205,8 @@ WHERE city = {{
)
}}
"""
-smoothie = blend(
- query=blendsql,
+smoothie = blendsql.blend(
+ query=query,
db=db,
ingredients={LLMMap, LLMQA, LLMJoin},
default_model=model,
diff --git a/blendsql/__init__.py b/blendsql/__init__.py
index 2d2227c4..e0b73ae1 100644
--- a/blendsql/__init__.py
+++ b/blendsql/__init__.py
@@ -1,2 +1,3 @@
-from .ingredients.builtin import LLMMap, LLMQA, LLMJoin, LLMValidate, ImageCaption
from .blend import blend
+from .ingredients import LLMQA, LLMMap, LLMJoin, LLMValidate, ImageCaption
+from . import _configure as config
diff --git a/docs/quickstart.md b/docs/quickstart.md
index e6f0068a..7d5a59af 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -7,42 +7,47 @@ hide:
```python
import pandas as pd
-from blendsql import blend, LLMMap, LLMQA, LLMJoin
+import blendsql
+from blendsql.ingredients import LLMMap, LLMQA, LLMJoin
from blendsql.db import Pandas
from blendsql.models import TransformersLLM, OpenaiLLM
+# Optionally set how many async calls to allow concurrently
+# This depends on your OpenAI/Anthropic/etc. rate limits
+blendsql.config.set_async_limit(10)
+
# Load model
# model = OpenaiLLM("gpt-4o") # If you have a .env present with OpenAI API keys
model = TransformersLLM('Qwen/Qwen1.5-0.5B')
# Prepare our local database
db = Pandas(
- {
- "w": pd.DataFrame(
- (
- ['11 jun', 'western districts', 'bathurst', 'bathurst ground', '11-0'],
- ['12 jun', 'wallaroo & university nsq', 'sydney', 'cricket ground',
- '23-10'],
- ['5 jun', 'northern districts', 'newcastle', 'sports ground', '29-0']
- ),
- columns=['date', 'rival', 'city', 'venue', 'score']
- ),
- "documents": pd.DataFrame(
- (
- ['bathurst, new south wales',
- 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about 200 kilometres ( 120 mi ) west-northwest of sydney and is the seat of the bathurst regional council .'],
- ['sydney',
- 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in australia and oceania . located on australia s east coast , the metropolis surrounds port jackson.'],
- ['newcastle, new south wales',
- 'the newcastle ( /ˈnuːkɑːsəl/ new-kah-səl ) metropolitan area is the second most populated area in the australian state of new south wales and includes the newcastle and lake macquarie local government areas .']
- ),
- columns=['title', 'content']
- )
- }
+ {
+ "w": pd.DataFrame(
+ (
+ ['11 jun', 'western districts', 'bathurst', 'bathurst ground', '11-0'],
+ ['12 jun', 'wallaroo & university nsq', 'sydney', 'cricket ground',
+ '23-10'],
+ ['5 jun', 'northern districts', 'newcastle', 'sports ground', '29-0']
+ ),
+ columns=['date', 'rival', 'city', 'venue', 'score']
+ ),
+ "documents": pd.DataFrame(
+ (
+ ['bathurst, new south wales',
+ 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about 200 kilometres ( 120 mi ) west-northwest of sydney and is the seat of the bathurst regional council .'],
+ ['sydney',
+ 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in australia and oceania . located on australia s east coast , the metropolis surrounds port jackson.'],
+ ['newcastle, new south wales',
+ 'the newcastle ( /ˈnuːkɑːsəl/ new-kah-səl ) metropolitan area is the second most populated area in the australian state of new south wales and includes the newcastle and lake macquarie local government areas .']
+ ),
+ columns=['title', 'content']
+ )
+ }
)
# Write BlendSQL query
-blendsql = """
+query = """
SELECT * FROM w
WHERE city = {{
LLMQA(
@@ -52,14 +57,14 @@ WHERE city = {{
)
}}
"""
-smoothie = blend(
- query=blendsql,
- db=db,
- ingredients={LLMMap, LLMQA, LLMJoin},
- default_model=model,
- # Optional args below
- infer_gen_constraints=True,
- verbose=True
+smoothie = blendsql.blend(
+ query=query,
+ db=db,
+ ingredients={LLMMap, LLMQA, LLMJoin},
+ default_model=model,
+ # Optional args below
+ infer_gen_constraints=True,
+ verbose=True
)
print(smoothie.df)
# ┌────────┬───────────────────┬──────────┬─────────────────┬─────────┐
diff --git a/research/.gitignore b/research/.gitignore
new file mode 100644
index 00000000..adbb97d2
--- /dev/null
+++ b/research/.gitignore
@@ -0,0 +1 @@
+data/
\ No newline at end of file
diff --git a/research/README.md b/research/README.md
deleted file mode 100644
index 4abf2d37..00000000
--- a/research/README.md
+++ /dev/null
@@ -1,2823 +0,0 @@
-# BlendSQL Research
-
-Example usage:
-```
-caffeinate -is python -m research.run-evaluate research/configs/gpt-4-all.json
-```
-## Initial Results
-
-The experiments below were run on the first 200 items of the WikiTableQuestions validation set.
-
-They used only 4 few-shot examples, found in [./prompts/few_shot.py](./prompts/few_shot.py).
-
-Full model outputs are in [./outputs](./outputs).
-
-| **Parser** | **QA** | **Accuracy** | **# With Ingredients** |
-|------------------|------------------|--------------|------------------------|
-| gpt-4 | gpt-4 | 67.35 | 6 |
-| gpt-4 | text-davinci-003 | 66.84 | 6 |
-| text-davinci-003 | text-davinci-003 | 45.65 | 53 |
-| text-davinci-003 | gpt-4 | 49.46 | 53 |
-
-Some questions can be solved with both BlendSQL and native SQLite. For example, "what is the largest penalty?":
-
-`SELECT MAX(CAST(SUBSTR("Penalties (P+P+S+S)", 0, 3) AS integer)) FROM w;`
-`SELECT MAX({{LLMMap('total penalties?', 'w::penalties (p+p+s+s)')}}) FROM w`
-
-
-WizardCoder + BlendSQL on the whole WikiTQ dev set:
-```json
-{
- "metric_score": 0.46872166817769717,
- "num_with_ingredients": 27,
- "num_errors": 42,
- "num_completed": 2164,
- "time_elapsed": 489.1062750816345
-}
-```
-
-## Helpful Tips
-
-- Use `max_val_samples` for debugging to restrict how many datapoints we iterate over during a `do_eval` loop.
-- To add a dataset, do the following:
- 1) Add a dataset script to [datasets/](./datasets)
- 1) See [./datasets/wikitq.py](./datasets/wikitq/wikitq.py) for an example
- 2) Add a metric script to [metrics/](./metrics)
- 1) See [./metrics/wikitq.py](./metrics/wikitq/wikitq.py) for an example
- 3) Put all dataset-specific utils in `./utils/{dataset_name}`
- 1) As done in [./utils/wikitq/wikitq.py](./utils/wikitq/wikitq.py), this should also contain a pre-process function
- 4) Modify `load_dataset()` in [./utils/dataset_loader.py](./utils/dataset_loader.py) with dataset-specific logic (see the sketch below)
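-
-A minimal sketch of what the dataset-specific branch in `load_dataset()` might look like (the function signature and the Hugging Face dataset id here are illustrative assumptions, not the actual implementation):
-
-```python
-from datasets import load_dataset as hf_load_dataset
-
-def load_dataset(dataset_name: str, max_val_samples=None):
-    """Dispatch to dataset-specific loading and pre-processing logic."""
-    if dataset_name == "wikitq":
-        # Assumption: WikiTQ is pulled from the Hugging Face hub; the real
-        # loader may read local files and call the pre-process fn instead
-        dataset = hf_load_dataset("wikitablequestions", split="validation")
-    else:
-        raise ValueError(f"No loading logic registered for '{dataset_name}'")
-    if max_val_samples is not None:
-        # Mirrors the `max_val_samples` debugging tip above
-        dataset = dataset.select(range(max_val_samples))
-    return dataset
-```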
-
-
-## General Notes
-- GPT-4 is much more capable of discerning when we need BlendSQL vs. when native SQL suffices
-  - Examples:
-    - Using `SUBSTR` calls
-    - Leveraging the table title; e.g., given a 'Hospitals in California' table, we don't need a `state = 'CA'` condition
-- Omitting the `binary` argument in the `LLMMap` ingredient call improves results
-  - On the initial 200:
-    - 0.479 vs 0.463 with text-davinci-003 as parser, gpt-4 as QA
-    - 0.432 vs 0.406 with text-davinci-003 as parser and QA
-  - This avoids the catastrophic parsing issue where we fail to include `binary=0` in our ingredient call
-  - But it puts more pressure on the QA model to be right (a sketch of the two call styles follows below)
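-
-To make the two call styles concrete, here is a sketch (the queries are illustrative, not drawn from the eval set):
-
-```python
-# Default (no `binary` argument): the ingredient maps each value to a yes/no label
-binary_query = """
-SELECT COUNT(*) FROM w
-WHERE {{LLMMap('is this town in north carolina?', 'w::home town')}} = 1
-"""
-
-# With `binary=0`: the model generates free-form values, guided by `example_outputs`
-non_binary_query = """
-SELECT * FROM w
-WHERE {{
-    LLMMap(
-        'what state is this town in?',
-        'w::home town',
-        binary=0,
-        example_outputs='NC;CA;-1;'
-    )
-}} = 'NC'
-"""
-```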
-
-The new `LLMQA` ingredient works, but we have some issues with response formatting.
-
-https://www.sqlite.org/fts3.html#appendix_a
-"In SQLite, when a sub-query used in a join contains a LIMIT clause, the results of the sub-query are calculated and stored in temporary table before the main query is executed."
-
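-A quick way to see this materialization behavior is to ask SQLite for its query plan (a sketch; the exact plan text varies by SQLite version):
-
-```python
-import sqlite3
-
-con = sqlite3.connect(":memory:")
-con.executescript("CREATE TABLE a(x); CREATE TABLE b(y);")
-# Per the docs quoted above, the LIMIT'd sub-query is computed and stored
-# before the outer join runs
-for row in con.execute(
-    "EXPLAIN QUERY PLAN "
-    "SELECT * FROM a JOIN (SELECT y FROM b LIMIT 5) sub ON a.x = sub.y"
-):
-    print(row)  # expect a MATERIALIZE / CO-ROUTINE step for the sub-query
-```
-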
-https://neuml.hashnode.dev/building-an-efficient-sparse-keyword-index-in-python
-
-```python
-# `metric` is the dataset's evaluation metric object (see ./metrics/wikitq)
-metric.add(
- prediction=["The same problem was oil pressure."],
- reference={
- "answer_text": ["Oil pressure"],
- "question": "what was the same problem that bernard collomb had as innes ireland?",
- },
-)
-```
-The above WikiTQ metric computation results in a score of 0.0.
-We could either do some post-processing (e.g., Bridge-style extraction of 'oil pressure') or better prompting; a sketch of the former follows.
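-
-A rough sketch of such post-processing (a hypothetical helper that matches known table values against the verbose prediction):
-
-```python
-def extract_answer_span(prediction: str, table_values: list) -> str:
-    # If a known table value appears inside the model's verbose answer,
-    # return just that span; otherwise fall back to the raw prediction
-    for value in table_values:
-        if value.lower() in prediction.lower():
-            return value
-    return prediction
-
-print(extract_answer_span("The same problem was oil pressure.", ["Oil pressure"]))
-# -> "Oil pressure"
-```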
-
-This could benefit from some external tools. For example, for 'difference of points' (a sketch of such a helper follows the example below):
-
-```json
-"sql": "SELECT MAX ( abs ( `Result` - `Result` ) ) FROM w",
-"answer_text": [
-  "29"
-],
-"input_program_args": {
-  "bridge_hints": "",
-  "question": "what was the largest difference of points in a single game?",
-  "serialized_db": "Table Description: 1960 Buffalo Bills season\nCREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"week\" INTEGER,\n \"date\" TEXT,\n \"opponent\" TEXT,\n \"result\" TEXT,\n \"game site\" TEXT,\n \"attendance\" INTEGER\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id week date opponent result game site attendance\n 0 0 1 1960-9-11 at new york titans l 27\u20133 polo grounds 10200\n 1 1 2 1960-9-18 denver broncos l 27\u201321 war memorial stadium 15229\n 2 2 3 1960-9-23 at boston patriots w 13\u20130 nickerson field 20732\n*/"
-},
-"db_path": "research/db/wikitq/csv/203-csv/775.db",
-"pred_text": [
-  "8"
-],
-"pred_sql": "SELECT MAX({{LLMMap('difference of points', 'w::result')}}) FROM w",
-"pred_has_ingredient": true,
-```
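-
-This is the kind of deterministic helper an external tool could contribute here (a sketch; the regex assumes score strings like 'l 27–3'):
-
-```python
-import re
-
-def point_difference(result: str):
-    # Parse a score string like 'l 27–3' and return the absolute point difference
-    match = re.search(r"(\d+)\D+(\d+)", result)
-    if match is None:
-        return None
-    a, b = map(int, match.groups())
-    return abs(a - b)
-
-print(point_difference("l 27–3"))  # 24
-```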
-
-## Good Examples
-- [Example 1](#good_example_1)
-- [Example 2](#good_example_2)
-- [Example 3](#good_example_3)
-- [Example 4](#good_example_4)
-- [Example 5](#good_example_5)
-
-### Good Example 1
-```json
-{
- "id": "nt-267",
- "question": "total number of players whose home town was in north carolina (nc)",
- "table_id": "csv/204-csv/526.csv",
- "table": {
- "page_title": "2010\u201311 North Carolina Tar Heels men's basketball team",
- "header": [
- "Name",
- "#",
- "Position",
- "Height",
- "Weight",
- "Year",
- "Home Town",
- "High School"
- ],
- "rows": [
- [
- "Harrison Barnes",
- "40",
- "Forward",
- "6\u20138",
- "210",
- "Freshman",
- "Ames, IA",
- "Ames"
- ],
- [
- "Daniel Bolick",
- "3",
- "Guard",
- "5\u201310",
- "175",
- "Senior",
- "Carrboro, NC",
- "Chapel Hill"
- ],
- [
- "Reggie Bullock",
- "35",
- "Guard",
- "6\u20137",
- "190",
- "Freshman",
- "Kinston, NC",
- "Kinston"
- ],
- [
- "Stewart Cooper",
- "15",
- "Forward",
- "6\u20135",
- "205",
- "Junior",
- "Winston-Salem, NC",
- "Forsyth County Day"
- ],
- [
- "Patrick Crouch",
- "30",
- "Guard",
- "5\u201311",
- "175",
- "Junior",
- "Asheville, NC",
- "T.C. Roberson"
- ],
- [
- "Larry Drew II*",
- "11",
- "Guard",
- "6\u20132",
- "180",
- "Junior",
- "Encino, CA",
- "Woodland Hills Taft"
- ],
- [
- "David Dupont",
- "22",
- "Forward",
- "6\u20135",
- "195",
- "Junior",
- "Greensboro, NC",
- "Grimsley"
- ],
- [
- "Van Hatchell",
- "13",
- "Forward",
- "6\u20134",
- "185",
- "Senior",
- "Chapel Hill, NC",
- "Cresset Christian"
- ],
- [
- "John Henson",
- "31",
- "Forward",
- "6\u201310",
- "210",
- "Sophomore",
- "Tampa, FL",
- "Sickles"
- ],
- [
- "D.J. Johnston",
- "32",
- "Forward",
- "6\u20134",
- "195",
- "Junior",
- "Lower Gwynedd, PA",
- "Germantown"
- ],
- [
- "Justin Knox",
- "25",
- "Forward",
- "6\u20139",
- "240",
- "Graduate",
- "Tuscaloosa, AL",
- "Central"
- ],
- [
- "Kendall Marshall",
- "5",
- "Guard",
- "6\u20133",
- "186",
- "Freshman",
- "Dumfries, VA",
- "Bishop O'Connell"
- ],
- [
- "Leslie McDonald",
- "15",
- "Guard",
- "6\u20134",
- "215",
- "Sophomore",
- "Memphis, TN",
- "Briarcrest Christian"
- ],
- [
- "Dexter Strickland",
- "1",
- "Guard",
- "6\u20133",
- "180",
- "Sophomore",
- "Rahway, NJ",
- "St. Patrick"
- ],
- [
- "Justin Watts",
- "24",
- "Guard",
- "6\u20134",
- "210",
- "Junior",
- "Durham, NC",
- "Jordan"
- ],
- [
- "Tyler Zeller",
- "44",
- "Forward",
- "7\u20130",
- "250",
- "Junior",
- "Washington, IN",
- "Washington"
- ]
- ]
- },
- "sql": "SELECT COUNT ( `Name` ) FROM w WHERE `Home Town` = 'nc'",
- "answer_text": [
- "7"
- ],
- "input_program_args": {
- "bridge_hints": "",
- "examples": [
- {
- "blendsql": "SELECT DISTINCT description, merchant FROM transactions WHERE\n {{Model('is this a pizza shop?', 'transactions::merchant', endpoint_name='gpt-4')}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'transactions::description',\n binary=0,\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shops in MA",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- },
- {
- "blendsql": "SELECT DISTINCT description, merchant, child_category, date FROM transactions WHERE\n {{Model('would this gift be appropriate for a child?', 'transactions::description')}} = 1\n AND child_category = 'Gifts'",
- "question": "Show gifts for my kid",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- }
- ],
- "program": "\n{{#system~}}\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{~/system}}\n\n{{#user~}}\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL:\n{{~/user}}\n\n{{#assistant~}}\n{{gen \"result\" temperature=0.0}}\n{{~/assistant}}\n",
- "question": "total number of players whose home town was in north carolina (nc)",
- "serialized_db": "CREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"name\" TEXT,\n \"#\" INTEGER,\n \"position\" TEXT,\n \"height\" TEXT,\n \"weight\" INTEGER,\n \"year\" TEXT,\n \"home town\" TEXT,\n \"high school\" TEXT\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id name # position height weight year home town high school\n 0 0 harrison barnes 40 forward 6\u20138 210 freshman ames, ia ames\n 1 1 daniel bolick 3 guard 5\u201310 175 senior carrboro, nc chapel hill\n 2 2 reggie bullock 35 guard 6\u20137 190 freshman kinston, nc kinston\n*/"
- },
- "db_path": "./research/tmp/8fa07b3b-f51d-4690-ad04-e908fe0cf071.db",
- "pred_text": [
- 7
- ],
- "pred_sql": "SELECT COUNT(*) FROM w WHERE {{Model('is this town in North Carolina?', 'w::home town')}} = 1"
-}
-```
-### Good Example 2
-```json
-{
- "id": "nt-647",
- "question": "in how many games did sri lanka score at least 2 goals?",
- "table_id": "csv/204-csv/913.csv",
- "table": {
- "page_title": "Kasun Jayasuriya",
- "header": [
- "#",
- "Date",
- "Venue",
- "Opponent",
- "Score",
- "Result",
- "Competition"
- ],
- "rows": [
- [
- "1.",
- "1 May 2000",
- "Rasmee Dhandu Stadium, Mal\u00e9, Maldives",
- "Maldives",
- "1\u20131",
- "1\u20131",
- "2000 MFF Golden Jubilee Tournament"
- ],
- [
- "2.",
- "25 August 2000",
- "Ho Chi Minh City, Vietnam",
- "Vietnam",
- "2\u20131",
- "2\u20132",
- "2000 HCM Cup"
- ],
- [
- "3.",
- "28 March 2001",
- "Colombo, Sri Lanka",
- "Cambodia",
- "1\u20130",
- "1\u20130",
- "Friendly"
- ],
- [
- "4.",
- "17 May 2001",
- "Beirut Municipal Stadium, Beirut, Lebanon",
- "Pakistan",
- "3\u20132",
- "3\u20133",
- "2002 FIFA World Cup qualification (AFC)"
- ],
- [
- "5.",
- "30 May 2001",
- "Suphachalasai Stadium, Bangkok, Thailand",
- "Pakistan",
- "1\u20130",
- "3\u20131",
- "2002 FIFA World Cup qualification (AFC)"
- ],
- [
- "6.",
- "30 May 2001",
- "Supachalasai Stadium, Bangkok, Thailand",
- "Pakistan",
- "3\u20130",
- "3\u20131",
- "2002 FIFA World Cup qualification (AFC)"
- ],
- [
- "7.",
- "4 April 2002",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Maldives",
- "1\u20130",
- "1\u20130",
- "Friendly"
- ],
- [
- "8.",
- "21 March 2003",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Timor-Leste",
- "1\u20131",
- "3\u20132",
- "2004 AFC Asian Cup qualification"
- ],
- [
- "9.",
- "21 March 2003",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Timor-Leste",
- "3\u20132",
- "3\u20132",
- "2004 AFC Asian Cup qualification"
- ],
- [
- "10.",
- "3 December 2003",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Laos",
- "2\u20130",
- "3\u20130",
- "2006 FIFA World Cup qualification (AFC)"
- ],
- [
- "11.",
- "2 April 2006",
- "MA Aziz Stadium, Chittagong, Bangladesh",
- "Brunei",
- "2\u20130",
- "3\u20130",
- "2006 AFC Challenge Cup"
- ],
- [
- "12.",
- "12 April 2006",
- "MA Aziz Stadium, Chittagong, Bangladesh",
- "Nepal",
- "1\u20130",
- "1\u20131",
- "2006 AFC Challenge Cup"
- ],
- [
- "13.",
- "24 March 2007",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Malaysia",
- "1\u20134",
- "1\u20134",
- "Friendly"
- ],
- [
- "14.",
- "2 April 2008",
- "Chungshan Stadium, Taipei, Taiwan",
- "Guam",
- "5\u20131",
- "5\u20131",
- "2008 AFC Challenge Cup qualification"
- ],
- [
- "15.",
- "4 April 2008",
- "Chungshan Stadium, Taipei, Taiwan",
- "Pakistan",
- "1\u20130",
- "7\u20131",
- "2008 AFC Challenge Cup qualification"
- ],
- [
- "16.",
- "4 April 2008",
- "Chungshan Stadium, Taipei, Taiwan",
- "Pakistan",
- "4\u20131",
- "7\u20131",
- "2008 AFC Challenge Cup qualification"
- ],
- [
- "17.",
- "4 April 2008",
- "Chungshan Stadium, Taipei, Taiwan",
- "Pakistan",
- "7\u20131",
- "7\u20131",
- "2008 AFC Challenge Cup qualification"
- ],
- [
- "18.",
- "6 April 2008",
- "Chungshan Stadium, Taipei, Taiwan",
- "Chinese Taipei",
- "2\u20132",
- "2\u20132",
- "2008 AFC Challenge Cup qualification"
- ],
- [
- "19.",
- "2 August 2008",
- "Gachibowli Athletic Stadium, Hyderabad, India",
- "Myanmar",
- "1\u20131",
- "1\u20133",
- "2008 AFC Challenge Cup"
- ],
- [
- "20.",
- "4 April 2009",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Brunei",
- "1\u20130",
- "5\u20131",
- "2010 AFC Challenge Cup qualification"
- ],
- [
- "21.",
- "4 April 2009",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Brunei",
- "3\u20130",
- "5\u20131",
- "2010 AFC Challenge Cup qualification"
- ],
- [
- "22.",
- "4 April 2009",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Brunei",
- "4\u20130",
- "5\u20131",
- "2010 AFC Challenge Cup qualification"
- ],
- [
- "23.",
- "4 April 2009",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Brunei",
- "5\u20130",
- "5\u20131",
- "2010 AFC Challenge Cup qualification"
- ],
- [
- "24.",
- "6 April 2009",
- "Sugathadasa Stadium, Colombo, Sri Lanka",
- "Chinese Taipei",
- "1\u20130",
- "2\u20131",
- "2010 AFC Challenge Cup qualification"
- ],
- [
- "25.",
- "6 December 2009",
- "Bangabandhu National Stadium, Dhaka, Bangladesh",
- "Bhutan",
- "3\u20130",
- "6\u20130",
- "2009 South Asian Football Federation Cup"
- ],
- [
- "26.",
- "6 December 2009",
- "Bangabandhu National Stadium, Dhaka, Bangladesh",
- "Bhutan",
- "4\u20130",
- "6\u20130",
- "2009 South Asian Football Federation Cup"
- ],
- [
- "27.",
- "6 December 2009",
- "Bangabandhu National Stadium, Dhaka, Bangladesh",
- "Bhutan",
- "5\u20130",
- "6\u20130",
- "2009 South Asian Football Federation Cup"
- ]
- ]
- },
- "sql": "SELECT COUNT ( * ) FROM w WHERE `Score` >= 2",
- "answer_text": [
- "16"
- ],
- "input_program_args": {
- "bridge_hints": "",
- "examples": [
- {
- "blendsql": "SELECT DISTINCT description, merchant FROM transactions WHERE\n {{Model('is this a pizza shop?', 'transactions::merchant', endpoint_name='gpt-4')}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'transactions::description',\n binary=0,\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shops in MA",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- },
- {
- "blendsql": "SELECT DISTINCT description, merchant, child_category, date FROM transactions WHERE\n {{Model('would this gift be appropriate for a child?', 'transactions::description')}} = 1\n AND child_category = 'Gifts'",
- "question": "Show gifts for my kid",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- }
- ],
- "program": "\n{{#system~}}\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{~/system}}\n\n{{#user~}}\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL:\n{{~/user}}\n\n{{#assistant~}}\n{{gen \"result\" temperature=0.0}}\n{{~/assistant}}\n",
- "question": "in how many games did sri lanka score at least 2 goals?",
- "serialized_db": "CREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"#\" REAL,\n \"date\" TEXT,\n \"venue\" TEXT,\n \"opponent\" TEXT,\n \"score\" TEXT,\n \"result\" TEXT,\n \"competition\" TEXT\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id # date venue opponent score result competition\n 0 0 1.0 2000-5-1 rasmee dhandu stadium, mal\u00e9, maldives maldives 1\u20131 1\u20131 2000 mff golden jubilee tournament\n 1 1 2.0 2000-8-25 ho chi minh city, vietnam vietnam 2\u20131 2\u20132 2000 hcm cup\n 2 2 3.0 2001-3-28 colombo, sri lanka cambodia 1\u20130 1\u20130 friendly\n*/"
- },
- "db_path": "./research/tmp/4bd65c0e-5640-42e8-8af9-5274ea1f36d4.db",
- "pred_text": [
- 16
- ],
- "pred_sql": "SELECT COUNT(*) FROM w WHERE {{Model('how many goals did sri lanka score in this game?', 'w::score')}} >= 2"
-}
-```
-
-### Good Example 3
-```json
-{
- "id": "nt-373",
- "question": "tell me the number of shows that only have one host per day.",
- "table_id": "csv/204-csv/839.csv",
- "table": {
- "page_title": "List of programs broadcast by Fox News Channel",
- "header": [
- "ET",
- "Days available",
- "Program",
- "Host(s)",
- "Description"
- ],
- "rows": [
- [
- "9am\u201310am",
- "Monday\u2013Friday",
- "The Morning Click",
- "Jamie Colby (M,T), Harris Faulkner (W\u2013F)",
- "Current events."
- ],
- [
- "10am\u201310:30am",
- "Monday\u2013Friday",
- "",
- "Lauren Green (M), Uma Pemmaraju (T), Gregg Jarrett (W), Arthel Neville (F), Rick Folbaum (F), Heather Childers",
- "Current events."
- ],
- [
- "11am\u201311:30am",
- "Monday\u2013Friday",
- "FBN Live",
- "Lauren Simonetti",
- "Business news and information."
- ],
- [
- "12pm\u20131pm",
- "Monday\u2013Friday",
- "On the Hunt",
- "Jonathan Hunt",
- "Current events."
- ],
- [
- "2pm\u20133pm",
- "Monday",
- "Entertainment Hour",
- "Ashley Dvorkin",
- "Entertainment news, musical performances."
- ],
- [
- "2pm\u20133pm",
- "Tuesday",
- "",
- "Kimberly Guilfoyle",
- "Crime and legal news."
- ],
- [
- "2pm\u20133pm",
- "Wednesday",
- "Defcon 3",
- "Kathleen Troia McFarland",
- "Military news and information."
- ],
- [
- "2pm\u20133pm",
- "Thursday",
- "Fox Car Report",
- "Gary Gastelu",
- "Automotive news and information."
- ],
- [
- "2pm\u20133pm",
- "Friday",
- "Spirited Debate",
- "Lauren Green",
- "Discussion on religious topics."
- ]
- ]
- },
- "sql": "SELECT COUNT ( `Program` ) FROM w WHERE `Host(s)` = 1",
- "answer_text": [
- "7"
- ],
- "input_program_args": {
- "bridge_hints": "",
- "examples": [
- {
- "blendsql": "SELECT year FROM w WHERE opponent = 'elizabeth matory'",
- "question": "What year was Elizabeth Matory the opponent of Charles Albert Ruppersberger?",
- "serialized_db": "\nTable Description: Dutch Ruppersberger (Electoral history)\nCREATE TABLE w(\n\trow_id int,\n\tyear int,\n\toffice text,\n\telection text,\n\tfilledcolumnname real,\n\tsubject text,\n\tparty text,\n\tvotes text,\n\t% text,\n\tfilledcolumnname_2 real,\n\topponent text,\n\tparty_2 text,\n\tvotes_2 text,\n\t%_2 text)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3;\nrow_id\tyear\toffice\telection\tfilledcolumnname\tsubject\tparty\tvotes\t%\tfilledcolumnname_2\topponent\tparty_2\tvotes_2\t%_2\n0\t1994\tbaltimore county executive\tgeneral\tnan\tdutch ruppersberger\tdemocratic\tn/a\tn/a\tnan\tn/a\tn/a\tn/a\tn/a\n1\t1998\tbaltimore county executive\tgeneral\tnan\tdutch ruppersberger\tdemocratic\t166482\t70.47\tnan\tjohn j. bishop\trepublican\t69449\t29.4\n2\t2002\tnone\tgeneral\tnan\tdutch ruppersberger\tdemocratic\t105718\t54.16\tnan\thelen delich bentley\trepublican\t88954\t45.57\n*/\n"
- },
- {
- "blendsql": "SELECT system FROM w WHERE system IN ('game boy advance', 'msx (japan only)') ORDER BY japan LIMIT 1",
- "question": "Which system has a lower number for Japan of the virtual console systems: Game Boy Advance or the Japan-only console MSX?",
- "serialized_db": "\nTable Description: Virtual Console (Titles)\nCREATE TABLE w (\n\trow_id int,\n\tsystem text,\n\tjapan int,\n\t[[list of virtual console games for wii u (north america)|north america]] real,\n\tpal region - europe real,\n\tpal region - australia real\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3;\nrow_id\tsystem\tjapan\t[[list of virtual console games for wii u (north america)|north america]]\tpal region - europe\tpal region - australia\n0\tnes/famicom\t148\t94.0\t89.0\t89.0\n1\tsuper nes/super famicom\t101\t51.0\t49.0\t49.0\n2\tnintendo 64\t22\t21.0\t21.0\t21.0\n*/ \n "
- },
- {
- "blendsql": "SELECT player FROM w WHERE \n {{\n Model('Has crossed swords on its logo?'; 'w::gold coast titans')\n }} = 1\n ",
- "question": "What player was transferred from the team that has crossed swords on its logo to the Warrington Wolves in the 2018 season?",
- "serialized_db": "\nTable Description: 2018 Warrington Wolves season (Transfers | In)\nCREATE TABLE w(\n\trow_id int,\n\tplayer text,\n\tsigned from text,\n\tcontract length text,\n\tannounced text)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3;\nrow_id\tplayer\tsigned from\tcontract length\tannounced\n0\tsitaleki akauola\tpenrith panthers\tp2y\t2017-08-01 00:00:00\n1\tbryson goodwin\tsouth sydney rabbitohs\tp2y\t2017-10-01 00:00:00\n2\ttyrone roberts\tgold coast titans\tp3y\t2017-10-01 00:00:00\n*/\nCREATE TABLE Images(\n\trow_id int,\n\tgold coast titans text)\n/*\nAll rows of the table:\nSELECT * FROM w;\nrow_id\tgold coast titans\n0\ta logo for the golden knights is painted on the beach.\n*/\n"
- },
- {
- "blendsql": "SELECT DISTINCT merchant FROM w WHERE\n {{Model('is this a pizza shop?', 'w::merchant'}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'w::description',\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shop merchants in MA",
- "serialized_db": "\nTable Description: transactions\nCREATE TABLE w (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n/*\n3 example rows:\nSELECT * FROM transactions LIMIT 3\n index description amount merchant cash_flow parent_category child_category date confidence_score\n 0 Zelle payment from JUSTIN ORENIC BACucw5n1t14 2000.0 Zelle Expense Cash/ATM Online Payment Service 2022-06-20 0.698891\n 1 Zelle Transfer Conf UJ7QJF7TE Emilia Galindo 1500.0 Zelle Expense Cash/ATM Online Payment Service 2022-01-16 0.701658\n 2 Zelle payment from SOPHIA YANEZ WFCT0Q9R3CNR 1474.0 Zelle Expense Cash/ATM Online Payment Service 2022-03-27 0.861237\n*/\n)"
- }
- ],
- "program": "\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL: {{gen \"result\" temperature=0.0}}\n",
- "question": "tell me the number of shows that only have one host per day.",
- "serialized_db": "Table Description: List of programs broadcast by Fox News Channel\nCREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"et\" TEXT,\n \"days available\" TEXT,\n \"program\" TEXT,\n \"host(s)\" TEXT,\n \"description\" TEXT\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id et days available program host(s) description\n 0 0 (t09,t10,pt1h) (xxxx-wxx-1,xxxx-wxx-5,p4d) the tmo click jamie colby (m,t), harris faulkner (w\u2013f) current events.\n 1 1 (t10,t10:30,pt30m) (xxxx-wxx-1,xxxx-wxx-5,p4d) none lauren green (m), uma pemmaraju (t), gregg jarrett (w), arthel neville (f), rick folbaum (f), heather childers current events.\n 2 2 (t11,t11:30,pt30m) (xxxx-wxx-1,xxxx-wxx-5,p4d) fbn live lauren simonetti business news and information.\n*/"
- },
- "db_path": "./research/tmp/2838e14c-9b66-41b8-9fda-e8141a39617c.db",
- "pred_text": [
- 7
- ],
- "pred_sql": " SELECT COUNT(*) FROM w WHERE \n {{\n Model('How many hosts per day?', 'w::host(s)')\n }} = 1",
- "pred_has_ingredient": true,
- "example_map_outputs": [
- [
- "2",
- "6",
- "1",
- "1",
- "1",
- "1",
- "1",
- "1",
- "1"
- ]
- ]
- }
-```
-### Good Example 4
-```json
-{
- "id": "nt-40",
- "question": "what was the number of silver medals won by ukraine?",
- "table_id": "csv/203-csv/175.csv",
- "table": {
- "page_title": "Sailing at the 2004 Summer Olympics",
- "header": [
- "Rank",
- "Nation",
- "Gold",
- "Silver",
- "Bronze",
- "Total"
- ],
- "rows": [
- [
- "1",
- "Great Britain\u00a0(GBR)",
- "2",
- "1",
- "2",
- "5"
- ],
- [
- "2",
- "Brazil\u00a0(BRA)",
- "2",
- "0",
- "0",
- "2"
- ],
- [
- "3",
- "Spain\u00a0(ESP)",
- "1",
- "2",
- "0",
- "3"
- ],
- [
- "4",
- "Austria\u00a0(AUT)",
- "1",
- "1",
- "0",
- "2"
- ],
- [
- "4",
- "Greece\u00a0(GRE)",
- "1",
- "1",
- "0",
- "2"
- ],
- [
- "4",
- "United States\u00a0(USA)",
- "1",
- "1",
- "0",
- "2"
- ],
- [
- "7",
- "France\u00a0(FRA)",
- "1",
- "0",
- "1",
- "2"
- ],
- [
- "7",
- "Israel\u00a0(ISR)",
- "1",
- "0",
- "0",
- "1"
- ],
- [
- "7",
- "Norway\u00a0(NOR)",
- "1",
- "0",
- "0",
- "1"
- ],
- [
- "10",
- "Ukraine\u00a0(UKR)",
- "0",
- "2",
- "0",
- "2"
- ],
- [
- "11",
- "China\u00a0(CHN)",
- "0",
- "1",
- "0",
- "1"
- ],
- [
- "11",
- "Czech Republic\u00a0(CZE)",
- "0",
- "1",
- "0",
- "1"
- ],
- [
- "11",
- "Canada\u00a0(CAN)",
- "0",
- "1",
- "0",
- "1"
- ],
- [
- "14",
- "Denmark\u00a0(DEN)",
- "0",
- "0",
- "2",
- "2"
- ],
- [
- "15",
- "Argentina\u00a0(ARG)",
- "0",
- "0",
- "1",
- "1"
- ],
- [
- "15",
- "Italy\u00a0(ITA)",
- "0",
- "0",
- "1",
- "1"
- ],
- [
- "15",
- "Japan\u00a0(JPN)",
- "0",
- "0",
- "1",
- "1"
- ],
- [
- "15",
- "Poland\u00a0(POL)",
- "0",
- "0",
- "1",
- "1"
- ],
- [
- "15",
- "Slovenia\u00a0(SLO)",
- "0",
- "0",
- "1",
- "1"
- ],
- [
- "15",
- "Sweden\u00a0(SWE)",
- "0",
- "0",
- "1",
- "1"
- ],
- [
- "Total",
- "",
- "11",
- "11",
- "11",
- "33"
- ]
- ]
- },
- "sql": "SELECT `Silver` FROM w WHERE `Nation` = 'ukraine'",
- "answer_text": [
- "2"
- ],
- "input_program_args": {
- "bridge_hints": "",
- "endpoint_name": "gpt-4",
- "examples": [
- {
- "blendsql": "SELECT DISTINCT description, merchant FROM transactions WHERE\n {{Model('is this a pizza shop?', 'transactions::merchant', endpoint_name='gpt-4')}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'transactions::description',\n binary=0,\n example_outputs='TX;CA;MA;-1;',\n endpoint_name='gpt-4'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shops in MA",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- },
- {
- "blendsql": "SELECT DISTINCT description, merchant, child_category, date FROM transactions WHERE\n {{Model('would this gift be appropriate for a child?', 'transactions::description', enpoint_name='gpt-4')}} = 1\n AND child_category = 'Gifts'",
- "question": "Show gifts for my kid",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- }
- ],
- "program": "\n{{#system~}}\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{~/system}}\n\n{{#user~}}\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL:\n{{~/user}}\n\n{{#assistant~}}\n{{gen \"result\" temperature=0.0}}\n{{~/assistant}}\n",
- "question": "what was the number of silver medals won by ukraine?",
- "serialized_db": "CREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"rank\" TEXT,\n \"nation\" TEXT,\n \"gold\" INTEGER,\n \"silver\" INTEGER,\n \"bronze\" INTEGER,\n \"total\" INTEGER\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id rank nation gold silver bronze total\n 0 0 1 great britain\u00a0(gbr) 2 1 2 5\n 1 1 2 brazil\u00a0(bra) 2 0 0 2\n 2 2 3 spain\u00a0(esp) 1 2 0 3\n*/"
- },
- "db_path": "./research/tmp/66b50e28-4ffd-4890-8236-5a7f842f284b.db",
- "pred_text": [
- 2
- ],
- "pred_sql": "SELECT silver FROM w WHERE {{Model('is this the country ukraine?', 'w::nation', endpoint_name='gpt-4')}} = 1"
- }
-```
-
-### Good Example 5
-```json
-{
- "id": "nt-353",
- "question": "between bjorn ferry, simon elder and erik lesser - who had the most penalties?",
- "table_id": "csv/204-csv/664.csv",
- "table": {
- "page_title": "Biathlon World Championships 2013 \u2013 Men's pursuit",
- "header": [
- "Rank",
- "Bib",
- "Name",
- "Nationality",
- "Start",
- "Penalties (P+P+S+S)",
- "Time",
- "Deficit"
- ],
- "rows": [
- [
- "",
- "1",
- "Emil Hegle Svendsen",
- "Norway",
- "0:00",
- "1 (0+0+0+1)",
- "32:35.5",
- ""
- ],
- [
- "",
- "2",
- "Martin Fourcade",
- "France",
- "0:08",
- "2 (0+1+1+0)",
- "32:35.6",
- "+0.1"
- ],
- [
- "",
- "7",
- "Anton Shipulin",
- "Russia",
- "0:33",
- "1 (0+0+1+0)",
- "32:39.1",
- "+3.6"
- ],
- [
- "4",
- "5",
- "Dmitry Malyshko",
- "Russia",
- "0:23",
- "0 (0+0+0+0)",
- "32:40.9",
- "+5.4"
- ],
- [
- "5",
- "15",
- "Dominik Landertinger",
- "Austria",
- "0:58",
- "0 (0+0+0+0)",
- "32:55.9",
- "+20.4"
- ],
- [
- "6",
- "3",
- "Jakov Fak",
- "Slovenia",
- "0:11",
- "1 (0+0+0+1)",
- "33:05.2",
- "+29.7"
- ],
- [
- "7",
- "8",
- "Fredrik Lindstr\u00f6m",
- "Sweden",
- "0:38",
- "1 (0+0+1+0)",
- "33:18.9",
- "+43.4"
- ],
- [
- "8",
- "6",
- "Alexis B\u0153uf",
- "France",
- "0:25",
- "2 (0+0+2+0)",
- "33:22.1",
- "+46.6"
- ],
- [
- "9",
- "13",
- "Bj\u00f6rn Ferry",
- "Sweden",
- "0:52",
- "0 (0+0+0+0)",
- "33:23.6",
- "+48.1"
- ],
- [
- "10",
- "4",
- "Ole Einar Bj\u00f8rndalen",
- "Norway",
- "0:20",
- "4 (2+0+1+1)",
- "33:27.0",
- "+51.5"
- ],
- [
- "11",
- "14",
- "Lukas Hofer",
- "Italy",
- "0:57",
- "2 (0+2+0+0)",
- "33:28.8",
- "+53.3"
- ],
- [
- "12",
- "10",
- "Simon Eder",
- "Austria",
- "0:39",
- "3 (0+0+2+1)",
- "33:50.5",
- "+1:15.0"
- ],
- [
- "13",
- "32",
- "Lowell Bailey",
- "United States",
- "1:38",
- "0 (0+0+0+0)",
- "33:56.4",
- "+1:20.9"
- ],
- [
- "14",
- "12",
- "Erik Lesser",
- "Germany",
- "0:49",
- "4 (1+0+2+1)",
- "34:06.0",
- "+1:30.5"
- ],
- [
- "15",
- "36",
- "Jean-Philippe Leguellec",
- "Canada",
- "1:45",
- "1 (0+0+0+1)",
- "34:08.7",
- "+1:33.2"
- ],
- [
- "16",
- "9",
- "Evgeny Ustyugov",
- "Russia",
- "0:38",
- "3 (0+1+2+0)",
- "34:16.7",
- "+1:41.2"
- ],
- [
- "17",
- "18",
- "Tarjei B\u00f8",
- "Norway",
- "1:14",
- "3 (2+0+1+0)",
- "34:18.9",
- "+1:43.4"
- ],
- [
- "18",
- "29",
- "Simon Schempp",
- "Germany",
- "1:36",
- "1 (0+0+0+1)",
- "34:24.4",
- "+1:48.9"
- ],
- [
- "19",
- "11",
- "Krasimir Anev",
- "Bulgaria",
- "0:48",
- "2 (0+1+0+1)",
- "34:24.7",
- "+1:49.2"
- ],
- [
- "20",
- "37",
- "Henrik L'Abee-Lund",
- "Norway",
- "1:45",
- "1 (0+0+0+1)",
- "34:36.1",
- "+2:00.6"
- ],
- [
- "21",
- "16",
- "Arnd Peiffer",
- "Germany",
- "1:08",
- "3 (0+0+2+1)",
- "34:39.8",
- "+2:04.3"
- ],
- [
- "22",
- "23",
- "Andreas Birnbacher",
- "Germany",
- "1:25",
- "3 (0+1+1+1)",
- "34:41.7",
- "+2:06.2"
- ],
- [
- "23",
- "34",
- "Simon Fourcade",
- "France",
- "1:42",
- "3 (0+3+0+0)",
- "34:51.1",
- "+2:15.6"
- ],
- [
- "24",
- "21",
- "Klemen Bauer",
- "Slovenia",
- "1:16",
- "4 (1+0+3+0)",
- "34:55.4",
- "+2:19.9"
- ],
- [
- "25",
- "26",
- "Christian De Lorenzi",
- "Italy",
- "1:30",
- "1 (0+0+0+1)",
- "34:57.5",
- "+2:22.0"
- ],
- [
- "26",
- "20",
- "Jean-Guillaume B\u00e9atrix",
- "France",
- "1:15",
- "4 (0+1+2+1)",
- "34:58.4",
- "+2:22.9"
- ],
- [
- "27",
- "33",
- "Daniel Mesotitsch",
- "Austria",
- "1:39",
- "3 (0+0+2+1)",
- "34:58.7",
- "+2:23.2"
- ],
- [
- "28",
- "19",
- "Evgeniy Garanichev",
- "Russia",
- "1:14",
- "5 (2+0+2+1)",
- "35:04.2",
- "+2:28.7"
- ],
- [
- "29",
- "30",
- "Matej Kaz\u00e1r",
- "Slovakia",
- "1:36",
- "5 (2+1+1+1)",
- "35:04.9",
- "+2:29.4"
- ],
- [
- "30",
- "17",
- "Andriy Deryzemlya",
- "Ukraine",
- "1:13",
- "4 (0+0+2+2)",
- "35:05.3",
- "+2:29.8"
- ],
- [
- "31",
- "35",
- "Ond\u0159ej Moravec",
- "Czech Republic",
- "1:42",
- "3 (0+1+1+1)",
- "35:05.3",
- "+2:48.2"
- ],
- [
- "32",
- "28",
- "Tim Burke",
- "United States",
- "1:36",
- "4 (0+1+2+1)",
- "35:33.8",
- "+2:58.3"
- ],
- [
- "33",
- "58",
- "Dominik Windisch",
- "Italy",
- "2:31",
- "2 (0+0+1+1)",
- "35:36.4",
- "+3:00.9"
- ],
- [
- "34",
- "54",
- "Carl Johan Bergman",
- "Sweden",
- "2:19",
- "2 (0+0+1+1)",
- "35:37.0",
- "+3:01.5"
- ],
- [
- "35",
- "25",
- "Benjamin Weger",
- "Switzerland",
- "1:28",
- "3 (0+1+1+1)",
- "36:13.1",
- "+3:37.6"
- ],
- [
- "36",
- "42",
- "Artem Pryma",
- "Ukraine",
- "1:59",
- "3 (1+2+0+0)",
- "36:13.2",
- "+3:37.7"
- ],
- [
- "37",
- "50",
- "Pavol Hurajt",
- "Slovakia",
- "2:14",
- "2 (0+1+0+1)",
- "36:18.9",
- "+3:43.4"
- ],
- [
- "38",
- "22",
- "Tomas Kauk\u0117nas",
- "Lithuania",
- "1:20",
- "4 (0+0+2+2)",
- "36:33.4",
- "+3:57.9"
- ],
- [
- "39",
- "43",
- "Vladimir Iliev",
- "Bulgaria",
- "2:03",
- "5 (1+1+2+1)",
- "36:43.4",
- "+4:07.9"
- ],
- [
- "40",
- "40",
- "Edgars Piksons",
- "Latvia",
- "1:55",
- "4 (1+0+2+1)",
- "36:49.8",
- "+4:14.3"
- ],
- [
- "41",
- "60",
- "Christian Martinelli",
- "Italy",
- "2:36",
- "2 (1+0+0+1)",
- "36:53.0",
- "+4:17.5"
- ],
- [
- "42",
- "27",
- "Jaroslav Soukup",
- "Czech Republic",
- "1:32",
- "6 (2+2+2+0)",
- "36:59.8",
- "+4:24.3"
- ],
- [
- "43",
- "53",
- "Leif Nordgren",
- "United States",
- "2:19",
- "6 (2+2+2+0)",
- "37:08.3",
- "+4:32.8"
- ],
- [
- "44",
- "56",
- "Hidenori Isa",
- "Japan",
- "2:27",
- "2 (0+0+0+2)",
- "37:13.9",
- "+4:38.4"
- ],
- [
- "45",
- "41",
- "Serhiy Semenov",
- "Ukraine",
- "1:55",
- "5 (3+1+1+0)",
- "37:14.0",
- "+4:38.5"
- ],
- [
- "46",
- "24",
- "Zden\u011bk V\u00edtek",
- "Czech Republic",
- "1:25",
- "7 (2+2+1+2)",
- "37:21.0",
- "+4:45.5"
- ],
- [
- "47",
- "49",
- "Milanko Petrovi\u0107",
- "Serbia",
- "2:14",
- "6 (2+2+2+0)",
- "37:26.6",
- "+4:51.1"
- ],
- [
- "48",
- "31",
- "Kauri K\u00f5iv",
- "Estonia",
- "1:37",
- "6 (2+1+2+1)",
- "37:29.9",
- "+4:54.4"
- ],
- [
- "49",
- "47",
- "Yan Savitskiy",
- "Kazakhstan",
- "2:11",
- "5 (2+2+0+1)",
- "37:51.8",
- "+5:16.3"
- ],
- [
- "50",
- "45",
- "Scott Gow",
- "Canada",
- "2:06",
- "5 (0+0+1+4)",
- "37:57.4",
- "+5:21.9"
- ],
- [
- "51",
- "48",
- "Claudio B\u00f6ckli",
- "Switzerland",
- "2:13",
- "3 (2+0+0+1)",
- "38:02.8",
- "+5:27.3"
- ],
- [
- "52",
- "44",
- "Russell Currier",
- "United States",
- "2:05",
- "7 (3+1+2+1)",
- "38:06.2",
- "+5:30.7"
- ],
- [
- "53",
- "57",
- "Vladimir Chepelin",
- "Belarus",
- "2:28",
- "4 (0+1+2+1)",
- "38:06.5",
- "+5:31.0"
- ],
- [
- "54",
- "46",
- "Aliaksandr Babchyn",
- "Belarus",
- "2:06",
- "6 (3+2+1+0)",
- "38:19.8",
- "+5:44.3"
- ],
- [
- "55",
- "59",
- "Miroslav Matia\u0161ko",
- "Slovakia",
- "2:36",
- "6 (3+0+2+1)",
- "38:54.6",
- "+6:19.1"
- ],
- [
- "56",
- "38",
- "Jarkko Kauppinen",
- "Finland",
- "1:46",
- "6 (1+1+3+1)",
- "39:06.6",
- "+6:31.1"
- ],
- [
- "57",
- "51",
- "Junji Nagai",
- "Japan",
- "2:17",
- "7 (0+2+3+2)",
- "39:49.9",
- "+7:14.4"
- ],
- [
- "",
- "55",
- "Stefan Gavrila",
- "Romania",
- "2:25",
- "10 (4+3+3)",
- "LAP",
- ""
- ],
- [
- "",
- "39",
- "Sergei Sednev",
- "Ukraine",
- "1:54",
- "",
- "DNS",
- ""
- ],
- [
- "",
- "52",
- "Michal \u0160lesingr",
- "Czech Republic",
- "2:18",
- "",
- "DNS",
- ""
- ]
- ]
- },
- "sql": "SELECT `Name` FROM w WHERE `Name` IN ( 'bjorn ferry' , 'simon eder' , 'erik lesser' ) ORDER by `Penalties (P+P+S+S)` desc LIMIT 1",
- "answer_text": [
- "Erik Lesser"
- ],
- "input_program_args": {
- "bridge_hints": "name ( erik lesser )",
- "examples": [
- {
- "blendsql": "SELECT DISTINCT description, merchant FROM transactions WHERE\n {{Model('is this a pizza shop?', 'transactions::merchant', endpoint_name='gpt-4')}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'transactions::description',\n binary=0,\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shops in MA",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- },
- {
- "blendsql": "SELECT DISTINCT description, merchant, child_category, date FROM transactions WHERE\n {{Model('would this gift be appropriate for a child?', 'transactions::description')}} = 1\n AND child_category = 'Gifts'",
- "question": "Show gifts for my kid",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- }
- ],
- "program": "\n{{#system~}}\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{~/system}}\n\n{{#user~}}\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL:\n{{~/user}}\n\n{{#assistant~}}\n{{gen \"result\" temperature=0.0}}\n{{~/assistant}}\n",
- "question": "between bjorn ferry, simon elder and erik lesser - who had the most penalties?",
- "serialized_db": "CREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"rank\" REAL,\n \"bib\" INTEGER,\n \"name\" TEXT,\n \"nationality\" TEXT,\n \"start\" TEXT,\n \"penalties (p+p+s+s)\" TEXT,\n \"time\" TEXT,\n \"deficit\" TEXT\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id rank bib name nationality start penalties (p+p+s+s) time deficit\n 0 0 None 1 emil hegle svendsen norway 0:0 1 (0+0+0+1) 32:35.5 none\n 1 1 None 2 martin fourcade france 0:8 2 (0+1+1+0) 32:35.6 +0.1\n 2 2 None 7 anton shipulin russia 0:33 1 (0+0+1+0) 32:39.1 +3.6\n*/"
- },
- "db_path": "./research/tmp/bc195c10-5a92-4b4c-a665-59548a2a822a.db",
- "pred_text": [
- "erik lesser",
- null
- ],
- "pred_sql": "SELECT name, MAX({{Model('how many penalties does this person have?', 'w::penalties (p+p+s+s)', binary=0, example_outputs='0;1;2;3;4;5;6;7;8;9;10;') }}) as penalties FROM w WHERE name IN ('bjorn ferry', 'simon elder', 'erik lesser') GROUP BY name ORDER BY penalties DESC LIMIT 1"
-}
-```
-## Interesting Mistakes
-1) Correctly consults the 'remarks' column, but the question should have been 'how many pylons does this remark mention?' (a corrected sketch follows the example below)
-```json
-{
- "id": "nt-53",
- "question": "what is the total number of pylons listed?",
- "table_id": "csv/203-csv/375.csv",
- "table": {
- "page_title": "Aerial lift pylon",
- "header": [
- "Name",
- "Year built",
- "Country",
- "Town",
- "Height",
- "Remarks"
- ],
- "rows": [
- [
- "Glacial Aerial Tramway Kaprun III",
- "1966",
- "Austria",
- "Kaprun",
- "113.6 m",
- "The tallest pylon is on the third section"
- ],
- [
- "Mississippi Aerial River Transit",
- "1984",
- "USA",
- "New Orleans",
- "109 m",
- "The tallest pylon on a gondola lift; on 87 m pile foundations; demolished 1994"
- ],
- [
- "Torre Jaume I",
- "1931",
- "Spain",
- "Barcelona",
- "107 m",
- "Intermediate stop of the harbour aerial tramway, also observation tower"
- ],
- [
- "Gant Hohtaelli aerial tramway",
- "",
- "Switzerland",
- "Zermatt",
- "94 m",
- "One pylon"
- ],
- [
- "London Emirates Air Line",
- "Early 2012",
- "UK",
- "London",
- "88 m",
- "North Main Tower. River Thames aerial crossing between Greenwich Peninsular and Royal Docks"
- ],
- [
- "Singapore cable car",
- "1972",
- "Singapore",
- "Singapore",
- "88 m",
- "Pylon I"
- ],
- [
- "Eibsee Aerial Tramway",
- "1962",
- "Germany",
- "Garmisch-Partenkirchen",
- "85 m",
- "Pylon II"
- ],
- [
- "Nizhny Novgorod Volga Aerial Tramway, Tower 4 & 5",
- "2012",
- "Russia",
- "Nizhny Novgorod",
- "82 m",
- ""
- ],
- [
- "Mittersill goods aerial tramway",
- "194?",
- "Austria",
- "Mittersill",
- "80 m",
- "Two pylons for a tramway that never went in service and was demolished in the 1950s. One of the pylons was built of timber, the other of steel."
- ],
- [
- "Singapore cable car",
- "1972",
- "Singapore",
- "Singapore",
- "80 m",
- "Pylon II"
- ],
- [
- "3S Aerial Tramway",
- "2004",
- "Austria",
- "Kitzb\u00fchel",
- "80 m",
- "One pylon"
- ],
- [
- "Torre Sant Sebastia",
- "1931",
- "Spain",
- "Barcelona",
- "78 m",
- "Terminal of harbour aerial tramway"
- ],
- [
- "Roosevelt Island Tramway",
- "1976",
- "USA",
- "New York City",
- "76 m",
- "Central pylon of commuter tramway"
- ],
- [
- "Wendelstein Aerial Tramway",
- "1970",
- "Germany",
- "Bayerischzell-Osterhofen",
- "75 m",
- ""
- ],
- [
- "Vinpearl Cable Car",
- "2007",
- "Vietnam",
- "Nha Trang",
- "75 m",
- "7 pylons standing in the sea. Total height from sea bed is 115 m"
- ],
- [
- "Sandia Peak Tramway",
- "1965",
- "USA",
- "Albuquerque",
- "70.7 m",
- "Pylon 1, inclined at an angle of 18 degrees"
- ],
- [
- "Eibsee Aerial Tramway",
- "1962",
- "Germany",
- "Garmisch-Partenkirchen",
- "65 m",
- "Pylon I"
- ]
- ]
- },
- "sql": "SELECT COUNT ( `Name` ) FROM w",
- "answer_text": [
- "17"
- ],
- "input_program_args": {
- "bridge_hints": "",
- "examples": [
- {
- "blendsql": "SELECT DISTINCT description, merchant FROM transactions WHERE\n {{Model('is this a pizza shop?', 'transactions::merchant', endpoint_name='gpt-4')}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'transactions::description',\n binary=0,\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shops in MA",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- },
- {
- "blendsql": "SELECT DISTINCT description, merchant, child_category, date FROM transactions WHERE\n {{Model('would this gift be appropriate for a child?', 'transactions::description')}} = 1\n AND child_category = 'Gifts'",
- "question": "Show gifts for my kid",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- }
- ],
- "program": "\n{{#system~}}\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{~/system}}\n\n{{#user~}}\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL:\n{{~/user}}\n\n{{#assistant~}}\n{{gen \"result\" temperature=0.0}}\n{{~/assistant}}\n",
- "question": "what is the total number of pylons listed?",
- "serialized_db": "CREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"name\" TEXT,\n \"year built\" TEXT,\n \"country\" TEXT,\n \"town\" TEXT,\n \"height\" TEXT,\n \"remarks\" TEXT\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id name year built country town height remarks\n 0 0 glacial aerial tramway kaprun iii 1966 austria kaprun 113.6 m the tallest pylon is on the third section\n 1 1 mississippi aerial river transit 1984 usa new orleans 109 m the tallest pylon on a gondola lift; on 87 m pile foundations; demolished 1994\n 2 2 torre jaume i 1931 spain barcelona 107 m intermediate stop of the harbour aerial tramway, also observation tower\n*/"
- },
- "db_path": "./research/tmp/ec0862aa-6fa1-4032-acb0-e340eba39301.db",
- "pred_text": [
- 12
- ],
- "pred_sql": "SELECT COUNT(*) FROM w WHERE {{Model('does this remark mention a pylon?', 'w::remarks')}} = 1"
- }
-```
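-
-A corrected query along the lines of the note above might look like this (a sketch, not a verified fix; it sums per-remark pylon counts rather than counting matching rows):
-
-```python
-corrected_query = """
-SELECT SUM({{
-    LLMMap(
-        'how many pylons does this remark mention?',
-        'w::remarks',
-        binary=0,
-        example_outputs='1;2;7;'
-    )
-}}) FROM w
-"""
-```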
-
-2) Failing to add the argument `binary=0` (a corrected sketch follows the example below)
-```json
-{
- "id": "nt-395",
- "question": "what is the pylon with the least height listed here?",
- "table_id": "csv/203-csv/375.csv",
- "table": {
- "page_title": "Aerial lift pylon",
- "header": [
- "Name",
- "Year built",
- "Country",
- "Town",
- "Height",
- "Remarks"
- ],
- "rows": [
- [
- "Glacial Aerial Tramway Kaprun III",
- "1966",
- "Austria",
- "Kaprun",
- "113.6 m",
- "The tallest pylon is on the third section"
- ],
- [
- "Mississippi Aerial River Transit",
- "1984",
- "USA",
- "New Orleans",
- "109 m",
- "The tallest pylon on a gondola lift; on 87 m pile foundations; demolished 1994"
- ],
- [
- "Torre Jaume I",
- "1931",
- "Spain",
- "Barcelona",
- "107 m",
- "Intermediate stop of the harbour aerial tramway, also observation tower"
- ],
- [
- "Gant Hohtaelli aerial tramway",
- "",
- "Switzerland",
- "Zermatt",
- "94 m",
- "One pylon"
- ],
- [
- "London Emirates Air Line",
- "Early 2012",
- "UK",
- "London",
- "88 m",
- "North Main Tower. River Thames aerial crossing between Greenwich Peninsular and Royal Docks"
- ],
- [
- "Singapore cable car",
- "1972",
- "Singapore",
- "Singapore",
- "88 m",
- "Pylon I"
- ],
- [
- "Eibsee Aerial Tramway",
- "1962",
- "Germany",
- "Garmisch-Partenkirchen",
- "85 m",
- "Pylon II"
- ],
- [
- "Nizhny Novgorod Volga Aerial Tramway, Tower 4 & 5",
- "2012",
- "Russia",
- "Nizhny Novgorod",
- "82 m",
- ""
- ],
- [
- "Mittersill goods aerial tramway",
- "194?",
- "Austria",
- "Mittersill",
- "80 m",
- "Two pylons for a tramway that never went in service and was demolished in the 1950s. One of the pylons was built of timber, the other of steel."
- ],
- [
- "Singapore cable car",
- "1972",
- "Singapore",
- "Singapore",
- "80 m",
- "Pylon II"
- ],
- [
- "3S Aerial Tramway",
- "2004",
- "Austria",
- "Kitzb\u00fchel",
- "80 m",
- "One pylon"
- ],
- [
- "Torre Sant Sebastia",
- "1931",
- "Spain",
- "Barcelona",
- "78 m",
- "Terminal of harbour aerial tramway"
- ],
- [
- "Roosevelt Island Tramway",
- "1976",
- "USA",
- "New York City",
- "76 m",
- "Central pylon of commuter tramway"
- ],
- [
- "Wendelstein Aerial Tramway",
- "1970",
- "Germany",
- "Bayerischzell-Osterhofen",
- "75 m",
- ""
- ],
- [
- "Vinpearl Cable Car",
- "2007",
- "Vietnam",
- "Nha Trang",
- "75 m",
- "7 pylons standing in the sea. Total height from sea bed is 115 m"
- ],
- [
- "Sandia Peak Tramway",
- "1965",
- "USA",
- "Albuquerque",
- "70.7 m",
- "Pylon 1, inclined at an angle of 18 degrees"
- ],
- [
- "Eibsee Aerial Tramway",
- "1962",
- "Germany",
- "Garmisch-Partenkirchen",
- "65 m",
- "Pylon I"
- ]
- ]
- },
- "sql": "SELECT `Name` FROM w ORDER by `Height` asc LIMIT 1",
- "answer_text": [
- "Eibsee Aerial Tramway"
- ],
- "input_program_args": {
- "bridge_hints": "",
- "examples": [
- {
- "blendsql": "SELECT DISTINCT description, merchant FROM transactions WHERE\n {{Model('is this a pizza shop?', 'transactions::merchant', endpoint_name='gpt-4')}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'transactions::description',\n binary=0,\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shops in MA",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- },
- {
- "blendsql": "SELECT DISTINCT description, merchant, child_category, date FROM transactions WHERE\n {{Model('would this gift be appropriate for a child?', 'transactions::description')}} = 1\n AND child_category = 'Gifts'",
- "question": "Show gifts for my kid",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- }
- ],
- "program": "\n{{#system~}}\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{~/system}}\n\n{{#user~}}\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL:\n{{~/user}}\n\n{{#assistant~}}\n{{gen \"result\" temperature=0.0}}\n{{~/assistant}}\n",
- "question": "what is the pylon with the least height listed here?",
- "serialized_db": "CREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"name\" TEXT,\n \"year built\" TEXT,\n \"country\" TEXT,\n \"town\" TEXT,\n \"height\" TEXT,\n \"remarks\" TEXT\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id name year built country town height remarks\n 0 0 glacial aerial tramway kaprun iii 1966 austria kaprun 113.6 m the tallest pylon is on the third section\n 1 1 mississippi aerial river transit 1984 usa new orleans 109 m the tallest pylon on a gondola lift; on 87 m pile foundations; demolished 1994\n 2 2 torre jaume i 1931 spain barcelona 107 m intermediate stop of the harbour aerial tramway, also observation tower\n*/"
- },
- "db_path": "./research/tmp/4820622f-ef25-43f9-aa93-6501cd7db235.db",
- "pred_text": [
- "mississippi aerial river transit",
- "109 m"
- ],
- "pred_sql": "SELECT name, height FROM w ORDER BY {{Model('what is the numerical value of this height?', 'w::height')}} ASC LIMIT 1"
-}
-```
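-
-A plain-SQLite alternative for the case above (a sketch, assuming every `height` value ends in ' m'): the TEXT column sorts lexicographically, which puts '113.6 m' before '65 m', so extract the number before ordering rather than asking the model for it.
-```sql
-SELECT name FROM w
-ORDER BY CAST(REPLACE(height, ' m', '') AS REAL) ASC
-LIMIT 1;
-```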
-
-3) Could have reused the previous ingredient (bad few-shot prompts may be at fault here); a rewrite is sketched after the JSON
-```json
-{
- "id": "nt-397",
- "question": "how many provinces have the same number of districts as kabul?",
- "table_id": "csv/204-csv/704.csv",
- "table": {
- "page_title": "Outline of Afghanistan",
- "header": [
- "Province",
- "Map #",
- "ISO 3166-2:AF",
- "Centers",
- "Population",
- "Area (km\u00b2)",
- "Language",
- "Notes",
- "U.N. Region"
- ],
- "rows": [
- [
- "Badakhshan",
- "30",
- "AF-BDS",
- "Fayzabad",
- "823,000",
- "44,059",
- "Dari Persian,",
- "Composed of 29 districts",
- "North East Afghanistan"
- ],
- [
- "Badghis",
- "4",
- "AF-BDG",
- "Qala i Naw",
- "429,500",
- "20,591",
- "Dari Persian, Pashto",
- "7 districts",
- "West Afghanistan"
- ],
- [
- "Baghlan",
- "19",
- "AF-BGL",
- "Puli Khumri",
- "779,000",
- "21,118",
- "Dari Persian Uzbek, Turkmen, Pashto",
- "16 districts",
- "North East Afghanistan"
- ],
- [
- "Balkh",
- "13",
- "AF-BAL",
- "Mazari Sharif",
- "1,096,100",
- "17,249",
- "Dari Persian,",
- "15 districts",
- "North Afghanistan"
- ],
- [
- "Bamyan",
- "15",
- "AF-BAM",
- "Bamyan",
- "387,300",
- "14,175",
- "Dari Persian",
- "7 districts",
- "East Central Afghanistan"
- ],
- [
- "Daykundi",
- "10",
- "AF-DAY",
- "Nili",
- "399,600",
- "8,088",
- "Dari Persian",
- "8 districts\\nFormed from Or\u016bzg\u0101n Province in 2004",
- "South West Afghanistan"
- ],
- [
- "Farah",
- "2",
- "AF-FRA",
- "Farah",
- "438,000",
- "48,471",
- "Pashto, Dari Persian, Baloch",
- "11 districts",
- "West Afghanistan"
- ],
- [
- "Faryab",
- "5",
- "AF-FYB",
- "Maymana",
- "858,600",
- "20,293",
- "Dari Persian, Turkmen and Uzbek",
- "14 districts",
- "West Afghanistan"
- ],
- [
- "Ghazni",
- "16",
- "AF-GHA",
- "Ghazni",
- "931,000",
- "22,915",
- "Pashto, Dari Persian",
- "19 districts",
- "South Afghanistan"
- ],
- [
- "Ghor",
- "6",
- "AF-GHO",
- "Chaghcharan",
- "485,000",
- "36,479",
- "Dari Persian",
- "10 districts",
- "East Central Afghanistan"
- ],
- [
- "Helmand",
- "7",
- "AF-HEL",
- "Lashkar Gah",
- "745,000",
- "58,584",
- "Pashto,Baloch",
- "13 districts",
- "South West Afghanistan"
- ],
- [
- "Herat",
- "1",
- "AF-HER",
- "Herat",
- "1,182,000",
- "54,778",
- "Dari Persian, Pashto",
- "15 districts",
- "West Afghanistan"
- ],
- [
- "Jowzjan",
- "8",
- "AF-JOW",
- "Sheberghan",
- "441,000",
- "11,798",
- "Uzbek, Dari Persian",
- "9 districts",
- "North Afghanistan"
- ],
- [
- "Kabul",
- "22",
- "AF-KAB",
- "Kabul",
- "3,314,000",
- "4,462",
- "Pashto, Dari Persian, Turkmen, Baloch and Uzbek",
- "15 districts",
- "Central Afghanistan"
- ],
- [
- "Kandahar",
- "12",
- "AF-KAN",
- "Kandahar",
- "886,000",
- "54,022",
- "Pashto, Dari Persian,Baloch",
- "16 districts",
- "South West Afghanistan"
- ],
- [
- "Kapisa",
- "29",
- "AF-KAP",
- "Mahmud-i-Raqi",
- "360,000",
- "1,842",
- "Dari Persian",
- "7 districts",
- "Central Afghanistan"
- ],
- [
- "Khost",
- "26",
- "AF-KHO",
- "Khost",
- "300,000",
- "4,152",
- "Pashto",
- "13 districts",
- "South Afghanistan"
- ],
- [
- "Kunar",
- "34",
- "AF-KNR",
- "Asadabad",
- "321,000",
- "4,942",
- "Pashto",
- "15 districts",
- "East Afghanistan"
- ],
- [
- "Kunduz",
- "18",
- "AF-KDZ",
- "Kunduz",
- "820,000",
- "8,040",
- "Dari Persian, Uzbek, Turkmen, Pashto",
- "7 districts",
- "North East Afghanistan"
- ],
- [
- "Laghman",
- "32",
- "AF-LAG",
- "Mihtarlam District",
- "373,000",
- "3,843",
- "Pashto, Dari Persian and Pashai",
- "5 districts",
- "East Afghanistan"
- ],
- [
- "Lowgar",
- "23",
- "AF-LOW",
- "Pul-i-Alam",
- "292,000",
- "3,880",
- "Dari Persian, Pashto",
- "7 districts",
- "Central Afghanistan"
- ],
- [
- "Nangarhar",
- "33",
- "AF-NAN",
- "Jalalabad",
- "1,089,000",
- "7,727",
- "Pashto, Dari Persian",
- "23 districts",
- "East Afghanistan"
- ],
- [
- "Nimruz",
- "3",
- "AF-NIM",
- "Zaranj",
- "149,000",
- "41,005",
- "Balochi, Dari Persian and Balochi",
- "5 districts",
- "South West Afghanistan"
- ],
- [
- "Nurestan",
- "31",
- "AF-NUR",
- "Parun",
- "112,000",
- "9,225",
- "Nuristani, Pashto, Dari Persian",
- "7 districts",
- "East Afghanistan"
- ],
- [
- "Or\u016bzg\u0101n",
- "11",
- "AF-ORU",
- "Tarin Kowt",
- "627,000",
- "22,696",
- "Pashto",
- "6 districts",
- "South West Afghanistan"
- ],
- [
- "Paktia",
- "24",
- "AF-PIA",
- "Gardez",
- "415,000",
- "6,432",
- "Pashto, Dari Persian",
- "11 districts",
- "South Afghanistan"
- ],
- [
- "Paktika",
- "25",
- "AF-PKA",
- "Sharan",
- "352,000",
- "19,482",
- "Pashto",
- "15 districts",
- "South Afghanistan"
- ],
- [
- "Panjshir",
- "28",
- "AF-PAN",
- "Bazarak",
- "128,620",
- "3,610",
- "Dari Persian",
- "5 districts\\nCreated in 2004 from Parwan Province",
- "Central Afghanistan"
- ],
- [
- "Parwan",
- "20",
- "AF-PAR",
- "Charikar",
- "726,000",
- "5,974",
- "Dari Persian",
- "Composed of 9 districts",
- "Central Afghanistan"
- ],
- [
- "Samangan",
- "14",
- "AF-SAM",
- "Aybak",
- "378,000",
- "11,262",
- "Dari Persian, Uzbek",
- "5 districts",
- "North Afghanistan"
- ],
- [
- "Sar-e Pol",
- "9",
- "AF-SAR",
- "Sar-e Pol",
- "468,000",
- "15,999",
- "Dari Persian and Uzbek",
- "6 districts",
- "North Afghanistan"
- ],
- [
- "Takhar",
- "27",
- "AF-TAK",
- "Taloqan",
- "810,800",
- "12,333",
- "Dari Persian, and Uzbek",
- "12 districts",
- "North East Afghanistan"
- ],
- [
- "Wardak",
- "21",
- "AF-WAR",
- "Meydan Shahr",
- "413,000",
- "8,938",
- "Pashto, Dari Persian",
- "9 districts",
- "Central Afghanistan"
- ],
- [
- "Zabul",
- "17",
- "AF-ZAB",
- "Qalat",
- "365,920",
- "17,343",
- "Pashto",
- "9 districts",
- "South West Afghanistan"
- ]
- ]
- },
- "sql": "SELECT COUNT ( `Province` ) FROM w WHERE `Province` != 'kabul' and `Notes` = ( SELECT `Notes` FROM w WHERE `Province` = 'kabul' )",
- "answer_text": [
- "4"
- ],
- "input_program_args": {
- "bridge_hints": "province ( kabul )\ncenters ( kabul )",
- "examples": [
- {
- "blendsql": "SELECT DISTINCT description, merchant FROM transactions WHERE\n {{Model('is this a pizza shop?', 'transactions::merchant', endpoint_name='gpt-4')}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'transactions::description',\n binary=0,\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shops in MA",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- },
- {
- "blendsql": "SELECT DISTINCT description, merchant, child_category, date FROM transactions WHERE\n {{Model('would this gift be appropriate for a child?', 'transactions::description')}} = 1\n AND child_category = 'Gifts'",
- "question": "Show gifts for my kid",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- }
- ],
- "program": "\n{{#system~}}\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{~/system}}\n\n{{#user~}}\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL:\n{{~/user}}\n\n{{#assistant~}}\n{{gen \"result\" temperature=0.0}}\n{{~/assistant}}\n",
- "question": "how many provinces have the same number of districts as kabul?",
- "serialized_db": "CREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"province\" TEXT,\n \"map #\" INTEGER,\n \"iso 3166-2:af\" TEXT,\n \"centers\" TEXT,\n \"population\" INTEGER,\n \"area (km\u00b2)\" INTEGER,\n \"language\" TEXT,\n \"notes\" TEXT,\n \"u.n. region\" TEXT\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id province map # iso 3166-2:af centers population area (km\u00b2) language notes u.n. region\n 0 0 badakhshan 30 af-bds fayzabad 823000 44059 dari persian, composed of 29 districts north east afghanistan\n 1 1 badghis 4 af-bdg qala i naw 429500 20591 dari persian, pashto 7 districts west afghanistan\n 2 2 baghlan 19 af-bgl puli khumri 779000 21118 dari persian uzbek, turkmen, pashto 16 districts north east afghanistan\n*/"
- },
- "db_path": "./research/tmp/874c2ef9-1a93-49e3-99c9-4b7f79949c74.db",
- "pred_text": [
- 34
- ],
- "pred_sql": "SELECT COUNT(DISTINCT province) FROM w WHERE\n {{Model('how many districts does this province have?', 'w::notes')}} = \n {{Model('how many districts does this province have?', 'w::notes', example_inputs='kabul')}}"
-}
-```
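-
-A tighter rewrite of the query above (a sketch, assuming identical `Model` calls are deduplicated so the mapping over `notes` is only computed once): map each province's notes to a district count, then compare against Kabul's value with a scalar subquery, excluding Kabul itself as the gold SQL does.
-```sql
-SELECT COUNT(province) FROM w
-WHERE province != 'kabul'
-AND {{Model('how many districts does this province have?', 'w::notes')}} =
-    (SELECT {{Model('how many districts does this province have?', 'w::notes')}}
-     FROM w WHERE province = 'kabul')
-```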
-4) **IMPORTANT** Didn't know what 'that year' means if we don't pass in the table header (the page title here is "1968 in Japanese football"); see the sketch after the JSON
-```json
-{
- "id": "nt-624",
- "question": "how many players made an appearance that year?",
- "table_id": "csv/203-csv/456.csv",
- "table": {
- "page_title": "1968 in Japanese football",
- "header": [
- "Player",
- "-1967",
- "03.30",
- "03.31",
- "04.04",
- "10.14",
- "1968",
- "Total"
- ],
- "rows": [
- [
- "Shigeo Yaegashi",
- "41(11)",
- "O",
- "O",
- "O",
- "O",
- "4(0)",
- "45(11)"
- ],
- [
- "Mitsuo Kamata",
- "37(2)",
- "O",
- "-",
- "O",
- "O",
- "3(0)",
- "40(2)"
- ],
- [
- "Masakatsu Miyamoto",
- "36(1)",
- "-",
- "O",
- "-",
- "O",
- "2(0)",
- "38(1)"
- ],
- [
- "Masashi Watanabe",
- "34(11)",
- "-",
- "O",
- "O",
- "-",
- "2(0)",
- "36(11)"
- ],
- [
- "Teruki Miyamoto",
- "33(15)",
- "O",
- "O",
- "O",
- "O",
- "4(0)",
- "37(15)"
- ],
- [
- "Ryuichi Sugiyama",
- "31(11)",
- "O",
- "O",
- "O(1)",
- "O",
- "4(1)",
- "35(12)"
- ],
- [
- "Hiroshi Katayama",
- "26(0)",
- "O",
- "-",
- "O",
- "O",
- "3(0)",
- "29(0)"
- ],
- [
- "Ryozo Suzuki",
- "22(0)",
- "O",
- "-",
- "O",
- "-",
- "2(0)",
- "24(0)"
- ],
- [
- "Kunishige Kamamoto",
- "17(21)",
- "O(2)",
- "O",
- "O(2)",
- "O(3)",
- "4(7)",
- "21(28)"
- ],
- [
- "Aritatsu Ogi",
- "16(5)",
- "O",
- "-",
- "O",
- "O",
- "3(0)",
- "19(5)"
- ],
- [
- "Yoshitada Yamaguchi",
- "16(0)",
- "O",
- "O",
- "O",
- "-",
- "3(0)",
- "19(0)"
- ],
- [
- "Kenzo Yokoyama",
- "16(0)",
- "O",
- "-",
- "O",
- "O",
- "3(0)",
- "19(0)"
- ],
- [
- "Hisao Kami",
- "13(0)",
- "O",
- "O",
- "-",
- "-",
- "2(0)",
- "15(0)"
- ],
- [
- "Takaji Mori",
- "9(1)",
- "O",
- "O",
- "O",
- "O",
- "4(0)",
- "13(1)"
- ],
- [
- "Ikuo Matsumoto",
- "7(1)",
- "O",
- "-",
- "-",
- "O",
- "2(0)",
- "9(1)"
- ],
- [
- "Yasuyuki Kuwahara",
- "5(3)",
- "-",
- "O(1)",
- "-",
- "O",
- "2(1)",
- "7(4)"
- ],
- [
- "Takeo Kimura",
- "3(1)",
- "O",
- "O",
- "-",
- "-",
- "2(0)",
- "5(1)"
- ],
- [
- "Kiyoshi Tomizawa",
- "3(0)",
- "-",
- "O",
- "-",
- "-",
- "1(0)",
- "4(0)"
- ],
- [
- "Koji Funamoto",
- "1(0)",
- "-",
- "O",
- "-",
- "-",
- "1(0)",
- "2(0)"
- ],
- [
- "Junji Kawano",
- "0(0)",
- "-",
- "O",
- "-",
- "-",
- "1(0)",
- "1(0)"
- ]
- ]
- },
- "sql": "SELECT COUNT ( `Player` ) FROM w",
- "answer_text": [
- "20"
- ],
- "input_program_args": {
- "bridge_hints": "",
- "examples": [
- {
- "blendsql": "SELECT DISTINCT description, merchant FROM transactions WHERE\n {{Model('is this a pizza shop?', 'transactions::merchant', endpoint_name='gpt-4')}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'transactions::description',\n binary=0,\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shops in MA",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- },
- {
- "blendsql": "SELECT DISTINCT description, merchant, child_category, date FROM transactions WHERE\n {{Model('would this gift be appropriate for a child?', 'transactions::description')}} = 1\n AND child_category = 'Gifts'",
- "question": "Show gifts for my kid",
- "serialized_db": "CREATE TABLE \"transactions\" (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n)"
- }
- ],
- "program": "\n{{#system~}}\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{~/system}}\n\n{{#user~}}\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL:\n{{~/user}}\n\n{{#assistant~}}\n{{gen \"result\" temperature=0.0}}\n{{~/assistant}}\n",
- "question": "how many players made an appearance that year?",
- "serialized_db": "CREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"player\" TEXT,\n \"-1967\" TEXT,\n \"03.30\" TEXT,\n \"03.31\" TEXT,\n \"04.04\" TEXT,\n \"10.14\" TEXT,\n \"1968\" TEXT,\n \"total\" TEXT\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id player -1967 03.30 03.31 04.04 10.14 1968 total\n 0 0 shigeo yaegashi 41(11) o o o o 4(0) 45(11)\n 1 1 mitsuo kamata 37(2) o none o o 3(0) 40(2)\n 2 2 masakatsu miyamoto 36(1) none o none o 2(0) 38(1)\n*/"
- },
- "db_path": "./research/tmp/26258ef4-49b5-406a-84c0-50271e2aa092.db",
- "pred_text": [
- 12
- ],
- "pred_sql": "SELECT COUNT(DISTINCT player) FROM w WHERE {{Model('did this player make an appearance this year?', 'w::total')}} = 1"
-}
-```
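-
-Prepending the page title to the serialization would likely resolve the ambiguity; the serialized schema in example 5 below carries exactly this kind of `Table Description:` line. A sketch of the disambiguated schema string:
-```
-Table Description: 1968 in Japanese football
-CREATE TABLE "w" (
-  "index" INTEGER, "row_id" INTEGER, "player" TEXT,
-  "-1967" TEXT, "03.30" TEXT, "03.31" TEXT, "04.04" TEXT,
-  "10.14" TEXT, "1968" TEXT, "total" TEXT
-)
-```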
-5) Unnecessary use of BlendSQL (the prediction is correct, but plain SQL, sketched after the JSON, would have sufficed)
-```json
-{
- "id": "nt-683",
- "question": "what is the total of deaths in internment camps and during the wartime flight?",
- "table_id": "csv/203-csv/234.csv",
- "table": {
- "page_title": "Demographic estimates of the flight and expulsion of Germans",
- "header": [
- "Description",
- "Total",
- "Poland",
- "Baltic States(Resettled in Poland during war)",
- "Resettled in Poland during war(from USSR and Romania)",
- "Sudetendeutsch",
- "SE Europe\\n(Hungary, Romania, Yugoslavia & Slovakia)"
- ],
- "rows": [
- [
- "Total Population Before Flight & Expulsion",
- "16,199,086",
- "11,038,826",
- "145,615",
- "365,622",
- "3,160,216",
- "1,488,807"
- ],
- [
- "Violent Deaths",
- "58,256",
- "44,603",
- "383",
- "747",
- "5,596",
- "6,927"
- ],
- [
- "Suicides",
- "14,356",
- "10,330",
- "157",
- "84",
- "3,411",
- "374"
- ],
- [
- "Deported( Forced Labor USSR)",
- "49,542",
- "32,947",
- "1,566",
- "6,465",
- "705",
- "7,859"
- ],
- [
- "In Internment Camps",
- "80,522",
- "27,847",
- "952",
- "1,037",
- "6,615",
- "44,071"
- ],
- [
- "During the wartime Flight",
- "93,283",
- "86,860",
- "2,394",
- "738",
- "629",
- "2,662"
- ],
- [
- "In the course of the Expulsions",
- "63,876",
- "57,814",
- "3,510",
- "561",
- "1,481",
- "510"
- ],
- [
- "Cause undetermined",
- "112,612",
- "106,991",
- "64",
- "3,116",
- "379",
- "2,062"
- ],
- [
- "Other Misc.",
- "566",
- "-",
- "38",
- "141",
- "73",
- "314"
- ],
- [
- "Total Confirmed Deaths",
- "473,013",
- "367,392",
- "9,064",
- "12,889",
- "18,889",
- "64,779"
- ],
- [
- "Total Unsolved Cases",
- "1,905,991",
- "1,404,993",
- "19,374",
- "73,794",
- "287,169",
- "120,661"
- ]
- ]
- },
- "sql": "SELECT SUM ( `Total` ) FROM w WHERE `Description` IN ( 'in internment camps' , 'during the wartime flight' )",
- "answer_text": [
- "173,805"
- ],
- "input_program_args": {
- "bridge_hints": "description ( during the wartime flight , in internment camps )",
- "examples": [
- {
- "blendsql": "SELECT year FROM w WHERE opponent = 'elizabeth matory'",
- "question": "What year was Elizabeth Matory the opponent of Charles Albert Ruppersberger?",
- "serialized_db": "\nTable Description: Dutch Ruppersberger (Electoral history)\nCREATE TABLE w(\n\trow_id int,\n\tyear int,\n\toffice text,\n\telection text,\n\tfilledcolumnname real,\n\tsubject text,\n\tparty text,\n\tvotes text,\n\t% text,\n\tfilledcolumnname_2 real,\n\topponent text,\n\tparty_2 text,\n\tvotes_2 text,\n\t%_2 text)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3;\nrow_id\tyear\toffice\telection\tfilledcolumnname\tsubject\tparty\tvotes\t%\tfilledcolumnname_2\topponent\tparty_2\tvotes_2\t%_2\n0\t1994\tbaltimore county executive\tgeneral\tnan\tdutch ruppersberger\tdemocratic\tn/a\tn/a\tnan\tn/a\tn/a\tn/a\tn/a\n1\t1998\tbaltimore county executive\tgeneral\tnan\tdutch ruppersberger\tdemocratic\t166482\t70.47\tnan\tjohn j. bishop\trepublican\t69449\t29.4\n2\t2002\tnone\tgeneral\tnan\tdutch ruppersberger\tdemocratic\t105718\t54.16\tnan\thelen delich bentley\trepublican\t88954\t45.57\n*/\n"
- },
- {
- "blendsql": "SELECT system FROM w WHERE system IN ('game boy advance', 'msx (japan only)') ORDER BY japan LIMIT 1",
- "question": "Which system has a lower number for Japan of the virtual console systems: Game Boy Advance or the Japan-only console MSX?",
- "serialized_db": "\nTable Description: Virtual Console (Titles)\nCREATE TABLE w (\n\trow_id int,\n\tsystem text,\n\tjapan int,\n\t[[list of virtual console games for wii u (north america)|north america]] real,\n\tpal region - europe real,\n\tpal region - australia real\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3;\nrow_id\tsystem\tjapan\t[[list of virtual console games for wii u (north america)|north america]]\tpal region - europe\tpal region - australia\n0\tnes/famicom\t148\t94.0\t89.0\t89.0\n1\tsuper nes/super famicom\t101\t51.0\t49.0\t49.0\n2\tnintendo 64\t22\t21.0\t21.0\t21.0\n*/ \n "
- },
- {
- "blendsql": "SELECT player FROM w WHERE \n {{\n Model('Has crossed swords on its logo?'; 'w::gold coast titans')\n }} = 1\n ",
- "question": "What player was transferred from the team that has crossed swords on its logo to the Warrington Wolves in the 2018 season?",
- "serialized_db": "\nTable Description: 2018 Warrington Wolves season (Transfers | In)\nCREATE TABLE w(\n\trow_id int,\n\tplayer text,\n\tsigned from text,\n\tcontract length text,\n\tannounced text)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3;\nrow_id\tplayer\tsigned from\tcontract length\tannounced\n0\tsitaleki akauola\tpenrith panthers\tp2y\t2017-08-01 00:00:00\n1\tbryson goodwin\tsouth sydney rabbitohs\tp2y\t2017-10-01 00:00:00\n2\ttyrone roberts\tgold coast titans\tp3y\t2017-10-01 00:00:00\n*/\nCREATE TABLE Images(\n\trow_id int,\n\tgold coast titans text)\n/*\nAll rows of the table:\nSELECT * FROM w;\nrow_id\tgold coast titans\n0\ta logo for the golden knights is painted on the beach.\n*/\n"
- },
- {
- "blendsql": "SELECT DISTINCT merchant FROM w WHERE\n {{Model('is this a pizza shop?', 'w::merchant'}} = 1\n AND {{\n Model(\n 'what state is this transaction from? Choose -1 when N.A.',\n 'w::description',\n example_outputs='TX;CA;MA;-1;'\n )\n }} = 'MA'\n AND parent_category = 'Food'",
- "question": "Pizza shop merchants in MA",
- "serialized_db": "\nTable Description: transactions\nCREATE TABLE w (\n \"index\" INTEGER,\n \"description\" TEXT,\n \"amount\" REAL,\n \"merchant\" TEXT,\n \"cash_flow\" TEXT,\n \"parent_category\" TEXT,\n \"child_category\" TEXT,\n \"date\" TEXT,\n/*\n3 example rows:\nSELECT * FROM transactions LIMIT 3\n index description amount merchant cash_flow parent_category child_category date confidence_score\n 0 Zelle payment from JUSTIN ORENIC BACucw5n1t14 2000.0 Zelle Expense Cash/ATM Online Payment Service 2022-06-20 0.698891\n 1 Zelle Transfer Conf UJ7QJF7TE Emilia Galindo 1500.0 Zelle Expense Cash/ATM Online Payment Service 2022-01-16 0.701658\n 2 Zelle payment from SOPHIA YANEZ WFCT0Q9R3CNR 1474.0 Zelle Expense Cash/ATM Online Payment Service 2022-03-27 0.861237\n*/\n)"
- }
- ],
- "program": "\nGenerate BlendSQL given the question, table, passages, image captions to answer the question correctly.\nBlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.\nThese external ingredients should be wrapped in double curly brackets.\n\nIf question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column with clean content by a new grammar:\n `Model('question', '{table}::{column})'`\n\nExamples:\n{{~#each examples}}\n{{this.serialized_db}}\nQuestion: {{this.question}}\nBlendSQL: {{this.blendsql}}\n{{/each}}\n\n{{serialized_db}}\n\n{{#if bridge_hints}}\nHere are some values that may be helpful:\n{{bridge_hints}}\n{{/if}}\n\nQuestion: {{question}}\nBlendSQL: {{gen \"result\" temperature=0.0}}\n",
- "question": "what is the total of deaths in internment camps and during the wartime flight?",
- "serialized_db": "Table Description: Demographic estimates of the flight and expulsion of Germans\nCREATE TABLE \"w\" (\n\"index\" INTEGER,\n \"row_id\" INTEGER,\n \"description\" TEXT,\n \"total\" INTEGER,\n \"poland\" TEXT,\n \"baltic states(resettled in poland during war)\" INTEGER,\n \"resettled in poland during war(from ussr and romania)\" INTEGER,\n \"sudetendeutsch\" INTEGER,\n \"se europe\\n(hungary, romania, yugoslavia & slovakia)\" INTEGER\n)\n\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index row_id description total poland baltic states(resettled in poland during war) resettled in poland during war(from ussr and romania) sudetendeutsch se europe\\n(hungary, romania, yugoslavia & slovakia)\n 0 0 total population before flight & expulsion 16199086 11038826 145615 365622 3160216 1488807\n 1 1 violent deaths 58256 44603 383 747 5596 6927\n 2 2 suicides 14356 10330 157 84 3411 374\n*/"
- },
- "db_path": "./research/tmp/431a7265-d77f-4fdc-b506-3293988744ce.db",
- "pred_text": [
- 173805
- ],
- "pred_sql": " SELECT SUM(\"total\") FROM w WHERE \n {{\n Model('is this a death during the wartime flight or in internment camps?', 'w::description', example_outputs='during the wartime flight;in internment camps;')\n }} IN ('during the wartime flight', 'in internment camps')",
- "pred_has_ingredient": true
-}
-```
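-
-Since the bridge hints already surfaced the exact `description` values, plain SQLite (essentially the gold query over the lowercased schema) is all that was needed here:
-```sql
-SELECT SUM(total) FROM w
-WHERE description IN ('in internment camps', 'during the wartime flight')
-```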
\ No newline at end of file
diff --git a/research/__init__.py b/research/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/research/configs/fetaqa/get-train.json b/research/configs/fetaqa/get-train.json
deleted file mode 100644
index 5202e65e..00000000
--- a/research/configs/fetaqa/get-train.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
- "max_val_samples": 200,
- "overwrite_cache": false,
- "overwrite_output_dir": true,
- "use_bridge_encoder": false,
- "bypass_models": true,
- "parser_model_name_or_path": "gpt-4",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": false,
- "do_predict": false,
- "do_train": true,
- "dataset": "fetaqa",
- "fetaqa_dataset_url": "fetaqa.zip",
- "long_answer": true,
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/fetaqa-test/train-set",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/feverous/blendsql.json b/research/configs/feverous/blendsql.json
deleted file mode 100644
index f930d73c..00000000
--- a/research/configs/feverous/blendsql.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "parser_model_name_or_path": "gpt-4",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "feverous",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/new-paper-results/feverous/gpt-4-blendsql-only",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/feverous/get-train.json b/research/configs/feverous/get-train.json
deleted file mode 100644
index 782ed601..00000000
--- a/research/configs/feverous/get-train.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "max_val_samples": 200,
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "bypass_models": true,
- "parser_model_name_or_path": "gpt-4",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": false,
- "do_predict": false,
- "do_train": true,
- "dataset": "feverous",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/feverous-test/train-set",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/hybridqa/blendsql-deepseek-33b-instruct-GPTQ.json b/research/configs/hybridqa/blendsql-deepseek-33b-instruct-GPTQ.json
deleted file mode 100644
index aba53152..00000000
--- a/research/configs/hybridqa/blendsql-deepseek-33b-instruct-GPTQ.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "overwrite_cache": false,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "parser_model_name_or_path": "dlsg-ds-deepseek-coder-33b-instruct-GPTQ-v1-1-ept",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "hybridqa",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/new-paper-results/hybridqa/gpt-4-blendsql-deepseek-coder-33b-GPTQ",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/hybridqa/blendsql-deepseek-coder-7b.json b/research/configs/hybridqa/blendsql-deepseek-coder-7b.json
deleted file mode 100644
index 8cc7d205..00000000
--- a/research/configs/hybridqa/blendsql-deepseek-coder-7b.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "overwrite_cache": false,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "parser_model_name_or_path": "dlsg-ds-deepseek-coder-7b-instruct-v1-1-ept",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "hybridqa",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/new-paper-results/hybridqa/gpt-4-blendsql-deepseek-coder-temp0",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/hybridqa/blendsql-starcoder2-15b-instruct.json b/research/configs/hybridqa/blendsql-starcoder2-15b-instruct.json
deleted file mode 100644
index b758186d..00000000
--- a/research/configs/hybridqa/blendsql-starcoder2-15b-instruct.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "parser_model_name_or_path": "dlsg-ds-starcoder2-15b-instruct-v1-fp16-ept",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "hybridqa",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/new-paper-results/hybridqa/gpt-4-blendsql-starcoder2-instruct",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/hybridqa/blendsql-starcoder2-15b.json b/research/configs/hybridqa/blendsql-starcoder2-15b.json
deleted file mode 100644
index 658ba85f..00000000
--- a/research/configs/hybridqa/blendsql-starcoder2-15b.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "parser_model_name_or_path": "dlsg-ds-starcoder2-15b-v1-fp16-ept",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "hybridqa",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/new-paper-results/hybridqa/gpt-4-blendsql-starcoder2-temp0",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/hybridqa/blendsql.json b/research/configs/hybridqa/blendsql.json
deleted file mode 100644
index b32b2e11..00000000
--- a/research/configs/hybridqa/blendsql.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "parser_model_name_or_path": "gpt-4",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "hybridqa",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/new-paper-results/hybridqa/gpt-4-blendsql-only-run5-no-list-options ",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/hybridqa/get-train.json b/research/configs/hybridqa/get-train.json
deleted file mode 100644
index 8dbfe8cd..00000000
--- a/research/configs/hybridqa/get-train.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "max_val_samples": 200,
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "bypass_models": true,
- "parser_model_name_or_path": "gpt-4",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": false,
- "do_predict": false,
- "do_train": true,
- "dataset": "hybridqa",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/hybridqa-test/train-set",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/hybridqa/testing-blendsql-plus-pp.json b/research/configs/hybridqa/testing-blendsql-plus-pp.json
deleted file mode 100644
index ea01601c..00000000
--- a/research/configs/hybridqa/testing-blendsql-plus-pp.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "max_val_samples": 200,
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "parser_model_name_or_path": "gpt-4",
- "blender_model_name_or_path": "gpt-4",
- "prompt_and_pray_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": true,
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "hybridqa",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/hybridqa-test/blendsql-ablation-test/gpt-4-blendsql-plus-pp",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/hybridqa/testing-pp.json b/research/configs/hybridqa/testing-pp.json
deleted file mode 100644
index 03675708..00000000
--- a/research/configs/hybridqa/testing-pp.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
- "max_val_samples": 200,
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "prompt_and_pray_only": true,
- "parser_model_name_or_path": "",
- "prompt_and_pray_model_name_or_path": "gpt-4",
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "hybridqa",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/hybridqa-test/blendsql-ablation-test/gpt-4-pp-only",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/ottqa/blendsql.json b/research/configs/ottqa/blendsql.json
deleted file mode 100644
index 06fbe805..00000000
--- a/research/configs/ottqa/blendsql.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "overwrite_cache": false,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "parser_model_name_or_path": "gpt-4",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": true,
- "do_predict": false,
- "do_train": false,
- "dataset": "ottqa",
- "schema_qualify": false,
- "num_serialized_rows": 5,
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/new-paper-results/ottqa/gpt-4-blendsql-only-3-tables",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/configs/ottqa/get-train.json b/research/configs/ottqa/get-train.json
deleted file mode 100644
index 1632b260..00000000
--- a/research/configs/ottqa/get-train.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
- "max_val_samples": 200,
- "overwrite_cache": true,
- "overwrite_output_dir": true,
- "use_bridge_encoder": true,
- "bypass_models": true,
- "parser_model_name_or_path": "gpt-4",
- "blender_model_name_or_path": "gpt-4",
- "fallback_to_prompt_and_pray": false,
- "run_name": "base",
- "do_eval": false,
- "do_predict": false,
- "do_train": true,
- "dataset": "ottqa",
- "ottqa_dataset_url": "ottqa.zip",
- "source_prefix": "",
- "schema_serialization_type": "code",
- "normalize_query": true,
- "output_dir": "./research/outputs/ottqa-test/train-set",
- "report_to": []
-}
\ No newline at end of file
diff --git a/research/constants.py b/research/constants.py
deleted file mode 100644
index 33ad848a..00000000
--- a/research/constants.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from dataclasses import dataclass
-
-
-@dataclass
-class EvalField:
- QUESTION = "question"
- GOLD_ANSWER = "gold_answer"
- PREDICTION = "prediction"
- PRED_BLENDSQL = "pred_blendsql"
- UID = "uid"
- DB_PATH = "db_path"
-
-
-SINGLE_TABLE_NAME = "w"
-DOCS_TABLE_NAME = "documents"
-CREATE_VIRTUAL_TABLE_CMD = f"CREATE VIRTUAL TABLE {DOCS_TABLE_NAME} USING fts5(title, content, tokenize = 'trigram');"
diff --git a/research/datasets/fetaqa/evaluator.py b/research/datasets/fetaqa/evaluator.py
deleted file mode 100644
index b2d27e92..00000000
--- a/research/datasets/fetaqa/evaluator.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from datasets import load_metric
-import nltk
-
-
-# the code below is adapted from https://github.com/Yale-LILY/FeTaQA/blob/main/end2end/train.py
-def postprocess_text(preds, labels, metric_name):
- preds = [pred.strip() for pred in preds]
- labels = [label.strip() for label in labels]
-
- # rougeLSum expects newline after each sentence
- if metric_name == "rouge":
- preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
- labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
- elif metric_name == "sacrebleu": # sacrebleu
- labels = [[label] for label in labels]
- elif metric_name == "bleu":
- preds = [pred.split(" ") for pred in preds]
- labels = [[label.split(" ")] for label in labels]
- else:
- pass
-
- return preds, labels
-
-
-class EvaluateTool(object):
- def __init__(self, args):
- self.args = args
-
- def evaluate(self, preds, golds, section):
- summary = {}
-
- gold_text = [item["seq_out"] for item in golds]
-
- assert len(preds) == len(gold_text)
-
- metric_list = []
- if section in ["train", "dev"]:
- metric_list = ["sacrebleu"]
- elif section == "test":
- metric_list = ["sacrebleu", "rouge", "meteor", "bertscore", "bleurt"]
-
- for metric_name in metric_list:
- metric = load_metric(metric_name)
- processed_preds, processed_golds = postprocess_text(
- preds, gold_text, metric_name
- )
-
- if metric_name == "bertscore":
- res = metric.compute(
- predictions=processed_preds, references=processed_golds, lang="en"
- )
- for k, v in res.items():
- if k == "hashcode":
- continue
- summary[f"{metric_name}_{k}"] = round(1.0 * sum(v) / len(v), 2)
-
- else:
- res = metric.compute(
- predictions=processed_preds, references=processed_golds
- )
- if metric_name == "sacrebleu":
- summary[metric_name] = (
- res["score"] * 0.01
- ) # limit it to range of [0, 1] for unifying
- elif metric_name == "bleurt":
- summary["bleurt"] = round(
- 1.0 * sum(res["scores"]) / len(res["scores"]), 2
- )
- elif metric_name == "rouge":
- for sub_metric_name in res.keys():
- for i, key in enumerate(["precision", "recall", "fmeasure"]):
- summary["{}_{}".format(sub_metric_name, key)] = res[
- sub_metric_name
- ][1][i]
-                            # this is the f-measure ('f-score') from the mid ('mean aggregation')
- else:
- summary[metric_name] = res[metric_name]
- return summary
-
-
-if __name__ == "__main__":
- import json
-
- with open("predictions_predict.json") as f:
- test_data = json.load(f)
- preds = [item["prediction"] for item in test_data]
- evaluator = EvaluateTool(args=None)
- score = evaluator.evaluate(preds, test_data, section="test")
- print(score)
diff --git a/research/datasets/fetaqa/fetaqa.py b/research/datasets/fetaqa/fetaqa.py
deleted file mode 100644
index 7f334f3a..00000000
--- a/research/datasets/fetaqa/fetaqa.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Datasets Authors, The Google AI Language Team Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# TODO: This code can be pushed to HuggingFace as a new contribution.
-"""FeTaQA, a Free-form Table Question Answering dataset"""
-import os
-import json
-import datasets
-
-from research.constants import EvalField
-
-# Find for instance the citation on arxiv or on the dataset repo/website
-_CITATION = """\
-@article{nan2021fetaqa,
- title={FeTaQA: Free-form Table Question Answering},
- author={Nan, Linyong and Hsieh, Chiachun and Mao, Ziming and Lin, Xi Victoria and Verma, Neha and Zhang, Rui and Kry{\'s}ci{\'n}ski, Wojciech and Schoelkopf, Nick and Kong, Riley and Tang, Xiangru and others},
- journal={arXiv preprint arXiv:2104.00369},
- year={2021}
-}
-"""
-
-_DESCRIPTION = """\
-FeTaQA is a Free-form Table Question Answering dataset with 10K Wikipedia-based
- {table, question, free-form answer, supporting table cells} pairs. It yields a
-more challenging table QA setting because it requires generating free-form text
- answers after retrieval, inference, and integration of multiple discontinuous
-facts from a structured knowledge source. Unlike datasets of generative QA over
- text in which answers are prevalent with copies of short text spans from the source,
- answers in our dataset are human-generated explanations involving entities and their
- high-level relations.
-
-This dataset is obtained from the official release of the FETAQA.
-"""
-
-_HOMEPAGE = "https://github.com/Yale-LILY/FeTaQA/"
-
-_LICENSE = "CC-BY-SA-4.0 License"
-
-_URL = "https://github.com/Yale-LILY/FeTaQA/raw/main/data/"
-_TRAINING_FILE = "fetaQA-v1_train.jsonl"
-_DEV_FILE = "fetaQA-v1_dev.jsonl"
-_TEST_FILE = "fetaQA-v1_test.jsonl"
-
-
-class FETAQA(datasets.GeneratorBasedBuilder):
- """The FETAQA dataset"""
-
- def __init__(
- self, *args, writer_batch_size=None, fetaqa_dataset_url=_URL, **kwargs
- ) -> None:
- super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
- self._url = fetaqa_dataset_url
-
- def _info(self):
- return datasets.DatasetInfo(
- description=_DESCRIPTION,
- features=datasets.Features(
- {
- EvalField.UID: datasets.Value("int32"),
- EvalField.QUESTION: datasets.Value("string"),
- "table_id": datasets.Value("string"),
- "table": {
- "header": datasets.features.Sequence(datasets.Value("string")),
- "rows": datasets.features.Sequence(
- datasets.features.Sequence(datasets.Value("string"))
- ),
- },
- "meta": datasets.Value("string"),
- EvalField.GOLD_ANSWER: datasets.Value("string"),
- }
- ),
- supervised_keys=None,
- homepage=_HOMEPAGE,
- license=_LICENSE,
- citation=_CITATION,
- )
-
- def _split_generators(self, dl_manager):
- """Returns SplitGenerators."""
- downloaded_files = dl_manager.download_and_extract(self._url)
- return [
- datasets.SplitGenerator(
- name=datasets.Split.TRAIN,
- gen_kwargs={"filepath": os.path.join(downloaded_files, _TRAINING_FILE)},
- ),
- datasets.SplitGenerator(
- name=datasets.Split.VALIDATION,
- gen_kwargs={"filepath": os.path.join(downloaded_files, _DEV_FILE)},
- ),
- datasets.SplitGenerator(
- name=datasets.Split.TEST,
- gen_kwargs={"filepath": os.path.join(downloaded_files, _TEST_FILE)},
- ),
- ]
-
- def _generate_examples(self, filepath):
- """Yields examples."""
- with open(filepath, encoding="utf-8") as f:
- for idx, line in enumerate(f):
- example = json.loads(line)
- yield idx, {
- EvalField.UID: example["feta_id"],
- EvalField.QUESTION: example["question"],
- "table_id": example["table_source_json"],
- "table": {
- "header": example["table_array"][0],
- "rows": example["table_array"][1:],
- },
- "meta": example["table_page_title"]
- + " | "
- + example["table_section_title"],
- EvalField.GOLD_ANSWER: example["answer"],
- }
diff --git a/research/datasets/feverous/feverous.py b/research/datasets/feverous/feverous.py
deleted file mode 100644
index 0b4a573e..00000000
--- a/research/datasets/feverous/feverous.py
+++ /dev/null
@@ -1,439 +0,0 @@
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import os
-import sqlite3
-from collections import defaultdict
-import pandas as pd
-import numpy as np
-from collections import Counter
-from typing import Dict, List
-
-import datasets
-from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags
-from research.constants import EvalField
-
-_CITATION = """\
-@article{aly2021feverous,
- title={FEVEROUS: Fact Extraction and VERification Over Unstructured and Structured information},
- author={Aly, Rami and Guo, Zhijiang and Schlichtkrull, Michael and Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Cocarascu, Oana and Mittal, Arpit},
- journal={arXiv preprint arXiv:2106.05707},
- year={2021}
-}
-"""
-
-_DESCRIPTION = """\
-This dataset is obtained from the official release of the FEVEROUS.
-"""
-
-_HOMEPAGE = "https://fever.ai/dataset/feverous.html"
-
-_LICENSE = ""
-
-_URL = "https://fever.ai/download/feverous/"
-_TRAINING_FILE = "feverous_train_challenges.jsonl"
-_DEV_FILE = "feverous_dev_challenges.jsonl"
-_DATABASE = "feverous-wiki-pages-db.zip"
-
-_URLS = {
- "train": f"{_URL}{_TRAINING_FILE}",
- "dev": f"{_URL}{_DEV_FILE}",
- "database": f"{_URL}{_DATABASE}",
-}
-
-EVIDENCE_TYPES = ["sentence", "cell", "header_cell", "table_caption", "item"]
-
-extractor = Extractor(0, "", [], "", "")
-
-
-def clean_markup(markup, keep_links=False, ignore_headers=True):
- """
- Clean Wikimarkup to produce plaintext.
-
- :param keep_links: Set to True to keep internal and external links
-    :param ignore_headers: if set to True, the output will not contain
-    headers, only body paragraphs
-
-    Returns the cleaned paragraphs joined into a single string.
- """
-
- if not keep_links:
- ignoreTag("a")
-
- # returns a list of strings
- paragraphs = extractor.clean_text(markup)
- resetIgnoredTags()
-
- if ignore_headers:
- paragraphs = filter(lambda s: not s.startswith("## "), paragraphs)
-
- return " ".join(list(paragraphs))
-
-
-def get_table_id(meta):
- """
- meta types:
- - table_caption_18
- - cell_18_1_1
- - header_cell_18_0_0
- """
- if meta.startswith("table_caption"):
- return meta.split("_")[-1]
- if meta.startswith("header_cell") or meta.startswith("cell"):
- return meta.split("_")[-3]
-
-
-def get_list_id(meta):
- """ "
- meta types:
- - item_4_25
- """
- return meta.split("_")[1]
-
-
-def set_first_row_as_header(df: pd.DataFrame):
- df.columns = df.iloc[0]
- df = df.iloc[1:]
- return df
-
-
-def format_nested_table_json(table_json: dict):
- """
- TODO: how to handle table 'type'?
- """
- # Create numpy array
- # row_start column_start
- # 0 0
- # 1 0
- tablename = None
- flattened_values = []
- start_indices_arr = []
- span_indices_arr = []
- for row in table_json["table"]:
- start_indices_arr.extend(
- [list(map(lambda x: int(x), item["id"].split("_")[-2:])) for item in row]
- )
- span_indices_arr.extend(
- [[int(item["column_span"]) - 1, int(item["row_span"]) - 1] for item in row]
- )
- flattened_values.extend([clean_markup(item["value"]) for item in row])
- start_indices_arr, span_indices_arr, flattened_values = (
- np.array(start_indices_arr),
- np.array(span_indices_arr),
- np.array(flattened_values),
- )
- n_rows = start_indices_arr[:, 0].max()
-
- naive_df_data = []
- to_add_queue = []
- for row in range(n_rows):
- row_entry = [to_add_queue.pop()] if len(to_add_queue) > 0 else []
- indices = np.where(start_indices_arr[:, 0] == row)[0]
- spans = span_indices_arr[indices]
- values = flattened_values[indices]
- for _pos in range(indices.shape[0]):
- for _ in range(spans[_pos][0] + 1):
- row_entry.append(values[_pos])
- for _idx in range(spans[_pos][1]):
- to_add_queue.append(values[_pos])
- naive_df_data.append(row_entry)
- naive_df = pd.DataFrame(naive_df_data)
- naive_df = naive_df.replace("", np.nan)
- naive_df = naive_df.ffill()
- naive_df = naive_df.fillna("")
- if len(naive_df.columns) == 2:
- # Transpose, so Model gets whole `attribute` context
- # naive_df.columns = ["attribute", "value"]
- naive_df = naive_df.T
- try:
- return set_first_row_as_header(naive_df)
- except:
- return naive_df
-
- # Simplest case: if less than 3 cells span multiple indices
- # But, if it has only 2 columns, use 'attribute', 'value' formatting
- is_simple_table = span_indices_arr[span_indices_arr > 0].shape[0] < 3
- if is_simple_table:
- if len(naive_df.columns) == 2:
- naive_df.columns = ["attribute", "value"]
- return (tablename, naive_df)
- try:
- return (tablename, set_first_row_as_header(naive_df))
- except:
- return (tablename, naive_df)
- try:
- reformatted_df = {}
- handled_rows = set()
- for idx, row in naive_df.iterrows():
- if idx in handled_rows:
- continue
- handled = False
- values_as_set = set(row.values)
- if len(values_as_set) == 1:
- # This should be tablename
- tablename = values_as_set.pop()
- continue
- for i in range(row.values.shape[0]):
- if handled or i == row.values.shape[0] - 1:
- break
- _values = list(dict.fromkeys(row.values[i:].tolist()))
- # Check if they have any words in common
- tokenized_overlapping_values = [i.split(" ") for i in _values]
- tokens_in_common: set = set.intersection(
- *map(set, tokenized_overlapping_values)
- )
- if len(tokens_in_common) > 0:
- # We have some tokens in common
- # Only get difference, and assign as column/values
- columnname = " ".join(tokens_in_common)
- values = [
- " ".join([tok for tok in item if tok not in tokens_in_common])
- for item in tokenized_overlapping_values
- ]
- reformatted_df[columnname] = values
- handled = True
- if not handled:
- # Check if values are repeated even number of times
- # E.g. ['Number', 'Percent', 'Number', 'Percent']
- values_counter = Counter(row.values)
- duplicate_values = {(k, v) for k, v in values_counter.items() if v > 1}
- if len(duplicate_values) > 1:
- evenly_duplicated_values = [i[0] for i in duplicate_values]
- num_duplications = [i[1] for i in duplicate_values][0]
- subtable = pd.DataFrame(naive_df.iloc[idx:, :])
- handled_rows.update(range(idx, len(naive_df)))
- subtable = set_first_row_as_header(subtable)
- seen_columns = set()
- for columnname in subtable.columns:
- if columnname in seen_columns:
- continue
- if columnname in evenly_duplicated_values:
- if columnname not in reformatted_df:
- reformatted_df[columnname] = []
- for _, row in subtable[columnname].T.iterrows():
- reformatted_df[columnname].extend(row.values.tolist())
- else:
- # Make this a new column too
- reformatted_df[columnname] = [
- i
- for i in subtable[columnname].tolist()
- if i != columnname
- ] * num_duplications
- seen_columns.add(columnname)
- handled = True
- max_v = max(len(v) for v in reformatted_df.values())
- for k, values in reformatted_df.items():
- if len(values) != max_v:
- assert max_v % len(values) == 0
- mult = max_v // len(values)
- multiplied_values = [
- x for xs in [[v] * mult for v in values] for x in xs
- ]
- reformatted_df[k] = multiplied_values
- return (tablename, pd.DataFrame(reformatted_df))
- except:
- try:
- if len(set(naive_df.iloc[0].values.tolist())) == 1:
- tablename = naive_df.iloc[0].values[0]
- naive_df = set_first_row_as_header(naive_df.iloc[1:, :])
- if len(set(naive_df.columns)) == 1 and len(naive_df.columns) == 2:
- tablename = f"{tablename} - {naive_df.columns[0]}"
- naive_df.columns = ["Attribute", "Value"]
- except:
- pass
- return (tablename, naive_df)
-
-
-def retrieve_context(example, cur):
- pages = {}
- evidences = []
-    # Collect all pages referenced by the evidence
- """
- meta types:
- - table_caption_18
- - cell_18_1_1
- - header_cell_18_0_0
- - sentence_0
- - item_4_25
- """
- tables = []
- for evidence in example["evidence"][:1]:
- content = evidence["content"]
- for item in content:
- # Example: 'Michael Folivi_header_cell_1_0_0'
- # page_id = Michael Folivi
- # meta = header_cell_1_0_0
- page_id, meta = item.split("_", 1)
- if page_id not in pages:
- data = cur.execute(
- """
- SELECT data FROM wiki WHERE id = "{}"
- """.format(
- page_id
- )
- )
- for item in data.fetchall():
- pages[page_id] = json.loads(item[0])
- if (
- meta.startswith("table_caption")
- or meta.startswith("cell")
- or meta.startswith("header_cell")
- ):
- table_id = get_table_id(meta)
- if table_id in tables:
- continue
- else:
- tables.append(table_id)
- table_json = pages[page_id]["table_{}".format(table_id)]
- evidences.append({"table": table_json, "tablename": page_id})
- elif meta.startswith("item"):
- list_id = get_list_id(meta)
- context = None
- for item in pages[page_id]["list_{}".format(list_id)]["list"]:
- if item["id"] == meta:
- context = item["value"]
- if context is not None:
- evidences.append(
- {"content": clean_markup(context), "title": page_id}
- )
- else:
- context = pages[page_id][meta]
- evidences.append({"content": clean_markup(context), "title": page_id})
-
- table_list, context_list = [], []
- title_to_content: Dict[str, List[str]] = {}
- for evidence in evidences:
- if "table" in evidence:
- df = format_nested_table_json(evidence["table"])
- df_dict = df.to_dict(orient="split")
- table_list.append(
- {
- "header": df_dict["columns"],
- "rows": df_dict["data"],
- "table_description": evidence["tablename"],
- }
- )
- else:
- if evidence["title"] not in title_to_content:
- title_to_content[evidence["title"]] = []
- title_to_content[evidence["title"]].append(evidence["content"])
- context_list.extend(
- [{"title": k, "content": " ".join(v)} for k, v in title_to_content.items()]
- )
- # Remove overlaps
- filtered_context_list = []
- context_list_titles = [item["title"] for item in context_list]
- for title in set(context_list_titles):
- content_candidates = []
- for item in context_list:
- if item["title"] == title:
- content_candidates.append(item["content"])
- chosen_content = sorted(content_candidates, key=len, reverse=True)[0]
- filtered_context_list.append({"title": title, "content": chosen_content})
- return table_list, filtered_context_list
-
-
-def is_table_involved(example):
-    # Check if the example involves a table.
- # We only consider the first evidence
- for evidence in example["evidence"][:1]: # list
- is_valid = False
- content = evidence["content"]
- evidence_type_count = defaultdict(int)
- for item in content:
- page_id, meta = item.split("_", 1)
- for evidence_type in EVIDENCE_TYPES:
- if meta.startswith(evidence_type):
- evidence_type_count[evidence_type] += 1
- for evidence_type in evidence_type_count:
- if evidence_type in ["cell", "header_cell", "table_caption"]:
- is_valid = True
- if is_valid:
- return True
- return False
-
-
-class FEVEROUS(datasets.GeneratorBasedBuilder):
- """The FEVEROUS dataset"""
-
- def _info(self):
- return datasets.DatasetInfo(
- description=_DESCRIPTION,
- features=datasets.Features(
- {
- EvalField.UID: datasets.Value("string"),
- EvalField.QUESTION: datasets.Value("string"),
- "table": datasets.features.Sequence(
- {
- "header": datasets.features.Sequence(
- datasets.Value("string")
- ),
- "rows": datasets.features.Sequence(
- datasets.features.Sequence(datasets.Value("string"))
- ),
- "table_description": datasets.Value("string"),
- }
- ),
- "context": datasets.features.Sequence(
- {
- "title": datasets.Value("string"),
- "content": datasets.Value("string"),
- }
- ),
- EvalField.GOLD_ANSWER: datasets.Value("string"),
- }
- ),
- supervised_keys=None,
- homepage=_HOMEPAGE,
- license=_LICENSE,
- citation=_CITATION,
- )
-
- def _split_generators(self, dl_manager):
- """Returns SplitGenerators."""
- downloaded_files = dl_manager.download_and_extract(_URLS)
-
- return [
- datasets.SplitGenerator(
- name=datasets.Split.TRAIN,
- gen_kwargs={
- "filepath": downloaded_files["train"],
- "database": os.path.join(
- downloaded_files["database"], "feverous_wikiv1.db"
- ),
- },
- ),
- datasets.SplitGenerator(
- name=datasets.Split.VALIDATION,
- gen_kwargs={
- "filepath": downloaded_files["dev"],
- "database": os.path.join(
- downloaded_files["database"], "feverous_wikiv1.db"
- ),
- },
- ),
- ]
-
- def _generate_examples(self, filepath, database):
- con = sqlite3.connect(database)
- cur = con.cursor()
- with open(filepath, "r") as f:
- count = -1
- for _idx, line in enumerate(f):
- example = json.loads(line)
- statement = example["claim"]
- label = example["label"]
- # possible label: "NOT ENOUGH INFO", "REFUTES", "SUPPORTS"
- if is_table_involved(example):
- # Retrieve related context from database
- tables, contexts = retrieve_context(example, cur)
- count += 1
- yield count, {
- EvalField.UID: str(example["id"]),
- EvalField.QUESTION: statement,
- "table": tables,
- "context": contexts,
- EvalField.GOLD_ANSWER: label,
- }
diff --git a/research/datasets/hybridqa/hybridqa.py b/research/datasets/hybridqa/hybridqa.py
deleted file mode 100644
index 6c7dead2..00000000
--- a/research/datasets/hybridqa/hybridqa.py
+++ /dev/null
@@ -1,221 +0,0 @@
-import json
-from typing import Dict
-import datasets
-import os
-import nltk
-
-from research.constants import EvalField
-
-_CITATION = """\
-@article{chen2020hybridqa,
- title={Hybridqa: A dataset of multi-hop question answering over tabular and textual data},
- author={Chen, Wenhu and Zha, Hanwen and Chen, Zhiyu and Xiong, Wenhan and Wang, Hong and Wang, William},
- journal={arXiv preprint arXiv:2004.07347},
- year={2020}
-}
-"""
-
-_DESCRIPTION = """\
-This dataset is obtained from the official release of the HybridQA.
-"""
-
-_HOMEPAGE = "https://github.com/wenhuchen/HybridQA"
-
-_LICENSE = "MIT License"
-
-_URL = "https://raw.githubusercontent.com/wenhuchen/HybridQA/master/released_data/"
-_TRAINING_FILE = "train.traced.json"
-_DEV_FILE = "dev.traced.json"
-_CONTEXT_FILE_URL = (
- "https://github.com/wenhuchen/WikiTables-WithLinks/archive/refs/heads/master.zip"
-)
-
-_URLS = {
- "train": f"{_URL}{_TRAINING_FILE}",
- "dev": f"{_URL}{_DEV_FILE}",
- "context": _CONTEXT_FILE_URL,
-}
-
-WINDOW_SIZE = 3
-
-"""
-TODO:
- - use json files in tables_tok to create hyperlinks from values
-"""
-
-
-class HybridQA(datasets.GeneratorBasedBuilder):
- """The Hybrid dataset"""
-
- def _info(self):
- return datasets.DatasetInfo(
- description=_DESCRIPTION,
- features=datasets.Features(
- {
- EvalField.UID: datasets.Value("string"),
- EvalField.QUESTION: datasets.Value("string"),
- "table_id": datasets.Value("string"),
- "table": {
- "header": datasets.features.Sequence(datasets.Value("string")),
- "rows": datasets.features.Sequence(
- datasets.features.Sequence(datasets.Value("string"))
- ),
- },
- "passages": {
- "header": datasets.features.Sequence(datasets.Value("string")),
- "rows": datasets.features.Sequence(
- datasets.features.Sequence(datasets.Value("string"))
- ),
- },
- "gold_passage": datasets.Value("string"),
- "context": datasets.Value("string"),
- EvalField.GOLD_ANSWER: datasets.Value("string"),
- }
- ),
- supervised_keys=None,
- homepage=_HOMEPAGE,
- license=_LICENSE,
- citation=_CITATION,
- )
-
- def _split_generators(self, dl_manager):
- """Returns SplitGenerators."""
- downloaded_files = dl_manager.download_and_extract(_URLS)
-
- return [
- datasets.SplitGenerator(
- name=datasets.Split.TRAIN,
- gen_kwargs={
- "filepath": downloaded_files["train"],
- "contextpath": downloaded_files["context"],
- },
- ),
- datasets.SplitGenerator(
- name=datasets.Split.VALIDATION,
- gen_kwargs={
- "filepath": downloaded_files["dev"],
- "contextpath": downloaded_files["context"],
- },
- ),
- ]
-
- def _generate_examples(self, filepath, contextpath):
- """Yields examples."""
- # data_id, question, table_id, gold_result_str
- table_tok_path = os.path.join(
- contextpath, "WikiTables-WithLinks-master", "tables_tok"
- )
- passage_tok_path = os.path.join(
- contextpath, "WikiTables-WithLinks-master", "request_tok"
- )
- with open(filepath, encoding="utf-8") as f:
- data = json.load(f)
- for idx, example in enumerate(data):
- answer_node = example["answer-node"]
- table_id = example["table_id"]
- table = json.load(
- open(os.path.join(table_tok_path, "{}.json".format(table_id)))
- )
- passages = json.load(
- open(os.path.join(passage_tok_path, "{}.json".format(table_id)))
- )
- answer = example["answer-text"]
- # how to construct context?
-                # keep all cells and append the sentences that contain the answer span to the cell
- header, rows, gold_passage = self.construct_expanded_table(
- table, passages, answer, answer_node
- )
- passage_headers, passage_data = self.preprocess_passages(passages)
- yield idx, {
- EvalField.UID: example["question_id"],
- EvalField.QUESTION: example["question"],
- "table_id": example["table_id"],
- "table": {"header": header, "rows": rows},
- "passages": {"header": passage_headers, "rows": passage_data},
- "gold_passage": gold_passage,
- "context": table["title"]
- + " | "
- + table["section_title"]
- + " | "
- + table["section_text"]
- + " | "
- + table["intro"],
- EvalField.GOLD_ANSWER: example["answer-text"],
- }
-
- # @staticmethod
- # def preprocess_passages(passages: Dict[str, str], rows: List[List[str]]) -> tuple:
- # """If the passage title isn't a match to a value,
- # use rapidfuzz to align to closest value and set that as `title` field.
- # """
- # def preprocess(text: str):
- # text = text.lower()
- # text = re.sub(r'\s+', ' ', text)
- # text = re.sub(r'\([^)]*\)', '', text)
- # text = re.sub(r"\b(a|an|the)\b", " ", text, flags=re.UNICODE)
- # text = "".join(ch for ch in text if ch not in set(string.punctuation))
- # return text.strip()
- #
- # def flatten_and_lower(xss):
- # return set([x for xs in xss for x in xs])
- #
- # values_as_set = flatten_and_lower(rows)
- # norm_value_to_value = {preprocess(value): value for value in values_as_set}
- #
- # header = ["title", "content"]
- # data = []
- # for k, v in passages.items():
- # title = k.split("/")[-1].replace("_", " ")
- # if title not in values_as_set:
- # # re.sub to remove stuff in parentheses
- # norm_title = preprocess(title)
- # aligned_value, score, _ = process.extractOne(norm_title, list(norm_value_to_value.keys()))
- # if score >= 95:
- # title = norm_value_to_value[aligned_value]
- # data.append([title, v])
- # return (header, data)
-
- @staticmethod
- def preprocess_passages(passages: Dict[str, str]) -> tuple:
- """If the passage title isn't a match to a value,
- use rapidfuzz to align to closest value and set that as `title` field.
- """
- header = ["title", "content"]
- data = []
- for k, v in passages.items():
- title = k.split("/")[-1].replace("_", " ")
- data.append([title, v])
- return (header, data)
-
- @staticmethod
- def construct_expanded_table(table, passages, answer, answer_nodes):
- def process_link(link):
- return link.split("/")[-1].replace("_", " ")
-
- header = [column[0] for column in table["header"]]
- data = [[cell[0] for cell in row] for row in table["data"]]
- selected_passage = {}
- for answer_node in answer_nodes:
- link = answer_node[2]
- type_ = answer_node[3]
- if type_ == "passage":
- # Get passage and locate the sentence of answer
- passage_text = passages[link]
- sents = nltk.sent_tokenize(passage_text)
- has_answer_sent_idx = -1
- for idx, sent in enumerate(sents):
- if " " + answer.lower() + " " in " " + sent.lower() + " ":
- has_answer_sent_idx = idx
- selected_sents = sents[
- max(0, has_answer_sent_idx - (WINDOW_SIZE - 1) // 2) : min(
- len(sents) - 1, has_answer_sent_idx + (WINDOW_SIZE - 1) // 2
- )
- ]
- selected_passage[process_link(link)] = " ".join(selected_sents)
- else:
- pass
-        # linearize the selected passages
- passage_context_str = "passages: "
- for key in selected_passage:
- passage_context_str += "{}: {} | ".format(key, selected_passage[key])
- return header, data, passage_context_str
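-
-
-if __name__ == "__main__":
-    # Illustrative sketch (not part of the original script): shows how the
-    # WINDOW_SIZE-sentence window is cut around the answer-bearing sentence.
-    # Assumes the nltk "punkt" tokenizer data has already been downloaded.
-    toy_table = {
-        "header": [["city"], ["venue"]],
-        "data": [[["bathurst"], ["bathurst ground"]]],
-    }
-    toy_passages = {
-        "/wiki/Bathurst": "Bathurst is a city. It sits west of Sydney in Australia. Many tourists visit."
-    }
-    toy_answer_nodes = [["Sydney", [0, 0], "/wiki/Bathurst", "passage"]]
-    header, rows, ctx = HybridQA.construct_expanded_table(
-        toy_table, toy_passages, answer="Sydney", answer_nodes=toy_answer_nodes
-    )
-    print(ctx)  # passages: Bathurst: Bathurst is a city. It sits west of Sydney in Australia. |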
diff --git a/research/datasets/ottqa/ottqa.py b/research/datasets/ottqa/ottqa.py
deleted file mode 100644
index a0633e81..00000000
--- a/research/datasets/ottqa/ottqa.py
+++ /dev/null
@@ -1,281 +0,0 @@
-import os
-import json
-import nltk
-import datasets
-from pathlib import Path
-
-from research.constants import EvalField
-
-logger = datasets.logging.get_logger(__name__)
-
-_CITATION = """\
-@article{chen2020open,
- title={Open question answering over tables and text},
- author={Chen, Wenhu and Chang, Ming-Wei and Schlinger, Eva and Wang, William and Cohen, William W},
- journal={arXiv preprint arXiv:2010.10439},
- year={2020}
-}
-"""
-
-_DESCRIPTION = """\
-This dataset is obtained from the official release of OTT-QA.
-"""
-
-_HOMEPAGE = "https://ott-qa.github.io"
-
-_LICENSE = "MIT License"
-
-_URL = "https://github.com/wenhuchen/OTT-QA/raw/a14ec408b2c22e24a44622b01e4242d95b7ecf08/released_data/"
-_TRAINING_FILE = "train.traced.json"
-_DEV_FILE = "dev.traced.json"
-
-_URLS = {
- "tables": "https://opendomainhybridqa.s3-us-west-2.amazonaws.com/all_plain_tables.json",
- "passages": "https://opendomainhybridqa.s3-us-west-2.amazonaws.com/all_passages.json",
-}
-
-WINDOW_SIZE = 3
-
-
-class OTTQA(datasets.GeneratorBasedBuilder):
- """The OTTQA dataset"""
-
- def __init__(
- self,
- *args,
- db_output_dir: str,
- writer_batch_size=None,
- ottqa_dataset_url=_URL,
- **kwargs,
- ) -> None:
- super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
-
- self._url = ottqa_dataset_url
- self.db_output_dir = Path(db_output_dir)
-
- def _info(self):
- return datasets.DatasetInfo(
- description=_DESCRIPTION,
- features=datasets.Features(
- {
- EvalField.UID: datasets.Value("string"),
- EvalField.DB_PATH: datasets.Value("string"),
- EvalField.QUESTION: datasets.Value("string"),
- "table_id": datasets.Value("string"),
- "table": {
- "header": datasets.features.Sequence(datasets.Value("string")),
- "rows": datasets.features.Sequence(
- datasets.features.Sequence(datasets.Value("string"))
- ),
- },
- "passage": datasets.Value("string"),
- "context": datasets.Value("string"),
- EvalField.GOLD_ANSWER: datasets.Value("string"),
- }
- ),
- supervised_keys=None,
- homepage=_HOMEPAGE,
- license=_LICENSE,
- citation=_CITATION,
- )
-
- def _split_generators(self, dl_manager):
- """Returns SplitGenerators."""
- downloaded_files = dl_manager.download_and_extract(_URLS)
- data_dir = dl_manager.download_and_extract(self._url)
- train_filepath = os.path.join(data_dir, "train.traced.json")
- dev_filepath = os.path.join(data_dir, "dev.traced.json")
- return [
- datasets.SplitGenerator(
- name=datasets.Split.TRAIN,
- gen_kwargs={
- "filepath": train_filepath,
- "tablepath": downloaded_files["tables"],
- "passagepath": downloaded_files["passages"],
- "data_dir": data_dir,
- },
- ),
- datasets.SplitGenerator(
- name=datasets.Split.VALIDATION,
- gen_kwargs={
- "filepath": dev_filepath,
- "tablepath": downloaded_files["tables"],
- "passagepath": downloaded_files["passages"],
- "data_dir": data_dir,
- },
- ),
- ]
-
- def _generate_examples(self, filepath, tablepath, passagepath, data_dir):
- """Yields examples."""
- # data_id, question, table_id, gold_result_str
-
- with open(tablepath, encoding="utf-8") as f:
- tables = json.load(f)
- with open(passagepath, encoding="utf-8") as f:
- passages = json.load(f)
-
- # Format to database file
- if not self.db_output_dir.is_dir():
- self.db_output_dir.mkdir(parents=True)
-
- # dataset_split = Path(filepath).stem.split(".")[0]
- # output_db_filepath = None
- # output_db_filepath = self.db_output_dir / "ottqa.db"
- # add_tables = False
- # add_documents = False
- # if dataset_split == "train":
- # db_filename = f"ottqa.db"
- # output_db_filepath = self.db_output_dir / db_filename
- # # if not output_db_filepath.is_file():
- # if add_tables:
- # logger.info(f"\nConstructing {db_filename} in {data_dir}...")
- # tablename_to_table_json = {}
- # tablename_to_unique_idx = {}
- # for _table_id, table_data in tqdm(
- # tables.items(), total=len(tables), desc="Formatting tables..."
- # ):
- # _tablename = table_data["title"]
- # if _tablename not in tablename_to_unique_idx:
- # tablename_to_unique_idx[_tablename] = 0
- #
- # tablename = f"{_tablename} ({tablename_to_unique_idx[_tablename]})"
- # tablename_to_table_json[tablename] = {
- # "header": table_data["header"],
- # "rows": table_data["data"],
- # }
- # tablename_to_unique_idx[_tablename] += 1
- #
- # csv_output_dir = self.db_output_dir / "csv"
- # if not csv_output_dir.is_dir():
- # csv_output_dir.mkdir(parents=True)
- #
- # for tablename, table_json in tqdm(
- # tablename_to_table_json.items(),
- # total=len(tablename_to_table_json),
- # desc="Saving tables to csv...",
- # ):
- # csv_save_path = (
- # self.db_output_dir
- # / "csv"
- # / f"{tablename.replace('/', ' ')}.csv"
- # )
- # if csv_save_path.is_file():
- # continue
- # df = prepare_df_for_neuraldb_from_table(
- # table_json, add_row_id=False
- # )
- # df.to_csv(csv_save_path, index=False)
- #
- # # Use csvs-to-sqlite to create many sqlite tables from our csvs
- # # https://github.com/simonw/csvs-to-sqlite
- # # error_bad_lines deprecated: https://github.com/simonw/csvs-to-sqlite/issues/88
- # # need to run `pip install 'pandas==1.4.0'`
- # # os.system("csvs-to-sqlite ")
- #
- # # try:
- # # df.to_sql(
- # # tablename, sqlite_conn, if_exists="fail"
- # # )
- # # except ValueError:
- # # logger.info(f"FAILED ON TABLE {tablename}")
- #
- # if add_documents:
- # documents_table_json = {"header": ["title", "content"], "rows": []}
- # for doc_id, content in tqdm(
- # passages.items(),
- # total=len(passages),
- # desc="Formatting documents...",
- # ):
- # title = doc_id.split("/")[-1].replace("_", " ")
- # documents_table_json["rows"].append([title, content])
- #
- # # Put into database
- # sqlite_conn = sqlite3.connect(
- # str(output_db_filepath), check_same_thread=True
- # )
- #
- # chunksize = 10000
- #
- # def chunker(seq, size):
- # return (seq[pos : pos + size] for pos in range(0, len(seq), size))
- #
- # documents_df = pd.DataFrame(
- # data=documents_table_json["rows"],
- # columns=documents_table_json["header"],
- # )
- # c = sqlite_conn.cursor()
- # c.execute(CREATE_VIRTUAL_TABLE_CMD)
- # c.close()
- # with tqdm(
- # total=len(documents_df), desc="Uploading documents to db..."
- # ) as pbar:
- # for _i, cdf in enumerate(chunker(documents_df, chunksize)):
- # cdf.to_sql(
- # DOCS_TABLE_NAME,
- # sqlite_conn,
- # method="multi",
- # if_exists="append",
- # index=False,
- # )
- # pbar.update(chunksize)
- #
- # sqlite_conn.close()
- #
- # logger.info("\nFinished.")
-
- with open(filepath, encoding="utf-8") as f:
- data = json.load(f)
- for idx, example in enumerate(data):
- table = tables[example["table_id"]]
- answer_node = example["answer-node"]
- answer = example["answer-text"]
- header, data, passage_context_str = self.construct_expanded_table(
- table, passages, answer, answer_node
- )
- yield idx, {
- EvalField.UID: example["question_id"],
- EvalField.DB_PATH: str(self.db_output_dir / "ottqa.db"),
- EvalField.QUESTION: example["question"],
- "table_id": example["table_id"],
- "table": {"header": header, "rows": data},
- "passage": passage_context_str,
- "context": table["title"]
- + " | "
- + table["section_title"]
- + " | "
- + table["section_text"]
- + " | "
- + table["intro"],
- EvalField.GOLD_ANSWER: example["answer-text"],
- }
-
- def construct_expanded_table(self, table, passages, answer, answer_nodes):
- def process_link(link):
- return link.split("/")[-1].replace("_", " ")
-
- selected_passage = {}
- for answer_node in answer_nodes:
- link = answer_node[2]
- type_ = answer_node[3]
- if type_ == "passage":
- # Get passage and locate the sentence of answer
- passage_text = passages[link]
- sents = nltk.sent_tokenize(passage_text)
- has_answer_sent_idx = -1
- for idx, sent in enumerate(sents):
- if " " + answer.lower() + " " in " " + sent.lower() + " ":
- has_answer_sent_idx = idx
- selected_sents = sents[
- max(0, has_answer_sent_idx - (WINDOW_SIZE - 1) // 2) : min(
- len(sents) - 1, has_answer_sent_idx + (WINDOW_SIZE - 1) // 2
- )
- ]
- selected_passage[process_link(link)] = " ".join(selected_sents)
- else:
- pass
-        # linearize the selected passages
- passage_context_str = "passages: "
- for key in selected_passage:
- passage_context_str += "{}: {} | ".format(key, selected_passage[key])
- return table["header"], table["data"], passage_context_str
diff --git a/research/datasets/wikitq/wikitq.py b/research/datasets/wikitq/wikitq.py
deleted file mode 100644
index 2ad840cf..00000000
--- a/research/datasets/wikitq/wikitq.py
+++ /dev/null
@@ -1,267 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Datasets Authors, The Google AI Language Team Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Squall: On the Potential of Lexico-logical Alignments for Semantic Parsing to SQL Queries"""
-
-import json
-import os
-import datasets
-import shutil
-
-try:
- from ...wikitq.utils import _load_table_w_page as _load_table
- from ...wikitq.wikitq.utils import (
- WTQDBEngine,
- process_table_structure,
- retrieve_wtq_query_answer,
- )
- from ....constants import EvalField
-except ModuleNotFoundError:
- from research.utils.wikitq.utils import _load_table_w_page as _load_table
- from research.utils.wikitq.utils import (
- WTQDBEngine,
- process_table_structure,
- retrieve_wtq_query_answer,
- )
- from research.constants import EvalField
-
-logger = datasets.logging.get_logger(__name__)
-# Find for instance the citation on arxiv or on the dataset repo/website
-_CITATION = """\
-@inproceedings{Shi:Zhao:Boyd-Graber:Daume-III:Lee-2020,
- Title = {On the Potential of Lexico-logical Alignments for Semantic Parsing to {SQL} Queries},
- Author = {Tianze Shi and Chen Zhao and Jordan Boyd-Graber and Hal {Daum\'{e} III} and Lillian Lee},
- Booktitle = {Findings of EMNLP},
- Year = {2020},
-}
-"""
-
-_DESCRIPTION = """\
-Two important aspects of semantic parsing for question answering are the breadth of the knowledge source and the depth of
-logical compositionality. While existing work trades off one aspect for another, this paper simultaneously makes progress
-on both fronts through a new task: answering complex questions on semi-structured tables using question-answer pairs as
-supervision. The central challenge arises from two compounding factors: the broader domain results in an open-ended set
-of relations, and the deeper compositionality results in a combinatorial explosion in the space of logical forms. We
-propose a logical-form driven parsing algorithm guided by strong typing constraints and show that it obtains significant
- improvements over natural baselines. For evaluation, we created a new dataset of 22,033 complex questions on Wikipedia
- tables, which is made publicly available.
-"""
-
-_HOMEPAGE = "https://github.com/tzshi/squall"
-
-_LICENSE = "CC-BY-SA-4.0 License"
-
-_URL = "https://github.com/ppasupat/WikiTableQuestions/archive/refs/heads/master.zip"
-_SQUALL_URL = "https://github.com/tzshi/squall/archive/refs/heads/main.zip"
-
-
-class WikiTableQuestion(datasets.GeneratorBasedBuilder):
- """The Squall dataset"""
-
- VERSION = datasets.Version("1.0.0")
-
- BUILDER_CONFIGS = [
- datasets.BuilderConfig(
- name="squall",
- version=VERSION,
- description="SQUALL: On the Potential of Lexico-logical Alignments for Semantic Parsing to {SQL} Queries",
- ),
- ]
-
- def __init__(
- self,
- *args,
- writer_batch_size=None,
- wikitq_dataset_url=_URL,
- squall_dataset_url=_SQUALL_URL,
- **kwargs
- ) -> None:
- super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
-
- self._url = wikitq_dataset_url
- self._squall_url = squall_dataset_url
-
- def _info(self):
- return datasets.DatasetInfo(
- description=_DESCRIPTION,
- features=datasets.Features(
- {
- EvalField.UID: datasets.Value("string"),
- EvalField.QUESTION: datasets.Value("string"),
- "table_id": datasets.Value("string"),
- "table": {
- "page_title": datasets.Value("string"),
- "header": datasets.features.Sequence(datasets.Value("string")),
- "rows": datasets.features.Sequence(
- datasets.features.Sequence(datasets.Value("string"))
- ),
- },
- "sql": datasets.Value("string"),
- EvalField.GOLD_ANSWER: datasets.features.Sequence(
- datasets.Value("string")
- ),
- }
- ),
- supervised_keys=None,
- homepage=_HOMEPAGE,
- license=_LICENSE,
- citation=_CITATION,
- )
-
- def _split_generators(self, dl_manager):
- """Returns SplitGenerators."""
- data_dir = os.path.join(
- dl_manager.download_and_extract(self._url), "WikiTableQuestions-master"
- )
- squall_dir = os.path.join(
- dl_manager.download_and_extract(self._squall_url), "squall-main"
- )
-
- return [
- datasets.SplitGenerator(
- name=datasets.Split.TRAIN,
- gen_kwargs={
- "filepath": os.path.join(data_dir, "data/random-split-1-train.tsv"),
- "data_dir": data_dir,
- "squall_path": os.path.join(squall_dir, "data/squall.json"),
- "squall_tables_path": os.path.join(squall_dir, "tables/json"),
- "squall_db_path": os.path.join(squall_dir, "tables/db"),
- "squall_tmp_db_path": os.path.join(squall_dir, "tables/tmp_db"),
- },
- ),
- datasets.SplitGenerator(
- name=datasets.Split.VALIDATION,
- gen_kwargs={
- "filepath": os.path.join(data_dir, "data/random-split-1-dev.tsv"),
- "data_dir": data_dir,
- "squall_path": os.path.join(squall_dir, "data/squall.json"),
- "squall_tables_path": os.path.join(squall_dir, "tables/json"),
- "squall_db_path": os.path.join(squall_dir, "tables/db"),
- "squall_tmp_db_path": os.path.join(squall_dir, "tables/tmp_db"),
- },
- ),
- datasets.SplitGenerator(
- name=datasets.Split.TEST,
- gen_kwargs={
- "filepath": os.path.join(
- data_dir, "data/pristine-unseen-tables.tsv"
- ),
- "data_dir": data_dir,
- "squall_path": os.path.join(squall_dir, "data/squall.json"),
- "squall_tables_path": os.path.join(squall_dir, "tables/json"),
- "squall_db_path": os.path.join(squall_dir, "tables/db"),
- "squall_tmp_db_path": os.path.join(squall_dir, "tables/tmp_db"),
- },
- ),
- ]
-
- def _generate_examples(
- self,
- filepath,
- data_dir,
- squall_path,
- squall_tables_path,
- squall_db_path,
- squall_tmp_db_path,
- ):
- """https://huggingface.co/docs/datasets/v1.2.1/add_dataset.html"""
- if not os.path.exists(squall_tmp_db_path):
- os.makedirs(squall_tmp_db_path)
-
- # source table should not be truncated!
- src_table_content_map = {}
- # tgt table should be truncated!
- tgt_table_content_map = {}
- table_drop_rows_map = {}
- db_engine_map = {}
-
- for table_json_file in os.listdir(squall_tables_path):
- table_id = table_json_file[:-5]
- check_table_file = open(
- os.path.join(squall_tables_path, table_json_file), "r", encoding="utf8"
- )
- src_table_content = json.load(check_table_file)
- src_table_content = process_table_structure(src_table_content)
- src_table_content_map[table_id] = json.loads(json.dumps(src_table_content))
- tgt_table_content_map[table_id] = src_table_content
-
- for table_db_file in os.listdir(squall_db_path):
- table_id = table_db_file[:-3]
- # copy table db file into a temp file since we may delete some rows
- database_path = os.path.join(squall_db_path, table_db_file)
- temp_database_path = os.path.join(squall_tmp_db_path, table_db_file)
- if os.path.exists(temp_database_path):
- os.remove(temp_database_path)
-            # operate on the temp db from here on, to avoid affecting the original database
- shutil.copy(database_path, temp_database_path)
- db_engine_map[table_id] = WTQDBEngine(temp_database_path)
- if (
- table_id in table_drop_rows_map
- and len(table_drop_rows_map[table_id]) != 0
- ):
- table_drop_rows = table_drop_rows_map[table_id]
- db_engine_map[table_id].delete_rows(table_drop_rows)
-
- """Yields examples."""
- squall_id_map = {}
- with open(squall_path) as f:
- squall_data = json.load(f)
- for squall_item in squall_data:
- squall_id_map[squall_item["nt"]] = squall_item
-
- # data_id, question, table_id, gold_result_str
- with open(filepath, encoding="utf-8") as f:
- for idx, line in enumerate(f):
- # skip the header
- if idx == 0:
- continue
- data_id, question, table_id, gold_result_str = line.strip("\n").split(
- "\t"
- )
-
- if data_id in squall_id_map.keys():
- # Data annotation from WikiTableQuestion dataset
- table = _load_table(
- os.path.join(data_dir, table_id.replace(".csv", ".tsv"))
- )
- gold_result = gold_result_str.split("|")
-
- # Data annotation from Squall dataset.
- squall_data_item = squall_id_map[data_id]
- squall_table_id = squall_data_item["tbl"]
- sql_struct = squall_data_item["sql"]
- engine, src_table_content = (
- db_engine_map[squall_table_id],
- src_table_content_map[squall_table_id],
- )
- try:
- encode_sql_str, _, exec_sql_str = retrieve_wtq_query_answer(
- engine, table, sql_struct
- )
- except IndexError:
- # In case header is modified.
- encode_sql_str, _, exec_sql_str = retrieve_wtq_query_answer(
- engine, src_table_content, sql_struct
- )
-
- yield idx, {
- EvalField.UID: data_id,
- EvalField.QUESTION: question,
- "table_id": table_id,
- "table": table,
- "sql": encode_sql_str,
- EvalField.GOLD_ANSWER: gold_result,
- }
- else:
- continue
diff --git a/research/metrics/__init__.py b/research/metrics/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/research/metrics/fetaqa/evaluator.py b/research/metrics/fetaqa/evaluator.py
deleted file mode 100644
index b2d27e92..00000000
--- a/research/metrics/fetaqa/evaluator.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from datasets import load_metric
-import nltk
-
-
-# the code below is adapted from https://github.com/Yale-LILY/FeTaQA/blob/main/end2end/train.py
-def postprocess_text(preds, labels, metric_name):
- preds = [pred.strip() for pred in preds]
- labels = [label.strip() for label in labels]
-
- # rougeLSum expects newline after each sentence
- if metric_name == "rouge":
- preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
- labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
- elif metric_name == "sacrebleu": # sacrebleu
- labels = [[label] for label in labels]
- elif metric_name == "bleu":
- preds = [pred.split(" ") for pred in preds]
- labels = [[label.split(" ")] for label in labels]
- else:
- pass
-
- return preds, labels
-
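-# For illustration (not in the original file): postprocess_text reshapes the
-# inputs into each metric's expected format, e.g.
-#   postprocess_text(["a b c."], ["a b c."], "sacrebleu") -> (["a b c."], [["a b c."]])
-#   postprocess_text(["a b"], ["a c"], "bleu")            -> ([["a", "b"]], [[["a", "c"]]])
-# and for "rouge", each text is re-joined with a newline after every sentence.
-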
-
-class EvaluateTool(object):
- def __init__(self, args):
- self.args = args
-
- def evaluate(self, preds, golds, section):
- summary = {}
-
- gold_text = [item["seq_out"] for item in golds]
-
- assert len(preds) == len(gold_text)
-
- metric_list = []
- if section in ["train", "dev"]:
- metric_list = ["sacrebleu"]
- elif section == "test":
- metric_list = ["sacrebleu", "rouge", "meteor", "bertscore", "bleurt"]
-
- for metric_name in metric_list:
- metric = load_metric(metric_name)
- processed_preds, processed_golds = postprocess_text(
- preds, gold_text, metric_name
- )
-
- if metric_name == "bertscore":
- res = metric.compute(
- predictions=processed_preds, references=processed_golds, lang="en"
- )
- for k, v in res.items():
- if k == "hashcode":
- continue
- summary[f"{metric_name}_{k}"] = round(1.0 * sum(v) / len(v), 2)
-
- else:
- res = metric.compute(
- predictions=processed_preds, references=processed_golds
- )
- if metric_name == "sacrebleu":
- summary[metric_name] = (
- res["score"] * 0.01
-                    )  # scale to the range [0, 1] for consistency across metrics
- elif metric_name == "bleurt":
- summary["bleurt"] = round(
- 1.0 * sum(res["scores"]) / len(res["scores"]), 2
- )
- elif metric_name == "rouge":
- for sub_metric_name in res.keys():
- for i, key in enumerate(["precision", "recall", "fmeasure"]):
- summary["{}_{}".format(sub_metric_name, key)] = res[
- sub_metric_name
- ][1][i]
-                            # this is the fmeasure ('f-score') from the mid ('mean aggregation')
- else:
- summary[metric_name] = res[metric_name]
- return summary
-
-
-if __name__ == "__main__":
- import json
-
- with open("predictions_predict.json") as f:
- test_data = json.load(f)
- preds = [item["prediction"] for item in test_data]
- evaluator = EvaluateTool(args=None)
- score = evaluator.evaluate(preds, test_data, section="test")
- print(score)
diff --git a/research/metrics/fetaqa/fetaqa.py b/research/metrics/fetaqa/fetaqa.py
deleted file mode 100644
index 7b7b8685..00000000
--- a/research/metrics/fetaqa/fetaqa.py
+++ /dev/null
@@ -1,137 +0,0 @@
-"""Spider concept metrics."""
-
-from typing import Optional, Union
-import datasets
-from datasets import load_metric
-from .evaluator import postprocess_text
-
-# Find for instance the citation on arxiv or on the dataset repo/website
-_CITATION = """\
-@article{Nan2021FeTaQAFT,
- title={FeTaQA: Free-form Table Question Answering},
- author={Nan, Linyong and Hsieh, Chiachun and Mao, Ziming and Lin, Xi Victoria and Verma, Neha and Zhang, Rui and Kryściński, Wojciech and Schoelkopf, Hailey and Kong, Riley and Tang, Xiangru and Mutuma, Mutethia and Rosand, Ben and Trindade, Isabel and Bandaru, Renusree and Cunningham, Jacob and Xiong, Caiming and Radev, Dragomir},
- journal={Transactions of the Association for Computational Linguistics},
- year={2022},
- volume={10},
- pages={35-49}
-}
-"""
-
-_DESCRIPTION = """\
-FeTaQA is a Free-form Table Question Answering dataset with 10K Wikipedia-based {table,
-question, free-form answer, supporting table cells} pairs. It yields a more challenging table
-QA setting because it requires generating free-form text answers after retrieval, inference,
-and integration of multiple discontinuous facts from a structured knowledge source.
-Unlike datasets of generative QA over text in which answers are prevalent with copies of
-short text spans from the source, answers in our dataset are human-generated explanations
-involving entities and their high-level relations.
-"""
-
-_HOMEPAGE = "https://github.com/Yale-LILY/FeTaQA"
-
-_LICENSE = "CC-BY-SA-4.0 License"
-
-
-@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION)
-class FetaQAQuestion(datasets.Metric):
- def __init__(
- self,
- config_name: Optional[str] = None,
- keep_in_memory: bool = False,
- cache_dir: Optional[str] = None,
- num_process: int = 1,
- process_id: int = 0,
- seed: Optional[int] = None,
- experiment_id: Optional[str] = None,
- max_concurrent_cache_files: int = 10000,
- timeout: Union[int, float] = 100,
- **kwargs,
- ):
- super().__init__(
- config_name=config_name,
- keep_in_memory=keep_in_memory,
- cache_dir=cache_dir,
- num_process=num_process,
- process_id=process_id,
- seed=seed,
- experiment_id=experiment_id,
- max_concurrent_cache_files=max_concurrent_cache_files,
- timeout=timeout,
- **kwargs,
- )
-
- def _info(self):
- return datasets.MetricInfo(
- description=_DESCRIPTION,
- citation=_CITATION,
- features=datasets.Features(
- {
- "predictions": datasets.features.Sequence(datasets.Value("string")),
- "references": datasets.features.Features(
- {
- "answer_text": datasets.features.Sequence(
- datasets.Value("string")
- ),
- "question": datasets.Value("string"),
- }
- ),
- }
- ),
- reference_urls=[""],
- )
-
- @staticmethod
- def eval_metric(preds, labels, metric_name="bertscore"):
- summary = {}
- if metric_name == "all":
- metric_list = ["sacrebleu", "rouge", "meteor", "bertscore", "bleurt"]
- else:
- metric_list = [metric_name]
-
- for metric_name in metric_list:
- metric = load_metric(metric_name)
- pred, gold = postprocess_text(preds, labels, metric_name)
-
- if metric_name == "bertscore":
- res = metric.compute(predictions=pred, references=gold, lang="en")
- for k, v in res.items():
- if k == "hashcode":
- continue
- summary[f"{metric_name}_{k}"] = round(1.0 * sum(v) / len(v), 2)
- else:
- res = metric.compute(predictions=pred, references=gold)
- if metric_name == "sacrebleu":
- summary[metric_name] = res["score"] * 0.01
- # return res["score"] * 0.01 # limit it to range of [0, 1] for unifying
- elif metric_name == "bleurt":
- summary["bleurt"] = round(
- 1.0 * sum(res["scores"]) / len(res["scores"]), 2
- )
- # return round(1.0 * sum(res["scores"]) / len(res["scores"]), 2)
- elif metric_name == "rouge":
- for sub_metric_name in res.keys():
- for i, key in enumerate(["precision", "recall", "fmeasure"]):
- summary["{}_{}".format(sub_metric_name, key)] = res[
- sub_metric_name
- ][1][i]
- # return res[sub_metric_name][1][-1] #'fmeasure'
-                            # this is the fmeasure ('f-score') from the mid ('mean aggregation')
- else:
- summary[metric_name] = res[metric_name]
- # return res[metric_name]
- return summary
-
- def _compute(self, predictions, references, allow_semantic: bool = True, **kwargs):
- assert len(predictions) == len(references)
- n_total_samples = len(predictions)
- n_correct_dict = None
- for pred, ref in zip(predictions, references):
- score = self.eval_metric(
- preds=pred, labels=ref["answer_text"], metric_name=self.config_name
- )
- if n_correct_dict:
- for key, value in score.items():
- n_correct_dict[key] += value / n_total_samples
- else:
- n_correct_dict = {k: v / n_total_samples for k, v in score.items()}
- return n_correct_dict
diff --git a/research/metrics/feverous/evaluator.py b/research/metrics/feverous/evaluator.py
deleted file mode 100644
index b18073a6..00000000
--- a/research/metrics/feverous/evaluator.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import numpy as np
-
-
-class EvaluateTool(object):
- def __init__(self, args=None):
- self.args = args
-
- def evaluate(self, preds, golds, section=None):
- summary = {}
- all_match = []
-
- for pred, gold_item in zip(preds, golds):
- # IMPORTANT!
- # Below we ignore "NOT ENOUGH INFO"
- # Consider this when comparing to other results
- if gold_item["seq_out"] == "NOT ENOUGH INFO":
- continue
- match_or_not = pred == gold_item["seq_out"]
- all_match.append(match_or_not)
-
- summary["all"] = float(np.mean(all_match))
-
- return summary
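-
-
-if __name__ == "__main__":
-    # Minimal smoke test (illustrative only; mirrors the other evaluators'
-    # __main__ blocks). "NOT ENOUGH INFO" golds are skipped by evaluate().
-    evaluator = EvaluateTool()
-    preds = ["SUPPORTS", "REFUTES", "SUPPORTS"]
-    golds = [
-        {"seq_out": "SUPPORTS"},
-        {"seq_out": "NOT ENOUGH INFO"},
-        {"seq_out": "REFUTES"},
-    ]
-    print(evaluator.evaluate(preds, golds))  # {'all': 0.5}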
diff --git a/research/metrics/feverous/feverous.py b/research/metrics/feverous/feverous.py
deleted file mode 100644
index 0dee3f73..00000000
--- a/research/metrics/feverous/feverous.py
+++ /dev/null
@@ -1,69 +0,0 @@
-from typing import Optional, Union
-import datasets
-
-try:
- from .evaluator import EvaluateTool
-except ImportError:
- from research.metrics.feverous.evaluator import EvaluateTool
-
-_CITATION = """\
-@article{aly2021feverous,
- title={FEVEROUS: Fact Extraction and VERification Over Unstructured and Structured information},
- author={Aly, Rami and Guo, Zhijiang and Schlichtkrull, Michael and Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Cocarascu, Oana and Mittal, Arpit},
- journal={arXiv preprint arXiv:2106.05707},
- year={2021}
-}
-"""
-
-_DESCRIPTION = """\
-This dataset is obtained from the official release of FEVEROUS.
-"""
-
-
-@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION)
-class FEVEROUS(datasets.Metric):
- def __init__(
- self,
- config_name: Optional[str] = None,
- keep_in_memory: bool = False,
- cache_dir: Optional[str] = None,
- num_process: int = 1,
- process_id: int = 0,
- seed: Optional[int] = None,
- experiment_id: Optional[str] = None,
- max_concurrent_cache_files: int = 10000,
- timeout: Union[int, float] = 100,
- **kwargs
- ):
- super().__init__(
- config_name=config_name,
- keep_in_memory=keep_in_memory,
- cache_dir=cache_dir,
- num_process=num_process,
- process_id=process_id,
- seed=seed,
- experiment_id=experiment_id,
- max_concurrent_cache_files=max_concurrent_cache_files,
- timeout=timeout,
- **kwargs
- )
- self.evaluator = EvaluateTool()
-
- def _info(self):
- return datasets.MetricInfo(
- description=_DESCRIPTION,
- citation=_CITATION,
- features=datasets.Features(
- {
- "predictions": datasets.Value("string"),
- "references": datasets.features.Features(
- {"seq_out": datasets.Value("string")}
- ),
- }
- ),
- reference_urls=[""],
- )
-
- def _compute(self, predictions, references, **kwargs):
- assert len(predictions) == len(references)
- return self.evaluator.evaluate(preds=predictions, golds=references)
diff --git a/research/metrics/hybridqa/evaluator.py b/research/metrics/hybridqa/evaluator.py
deleted file mode 100644
index abd866e8..00000000
--- a/research/metrics/hybridqa/evaluator.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import re
-import collections
-import string
-from ...utils.normalizer import str_normalize
-from ..wikitq.evaluator import to_value_list, check_denotation
-
-# copied from https://github.com/wenhuchen/HybridQA/blob/master/evaluate_script.py
-
-
-def normalize_answer(s):
- """Lower text and remove punctuation, articles and extra whitespace."""
-
- def remove_articles(text):
- regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
- return re.sub(regex, " ", text)
-
- def white_space_fix(text):
- return " ".join(text.split())
-
- def remove_punc(text):
- exclude = set(string.punctuation)
- return "".join(ch for ch in text if ch not in exclude)
-
- def lower(text):
- return text.lower()
-
- return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def get_tokens(s):
- if not s:
- return []
- return normalize_answer(s).split()
-
-
-def compute_exact(a_gold, a_pred):
- return int(normalize_answer(a_gold) == normalize_answer(a_pred))
-
-
-def compute_f1(a_gold, a_pred):
- gold_toks = get_tokens(a_gold)
- pred_toks = get_tokens(a_pred)
- common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
- num_same = sum(common.values())
- if len(gold_toks) == 0 or len(pred_toks) == 0:
- # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
- return int(gold_toks == pred_toks)
- if num_same == 0:
- return 0
- precision = 1.0 * num_same / len(pred_toks)
- recall = 1.0 * num_same / len(gold_toks)
- f1 = (2 * precision * recall) / (precision + recall)
- return f1
-
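-# e.g. (illustrative): compute_exact("The Beatles", "beatles") == 1, since
-# normalize_answer strips articles, punctuation, and case, while
-# compute_f1("born in 1990", "in 1990") == 0.8 (precision 1.0, recall 2/3).
-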
-
-def eval_ex_match(pred, gold, allow_semantic=True, question=None):
- """Taken from the Binder codebase, used to evaluate WikiTQ dataset."""
- if not isinstance(pred, list):
- pred = [pred]
- gold = [gold]
-
- pred = [str(p).lower().strip() for p in pred]
- gold = [str(g).lower().strip() for g in gold]
- if not allow_semantic:
- # WikiTQ eval w. string normalization using recognizer
- pred = [str_normalize(span) for span in pred]
- gold = [str_normalize(span) for span in gold]
- pred = to_value_list(pred)
- gold = to_value_list(gold)
- return check_denotation(pred, gold)
- else:
- assert isinstance(question, str)
-        question = re.sub(r"\s+", " ", question).strip().lower()
- pred = [str_normalize(span) for span in pred]
- gold = [str_normalize(span) for span in gold]
- pred = sorted(list(set(pred)))
- gold = sorted(list(set(gold)))
- # (1) 0 matches 'no', 1 matches 'yes'; 0 matches 'more', 1 matches 'less', etc.
- if len(pred) == 1 and len(gold) == 1:
- if (pred[0] == "0" and gold[0] == "no") or (
- pred[0] == "1" and gold[0] == "yes"
- ):
- return True
- question_tokens = question.split()
- try:
- pos_or = question_tokens.index("or")
- token_before_or, token_after_or = (
- question_tokens[pos_or - 1],
- question_tokens[pos_or + 1],
- )
- if (pred[0] == "0" and gold[0] == token_after_or) or (
- pred[0] == "1" and gold[0] == token_before_or
- ):
- return True
- except Exception:
- pass
- # (2) Number value (allow units) and Date substring match
- if len(pred) == 1 and len(gold) == 1:
-            NUMBER_UNITS_PATTERN = re.compile(
-                r"^\$*[+-]?([0-9]*[.])?[0-9]+(\s*%*|\s+\w+)$"
-            )
-            DATE_PATTERN = re.compile(
-                r"[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}\s*([0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2})?"
-            )
-            DURATION_PATTERN = re.compile(r"(P|PT)(\d+)(Y|M|D|H|S)")
- p, g = pred[0], gold[0]
- # Restore `duration` type, e.g., from 'P3Y' -> '3'
- if re.match(DURATION_PATTERN, p):
- p = re.match(DURATION_PATTERN, p).group(2)
- if re.match(DURATION_PATTERN, g):
- g = re.match(DURATION_PATTERN, g).group(2)
- match = False
- num_flag, date_flag = False, False
- # Number w. unit match after string normalization.
- # Either pred or gold being number w. units suffices it.
- if re.match(NUMBER_UNITS_PATTERN, p) or re.match(NUMBER_UNITS_PATTERN, g):
- num_flag = True
- # Date match after string normalization.
- # Either pred or gold being date suffices it.
- if re.match(DATE_PATTERN, p) or re.match(DATE_PATTERN, g):
- date_flag = True
- if num_flag:
- p_set, g_set = set(p.split()), set(g.split())
- if p_set.issubset(g_set) or g_set.issubset(p_set):
- match = True
- if date_flag:
- p_set, g_set = set(p.replace("-", " ").split()), set(
- g.replace("-", " ").split()
- )
- if p_set.issubset(g_set) or g_set.issubset(p_set):
- match = True
- if match:
- return True
- pred = to_value_list(pred)
- gold = to_value_list(gold)
- return check_denotation(pred, gold)
-
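-# e.g. (illustrative; assumes str_normalize leaves these short strings unchanged):
-#   eval_ex_match("1", "yes", question="did they win?")      -> True
-#   eval_ex_match("1", "red", question="is it red or blue?") -> True
-# ("1" aliases "yes" / the option before "or"; "0" the option after it)
-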
-
-class EvaluateTool(object):
- def __init__(self, args: dict = None):
- self.args = args
-
- def evaluate(self, preds, golds, section=None):
- summary = {}
- exact_scores = {}
- f1_scores = {}
- denotation_scores = {}
- for pred, gold in zip(preds, golds):
- qas_id = gold["id"]
- gold_answers = [gold["answer_text"]]
-
- exact_scores[qas_id] = max(compute_exact(a, pred) for a in gold_answers)
- f1_scores[qas_id] = max(compute_f1(a, pred) for a in gold_answers)
- denotation_scores[qas_id] = max(
- eval_ex_match(a, pred, question=gold["question"]) for a in gold_answers
- )
-
- total = len(golds)
- qid_list = list(exact_scores.keys())
-
- summary["acc"] = sum(exact_scores[k] for k in qid_list) / total
- summary["f1"] = sum(f1_scores[k] for k in qid_list) / total
- summary["denotation_acc"] = sum(denotation_scores[k] for k in qid_list) / total
- return summary
-
-
-if __name__ == "__main__":
- evaluator = EvaluateTool()
- evaluator.evaluate(preds=["1"], golds=[{"answer_text": "1", "id": 0}])
- print()
diff --git a/research/metrics/hybridqa/hybridqa.py b/research/metrics/hybridqa/hybridqa.py
deleted file mode 100644
index c2bbff6c..00000000
--- a/research/metrics/hybridqa/hybridqa.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from typing import Optional, Union
-import datasets
-
-try:
- from .evaluator import EvaluateTool
-except ImportError:
- from research.metrics.hybridqa.evaluator import EvaluateTool
-
-# Find for instance the citation on arxiv or on the dataset repo/website
-_CITATION = """\
-@article{chen2020hybridqa,
- title={HybridQA: A Dataset of Multi-Hop Question Answering over Tabular and Textual Data},
- author={Chen, Wenhu and Zha, Hanwen and Chen, Zhiyu and Xiong, Wenhan and Wang, Hong and Wang, William},
- journal={Findings of EMNLP 2020},
- year={2020}
-}
-"""
-
-_DESCRIPTION = """\
-Existing question answering datasets focus on dealing with homogeneous information, based either only on text or KB/Table information alone. However, as human knowledge is distributed over heterogeneous forms, using homogeneous information alone might lead to severe coverage problems. To fill in the gap, we present HybridQA, a new large-scale question-answering dataset that requires reasoning on heterogeneous information. Each question is aligned with a Wikipedia table and multiple free-form corpora linked with the entities in the table. The questions are designed to aggregate both tabular information and text information, i.e., lack of either form would render the question unanswerable. We test with three different models: 1) a table-only model. 2) text-only model. 3) a hybrid model that combines heterogeneous information to find the answer. The experimental results show that the EM scores obtained by two baselines are below 20%, while the hybrid model can achieve an EM over 40%. This gap suggests the necessity to aggregate heterogeneous information in HybridQA. However, the hybrid model’s score is still far behind human performance. Hence, HybridQA can serve as a challenging benchmark to study question answering with heterogeneous information.
-"""
-
-_HOMEPAGE = "https://hybridqa.github.io/"
-
-_LICENSE = "CC-BY-SA-4.0 License"
-
-
-@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION)
-class HybridQA(datasets.Metric):
- def __init__(
- self,
- config_name: Optional[str] = None,
- keep_in_memory: bool = False,
- cache_dir: Optional[str] = None,
- num_process: int = 1,
- process_id: int = 0,
- seed: Optional[int] = None,
- experiment_id: Optional[str] = None,
- max_concurrent_cache_files: int = 10000,
- timeout: Union[int, float] = 100,
- **kwargs
- ):
- super().__init__(
- config_name=config_name,
- keep_in_memory=keep_in_memory,
- cache_dir=cache_dir,
- num_process=num_process,
- process_id=process_id,
- seed=seed,
- experiment_id=experiment_id,
- max_concurrent_cache_files=max_concurrent_cache_files,
- timeout=timeout,
- **kwargs
- )
- self.evaluator = EvaluateTool()
-
- def _info(self):
- return datasets.MetricInfo(
- description=_DESCRIPTION,
- citation=_CITATION,
- features=datasets.Features(
- {
- "predictions": datasets.Value("string"),
- "references": datasets.features.Features(
- {
- "answer_text": datasets.Value("string"),
- "id": datasets.Value("string"),
- "question": datasets.Value("string"),
- }
- ),
- }
- ),
- reference_urls=[""],
- )
-
- def _compute(self, predictions, references, **kwargs):
- assert len(predictions) == len(references)
- return self.evaluator.evaluate(predictions, references)
diff --git a/research/metrics/ottqa/evaluator.py b/research/metrics/ottqa/evaluator.py
deleted file mode 100644
index 8d5494b3..00000000
--- a/research/metrics/ottqa/evaluator.py
+++ /dev/null
@@ -1,163 +0,0 @@
-import re
-import collections
-import string
-from ...utils.normalizer import str_normalize
-from ..wikitq.evaluator import to_value_list, check_denotation
-
-# copied from https://github.com/wenhuchen/OTT-QA/blob/master/evaluate_script.py
-
-
-def normalize_answer(s):
- """Lower text and remove punctuation, articles and extra whitespace."""
-
- def remove_articles(text):
- regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
- return re.sub(regex, " ", text)
-
- def white_space_fix(text):
- return " ".join(text.split())
-
- def remove_punc(text):
- exclude = set(string.punctuation)
- return "".join(ch for ch in text if ch not in exclude)
-
- def lower(text):
- return text.lower()
-
- return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def get_tokens(s):
- if not s:
- return []
- return normalize_answer(s).split()
-
-
-def compute_exact(a_gold, a_pred):
- return int(normalize_answer(a_gold) == normalize_answer(a_pred))
-
-
-def compute_f1(a_gold, a_pred):
- gold_toks = get_tokens(a_gold)
- pred_toks = get_tokens(a_pred)
- common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
- num_same = sum(common.values())
- if len(gold_toks) == 0 or len(pred_toks) == 0:
- # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
- return int(gold_toks == pred_toks)
- if num_same == 0:
- return 0
- precision = 1.0 * num_same / len(pred_toks)
- recall = 1.0 * num_same / len(gold_toks)
- f1 = (2 * precision * recall) / (precision + recall)
- return f1
-
-
-def eval_ex_match(pred, gold, allow_semantic=True, question=None):
- """Taken from the Binder codebase, used to evaluate WikiTQ dataset."""
- if not isinstance(pred, list):
- pred = [pred]
- gold = [gold]
-
- pred = [str(p).lower().strip() for p in pred]
- gold = [str(g).lower().strip() for g in gold]
- if not allow_semantic:
- # WikiTQ eval w. string normalization using recognizer
- pred = [str_normalize(span) for span in pred]
- gold = [str_normalize(span) for span in gold]
- pred = to_value_list(pred)
- gold = to_value_list(gold)
- return check_denotation(pred, gold)
- else:
- assert isinstance(question, str)
-        question = re.sub(r"\s+", " ", question).strip().lower()
- pred = [str_normalize(span) for span in pred]
- gold = [str_normalize(span) for span in gold]
- pred = sorted(list(set(pred)))
- gold = sorted(list(set(gold)))
- # (1) 0 matches 'no', 1 matches 'yes'; 0 matches 'more', 1 matches 'less', etc.
- if len(pred) == 1 and len(gold) == 1:
- if (pred[0] == "0" and gold[0] == "no") or (
- pred[0] == "1" and gold[0] == "yes"
- ):
- return True
- question_tokens = question.split()
- try:
- pos_or = question_tokens.index("or")
- token_before_or, token_after_or = (
- question_tokens[pos_or - 1],
- question_tokens[pos_or + 1],
- )
- if (pred[0] == "0" and gold[0] == token_after_or) or (
- pred[0] == "1" and gold[0] == token_before_or
- ):
- return True
- except Exception:
- pass
- # (2) Number value (allow units) and Date substring match
- if len(pred) == 1 and len(gold) == 1:
-            NUMBER_UNITS_PATTERN = re.compile(
-                r"^\$*[+-]?([0-9]*[.])?[0-9]+(\s*%*|\s+\w+)$"
-            )
-            DATE_PATTERN = re.compile(
-                r"[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}\s*([0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2})?"
-            )
-            DURATION_PATTERN = re.compile(r"(P|PT)(\d+)(Y|M|D|H|S)")
- p, g = pred[0], gold[0]
- # Restore `duration` type, e.g., from 'P3Y' -> '3'
- if re.match(DURATION_PATTERN, p):
- p = re.match(DURATION_PATTERN, p).group(2)
- if re.match(DURATION_PATTERN, g):
- g = re.match(DURATION_PATTERN, g).group(2)
- match = False
- num_flag, date_flag = False, False
- # Number w. unit match after string normalization.
- # Either pred or gold being number w. units suffices it.
- if re.match(NUMBER_UNITS_PATTERN, p) or re.match(NUMBER_UNITS_PATTERN, g):
- num_flag = True
- # Date match after string normalization.
- # Either pred or gold being date suffices it.
- if re.match(DATE_PATTERN, p) or re.match(DATE_PATTERN, g):
- date_flag = True
- if num_flag:
- p_set, g_set = set(p.split()), set(g.split())
- if p_set.issubset(g_set) or g_set.issubset(p_set):
- match = True
- if date_flag:
- p_set, g_set = set(p.replace("-", " ").split()), set(
- g.replace("-", " ").split()
- )
- if p_set.issubset(g_set) or g_set.issubset(p_set):
- match = True
- if match:
- return True
- pred = to_value_list(pred)
- gold = to_value_list(gold)
- return check_denotation(pred, gold)
-
-
-class EvaluateTool(object):
- def __init__(self, args=None):
- self.args = args
-
- def evaluate(self, preds, golds, section=None):
- summary = {}
- exact_scores = {}
- f1_scores = {}
- denotation_scores = {}
- for pred, gold in zip(preds, golds):
- qas_id = gold["id"]
- gold_answers = [gold["answer_text"]]
-
- exact_scores[qas_id] = max(compute_exact(a, pred) for a in gold_answers)
- f1_scores[qas_id] = max(compute_f1(a, pred) for a in gold_answers)
- denotation_scores[qas_id] = max(
- eval_ex_match(a, pred, question=gold["question"]) for a in gold_answers
- )
- total = len(golds)
- qid_list = list(exact_scores.keys())
-
- summary["exact"] = sum(exact_scores[k] for k in qid_list) / total
- summary["f1"] = sum(f1_scores[k] for k in qid_list) / total
- summary["denotation_acc"] = sum(denotation_scores[k] for k in qid_list) / total
- return summary
diff --git a/research/metrics/ottqa/ottqa.py b/research/metrics/ottqa/ottqa.py
deleted file mode 100644
index 091b84c1..00000000
--- a/research/metrics/ottqa/ottqa.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from typing import Optional, Union
-import datasets
-
-try:
- from .evaluator import EvaluateTool
-except ImportError:
- from research.metrics.ottqa.evaluator import EvaluateTool
-
-_CITATION = """\
-@article{chen2020open,
- title={Open question answering over tables and text},
- author={Chen, Wenhu and Chang, Ming-Wei and Schlinger, Eva and Wang, William and Cohen, William W},
- journal={arXiv preprint arXiv:2010.10439},
- year={2020}
-}
-"""
-
-_DESCRIPTION = """\
-This dataset is obtained from the official release of OTT-QA.
-"""
-
-
-@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION)
-class OTTQA(datasets.Metric):
- def __init__(
- self,
- config_name: Optional[str] = None,
- keep_in_memory: bool = False,
- cache_dir: Optional[str] = None,
- num_process: int = 1,
- process_id: int = 0,
- seed: Optional[int] = None,
- experiment_id: Optional[str] = None,
- max_concurrent_cache_files: int = 10000,
- timeout: Union[int, float] = 100,
- **kwargs
- ):
- super().__init__(
- config_name=config_name,
- keep_in_memory=keep_in_memory,
- cache_dir=cache_dir,
- num_process=num_process,
- process_id=process_id,
- seed=seed,
- experiment_id=experiment_id,
- max_concurrent_cache_files=max_concurrent_cache_files,
- timeout=timeout,
- **kwargs
- )
- self.evaluator = EvaluateTool()
-
- def _info(self):
- return datasets.MetricInfo(
- description=_DESCRIPTION,
- citation=_CITATION,
- features=datasets.Features(
- {
- "predictions": datasets.Value("string"),
- "references": datasets.features.Features(
- {
- "answer_text": datasets.Value("string"),
- "id": datasets.Value("string"),
- "question": datasets.Value("string"),
- }
- ),
- }
- ),
- reference_urls=[""],
- )
-
- def _compute(self, predictions, references, **kwargs):
- assert len(predictions) == len(references)
- return self.evaluator.evaluate(preds=predictions, golds=references)
diff --git a/research/metrics/wikitq/evaluator.py b/research/metrics/wikitq/evaluator.py
deleted file mode 100644
index 6a74863c..00000000
--- a/research/metrics/wikitq/evaluator.py
+++ /dev/null
@@ -1,369 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""Official Evaluator for WikiTableQuestions Dataset
-
-There are 3 value types
-1. String (unicode)
-2. Number (float)
-3. Date (a struct with 3 fields: year, month, and date)
- Some fields (but not all) can be left unspecified. However, if only the year
- is specified, the date is automatically converted into a number.
-
-Target denotation = a set of items
-- Each item T is a raw unicode string from Mechanical Turk
-- If T can be converted to a number or date (via Stanford CoreNLP), the
- converted value (number T_N or date T_D) is precomputed
-
-Predicted denotation = a set of items
-- Each item P is a string, a number, or a date
-- If P is read from a text file, assume the following
- - A string that can be converted into a number (float) is converted into a
- number
- - A string of the form "yyyy-mm-dd" is converted into a date. Unspecified
- fields can be marked as "xx". For example, "xx-01-02" represents the date
- January 2nd of an unknown year.
- - Otherwise, it is kept as a string
-
-The predicted denotation is correct if
-1. The sizes of the target denotation and the predicted denotation are equal
-2. Each item in the target denotation matches an item in the predicted
- denotation
-
-A target item T matches a predicted item P if one of the following is true:
-1. normalize(raw string of T) and normalize(string form of P) are identical.
- The normalize method performs the following normalizations on strings:
- - Remove diacritics (é → e)
- - Convert smart quotes (‘’´`“”) and dashes (‐‑‒–—−) into ASCII ones
- - Remove citations (trailing •♦†‡*#+ or [...])
- - Remove details in parenthesis (trailing (...))
- - Remove outermost quotation marks
- - Remove trailing period (.)
- - Convert to lowercase
- - Collapse multiple whitespaces and strip outermost whitespaces
-2. T can be interpreted as a number T_N, P is a number, and P = T_N
-3. T can be interpreted as a date T_D, P is a date, and P = T_D
- (exact match on all fields; e.g., xx-01-12 and 1990-01-12 do not match)
-"""
-__version__ = "1.0.2"
-
-import re
-import unicodedata
-from math import isnan, isinf
-from abc import ABCMeta, abstractmethod
-
-
-################ String Normalization ################
-
-
-def normalize(x):
- if not isinstance(x, str):
- x = x.decode("utf8", errors="ignore")
- # Remove diacritics
- x = "".join(
- c for c in unicodedata.normalize("NFKD", x) if unicodedata.category(c) != "Mn"
- )
- # Normalize quotes and dashes
- x = re.sub(r"[‘’´`]", "'", x)
- x = re.sub(r"[“”]", '"', x)
- x = re.sub(r"[‐‑‒–—−]", "-", x)
- while True:
- old_x = x
- # Remove citations
- x = re.sub(r"((? backslash + n
- vertical bar (0x7C) -> backslash + p
- backslash (0x5C) -> backslash + backslash
-
- Args:
- x (str or unicode)
- Returns:
- a unicode
- """
- return x.replace(r"\n", "\n").replace(r"\p", "|").replace("\\\\", "\\")
-
-
-def tsv_unescape_list(x):
- """Unescape a list in the TSV file.
-    List items are joined with vertical bars (0x7C)
-
- Args:
- x (str or unicode)
- Returns:
- a list of unicodes
- """
- return [tsv_unescape(y) for y in x.split("|")]
-
-
-def main():
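-# e.g. (illustrative): tsv_unescape(r"a\nb\pc") -> "a\nb|c", and
-# tsv_unescape_list("x|y") -> ["x", "y"]
-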
- pred_answer = ["ABC"]
- gold_answer = ["Abc"]
- pred_answer_val = to_value_list(pred_answer)
- gold_answer_val = to_value_list(gold_answer)
- correct = check_denotation(pred_answer_val, gold_answer_val)
- print(pred_answer_val)
- print(gold_answer_val)
- print(correct)
-
-
-if __name__ == "__main__":
- main()
diff --git a/research/metrics/wikitq/wikitq.py b/research/metrics/wikitq/wikitq.py
deleted file mode 100644
index 064119b1..00000000
--- a/research/metrics/wikitq/wikitq.py
+++ /dev/null
@@ -1,191 +0,0 @@
-"""Spider concept metrics."""
-
-from typing import Optional, Union
-import re
-import datasets
-
-try:
- from .evaluator import to_value_list, check_denotation
- from ...utils.normalizer import str_normalize
-except ImportError:
- from research.metrics.wikitq.evaluator import to_value_list, check_denotation
- from research.utils.normalizer import str_normalize
-
-# Find for instance the citation on arxiv or on the dataset repo/website
-_CITATION = """\
-@inproceedings{pasupat-liang-2015-compositional,
- title = "Compositional Semantic Parsing on Semi-Structured Tables",
- author = "Pasupat, Panupong and
- Liang, Percy",
- booktitle = "Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
- month = jul,
- year = "2015",
- address = "Beijing, China",
- publisher = "Association for Computational Linguistics",
- url = "https://aclanthology.org/P15-1142",
- doi = "10.3115/v1/P15-1142",
- pages = "1470--1480",
-}
-"""
-
-_DESCRIPTION = """\
-Two important aspects of semantic parsing for question answering are the breadth of the knowledge source and the depth of
-logical compositionality. While existing work trades off one aspect for another, this paper simultaneously makes progress
-on both fronts through a new task: answering complex questions on semi-structured tables using question-answer pairs as
-supervision. The central challenge arises from two compounding factors: the broader domain results in an open-ended set
-of relations, and the deeper compositionality results in a combinatorial explosion in the space of logical forms. We
-propose a logical-form driven parsing algorithm guided by strong typing constraints and show that it obtains significant
- improvements over natural baselines. For evaluation, we created a new dataset of 22,033 complex questions on Wikipedia
- tables, which is made publicly available.
-"""
-
-_HOMEPAGE = "https://ppasupat.github.io/WikiTableQuestions/"
-
-_LICENSE = "CC-BY-SA-4.0 License"
-
-
-@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION)
-class WikiTableQuestion(datasets.Metric):
- def __init__(
- self,
- config_name: Optional[str] = None,
- keep_in_memory: bool = False,
- cache_dir: Optional[str] = None,
- num_process: int = 1,
- process_id: int = 0,
- seed: Optional[int] = None,
- experiment_id: Optional[str] = None,
- max_concurrent_cache_files: int = 10000,
- timeout: Union[int, float] = 100,
- **kwargs
- ):
- super().__init__(
- config_name=config_name,
- keep_in_memory=keep_in_memory,
- cache_dir=cache_dir,
- num_process=num_process,
- process_id=process_id,
- seed=seed,
- experiment_id=experiment_id,
- max_concurrent_cache_files=max_concurrent_cache_files,
- timeout=timeout,
- **kwargs
- )
-
- def _info(self):
- return datasets.MetricInfo(
- description=_DESCRIPTION,
- citation=_CITATION,
- features=datasets.Features(
- {
- "predictions": datasets.features.Sequence(datasets.Value("string")),
- "references": datasets.features.Features(
- {
- "answer_text": datasets.features.Sequence(
- datasets.Value("string")
- ),
- "question": datasets.Value("string"),
- }
- ),
- }
- ),
- reference_urls=[""],
- )
-
- @staticmethod
- def eval_ex_match(pred, gold, allow_semantic=True, question=None):
- if not isinstance(pred, list):
- pred = [pred]
- gold = [gold]
-
- pred = [str(p).lower().strip() for p in pred]
- gold = [str(g).lower().strip() for g in gold]
- if not allow_semantic:
- # WikiTQ eval w. string normalization using recognizer
- pred = [str_normalize(span) for span in pred]
- gold = [str_normalize(span) for span in gold]
- pred = to_value_list(pred)
- gold = to_value_list(gold)
- return check_denotation(pred, gold)
- else:
- assert isinstance(question, str)
- question = re.sub("\s+", " ", question).strip().lower()
- pred = [str_normalize(span) for span in pred]
- gold = [str_normalize(span) for span in gold]
- pred = sorted(list(set(pred)))
- gold = sorted(list(set(gold)))
- # (1) 0 matches 'no', 1 matches 'yes'; for an "X or Y" question,
- # 0 matches the option after 'or' and 1 the option before it.
- if len(pred) == 1 and len(gold) == 1:
- if (pred[0] == "0" and gold[0] == "no") or (
- pred[0] == "1" and gold[0] == "yes"
- ):
- return True
- question_tokens = question.split()
- try:
- pos_or = question_tokens.index("or")
- token_before_or, token_after_or = (
- question_tokens[pos_or - 1],
- question_tokens[pos_or + 1],
- )
- if (pred[0] == "0" and gold[0] == token_after_or) or (
- pred[0] == "1" and gold[0] == token_before_or
- ):
- return True
- except Exception:
- pass
- # (2) Number value (allow units) and Date substring match
- if len(pred) == 1 and len(gold) == 1:
- NUMBER_UNITS_PATTERN = re.compile(
- r"^\$*[+-]?([0-9]*[.])?[0-9]+(\s*%*|\s+\w+)$"
- )
- DATE_PATTERN = re.compile(
- r"[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}\s*([0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2})?"
- )
- DURATION_PATTERN = re.compile(r"(P|PT)(\d+)(Y|M|D|H|S)")
- p, g = pred[0], gold[0]
- # Restore `duration` type, e.g., from 'P3Y' -> '3'
- if re.match(DURATION_PATTERN, p):
- p = re.match(DURATION_PATTERN, p).group(2)
- if re.match(DURATION_PATTERN, g):
- g = re.match(DURATION_PATTERN, g).group(2)
- match = False
- num_flag, date_flag = False, False
- # Number-with-units match after string normalization;
- # it suffices for either pred or gold to be a number with units.
- if re.match(NUMBER_UNITS_PATTERN, p) or re.match(
- NUMBER_UNITS_PATTERN, g
- ):
- num_flag = True
- # Date match after string normalization;
- # it suffices for either pred or gold to be a date.
- if re.match(DATE_PATTERN, p) or re.match(DATE_PATTERN, g):
- date_flag = True
- if num_flag:
- p_set, g_set = set(p.split()), set(g.split())
- if p_set.issubset(g_set) or g_set.issubset(p_set):
- match = True
- if date_flag:
- p_set, g_set = set(p.replace("-", " ").split()), set(
- g.replace("-", " ").split()
- )
- if p_set.issubset(g_set) or g_set.issubset(p_set):
- match = True
- if match:
- return True
- pred = to_value_list(pred)
- gold = to_value_list(gold)
- return check_denotation(pred, gold)
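-
- # Illustrative behavior of eval_ex_match (added sketch; assumes str_normalize
- # leaves these short spans unchanged): heuristic (1) lets a binary prediction
- # match the options of an "X or Y" question, e.g.
- #   eval_ex_match("1", "one", allow_semantic=True,
- #                 question="did they score one or two goals?")  # -> True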
-
- def _compute(self, predictions, references, allow_semantic: bool = True, **kwargs):
- assert len(predictions) == len(references)
- n_total_samples = len(predictions)
- n_correct_samples = 0
- for pred, ref in zip(predictions, references):
- score = self.eval_ex_match(
- pred=pred,
- gold=ref["answer_text"],
- allow_semantic=allow_semantic,
- question=ref["question"],
- )
- n_correct_samples += score
- return n_correct_samples / n_total_samples
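-
-
-# Illustrative usage (added; follows the deprecated `load_metric` pattern used
-# by this repo's notebooks):
-#   metric = datasets.load_metric(path="./metrics/wikitq/")
-#   acc = metric.compute(
-#       predictions=[["2"]],
-#       references=[{"answer_text": ["2"], "question": "how many goals were scored?"}],
-#   )  # -> fraction of examples whose denotations match (here 1.0)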
diff --git a/research/notebooks/hybridqa-explore.ipynb b/research/notebooks/hybridqa-explore.ipynb
deleted file mode 100644
index 46a5eac1..00000000
--- a/research/notebooks/hybridqa-explore.ipynb
+++ /dev/null
@@ -1,178 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "collapsed": true,
- "ExecuteTime": {
- "end_time": "2024-04-12T18:12:40.638283Z",
- "start_time": "2024-04-12T18:12:39.955076Z"
- }
- },
- "outputs": [],
- "source": [
- "import json\n",
- "from recognizers_number import recognize_number\n",
- "from recognizers_date_time import recognize_datetime, Culture\n",
- "import re"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "outputs": [],
- "source": [
- "with open(\"../outputs/new-paper-results/hybridqa/gpt-4-blendsql-only-run5/rerun-predictions.json\", \"r\") as f:\n",
- " predictions = json.load(f)"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T18:11:04.473632Z",
- "start_time": "2024-04-12T18:11:04.414918Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "101 natural language numbers\n",
- "697 digit numbers\n"
- ]
- }
- ],
- "source": [
- "nl_answers = []\n",
- "digit_answers = []\n",
- "money_answers = []\n",
- "for item in predictions:\n",
- " answer = item[\"gold_answer\"]\n",
- " if len(recognize_number(answer, Culture.English)) > 0:\n",
- " if not re.search('\\d+', answer):\n",
- " nl_answers.append(answer)\n",
- " else:\n",
- " if not recognize_datetime(answer, Culture.English):\n",
- " digit_answers.append(answer)\n",
- " if \"$\" in answer:\n",
- " money_answers.append(answer)\n",
- "print(f\"{len(nl_answers)} natural language numbers\")\n",
- "print(f\"{len(digit_answers)} digit numbers\")"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T18:17:49.003375Z",
- "start_time": "2024-04-12T18:17:39.481520Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "outputs": [
- {
- "data": {
- "text/plain": "['$ 37 billion',\n '$ 20 million',\n '$ 30,689',\n '$ 375,000',\n '$ 116,230',\n '$ 103.2 million',\n '$ 1.59 billion',\n '$ 240',\n '$ 450,000',\n '$ 1.4 billion',\n 'US $ 476 million',\n '$ 1887/-',\n '$ 5,490,187',\n '$ 675,000',\n '$ 900',\n '$ 90 million']"
- },
- "execution_count": 52,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "money_answers"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T18:17:49.947237Z",
- "start_time": "2024-04-12T18:17:49.941482Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "outputs": [
- {
- "data": {
- "text/plain": "['five',\n 'seven times',\n 'Seven',\n 'two',\n 'three campuses',\n 'Three',\n 'over one million people',\n 'than twenty Brahmanas',\n 'five',\n 'Seven',\n 'Three',\n 'Four',\n 'number one',\n 'ten',\n 'Four',\n 'Murdered his wife , mother-in-law and three children',\n 'three',\n 'twelve',\n 'a quarter',\n 'two',\n 'nine',\n 'Nine federated states',\n 'four',\n 'fourteen',\n 'six',\n 'five',\n 'three',\n 'twelve',\n 'five',\n 'Big Ten',\n 'thirteen',\n 'two',\n 'two',\n 'nine',\n 'four teams',\n \"One , B'Day was her second\",\n 'three',\n 'Twenty',\n 'ten',\n 'four',\n 'Three Great Gardens of Japan',\n 'three',\n 'number one',\n 'nine',\n 'seven',\n 'Two miles',\n 'six',\n 'fourteen',\n 'one',\n 'fourteen',\n 'Twenty Plenty outlet',\n 'Three',\n 'two',\n 'two years',\n 'four',\n 'five',\n 'Two',\n 'two',\n 'five friends',\n 'Network Ten',\n 'two-thirds',\n 'one',\n 'twenty-three',\n 'seven',\n 'six',\n 'every four years',\n 'three',\n 'twenty-six',\n 'four',\n 'three',\n 'four times',\n 'eighteen',\n 'two',\n 'nine',\n 'five million',\n 'twelve',\n 'one-day Olympics',\n 'eight',\n 'three disciplines',\n 'Want One',\n 'thirty-two',\n 'two',\n 'three years',\n 'six',\n 'six Primetime Emmy Awards',\n 'three',\n 'six',\n 'seven million',\n 'Sixteen in total',\n 'nine members',\n 'Twenty Twenty',\n 'five',\n 'two',\n 'Seven Beauties',\n 'The process of digestion has three stages',\n 'nine',\n 'six',\n 'five',\n 'four',\n 'three',\n 'three']"
- },
- "execution_count": 50,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "nl_answers"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T18:16:02.922668Z",
- "start_time": "2024-04-12T18:16:02.917306Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "outputs": [
- {
- "data": {
- "text/plain": "['503 Peeples Street SW',\n '32',\n '3',\n '524 km',\n '2',\n '$ 37 billion',\n '5270',\n '75 million',\n '139.25',\n '3.37',\n '9',\n '47 acres',\n '28,108',\n '2,549',\n '4',\n '34,932',\n '24',\n '76101-054',\n '11 players',\n 'Masheka Mogilev 2',\n '2,500',\n '9.412',\n '20 million albums',\n '84',\n '$ 20 million',\n '2,448 km2',\n '171',\n '1.1',\n '185,038',\n 'No . 1',\n '342',\n '60',\n '6',\n '274,905',\n '4 million',\n '30 km',\n '3',\n '37.3M',\n '5,230,330',\n '5',\n '37',\n '9× Platinum',\n '5,000',\n '9,461,105',\n '5,400',\n '10',\n '2',\n '2 hourly',\n '580,367 square kilometres ( 224,081 sq mi )',\n '2',\n '8000 BC',\n 'US 20',\n '47',\n '9 Watergate Street',\n '21,397,000',\n '23',\n 'Volume 1',\n 'United States Army UH-60 Black Hawk',\n '7.48',\n 'Proposition 13',\n '124-acre',\n 'from 237 to 227 million years ago',\n '407',\n '3',\n '32',\n '65',\n '3',\n '6,000',\n '15',\n '7',\n '3,674',\n '15',\n '36',\n '566',\n '107 Nobel laureates',\n '58',\n '12,960',\n 'Red 121 , White 167',\n '88',\n '40',\n '306',\n '25',\n '2,514',\n 'CR 202',\n '308,580 hectares',\n '586',\n 'Thailand Route 4106',\n '15',\n '18',\n '7',\n '1-2',\n '250',\n '16',\n '3',\n '12 seasons',\n '15,941,000',\n '36,969',\n '18,000',\n \"Men 's 50 m backstroke\",\n '46',\n '11,249-capacity',\n '261',\n '1.5 million',\n '354 cc',\n '300-foot',\n '25,062',\n 'under-48-kilogram',\n '16 kilometres',\n 'Over 40%',\n '11,200',\n '38',\n '4',\n '5,527',\n '16',\n 'Highway 2',\n \"Half of the world 's 20 largest drugmakers\",\n '3.1 million hectares',\n '24+',\n '4',\n '3',\n '7',\n '5343 Southeast Abshier Boulevard',\n '522.59 km',\n '12.5 km',\n '39,519,400',\n 'No . 2',\n '30',\n '2',\n '$ 30,689',\n '# 70000667',\n '111',\n '56°43′55″N 2°39′42″W / 56.73194°N 2.66167°W / 56.73194 ; -2.66167 ( Brechin Cathedral )',\n '121',\n '4,612 km²',\n '1,163',\n '30',\n '1,500-meter freestyle',\n '338',\n '38.5',\n '$ 375,000',\n '19',\n '22',\n '25',\n '1',\n 'NA-13 ( Mansehra-I )',\n '16',\n '85 million',\n '0',\n '46.15%',\n '149.4',\n '47,774',\n '9.98',\n '15',\n '7',\n '0-11 ( 11 )',\n '151,801',\n '80',\n '3,800',\n '44634',\n '22,000',\n '3',\n '79112-054',\n '2',\n '12,711',\n '22',\n '3,000',\n '5',\n '56 square miles',\n '36,474',\n '17',\n '26',\n '24 miles',\n '27',\n 'FR-07',\n '3',\n '0.3',\n '26 million',\n '750',\n '16',\n '10',\n '5',\n '46',\n 'batting .358',\n '21',\n '76',\n '6,863',\n '17',\n '10,888',\n '6',\n '12',\n '631,486',\n '2,138',\n '4',\n 'The city was initially founded between 1771 and 1776',\n '30°41′22″N 88°2′37″W / 30.68944°N 88.04361°W / 30.68944 ; -88.04361 ( Government Street Presbyterian Church )',\n 'K League 1',\n '3,000',\n 'the National Premier Leagues Victoria Division 1',\n '36',\n '10',\n '2500',\n '44',\n '558,797',\n 'nearly 600',\n '2',\n '37',\n '15',\n '3.8 million square miles',\n '1.12 square miles',\n 'Amalgam Comics # 1',\n '4',\n '$ 116,230',\n '19',\n '6',\n '4.05',\n '60',\n '24,000',\n '3',\n '4',\n '2 million',\n 'Cyborg 009 : Legend of the Super Galaxy',\n '4',\n \"Men 's freestyle 74 kg\",\n '77',\n '40-0',\n '95550',\n '125 metres',\n 'KNM-LT 329',\n '4,298,655',\n '707 ha',\n '80827',\n '2',\n '146,100 square kilometres ( 56,400 sq mi )',\n '1',\n '2,766.6',\n '3,500',\n '16',\n 'KVVV 15',\n '0',\n '0',\n 'CR 43',\n '3',\n '5.3 miles',\n 'Guitar Hero Track Pack 1',\n '2,168',\n '6,211',\n '13.75',\n 'roughly 400',\n '4,935',\n '6',\n '17.34 million',\n '1,800',\n '3',\n '2',\n '51%',\n '2,411',\n '378',\n '5',\n '5-2',\n '46,800',\n 
'37.9 million',\n 'Over 40%',\n '4',\n '79,343',\n '130 km',\n '5,000',\n '114',\n '86',\n '20',\n '19',\n '6',\n '19.6',\n '72,465',\n '# 06000989',\n '120 steps per minute',\n '637.4 square miles',\n '6 ft 8\\u200b3⁄4 in',\n '17',\n '51,700',\n '3,857 km2 ( 1,489 sq mi )',\n '4',\n '230 square kilometres',\n '2,000 kilometres',\n '1.4-acre ( 0.57 ha )',\n '20,000',\n \"Men 's 10 m platform\",\n '47',\n 'No . 1',\n 'League 1',\n '15 miles',\n '80-inch',\n '83,500',\n '11',\n '34',\n '3,905',\n '1,884',\n '24.3 km',\n '1.8 million square kilometres',\n \"Men 's 200 m\",\n '37,188',\n '649,021',\n '491,918',\n '2',\n '84,298',\n '3000 m steeplechase',\n '18 AF/AFTRANS',\n '1',\n '1008211',\n 'U.S . Routes 20 and 169',\n '100 Philip Aziz Avenue',\n '6,763',\n '11,406',\n '6',\n '13',\n '80,093',\n '47.6 million people',\n '1,550 m',\n '1,864',\n '7',\n '310',\n '13,486',\n '8,055',\n '3,550',\n '3',\n '20,000',\n '10',\n '38°03′25″N 83°56′26″W / 38.056944°N 83.940556°W / 38.056944 ; -83.940556 ( Methodist Episcopal Church South )',\n '4.5 million',\n '17',\n '17',\n '203',\n '58.5',\n '8',\n '11 miles',\n '2.6',\n '24-32 ( 10-20 )',\n '2,317,105',\n '55°56′58″N 3°11′27″W / 55.94944°N 3.19083°W / 55.94944 ; -3.19083 ( St. Giles Cathedral )',\n '2',\n '8.84',\n '40',\n '€757,3 million',\n '# 76001760',\n '3',\n 'USD 2 Million',\n '68',\n '9',\n '9',\n '253',\n '18',\n 'More than 11,000',\n 'A1A-2',\n '542,868',\n '65 million',\n '6',\n '15',\n '450,000',\n '82',\n '2.1',\n '7,500',\n '2',\n '3',\n '2.11',\n '103,608',\n '82,418',\n '23,450㎡',\n '20%',\n '20',\n '7.4 million',\n 'Airbus A319-100',\n 'more than 19,000 students',\n '1-20 ( 23 )',\n 'slight frame , weighing just 70 kilograms',\n '8.1 percent',\n '82.684 miles',\n '760',\n '50,000 square meters',\n '7,866',\n '4',\n '22-16-3',\n '5',\n '3',\n '9',\n '12',\n '143',\n '2',\n '34,000',\n '7,500',\n '6',\n '8',\n \"Women 's freestyle 48 kg\",\n '2,954 meters',\n '3,776.24 m',\n '2',\n '10',\n '25,000',\n '2',\n '1 ( 120 )',\n '51 feet long',\n '6',\n '2',\n '$ 103.2 million',\n '968',\n '2.09',\n '25',\n '55',\n '572',\n 'CR 132',\n '20%',\n '505,526',\n '9,000',\n '4',\n '60',\n '9',\n '29 teams from the United States and 1 from Canada',\n '5.6 million',\n '55,000',\n '90',\n '$ 1.59 billion',\n '22,970 square kilometres',\n '47,803.7',\n '4 million',\n '128,624 inhabitants',\n 'between 14.5 and 17.4 million',\n '7,500',\n '5',\n '570',\n 'between 18 and 24° C',\n '2.6 nautical miles',\n '44 races',\n '81',\n '17',\n '2',\n '2',\n '50,782',\n '250 feet',\n '76 points',\n '715 miles',\n 'Thai League 2',\n '634,300',\n '979',\n '3',\n '10 miles',\n '2,193',\n '512358',\n '8',\n '1',\n 'between 60 and 64 kilograms',\n '3 ft 4 in',\n '10,768',\n '5',\n '18',\n '25-44',\n '37',\n '278',\n '3 times',\n '14',\n '12',\n '26,512',\n '5608',\n '26,700',\n '35,938',\n '656-acre',\n '500 m',\n '266',\n '2',\n '7,536,383',\n '30 national championships',\n 'Highway 25',\n '4',\n '27,000',\n '17,448',\n '93,348',\n '$ 240',\n '1.428 billion',\n '368',\n 'over 500,000 copies',\n '$ 450,000',\n '19-22',\n '70',\n '4',\n '1',\n '10 km',\n '21,000',\n '104',\n '45,750',\n '$ 1.4 billion',\n '3',\n 'around 600 B.C',\n '5,735',\n '2.655 million',\n '12',\n '501.66',\n '15,133',\n '1,614',\n '5.7 square miles',\n '22.42',\n '11',\n '147',\n '# 91000775',\n '38.245',\n '7,289',\n '2',\n '5',\n '619',\n '19,000',\n '24,000',\n '15',\n '34,533',\n 'US $ 476 million',\n '110',\n '20 million',\n '5',\n '6',\n '200',\n '18',\n '903,366.2',\n '5',\n '2',\n '21 
seasons',\n '3',\n '21',\n '42',\n 'Finding Teddy 2',\n '7',\n '14',\n '309 beds',\n '1,081',\n '41°49′51″N 90°50′20″W / 41.830833°N 90.838889°W / 41.830833 ; -90.838889 ( City Hotel )',\n '9.2 million',\n '114,922',\n '7,163,416',\n '85,000',\n '99.72',\n '4',\n '4',\n '117 mi',\n '1,200',\n '4.5×40mmR',\n '506,535',\n 'Mortal Kombat 9',\n '2.9 million',\n '4,883 points',\n '3',\n '10',\n '2',\n '316,000 listeners',\n '3',\n '631,313',\n '539 BCE',\n '1,521',\n '20',\n '18.25',\n '3',\n '0',\n '88 km',\n '9000',\n '7',\n '4',\n '20,600',\n '3',\n '50 miles',\n '17-county area',\n '15',\n '0',\n '6p21.3',\n '0',\n '110',\n 'Sesame Street 4-D Movie Magic',\n 'Avengers # 9',\n '8,987',\n '1,174,209',\n '20',\n '492 m',\n '80% of its staff',\n '10',\n '17,606',\n '43',\n '8',\n '34',\n '8,500',\n '11.7',\n '15',\n '86 episodes',\n '3,933',\n '1,630',\n '2',\n '302 Calumet Avenue',\n '1 , 2 , 3',\n '581 feet',\n '65',\n '8,540',\n '13',\n '13',\n '43,013',\n '8,029',\n '107 mm',\n '$ 5,490,187',\n '742',\n '6',\n '3',\n '23',\n '26',\n '20',\n '9.5 million',\n '112,525',\n '48',\n '476,179',\n '1,178 feet ( 359 m )',\n '4',\n '$ 675,000',\n '8,020',\n '35,715',\n '142',\n '6',\n '22.6-mile-long',\n '40%',\n 'Up to 250 aircraft of all types',\n '5579 sq mi',\n '50-mile',\n '4000',\n \"6 ' 0\",\n '20',\n '16,686',\n '30',\n '2370-2371',\n '4.5 million',\n '57',\n '251 registered members',\n '7',\n '1,961,269',\n '500',\n '264,091',\n '5',\n '70',\n '2',\n '146.7 million',\n '1.47 million',\n '749',\n '5',\n '1,827,165',\n '4',\n '885,287',\n '190 miles',\n '300,000',\n '36 states',\n '$ 900',\n '50 km',\n '183.64',\n '19',\n '8.95',\n '806 km',\n '50 centimetres',\n '32',\n '10,277',\n '34,240',\n '357,386',\n '240,342',\n '8.595',\n '18',\n '10.2',\n '100 and 200 metres',\n '57,449 people',\n '12,491,300',\n '5 MW',\n '57 million yuan',\n '10,942',\n '7',\n '242',\n '2,238,537',\n '5',\n '3',\n '25',\n '74',\n '44',\n '8,000',\n '4',\n 'Bill C-168',\n '5,642',\n '9.837',\n '3,300 acres',\n '2',\n '$ 90 million',\n '9.575',\n '5,500',\n '2',\n '17,000',\n '21 Avondale St',\n '14,462',\n '4 - 3',\n '100',\n '+8.37%',\n '13',\n '17,317',\n '71',\n '10.8%',\n '14']"
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "digit_answers"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T18:15:56.691142Z",
- "start_time": "2024-04-12T18:15:56.686177Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [],
- "metadata": {
- "collapsed": false
- }
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/research/notebooks/open-model-eval.ipynb b/research/notebooks/open-model-eval.ipynb
deleted file mode 100644
index 8059db82..00000000
--- a/research/notebooks/open-model-eval.ipynb
+++ /dev/null
@@ -1,316 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "04/12/2024 13:02:54 - INFO - datasets - PyTorch version 2.2.0 available.\n",
- "04/12/2024 13:02:54 - INFO - datasets - Apache Beam version 2.50.0 available.\n"
- ]
- }
- ],
- "source": [
- "import json\n",
- "from collections import Counter\n",
- "import pandas as pd\n",
- "from research.constants import EvalField\n",
- "from research.utils.hybridqa.hybridqa import hybridqa_metric_format_func\n",
- "from datasets import load_metric\n",
- "import copy\n",
- "import re\n",
- "import seaborn as sns\n",
- "import matplotlib.pyplot as plt"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T17:03:09.544800Z",
- "start_time": "2024-04-12T17:02:40.867171Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "outputs": [],
- "source": [
- "with open(\"outputs/new-paper-results/hybridqa/gpt-4-blendsql-starcoder2/rerun-predictions.json\", \"r\") as f:\n",
- " starcoder = json.load(f)\n",
- "with open(\"outputs/new-paper-results/hybridqa/gpt-4-blendsql-only-run5/rerun-predictions.json\", \"r\") as f:\n",
- " gpt4 = json.load(f)\n",
- "with open(\"outputs/new-paper-results/hybridqa/gpt-4-blendsql-deepseek-coder/rerun-predictions.json\", \"r\") as f:\n",
- " deepseek = json.load(f)\n",
- "results = {\n",
- " \"starcoder\": starcoder,\n",
- " \"gpt4\": gpt4,\n",
- " \"deepseek\": deepseek\n",
- "}"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T17:05:13.839260Z",
- "start_time": "2024-04-12T17:05:12.915098Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/a720504/miniconda3/envs/blendsql/lib/python3.9/site-packages/datasets/load.py:848: FutureWarning: The repository for hybridqa contains custom code which must be executed to correctly load the metric. You can inspect the repository content at ./metrics/hybridqa/hybridqa.py\n",
- "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
- "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "def is_empty_blendsql_pred(p: list):\n",
- " if len(p) == 0 or (\n",
- " any(x in str(p[0]) for x in [\"table\", \"passage\", \"text\"])\n",
- " and any(x in str(p[0]) for x in [\"not\", \"empty\"])\n",
- " ) or str(p[0]).strip() == \"\":\n",
- " return True\n",
- " return False\n",
- "\n",
- "def cleanup_error(error: str, prediction: list):\n",
- " if error is None:\n",
- " if prediction in [[], ['']]:\n",
- " return \"No Results\"\n",
- " return error\n",
- " error = error.lower()\n",
- " if error == \"empty subtable passed to qaingredient!\":\n",
- " return \"No Results\"\n",
- " if error.startswith(\"the model attempted to generate\"):\n",
- " return \"Constrained Decoding Error\"\n",
- " elif any(x in error for x in {\"no such column\", \"unknown column\"}):\n",
- " return \"Hallucinated Column\"\n",
- " elif \"fts5: syntax error\" in error:\n",
- " return \"Generic Syntax Error\"\n",
- " elif \"maximum context length is\" in error:\n",
- " return \"Context Overflow\"\n",
- " elif any(x in error for x in {\"error tokenizing\", \"unbalanced parenthesis\", \"ambiguous column name\", \"syntax error\", \"alias already used\", \"unterminated subpattern\", \"invalid expression\", \"invalid format\", \"duplicate column name\"}):\n",
- " return \"Generic Syntax Error\"\n",
- " elif error.startswith(\"expecting\"):\n",
- " return \"Generic Syntax Error\"\n",
- " elif \"the response was filtered due to the prompt triggering\" in error:\n",
- " return None\n",
- " elif error[0] == \"'\" and error[-1] == \"'\":\n",
- " return \"Generic Syntax Error\"\n",
- " elif error.startswith(\"expected }\") or 'unrecognized token: \"{\"' in error:\n",
- " return \"Generic Syntax Error\"\n",
- " elif \"no such function\" in error:\n",
- " return \"Generic Syntax Error\"\n",
- " elif \"execution failed on sql\" in error:\n",
- " return \"Generic Syntax Error\"\n",
- " elif \"required keyword\" in error:\n",
- " return \"Generic Syntax Error\"\n",
- " elif re.search(r'\\{\\{[a-z]\\(\\)\\}\\}', error):\n",
- " return \"Generic Syntax Error\"\n",
- " return \"Generic Syntax Error\"\n",
- "\n",
- "m = load_metric(path=\"./metrics/hybridqa/\")\n",
- "metric_format_func = hybridqa_metric_format_func"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T17:53:53.263112Z",
- "start_time": "2024-04-12T17:53:53.239129Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/k_/rqtwz3wd3g19jrnz5dkt4d609b2zbl/T/ipykernel_68562/2857013869.py:43: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
- " m = load_metric(path=\"./metrics/hybridqa/\")\n",
- "/Users/a720504/miniconda3/envs/blendsql/lib/python3.9/site-packages/datasets/load.py:848: FutureWarning: The repository for hybridqa contains custom code which must be executed to correctly load the metric. You can inspect the repository content at ./metrics/hybridqa/hybridqa.py\n",
- "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
- "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "for name, predictions in results.items():\n",
- " print(f\"HybridQA {name} results:\")\n",
- " for item in predictions:\n",
- " _item = copy.deepcopy(item)\n",
- " if is_empty_blendsql_pred(_item[EvalField.PREDICTION]):\n",
- " _item[EvalField.PREDICTION] = \"\"\n",
- " m.add(**hybridqa_metric_format_func(item))\n",
- " print(json.dumps(m.compute(), indent=4))\n",
- " print()"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T02:55:20.541707Z",
- "start_time": "2024-04-12T02:55:20.471770Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "outputs": [],
- "source": [
- "# Create error df\n",
- "df_list = []\n",
- "for name, predictions in results.items():\n",
- " cleaned_errors = filter(lambda x: x is not None, [cleanup_error(i[\"error\"], i[EvalField.PREDICTION]) for i in predictions])\n",
- " for error_type, count in Counter(cleaned_errors).items():\n",
- " df_list.append(\n",
- " {\n",
- " \"Model\": name,\n",
- " \"Error Type\": error_type,\n",
- " \"Count\": count\n",
- " }\n",
- " )\n",
- "df = pd.DataFrame(df_list)"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T17:53:58.625037Z",
- "start_time": "2024-04-12T17:53:58.623046Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "outputs": [
- {
- "data": {
- "text/plain": " count\nmodel \ndeepseek 0.498413\ngpt4 0.234921\nstarcoder 0.552092",
- "text/html": "
\n\n
\n \n \n | \n count | \n
\n \n model | \n | \n
\n \n \n \n deepseek | \n 0.498413 | \n
\n \n gpt4 | \n 0.234921 | \n
\n \n starcoder | \n 0.552092 | \n
\n \n
\n
"
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.groupby(\"model\").sum(\"count\") / len(predictions)"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T17:50:34.061414Z",
- "start_time": "2024-04-12T17:50:34.050435Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "outputs": [
- {
- "data": {
- "text/plain": " count\nmodel \ndeepseek 0.498413\ngpt4 0.234921\nstarcoder 0.552092",
- "text/html": "\n\n
\n \n \n | \n count | \n
\n \n model | \n | \n
\n \n \n \n deepseek | \n 0.498413 | \n
\n \n gpt4 | \n 0.234921 | \n
\n \n starcoder | \n 0.552092 | \n
\n \n
\n
"
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[~df['error_type'].isin(('empty_results', 'openai_content_management', 'context_overflow'))].groupby(\"model\").sum(\"count\") / len(predictions)"
- ],
- "metadata": {
- "collapsed": false,
- "ExecuteTime": {
- "end_time": "2024-04-12T17:50:34.655884Z",
- "start_time": "2024-04-12T17:50:34.655195Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "outputs": [
- {
- "data": {
- "text/plain": "