From f259993c2464967cc500f827069289550c7ca3d5 Mon Sep 17 00:00:00 2001 From: Yingbei Date: Wed, 3 Jul 2024 17:41:53 -0700 Subject: [PATCH 1/3] test results for functionary small & medium --- docs/docs/benchmark.mdx | 2 ++ docs/src/components/BenchmarkTable.js | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/docs/docs/benchmark.mdx b/docs/docs/benchmark.mdx index 95016d8..e366193 100644 --- a/docs/docs/benchmark.mdx +++ b/docs/docs/benchmark.mdx @@ -30,6 +30,8 @@ Some of the LLMs above require using custom libraries to post-process LLM genera `Nexusflow/NexusRaven-V2-13B` required [nexusraven-pip](https://github.com/nexusflowai/nexusraven-pip). +`functionary-small-v2.5` and `functionary-medium-v3.0` models were tested using [MeetKai's functionary](https://github.com/MeetKai/functionary?tab=readme-ov-file#setup) with the vLLM framework. For each model, we compared the results with functionary's `Grammar Sampling` feature enabled and disabled, and report the higher score of the two configurations. The `functionary-small-v2.5` model achieved a higher function calling score than the `functionary-medium-v3.0` model, primarily because the medium model hallucinated more in some of our more advanced test cases. + ::::: ∔ `Nexusflow/NexusRaven-V2-13B` and `gorilla-llm/gorilla-openfunctions-v2` don't accept tool observations, the result of running a tool or function once the LLM calls it, so we appended the observation to the prompt. 
\ No newline at end of file diff --git a/docs/src/components/BenchmarkTable.js b/docs/src/components/BenchmarkTable.js index 94f54ae..182de7d 100644 --- a/docs/src/components/BenchmarkTable.js +++ b/docs/src/components/BenchmarkTable.js @@ -200,6 +200,24 @@ const data = [ gsm8k: '-', math: '-', mtBench:'-', + }, + { + model: 'functionary-medium-v3.0', + functionCalling: '46.43%', + mmlu: '-', + gpqa: '-', + gsm8k: '-', + math: '-', + mtBench:'-', + }, + { + model: 'functionary-small-v2.5', + functionCalling: '57.14%', + mmlu: '-', + gpqa: '-', + gsm8k: '-', + math: '-', + mtBench:'-', } ]; From 1c323d4bc2cbd371b13dde8b58366d8f5cbb359b Mon Sep 17 00:00:00 2001 From: Yingbei Date: Wed, 3 Jul 2024 17:45:12 -0700 Subject: [PATCH 2/3] update param size of functionary models --- docs/src/components/BenchmarkTable.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/components/BenchmarkTable.js b/docs/src/components/BenchmarkTable.js index 182de7d..f83f019 100644 --- a/docs/src/components/BenchmarkTable.js +++ b/docs/src/components/BenchmarkTable.js @@ -203,6 +203,7 @@ const data = [ }, { model: 'functionary-medium-v3.0', + params: 70.6, functionCalling: '46.43%', mmlu: '-', gpqa: '-', @@ -212,6 +213,7 @@ const data = [ }, { model: 'functionary-small-v2.5', + params: 8.03, functionCalling: '57.14%', mmlu: '-', gpqa: '-', From be4ed860352121f8477e0dddff1b742c63b51e05 Mon Sep 17 00:00:00 2001 From: sanjay920 Date: Thu, 4 Jul 2024 17:27:55 -0700 Subject: [PATCH 3/3] add functionary models' benchmark results --- docs/src/components/BenchmarkTable.js | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/src/components/BenchmarkTable.js b/docs/src/components/BenchmarkTable.js index f83f019..f2ee978 100644 --- a/docs/src/components/BenchmarkTable.js +++ b/docs/src/components/BenchmarkTable.js @@ -205,21 +205,21 @@ const data = [ model: 'functionary-medium-v3.0', params: 70.6, functionCalling: '46.43%', - mmlu: '-', - gpqa: '-', 
- gsm8k: '-', - math: '-', - mtBench:'-', + mmlu: '79.85', + gpqa: '38.39', + gsm8k: '89.54', + math: '43.02', + mtBench:'5.49', }, { model: 'functionary-small-v2.5', params: 8.03, functionCalling: '57.14%', - mmlu: '-', - gpqa: '-', - gsm8k: '-', - math: '-', - mtBench:'-', + mmlu: '63.92', + gpqa: '32.14', + gsm8k: '66.11', + math: '20.54', + mtBench:'7.09', } ];