diff --git a/src/examples/user_journey.ipynb b/src/examples/user_journey.ipynb index 6be9da22b..e1c09e0cc 100644 --- a/src/examples/user_journey.ipynb +++ b/src/examples/user_journey.ipynb @@ -15,15 +15,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To start off, we are only given a few anecdotal examples.\n", - "Firstly, there are two e-mails, and secondly a number of potential departments to which they should be sent.\n", - "\n", - "Let's have a look.\n" + "To start off, we are only given a few anecdotal examples. Let's see how far we can get with these.\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -46,176 +43,156 @@ "}" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Luckily, the Intelligence provides some classification tasks out of the box.\n", - "\n", - "Let's import it and run!\n" - ] - }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[SingleLabelClassifyOutput(scores={'People & Culture': 0.00540895946326583, 'Product': 0.0012847404821690297, 'Legal': 0.011450767273591046, 'Infrastructure': 7.63933851429347e-06, 'Accounting': 0.0037175198485175764, 'Communication Department': 0.4868987707545542, 'Research': 0.0001194994797172138, 'Finance': 0.4868987707545542, 'CEO Office': 8.307395237093146e-07, 'Customer': 0.004212501865592915}),\n", - " SingleLabelClassifyOutput(scores={'People & Culture': 0.0002471222567108989, 'Product': 1.034420307682754e-06, 'Legal': 5.253219782981775e-06, 'Infrastructure': 1.797552960512546e-07, 'Accounting': 0.0024014826120226144, 'Communication Department': 0.9688271358793473, 'Research': 0.02010736772998545, 'Finance': 0.008381997922340201, 'CEO Office': 2.756636746614045e-08, 'Customer': 2.839863783934064e-05})]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from intelligence_layer.core import TextChunk, InMemoryTracer\n", "from intelligence_layer.use_cases import PromptBasedClassify, ClassifyInput\n", "\n", "\n", - "# instantiating the default task\n", "prompt_based_classify = PromptBasedClassify()\n", "\n", - "# building the input object for each example\n", "classify_inputs = [\n", " ClassifyInput(chunk=TextChunk(example), labels=labels) for example in examples\n", "]\n", "\n", - "# running the tasks concurrently\n", + "\n", "outputs = prompt_based_classify.run_concurrently(classify_inputs, InMemoryTracer())\n", "outputs" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Hmm, we have some results, but they aren't really legible (yet)." - ] - }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('Communication Department', 0.4868987707545542),\n", - " ('Communication Department', 0.9688271358793473)]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "[sorted(list(o.scores.items()), key=lambda i: i[1], reverse=True)[0] for o in outputs]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It appears that both inputs were mistakenly classified as having to be sent to the Comms Department.\n", - "We probably have to do some finetuning of our classification approach.\n", - "\n", - "However, let's first make sure that this evidence is not anecdotal.\n", - "For this, we need to do some eval. Luckily, we have by now got access to a few more examples...\n" - ] - }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'label': 'Finance',\n", - " 'message': 'I just traveled to Paris for a conference, where can I get the train ride refunded?'},\n", - " {'label': 'Customer',\n", - " 'message': 'Hello, we would like to get in contact with your sales team, because we are interested in your solution.'},\n", - " {'label': 'Communication Department',\n", - " 'message': 'We are working on a documentation on AI and would like to film a piece about you. Would you be interested?'},\n", - " {'label': 'Research',\n", - " 'message': 'I am working with Stanford and was hoping to win you over for a research collaboration.'},\n", - " {'label': 'IT Support', 'message': 'My laptop is broken'},\n", - " {'label': 'Communications',\n", - " 'message': 'I already tried to call many times. Can I get a meeting with Jonas?'},\n", - " {'label': 'Communications', 'message': 'Can you send your models via email?'},\n", - " {'label': 'Research', 'message': 'We should do a research collaboration.'},\n", - " {'label': 'Research',\n", - " 'message': 'H100 cluster available right now. Would you like to procure at low prices?'},\n", - " {'label': 'Research',\n", - " 'message': 'My company has been working on time series and signal processing for a long time. It would make sense to define a joint go to market.'},\n", - " {'label': 'People & Culture',\n", - " 'message': 'Full stack developer in your area available now.'},\n", - " {'label': 'Product',\n", - " 'message': 'Hi,\\n\\nI am having trouble running your docker container in my environment. It fails to start. Can you help?'},\n", - " {'label': 'Product',\n", - " 'message': 'Hello,\\n\\nI am getting strange errors from your API. It is saying the queue is full, but I am only sending one task at a time. Why is this happening?'},\n", - " {'label': 'Customer',\n", - " 'message': 'Can you show me a demo of different use cases your product can solve?'},\n", - " {'label': 'People & Culture',\n", - " 'message': 'Hey, I did not get a t-shirt in the onboarding. Could I still get one?'},\n", - " {'label': 'Customer',\n", - " 'message': 'Hi, can you name me a couple of timeslots for a first call? Would be really interested in learning more about the product?'},\n", - " {'label': 'Product', 'message': 'Hi Jan, is your tool ISO 37301 compliant?'},\n", - " {'label': 'I can’t login to Mattermost or Sharepoint, how can I gain access?',\n", - " 'message': 'IT Support'},\n", - " {'label': 'Ignore',\n", - " 'message': 'Hi, Jonas here. I need something really urgently right now. Could you share your number with me?'},\n", - " {'label': 'Finance',\n", - " 'message': 'I did not get paid last month, when do I get paid? What is going on?'},\n", - " {'label': 'Security',\n", - " 'message': 'Hi, I want to get a new badge, the photo of me looks ugly and I just got new glasses so it does not look like me. '},\n", - " {'label': 'Marketing',\n", - " 'message': 'Let us celebrate AI day in style, we want to invite you and the CEO to join us.'},\n", - " {'label': 'Sales',\n", - " 'message': 'Jonas, we have met each other at the event in Nürnberg, can we meet for a follow up in your Office in Heidelberg?'},\n", - " {'label': 'Security',\n", - " 'message': 'Your hTTPs Certificate is not valid on your www.aleph-alpha.de'},\n", - " {'label': 'HR', 'message': 'I want to take a week off immediatly'},\n", - " {'label': 'HR', 'message': 'I want to take a sabbatical'},\n", - " {'label': 'HR',\n", - " 'message': 'How can I work more, I want to work weekends, can I get paid overtime?'}]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "import json\n", - "\n", - "\n", - "with open(\"data/classify_examples.json\", 'r') as file:\n", - " labeled_examples = json.load(file)\n", + "labeled_examples = [\n", + " {\n", + " \"label\": \"Finance\",\n", + " \"message\": \"I just traveled to Paris for a conference, where can I get the train ride refunded?\",\n", + " },\n", + " {\n", + " \"label\": \"Customer\",\n", + " \"message\": \"Hello, we would like to get in contact with your sales team, because we are interested in your solution.\",\n", + " },\n", + " {\n", + " \"label\": \"Communication Department\",\n", + " \"message\": \"We are working on a documentation on AI and would like to film a piece about you. Would you be interested?\",\n", + " },\n", + " {\n", + " \"label\": \"Research\",\n", + " \"message\": \"I am working with Stanford and was hoping to win you over for a research collaboration.\",\n", + " },\n", + " {\n", + " \"label\": \"IT Support\",\n", + " \"message\": \"My laptop is broken\"},\n", + " {\n", + " \"label\": \"Communications\",\n", + " \"message\": \"I already tried to call many times. Can I get a meeting with Jonas?\",\n", + " },\n", + " {\n", + " \"label\": \"Communications\",\n", + " \"message\": \"Can you send your models via email?\"\n", + " },\n", + " {\n", + " \"label\": \"Research\",\n", + " \"message\": \"We should do a research collaboration.\"},\n", + " {\n", + " \"label\": \"Research\",\n", + " \"message\": \"H100 cluster available right now. Would you like to procure at low prices?\",\n", + " },\n", + " {\n", + " \"label\": \"Research\",\n", + " \"message\": \"My company has been working on time series and signal processing for a long time. It would make sense to define a joint go to market.\",\n", + " },\n", + " {\n", + " \"label\": \"People & Culture\",\n", + " \"message\": \"Full stack developer in your area available now.\",\n", + " },\n", + " {\n", + " \"label\": \"Product\",\n", + " \"message\": \"Hi,\\n\\nI am having trouble running your docker container in my environment. It fails to start. Can you help?\",\n", + " },\n", + " {\n", + " \"label\": \"Product\",\n", + " \"message\": \"Hello,\\n\\nI am getting strange errors from your API. It is saying the queue is full, but I am only sending one task at a time. Why is this happening?\",\n", + " },\n", + " {\n", + " \"label\": \"Customer\",\n", + " \"message\": \"Can you show me a demo of different use cases your product can solve?\",\n", + " },\n", + " {\n", + " \"label\": \"People & Culture\",\n", + " \"message\": \"Hey, I did not get a t-shirt in the onboarding. Could I still get one?\",\n", + " },\n", + " {\n", + " \"label\": \"Customer\",\n", + " \"message\": \"Hi, can you name me a couple of timeslots for a first call? Would be really interested in learning more about the product?\",\n", + " },\n", + " {\n", + " \"label\": \"Product\",\n", + " \"message\": \"Hi Jan, is your tool ISO 37301 compliant?\"},\n", + " {\n", + " \"label\": \"I can’t login to Mattermost or Sharepoint, how can I gain access?\",\n", + " \"message\": \"IT Support\",\n", + " },\n", + " {\n", + " \"label\": \"Ignore\",\n", + " \"message\": \"Hi, Jonas here. I need something really urgently right now. Could you share your number with me?\",\n", + " },\n", + " {\n", + " \"label\": \"Finance\",\n", + " \"message\": \"I did not get paid last month, when do I get paid? What is going on?\"\n", + " },\n", + " {\n", + " \"label\": \"Security\",\n", + " \"message\": \"Hi, I want to get a new badge, the photo of me looks ugly and I just got new glasses so it does not look like me. \"\n", + " },\n", + " {\n", + " \"label\": \"Marketing\",\n", + " \"message\": \"Let us celebrate AI day in style, we want to invite you and the CEO to join us.\"\n", "\n", - "labeled_examples\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Intelligence layer offers support to run task evaluations.\n", + " },\n", + " {\n", + " \"label\": \"Sales\",\n", + " \"message\": \"Jonas, we have met each other at the event in Nürnberg, can we meet for a follow up in your Office in Heidelberg?\"\n", "\n", - "First, we have to create a dataset inside a repository.\n", - "There are different repositories (that persist datasets in different ways), but an `InMemoryDatasetRepository` will do for now.\n" + " },\n", + " {\n", + " \"label\": \"Security\",\n", + " \"message\": \"Your hTTPs Certificate is not valid on your www.aleph-alpha.de\"\n", + " },\n", + " {\n", + " \"label\": \"HR\",\n", + " \"message\": \"I want to take a week off immediatly\"\n", + " },\n", + " {\n", + " \"label\": \"HR\",\n", + " \"message\": \"I want to take a sabbatical\"\n", + " },\n", + " {\n", + " \"label\": \"HR\",\n", + " \"message\": \"How can I work more, I want to work weekends, can I get paid overtime?\"\n", + " }\n", + "]" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -231,46 +208,21 @@ " )\n", " for example in labeled_examples\n", " ]\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When a dataset is created, we generate a unique ID. We'll need it later." + ")" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'2521c77d-114d-4e65-8f81-449e53108f7f'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dataset_id" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have a dataset, let's actually run an evaluation on it!\n" - ] - }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -291,19 +243,11 @@ "\n", "load_dotenv()\n", "\n", - "# we need a few repositories to store runs, evals and aggregated evaluations\n", "run_repository = InMemoryRunRepository()\n", "evaluation_repository = InMemoryEvaluationRepository()\n", "aggregation_repository = InMemoryAggregationRepository()\n", "\n", "\n", - "# each repository is used by a class that has a dedicated responsibility\n", - "runner = Runner(\n", - " prompt_based_classify,\n", - " dataset_repository,\n", - " run_repository,\n", - " \"prompt-based-classify\"\n", - ")\n", "evaluator = Evaluator(\n", " dataset_repository,\n", " run_repository,\n", @@ -316,117 +260,34 @@ " aggregation_repository,\n", " \"single-label-classify\",\n", " SingleLabelClassifyAggregationLogic(),\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before evaluating, we must generate predictions for each sample in our datasets.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Evaluating: 0it [00:00, ?it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Evaluating: 27it [00:10, 2.49it/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "RunOverview(dataset_id='2521c77d-114d-4e65-8f81-449e53108f7f', id='e547db06-baee-4c69-8210-01c8e059a68f', start=datetime.datetime(2024, 3, 12, 15, 45, 25, 706068, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 3, 12, 15, 45, 36, 593248, tzinfo=datetime.timezone.utc), failed_example_count=0, successful_example_count=27, description='prompt-based-classify')" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "run_overview = runner.run_dataset(dataset_id)\n", - "run_overview" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, let's evaluate this run." + ")\n", + "runner = Runner(\n", + " prompt_based_classify, dataset_repository, run_repository, \"prompt-based-classify\"\n", + ")\n", + "run_overview = runner.run_dataset(dataset_id)" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Evaluating: 0it [00:00, ?it/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "EvaluationOverview(run_overviews=frozenset({RunOverview(dataset_id='2521c77d-114d-4e65-8f81-449e53108f7f', id='e547db06-baee-4c69-8210-01c8e059a68f', start=datetime.datetime(2024, 3, 12, 15, 45, 25, 706068, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 3, 12, 15, 45, 36, 593248, tzinfo=datetime.timezone.utc), failed_example_count=0, successful_example_count=27, description='prompt-based-classify')}), id='48db85f8-e808-4b75-bb81-ad23947553d8', start=datetime.datetime(2024, 3, 12, 15, 45, 36, 598450, tzinfo=datetime.timezone.utc), description='single-label-classify')" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eval_overview = evaluator.evaluate_runs(run_overview.id)\n", - "eval_overview\n" - ] - }, - { - "cell_type": "markdown", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Finally, let's aggregate all individual evaluations to get seom eval statistics." + "eval_overview = evaluator.evaluate_runs(run_overview.id)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AggregationOverview(evaluation_overviews=frozenset({EvaluationOverview(run_overviews=frozenset({RunOverview(dataset_id='2521c77d-114d-4e65-8f81-449e53108f7f', id='e547db06-baee-4c69-8210-01c8e059a68f', start=datetime.datetime(2024, 3, 12, 15, 45, 25, 706068, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 3, 12, 15, 45, 36, 593248, tzinfo=datetime.timezone.utc), failed_example_count=0, successful_example_count=27, description='prompt-based-classify')}), id='48db85f8-e808-4b75-bb81-ad23947553d8', start=datetime.datetime(2024, 3, 12, 15, 45, 36, 598450, tzinfo=datetime.timezone.utc), description='single-label-classify')}), id='c53bf1be-96b1-4de9-bd48-aa097c3eeefe', start=datetime.datetime(2024, 3, 12, 15, 45, 36, 607875, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 3, 12, 15, 45, 36, 607970, tzinfo=datetime.timezone.utc), successful_evaluation_count=27, crashed_during_evaluation_count=0, description='single-label-classify', statistics=AggregatedSingleLabelClassifyEvaluation(percentage_correct=0.14814814814814814))" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "aggregation_overview = aggregator.aggregate_evaluation(eval_overview.id)\n", - "aggregation_overview\n" + "aggregation_overview = aggregator.aggregate_evaluation(eval_overview.id)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -461,118 +322,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'input': ClassifyInput(chunk='I am working with Stanford and was hoping to win you over for a research collaboration.', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Research',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0007585755008278973, 'Product': 2.1306824628346814e-06, 'Legal': 9.644205228204069e-05, 'Infrastructure': 2.992136017512143e-08, 'Accounting': 0.0001092831623539844, 'Communication Department': 0.9426427384693921, 'Research': 0.053180248281635, 'Finance': 0.003193722149143611, 'CEO Office': 7.066480296161476e-08, 'Customer': 1.6759115739504213e-05}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hey, I did not get a t-shirt in the onboarding. Could I still get one?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'People & Culture',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.06497576638911562, 'Product': 0.003234949688517952, 'Legal': 3.6502971001546174e-05, 'Infrastructure': 7.570868178312978e-07, 'Accounting': 0.0013913345237718097, 'Communication Department': 0.656231862610054, 'Research': 9.979356951984034e-06, 'Finance': 0.03267186714331024, 'CEO Office': 3.272105173075688e-05, 'Customer': 0.2414142591787284}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Jonas, we have met each other at the event in Nürnberg, can we meet for a follow up in your Office in Heidelberg?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Sales',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.035045249036097764, 'Product': 2.1967959997586276e-06, 'Legal': 0.0001916651684066177, 'Infrastructure': 3.402464204344424e-06, 'Accounting': 3.436367114888127e-05, 'Communication Department': 0.9621208400240253, 'Research': 6.031012108009633e-05, 'Finance': 0.0024090793356383025, 'CEO Office': 5.216856070430578e-06, 'Customer': 0.00012767652732847637}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='My laptop is broken', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'IT Support',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.021287554897726435, 'Product': 0.002877073480345604, 'Legal': 8.420707163108891e-05, 'Infrastructure': 4.1039249097028335e-05, 'Accounting': 0.0012766936178089724, 'Communication Department': 0.8503283974885494, 'Research': 0.0006419618664283149, 'Finance': 0.12233627737798144, 'CEO Office': 1.1678716583665552e-07, 'Customer': 0.0011266781632659393}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Full stack developer in your area available now.', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'People & Culture',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.07520556131595738, 'Product': 5.340901013608409e-05, 'Legal': 0.0002887266696645425, 'Infrastructure': 5.992324863038155e-06, 'Accounting': 0.004516445008710039, 'Communication Department': 0.09656585220595768, 'Research': 1.5301937599542603e-05, 'Finance': 0.8085362239349625, 'CEO Office': 3.634528752390948e-06, 'Customer': 0.014808853063396901}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi, Jonas here. I need something really urgently right now. Could you share your number with me?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Ignore',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0021822744662250595, 'Product': 1.1468770400028006e-08, 'Legal': 3.4925196728580044e-06, 'Infrastructure': 1.567595052136926e-08, 'Accounting': 2.5551819770665284e-06, 'Communication Department': 0.997615294993432, 'Research': 5.19116705513687e-07, 'Finance': 0.0001791320861230741, 'CEO Office': 4.261165144606115e-08, 'Customer': 1.6661879491978318e-05}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi Jan, is your tool ISO 37301 compliant?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Product',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.005123237201145437, 'Product': 0.00274227126292674, 'Legal': 0.02225431906609223, 'Infrastructure': 0.0005571261993909729, 'Accounting': 0.018449478511067714, 'Communication Department': 0.2711131076222367, 'Research': 0.0008106138369727336, 'Finance': 0.650366535745627, 'CEO Office': 8.19924259947419e-06, 'Customer': 0.02857511131194103}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi,\\n\\nI am having trouble running your docker container in my environment. It fails to start. Can you help?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Product',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0031204086769206574, 'Product': 6.54216509800929e-05, 'Legal': 5.423645362193234e-05, 'Infrastructure': 0.467826723972039, 'Accounting': 0.0003322385577810547, 'Communication Department': 0.364343818971154, 'Research': 0.0010893682918060776, 'Finance': 0.16167658961311748, 'CEO Office': 2.2038994274435957e-06, 'Customer': 0.0014889899131522346}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi, I want to get a new badge, the photo of me looks ugly and I just got new glasses so it does not look like me. ', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Security',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.07146902378260041, 'Product': 3.71334953157612e-05, 'Legal': 0.00022747023644055324, 'Infrastructure': 1.5175625982297593e-07, 'Accounting': 0.00022047171301237616, 'Communication Department': 0.9268244024639218, 'Research': 0.00011437921793082643, 'Finance': 0.0008996622373032723, 'CEO Office': 1.9109003033139195e-07, 'Customer': 0.00020711400718468996}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='My company has been working on time series and signal processing for a long time. It would make sense to define a joint go to market.', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Research',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 3.714248528900376e-05, 'Product': 6.939129799016706e-05, 'Legal': 5.466264975956213e-07, 'Infrastructure': 7.708669182896016e-07, 'Accounting': 0.0010854625304261897, 'Communication Department': 0.010962783206468764, 'Research': 0.0009579173209727495, 'Finance': 0.9868384926835331, 'CEO Office': 2.690592609983691e-06, 'Customer': 4.480238929427011e-05}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Can you send your models via email?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Communications',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0020264155120091817, 'Product': 3.4522415807802506e-06, 'Legal': 0.000242020819540981, 'Infrastructure': 3.638635856144631e-07, 'Accounting': 2.8905264860123456e-05, 'Communication Department': 0.9861115426608039, 'Research': 0.0003748491782053352, 'Finance': 0.010954698759035424, 'CEO Office': 1.2187903452920109e-07, 'Customer': 0.00025762982134413344}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='H100 cluster available right now. Would you like to procure at low prices?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Research',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.00016192925707715533, 'Product': 0.00022773793389890888, 'Legal': 6.945614562102543e-05, 'Infrastructure': 4.535936867110054e-07, 'Accounting': 0.0003112808486975587, 'Communication Department': 0.011352105986275144, 'Research': 1.5831959467641733e-06, 'Finance': 0.98776071405918, 'CEO Office': 2.2515495051381644e-07, 'Customer': 0.0001145138246662302}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hello, we would like to get in contact with your sales team, because we are interested in your solution.', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Customer',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0007729167151368392, 'Product': 0.0011971168267247037, 'Legal': 0.0009323155220811545, 'Infrastructure': 5.843067369127303e-07, 'Accounting': 0.0003429797132586766, 'Communication Department': 0.8476065223258168, 'Research': 7.189245346083699e-05, 'Finance': 0.047818663898506905, 'CEO Office': 2.4845354673455315e-05, 'Customer': 0.10123216288360386}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='I want to take a sabbatical', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'HR',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.10692717240754102, 'Product': 1.5427509546398317e-05, 'Legal': 6.701404672332701e-05, 'Infrastructure': 2.0512007286965897e-07, 'Accounting': 0.0028495126253898252, 'Communication Department': 0.742221689178672, 'Research': 0.0016236033528010429, 'Finance': 0.14615211618905893, 'CEO Office': 1.3908321679798301e-06, 'Customer': 0.00014186873802656585}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Your hTTPs Certificate is not valid on your www.aleph-alpha.de', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Security',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0008210630611636245, 'Product': 2.696220335935928e-06, 'Legal': 0.0019696278922144146, 'Infrastructure': 0.001353704133526628, 'Accounting': 6.33135848487783e-05, 'Communication Department': 0.37534475027474623, 'Research': 7.174369072950839e-05, 'Finance': 0.6188388736236019, 'CEO Office': 2.7977401714315443e-07, 'Customer': 0.0015339477448158669}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='I already tried to call many times. Can I get a meeting with Jonas?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Communications',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.007570025565688809, 'Product': 1.2272971551662457e-07, 'Legal': 4.508178225012202e-05, 'Infrastructure': 2.8253984803761956e-08, 'Accounting': 2.413053916738237e-05, 'Communication Department': 0.9914774899548215, 'Research': 8.422385744263017e-05, 'Finance': 0.0007506790700884759, 'CEO Office': 2.289394373017081e-07, 'Customer': 4.7989307403396396e-05}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hello,\\n\\nI am getting strange errors from your API. It is saying the queue is full, but I am only sending one task at a time. Why is this happening?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Product',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0051654332928985506, 'Product': 0.00017675140754742127, 'Legal': 0.00019412344443782529, 'Infrastructure': 0.0014799214162767621, 'Accounting': 0.00139025751038781, 'Communication Department': 0.8686923102930786, 'Research': 0.0007921448513721468, 'Finance': 0.097464576033817, 'CEO Office': 1.5777405126327238e-06, 'Customer': 0.024642904009671292}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Can you show me a demo of different use cases your product can solve?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Customer',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.009160358336484717, 'Product': 0.03711030142912698, 'Legal': 0.0006796986761183324, 'Infrastructure': 3.716619370059602e-05, 'Accounting': 0.000794648260086934, 'Communication Department': 0.3989733400038124, 'Research': 0.012047947308993433, 'Finance': 0.5122919091456952, 'CEO Office': 3.098832961587151e-06, 'Customer': 0.028901531813019972}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='I want to take a week off immediatly', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'HR',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.056933265475927836, 'Product': 1.3126513023215651e-05, 'Legal': 0.00194814921392479, 'Infrastructure': 5.251265907559957e-07, 'Accounting': 0.0005941519411250563, 'Communication Department': 0.612090043022019, 'Research': 2.778882828821402e-05, 'Finance': 0.327628321861519, 'CEO Office': 1.7218238032769019e-06, 'Customer': 0.0007629061937789308}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Let us celebrate AI day in style, we want to invite you and the CEO to join us.', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Marketing',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.01595964835470913, 'Product': 0.00016655387619796816, 'Legal': 0.0014844785396950758, 'Infrastructure': 1.2916801823639696e-06, 'Accounting': 0.0001829236472888977, 'Communication Department': 0.9275656362888433, 'Research': 3.178736353973008e-05, 'Finance': 0.052329672335781376, 'CEO Office': 0.00011810438332089227, 'Customer': 0.0021599035304412997}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi, can you name me a couple of timeslots for a first call? Would be really interested in learning more about the product?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Customer',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0013463074178401335, 'Product': 0.0004605432991996499, 'Legal': 1.575892509965863e-05, 'Infrastructure': 1.3009801875224026e-07, 'Accounting': 4.024182500489154e-05, 'Communication Department': 0.9532388470855437, 'Research': 0.00016942441154481957, 'Finance': 0.025144802088538892, 'CEO Office': 1.1532924799521372e-06, 'Customer': 0.019582791556729585}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='How can I work more, I want to work weekends, can I get paid overtime?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'HR',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.03738946059573627, 'Product': 2.979169115792472e-06, 'Legal': 0.005060114757822338, 'Infrastructure': 1.4832409644738856e-07, 'Accounting': 0.0012793973700934327, 'Communication Department': 0.10163509131327543, 'Research': 0.0001731476054537721, 'Finance': 0.8509802007625091, 'CEO Office': 1.697479394147361e-06, 'Customer': 0.0034777626225032695}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='IT Support', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'I can’t login to Mattermost or Sharepoint, how can I gain access?',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0024803340090770883, 'Product': 1.4603082026822413e-06, 'Legal': 7.973020607484235e-05, 'Infrastructure': 1.2227023109146764e-05, 'Accounting': 0.003184811909529985, 'Communication Department': 0.47266799664085485, 'Research': 6.3432960253175105e-06, 'Finance': 0.1534528313507817, 'CEO Office': 5.933965436267242e-08, 'Customer': 0.36811420591669}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)}]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "[e for e in overview if not e[\"eval\"].correct]" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -584,17 +343,9 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Evaluating: 27it [00:32, 1.19s/it]\n" - ] - } - ], + "outputs": [], "source": [ "runner_prompt_adjusted = Runner(\n", " prompt_adjusted_classify_task,\n", @@ -607,37 +358,18 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Evaluating: 0it [00:00, ?it/s]\n" - ] - } - ], + "outputs": [], "source": [ "eval_overview_prompt_adjusted = evaluator.evaluate_runs(run_overview_prompt_adjusted.id)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AggregationOverview(evaluation_overviews=frozenset({EvaluationOverview(run_overviews=frozenset({RunOverview(dataset_id='2521c77d-114d-4e65-8f81-449e53108f7f', id='886a10f9-4ddb-4c36-a0e4-f1215c1ead00', start=datetime.datetime(2024, 3, 12, 15, 45, 36, 624811, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 3, 12, 15, 46, 8, 689225, tzinfo=datetime.timezone.utc), failed_example_count=0, successful_example_count=27, description='running for adjusted prompt')}), id='1be5cc48-316e-498b-ad53-9662545f2e28', start=datetime.datetime(2024, 3, 12, 15, 46, 8, 693089, tzinfo=datetime.timezone.utc), description='single-label-classify')}), id='6a4e4df1-443f-47cc-b3bd-54267abb1c4a', start=datetime.datetime(2024, 3, 12, 15, 46, 8, 700178, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 3, 12, 15, 46, 8, 700319, tzinfo=datetime.timezone.utc), successful_evaluation_count=27, crashed_during_evaluation_count=0, description='single-label-classify', statistics=AggregatedSingleLabelClassifyEvaluation(percentage_correct=0.25925925925925924))" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "aggregation_overview_prompt_adjusted = aggregator.aggregate_evaluation(\n", " eval_overview_prompt_adjusted.id\n", @@ -647,99 +379,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'input': ClassifyInput(chunk='Hey, I did not get a t-shirt in the onboarding. Could I still get one?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'People & Culture',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.11905821963385722, 'Product': 0.0007529779112762469, 'Legal': 0.0026281511497473755, 'Infrastructure': 9.573039863173891e-05, 'Accounting': 0.00976476612874965, 'Communication Department': 0.028255287665935772, 'Research': 1.1433354228584936e-05, 'Finance': 0.19613051178613103, 'CEO Office': 0.00021573178351041137, 'Customer': 0.6430871901879318}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Jonas, we have met each other at the event in Nürnberg, can we meet for a follow up in your Office in Heidelberg?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Sales',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.03466694356346471, 'Product': 0.0006973523989065594, 'Legal': 0.014227284972224445, 'Infrastructure': 0.002131281832159999, 'Accounting': 0.004840585080623741, 'Communication Department': 0.4357360185096552, 'Research': 0.024582544007061204, 'Finance': 0.47856239422605007, 'CEO Office': 0.001123106602569485, 'Customer': 0.003432488807284528}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='My laptop is broken', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'IT Support',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.00011538318333471, 'Product': 0.0010283991187241372, 'Legal': 7.449698229461483e-05, 'Infrastructure': 6.574335612689703e-05, 'Accounting': 0.00010839246965362872, 'Communication Department': 0.0005504623813759523, 'Research': 8.809614760325034e-07, 'Finance': 0.002795481432311151, 'CEO Office': 7.303427156261302e-07, 'Customer': 0.9952600297719872}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi, Jonas here. I need something really urgently right now. Could you share your number with me?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Ignore',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0027921150030431386, 'Product': 0.00015610958884849715, 'Legal': 0.0022234342816152253, 'Infrastructure': 0.00020681160399791868, 'Accounting': 0.0007218434255355227, 'Communication Department': 0.44111224825666556, 'Research': 0.00010729244673203881, 'Finance': 0.4106753395570174, 'CEO Office': 7.920617423034184e-05, 'Customer': 0.1419255996623142}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi Jan, is your tool ISO 37301 compliant?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Product',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0006679883017215352, 'Product': 0.11117125853631647, 'Legal': 0.0950898165199237, 'Infrastructure': 0.023302761713130622, 'Accounting': 0.03443924930051577, 'Communication Department': 0.009087200473199403, 'Research': 0.004482289128038741, 'Finance': 0.6600538892118525, 'CEO Office': 0.0003109469009137648, 'Customer': 0.061394599914387366}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi,\\n\\nI am having trouble running your docker container in my environment. It fails to start. Can you help?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Product',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 1.2885345777895933e-05, 'Product': 0.0029619131690842015, 'Legal': 0.00024312863840838438, 'Infrastructure': 0.9906245476696584, 'Accounting': 4.787486747910973e-05, 'Communication Department': 0.00016709970654512957, 'Research': 0.0003537500815320124, 'Finance': 0.0007035160420105038, 'CEO Office': 1.9152356682796523e-06, 'Customer': 0.004883369243835948}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hi, I want to get a new badge, the photo of me looks ugly and I just got new glasses so it does not look like me. ', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Security',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.035447679019574685, 'Product': 1.4556878890486365e-05, 'Legal': 0.017260719640241097, 'Infrastructure': 1.7835460845620018e-05, 'Accounting': 0.007659397910420652, 'Communication Department': 0.004364193377137608, 'Research': 3.895622804027345e-05, 'Finance': 0.15384327338233697, 'CEO Office': 7.277981555304867e-05, 'Customer': 0.7812806082869597}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='My company has been working on time series and signal processing for a long time. It would make sense to define a joint go to market.', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Research',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 3.888249056491584e-05, 'Product': 0.11590710557768708, 'Legal': 0.000325559107583116, 'Infrastructure': 0.000105693567547841, 'Accounting': 0.0021229120535255904, 'Communication Department': 0.003966121861983323, 'Research': 0.3570190195097128, 'Finance': 0.5194596082420401, 'CEO Office': 0.0002237532842546323, 'Customer': 0.0008313443051006192}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Can you send your models via email?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Communications',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.006060842594286994, 'Product': 0.09480756753345904, 'Legal': 0.021154427727023792, 'Infrastructure': 0.05074683403236992, 'Accounting': 0.02891467444111507, 'Communication Department': 0.13794419680201822, 'Research': 0.016475084879233565, 'Finance': 0.49675613537002145, 'CEO Office': 0.0002994034848488399, 'Customer': 0.1468408331356231}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='H100 cluster available right now. Would you like to procure at low prices?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Research',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.00030471647711875397, 'Product': 0.0013763560758355033, 'Legal': 0.00011520645285504865, 'Infrastructure': 0.07514649552749639, 'Accounting': 0.0118899021620508, 'Communication Department': 0.007440510648774454, 'Research': 0.0033017065860232947, 'Finance': 0.8873053577432958, 'CEO Office': 6.12454622557398e-05, 'Customer': 0.013058502864294285}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='I want to take a sabbatical', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'HR',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.8386597459457881, 'Product': 1.204947209028871e-06, 'Legal': 0.0005955949107618142, 'Infrastructure': 4.5927347973219604e-05, 'Accounting': 0.00440088420785664, 'Communication Department': 0.009917545656902184, 'Research': 0.017405834690762557, 'Finance': 0.12861268897875777, 'CEO Office': 0.00021910712293561128, 'Customer': 0.0001414661910532324}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Your hTTPs Certificate is not valid on your www.aleph-alpha.de', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Security',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.00010525045695257107, 'Product': 4.121667115293105e-05, 'Legal': 0.004337670857013201, 'Infrastructure': 0.7294848026308586, 'Accounting': 0.0001847203046218093, 'Communication Department': 0.1844426426428915, 'Research': 0.00017352866712861211, 'Finance': 0.07688713252695714, 'CEO Office': 5.364385410410433e-06, 'Customer': 0.004337670857013201}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='I already tried to call many times. Can I get a meeting with Jonas?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Communications',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.07100219760312608, 'Product': 0.0016184377445183124, 'Legal': 0.005475101578707347, 'Infrastructure': 0.0005953899730241662, 'Accounting': 0.001147644210527974, 'Communication Department': 0.6736502967544478, 'Research': 0.0013843231623117025, 'Finance': 0.10997035491732951, 'CEO Office': 0.0025066854151630505, 'Customer': 0.13264956864084415}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Hello,\\n\\nI am getting strange errors from your API. It is saying the queue is full, but I am only sending one task at a time. Why is this happening?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Product',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.0002629620382111627, 'Product': 0.02223692957948374, 'Legal': 0.012670220549140852, 'Infrastructure': 0.0729120850776688, 'Accounting': 0.008180509227069187, 'Communication Department': 0.009269731375952349, 'Research': 0.0009178276981501435, 'Finance': 0.03902702681497116, 'CEO Office': 8.808124968285771e-05, 'Customer': 0.8344346263896697}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Can you show me a demo of different use cases your product can solve?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Customer',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.00041603343224054723, 'Product': 0.7177605689549155, 'Legal': 0.004267957365339802, 'Infrastructure': 0.001697676147785359, 'Accounting': 0.0016454441438113435, 'Communication Department': 0.00531155876025379, 'Research': 0.018539180251167547, 'Finance': 0.04447318147492857, 'CEO Office': 0.0002464917743120061, 'Customer': 0.20564190769524557}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='I want to take a week off immediatly', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'HR',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.46299612684321056, 'Product': 1.9808579812054106e-05, 'Legal': 0.009055387538445772, 'Infrastructure': 4.463932369175321e-05, 'Accounting': 0.007991351454381538, 'Communication Department': 0.03812460555218306, 'Research': 0.0002129623027568396, 'Finance': 0.4098781370054803, 'CEO Office': 0.0004508411984741028, 'Customer': 0.07122614020156406}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='Let us celebrate AI day in style, we want to invite you and the CEO to join us.', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Marketing',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.01767441050909784, 'Product': 0.0006437903557333712, 'Legal': 0.0009971220089282922, 'Infrastructure': 0.0002725979880200259, 'Accounting': 0.0001873536745861537, 'Communication Department': 0.7060017099733116, 'Research': 0.0014061674169881045, 'Finance': 0.06566840156870157, 'CEO Office': 0.19001782043943738, 'Customer': 0.017130626065195588}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='I just traveled to Paris for a conference, where can I get the train ride refunded?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'Finance',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.00032538852910948184, 'Product': 9.322537443314816e-05, 'Legal': 1.2228507943008445e-05, 'Infrastructure': 0.00014439028452357174, 'Accounting': 0.002724444798046206, 'Communication Department': 0.0005364749891846305, 'Research': 0.00019735811922369747, 'Finance': 0.08475503576237196, 'CEO Office': 8.946567529803128e-06, 'Customer': 0.9112025070676344}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='How can I work more, I want to work weekends, can I get paid overtime?', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'HR',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.25633979462850376, 'Product': 6.291336782180687e-05, 'Legal': 0.07344258078722231, 'Infrastructure': 7.5888007359612e-05, 'Accounting': 0.03258993137477255, 'Communication Department': 0.04184630021331723, 'Research': 0.005663280891773613, 'Finance': 0.5776706445424228, 'CEO Office': 0.0003195004448393767, 'Customer': 0.01198916574196698}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)},\n", - " {'input': ClassifyInput(chunk='IT Support', labels=frozenset({'People & Culture', 'Product', 'Legal', 'Infrastructure', 'Accounting', 'Communication Department', 'Research', 'Finance', 'CEO Office', 'Customer'})),\n", - " 'expected_output': 'I can’t login to Mattermost or Sharepoint, how can I gain access?',\n", - " 'result': SingleLabelClassifyOutput(scores={'People & Culture': 0.020056602767552136, 'Product': 0.008368983358924628, 'Legal': 0.017717137909874635, 'Infrastructure': 0.27714266636556395, 'Accounting': 0.03309997127739246, 'Communication Department': 0.054519498843658365, 'Research': 0.01563531932812848, 'Finance': 0.115530251640236, 'CEO Office': 0.0009985594532152073, 'Customer': 0.45693100905545425}),\n", - " 'eval': SingleLabelClassifyEvaluation(correct=False)}]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "overview = [\n", " {\n",