From 767d267659fd2490be1c4e5f79f3792fd73db935 Mon Sep 17 00:00:00 2001 From: Ilya Gusev Date: Mon, 18 Nov 2024 16:58:21 +0000 Subject: [PATCH] Fixes and annotation templates --- .../first.jsonl} | 0 ...e_3_5_sonnet_player_saiga_nemo_12b_v3.json | 3 ++ ...judge_gpt_4o_player_saiga_nemo_12b_v3.json | 3 ++ src/annotations/convert_results.py | 4 +- src/annotations/convert_to_label_studio.py | 20 +++++----- src/annotations/merge_annotations.py | 11 +++--- .../label_studio_instruction_en.jinja | 10 +++++ .../label_studio_instruction_ru.jinja | 13 +++++++ templates/annotations/label_studio_ui_en.html | 39 +++++++++++++++++++ templates/annotations/label_studio_ui_ru.html | 39 +++++++++++++++++++ 10 files changed, 125 insertions(+), 17 deletions(-) rename results/{en_annotated.jsonl => en_annotations/first.jsonl} (100%) create mode 100644 results/v2/ru/judge_claude_3_5_sonnet_player_saiga_nemo_12b_v3.json create mode 100644 results/v2/ru/judge_gpt_4o_player_saiga_nemo_12b_v3.json create mode 100644 templates/annotations/label_studio_instruction_en.jinja create mode 100644 templates/annotations/label_studio_instruction_ru.jinja create mode 100644 templates/annotations/label_studio_ui_en.html create mode 100644 templates/annotations/label_studio_ui_ru.html diff --git a/results/en_annotated.jsonl b/results/en_annotations/first.jsonl similarity index 100% rename from results/en_annotated.jsonl rename to results/en_annotations/first.jsonl diff --git a/results/v2/ru/judge_claude_3_5_sonnet_player_saiga_nemo_12b_v3.json b/results/v2/ru/judge_claude_3_5_sonnet_player_saiga_nemo_12b_v3.json new file mode 100644 index 0000000..b9ac2f2 --- /dev/null +++ b/results/v2/ru/judge_claude_3_5_sonnet_player_saiga_nemo_12b_v3.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226d9673a9d9241ec97e7453c4dd690b6b826ace8f91c6796cdb3f254e2aa86b +size 741640 diff --git a/results/v2/ru/judge_gpt_4o_player_saiga_nemo_12b_v3.json 
b/results/v2/ru/judge_gpt_4o_player_saiga_nemo_12b_v3.json new file mode 100644 index 0000000..400a0de --- /dev/null +++ b/results/v2/ru/judge_gpt_4o_player_saiga_nemo_12b_v3.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbad6d495c51467ef86d9276f587c4e72755ac51a04e5a9e0baeac1329a69d7c +size 740978 diff --git a/src/annotations/convert_results.py b/src/annotations/convert_results.py index 53227be..5d9c699 100644 --- a/src/annotations/convert_results.py +++ b/src/annotations/convert_results.py @@ -1,4 +1,4 @@ -import fire +import fire # type: ignore import json mapping = { @@ -9,7 +9,7 @@ "Полностью согласен": 5, } -def main(input_path, orig_path, output_path): +def main(input_path: str, orig_path: str, output_path: str) -> None: orig_records = dict() with open(orig_path) as r: for idx, line in enumerate(r): diff --git a/src/annotations/convert_to_label_studio.py b/src/annotations/convert_to_label_studio.py index d1ea7b2..829ed6c 100644 --- a/src/annotations/convert_to_label_studio.py +++ b/src/annotations/convert_to_label_studio.py @@ -1,9 +1,9 @@ import json import csv -from typing import Dict, Any +from typing import Dict, Any, List -import fire -import markdown +import fire # type: ignore +import markdown # type: ignore def to_markdown(record: Dict[str, Any]) -> str: @@ -11,13 +11,12 @@ def to_markdown(record: Dict[str, Any]) -> str: messages = record["messages"] for m in messages: content = m["content"] - #content = content.replace("*", "**") result += "\n**{role}**:\n\n{content}\n\n".format(role=m["role"].capitalize(), content=content) return result -def markdown_to_html(text): - html = markdown.markdown(text) +def markdown_to_html(text: str) -> str: + html: str = markdown.markdown(text) user_color = "#6a9fb5" assistant_color = "#4f6b12" template = "{role}:

\n" @@ -28,8 +27,9 @@ def markdown_to_html(text): print(html) return html -def main(input_path: str, output_path: str): - new_records = [] + +def main(input_path: str, output_path: str) -> None: + new_records: List[Dict[str, Any]] = [] with open(input_path) as r: for idx, line in enumerate(r): record = json.loads(line) @@ -47,8 +47,8 @@ def main(input_path: str, output_path: str): writer = csv.writer(w) header = list(new_records[0].keys()) writer.writerow(header) - for r in new_records: - row = [r[k] for k in header] + for rec in new_records: + row = [rec[k] for k in header] writer.writerow(row) diff --git a/src/annotations/merge_annotations.py b/src/annotations/merge_annotations.py index 2510955..a722a64 100644 --- a/src/annotations/merge_annotations.py +++ b/src/annotations/merge_annotations.py @@ -1,15 +1,16 @@ -import fire +import fire # type: ignore import json +from typing import Dict, List from collections import defaultdict from statistics import mean -def main(files: str, output_path: str): - files = files.split(",") +def main(files: str, output_path: str) -> None: + all_files = files.split(",") records = dict() - scores = defaultdict(lambda: defaultdict(list)) - for f in files: + scores: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list)) + for f in all_files: with open(f) as r: for line in r: record = json.loads(line) diff --git a/templates/annotations/label_studio_instruction_en.jinja b/templates/annotations/label_studio_instruction_en.jinja new file mode 100644 index 0000000..04cad0f --- /dev/null +++ b/templates/annotations/label_studio_instruction_en.jinja @@ -0,0 +1,10 @@ +Please carefully read the character card and the dialogue. Based on the assistant's responses, answer 3 questions about the quality of these responses. The criteria for evaluating the answers are: +
    +
  • Adherence to a character card: nothing the assistant says should contradict the character card.
  • +
  • Entertainment value: you should find the assistant's answers interesting to read, and they should not repeat between different responses within the same dialogue.
  • +
  • Language fluency: responses should be in fluent English, unless otherwise specified in the character card.
  • +
+

Questions and Answers

+

Question: What should be done if the assistant responds in Chinese instead of English? Answer: Give the minimum score for the fluency question; score the other questions at your discretion.

+

Question: What should be done if the assistant's responses are repetitive? Answer: Give the minimum score for the entertainment question; score the other questions at your discretion.

+

Question: What should be done if the user's responses are not very appropriate? Answer: Nothing, your task is to evaluate only the assistant's responses.

diff --git a/templates/annotations/label_studio_instruction_ru.jinja b/templates/annotations/label_studio_instruction_ru.jinja new file mode 100644 index 0000000..b6c7bad --- /dev/null +++ b/templates/annotations/label_studio_instruction_ru.jinja @@ -0,0 +1,13 @@ +Внимательно прочитайте карточку персонажа и диалог. На основе реплик ассистента ответьте на 3 вопроса о качестве этих реплик. Критерии, по которым нужно оценить ответы: +
    +
  • Соответствие карточке персонажа: всё, что говорит ассистент, не должно противоречить карточке.
  • +
  • Развлекательность: вам должно быть интересно читать ответы ассистента, они не должны повторяться между разными репликами в рамках одного диалога.
  • +
  • Язык: ответы должны быть на хорошем русском языке, если иного не указано в карточке персонажа. +
+ +

Вопросы и ответы

+

Вопрос: Что делать, если ассистент отвечает на английском вместо русского? Ответ: В вопросе про язык нужно поставить минимальный балл, в остальных — на ваше усмотрение.

+ +

Вопрос: Что делать, если реплики ассистента повторяются? Ответ: В вопросе про развлекательность нужно поставить минимальный балл, в остальных — на ваше усмотрение.

+ +

Вопрос: Что делать, если реплики пользователя не очень корректны? Ответ: Ничего, ваша задача — оценка только ответов ассистента.

diff --git a/templates/annotations/label_studio_ui_en.html b/templates/annotations/label_studio_ui_en.html new file mode 100644 index 0000000..2a2a6f5 --- /dev/null +++ b/templates/annotations/label_studio_ui_en.html @@ -0,0 +1,39 @@ + + + + + + + + + +
+ + + + + + + + + +
+ + + + + + + + + +
+ + + + + + + + + diff --git a/templates/annotations/label_studio_ui_ru.html b/templates/annotations/label_studio_ui_ru.html new file mode 100644 index 0000000..da0e66b --- /dev/null +++ b/templates/annotations/label_studio_ui_ru.html @@ -0,0 +1,39 @@ + + + + + + + + + +
+ + + + + + + + + +
+ + + + + + + + + +
+ + + + + + + + +