From 57014385d4c280a38ccdb20b29aa0f9ca6fbf07a Mon Sep 17 00:00:00 2001 From: BJ Hargrave Date: Thu, 21 Mar 2024 20:00:44 -0400 Subject: [PATCH] schema: Define JSON schema for taxonomy yaml We use JSON Schema which has broad support including in vscode. This PR includes the schema for compositional skills (without attribution) as well as prototype schema for knowledge and metadata. The check yaml script is changed to use the check-jsonschema command to validate the compositional skill qna.yaml files against the schema. Signed-off-by: BJ Hargrave --- .github/schemas/compositional_skills.json | 54 ++++++++++++++++++ .github/schemas/knowledge.json | 55 +++++++++++++++++++ .github/schemas/knowledge_schema.yaml | 7 --- .github/schemas/metadata.json | 49 +++++++++++++++++ .github/schemas/skills_extraction_schema.yaml | 7 --- .github/schemas/skills_freeform_schema.yaml | 7 --- .github/schemas/skills_grounded_schema.yaml | 7 --- .github/scripts/check-yaml.sh | 32 +++++------ .github/workflows/lint.yml | 1 + 9 files changed, 172 insertions(+), 47 deletions(-) create mode 100644 .github/schemas/compositional_skills.json create mode 100644 .github/schemas/knowledge.json delete mode 100644 .github/schemas/knowledge_schema.yaml create mode 100644 .github/schemas/metadata.json delete mode 100644 .github/schemas/skills_extraction_schema.yaml delete mode 100644 .github/schemas/skills_freeform_schema.yaml delete mode 100644 .github/schemas/skills_grounded_schema.yaml diff --git a/.github/schemas/compositional_skills.json b/.github/schemas/compositional_skills.json new file mode 100644 index 000000000..43d999157 --- /dev/null +++ b/.github/schemas/compositional_skills.json @@ -0,0 +1,54 @@ +{ + "title": "Compositional Skill", + "description": "A compositional skill.", + "type": "object", + "required": [ + "created_by", + "task_description", + "seed_examples" + ], + "additionalProperties": false, + "properties": { + "created_by": { + "description": "The GitHub username of the 
contributor.", + "type": "string", + "minLength": 1 + }, + "task_description": { + "description": "A description of the skill.", + "type": "string", + "minLength": 1 + }, + "seed_examples": { + "description": "An array of seed examples for synthetic data generation.", + "type": "array", + "minItems": 5, + "uniqueItems": true, + "items": { + "type": "object", + "required": [ + "question", + "answer" + ], + "additionalProperties": false, + "properties": { + "context": { + "description": "Information that the model is expected to take into account during processing. This is different from knowledge, where the model is expected to gain facts and background knowledge from the tuning process.", + "type": "string", + "minLength": 1 + }, + "question": { + "description": "A question used for synthetic data generation.", + "type": "string", + "minLength": 1 + }, + "answer": { + "description": "The desired response for the question.", + "type": "string", + "minLength": 1 + } + } + } + } + } +} diff --git a/.github/schemas/knowledge.json b/.github/schemas/knowledge.json new file mode 100644 index 000000000..7799d0866 --- /dev/null +++ b/.github/schemas/knowledge.json @@ -0,0 +1,55 @@ +{ + "title": "Knowledge", + "description": "A knowledge skill.", + "type": "object", + "required": [ + "created_by", + "domain", + "task_description", + "seed_examples" + ], + "additionalProperties": false, + "properties": { + "created_by": { + "description": "The GitHub username of the contributor.", + "type": "string", + "minLength": 1 + }, + "domain": { + "description": "The knowledge domain.", + "type": "string", + "minLength": 1 + }, + "task_description": { + "description": "A description of the skill.", + "type": "string", + "minLength": 1 + }, + "seed_examples": { + "description": "An array of seed examples for synthetic data generation.", + "type": "array", + "minItems": 5, + "uniqueItems": true, + "items": { + "type": "object", + "required": [ + "question", + "answer" + ], + 
"additionalProperties": false, + "properties": { + "question": { + "description": "A question used for synthetic data generation.", + "type": "string", + "minLength": 1 + }, + "answer": { + "description": "The desired response for the question.", + "type": "string", + "minLength": 1 + } + } + } + } + } +} diff --git a/.github/schemas/knowledge_schema.yaml b/.github/schemas/knowledge_schema.yaml deleted file mode 100644 index f0d54affc..000000000 --- a/.github/schemas/knowledge_schema.yaml +++ /dev/null @@ -1,7 +0,0 @@ -seed_examples: list(include('seed'), min=5) -task_description: str(min=1) -domain: str(required=False) ---- -seed: - answer: str(min=1) - question: str(min=1) diff --git a/.github/schemas/metadata.json b/.github/schemas/metadata.json new file mode 100644 index 000000000..ca2036065 --- /dev/null +++ b/.github/schemas/metadata.json @@ -0,0 +1,49 @@ +{ + "title": "Taxonomy metadata", + "description": "Accompanying metadata for a peer taxonomy file.", + "type": "object", + "required": [ + "seed_examples" + ], + "additionalProperties": false, + "properties": { + "seed_examples": { + "description": "An array of metadata for the seed examples in the peer taxonomy file.", + "type": "array", + "minItems": 5, + "items": { + "type": "object", + "required": [ + "source", + "license" + ], + "additionalProperties": false, + "properties": { + "source": { + "title": "Attribution Source", + "description": "If information in the context, question, or answer of a seed example comes from a 3rd party, for example Wikipedia, then the value must specify a URL to the source material. 
If the contributor self-authored all the information, then the value must be 'self-authored'.", + "type": "string", + "minLength": 1, + "examples": [ + "self-authored", + "https://some.domain/path/to/source/material" + ] + }, + "license": { + "title": "Attribution License", + "description": "The value must specify the SPDX License Identifier, https://spdx.org/licenses/, of the source information. See CONTRIBUTING.MD for guidance on acceptable licenses for source information. If the information is self-authored, then 'Apache-2.0' must be used.", + "type": "string", + "minLength": 1, + "examples": [ + "CC0-1.0", + "CDLA-Permissive-2.0", + "CC-BY-4.0", + "Apache-2.0", + "MIT" + ] + } + } + } + } + } +} diff --git a/.github/schemas/skills_extraction_schema.yaml b/.github/schemas/skills_extraction_schema.yaml deleted file mode 100644 index 3e05b6460..000000000 --- a/.github/schemas/skills_extraction_schema.yaml +++ /dev/null @@ -1,7 +0,0 @@ -seed_examples: list(include('seed'), min=5) -task_description: str(min=1) ---- -seed: - answer: str(min=1) - context: str(min=1) - question: str(min=1) diff --git a/.github/schemas/skills_freeform_schema.yaml b/.github/schemas/skills_freeform_schema.yaml deleted file mode 100644 index 0fd796d6b..000000000 --- a/.github/schemas/skills_freeform_schema.yaml +++ /dev/null @@ -1,7 +0,0 @@ -seed_examples: list(include('seed'), min=5) -task_description: str(min=1) ---- -seed: - answer: str(min=1) - context: str(min=1,required=False) - question: str(min=1) diff --git a/.github/schemas/skills_grounded_schema.yaml b/.github/schemas/skills_grounded_schema.yaml deleted file mode 100644 index 3e05b6460..000000000 --- a/.github/schemas/skills_grounded_schema.yaml +++ /dev/null @@ -1,7 +0,0 @@ -seed_examples: list(include('seed'), min=5) -task_description: str(min=1) ---- -seed: - answer: str(min=1) - context: str(min=1) - question: str(min=1) diff --git a/.github/scripts/check-yaml.sh b/.github/scripts/check-yaml.sh index 6cc7044fe..bf35db10f 
100755 --- a/.github/scripts/check-yaml.sh +++ b/.github/scripts/check-yaml.sh @@ -18,28 +18,22 @@ if [ $# -lt 1 ]; then exit 1 fi +SCHEMAS="$(dirname ${BASH_SOURCE[0]})/../schemas" CHANGED_FILES="$@" ERR=0 -error() { echo "ERROR: $file:$@" 1>&2; ERR=1; } -warn() { echo "WARN: $file:$@" 1>&2; } +error() { printf "ERROR: %s: %s \"%s\"\n" "$1" "$2" "$3" 1>&2; ERR=1; } +warn() { printf "WARN: %s: %s \"%s\"\n" "$1" "$2" "$3" 1>&2; } for file in ${CHANGED_FILES}; do - case $file in knowledge*) - error "1:1: We do not accept knowledge PRs at this time" + case $file in + compositional_skills/*/qna.yaml) + eval "$(check-jsonschema --schemafile $SCHEMAS/compositional_skills.json -o JSON $file | jq -r '.errors[] | (.path | ltrimstr("$")) as $path | "\($path)|line" as $yqline | @sh "$(yq \($yqline) \(.filename))" as $yqcmd | @sh "\(.message[-200:])" as $message | "error \"\(.filename):\($yqcmd):1\" \"\($path)\" \($message)"')" + ;; + knowledge/*) + error "$file:1:1" "." "We do not accept knowledge PRs at this time" + ;; + *) + error "$file:1:1" "." 
"Taxonomy file must be named 'qna.yaml', not '$(basename $file)'" + ;; esac - if [[ "$file" != *"/qna.yaml" ]]; then - error "1:1: Skills file has to be named 'qna.yaml', not '$(basename $file)'" - fi - yq '.created_by | length > 0' $file | grep -q false && error "$(yq '.created_by|line' $file):1: missing/empty 'created_by'" - yq '.task_description | length > 0' $file | grep -q false && warn "$(yq '.task_description|line' $file):1: missing/empty 'task_description'" - yq '.seed_examples' $file | grep -q null && error "$(yq '.seed_examples|line' $file):1: missing 'seed_examples'" - yq '.seed_examples | length >= 5' $file | grep -q false && error "$(yq '.seed_examples|line' $file):1: less than 5 'seed_examples'" - yq '.seed_examples[] | .question | length > 0' $file | grep -q false && error "$(yq '.seed_examples|line' $file):1: missing/empty 'question's" - yq '.seed_examples[] | .answer | length > 0' $file | grep -q false && error "$(yq '.seed_examples|line' $file):1: missing/empty 'answer's" - if $( yq '.seed_examples[] | has("context")' $file | grep -q true ); then - yq '.seed_examples[] | .context| length > 0' $file | grep -q false && error "$(yq '.seed_examples|line' $file):1: missing/empty 'context's" - fi - yq '.seed_examples[].attribution | length > 0' $file | grep -q false && error "$(yq '.seed_examples|line' $file):1: missing/empty 'attribution's" - yq '.seed_examples[].attribution[].source | length > 0' $file | grep -q false && error "$(yq '.seed_examples|line' $file):1: missing/empty 'attribution source's" - yq '.seed_examples[].attribution[].license | length > 0' $file | grep -q false && error "$(yq '.seed_examples|line' $file):1: missing/empty 'attribution license's" done exit $ERR diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index eafa9d293..d9c98a87b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -66,6 +66,7 @@ jobs: - name: "Check file contents" if: steps.changed-files.outputs.any_changed == 'true' 
run: | + pip install check-jsonschema echo "::add-matcher::.github/workflows/matchers/lint.json" echo .github/scripts/check-yaml.sh "${{ steps.changed-files.outputs.all_changed_files }}"