From ff2f9119055e5a19e52ea53f04afb4ffd25b23f3 Mon Sep 17 00:00:00 2001
From: eelpigna
Date: Fri, 13 Dec 2024 13:43:43 +0100
Subject: [PATCH] feat: completion spark job (#210)

* llm_metrics job with tests

* feat(spark): add completion spark job

* feat: add spark-test service

* fix(spark): completion job entry point

* fix(spark): minor fix

* fix(spark): renamed completion classes

---------

Co-authored-by: carlopignatiello
---
 docker-compose.yaml                           | 10 +++
 spark-test/Dockerfile                         | 11 +++
 spark-test/conf.py                            | 14 +++
 spark-test/main.py                            | 37 ++++++++
 spark-test/requirements.txt                   |  1 +
 spark/jobs/completion_job.py                  | 10 ++-
 spark/jobs/metrics/completion_metrics.py      | 85 +++++++++++++++++++
 spark/jobs/models/completion_dataset.py       | 35 ++++++++
 spark/tests/completion_metrics_test.py        | 45 ++++++++++
 spark/tests/resources/completion/metrics.json |  4 +
 .../resources/completion/metrics_one.json     |  5 ++
 .../results/completion_metrics_results.py     | 45 ++++++++++
 12 files changed, 300 insertions(+), 2 deletions(-)
 create mode 100644 spark-test/Dockerfile
 create mode 100644 spark-test/conf.py
 create mode 100644 spark-test/main.py
 create mode 100644 spark-test/requirements.txt
 create mode 100644 spark/jobs/metrics/completion_metrics.py
 create mode 100644 spark/jobs/models/completion_dataset.py
 create mode 100644 spark/tests/completion_metrics_test.py
 create mode 100644 spark/tests/resources/completion/metrics.json
 create mode 100644 spark/tests/resources/completion/metrics_one.json
 create mode 100644 spark/tests/results/completion_metrics_results.py

diff --git a/docker-compose.yaml b/docker-compose.yaml
index 4227869e..42f727d4 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -203,6 +203,16 @@ services:
       start_period: 5s
       retries: 2
 
+  spark-test:
+    profiles: ["spark-test"]
+    build: ./spark-test
+    environment:
+      JOB_NAME: "completion"
+      KUBECONFIG_FILE_PATH: "/opt/kubeconfig/kubeconfig.yaml"
+      SPARK_IMAGE: "radicalbit-spark-py:develop"
+    volumes:
+      - ./docker/k3s_data/kubeconfig/kubeconfig.yaml:/opt/kubeconfig/kubeconfig.yaml
+
   dind:
     image: docker:dind
     privileged: true
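The spark-test service is gated behind the "spark-test" compose profile, so a plain "docker compose up" leaves it stopped; assuming a Compose version with profile support, it can be started with "docker compose --profile spark-test up spark-test". It submits the job named by JOB_NAME to the local k3s cluster through the kubeconfig mounted from ./docker/k3s_data.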
+ +CMD ["python3", "main.py"] \ No newline at end of file diff --git a/spark-test/conf.py b/spark-test/conf.py new file mode 100644 index 00000000..4165da02 --- /dev/null +++ b/spark-test/conf.py @@ -0,0 +1,14 @@ +def create_secrets(): + return { + "AWS_ACCESS_KEY_ID": "minio", + "AWS_SECRET_ACCESS_KEY": "minio123", + "AWS_REGION": "us-east-1", + "S3_ENDPOINT_URL": "http://minio:9000", + "POSTGRES_URL": "jdbc:postgresql://postgres:5432/radicalbit", + "POSTGRES_DB": "radicalbit", + "POSTGRES_HOST": "postgres", + "POSTGRES_PORT": "5432", + "POSTGRES_USER": "postgres", + "POSTGRES_PASSWORD": "postgres", + "POSTGRES_SCHEMA": "public", + } diff --git a/spark-test/main.py b/spark-test/main.py new file mode 100644 index 00000000..268e052c --- /dev/null +++ b/spark-test/main.py @@ -0,0 +1,37 @@ +import os +from conf import create_secrets +from uuid import uuid4 +from spark_on_k8s.k8s.sync_client import KubernetesClientManager +from spark_on_k8s.client import SparkOnK8S + +envs = ["KUBECONFIG_FILE_PATH", "JOB_NAME", "SPARK_IMAGE"] + +for var in envs: + if var not in os.environ: + raise EnvironmentError("Failed because {} is not set.".format(var)) + +kube_conf = os.environ["KUBECONFIG_FILE_PATH"] +job_name = os.environ["JOB_NAME"] +spark_image = os.environ["SPARK_IMAGE"] + +k8s_client_manager = KubernetesClientManager(kube_conf) +spark_k8s_client = SparkOnK8S(k8s_client_manager=k8s_client_manager) + +path = "s3a://test-bucket/metrics_one.json" + +spark_k8s_client.submit_app( + image=spark_image, + app_path=f"local:///opt/spark/custom_jobs/{job_name}_job.py", + app_arguments=[ + path, + str(uuid4()), + "completion_dataset_metrics", + "completion_dataset" + ], + app_name=f"{spark_image}-completion-job", + namespace="spark", + service_account="spark", + app_waiter="no_wait", + image_pull_policy="IfNotPresent", + secret_values=create_secrets(), +) diff --git a/spark-test/requirements.txt b/spark-test/requirements.txt new file mode 100644 index 00000000..24216a5a --- /dev/null +++ b/spark-test/requirements.txt @@ -0,0 +1 @@ +spark-on-k8s==0.10.1 \ No newline at end of file diff --git a/spark/jobs/completion_job.py b/spark/jobs/completion_job.py index d601c223..93ab9a91 100644 --- a/spark/jobs/completion_job.py +++ b/spark/jobs/completion_job.py @@ -1,8 +1,10 @@ import sys import os import uuid - +import orjson from pyspark.sql.types import StructField, StructType, StringType + +from metrics.completion_metrics import CompletionMetrics from utils.models import JobStatus from utils.db import update_job_status, write_to_db @@ -13,7 +15,11 @@ def compute_metrics(df: DataFrame) -> dict: complete_record = {} - # TODO: compute model quality metrics + completion_service = CompletionMetrics() + model_quality = completion_service.extract_metrics(df) + complete_record["MODEL_QUALITY"] = orjson.dumps(model_quality.model_dump(serialize_as_any=True)).decode( + "utf-8" + ) return complete_record diff --git a/spark/jobs/metrics/completion_metrics.py b/spark/jobs/metrics/completion_metrics.py new file mode 100644 index 00000000..51ab9c0f --- /dev/null +++ b/spark/jobs/metrics/completion_metrics.py @@ -0,0 +1,85 @@ +import pyspark.sql.functions as F +import numpy as np +from pyspark.sql import DataFrame +from pyspark.sql.types import FloatType + +from models.completion_dataset import CompletionMetricsModel + + +class CompletionMetrics: + def __init__(self): + pass + + @staticmethod + @F.udf(FloatType()) + def compute_probability_udf(log_prob: float) -> float: + return float(np.exp(log_prob)) + + @staticmethod + 
diff --git a/spark/jobs/completion_job.py b/spark/jobs/completion_job.py
index d601c223..93ab9a91 100644
--- a/spark/jobs/completion_job.py
+++ b/spark/jobs/completion_job.py
@@ -1,8 +1,10 @@
 import sys
 import os
 import uuid
-
+import orjson
 from pyspark.sql.types import StructField, StructType, StringType
+
+from metrics.completion_metrics import CompletionMetrics
 from utils.models import JobStatus
 from utils.db import update_job_status, write_to_db
 
@@ -13,7 +15,11 @@
 def compute_metrics(df: DataFrame) -> dict:
     complete_record = {}
 
-    # TODO: compute model quality metrics
+    completion_service = CompletionMetrics()
+    model_quality = completion_service.extract_metrics(df)
+    complete_record["MODEL_QUALITY"] = orjson.dumps(model_quality.model_dump(serialize_as_any=True)).decode(
+        "utf-8"
+    )
 
     return complete_record
 
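compute_metrics stores MODEL_QUALITY as a JSON string rather than a nested object, presumably so it can be written to a single database column by write_to_db. A minimal standalone illustration of the orjson step, with a plain dict standing in for the pydantic model_dump output:

    import orjson

    model_quality = {"mean_per_file": [{"prob_tot_mean": 0.62, "perplex_tot_mean": 3.14}]}
    payload = orjson.dumps(model_quality).decode("utf-8")  # orjson returns bytes
    print(payload)  # {"mean_per_file":[{"prob_tot_mean":0.62,"perplex_tot_mean":3.14}]}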
diff --git a/spark/jobs/metrics/completion_metrics.py b/spark/jobs/metrics/completion_metrics.py
new file mode 100644
index 00000000..51ab9c0f
--- /dev/null
+++ b/spark/jobs/metrics/completion_metrics.py
@@ -0,0 +1,85 @@
+import pyspark.sql.functions as F
+import numpy as np
+from pyspark.sql import DataFrame
+from pyspark.sql.types import FloatType
+
+from models.completion_dataset import CompletionMetricsModel
+
+
+class CompletionMetrics:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    @F.udf(FloatType())
+    def compute_probability_udf(log_prob: float) -> float:
+        return float(np.exp(log_prob))
+
+    @staticmethod
+    @F.udf(FloatType())
+    def compute_perplexity(log_probs: list[float]) -> float:
+        return float(np.exp(-np.mean(log_probs)))
+
+    @staticmethod
+    @F.udf(FloatType())
+    def compute_prob_mean_per_phrase(probs: list[float]) -> float:
+        return float(np.mean(probs))
+
+    @staticmethod
+    def remove_columns(df: DataFrame) -> DataFrame:
+        df = df.drop(
+            "model",
+            "object",
+            "created",
+            "system_fingerprint",
+            "usage",
+            "service_tier",
+        )
+        return df
+
+    def compute_prob(self, df: DataFrame):
+        df = df.select(F.explode("choices").alias("element"), F.col("id"))
+        df = df.select(
+            F.col("id"), F.explode("element.logprobs.content").alias("content")
+        )
+        df = df.select("id", "content.logprob", "content.token").withColumn(
+            "prob", self.compute_probability_udf("logprob")
+        )
+        return df
+
+    def extract_metrics(self, df: DataFrame) -> CompletionMetricsModel:
+        df = self.remove_columns(df)
+        df = self.compute_prob(df)
+        df_prob = df.drop("logprob")
+        df_prob = df_prob.groupBy("id").agg(
+            F.collect_list(F.struct("token", "prob")).alias("probs")
+        )
+        df_mean_values = df.groupBy("id").agg(
+            self.compute_prob_mean_per_phrase(F.collect_list("prob")).alias(
+                "prob_per_phrase"
+            ),
+            self.compute_perplexity(F.collect_list("logprob")).alias(
+                "perplex_per_phrase"
+            ),
+        )
+        df = df_mean_values.agg(
+            F.mean("prob_per_phrase").alias("prob_tot_mean"),
+            F.mean("perplex_per_phrase").alias("perplex_tot_mean"),
+        )
+        tokens = [
+            {
+                "id": row["id"],
+                "probs": [
+                    {"token": prob["token"], "prob": prob["prob"]}
+                    for prob in row["probs"]
+                ],
+            }
+            for row in df_prob.toLocalIterator()
+        ]
+
+        res = {
+            "tokens": tokens,
+            "mean_per_phrase": df_mean_values.toPandas().to_dict(orient="records"),
+            "mean_per_file": df.toPandas().to_dict(orient="records"),
+        }
+        return CompletionMetricsModel(**res)
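The three UDFs implement textbook definitions: a token's probability is exp(logprob), the per-phrase probability is the mean of the token probabilities, and per-phrase perplexity is exp of the negative mean log-probability. A standalone numpy check against the first record of the metrics.json fixture added below reproduces the expected values in completion_metrics_results.py:

    import numpy as np

    # logprobs of the 8 tokens in "Sure, go ahead. What's up?"
    log_probs = [-0.61251247, -0.102561064, -2.5411978, -0.0014073749,
                 -2.0402107, -0.8943377, -0.08216706, -4.978234e-05]

    probs = np.exp(log_probs)                  # per-token probabilities
    print(float(probs[0]))                     # ~0.541987 ("Sure")
    print(float(np.mean(probs)))               # ~0.622728  -> prob_per_phrase
    print(float(np.exp(-np.mean(log_probs))))  # ~2.190885  -> perplex_per_phrase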
What's up?", "refusal": null, "role": "assistant", "tool_calls": [], "parsed": null}}], "created": 1733743961, "model": "gpt-4o-2024-08-06", "object": "chat.completion", "system_fingerprint": "afnfwuinawufwa", "usage": {"completion_tokens": 8, "prompt_tokens": 45, "total_tokens": 53, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}}, + {"id": "chatcmpl-AcYMMPLnpkksCdLze3M8nnqQbfqVG", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": {"content": [{"token": "Certainly", "bytes": [67, 101, 114, 116, 97, 105, 110, 108, 121], "logprob": -3.8160203, "top_logprobs": []}, {"token": "!", "bytes": [33], "logprob": -0.11697425, "top_logprobs": []}, {"token": " Just", "bytes": [32, 74, 117, 115, 116], "logprob": -5.9011784, "top_logprobs": []}, {"token": " let", "bytes": [32, 108, 101, 116], "logprob": -0.666558, "top_logprobs": []}, {"token": " me", "bytes": [32, 109, 101], "logprob": -5.574252e-05, "top_logprobs": []}, {"token": " know", "bytes": [32, 107, 110, 111, 119], "logprob": -0.0008052219, "top_logprobs": []}, {"token": " how", "bytes": [32, 104, 111, 119], "logprob": -0.7132411, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.034184996, "top_logprobs": []}], "refusal": null}, "message": {"content": "Certainly! Just let me know how.", "refusal": null, "role": "assistant", "tool_calls": [], "parsed": null}}], "created": 1733751906, "model": "gpt-4o-2024-08-06", "object": "chat.completion", "system_fingerprint": "afnfwuinawufwa", "usage": {"completion_tokens": 8, "prompt_tokens": 45, "total_tokens": 53, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}} +] \ No newline at end of file diff --git a/spark/tests/resources/completion/metrics_one.json b/spark/tests/resources/completion/metrics_one.json new file mode 100644 index 00000000..7395065d --- /dev/null +++ b/spark/tests/resources/completion/metrics_one.json @@ -0,0 +1,5 @@ +[ + {"id": "chatcmpl-AcYPBK1t4QUvRG1oiS2JjoK0xdrGH", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": {"content": [{"token": "Sure", "bytes": [83, 117, 114, 101], "logprob": -0.46845868, "top_logprobs": []}, {"token": "!", "bytes": [33], "logprob": -0.82938546, "top_logprobs": []}, {"token": " Please", "bytes": [32, 80, 108, 101, 97, 115, 101], "logprob": -2.1082883, "top_logprobs": []}, {"token": " ask", "bytes": [32, 97, 115, 107], "logprob": -0.67591065, "top_logprobs": []}, {"token": " your", "bytes": [32, 121, 111, 117, 114], "logprob": -0.022492433, "top_logprobs": []}, {"token": " question", "bytes": [32, 113, 117, 101, 115, 116, 105, 111, 110], "logprob": -0.13909559, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -3.3818815, "top_logprobs": []}, {"token": " I'll", "bytes": [32, 73, 39, 108, 108], "logprob": -0.4059926, "top_logprobs": []}, {"token": " be", "bytes": [32, 98, 101], "logprob": -1.799196, "top_logprobs": []}, {"token": " concise", "bytes": [32, 99, 111, 110, 99, 105, 115, 101], "logprob": -0.23712571, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.03838387, "top_logprobs": []}], "refusal": null}, "message": {"content": "Sure! Please ask your question. 
diff --git a/spark/tests/completion_metrics_test.py b/spark/tests/completion_metrics_test.py
new file mode 100644
index 00000000..d4c81c43
--- /dev/null
+++ b/spark/tests/completion_metrics_test.py
@@ -0,0 +1,45 @@
+import pytest
+import orjson
+from jobs.completion_job import compute_metrics
+from jobs.metrics.completion_metrics import CompletionMetrics
+from jobs.models.completion_dataset import CompletionMetricsModel
+from tests.results.completion_metrics_results import completion_metric_results
+
+
+@pytest.fixture
+def input_file(spark_fixture, test_data_dir):
+    yield spark_fixture.read.option("multiline", "true").json(
+        f"{test_data_dir}/completion/metrics.json"
+    )
+
+
+def test_remove_columns(spark_fixture, input_file):
+    completion_metrics_service = CompletionMetrics()
+    df = completion_metrics_service.remove_columns(input_file)
+    assert "id" in df.columns
+    assert "choices" in df.columns
+    assert len(df.columns) == 2
+
+
+def test_compute_prob(spark_fixture, input_file):
+    completion_metrics_service = CompletionMetrics()
+    df = completion_metrics_service.remove_columns(input_file)
+    df = completion_metrics_service.compute_prob(df)
+    assert {"id", "logprob", "token", "prob"} == set(df.columns)
+    assert not df.rdd.isEmpty()
+
+
+def test_extract_metrics(spark_fixture, input_file):
+    completion_metrics_service = CompletionMetrics()
+    completion_metrics_model: CompletionMetricsModel = completion_metrics_service.extract_metrics(input_file)
+    assert len(completion_metrics_model.tokens) > 0
+    assert len(completion_metrics_model.mean_per_phrase) > 0
+    assert len(completion_metrics_model.mean_per_file) > 0
+
+
+def test_compute_metrics(spark_fixture, input_file):
+    complete_record = compute_metrics(input_file)
+    model_quality = complete_record.get("MODEL_QUALITY")
+    assert model_quality == orjson.dumps(completion_metric_results).decode(
+        "utf-8"
+    )
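The test module assumes the suite's existing spark_fixture and test_data_dir pytest fixtures, which are not part of this patch. The JSON fixtures below mirror OpenAI chat.completion responses with logprobs enabled; only the id column and the token/logprob pairs under choices[].logprobs.content are consumed downstream, which is why remove_columns can drop everything else.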
diff --git a/spark/tests/resources/completion/metrics.json b/spark/tests/resources/completion/metrics.json
new file mode 100644
index 00000000..1062803f
--- /dev/null
+++ b/spark/tests/resources/completion/metrics.json
@@ -0,0 +1,4 @@
+[
+  {"id": "chatcmpl-AcWID2SsE5iuK6z5AhNCKv3WUcCxN", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": {"content": [{"token": "Sure", "bytes": [83, 117, 114, 101], "logprob": -0.61251247, "top_logprobs": []}, {"token": ",", "bytes": [44], "logprob": -0.102561064, "top_logprobs": []}, {"token": " go", "bytes": [32, 103, 111], "logprob": -2.5411978, "top_logprobs": []}, {"token": " ahead", "bytes": [32, 97, 104, 101, 97, 100], "logprob": -0.0014073749, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -2.0402107, "top_logprobs": []}, {"token": " What's", "bytes": [32, 87, 104, 97, 116, 39, 115], "logprob": -0.8943377, "top_logprobs": []}, {"token": " up", "bytes": [32, 117, 112], "logprob": -0.08216706, "top_logprobs": []}, {"token": "?", "bytes": [63], "logprob": -4.978234e-05, "top_logprobs": []}], "refusal": null}, "message": {"content": "Sure, go ahead. What's up?", "refusal": null, "role": "assistant", "tool_calls": [], "parsed": null}}], "created": 1733743961, "model": "gpt-4o-2024-08-06", "object": "chat.completion", "system_fingerprint": "afnfwuinawufwa", "usage": {"completion_tokens": 8, "prompt_tokens": 45, "total_tokens": 53, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}},
+  {"id": "chatcmpl-AcYMMPLnpkksCdLze3M8nnqQbfqVG", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": {"content": [{"token": "Certainly", "bytes": [67, 101, 114, 116, 97, 105, 110, 108, 121], "logprob": -3.8160203, "top_logprobs": []}, {"token": "!", "bytes": [33], "logprob": -0.11697425, "top_logprobs": []}, {"token": " Just", "bytes": [32, 74, 117, 115, 116], "logprob": -5.9011784, "top_logprobs": []}, {"token": " let", "bytes": [32, 108, 101, 116], "logprob": -0.666558, "top_logprobs": []}, {"token": " me", "bytes": [32, 109, 101], "logprob": -5.574252e-05, "top_logprobs": []}, {"token": " know", "bytes": [32, 107, 110, 111, 119], "logprob": -0.0008052219, "top_logprobs": []}, {"token": " how", "bytes": [32, 104, 111, 119], "logprob": -0.7132411, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.034184996, "top_logprobs": []}], "refusal": null}, "message": {"content": "Certainly! Just let me know how.", "refusal": null, "role": "assistant", "tool_calls": [], "parsed": null}}], "created": 1733751906, "model": "gpt-4o-2024-08-06", "object": "chat.completion", "system_fingerprint": "afnfwuinawufwa", "usage": {"completion_tokens": 8, "prompt_tokens": 45, "total_tokens": 53, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}}
+]
\ No newline at end of file
diff --git a/spark/tests/resources/completion/metrics_one.json b/spark/tests/resources/completion/metrics_one.json
new file mode 100644
index 00000000..7395065d
--- /dev/null
+++ b/spark/tests/resources/completion/metrics_one.json
@@ -0,0 +1,5 @@
+[
+  {"id": "chatcmpl-AcYPBK1t4QUvRG1oiS2JjoK0xdrGH", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": {"content": [{"token": "Sure", "bytes": [83, 117, 114, 101], "logprob": -0.46845868, "top_logprobs": []}, {"token": "!", "bytes": [33], "logprob": -0.82938546, "top_logprobs": []}, {"token": " Please", "bytes": [32, 80, 108, 101, 97, 115, 101], "logprob": -2.1082883, "top_logprobs": []}, {"token": " ask", "bytes": [32, 97, 115, 107], "logprob": -0.67591065, "top_logprobs": []}, {"token": " your", "bytes": [32, 121, 111, 117, 114], "logprob": -0.022492433, "top_logprobs": []}, {"token": " question", "bytes": [32, 113, 117, 101, 115, 116, 105, 111, 110], "logprob": -0.13909559, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -3.3818815, "top_logprobs": []}, {"token": " I'll", "bytes": [32, 73, 39, 108, 108], "logprob": -0.4059926, "top_logprobs": []}, {"token": " be", "bytes": [32, 98, 101], "logprob": -1.799196, "top_logprobs": []}, {"token": " concise", "bytes": [32, 99, 111, 110, 99, 105, 115, 101], "logprob": -0.23712571, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.03838387, "top_logprobs": []}], "refusal": null}, "message": {"content": "Sure! Please ask your question. I'll be concise.", "refusal": null, "role": "assistant", "tool_calls": [], "parsed": null}}], "created": 1733752081, "model": "gpt-4o-2024-08-06", "object": "chat.completion", "system_fingerprint": "dwadawdwd", "usage": {"completion_tokens": 11, "prompt_tokens": 45, "total_tokens": 56, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}},
+  {"id": "chatcmpl-AcYQfMsRAIA01ynfZAQb8Zmyc6WKp", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": {"content": [{"token": "Yes", "bytes": [89, 101, 115], "logprob": -5.5577775e-06, "top_logprobs": []}, {"token": ",", "bytes": [44], "logprob": -5.2285613e-05, "top_logprobs": []}, {"token": " Python", "bytes": [32, 80, 121, 116, 104, 111, 110], "logprob": -0.00089550123, "top_logprobs": []}, {"token": " is", "bytes": [32, 105, 115], "logprob": -4.246537e-06, "top_logprobs": []}, {"token": " versatile", "bytes": [32, 118, 101, 114, 115, 97, 116, 105, 108, 101], "logprob": -5.1374755, "top_logprobs": []}, {"token": " and", "bytes": [32, 97, 110, 100], "logprob": -0.22549273, "top_logprobs": []}, {"token": " widely", "bytes": [32, 119, 105, 100, 101, 108, 121], "logprob": -0.06408858, "top_logprobs": []}, {"token": " used", "bytes": [32, 117, 115, 101, 100], "logprob": -0.0007200573, "top_logprobs": []}, {"token": " for", "bytes": [32, 102, 111, 114], "logprob": -0.24617708, "top_logprobs": []}, {"token": " data", "bytes": [32, 100, 97, 116, 97], "logprob": -2.132315, "top_logprobs": []}, {"token": " analysis", "bytes": [32, 97, 110, 97, 108, 121, 115, 105, 115], "logprob": -0.13818723, "top_logprobs": []}, {"token": ",", "bytes": [44], "logprob": -0.10035052, "top_logprobs": []}, {"token": " machine", "bytes": [32, 109, 97, 99, 104, 105, 110, 101], "logprob": -0.33921197, "top_logprobs": []}, {"token": " learning", "bytes": [32, 108, 101, 97, 114, 110, 105, 110, 103], "logprob": 0.0, "top_logprobs": []}, {"token": ",", "bytes": [44], "logprob": -9.138441e-05, "top_logprobs": []}, {"token": " and", "bytes": [32, 97, 110, 100], "logprob": -0.005160108, "top_logprobs": []}, {"token": " monitoring", "bytes": [32, 109, 111, 110, 105, 116, 111, 114, 105, 110, 103], "logprob": -1.1899015, "top_logprobs": []}, {"token": " tasks", "bytes": [32, 116, 97, 115, 107, 115], "logprob": -0.50267833, "top_logprobs": []}, {"token": " due", "bytes": [32, 100, 117, 101], "logprob": -1.4371668, "top_logprobs": []}, {"token": " to", "bytes": [32, 116, 111], "logprob": -8.418666e-06, "top_logprobs": []}, {"token": " its", "bytes": [32, 105, 116, 115], "logprob": -6.9882217e-06, "top_logprobs": []}, {"token": " ease", "bytes": [32, 101, 97, 115, 101], "logprob": -3.047081, "top_logprobs": []}, {"token": " of", "bytes": [32, 111, 102], "logprob": -5.5265704e-05, "top_logprobs": []}, {"token": " use", "bytes": [32, 117, 115, 101], "logprob": -0.000113794704, "top_logprobs": []}, {"token": " and", "bytes": [32, 97, 110, 100], "logprob": -0.0067897392, "top_logprobs": []}, {"token": " robust", "bytes": [32, 114, 111, 98, 117, 115, 116], "logprob": -2.8805246, "top_logprobs": []}, {"token": " libraries", "bytes": [32, 108, 105, 98, 114, 97, 114, 105, 101, 115], "logprob": -0.05405761, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.0008492942, "top_logprobs": []}], "refusal": null}, "message": {"content": "Yes, Python is versatile and widely used for data analysis, machine learning, and monitoring tasks due to its ease of use and robust libraries.", "refusal": null, "role": "assistant", "tool_calls": [], "parsed": null}}], "created": 1733752173, "model": "gpt-4o-2024-08-06", "object": "chat.completion", "system_fingerprint": "fawfaw", "usage": {"completion_tokens": 28, "prompt_tokens": 45, "total_tokens": 73, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}},
+  {"id": "chatcmpl-AcYR545NY6eo8xAdKuEbE60YgrF1a", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": {"content": [{"token": "Yes", "bytes": [89, 101, 115], "logprob": -0.00012582695, "top_logprobs": []}, {"token": ",", "bytes": [44], "logprob": -2.8206474e-05, "top_logprobs": []}, {"token": " Rust", "bytes": [32, 82, 117, 115, 116], "logprob": -0.0001658757, "top_logprobs": []}, {"token": " is", "bytes": [32, 105, 115], "logprob": -0.0629955, "top_logprobs": []}, {"token": " known", "bytes": [32, 107, 110, 111, 119, 110], "logprob": -0.12907438, "top_logprobs": []}, {"token": " for", "bytes": [32, 102, 111, 114], "logprob": -6.704273e-07, "top_logprobs": []}, {"token": " its", "bytes": [32, 105, 116, 115], "logprob": -0.053365633, "top_logprobs": []}, {"token": " performance", "bytes": [32, 112, 101, 114, 102, 111, 114, 109, 97, 110, 99, 101], "logprob": -0.23435384, "top_logprobs": []}, {"token": ",", "bytes": [44], "logprob": -0.0018123905, "top_logprobs": []}, {"token": " memory", "bytes": [32, 109, 101, 109, 111, 114, 121], "logprob": -0.97972125, "top_logprobs": []}, {"token": " safety", "bytes": [32, 115, 97, 102, 101, 116, 121], "logprob": -1.8908588e-05, "top_logprobs": []}, {"token": ",", "bytes": [44], "logprob": -0.0049610855, "top_logprobs": []}, {"token": " and", "bytes": [32, 97, 110, 100], "logprob": -5.3954464e-05, "top_logprobs": []}, {"token": " concurrency", "bytes": [32, 99, 111, 110, 99, 117, 114, 114, 101, 110, 99, 121], "logprob": -0.08939207, "top_logprobs": []}, {"token": " capabilities", "bytes": [32, 99, 97, 112, 97, 98, 105, 108, 105, 116, 105, 101, 115], "logprob": -2.2125401, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.38892052, "top_logprobs": []}], "refusal": null}, "message": {"content": "Yes, Rust is known for its performance, memory safety, and concurrency capabilities.", "refusal": null, "role": "assistant", "tool_calls": [], "parsed": null}}], "created": 1733752199, "model": "gpt-4o-2024-08-06", "object": "chat.completion", "system_fingerprint": "wafwafwf", "usage": {"completion_tokens": 16, "prompt_tokens": 46, "total_tokens": 62, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}}
+]
\ No newline at end of file
{"token": " up", "prob": 0.9211180806159973}, + {"token": "?", "prob": 0.9999502301216125}, + ], + }, + { + "id": "chatcmpl-AcYMMPLnpkksCdLze3M8nnqQbfqVG", + "probs": [ + {"token": "Certainly", "prob": 0.022015240043401718}, + {"token": "!", "prob": 0.8896080851554871}, + {"token": " Just", "prob": 0.0027362185064703226}, + {"token": " let", "prob": 0.5134729146957397}, + {"token": " me", "prob": 0.999944269657135}, + {"token": " know", "prob": 0.9991950988769531}, + {"token": " how", "prob": 0.49005329608917236}, + {"token": ".", "prob": 0.9663926959037781}, + ], + }, + ], + "mean_per_phrase": [ + { + "id": "chatcmpl-AcWID2SsE5iuK6z5AhNCKv3WUcCxN", + "prob_per_phrase": 0.6227279901504517, + "perplex_per_phrase": 2.190884828567505, + }, + { + "id": "chatcmpl-AcYMMPLnpkksCdLze3M8nnqQbfqVG", + "prob_per_phrase": 0.61042720079422, + "perplex_per_phrase": 4.080123424530029, + }, + ], + "mean_per_file": [ + {"prob_tot_mean": 0.6165775954723358, "perplex_tot_mean": 3.135504126548767} + ], +}