Merge branch 'main' of https://github.com/stanford-crfm/helm into newvlms
teetone committed Dec 27, 2024
2 parents bfa44f7 + e2e7270 commit 6f4d829
Showing 50 changed files with 1,872 additions and 101 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -5,3 +5,4 @@ recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
recursive-include src/helm/benchmark/static_build/ *.css *.html *.js *.png *.yaml
recursive-include src/helm/config/ *.yaml
recursive-include src/helm/benchmark/annotation/omni_math/ *.txt
recursive-include src/helm/benchmark/annotation/wildbench/ *.md
1 change: 1 addition & 0 deletions README.md
@@ -23,6 +23,7 @@ This repository contains code used to produce results for the following papers:

- **Holistic Evaluation of Vision-Language Models (VHELM)** - [paper](https://arxiv.org/abs/2410.07112), [leaderboard](https://crfm.stanford.edu/helm/vhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/vhelm/)
- **Holistic Evaluation of Text-To-Image Models (HEIM)** - [paper](https://arxiv.org/abs/2311.04287), [leaderboard](https://crfm.stanford.edu/helm/heim/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/heim/)
- **Enterprise Benchmarks for Large Language Model Evaluation** - [paper](https://arxiv.org/abs/2410.12857), [documentation](https://crfm-helm.readthedocs.io/en/latest/enterprise_benchmark/)

The HELM Python package can be used to reproduce the published model evaluation results from these papers. To get started, refer to the documentation links above for the corresponding paper, or the [main Reproducing Leaderboards documentation](https://crfm-helm.readthedocs.io/en/latest/reproducing_leaderboards/).

18 changes: 10 additions & 8 deletions docs/enterprise_benchmark.md
@@ -13,7 +13,8 @@ from non-domain-specific benchmarks, such as benchmarks for language capabilities
Therefore, it is important to use a domain-specific dataset whose distribution is close to that of the actual application domain.

<!-- Here, public datasets from the above four domains were curated and corresponding scenarios were implemented. -->
The following scenarios have been added.

- Finance
- gold_commodity_news (news_headline)
- (WIP) financial_phrasebank
@@ -133,18 +134,19 @@ This study is published in the following paper. Please cite this paper if you use
}
```

![Finance benchmark results](helm-eb-finance-2024.png "Finance benchmark results")

![Legal benchmark results](helm-eb-legal-2024.png "Legal benchmark results")

![Climate and sustainability benchmark results](helm-eb-climate-2024.png "Climate and sustainability benchmark results")

![Cyber security benchmark results](helm-eb-cybersecurity-2024.png "Cyber security benchmark results")

- ![Finance benchmark results](helm-eb-finance-2024.png "Finance benchmark results")
- ![Legal benchmark results](helm-eb-legal-2024.png "Legal benchmark results")
- ![Climate and sustainability benchmark results](helm-eb-climate-2024.png "Climate and sustainability benchmark results")
- ![Cyber security benchmark results](helm-eb-cybersecurity-2024.png "Cyber security benchmark results")

## Contributors

Original contributors are as follows:

- Yada Zhu, Kate Soule (MIT-IBM Watson AI Lab)
- Mikio Takeuchi, Ryo Kawahara, Futoshi Iwama, Alisa Arno (IBM Research - Tokyo)
- Bing Zhang, Shubhi Asthana (IBM Almaden Research Lab)
- Md Maruf Hossain, Naoto Satoh, Guang-Jie Ren (former IBM members)

Contributors to the integration into the HELM repository are as follows:

- Yifan Mai (Stanford University)
1 change: 1 addition & 0 deletions docs/index.md
@@ -24,5 +24,6 @@ This repository contains code used to produce results for the following papers:

- **Holistic Evaluation of Vision-Language Models (VHELM)** - [paper](https://arxiv.org/abs/2410.07112), [leaderboard](https://crfm.stanford.edu/helm/vhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/vhelm/)
- **Holistic Evaluation of Text-To-Image Models (HEIM)** - [paper](https://arxiv.org/abs/2311.04287), [leaderboard](https://crfm.stanford.edu/helm/heim/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/heim/)
- **Enterprise Benchmarks for Large Language Model Evaluation** - [paper](https://arxiv.org/abs/2410.12857), [documentation](https://crfm-helm.readthedocs.io/en/latest/enterprise_benchmark/)

The HELM Python package can be used to reproduce the published model evaluation results from these papers. To get started, refer to the documentation links above for the corresponding paper, or the [main Reproducing Leaderboards documentation](https://crfm-helm.readthedocs.io/en/latest/reproducing_leaderboards/).
10 changes: 10 additions & 0 deletions docs/reproducing_leaderboards.md
@@ -122,4 +122,14 @@ export SCHEMA_PATH=schema_air_bench.yaml
export NUM_TRAIN_TRIALS=1
export NUM_EVAL_INSTANCES=10000
export PRIORITY=2
```

### Safety

```bash
export RUN_ENTRIES_CONF_PATH=run_entries_safety.conf
export SCHEMA_PATH=schema_safety.yaml
export NUM_TRAIN_TRIALS=1
export NUM_EVAL_INSTANCES=1000
export PRIORITY=2
```
16 changes: 11 additions & 5 deletions helm-frontend/src/components/Instances.tsx
@@ -23,9 +23,15 @@ interface Props {
runName: string;
suite: string;
metricFieldMap: MetricFieldMap;
userAgreed: boolean;
}

export default function Instances({ runName, suite, metricFieldMap }: Props) {
export default function Instances({
runName,
suite,
metricFieldMap,
userAgreed,
}: Props) {
const [searchParams, setSearchParams] = useSearchParams();
const [instances, setInstances] = useState<Instance[]>([]);
const [displayPredictionsMap, setDisplayPredictionsMap] = useState<
@@ -43,9 +49,9 @@ export default function Instances({ runName, suite, metricFieldMap }: Props) {

const [instancesResp, displayPredictions, displayRequests] =
await Promise.all([
getInstances(runName, signal, suite),
getDisplayPredictionsByName(runName, signal, suite),
getDisplayRequestsByName(runName, signal, suite),
getInstances(runName, signal, suite, userAgreed),
getDisplayPredictionsByName(runName, signal, suite, userAgreed),
getDisplayRequestsByName(runName, signal, suite, userAgreed),
]);
setInstances(instancesResp);

@@ -93,7 +99,7 @@ export default function Instances({ runName, suite, metricFieldMap }: Props) {
void fetchData();

return () => controller.abort();
}, [runName, suite]);
}, [runName, suite, userAgreed]);

const pagedInstances = instances.slice(
(currentInstancesPage - 1) * INSTANCES_PAGE_SIZE,
40 changes: 40 additions & 0 deletions helm-frontend/src/components/Landing/CapabilitiesLanding.tsx
@@ -0,0 +1,40 @@
import MiniLeaderboard from "@/components/MiniLeaderboard";
import { Link } from "react-router-dom";

export default function CapabilitiesLanding() {
return (
<div className="container mx-auto px-16">
<h1 className="text-3xl my-8 font-bold text-center">HELM Capabilities</h1>
<div className="flex flex-col lg:flex-row gap-8">
<div className="flex-1 text-l">
<p>
HELM Capabilities is a new leaderboard for benchmarking the
capabilities of foundation models, featuring 6 challenging
scenarios.
</p>
<div className="flex flex-row justify-center my-4">
<Link to="#" className="px-10 btn rounded-md mx-4">
Blog Post
</Link>
<Link to="leaderboard" className="px-10 btn rounded-md mx-4">
Leaderboard
</Link>
</div>
</div>
<div
className="py-2 pb-6 rounded-3xl bg-gray-100 h-full" // Stretched to full height
style={{ maxWidth: "100%" }}
>
<MiniLeaderboard />
<div className="flex justify-end">
<Link to="leaderboard">
<button className="px-4 mx-3 mt-1 btn bg-white rounded-md">
<span>See More</span>
</button>
</Link>
</div>
</div>
</div>
</div>
);
}
2 changes: 2 additions & 0 deletions helm-frontend/src/components/Landing/HomeLanding.tsx
@@ -15,6 +15,7 @@ import bigscience from "@/assets/logos/bigscience.png";
import cohere from "@/assets/logos/cohere.png";
import eleutherai from "@/assets/logos/eleutherai.png";
import google from "@/assets/logos/google.png";
import ibm from "@/assets/logos/ibm.png";
import meta from "@/assets/logos/meta.png";
import microsoft from "@/assets/logos/microsoft.png";
import mistral from "@/assets/logos/mistral.png";
@@ -36,6 +37,7 @@ const logos = [
cohere,
eleutherai,
google,
ibm,
meta,
microsoft,
mistral,
4 changes: 4 additions & 0 deletions helm-frontend/src/components/Landing/LiteLanding.tsx
@@ -9,12 +9,14 @@ import Hero from "@/components/Hero";
//import scenariosByMetrics from "@/assets/scenarios-by-metrics.png";
//import taxonomyScenarios from "@/assets/taxonomy-scenarios.png";
import ai21 from "@/assets/logos/ai21.png";
import aisingapore from "@/assets/logos/aisingapore.png";
import alephAlpha from "@/assets/logos/aleph-alpha.png";
import anthropic from "@/assets/logos/anthropic.png";
import bigscience from "@/assets/logos/bigscience.png";
import cohere from "@/assets/logos/cohere.png";
import eleutherai from "@/assets/logos/eleutherai.png";
import google from "@/assets/logos/google.png";
import ibm from "@/assets/logos/ibm.png";
import meta from "@/assets/logos/meta.png";
import microsoft from "@/assets/logos/microsoft.png";
import mistral from "@/assets/logos/mistral.png";
@@ -29,12 +31,14 @@ import zeroOne from "@/assets/logos/01.png";

const logos = [
ai21,
aisingapore,
alephAlpha,
anthropic,
bigscience,
cohere,
eleutherai,
google,
ibm,
meta,
microsoft,
mistral,
3 changes: 3 additions & 0 deletions helm-frontend/src/routes/Home.tsx
@@ -14,6 +14,7 @@ import Image2StructLanding from "@/components/Landing/Image2StructLanding";
import EWoKLanding from "@/components/Landing/EWoKLanding";
import MedicalLanding from "@/components/Landing/MedicalLanding";
import SafetyLanding from "@/components/Landing/SafetyLanding";
import CapabilitiesLanding from "@/components/Landing/CapabilitiesLanding";

export default function Home() {
// TODO consider a more streamlined way to do this?
@@ -47,6 +48,8 @@ export default function Home() {
return <MedicalLanding />;
} else if (window.PROJECT_ID === "safety") {
return <SafetyLanding />;
} else if (window.PROJECT_ID === "capabilities") {
return <CapabilitiesLanding />;
} else if (window.PROJECT_ID === "home") {
return <HomeLanding />;
} else {
49 changes: 49 additions & 0 deletions helm-frontend/src/routes/Run.tsx
@@ -37,6 +37,9 @@ export default function Run() {
MetricFieldMap | undefined
>({});

const [agreeInput, setAgreeInput] = useState("");
const [userAgreed, setUserAgreed] = useState(false);

useEffect(() => {
const controller = new AbortController();
async function fetchData() {
@@ -93,6 +96,16 @@
return <Loading />;
}

// Handler for agreement
const handleAgreement = () => {
if (agreeInput.trim() === "Yes, I agree") {
setUserAgreed(true);
} else {
setUserAgreed(false);
alert("Please type 'Yes, I agree' exactly.");
}
};

return (
<>
<div className="flex justify-between gap-8 mb-12">
@@ -178,11 +191,47 @@
</Tab>
</Tabs>
</div>

{activeTab === 0 && runName.includes("gpqa") && !userAgreed && (
<div className="mb-8">
<hr className="my-4" />
<p className="mb-4">
The GPQA dataset instances are encrypted by default to comply with
the following request:
</p>
<blockquote className="italic border-l-4 border-gray-300 pl-4 text-gray-700 mb-4">
“We ask that you do not reveal examples from this dataset in plain
text or images online, to minimize the risk of these instances being
included in foundation model training corpora.”
</blockquote>
<p className="mb-4">
If you agree to this condition, please type{" "}
<strong>"Yes, I agree"</strong> in the box below and then click{" "}
<strong>Decrypt</strong>.
</p>
<div className="flex gap-2 mt-2">
<input
type="text"
value={agreeInput}
onChange={(e) => setAgreeInput(e.target.value)}
className="input input-bordered"
placeholder='Type "Yes, I agree"'
/>
<button onClick={handleAgreement} className="btn btn-primary">
Decrypt
</button>
</div>
<hr className="my-4" />
</div>
)}

{activeTab === 0 ? (
<Instances
key={userAgreed ? "instances-agreed" : "instances-not-agreed"}
runName={runName}
suite={runSuite}
metricFieldMap={metricFieldMap}
userAgreed={userAgreed} // Pass the boolean to Instances
/>
) : (
<RunMetrics
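The `handleAgreement` handler added in Run.tsx accepts the consent phrase only as an exact, whitespace-trimmed match before flipping `userAgreed`. A minimal standalone sketch of that check (the function name here is illustrative, not part of the commit):

```typescript
// Standalone version of the exact-match check performed by handleAgreement
// in Run.tsx; the function name is illustrative, not part of the commit.
function isAgreementValid(input: string): boolean {
  // Only surrounding whitespace is forgiven; case and punctuation must match.
  return input.trim() === "Yes, I agree";
}
```

Requiring a verbatim phrase (rather than, say, a checkbox) makes the user's agreement to the GPQA display conditions an explicit, deliberate action; any mismatch falls through to the `alert` branch in the component.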
74 changes: 72 additions & 2 deletions helm-frontend/src/services/getDisplayPredictionsByName.ts
@@ -1,23 +1,93 @@
import type DisplayPrediction from "@/types/DisplayPrediction";
import { EncryptionDataMap } from "@/types/EncryptionDataMap";
import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
import getBenchmarkSuite from "@/utils/getBenchmarkSuite";

async function decryptField(
ciphertext: string,
key: string,
iv: string,
tag: string,
): Promise<string> {
const decodeBase64 = (str: string) =>
Uint8Array.from(atob(str), (c) => c.charCodeAt(0));

const cryptoKey = await window.crypto.subtle.importKey(
"raw",
decodeBase64(key),
"AES-GCM",
true,
["decrypt"],
);

const combinedCiphertext = new Uint8Array([
...decodeBase64(ciphertext),
...decodeBase64(tag),
]);

const ivArray = decodeBase64(iv);

const decrypted = await window.crypto.subtle.decrypt(
{ name: "AES-GCM", iv: ivArray },
cryptoKey,
combinedCiphertext,
);

return new TextDecoder().decode(decrypted);
}

export default async function getDisplayPredictionsByName(
runName: string,
signal: AbortSignal,
suite?: string,
userAgreed?: boolean,
): Promise<DisplayPrediction[]> {
try {
const displayPrediction = await fetch(
const response = await fetch(
getBenchmarkEndpoint(
`/runs/${
suite || getBenchmarkSuite()
}/${runName}/display_predictions.json`,
),
{ signal },
);
const displayPredictions = (await response.json()) as DisplayPrediction[];

if (runName.includes("gpqa") && userAgreed) {
const encryptionResponse = await fetch(
getBenchmarkEndpoint(
`/runs/${
suite || getBenchmarkSuite()
}/${runName}/encryption_data.json`,
),
{ signal },
);
const encryptionData =
(await encryptionResponse.json()) as EncryptionDataMap;

for (const prediction of displayPredictions) {
const encryptedText = prediction.predicted_text;
const encryptionDetails = encryptionData[encryptedText];

if (encryptionDetails) {
try {
prediction.predicted_text = await decryptField(
encryptionDetails.ciphertext,
encryptionDetails.key,
encryptionDetails.iv,
encryptionDetails.tag,
);
} catch (error) {
console.error(
`Failed to decrypt predicted_text for instance_id: ${prediction.instance_id}`,
error,
);
}
}
}
}

return (await displayPrediction.json()) as DisplayPrediction[];
return displayPredictions;
} catch (error) {
if (error instanceof Error && error.name === "AbortError") {
console.log(error);