Introducing Cross-Backend GPU Error Computation via NN Output Saving and Loading (#858)

* Save and load NN outputs for cross-backend test

To improve testing and cross-backend checks, this change introduces the
ability to save and load base files for the GPU error test. A new
`baseFileName` argument is added to the `runFP16Test` function in
`testnnevalcanary.cpp`. When provided, this argument lets the test load a
previously saved base file instead of recomputing the base positions from
scratch, so a backend can be checked against a baseline NN output file
produced by another backend.

The commit introduces two new functions in `testnnevalcanary.cpp`:
`saveBaseToFile` and `loadBaseFromFile`. The former writes a vector of
NNOutput to a binary file specified by `baseFileName`, while the latter
reads such a file back and populates a vector of NNOutput from it.

Allowing the test to load base files makes it easier to compare and
validate results between different backends, which improves the
reliability of the GPU error testing process.

* Return 1 if testgpuerror is not successful

* Set expected concurrent evals to 2 for Eigen

This commit caps the expected concurrent evaluations at 2 for the Eigen
backend, fixing a memory-usage explosion caused by too many concurrent
evaluations.
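
For context, the intended cross-backend workflow looks roughly like the
following. Only the -basefile flag comes from this change; the model and
config arguments are the usual testgpuerror options, and the file names
are illustrative:

  # Eigen (CPU) build: compute the FP32 baseline and save it to a file
  ./katago testgpuerror -model model.bin.gz -config gtp_example.cfg -basefile base.bin

  # GPU build (e.g. CUDA or OpenCL): load the Eigen baseline and compare against it
  ./katago testgpuerror -model model.bin.gz -config gtp_example.cfg -basefile base.bin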
ChinChangYang authored Dec 17, 2023
1 parent 55023a4 commit 9d2043e
Showing 4 changed files with 111 additions and 16 deletions.
cpp/command/contribute.cpp: 2 additions & 1 deletion
@@ -918,7 +918,8 @@ int MainCmds::contribute(const vector<string>& args) {
// Cap test to avoid spawning too many threads when many selfplay games are running
const int maxBatchSizeCap = std::min(4, 1 + nnEval->getMaxBatchSize()/2);
bool fp32BatchSuccessBuf = true;
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSizeTest,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf);
string baseFileName = "";
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSizeTest,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf, baseFileName);
if(!fp32BatchSuccessBuf) {
logger.write("Error: large GPU numerical errors, unable to continue");
shouldStop.store(true);
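Note that contribute.cpp passes an empty baseFileName, so the save/load path stays disabled during distributed selfplay and the FP16 test there behaves exactly as before.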
cpp/command/gputest.cpp: 13 additions & 3 deletions
@@ -26,6 +26,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
string modelFile;
int boardSize;
bool quickTest;
string baseFileName;
try {
KataGoCommandLine cmd("Test GPU error between FP16 and FP32 with and without batching");
cmd.addConfigFileArg(KataGoCommandLine::defaultGtpConfigFileName(),"gtp_example.cfg");
@@ -34,6 +35,8 @@ int MainCmds::testgpuerror(const vector<string>& args) {
TCLAP::SwitchArg quickArg("","quick","Faster shorter test");
cmd.add(boardSizeArg);
cmd.add(quickArg);
TCLAP::ValueArg<string> baseFileArg("", "basefile", "Base file to be generated by Eigen backend; loaded by other backends for cross-backend check", false, "", "FILE");
cmd.add(baseFileArg);

cmd.setShortUsageArgLimit();
cmd.addOverrideConfigArg();
@@ -43,6 +46,7 @@
modelFile = cmd.getModelFile();
boardSize = boardSizeArg.getValue();
quickTest = quickArg.getValue();
baseFileName = baseFileArg.getValue();
cmd.getConfig(cfg);

if(boardSize != 19 && boardSize != 13 && boardSize != 9)
@@ -74,7 +78,13 @@
logger.write("For batch test, using default batch size 16");
}
const int maxConcurrentEvals = maxBatchSize * 2 + 16;
const int expectedConcurrentEvals = maxBatchSize * 2 + 16;
int expectedConcurrentEvals = maxBatchSize * 2 + 16;

#ifdef USE_EIGEN_BACKEND
if(expectedConcurrentEvals > 2)
expectedConcurrentEvals = 2;
#endif

const bool defaultRequireExactNNLen = false;

NNEvaluator* nnEval;
@@ -106,7 +116,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
const int maxBatchSizeCap = -1;
const bool verbose = true;
bool fp32BatchSuccessBuf = true;
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSize,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf);
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSize,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf, baseFileName);
(void)success;
// cout << success << endl;

@@ -116,5 +126,5 @@ int MainCmds::testgpuerror(const vector<string>& args) {
NeuralNet::globalCleanup();
ScoreValue::freeTables();

return 0;
return success ? 0 : 1;
}
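
Two behavioral changes land in this file: testgpuerror now exits nonzero when the FP16 test fails, so scripts and CI can detect a failed cross-backend check, and under the Eigen backend the expected concurrent evaluations are capped at 2, which (per the commit message) keeps memory usage from exploding.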
cpp/tests/testnnevalcanary.cpp: 95 additions & 11 deletions
@@ -276,7 +276,79 @@ struct GpuErrorStats {
}
};

bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logger, int boardSize, int maxBatchSizeCap, bool verbose, bool quickTest, bool& fp32BatchSuccessBuf) {
void saveBaseToFile(const std::vector<std::shared_ptr<NNOutput>>& base, const string& baseFileName, Logger& logger, bool verbose) {
assert(baseFileName != "");
std::ofstream outFile(baseFileName, std::ios::binary);

if (!outFile)
throw StringError("Unable to save base to: " + baseFileName);

size_t size = base.size();
outFile.write(reinterpret_cast<const char*>(&size), sizeof(size));

for (const auto& nnOutputPtr : base) {
if (nnOutputPtr) {
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->nnHash), sizeof(nnOutputPtr->nnHash));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteWinProb), sizeof(nnOutputPtr->whiteWinProb));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteLossProb), sizeof(nnOutputPtr->whiteLossProb));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteNoResultProb), sizeof(nnOutputPtr->whiteNoResultProb));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteScoreMean), sizeof(nnOutputPtr->whiteScoreMean));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteScoreMeanSq), sizeof(nnOutputPtr->whiteScoreMeanSq));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteLead), sizeof(nnOutputPtr->whiteLead));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->varTimeLeft), sizeof(nnOutputPtr->varTimeLeft));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->shorttermWinlossError), sizeof(nnOutputPtr->shorttermWinlossError));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->shorttermScoreError), sizeof(nnOutputPtr->shorttermScoreError));
outFile.write(reinterpret_cast<const char*>(nnOutputPtr->policyProbs), sizeof(float) * NNPos::MAX_NN_POLICY_SIZE);
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->nnXLen), sizeof(nnOutputPtr->nnXLen));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->nnYLen), sizeof(nnOutputPtr->nnYLen));
}
}

if (verbose)
logger.write("Saved " + Global::uint64ToString((uint64_t)base.size()) + " positions to: " + baseFileName);

outFile.close();
}

void loadBaseFromFile(std::vector<std::shared_ptr<NNOutput>>& base, const string& baseFileName, Logger& logger, bool verbose) {
assert(baseFileName != "");
std::ifstream inFile(baseFileName, std::ios::binary);

if (!inFile)
throw StringError("Unable to load: " + baseFileName);

size_t size;
inFile.read(reinterpret_cast<char*>(&size), sizeof(size));
base.resize(size);

for (size_t i = 0; i < size; ++i) {
base[i] = std::make_shared<NNOutput>();

inFile.read(reinterpret_cast<char*>(&base[i]->nnHash), sizeof(base[i]->nnHash));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteWinProb), sizeof(base[i]->whiteWinProb));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteLossProb), sizeof(base[i]->whiteLossProb));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteNoResultProb), sizeof(base[i]->whiteNoResultProb));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteScoreMean), sizeof(base[i]->whiteScoreMean));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteScoreMeanSq), sizeof(base[i]->whiteScoreMeanSq));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteLead), sizeof(base[i]->whiteLead));
inFile.read(reinterpret_cast<char*>(&base[i]->varTimeLeft), sizeof(base[i]->varTimeLeft));
inFile.read(reinterpret_cast<char*>(&base[i]->shorttermWinlossError), sizeof(base[i]->shorttermWinlossError));
inFile.read(reinterpret_cast<char*>(&base[i]->shorttermScoreError), sizeof(base[i]->shorttermScoreError));
inFile.read(reinterpret_cast<char*>(&base[i]->policyProbs), sizeof(float) * NNPos::MAX_NN_POLICY_SIZE);
inFile.read(reinterpret_cast<char*>(&base[i]->nnXLen), sizeof(base[i]->nnXLen));
inFile.read(reinterpret_cast<char*>(&base[i]->nnYLen), sizeof(base[i]->nnYLen));

base[i]->whiteOwnerMap = nullptr;
base[i]->noisedPolicyProbs = nullptr;
}

if (verbose)
logger.write("Loaded " + Global::uint64ToString((uint64_t)base.size()) + " positions from: " + baseFileName);

inFile.close();
}

bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logger, int boardSize, int maxBatchSizeCap, bool verbose, bool quickTest, bool& fp32BatchSuccessBuf, const string& baseFileName) {

int maxBatchSize = nnEval->getMaxBatchSize();
if(maxBatchSize != nnEval32->getMaxBatchSize())
@@ -287,13 +359,10 @@ bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logg
throw StringError("Invalid max batch size for fp16 test");

#ifdef USE_EIGEN_BACKEND
(void)logger;
(void)boardSize;
(void)verbose;
(void)quickTest;
fp32BatchSuccessBuf = true;
return true;
#else
if (baseFileName == "")
return true;
#endif

Rand filterRand("Tests::runFP16Test filter rand");
auto loadHists = [&](const std::vector<string>& sgfStrs) {
std::vector<BoardHistory> hists;
@@ -346,8 +415,24 @@ bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logg
if(verbose)
logger.write("Running evaluations in fp32");
std::vector<std::shared_ptr<NNOutput>> base;
for(const BoardHistory& hist: hists)
base.push_back(evalBoard(nnEval32,hist));

bool loadedBaseFromFile = false;

#ifndef USE_EIGEN_BACKEND
if (baseFileName != "") {
loadBaseFromFile(base, baseFileName, logger, verbose);
loadedBaseFromFile = true;
}
#endif

if (!loadedBaseFromFile)
for(const BoardHistory& hist: hists)
base.push_back(evalBoard(nnEval32,hist));

#ifdef USE_EIGEN_BACKEND
assert(baseFileName != "");
saveBaseToFile(base, baseFileName, logger, verbose);
#endif

std::vector<std::shared_ptr<NNOutput>> batched(hists.size());
std::vector<std::shared_ptr<NNOutput>> current;
@@ -430,5 +515,4 @@ bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logg

return success;
}
#endif
}
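
The base file written above is a raw, fixed-layout binary dump: a size_t
element count followed by the listed NNOutput fields for each position.
That makes it portable only between builds that agree on endianness and on
the widths of size_t and the serialized fields. As a minimal sketch of how
the format could be sanity-checked, a hypothetical round-trip helper (not
part of this commit; it assumes saveBaseToFile and loadBaseFromFile are
made visible to the caller, e.g. via a shared header) might look like:

  // Hypothetical round-trip check for the base-file format (not part of this commit).
  // Assumes saveBaseToFile/loadBaseFromFile from testnnevalcanary.cpp are declared here.
  static bool roundTripBaseFile(
    const std::vector<std::shared_ptr<NNOutput>>& base,
    const std::string& fileName,
    Logger& logger
  ) {
    saveBaseToFile(base, fileName, logger, /*verbose=*/false);
    std::vector<std::shared_ptr<NNOutput>> reloaded;
    loadBaseFromFile(reloaded, fileName, logger, /*verbose=*/false);
    if(reloaded.size() != base.size())
      return false;
    for(size_t i = 0; i < base.size(); i++) {
      // Fields are written and read back with identical widths, so a round
      // trip on the same machine should reproduce them bit-for-bit.
      if(reloaded[i]->nnHash != base[i]->nnHash)
        return false;
      if(reloaded[i]->whiteWinProb != base[i]->whiteWinProb)
        return false;
      if(reloaded[i]->whiteLead != base[i]->whiteLead)
        return false;
    }
    return true;
  }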
cpp/tests/tests.h: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ namespace Tests {

//testnnevalcanary.cpp
void runCanaryTests(NNEvaluator* nnEval, int symmetry, bool print);
bool runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logger, int boardSize, int maxBatchSizeCap, bool verbose, bool quickTest, bool& fp32BatchSuccessBuf);
bool runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logger, int boardSize, int maxBatchSizeCap, bool verbose, bool quickTest, bool& fp32BatchSuccessBuf, const std::string& baseFileName);

//testconfig.cpp
void runInlineConfigTests();
