Cleanup testgpuerror implementation and coverage of most other nn outputs
lightvector · Dec 17, 2023 · 53d2b33 · 53d2b33
1 parent 9d2043e
commit 53d2b33
Show file tree

Hide file tree

Showing 5 changed files with 326 additions and 190 deletions.
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
@@ -44,6 +44,7 @@ Sebastian H ("nerai") - Minor code cleanup
 Jochen Voss ("seehuhn") - Typo fix in doc
 "kinfkong" - Added trt build configuration option
 "TTXS123OK" - Minor code style improvement.
+Chin-Chang Yang - For a very useful GPU backend error testing command.
 
 Separately from the authors of the content in this repo, additional special thanks to:
 Junyan Xu ("alreadydone") - for much testing and troubleshooting for Windows support

diff --git a/cpp/command/contribute.cpp b/cpp/command/contribute.cpp
@@ -918,8 +918,8 @@ int MainCmds::contribute(const vector<string>& args) {
       // Cap test to avoid spawning too many threads when many selfplay games are running
       const int maxBatchSizeCap = std::min(4, 1 + nnEval->getMaxBatchSize()/2);
       bool fp32BatchSuccessBuf = true;
-      string baseFileName = "";
-      bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSizeTest,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf, baseFileName);
+      string referenceFileName = "";
+      bool success = Tests::runBackendErrorTest(nnEval,nnEval32,logger,boardSizeTest,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf,referenceFileName);
       if(!fp32BatchSuccessBuf) {
         logger.write("Error: large GPU numerical errors, unable to continue");
         shouldStop.store(true);

diff --git a/cpp/command/gputest.cpp b/cpp/command/gputest.cpp
@@ -26,7 +26,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
   string modelFile;
   int boardSize;
   bool quickTest;
-  string baseFileName;
+  string referenceFileName;
   try {
     KataGoCommandLine cmd("Test GPU error between FP16 and FP32 with and without batching");
     cmd.addConfigFileArg(KataGoCommandLine::defaultGtpConfigFileName(),"gtp_example.cfg");
@@ -35,8 +35,8 @@ int MainCmds::testgpuerror(const vector<string>& args) {
     TCLAP::SwitchArg quickArg("","quick","Faster shorter test");
     cmd.add(boardSizeArg);
     cmd.add(quickArg);
-    TCLAP::ValueArg<string> baseFileArg("", "basefile", "Base file to be generated by Eigen backend; loaded by other backends for cross-backend check", false, "", "FILE");
-    cmd.add(baseFileArg);
+    TCLAP::ValueArg<string> referenceFileArg("", "reference-file", "Reference file to be generated by Eigen backend; loaded by other backends for cross-backend check, if not specified then will use the backend's own FP32 as reference", false, "", "FILE");
+    cmd.add(referenceFileArg);
 
     cmd.setShortUsageArgLimit();
     cmd.addOverrideConfigArg();
@@ -46,7 +46,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
     modelFile = cmd.getModelFile();
     boardSize = boardSizeArg.getValue();
     quickTest = quickArg.getValue();
-    baseFileName = baseFileArg.getValue();
+    referenceFileName = referenceFileArg.getValue();
     cmd.getConfig(cfg);
 
     if(boardSize != 19 && boardSize != 13 && boardSize != 9)
@@ -78,12 +78,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
     logger.write("For batch test, using default batch size 16");
   }
   const int maxConcurrentEvals = maxBatchSize * 2 + 16;
-  int expectedConcurrentEvals = maxBatchSize * 2 + 16;
-
-#ifdef USE_EIGEN_BACKEND
-  if(expectedConcurrentEvals > 2)
-    expectedConcurrentEvals = 2;
-#endif
+  const int expectedConcurrentEvals = maxBatchSize;
 
   const bool defaultRequireExactNNLen = false;
 
@@ -116,7 +111,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
   const int maxBatchSizeCap = -1;
   const bool verbose = true;
   bool fp32BatchSuccessBuf = true;
-  bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSize,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf, baseFileName);
+  bool success = Tests::runBackendErrorTest(nnEval,nnEval32,logger,boardSize,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf,referenceFileName);
   (void)success;
   // cout << success << endl;