Skip to content

Commit

Permalink
Cleanup testgpuerror implementation and coverage of most other nn outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
lightvector committed Dec 17, 2023
1 parent 9d2043e commit 53d2b33
Show file tree
Hide file tree
Showing 5 changed files with 326 additions and 190 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTORS
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Sebastian H ("nerai") - Minor code cleanup
Jochen Voss ("seehuhn") - Typo fix in doc
"kinfkong" - Added trt build configuration option
"TTXS123OK" - Minor code style improvement.
Chin-Chang Yang - For a very useful GPU backend error testing command.

Separately from the authors of the content in this repo, additional special thanks to:
Junyan Xu ("alreadydone") - for much testing and troubleshooting for Windows support
Expand Down
4 changes: 2 additions & 2 deletions cpp/command/contribute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -918,8 +918,8 @@ int MainCmds::contribute(const vector<string>& args) {
// Cap test to avoid spawning too many threads when many selfplay games are running
const int maxBatchSizeCap = std::min(4, 1 + nnEval->getMaxBatchSize()/2);
bool fp32BatchSuccessBuf = true;
string baseFileName = "";
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSizeTest,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf, baseFileName);
string referenceFileName = "";
bool success = Tests::runBackendErrorTest(nnEval,nnEval32,logger,boardSizeTest,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf,referenceFileName);
if(!fp32BatchSuccessBuf) {
logger.write("Error: large GPU numerical errors, unable to continue");
shouldStop.store(true);
Expand Down
17 changes: 6 additions & 11 deletions cpp/command/gputest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
string modelFile;
int boardSize;
bool quickTest;
string baseFileName;
string referenceFileName;
try {
KataGoCommandLine cmd("Test GPU error between FP16 and FP32 with and without batching");
cmd.addConfigFileArg(KataGoCommandLine::defaultGtpConfigFileName(),"gtp_example.cfg");
Expand All @@ -35,8 +35,8 @@ int MainCmds::testgpuerror(const vector<string>& args) {
TCLAP::SwitchArg quickArg("","quick","Faster shorter test");
cmd.add(boardSizeArg);
cmd.add(quickArg);
TCLAP::ValueArg<string> baseFileArg("", "basefile", "Base file to be generated by Eigen backend; loaded by other backends for cross-backend check", false, "", "FILE");
cmd.add(baseFileArg);
TCLAP::ValueArg<string> referenceFileArg("", "reference-file", "Reference file to be generated by Eigen backend; loaded by other backends for cross-backend check, if not specified then will use the backend's own FP32 as reference", false, "", "FILE");
cmd.add(referenceFileArg);

cmd.setShortUsageArgLimit();
cmd.addOverrideConfigArg();
Expand All @@ -46,7 +46,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
modelFile = cmd.getModelFile();
boardSize = boardSizeArg.getValue();
quickTest = quickArg.getValue();
baseFileName = baseFileArg.getValue();
referenceFileName = referenceFileArg.getValue();
cmd.getConfig(cfg);

if(boardSize != 19 && boardSize != 13 && boardSize != 9)
Expand Down Expand Up @@ -78,12 +78,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
logger.write("For batch test, using default batch size 16");
}
const int maxConcurrentEvals = maxBatchSize * 2 + 16;
int expectedConcurrentEvals = maxBatchSize * 2 + 16;

#ifdef USE_EIGEN_BACKEND
if(expectedConcurrentEvals > 2)
expectedConcurrentEvals = 2;
#endif
const int expectedConcurrentEvals = maxBatchSize;

const bool defaultRequireExactNNLen = false;

Expand Down Expand Up @@ -116,7 +111,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
const int maxBatchSizeCap = -1;
const bool verbose = true;
bool fp32BatchSuccessBuf = true;
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSize,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf, baseFileName);
bool success = Tests::runBackendErrorTest(nnEval,nnEval32,logger,boardSize,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf,referenceFileName);
(void)success;
// cout << success << endl;

Expand Down
Loading

0 comments on commit 53d2b33

Please sign in to comment.