Introducing Cross-Backend GPU Error Computation via NN Output Saving and Loading (#858)

* Save and load NN outputs for cross-backend test

To improve testing and cross-backend checks, this change introduces the
ability to save and load base files for the GPU error test. A new
`baseFileName` argument is added to the `runFP16Test` function in
`testnnevalcanary.cpp`. When provided, this argument lets the test load a
previously saved base file instead of recomputing the base positions from
scratch, so a backend can be checked against a baseline NN output file
produced by another backend.

The commit introduces two new functions in `testnnevalcanary.cpp`:
`saveBaseToFile` and `loadBaseFromFile`. The former writes a vector of
NNOutput to a binary file specified by `baseFileName`, while the latter
reads such a file back and populates a vector of NNOutput from it.

Allowing the test to load base files makes it easier to compare and
validate results between different backends, which improves the
reliability of the GPU error testing process.

* Return 1 if testgpuerror is not successful

* Set expected concurrent evals to 2 for Eigen

This commit caps the expected concurrent evaluations at 2 for the Eigen
backend, fixing a memory-usage explosion caused by too many concurrent
evaluations.
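
For context, the intended cross-backend workflow looks roughly like the
following. Only the -basefile flag comes from this change; the model and
config arguments are the usual testgpuerror options, and the file names
are illustrative:

  # Eigen (CPU) build: compute the FP32 baseline and save it to a file
  ./katago testgpuerror -model model.bin.gz -config gtp_example.cfg -basefile base.bin

  # GPU build (e.g. CUDA or OpenCL): load the Eigen baseline and compare against it
  ./katago testgpuerror -model model.bin.gz -config gtp_example.cfg -basefile base.bin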
ChinChangYang authored Dec 17, 2023
1 parent 55023a4 commit 9d2043e
Showing 4 changed files with 111 additions and 16 deletions.
cpp/command/contribute.cpp: 2 additions & 1 deletion
@@ -918,7 +918,8 @@ int MainCmds::contribute(const vector<string>& args) {
// Cap test to avoid spawning too many threads when many selfplay games are running
const int maxBatchSizeCap = std::min(4, 1 + nnEval->getMaxBatchSize()/2);
bool fp32BatchSuccessBuf = true;
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSizeTest,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf);
string baseFileName = "";
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSizeTest,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf, baseFileName);
if(!fp32BatchSuccessBuf) {
logger.write("Error: large GPU numerical errors, unable to continue");
shouldStop.store(true);
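Note that contribute.cpp passes an empty baseFileName, so the save/load path stays disabled during distributed selfplay and the FP16 test there behaves exactly as before.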
cpp/command/gputest.cpp: 13 additions & 3 deletions
@@ -26,6 +26,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
string modelFile;
int boardSize;
bool quickTest;
string baseFileName;
try {
KataGoCommandLine cmd("Test GPU error between FP16 and FP32 with and without batching");
cmd.addConfigFileArg(KataGoCommandLine::defaultGtpConfigFileName(),"gtp_example.cfg");
@@ -34,6 +35,8 @@ int MainCmds::testgpuerror(const vector<string>& args) {
TCLAP::SwitchArg quickArg("","quick","Faster shorter test");
cmd.add(boardSizeArg);
cmd.add(quickArg);
TCLAP::ValueArg<string> baseFileArg("", "basefile", "Base file to be generated by Eigen backend; loaded by other backends for cross-backend check", false, "", "FILE");
cmd.add(baseFileArg);

cmd.setShortUsageArgLimit();
cmd.addOverrideConfigArg();
@@ -43,6 +46,7 @@
modelFile = cmd.getModelFile();
boardSize = boardSizeArg.getValue();
quickTest = quickArg.getValue();
baseFileName = baseFileArg.getValue();
cmd.getConfig(cfg);

if(boardSize != 19 && boardSize != 13 && boardSize != 9)
@@ -74,7 +78,13 @@
logger.write("For batch test, using default batch size 16");
}
const int maxConcurrentEvals = maxBatchSize * 2 + 16;
const int expectedConcurrentEvals = maxBatchSize * 2 + 16;
int expectedConcurrentEvals = maxBatchSize * 2 + 16;

#ifdef USE_EIGEN_BACKEND
if(expectedConcurrentEvals > 2)
expectedConcurrentEvals = 2;
#endif

const bool defaultRequireExactNNLen = false;

NNEvaluator* nnEval;
@@ -106,7 +116,7 @@ int MainCmds::testgpuerror(const vector<string>& args) {
const int maxBatchSizeCap = -1;
const bool verbose = true;
bool fp32BatchSuccessBuf = true;
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSize,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf);
bool success = Tests::runFP16Test(nnEval,nnEval32,logger,boardSize,maxBatchSizeCap,verbose,quickTest,fp32BatchSuccessBuf, baseFileName);
(void)success;
// cout << success << endl;

@@ -116,5 +126,5 @@ int MainCmds::testgpuerror(const vector<string>& args) {
NeuralNet::globalCleanup();
ScoreValue::freeTables();

return 0;
return success ? 0 : 1;
}
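
Two behavioral changes land in this file: testgpuerror now exits nonzero when the FP16 test fails, so scripts and CI can detect a failed cross-backend check, and under the Eigen backend the expected concurrent evaluations are capped at 2, which (per the commit message) keeps memory usage from exploding.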
cpp/tests/testnnevalcanary.cpp: 95 additions & 11 deletions
@@ -276,7 +276,79 @@ struct GpuErrorStats {
}
};

bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logger, int boardSize, int maxBatchSizeCap, bool verbose, bool quickTest, bool& fp32BatchSuccessBuf) {
void saveBaseToFile(const std::vector<std::shared_ptr<NNOutput>>& base, const string& baseFileName, Logger& logger, bool verbose) {
assert(baseFileName != "");
std::ofstream outFile(baseFileName, std::ios::binary);

if (!outFile)
throw StringError("Unable to save base to: " + baseFileName);

size_t size = base.size();
outFile.write(reinterpret_cast<const char*>(&size), sizeof(size));

for (const auto& nnOutputPtr : base) {
if (nnOutputPtr) {
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->nnHash), sizeof(nnOutputPtr->nnHash));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteWinProb), sizeof(nnOutputPtr->whiteWinProb));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteLossProb), sizeof(nnOutputPtr->whiteLossProb));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteNoResultProb), sizeof(nnOutputPtr->whiteNoResultProb));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteScoreMean), sizeof(nnOutputPtr->whiteScoreMean));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteScoreMeanSq), sizeof(nnOutputPtr->whiteScoreMeanSq));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->whiteLead), sizeof(nnOutputPtr->whiteLead));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->varTimeLeft), sizeof(nnOutputPtr->varTimeLeft));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->shorttermWinlossError), sizeof(nnOutputPtr->shorttermWinlossError));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->shorttermScoreError), sizeof(nnOutputPtr->shorttermScoreError));
outFile.write(reinterpret_cast<const char*>(nnOutputPtr->policyProbs), sizeof(float) * NNPos::MAX_NN_POLICY_SIZE);
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->nnXLen), sizeof(nnOutputPtr->nnXLen));
outFile.write(reinterpret_cast<const char*>(&nnOutputPtr->nnYLen), sizeof(nnOutputPtr->nnYLen));
}
}

if (verbose)
logger.write("Saved " + Global::uint64ToString((uint64_t)base.size()) + " positions to: " + baseFileName);

outFile.close();
}

void loadBaseFromFile(std::vector<std::shared_ptr<NNOutput>>& base, const string& baseFileName, Logger& logger, bool verbose) {
assert(baseFileName != "");
std::ifstream inFile(baseFileName, std::ios::binary);

if (!inFile)
throw StringError("Unable to load: " + baseFileName);

size_t size;
inFile.read(reinterpret_cast<char*>(&size), sizeof(size));
base.resize(size);

for (size_t i = 0; i < size; ++i) {
base[i] = std::make_shared<NNOutput>();

inFile.read(reinterpret_cast<char*>(&base[i]->nnHash), sizeof(base[i]->nnHash));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteWinProb), sizeof(base[i]->whiteWinProb));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteLossProb), sizeof(base[i]->whiteLossProb));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteNoResultProb), sizeof(base[i]->whiteNoResultProb));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteScoreMean), sizeof(base[i]->whiteScoreMean));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteScoreMeanSq), sizeof(base[i]->whiteScoreMeanSq));
inFile.read(reinterpret_cast<char*>(&base[i]->whiteLead), sizeof(base[i]->whiteLead));
inFile.read(reinterpret_cast<char*>(&base[i]->varTimeLeft), sizeof(base[i]->varTimeLeft));
inFile.read(reinterpret_cast<char*>(&base[i]->shorttermWinlossError), sizeof(base[i]->shorttermWinlossError));
inFile.read(reinterpret_cast<char*>(&base[i]->shorttermScoreError), sizeof(base[i]->shorttermScoreError));
inFile.read(reinterpret_cast<char*>(&base[i]->policyProbs), sizeof(float) * NNPos::MAX_NN_POLICY_SIZE);
inFile.read(reinterpret_cast<char*>(&base[i]->nnXLen), sizeof(base[i]->nnXLen));
inFile.read(reinterpret_cast<char*>(&base[i]->nnYLen), sizeof(base[i]->nnYLen));

base[i]->whiteOwnerMap = nullptr;
base[i]->noisedPolicyProbs = nullptr;
}

if (verbose)
logger.write("Loaded " + Global::uint64ToString((uint64_t)base.size()) + " positions from: " + baseFileName);

inFile.close();
}

bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logger, int boardSize, int maxBatchSizeCap, bool verbose, bool quickTest, bool& fp32BatchSuccessBuf, const string& baseFileName) {

int maxBatchSize = nnEval->getMaxBatchSize();
if(maxBatchSize != nnEval32->getMaxBatchSize())
@@ -287,13 +359,10 @@ bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logg
throw StringError("Invalid max batch size for fp16 test");

#ifdef USE_EIGEN_BACKEND
(void)logger;
(void)boardSize;
(void)verbose;
(void)quickTest;
fp32BatchSuccessBuf = true;
return true;
#else
if (baseFileName == "")
return true;
#endif

Rand filterRand("Tests::runFP16Test filter rand");
auto loadHists = [&](const std::vector<string>& sgfStrs) {
std::vector<BoardHistory> hists;
@@ -346,8 +415,24 @@ bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logg
if(verbose)
logger.write("Running evaluations in fp32");
std::vector<std::shared_ptr<NNOutput>> base;
for(const BoardHistory& hist: hists)
base.push_back(evalBoard(nnEval32,hist));

bool loadedBaseFromFile = false;

#ifndef USE_EIGEN_BACKEND
if (baseFileName != "") {
loadBaseFromFile(base, baseFileName, logger, verbose);
loadedBaseFromFile = true;
}
#endif

if (!loadedBaseFromFile)
for(const BoardHistory& hist: hists)
base.push_back(evalBoard(nnEval32,hist));

#ifdef USE_EIGEN_BACKEND
assert(baseFileName != "");
saveBaseToFile(base, baseFileName, logger, verbose);
#endif

std::vector<std::shared_ptr<NNOutput>> batched(hists.size());
std::vector<std::shared_ptr<NNOutput>> current;
@@ -430,5 +515,4 @@ bool Tests::runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logg

return success;
}
#endif
}
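
The base file written above is a raw, fixed-layout binary dump: a size_t
element count followed by the listed NNOutput fields for each position.
That makes it portable only between builds that agree on endianness and on
the widths of size_t and the serialized fields. As a minimal sketch of how
the format could be sanity-checked, a hypothetical round-trip helper (not
part of this commit; it assumes saveBaseToFile and loadBaseFromFile are
made visible to the caller, e.g. via a shared header) might look like:

  // Hypothetical round-trip check for the base-file format (not part of this commit).
  // Assumes saveBaseToFile/loadBaseFromFile from testnnevalcanary.cpp are declared here.
  static bool roundTripBaseFile(
    const std::vector<std::shared_ptr<NNOutput>>& base,
    const std::string& fileName,
    Logger& logger
  ) {
    saveBaseToFile(base, fileName, logger, /*verbose=*/false);
    std::vector<std::shared_ptr<NNOutput>> reloaded;
    loadBaseFromFile(reloaded, fileName, logger, /*verbose=*/false);
    if(reloaded.size() != base.size())
      return false;
    for(size_t i = 0; i < base.size(); i++) {
      // Fields are written and read back with identical widths, so a round
      // trip on the same machine should reproduce them bit-for-bit.
      if(reloaded[i]->nnHash != base[i]->nnHash)
        return false;
      if(reloaded[i]->whiteWinProb != base[i]->whiteWinProb)
        return false;
      if(reloaded[i]->whiteLead != base[i]->whiteLead)
        return false;
    }
    return true;
  }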
cpp/tests/tests.h: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ namespace Tests {

//testnnevalcanary.cpp
void runCanaryTests(NNEvaluator* nnEval, int symmetry, bool print);
bool runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logger, int boardSize, int maxBatchSizeCap, bool verbose, bool quickTest, bool& fp32BatchSuccessBuf);
bool runFP16Test(NNEvaluator* nnEval, NNEvaluator* nnEval32, Logger& logger, int boardSize, int maxBatchSizeCap, bool verbose, bool quickTest, bool& fp32BatchSuccessBuf, const std::string& baseFileName);

//testconfig.cpp
void runInlineConfigTests();
