diff --git a/cpp/non_iid/lz78y_test.h b/cpp/non_iid/lz78y_test.h index e875977..86eee2e 100644 --- a/cpp/non_iid/lz78y_test.h +++ b/cpp/non_iid/lz78y_test.h @@ -1,12 +1,12 @@ #pragma once #include "../shared/utils.h" -#define B 16 +#define B_len 16 #define MAX_DICTIONARY_SIZE 65536 static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int verbose, const char *label) { - long *binaryDict[B]; + long *binaryDict[B_len]; long curRunOfCorrects=0; long maxRunOfCorrects=0; long correctCount=0; @@ -14,12 +14,12 @@ static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int uint32_t curPattern=0; long dictElems=0; - assert(L>B); - assert(L-B > 2); - assert(B < 32); //B < 32 to make the bit shifts well defined + assert(L>B_len); + assert(L-B_len > 2); + assert(B_len < 32); //B < 32 to make the bit shifts well defined //Initialize the data structure tables - for(j=0; j< B; j++) { + for(j=0; j< B_len; j++) { //For a length m prefix, we need 2^m sets of length 2 arrays. //Here, j+1 is the length of the prefix, so we need 2^(j+1) prefixes, or 2*2^(j+1) = 2^(j+2) storage total. //Note: 2^(j+2) = 1<<(j+2). @@ -29,17 +29,17 @@ static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int } // initialize B counts with {(S[15]), S[16]}, {(S[14], S[15]), S[16]}, ..., {(S[0]), S[1], ..., S[15]), S[16]}, - for(j=0; j0; j--) { + for(j=B_len; j>0; j--) { long curCount; long *binaryDictEntry; @@ -104,58 +104,58 @@ static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int } } - for(j=0; j x; + array x; if(alph_size==2) return binaryLZ78YPredictionEstimate(data, len, verbose, label); - array, PostfixDictionary>, B> D; + array, PostfixDictionary>, B_len> D; - if(len < B+2){ - printf("\t*** Warning: not enough samples to run LZ78Y test (need more than %d) ***\n", B+2); + if(len < B_len+2){ + printf("\t*** Warning: not enough samples to run LZ78Y test (need more than %d) ***\n", B_len+2); return -1.0; } - N = len-B-1; + N = len-B_len-1; C = 0; run_len = 0; max_run_len = 0; // initialize dictionary counts dict_size = 0; - memset(x.data(), 0, B); + memset(x.data(), 0, B_len); // initialize LZ78Y counts with {(S[15]), S[16]}, {(S[14], S[15]), S[16]}, ..., {(S[0]), S[1], ..., S[15]), S[16]} - for(j = 1; j <= B; j++){ - memcpy(x.data(), data+B-j, j); - D[j-1][x].incrementPostfix(data[B], true); + for(j = 1; j <= B_len; j++){ + memcpy(x.data(), data+B_len-j, j); + D[j-1][x].incrementPostfix(data[B_len], true); dict_size++; } // perform predictions - for(i = B+1; i < len; i++) { + for(i = B_len+1; i < len; i++) { bool found_x; bool have_prediction = false; uint8_t prediction = 0; long max_count = 0; - for(j = B; j > 0; j--) { - map, PostfixDictionary>::iterator curp; + for(j = B_len; j > 0; j--) { + map, PostfixDictionary>::iterator curp; // check if x has been previously seen. //For the prediction, roundPrediction is the max across all pairs //The prefix string should contain the j-tuple (S[i-j] ... S[i-1]) - memset(x.data(), 0, B); + memset(x.data(), 0, B_len); memcpy(x.data(), data+i-j, j); curp = D[j-1].find(x); diff --git a/cpp/restart_main.cpp b/cpp/restart_main.cpp index e3a8b56..d7e0e54 100644 --- a/cpp/restart_main.cpp +++ b/cpp/restart_main.cpp @@ -13,6 +13,8 @@ #include "non_iid/multi_mcw_test.h" #include "non_iid/compression_test.h" #include "non_iid/markov_test.h" +#include "iid/chi_square_tests.h" +#include "iid/permutation_tests.h" #include #include @@ -146,6 +148,7 @@ int main(int argc, char* argv[]) { long int X_cutoff; long i, j, X_i, X_r, X_c, X_max; double H_I, H_r, H_c, alpha, ret_min_entropy; + double rawmean, median; uint8_t *rdata, *cdata; data_t data; int opt; @@ -459,6 +462,16 @@ int main(int argc, char* argv[]) { exit(-1); } else if (verbose > 1) printf("\nRestart Sanity Check Passed...\n"); + + // Calculate baseline statistics + int alphabet_size = data.alph_size; + int sample_size = data.len; + + if ((verbose == 1) || (verbose == 2)) + printf("Calculating baseline statistics...\n"); + + calc_stats(&data, rawmean, median); + // The maximum min-entropy is -log2(1/2^word_size) = word_size H_c = data.word_size; H_r = data.word_size; @@ -495,6 +508,13 @@ int main(int argc, char* argv[]) { testRunNonIid.testCases.push_back(tc631nonIid); testRunIid.testCases.push_back(tc631Iid); + IidTestCase tcOverallIid; + tcOverallIid.h_r = H_r; + tcOverallIid.h_c = H_c; + tcOverallIid.h_i = H_I; + tcOverallIid.testCaseNumber = "Overall"; + + if (!iid) { if (data.alph_size == 2) { @@ -683,6 +703,86 @@ int main(int argc, char* argv[]) { } testRunNonIid.testCases.push_back(tc6310); + } else { /* IID tests */ + + // Compute chi square stats + bool chi_square_test_pass_row = chi_square_tests(rdata, sample_size, alphabet_size, verbose); + bool chi_square_test_pass_col = chi_square_tests(cdata, sample_size, alphabet_size, verbose); + bool chi_square_test_pass = chi_square_test_pass_row && chi_square_test_pass_col; + + tcOverallIid.passed_chi_square_tests = chi_square_test_pass; + + if ((verbose == 1) || (verbose == 2)) { + if (chi_square_test_pass) { + printf("** Passed chi square tests\n\n"); + } + else { + printf("** Failed chi square tests\n\n"); + } + } + else if (verbose > 2) { + if (chi_square_test_pass) { + printf("Chi square tests: Passed\n"); + } + else { + printf("Chi square tests: Failed\n"); + } + } + + // Compute length of the longest repeated substring stats + bool len_LRS_test_pass_row = len_LRS_test(rdata, sample_size, alphabet_size, verbose, "Literal"); + bool len_LRS_test_pass_col = len_LRS_test(cdata, sample_size, alphabet_size, verbose, "Literal"); + bool len_LRS_test_pass = len_LRS_test_pass_row && len_LRS_test_pass_col; + + tcOverallIid.passed_longest_repeated_substring_test = len_LRS_test_pass; + + if ((verbose == 1) || (verbose == 2)) { + if (len_LRS_test_pass) { + printf("** Passed length of longest repeated substring test\n\n"); + } + else { + printf("** Failed length of longest repeated substring test\n\n"); + } + } + else if (verbose > 2) { + if (len_LRS_test_pass) { + printf("Length of longest repeated substring test: Passed\n"); + } + else { + printf("Length of longest repeated substring test: Failed\n"); + } + } + + // Compute permutation stats + bool perm_test_pass_row = permutation_tests(&data, rawmean, median, verbose, tcOverallIid); + + data_t data_col; + memcpy(&data_col, &data, sizeof(data)); + data_col.symbols = rdata; + + bool perm_test_pass_col = permutation_tests(&data_col, rawmean, median, verbose, tcOverallIid); + bool perm_test_pass = perm_test_pass_row && perm_test_pass_col; + + tcOverallIid.passed_iid_permutation_tests = perm_test_pass; + + if ((verbose == 1) || (verbose == 2)) { + if (perm_test_pass) { + printf("** Passed IID permutation tests\n\n"); + } + else { + printf("** Failed IID permutation tests\n\n"); + } + } + else if (verbose > 2) { + if (perm_test_pass) { + printf("IID permutation tests: Passed\n"); + } + else { + printf("IID permutation tests: Failed\n"); + } + } + + } if (verbose > 0) { @@ -715,11 +815,7 @@ int main(int argc, char* argv[]) { exit(-1); } - IidTestCase tcOverallIid; - tcOverallIid.h_r = H_r; - tcOverallIid.h_c = H_c; - tcOverallIid.h_i = H_I; - tcOverallIid.testCaseNumber = "Overall"; + testRunIid.testCases.push_back(tcOverallIid); testRunIid.errorLevel = 0; diff --git a/cpp/shared/utils.h b/cpp/shared/utils.h index cdceefb..1f1f6f3 100644 --- a/cpp/shared/utils.h +++ b/cpp/shared/utils.h @@ -36,7 +36,7 @@ #define ZALPHA 2.5758293035489008 //Version of the tool -#define VERSION "GitHub Commit 0a2372810ad535158795177381f61ea4821bf2ce" +#define VERSION "1.1.6" typedef struct data_t data_t;