Merge pull request #222 from usnistgov/FEATURE/RestartFullIIDTests

Feature/restart full iid tests
usnistgov · Jun 15, 2023 · 1d8b0af · 1d8b0af
2 parents dca3f79 + 4bbb789
commit 1d8b0af
Show file tree

Hide file tree

Showing 3 changed files with 129 additions and 33 deletions.
diff --git a/cpp/non_iid/lz78y_test.h b/cpp/non_iid/lz78y_test.h
@@ -1,25 +1,25 @@
 #pragma once
 #include "../shared/utils.h"
 
-#define B 16
+#define B_len 16
 #define MAX_DICTIONARY_SIZE 65536
 
 static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int verbose, const char *label)
 {
-   long *binaryDict[B];
+   long *binaryDict[B_len];
    long curRunOfCorrects=0;
    long maxRunOfCorrects=0;
    long correctCount=0;
    long i, j;
    uint32_t curPattern=0;
    long dictElems=0;
 
-   assert(L>B);
-   assert(L-B > 2);
-   assert(B < 32); //B < 32 to make the bit shifts well defined
+   assert(L>B_len);
+   assert(L-B_len > 2);
+   assert(B_len < 32); //B_len < 32 to make the bit shifts well defined
 
    //Initialize the data structure tables
-   for(j=0; j< B; j++) {
+   for(j=0; j< B_len; j++) {
       //For a length m prefix, we need 2^m sets of length 2 arrays.
       //Here, j+1 is the length of the prefix, so we need 2^(j+1) prefixes, or 2*2^(j+1) = 2^(j+2) storage total.
       //Note: 2^(j+2) = 1<<(j+2).
@@ -29,28 +29,28 @@ static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int
    }
 
    // initialize B counts with {(S[15]), S[16]}, {(S[14], S[15]), S[16]}, ..., {(S[0], S[1], ..., S[15]), S[16]}
-   for(j=0; j<B; j++) {
-      curPattern = curPattern | (((uint32_t)(S[B - j - 1]&1)) << j);
+   for(j=0; j<B_len; j++) {
+      curPattern = curPattern | (((uint32_t)(S[B_len - j - 1]&1)) << j);
 
       //This is necessarily the first symbol of this length
-      (BINARYDICTLOC(j+1, curPattern))[S[B]&0x1] = 1;
+      (BINARYDICTLOC(j+1, curPattern))[S[B_len]&0x1] = 1;
       dictElems++;
    }
 
    //In C, arrays are 0 indexed.
    //i is the index of the bit to be predicted.
-   for(i=B+1; i<L; i++) {
+   for(i=B_len+1; i<L; i++) {
       bool found_x;
       bool havePrediction = false;
       uint8_t roundPrediction=2;
       uint8_t curPrediction=2;
       long maxCount = 0;
 
      //Put the first B bits into curPattern
-      curPattern = compressedBitSymbols(S+i-B, B);
+      curPattern = compressedBitSymbols(S+i-B_len, B_len);
 
       //j is the length of the prefix to be used
-      for(j=B; j>0; j--) {
+      for(j=B_len; j>0; j--) {
          long curCount;
          long *binaryDictEntry;
 
@@ -104,58 +104,58 @@ static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int
       }
    }
 
-   for(j=0; j<B; j++) {
+   for(j=0; j<B_len; j++) {
       delete[](binaryDict[j]);
       binaryDict[j] = NULL;
    }
 
-   return(predictionEstimate(correctCount, L-B-1, maxRunOfCorrects, 2, "LZ78Y", verbose, label));
+   return(predictionEstimate(correctCount, L-B_len-1, maxRunOfCorrects, 2, "LZ78Y", verbose, label));
 }
 
 // Section 6.3.10 - LZ78Y Prediction Estimate
 double LZ78Y_test(uint8_t *data, long len, int alph_size, const int verbose, const char *label) {
 	int dict_size;
 	long i, j, N, C, run_len, max_run_len;
-	array<uint8_t, B> x;
+	array<uint8_t, B_len> x;
 
 	if(alph_size==2) return binaryLZ78YPredictionEstimate(data, len, verbose, label);
 
-	array<map<array<uint8_t, B>, PostfixDictionary>, B> D;
+	array<map<array<uint8_t, B_len>, PostfixDictionary>, B_len> D;
 
-	if(len < B+2){	
-		printf("\t*** Warning: not enough samples to run LZ78Y test (need more than %d) ***\n", B+2);
+	if(len < B_len+2){	
+		printf("\t*** Warning: not enough samples to run LZ78Y test (need more than %d) ***\n", B_len+2);
 		return -1.0;
 	}
 
-	N = len-B-1;
+	N = len-B_len-1;
 	C = 0;
 	run_len = 0;
 	max_run_len = 0;
 
 	// initialize dictionary counts
 	dict_size = 0;
-	memset(x.data(), 0, B);
+	memset(x.data(), 0, B_len);
 	// initialize LZ78Y counts with {(S[15]), S[16]}, {(S[14], S[15]), S[16]}, ..., {(S[0], S[1], ..., S[15]), S[16]}
-	for(j = 1; j <= B; j++){
-		memcpy(x.data(), data+B-j, j);
-		D[j-1][x].incrementPostfix(data[B], true);
+	for(j = 1; j <= B_len; j++){
+		memcpy(x.data(), data+B_len-j, j);
+		D[j-1][x].incrementPostfix(data[B_len], true);
 		dict_size++;
 	}
 
 	// perform predictions
-	for(i = B+1; i < len; i++) {
+	for(i = B_len+1; i < len; i++) {
 		bool found_x;
 		bool have_prediction = false;
 		uint8_t prediction = 0;
 		long max_count = 0;
 
-		for(j = B; j > 0; j--) {
-			map<array<uint8_t, B>, PostfixDictionary>::iterator curp;
+		for(j = B_len; j > 0; j--) {
+			map<array<uint8_t, B_len>, PostfixDictionary>::iterator curp;
 
 			// check if x has been previously seen. 
 			//For the prediction, roundPrediction is the max across all pairs
 			//The prefix string should contain the j-tuple (S[i-j] ... S[i-1])
-			memset(x.data(), 0, B);
+			memset(x.data(), 0, B_len);
 			memcpy(x.data(), data+i-j, j);
 			curp = D[j-1].find(x);
 

diff --git a/cpp/restart_main.cpp b/cpp/restart_main.cpp
@@ -13,6 +13,8 @@
 #include "non_iid/multi_mcw_test.h"
 #include "non_iid/compression_test.h"
 #include "non_iid/markov_test.h"
+#include "iid/chi_square_tests.h"
+#include "iid/permutation_tests.h"
 
 #include <cstdint>
 #include <getopt.h>
@@ -146,6 +148,7 @@ int main(int argc, char* argv[]) {
     long int X_cutoff;
     long i, j, X_i, X_r, X_c, X_max;
     double H_I, H_r, H_c, alpha, ret_min_entropy;
+	double rawmean, median;
     uint8_t *rdata, *cdata;
     data_t data;
     int opt;
@@ -459,6 +462,16 @@ int main(int argc, char* argv[]) {
         exit(-1);
     } else if (verbose > 1) printf("\nRestart Sanity Check Passed...\n");
 
+
+    // Calculate baseline statistics
+    int alphabet_size = data.alph_size;
+    int sample_size = data.len;
+
+    if ((verbose == 1) || (verbose == 2))
+        printf("Calculating baseline statistics...\n");
+
+    calc_stats(&data, rawmean, median);
+
     // The maximum min-entropy is -log2(1/2^word_size) = word_size
     H_c = data.word_size;
     H_r = data.word_size;
@@ -495,6 +508,13 @@ int main(int argc, char* argv[]) {
     testRunNonIid.testCases.push_back(tc631nonIid);
     testRunIid.testCases.push_back(tc631Iid);
 
+    IidTestCase tcOverallIid;
+    tcOverallIid.h_r = H_r;
+    tcOverallIid.h_c = H_c;
+    tcOverallIid.h_i = H_I;
+    tcOverallIid.testCaseNumber = "Overall";
+
+
     if (!iid) {
 
         if (data.alph_size == 2) {
@@ -683,6 +703,86 @@ int main(int argc, char* argv[]) {
         }
         testRunNonIid.testCases.push_back(tc6310);
 
+    } else { /* IID tests */
+
+        // Compute chi square stats
+        bool chi_square_test_pass_row = chi_square_tests(rdata, sample_size, alphabet_size, verbose);
+        bool chi_square_test_pass_col = chi_square_tests(cdata, sample_size, alphabet_size, verbose);
+        bool chi_square_test_pass = chi_square_test_pass_row && chi_square_test_pass_col;
+
+        tcOverallIid.passed_chi_square_tests = chi_square_test_pass;
+
+        if ((verbose == 1) || (verbose == 2)) {
+            if (chi_square_test_pass) {
+                printf("** Passed chi square tests\n\n");
+            }
+            else {
+                printf("** Failed chi square tests\n\n");
+            }
+        }
+        else if (verbose > 2) {
+            if (chi_square_test_pass) {
+                printf("Chi square tests: Passed\n");
+            }
+            else {
+                printf("Chi square tests: Failed\n");
+            }
+        }
+
+        // Compute length of the longest repeated substring stats
+        bool len_LRS_test_pass_row = len_LRS_test(rdata, sample_size, alphabet_size, verbose, "Literal");
+        bool len_LRS_test_pass_col = len_LRS_test(cdata, sample_size, alphabet_size, verbose, "Literal");
+        bool len_LRS_test_pass = len_LRS_test_pass_row && len_LRS_test_pass_col;
+
+        tcOverallIid.passed_longest_repeated_substring_test = len_LRS_test_pass;
+
+        if ((verbose == 1) || (verbose == 2)) {
+            if (len_LRS_test_pass) {
+                printf("** Passed length of longest repeated substring test\n\n");
+            }
+            else {
+                printf("** Failed length of longest repeated substring test\n\n");
+            }
+        }
+        else if (verbose > 2) {
+            if (len_LRS_test_pass) {
+                printf("Length of longest repeated substring test: Passed\n");
+            }
+            else {
+                printf("Length of longest repeated substring test: Failed\n");
+            }
+        }
+
+        // Compute permutation stats
+        bool perm_test_pass_row = permutation_tests(&data, rawmean, median, verbose, tcOverallIid);
+
+        data_t data_col;
+        memcpy(&data_col, &data, sizeof(data));
+        data_col.symbols = rdata;
+
+        bool perm_test_pass_col = permutation_tests(&data_col, rawmean, median, verbose, tcOverallIid);
+        bool perm_test_pass = perm_test_pass_row && perm_test_pass_col;
+
+        tcOverallIid.passed_iid_permutation_tests = perm_test_pass;
+
+        if ((verbose == 1) || (verbose == 2)) {
+            if (perm_test_pass) {
+                printf("** Passed IID permutation tests\n\n");
+            }
+            else {
+                printf("** Failed IID permutation tests\n\n");
+            }
+        }
+        else if (verbose > 2) {
+            if (perm_test_pass) {
+                printf("IID permutation tests: Passed\n");
+            }
+            else {
+                printf("IID permutation tests: Failed\n");
+            }
+        }
+
+
     }
 
     if (verbose > 0) {
@@ -715,11 +815,7 @@ int main(int argc, char* argv[]) {
         exit(-1);
     }
 
-    IidTestCase tcOverallIid;
-    tcOverallIid.h_r = H_r;
-    tcOverallIid.h_c = H_c;
-    tcOverallIid.h_i = H_I;
-    tcOverallIid.testCaseNumber = "Overall";
+
     testRunIid.testCases.push_back(tcOverallIid);
     testRunIid.errorLevel = 0;
 

diff --git a/cpp/shared/utils.h b/cpp/shared/utils.h
@@ -36,7 +36,7 @@
 #define ZALPHA 2.5758293035489008
 
 //Version of the tool
-#define VERSION "GitHub Commit 0a2372810ad535158795177381f61ea4821bf2ce"
+#define VERSION "1.1.6"
 
 typedef struct data_t data_t;