Skip to content

Commit

Permalink
Merge pull request #222 from usnistgov/FEATURE/RestartFullIIDTests
Browse files Browse the repository at this point in the history
Feature/restart full iid tests
  • Loading branch information
celic authored Jun 15, 2023
2 parents dca3f79 + 4bbb789 commit 1d8b0af
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 33 deletions.
54 changes: 27 additions & 27 deletions cpp/non_iid/lz78y_test.h
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
#pragma once
#include "../shared/utils.h"

#define B 16
#define B_len 16
#define MAX_DICTIONARY_SIZE 65536

static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int verbose, const char *label)
{
long *binaryDict[B];
long *binaryDict[B_len];
long curRunOfCorrects=0;
long maxRunOfCorrects=0;
long correctCount=0;
long i, j;
uint32_t curPattern=0;
long dictElems=0;

assert(L>B);
assert(L-B > 2);
assert(B < 32); //B < 32 to make the bit shifts well defined
assert(L>B_len);
assert(L-B_len > 2);
assert(B_len < 32); //B < 32 to make the bit shifts well defined

//Initialize the data structure tables
for(j=0; j< B; j++) {
for(j=0; j< B_len; j++) {
//For a length m prefix, we need 2^m sets of length 2 arrays.
//Here, j+1 is the length of the prefix, so we need 2^(j+1) prefixes, or 2*2^(j+1) = 2^(j+2) storage total.
//Note: 2^(j+2) = 1<<(j+2).
Expand All @@ -29,28 +29,28 @@ static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int
}

// initialize B counts with {(S[15]), S[16]}, {(S[14], S[15]), S[16]}, ..., {(S[0], S[1], ..., S[15]), S[16]}
for(j=0; j<B; j++) {
curPattern = curPattern | (((uint32_t)(S[B - j - 1]&1)) << j);
for(j=0; j<B_len; j++) {
curPattern = curPattern | (((uint32_t)(S[B_len - j - 1]&1)) << j);

//This is necessarily the first symbol of this length
(BINARYDICTLOC(j+1, curPattern))[S[B]&0x1] = 1;
(BINARYDICTLOC(j+1, curPattern))[S[B_len]&0x1] = 1;
dictElems++;
}

//In C, arrays are 0 indexed.
//i is the index of the bit to be predicted.
for(i=B+1; i<L; i++) {
for(i=B_len+1; i<L; i++) {
bool found_x;
bool havePrediction = false;
uint8_t roundPrediction=2;
uint8_t curPrediction=2;
long maxCount = 0;

//Put the B bits immediately preceding index i into curPattern
curPattern = compressedBitSymbols(S+i-B, B);
curPattern = compressedBitSymbols(S+i-B_len, B_len);

//j is the length of the prefix to be used
for(j=B; j>0; j--) {
for(j=B_len; j>0; j--) {
long curCount;
long *binaryDictEntry;

Expand Down Expand Up @@ -104,58 +104,58 @@ static double binaryLZ78YPredictionEstimate(const uint8_t *S, long L, const int
}
}

for(j=0; j<B; j++) {
for(j=0; j<B_len; j++) {
delete[](binaryDict[j]);
binaryDict[j] = NULL;
}

return(predictionEstimate(correctCount, L-B-1, maxRunOfCorrects, 2, "LZ78Y", verbose, label));
return(predictionEstimate(correctCount, L-B_len-1, maxRunOfCorrects, 2, "LZ78Y", verbose, label));
}

// Section 6.3.10 - LZ78Y Prediction Estimate
double LZ78Y_test(uint8_t *data, long len, int alph_size, const int verbose, const char *label) {
int dict_size;
long i, j, N, C, run_len, max_run_len;
array<uint8_t, B> x;
array<uint8_t, B_len> x;

if(alph_size==2) return binaryLZ78YPredictionEstimate(data, len, verbose, label);

array<map<array<uint8_t, B>, PostfixDictionary>, B> D;
array<map<array<uint8_t, B_len>, PostfixDictionary>, B_len> D;

if(len < B+2){
printf("\t*** Warning: not enough samples to run LZ78Y test (need more than %d) ***\n", B+2);
if(len < B_len+2){
printf("\t*** Warning: not enough samples to run LZ78Y test (need more than %d) ***\n", B_len+2);
return -1.0;
}

N = len-B-1;
N = len-B_len-1;
C = 0;
run_len = 0;
max_run_len = 0;

// initialize dictionary counts
dict_size = 0;
memset(x.data(), 0, B);
memset(x.data(), 0, B_len);
// initialize LZ78Y counts with {(S[15]), S[16]}, {(S[14], S[15]), S[16]}, ..., {(S[0], S[1], ..., S[15]), S[16]}
for(j = 1; j <= B; j++){
memcpy(x.data(), data+B-j, j);
D[j-1][x].incrementPostfix(data[B], true);
for(j = 1; j <= B_len; j++){
memcpy(x.data(), data+B_len-j, j);
D[j-1][x].incrementPostfix(data[B_len], true);
dict_size++;
}

// perform predictions
for(i = B+1; i < len; i++) {
for(i = B_len+1; i < len; i++) {
bool found_x;
bool have_prediction = false;
uint8_t prediction = 0;
long max_count = 0;

for(j = B; j > 0; j--) {
map<array<uint8_t, B>, PostfixDictionary>::iterator curp;
for(j = B_len; j > 0; j--) {
map<array<uint8_t, B_len>, PostfixDictionary>::iterator curp;

// check if x has been previously seen.
//For the prediction, roundPrediction is the max across all pairs
//The prefix string should contain the j-tuple (S[i-j] ... S[i-1])
memset(x.data(), 0, B);
memset(x.data(), 0, B_len);
memcpy(x.data(), data+i-j, j);
curp = D[j-1].find(x);

Expand Down
106 changes: 101 additions & 5 deletions cpp/restart_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include "non_iid/multi_mcw_test.h"
#include "non_iid/compression_test.h"
#include "non_iid/markov_test.h"
#include "iid/chi_square_tests.h"
#include "iid/permutation_tests.h"

#include <cstdint>
#include <getopt.h>
Expand Down Expand Up @@ -146,6 +148,7 @@ int main(int argc, char* argv[]) {
long int X_cutoff;
long i, j, X_i, X_r, X_c, X_max;
double H_I, H_r, H_c, alpha, ret_min_entropy;
double rawmean, median;
uint8_t *rdata, *cdata;
data_t data;
int opt;
Expand Down Expand Up @@ -459,6 +462,16 @@ int main(int argc, char* argv[]) {
exit(-1);
} else if (verbose > 1) printf("\nRestart Sanity Check Passed...\n");


// Calculate baseline statistics
int alphabet_size = data.alph_size;
int sample_size = data.len;

if ((verbose == 1) || (verbose == 2))
printf("Calculating baseline statistics...\n");

calc_stats(&data, rawmean, median);

// The maximum min-entropy is -log2(1/2^word_size) = word_size
H_c = data.word_size;
H_r = data.word_size;
Expand Down Expand Up @@ -495,6 +508,13 @@ int main(int argc, char* argv[]) {
testRunNonIid.testCases.push_back(tc631nonIid);
testRunIid.testCases.push_back(tc631Iid);

IidTestCase tcOverallIid;
tcOverallIid.h_r = H_r;
tcOverallIid.h_c = H_c;
tcOverallIid.h_i = H_I;
tcOverallIid.testCaseNumber = "Overall";


if (!iid) {

if (data.alph_size == 2) {
Expand Down Expand Up @@ -683,6 +703,86 @@ int main(int argc, char* argv[]) {
}
testRunNonIid.testCases.push_back(tc6310);

} else { /* IID tests */

// Compute chi square stats
bool chi_square_test_pass_row = chi_square_tests(rdata, sample_size, alphabet_size, verbose);
bool chi_square_test_pass_col = chi_square_tests(cdata, sample_size, alphabet_size, verbose);
bool chi_square_test_pass = chi_square_test_pass_row && chi_square_test_pass_col;

tcOverallIid.passed_chi_square_tests = chi_square_test_pass;

if ((verbose == 1) || (verbose == 2)) {
if (chi_square_test_pass) {
printf("** Passed chi square tests\n\n");
}
else {
printf("** Failed chi square tests\n\n");
}
}
else if (verbose > 2) {
if (chi_square_test_pass) {
printf("Chi square tests: Passed\n");
}
else {
printf("Chi square tests: Failed\n");
}
}

// Compute length of the longest repeated substring stats
bool len_LRS_test_pass_row = len_LRS_test(rdata, sample_size, alphabet_size, verbose, "Literal");
bool len_LRS_test_pass_col = len_LRS_test(cdata, sample_size, alphabet_size, verbose, "Literal");
bool len_LRS_test_pass = len_LRS_test_pass_row && len_LRS_test_pass_col;

tcOverallIid.passed_longest_repeated_substring_test = len_LRS_test_pass;

if ((verbose == 1) || (verbose == 2)) {
if (len_LRS_test_pass) {
printf("** Passed length of longest repeated substring test\n\n");
}
else {
printf("** Failed length of longest repeated substring test\n\n");
}
}
else if (verbose > 2) {
if (len_LRS_test_pass) {
printf("Length of longest repeated substring test: Passed\n");
}
else {
printf("Length of longest repeated substring test: Failed\n");
}
}

// Compute permutation stats
bool perm_test_pass_row = permutation_tests(&data, rawmean, median, verbose, tcOverallIid);

data_t data_col;
memcpy(&data_col, &data, sizeof(data));
data_col.symbols = rdata;

bool perm_test_pass_col = permutation_tests(&data_col, rawmean, median, verbose, tcOverallIid);
bool perm_test_pass = perm_test_pass_row && perm_test_pass_col;

tcOverallIid.passed_iid_permutation_tests = perm_test_pass;

if ((verbose == 1) || (verbose == 2)) {
if (perm_test_pass) {
printf("** Passed IID permutation tests\n\n");
}
else {
printf("** Failed IID permutation tests\n\n");
}
}
else if (verbose > 2) {
if (perm_test_pass) {
printf("IID permutation tests: Passed\n");
}
else {
printf("IID permutation tests: Failed\n");
}
}


}

if (verbose > 0) {
Expand Down Expand Up @@ -715,11 +815,7 @@ int main(int argc, char* argv[]) {
exit(-1);
}

IidTestCase tcOverallIid;
tcOverallIid.h_r = H_r;
tcOverallIid.h_c = H_c;
tcOverallIid.h_i = H_I;
tcOverallIid.testCaseNumber = "Overall";

testRunIid.testCases.push_back(tcOverallIid);
testRunIid.errorLevel = 0;

Expand Down
2 changes: 1 addition & 1 deletion cpp/shared/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#define ZALPHA 2.5758293035489008

//Version of the tool
#define VERSION "GitHub Commit 0a2372810ad535158795177381f61ea4821bf2ce"
#define VERSION "1.1.6"

typedef struct data_t data_t;

Expand Down

0 comments on commit 1d8b0af

Please sign in to comment.