Skip to content

Commit

Permalink
Predictable behavior from count-sketch.
Browse files Browse the repository at this point in the history
* Size of sketch and accuracy of estimates are correlated.

* Verified that estimates are not guaranteed to be non-negative and that this is correct behavior.
  • Loading branch information
ujvl committed Jul 17, 2018
1 parent 830afed commit 2996b33
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 84 deletions.
55 changes: 36 additions & 19 deletions libconfluo/confluo/container/sketch/count_sketch.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
#define CONFLUO_CONTAINER_SKETCH_COUNT_SKETCH_H_

#include <array>
#include <algorithm>
#include <vector>

#include "atomic.h"
#include "hash_manager.h"
#include "types/primitive_types.h"
Expand All @@ -26,21 +28,26 @@ class count_sketch {
* Constructor.
* @param num_estimates number of estimates to track per update (depth)
* @param num_buckets number of buckets (width)
* @param manager hash manager
*/
count_sketch(size_t num_estimates, size_t num_buckets, const hash_manager& manager)
count_sketch(size_t num_estimates, size_t num_buckets)
: num_estimates_(num_estimates),
num_buckets_(num_buckets),
counters_(num_estimates_ * num_buckets_),
hash_manager_(manager) {
hash_manager_.guarantee_initialized(2 * num_estimates_);
bucket_hash_manager_(),
sign_hash_manager_() {
for (size_t i = 0; i < counters_.size(); i++) {
atomic::store(&counters_[i], counter_t());
}
bucket_hash_manager_.guarantee_initialized(num_estimates_);
sign_hash_manager_.guarantee_initialized(num_estimates_);
}

count_sketch(const count_sketch& other)
: num_estimates_(other.num_estimates_),
num_buckets_(other.num_buckets_),
counters_(num_estimates_ * num_buckets_),
hash_manager_(other.hash_manager_) {
bucket_hash_manager_(other.bucket_hash_manager_),
sign_hash_manager_(other.sign_hash_manager_) {
for (size_t i = 0; i < counters_.size(); i++) {
atomic::store(&counters_[i], atomic::load(&other.counters_[i]));
}
Expand All @@ -50,7 +57,8 @@ class count_sketch {
num_estimates_ = other.num_estimates_;
num_buckets_ = other.num_buckets_;
counters_ = std::vector<atomic_counter_t>(num_estimates_ * num_buckets_);
hash_manager_ = other.hash_manager_;
bucket_hash_manager_ = other.bucket_hash_manager_;
sign_hash_manager_ = other.sign_hash_manager_;
for (size_t i = 0; i < counters_.size(); i++) {
atomic::store(&counters_[i], atomic::load(&other.counters_[i]));
}
Expand All @@ -63,8 +71,8 @@ class count_sketch {
*/
void update(T key) {
  // Touch exactly one counter in each of the num_estimates_ rows.
  for (size_t i = 0; i < num_estimates_; i++) {
    // Row i picks its column with bucket hash i and its increment with sign hash i.
    size_t bucket_idx = bucket_hash_manager_.hash(i, key) % num_buckets_;
    counter_t sign = to_sign(sign_hash_manager_.hash(i, key));
    // Counters are stored row-major: row i starts at offset num_buckets_ * i.
    atomic::faa<counter_t>(&counters_[num_buckets_ * i + bucket_idx], sign);
  }
}
Expand All @@ -77,11 +85,11 @@ class count_sketch {
counter_t estimate(T key) {
  // One candidate estimate per row; the result is their median.
  std::vector<counter_t> median_buf(num_estimates_);
  for (size_t i = 0; i < num_estimates_; i++) {
    // Same bucket/sign selection as update(), so the counter read here is the
    // one this key's updates were added to.
    size_t bucket_idx = bucket_hash_manager_.hash(i, key) % num_buckets_;
    counter_t sign = to_sign(sign_hash_manager_.hash(i, key));
    // Multiply by the sign again to undo the signed increment applied on update.
    median_buf[i] = sign * atomic::load(&counters_[num_buckets_ * i + bucket_idx]);
  }
  return median(median_buf);
}

/**
Expand All @@ -92,12 +100,21 @@ class count_sketch {
counter_t update_and_estimate(T key) {
  // One candidate estimate per row; the result is their median.
  std::vector<counter_t> median_buf(num_estimates_);
  for (size_t i = 0; i < num_estimates_; i++) {
    size_t bucket_idx = bucket_hash_manager_.hash(i, key) % num_buckets_;
    counter_t sign = to_sign(sign_hash_manager_.hash(i, key));
    // Single atomic step per row: add the sign and keep the value faa hands back.
    // NOTE(review): named old_count, so presumably the pre-update value - confirm
    // against atomic::faa.
    counter_t old_count = atomic::faa<counter_t>(&counters_[num_buckets_ * i + bucket_idx], sign);
    median_buf[i] = sign * old_count;
  }
  return median(median_buf);
}

/**
 * Reports how much memory the counter table occupies.
 * @return size of the counters in bytes (hash functions are not included)
 */
size_t storage_size() {
  // TODO account for hashes (O(n) increase)
  const size_t num_counters = num_estimates_ * num_buckets_;
  return num_counters * sizeof(atomic_counter_t);
}

/**
Expand All @@ -108,10 +125,9 @@ class count_sketch {
* @return count min sketch with accuracy guarantees
*/
// TODO rename func
static count_sketch create_parameterized(double gamma, double epsilon) {
  // Translate the requested guarantees into dimensions: gamma determines the
  // number of rows (estimates), epsilon the number of buckets per row.
  return count_sketch(count_sketch<T>::perror_to_num_estimates(gamma),
                      count_sketch<T>::error_margin_to_num_buckets(epsilon));
}

// TODO move
Expand All @@ -122,7 +138,7 @@ class count_sketch {
*/
static size_t perror_to_num_estimates(double gamma) {
  // TODO assert
  // n = 2^(number of bits in T) - 1. The 2.0 literal keeps the whole computation
  // in double precision.
  double n = std::pow(2.0, sizeof(T) * 8) - 1;
  // Round up to a whole number of rows; the cast makes the double -> size_t
  // conversion explicit.
  return static_cast<size_t>(std::ceil(std::log2(n / gamma)));
}

Expand All @@ -145,7 +161,8 @@ class count_sketch {
size_t num_buckets_; // width

std::vector<atomic_counter_t> counters_;
hash_manager hash_manager_;
hash_manager<T> bucket_hash_manager_;
hash_manager<T> sign_hash_manager_;

};

Expand Down
43 changes: 21 additions & 22 deletions libconfluo/confluo/container/sketch/hash_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,42 @@
namespace confluo {
namespace sketch {

// TODO rename
class simple_hash {
/**
* Pairwise-independent hash
*/
template<typename T>
class pairwise_indep_hash {

public:
static const size_t PRIME = 4294967311;
static const size_t PRIME = 39916801;

simple_hash()
: simple_hash(0, 0) {
pairwise_indep_hash()
: pairwise_indep_hash(0, 0) {
}

simple_hash(size_t a, size_t b)
pairwise_indep_hash(size_t a, size_t b)
: a_(a),
b_(b) {
b_(b),
hash_() {
}

template<typename T>
typename std::enable_if<!std::is_arithmetic<T>::value, size_t>::type apply(T elem) {
return (a_ * std::hash<T>(elem) + b_) % PRIME;
size_t apply(T elem) {
return (a_ * hash_(elem) + b_) % PRIME;
}

template<typename T>
typename std::enable_if<std::is_arithmetic<T>::value, size_t>::type apply(T elem) {
return (a_ * elem + b_) % PRIME;
}

static simple_hash generate_random() {
return simple_hash(utils::rand_utils::rand_uint64(PRIME), utils::rand_utils::rand_uint64(PRIME));
static pairwise_indep_hash<T> generate_random() {
return pairwise_indep_hash<T>(utils::rand_utils::rand_uint64(PRIME), utils::rand_utils::rand_uint64(PRIME));
}

private:
size_t a_, b_;
std::hash<T> hash_;

};

const size_t simple_hash::PRIME;
template<typename T> const size_t pairwise_indep_hash<T>::PRIME;

template<typename T>
class hash_manager {
public:
/**
Expand All @@ -63,7 +63,7 @@ class hash_manager {
size_t cur_size = hashes_.size();
size_t num_new_hashes = num_hashes > cur_size ? num_hashes - cur_size : 0;
for (size_t i = 0; i < num_new_hashes; i++) {
hashes_.push_back(simple_hash::generate_random());
hashes_.push_back(pairwise_indep_hash<T>::generate_random());
}
}

Expand All @@ -73,13 +73,12 @@ class hash_manager {
* @param elem element to hash
* @return hashed value
*/
template<typename T>
size_t hash(size_t hash_id, T elem) {
return hashes_[hash_id].apply<T>(elem);
return hashes_[hash_id].apply(elem);
}

private:
std::vector<simple_hash> hashes_;
std::vector<pairwise_indep_hash<T>> hashes_;

};

Expand Down
Loading

0 comments on commit 2996b33

Please sign in to comment.