Skip to content

Commit

Permalink
Merge pull request #2 from domcermak/1-simplify-confidence-interval
Browse files Browse the repository at this point in the history
1 simplify confidence interval
  • Loading branch information
domcermak authored Dec 21, 2020
2 parents bc4578a + 882b334 commit 6ca8ebd
Show file tree
Hide file tree
Showing 15 changed files with 286 additions and 128 deletions.
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
normal_distribution (0.1.2)
normal_distribution (0.2.0)

GEM
remote: https://rubygems.org/
Expand Down
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
The MIT License (MIT)

Copyright (c) 2020 Dominik
Copyright (c) 2020 Dominik Čermák

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,16 @@ data = [1, 1.5, 1.8, 1.9, 2, 2.1, 2, 2.3, 2.7, 2.9, 3.1]
model = NormalDistribution::Model.new data

percentage = 95
bottom, top = model.confidence_interval(percentage)
interval = model.confidence_interval(percentage)

potential_anomaly = 3.0
if bottom > potential_anomaly or top < potential_anomaly
puts "#{ potential_anomaly } is in group of rarest 5 %. Therefore, it's an anomaly"
unless interval.include? potential_anomaly
puts "#{ potential_anomaly } is in the group of values with probability lower than 5 %. Therefore, it's an anomaly"
end
```

![95% confidence interval](https://upload.wikimedia.org/wikipedia/commons/b/bf/NormalDist1.96.png)

## Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
Expand Down
41 changes: 41 additions & 0 deletions ext/normal_distribution/confidence_interval.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include "confidence_interval.h"

/*
 * ConfidenceInterval#initialize(lower_bound, upper_bound)
 *
 * Checks that the two bounds are ordered and stores them on the instance.
 * The bounds are converted to double only for the comparison; the Ruby
 * objects that were passed in are what ends up in @lower_bound and
 * @upper_bound.
 *
 * Raises ArgumentError when lower_bound is greater than upper_bound
 * (NUM2DBL itself may raise for arguments it cannot convert).
 * Returns self.
 */
static VALUE t_init( VALUE self, VALUE lower_bound, VALUE upper_bound ) {
    const double lo = NUM2DBL( lower_bound );
    const double hi = NUM2DBL( upper_bound );

    if ( hi < lo ) {
        rb_raise( rb_eArgError, "lower bound must not be greater then upper bound" );
    }

    rb_iv_set( self, "@lower_bound", lower_bound );
    rb_iv_set( self, "@upper_bound", upper_bound );
    return self;
}

/* ConfidenceInterval#upper_bound: reader for the @upper_bound instance variable. */
static VALUE t_attr_get_upper_bound( VALUE self ) {
    VALUE upper = rb_iv_get( self, "@upper_bound" );

    return upper;
}

/* ConfidenceInterval#lower_bound: reader for the @lower_bound instance variable. */
static VALUE t_attr_get_lower_bound( VALUE self ) {
    VALUE lower = rb_iv_get( self, "@lower_bound" );

    return lower;
}

/*
 * ConfidenceInterval#include?(value)
 *
 * Returns Qtrue when value lies within [@lower_bound, @upper_bound]
 * (both ends inclusive), Qfalse otherwise. All three numbers are compared
 * as doubles; NUM2DBL may raise for a value it cannot convert.
 */
static VALUE t_include( VALUE self, VALUE value ) {
    double lower = NUM2DBL( rb_iv_get( self, "@lower_bound" ) );
    double upper = NUM2DBL( rb_iv_get( self, "@upper_bound" ) );
    double v = NUM2DBL( value );

    /*
     * Positive form on purpose: every ordered comparison involving NaN is
     * false, so a NaN value (or a NaN bound) yields "not included". The
     * previous "v < lower || v > upper ? false : true" reported NaN as
     * being inside the interval.
     */
    return ( v >= lower && v <= upper ) ? Qtrue : Qfalse;
}

/*
 * Registers NormalDistribution::ConfidenceInterval and its methods.
 *
 * Looks the NormalDistribution module up by path, so the module must already
 * be defined when this function runs.
 */
void Init_confidence_interval( void ) {
    VALUE parent = rb_path2class( "NormalDistribution" );
    VALUE klass = rb_define_class_under( parent, "ConfidenceInterval", rb_cObject );

    /* constructor */
    rb_define_method( klass, "initialize", t_init, 2 );

    /* readers */
    rb_define_method( klass, "lower_bound", t_attr_get_lower_bound, 0 );
    rb_define_method( klass, "upper_bound", t_attr_get_upper_bound, 0 );

    /* membership test */
    rb_define_method( klass, "include?", t_include, 1 );
}
8 changes: 8 additions & 0 deletions ext/normal_distribution/confidence_interval.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef NORMAL_DISTRIBUTION_CONFIDENCE_INTERVAL_H
#define NORMAL_DISTRIBUTION_CONFIDENCE_INTERVAL_H

#include "ruby.h"

void Init_confidence_interval( void );

#endif //NORMAL_DISTRIBUTION_CONFIDENCE_INTERVAL_H
2 changes: 1 addition & 1 deletion ext/normal_distribution/erf_inv.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@

#include <math.h>

long double t_erf_inv(long double x);
long double t_erf_inv( long double x );

#endif //NORMAL_DISTRIBUTION_ERF_INV_H
111 changes: 111 additions & 0 deletions ext/normal_distribution/model.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#include "model.h"

/*
 * Converts a Ruby number to a double and validates it as a percentage.
 *
 * Returns the value when it lies in [0, 100].
 * Raises ArgumentError otherwise (NUM2DBL may itself raise for an argument
 * it cannot convert).
 */
static double t_parse_percentage( VALUE percentage ) {
    double perc = NUM2DBL( percentage );

    /*
     * Negated in-range test rather than "perc > 100 || perc < 0": NaN fails
     * every ordered comparison, so the old form let NaN through to the
     * z-score computation.
     */
    if ( !( perc >= 0 && perc <= 100 ) ) {
        rb_raise( rb_eArgError, "percentage must be between 0 and 100" );
    }

    return perc;
}

/*
 * Arithmetic mean of the first `size` elements of `data`.
 *
 * Elements are added front to back and the total is divided by `size`;
 * the caller is responsible for passing a non-zero size.
 */
static double t_mean( double * data, long size ) {
    double total = .0;
    const double * cursor = data;
    long remaining = size;

    while ( remaining > 0 ) {
        total += *cursor;
        ++ cursor;
        -- remaining;
    }

    return total / size;
}

/*
 * z-score for a central coverage given in percent:
 * z = sqrt(2) * erf_inv(percentage / 100).
 *
 * The product is evaluated in long double (the return type of t_erf_inv)
 * and narrowed to double on return.
 */
static double t_z_score( double percentage ) {
    long double inverse_erf = t_erf_inv( percentage / 100 );

    return sqrt( 2 ) * inverse_erf;
}

/*
 * Population variance of `data`: the mean of the squared deviations from
 * `mean` (divides by `size`, not size - 1).
 *
 * data - samples
 * size - number of samples; must be non-zero
 * mean - arithmetic mean of the samples, precomputed by the caller
 *
 * The squared deviations are accumulated directly. The previous version
 * filled an ALLOC_N scratch buffer and released it with libc free(), which
 * mismatches Ruby's allocator (ALLOC_N memory belongs to xfree) and cost an
 * allocation per call for no benefit.
 */
static double t_variance( double * data, long size, double mean ) {
    double sum = .0;

    for ( long i = 0 ; i < size ; ++ i ) {
        double diff = mean - data[i];

        sum += diff * diff;
    }

    return sum / size;
}

/*
 * Standard deviation of `data`: the square root of the variance that
 * t_variance computes for the same arguments.
 */
static double t_stddev( double * data, long size, double mean ) {
    double variance = t_variance( data, size, mean );

    return sqrt( variance );
}

/* rb_protect callback: converts one Ruby number to a Float VALUE. */
static VALUE t_protected_to_float( VALUE number ) {
    return rb_float_new( NUM2DBL( number ) );
}

/*
 * Copies a Ruby Array of numbers into a freshly allocated C array of double.
 *
 * ary  - must be an Array (Check_Type raises TypeError otherwise)
 * size - out parameter; receives the number of elements copied
 *
 * Returns a buffer obtained from ALLOC_N; the caller owns it and must release
 * it with xfree().
 *
 * Raises ArgumentError when the array is empty, and re-raises whatever the
 * numeric conversion raises for an element that cannot be converted.
 */
static double * t_parse_dbl_ary( VALUE ary, long * size ) {
    Check_Type( ary, T_ARRAY );
    long len = RARRAY_LEN( ary );

    if ( len == 0 ) {
        rb_raise( rb_eArgError, "data must not be empty" );
    }

    double * d_data = ALLOC_N( double, len );

    /* long index: the length is a long, an int counter could overflow */
    for ( long i = 0 ; i < len ; ++ i ) {
        int state = 0;
        /*
         * rb_ary_entry re-reads the array on every step instead of keeping a
         * raw element pointer across conversions that may run Ruby code.
         * rb_protect catches a raise from the conversion so the buffer can be
         * released before the exception continues to propagate.
         */
        VALUE converted = rb_protect( t_protected_to_float, rb_ary_entry( ary, i ), &state );

        if ( state != 0 ) {
            xfree( d_data );
            rb_jump_tag( state );
        }

        d_data[i] = RFLOAT_VALUE( converted );
    }

    *size = len;

    return d_data;
}

/*
 * Model#initialize(values)
 *
 * Parses the given array into doubles, computes the mean and the standard
 * deviation, and stores both as Floats in @mean and @standard_deviation.
 * The temporary C buffer is released as soon as the statistics are known.
 * Returns self.
 */
static VALUE t_init( VALUE self, VALUE values ) {
    long count;
    double * samples = t_parse_dbl_ary( values, &count );
    const double average = t_mean( samples, count );
    const double deviation = t_stddev( samples, count, average );

    /* the buffer is not needed past this point */
    xfree( samples );

    rb_iv_set( self, "@mean", rb_float_new( average ) );
    rb_iv_set( self, "@standard_deviation", rb_float_new( deviation ) );

    return self;
}

/*
 * Model#confidence_interval(percentage)
 *
 * Builds the interval mean -/+ z * standard_deviation, where z is the z-score
 * for the requested percentage, and returns it as a new
 * NormalDistribution::ConfidenceInterval created through its `new` method.
 *
 * Raises ArgumentError (via t_parse_percentage) for a percentage outside
 * 0..100.
 */
static VALUE t_confidence_interval( VALUE self, VALUE percentage ) {
    double z = t_z_score( t_parse_percentage( percentage ) );
    double stddev = NUM2DBL( rb_iv_get( self, "@standard_deviation" ) );
    double mean = NUM2DBL( rb_iv_get( self, "@mean" ) );

    VALUE lower = rb_float_new( - z * stddev + mean );
    VALUE upper = rb_float_new( z * stddev + mean );
    VALUE interval_class = rb_path2class( "NormalDistribution::ConfidenceInterval" );

    return rb_funcall( interval_class, rb_intern( "new" ), 2, lower, upper );
}

/* Model#mean: reader for the @mean instance variable. */
static VALUE t_attr_mean( VALUE self ) {
    VALUE mean = rb_iv_get( self, "@mean" );

    return mean;
}

/* Model#standard_deviation: reader for the @standard_deviation instance variable. */
static VALUE t_attr_stddev( VALUE self ) {
    VALUE stddev = rb_iv_get( self, "@standard_deviation" );

    return stddev;
}

/*
 * Registers NormalDistribution::Model and its methods.
 *
 * Looks the NormalDistribution module up by path, so the module must already
 * be defined when this function runs.
 */
void Init_model( void ) {
    VALUE parent = rb_path2class( "NormalDistribution" );
    VALUE klass = rb_define_class_under( parent, "Model", rb_cObject );

    /* constructor */
    rb_define_method( klass, "initialize", t_init, 1 );

    /* readers */
    rb_define_method( klass, "mean", t_attr_mean, 0 );
    rb_define_method( klass, "standard_deviation", t_attr_stddev, 0 );

    /* derived statistics */
    rb_define_method( klass, "confidence_interval", t_confidence_interval, 1 );
}
10 changes: 10 additions & 0 deletions ext/normal_distribution/model.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#ifndef NORMAL_DISTRIBUTION_MODEL_H
#define NORMAL_DISTRIBUTION_MODEL_H

#include "ruby.h"
#include "erf_inv.h"
#include "confidence_interval.h"

void Init_model( void );

#endif //NORMAL_DISTRIBUTION_MODEL_H
106 changes: 3 additions & 103 deletions ext/normal_distribution/normal_distribution.c
Original file line number Diff line number Diff line change
@@ -1,108 +1,8 @@
#include "normal_distribution.h"

/*
 * Converts a Ruby number to a double and raises ArgumentError when it is
 * greater than 100 or less than 0; otherwise returns the converted value.
 * NOTE(review): a NaN argument satisfies neither comparison and is returned
 * unchanged.
 */
static double t_parse_percentage(VALUE percentage) {
double perc = NUM2DBL( percentage );

if (perc > 100 || perc < 0) {
rb_raise(rb_eArgError, "percentage must be between 0 and 100");
}

return perc;
}

/*
 * Arithmetic mean: sums the first `size` elements of `data` and divides the
 * sum by `size`.
 */
static double t_mean( double * data, long size ) {
double sum = .0;

for ( long i = 0 ; i < size ; ++ i ) {
sum += data[i];
}

return sum / size;
}

/* z-score for a percentage: sqrt(2) * t_erf_inv(percentage / 100). */
static double t_z_score( double percentage ) {
return sqrt( 2 ) * t_erf_inv( percentage / 100 );
}

/*
 * Variance of `data` around `mean`: fills a scratch array with the squared
 * deviations and returns their mean (i.e. divides by `size`).
 */
static double t_variance( double * data, long size, double mean ) {
double * squared_diff = ALLOC_N( double, size );

for ( long i = 0 ; i < size ; ++ i ) {
squared_diff[i] = pow( mean - data[i], 2 );
}

double variance = t_mean( squared_diff, size );
// NOTE(review): the buffer comes from Ruby's ALLOC_N but is released with
// libc free(); Ruby's extension guide pairs ALLOC_N with xfree - confirm.
free( squared_diff );

return variance;
}

/* Standard deviation: square root of t_variance for the same arguments. */
static double t_stddev( double * data, long size, double mean ) {
return sqrt( t_variance( data, size, mean ) );
}

/*
 * Copies a Ruby Array of numbers into a newly allocated array of double.
 * Writes the element count to *size and returns the buffer, which the caller
 * must release. Raises ArgumentError when the array is empty.
 */
static double * t_parse_dbl_ary( VALUE ary, long * size ) {
Check_Type(ary, T_ARRAY);
long len = RARRAY_LEN( ary );

if (len == 0) {
rb_raise(rb_eArgError, "data must not be empty");
}

VALUE * values = RARRAY_PTR( ary );
double * d_data = ALLOC_N( double, len );

// NOTE(review): the counter is an int while len is a long; d_data is not
// released if NUM2DBL raises part-way through the loop.
for ( int i = 0 ; i < len ; ++ i ) {
d_data[i] = NUM2DBL( values[i] );
}

*size = len;

return d_data;
}

/*
 * Model#initialize(values): parses the array, computes mean and standard
 * deviation, stores them as Floats in @mean and @standard_deviation, and
 * returns self.
 */
static VALUE t_init( VALUE self, VALUE values ) {
long size;
double * data = t_parse_dbl_ary( values, &size );
double mean = t_mean( data, size );
double stddev = t_stddev( data, size, mean );

rb_iv_set( self, "@mean", rb_float_new( mean ) );
rb_iv_set( self, "@standard_deviation", rb_float_new( stddev ) );
// NOTE(review): data was allocated with ALLOC_N in t_parse_dbl_ary but is
// released with libc free() here; the copy of this function in model.c
// uses xfree.
free( data );

return self;
}

/*
 * Model#confidence_interval(percentage): computes
 * mean -/+ z * standard_deviation for the z-score of the given percentage
 * and returns the two bounds as a Ruby Array [lower_bound, upper_bound].
 */
static VALUE t_confidence_interval( VALUE self, VALUE percentage ) {
double perc = t_parse_percentage( percentage );
double z = t_z_score( perc );
double stddev = NUM2DBL( rb_iv_get( self, "@standard_deviation" ) );
double mean = NUM2DBL( rb_iv_get( self, "@mean" ) );
double lower_bound = - z * stddev + mean;
double upper_bound = z * stddev + mean;

VALUE pair = rb_ary_new();
rb_ary_push( pair, rb_float_new( lower_bound ) );
rb_ary_push( pair, rb_float_new( upper_bound ) );

return pair;
}

/* Model#mean: returns the @mean instance variable. */
static VALUE t_attr_mean( VALUE self ) {
return rb_iv_get( self, "@mean" );
}

/* Model#standard_deviation: returns the @standard_deviation instance variable. */
static VALUE t_attr_stddev( VALUE self ) {
return rb_iv_get( self, "@standard_deviation" );
}

/*
 * Extension entry point: defines the NormalDistribution module and registers
 * its classes.
 *
 * NOTE(review): this body appears to be a rendered diff with the old and the
 * new lines interleaved - the rb_define_class_under / rb_define_method lines
 * look like the previous implementation, while the bare rb_define_module
 * call followed by Init_confidence_interval() and Init_model() looks like the
 * replacement. Confirm against the actual file before relying on it.
 */
void Init_normal_distribution( void ) {
VALUE module = rb_define_module( "NormalDistribution" );
VALUE model = rb_define_class_under( module, "Model", rb_cObject );
rb_define_module( "NormalDistribution" );

rb_define_method( model, "initialize", t_init, 1 );
rb_define_method( model, "confidence_interval", t_confidence_interval, 1 );
rb_define_method( model, "mean", t_attr_mean, 0 );
rb_define_method( model, "standard_deviation", t_attr_stddev, 0 );
Init_confidence_interval();
Init_model();
}
11 changes: 4 additions & 7 deletions ext/normal_distribution/normal_distribution.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
#ifndef NORMAL_DISTRIBUTION_H
#define NORMAL_DISTRIBUTION_H 1
#define NORMAL_DISTRIBUTION_H

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "ruby.h"
#include "erf_inv.h"
#include "model.h"
#include "confidence_interval.h"

#endif /* NORMAL_DISTRIBUTION_H */
#endif //NORMAL_DISTRIBUTION_H
32 changes: 32 additions & 0 deletions lib/normal_distribution/confidence_interval.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# frozen_string_literal: true

module NormalDistribution
# Confidence interval of a normal distribution, described by its lower and
# upper bound. The implementation lives in the C extension; the stubs below
# exist only so that the documentation tool can index the API.
#
# @since 0.2.0
class ConfidenceInterval
# @!parse [ruby]
#
# # @return [Numeric] lower bound of the confidence interval, as passed to the constructor
# attr_reader :lower_bound
#
# # @return [Numeric] upper bound of the confidence interval, as passed to the constructor
# attr_reader :upper_bound
#
# # Initializes the confidence interval
# #
# # @param lower_bound [Numeric] lower bound of the confidence interval
# # @param upper_bound [Numeric] upper bound of the confidence interval
# # @raise [ArgumentError] if lower_bound is greater than upper_bound
# def initialize(lower_bound, upper_bound)
# # This is a stub used for indexing
# end
#
# # Decides whether the value lies within the interval (both bounds are inclusive)
# #
# # @param value [Numeric] value to be compared with the interval bounds
# # @return [Boolean] true if the value lies within the interval, false otherwise
# def include?(value)
# # This is a stub used for indexing
# end
end
end
8 changes: 5 additions & 3 deletions lib/normal_distribution/model.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@ class Model
# # This is stub used for indexing
# end
#
# # Calculates confidence interval for given percentage
# # Calculates confidence interval for given probability in percentage
# #
# # @param percentage [Numeric] a number in interval <0, 100> representing probability
# # @return [Array<Float>] an array containing 2 values, lower bound and upper_bound of confidence interval
# # @param percentage [Numeric] a number in interval <0, 100> representing probability in percentage
# # @return [ConfidenceInterval] an instance of ConfidenceInterval class
# #
# # @since 0.2.0
# def confidence_interval(percentage)
# # This is stub used for indexing
# end
Expand Down
Loading

0 comments on commit 6ca8ebd

Please sign in to comment.