diff --git a/Gemfile.lock b/Gemfile.lock index d1b5df5..aac0d0e 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - normal_distribution (0.1.2) + normal_distribution (0.2.0) GEM remote: https://rubygems.org/ diff --git a/LICENSE.txt b/LICENSE.txt index 515e8ba..00b0a69 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2020 Dominik +Copyright (c) 2020 Dominik Čermák Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index b8e5e7c..f5dcae2 100644 --- a/README.md +++ b/README.md @@ -25,14 +25,16 @@ data = [1, 1.5, 1.8, 1.9, 2, 2.1, 2, 2.3, 2.7, 2.9, 3.1] model = NormalDistribution::Model.new data percentage = 95 -bottom, top = model.confidence_interval(percentage) +interval = model.confidence_interval(percentage) potential_anomaly = 3.0 -if bottom > potential_anomaly or top < potential_anomaly - puts "#{ potential_anomaly } is in group of rarest 5 %. Therefore, it's an anomaly" +unless interval.include? potential_anomaly + puts "#{ potential_anomaly } is in group of values with probability lower then 5 %. Therefore, it's an anomaly" end ``` +![95% confidence interval](https://upload.wikimedia.org/wikipedia/commons/b/bf/NormalDist1.96.png) + ## Development After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. diff --git a/ext/normal_distribution/confidence_interval.c b/ext/normal_distribution/confidence_interval.c new file mode 100644 index 0000000..b8c33ba --- /dev/null +++ b/ext/normal_distribution/confidence_interval.c @@ -0,0 +1,41 @@ +#include "confidence_interval.h" + +static VALUE t_init( VALUE self, VALUE lower_bound, VALUE upper_bound ) { + double lower = NUM2DBL( lower_bound ); + double upper = NUM2DBL( upper_bound ); + + if ( lower > upper ) { + rb_raise( rb_eArgError, "lower bound must not be greater then upper bound" ); + } + + rb_iv_set( self, "@lower_bound", lower_bound ); + rb_iv_set( self, "@upper_bound", upper_bound ); + + return self; +} + +static VALUE t_attr_get_upper_bound( VALUE self ) { + return rb_iv_get( self, "@upper_bound" ); +} + +static VALUE t_attr_get_lower_bound( VALUE self ) { + return rb_iv_get( self, "@lower_bound" ); +} + +static VALUE t_include( VALUE self, VALUE value ) { + double lower = NUM2DBL( rb_iv_get( self, "@lower_bound" ) ); + double upper = NUM2DBL( rb_iv_get( self, "@upper_bound" ) ); + double v = NUM2DBL( value ); + + return v < lower || v > upper ? Qfalse : Qtrue; +} + +void Init_confidence_interval( void ) { + VALUE rb_mNormalDistribution = rb_path2class( "NormalDistribution" ); + VALUE rb_cConfidenceInterval = rb_define_class_under( rb_mNormalDistribution, "ConfidenceInterval", rb_cObject ); + + rb_define_method( rb_cConfidenceInterval, "initialize", t_init, 2 ); + rb_define_method( rb_cConfidenceInterval, "lower_bound", t_attr_get_lower_bound, 0 ); + rb_define_method( rb_cConfidenceInterval, "upper_bound", t_attr_get_upper_bound, 0 ); + rb_define_method( rb_cConfidenceInterval, "include?", t_include, 1 ); +} diff --git a/ext/normal_distribution/confidence_interval.h b/ext/normal_distribution/confidence_interval.h new file mode 100644 index 0000000..94e0c19 --- /dev/null +++ b/ext/normal_distribution/confidence_interval.h @@ -0,0 +1,8 @@ +#ifndef NORMAL_DISTRIBUTION_CONFIDENCE_INTERVAL_H +#define NORMAL_DISTRIBUTION_CONFIDENCE_INTERVAL_H + +#include "ruby.h" + +void Init_confidence_interval( void ); + +#endif //NORMAL_DISTRIBUTION_CONFIDENCE_INTERVAL_H diff --git a/ext/normal_distribution/erf_inv.h b/ext/normal_distribution/erf_inv.h index 2ad9bf1..30058bb 100644 --- a/ext/normal_distribution/erf_inv.h +++ b/ext/normal_distribution/erf_inv.h @@ -7,6 +7,6 @@ #include -long double t_erf_inv(long double x); +long double t_erf_inv( long double x ); #endif //NORMAL_DISTRIBUTION_ERF_INV_H diff --git a/ext/normal_distribution/model.c b/ext/normal_distribution/model.c new file mode 100644 index 0000000..3ac64c8 --- /dev/null +++ b/ext/normal_distribution/model.c @@ -0,0 +1,111 @@ +#include "model.h" + +static double t_parse_percentage( VALUE percentage ) { + double perc = NUM2DBL( percentage ); + + if ( perc > 100 || perc < 0 ) { + rb_raise( rb_eArgError, "percentage must be between 0 and 100" ); + } + + return perc; +} + +static double t_mean( double * data, long size ) { + double sum = .0; + + for ( long i = 0 ; i < size ; ++ i ) { + sum += data[i]; + } + + return sum / size; +} + +static double t_z_score( double percentage ) { + return sqrt( 2 ) * t_erf_inv( percentage / 100 ); +} + +static double t_variance( double * data, long size, double mean ) { + double * squared_diff = ALLOC_N( double, size ); + + for ( long i = 0 ; i < size ; ++ i ) { + squared_diff[i] = pow( mean - data[i], 2 ); + } + + double variance = t_mean( squared_diff, size ); + free( squared_diff ); + + return variance; +} + +static double t_stddev( double * data, long size, double mean ) { + return sqrt( t_variance( data, size, mean ) ); +} + +static double * t_parse_dbl_ary( VALUE ary, long * size ) { + Check_Type( ary, T_ARRAY ); + long len = RARRAY_LEN( ary ); + + if ( len == 0 ) { + rb_raise( rb_eArgError, "data must not be empty" ); + } + + VALUE * values = RARRAY_PTR( ary ); + double * d_data = ALLOC_N( double, len ); + + for ( int i = 0 ; i < len ; ++ i ) { + d_data[i] = NUM2DBL( values[i] ); + } + + *size = len; + + return d_data; +} + +static VALUE t_init( VALUE self, VALUE values ) { + long size; + double * data = t_parse_dbl_ary( values, &size ); + double mean = t_mean( data, size ); + double stddev = t_stddev( data, size, mean ); + + rb_iv_set( self, "@mean", rb_float_new( mean ) ); + rb_iv_set( self, "@standard_deviation", rb_float_new( stddev ) ); + xfree( data ); + + return self; +} + +static VALUE t_confidence_interval( VALUE self, VALUE percentage ) { + double perc = t_parse_percentage( percentage ); + double z = t_z_score( perc ); + double stddev = NUM2DBL( rb_iv_get( self, "@standard_deviation" ) ); + double mean = NUM2DBL( rb_iv_get( self, "@mean" ) ); + double lower_bound = - z * stddev + mean; + double upper_bound = z * stddev + mean; + + VALUE rb_cConfidenceInterval = rb_path2class( "NormalDistribution::ConfidenceInterval" ); + VALUE interval = rb_funcall( + rb_cConfidenceInterval, rb_intern( "new" ), 2, + rb_float_new( lower_bound ), + rb_float_new( upper_bound ) + ); + + return interval; +} + +static VALUE t_attr_mean( VALUE self ) { + return rb_iv_get( self, "@mean" ); +} + +static VALUE t_attr_stddev( VALUE self ) { + return rb_iv_get( self, "@standard_deviation" ); +} + +void Init_model( void ) { + VALUE rb_mNormalDistribution = rb_path2class( "NormalDistribution" ); + VALUE rb_cModel = rb_define_class_under( rb_mNormalDistribution, "Model", rb_cObject ); + + rb_define_method( rb_cModel, "initialize", t_init, 1 ); + rb_define_method( rb_cModel, "confidence_interval", t_confidence_interval, 1 ); + rb_define_method( rb_cModel, "mean", t_attr_mean, 0 ); + rb_define_method( rb_cModel, "standard_deviation", t_attr_stddev, 0 ); +} diff --git a/ext/normal_distribution/model.h b/ext/normal_distribution/model.h new file mode 100644 index 0000000..0a6c617 --- /dev/null +++ b/ext/normal_distribution/model.h @@ -0,0 +1,10 @@ +#ifndef NORMAL_DISTRIBUTION_MODEL_H +#define NORMAL_DISTRIBUTION_MODEL_H + +#include "ruby.h" +#include "erf_inv.h" +#include "confidence_interval.h" + +void Init_model( void ); + +#endif //NORMAL_DISTRIBUTION_MODEL_H diff --git a/ext/normal_distribution/normal_distribution.c b/ext/normal_distribution/normal_distribution.c index 8520ad3..3aa0882 100644 --- a/ext/normal_distribution/normal_distribution.c +++ b/ext/normal_distribution/normal_distribution.c @@ -1,108 +1,8 @@ #include "normal_distribution.h" -static double t_parse_percentage(VALUE percentage) { - double perc = NUM2DBL( percentage ); - - if (perc > 100 || perc < 0) { - rb_raise(rb_eArgError, "percentage must be between 0 and 100"); - } - - return perc; -} - -static double t_mean( double * data, long size ) { - double sum = .0; - - for ( long i = 0 ; i < size ; ++ i ) { - sum += data[i]; - } - - return sum / size; -} - -static double t_z_score( double percentage ) { - return sqrt( 2 ) * t_erf_inv( percentage / 100 ); -} - -static double t_variance( double * data, long size, double mean ) { - double * squared_diff = ALLOC_N( double, size ); - - for ( long i = 0 ; i < size ; ++ i ) { - squared_diff[i] = pow( mean - data[i], 2 ); - } - - double variance = t_mean( squared_diff, size ); - free( squared_diff ); - - return variance; -} - -static double t_stddev( double * data, long size, double mean ) { - return sqrt( t_variance( data, size, mean ) ); -} - -static double * t_parse_dbl_ary( VALUE ary, long * size ) { - Check_Type(ary, T_ARRAY); - long len = RARRAY_LEN( ary ); - - if (len == 0) { - rb_raise(rb_eArgError, "data must not be empty"); - } - - VALUE * values = RARRAY_PTR( ary ); - double * d_data = ALLOC_N( double, len ); - - for ( int i = 0 ; i < len ; ++ i ) { - d_data[i] = NUM2DBL( values[i] ); - } - - *size = len; - - return d_data; -} - -static VALUE t_init( VALUE self, VALUE values ) { - long size; - double * data = t_parse_dbl_ary( values, &size ); - double mean = t_mean( data, size ); - double stddev = t_stddev( data, size, mean ); - - rb_iv_set( self, "@mean", rb_float_new( mean ) ); - rb_iv_set( self, "@standard_deviation", rb_float_new( stddev ) ); - free( data ); - - return self; -} - -static VALUE t_confidence_interval( VALUE self, VALUE percentage ) { - double perc = t_parse_percentage( percentage ); - double z = t_z_score( perc ); - double stddev = NUM2DBL( rb_iv_get( self, "@standard_deviation" ) ); - double mean = NUM2DBL( rb_iv_get( self, "@mean" ) ); - double lower_bound = - z * stddev + mean; - double upper_bound = z * stddev + mean; - - VALUE pair = rb_ary_new(); - rb_ary_push( pair, rb_float_new( lower_bound ) ); - rb_ary_push( pair, rb_float_new( upper_bound ) ); - - return pair; -} - -static VALUE t_attr_mean( VALUE self ) { - return rb_iv_get( self, "@mean" ); -} - -static VALUE t_attr_stddev( VALUE self ) { - return rb_iv_get( self, "@standard_deviation" ); -} - void Init_normal_distribution( void ) { - VALUE module = rb_define_module( "NormalDistribution" ); - VALUE model = rb_define_class_under( module, "Model", rb_cObject ); + rb_define_module( "NormalDistribution" ); - rb_define_method( model, "initialize", t_init, 1 ); - rb_define_method( model, "confidence_interval", t_confidence_interval, 1 ); - rb_define_method( model, "mean", t_attr_mean, 0 ); - rb_define_method( model, "standard_deviation", t_attr_stddev, 0 ); + Init_confidence_interval(); + Init_model(); } diff --git a/ext/normal_distribution/normal_distribution.h b/ext/normal_distribution/normal_distribution.h index ab54f1c..9dc057f 100644 --- a/ext/normal_distribution/normal_distribution.h +++ b/ext/normal_distribution/normal_distribution.h @@ -1,10 +1,7 @@ #ifndef NORMAL_DISTRIBUTION_H -#define NORMAL_DISTRIBUTION_H 1 +#define NORMAL_DISTRIBUTION_H -#include -#include -#include -#include "ruby.h" -#include "erf_inv.h" +#include "model.h" +#include "confidence_interval.h" -#endif /* NORMAL_DISTRIBUTION_H */ +#endif //NORMAL_DISTRIBUTION_H diff --git a/lib/normal_distribution/confidence_interval.rb b/lib/normal_distribution/confidence_interval.rb new file mode 100644 index 0000000..6dc5ed3 --- /dev/null +++ b/lib/normal_distribution/confidence_interval.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +module NormalDistribution + # Confidence interval of normal distribution + # + # @since 0.2.0 + class ConfidenceInterval + # @!parse [ruby] + # + # # @return [Float] lower bound of confidence interval + # attr_reader :lower_bound + # + # # @return [Float] upper bound of confidence interval + # attr_reader :upper_bound + # + # # Initializes confidence interval + # # + # # @param lower_bound [Numeric] lower bound of confidence interval + # # @param upper_bound [Numeric] upper bound of confidence interval + # def initialize(lower_bound, upper_bound) + # # This is stub used for indexing + # end + # + # # Decides, whether value is from the interval + # # + # # @param value [Numeric] value to be compared with interval bounds + # # @return [Boolean] true if value is from the interval. Otherwise returns false. + # def include?(value) + # # This is stub used for indexing + # end + end +end \ No newline at end of file diff --git a/lib/normal_distribution/model.rb b/lib/normal_distribution/model.rb index c13c7bc..a4dc3a8 100644 --- a/lib/normal_distribution/model.rb +++ b/lib/normal_distribution/model.rb @@ -20,10 +20,12 @@ class Model # # This is stub used for indexing # end # - # # Calculates confidence interval for given percentage + # # Calculates confidence interval for given probability in percentage # # - # # @param percentage [Numeric] a number in interval <0, 100> representing probability - # # @return [Array] an array containing 2 values, lower bound and upper_bound of confidence interval + # # @param percentage [Numeric] a number in interval <0, 100> representing probability in percentage + # # @return [ConfidenceInterval] an instance of ConfidenceInterval class + # # + # # @since 0.2.0 # def confidence_interval(percentage) # # This is stub used for indexing # end diff --git a/lib/normal_distribution/version.rb b/lib/normal_distribution/version.rb index 0bea5ff..1606bfd 100644 --- a/lib/normal_distribution/version.rb +++ b/lib/normal_distribution/version.rb @@ -2,5 +2,5 @@ module NormalDistribution # Current version of NormalDistribution gem - VERSION = "0.1.2" + VERSION = "0.2.0" end diff --git a/spec/normal_distribution/confidence_interval_spec.rb b/spec/normal_distribution/confidence_interval_spec.rb new file mode 100644 index 0000000..1ace87f --- /dev/null +++ b/spec/normal_distribution/confidence_interval_spec.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +RSpec.describe NormalDistribution::ConfidenceInterval do + describe 'functionality' do + context 'with success' do + it 'correctly returns given bounds' do + interval = NormalDistribution::ConfidenceInterval.new -2.1, 4.4 + + expect(interval).to have_attributes lower_bound: -2.1, + upper_bound: 4.4 + end + + it 'includes value between bounds' do + interval = NormalDistribution::ConfidenceInterval.new -2.1, 4.4 + + expect(interval.include?(0.1)).to eq true + end + + it 'does not include value between bounds' do + interval = NormalDistribution::ConfidenceInterval.new -2.1, 4.4 + + expect(interval.include?(4.41)).to eq false + end + end + + context 'with raised error' do + it 'raises type error when lower bound is not a number' do + expect { + NormalDistribution::ConfidenceInterval.new 'hello', 4.4 + }.to raise_error TypeError, 'no implicit conversion to float from string' + end + + it 'raises type error when upper bound is not a number' do + expect { + NormalDistribution::ConfidenceInterval.new -2.1, 'world' + }.to raise_error TypeError, 'no implicit conversion to float from string' + end + + it 'raises argument error when lower bound is greater then upper bound' do + expect { + NormalDistribution::ConfidenceInterval.new 4.0, 2.0 + }.to raise_error ArgumentError, 'lower bound must not be greater then upper bound' + end + + it 'raises error when #include? non-numerical argument' do + interval = NormalDistribution::ConfidenceInterval.new -2.1, 4.4 + + expect { interval.include?('hello') }.to raise_error TypeError, + 'no implicit conversion to float from string' + end + end + end +end \ No newline at end of file diff --git a/spec/normal_distribution/model_spec.rb b/spec/normal_distribution/model_spec.rb index 3bb0a51..2590a5c 100644 --- a/spec/normal_distribution/model_spec.rb +++ b/spec/normal_distribution/model_spec.rb @@ -16,10 +16,11 @@ end it 'calculates correctly confidence interval' do - bottom, top = subject.confidence_interval(95) + interval = subject.confidence_interval(95) - expect(bottom).to be_within(0.0001).of(0.7368) - expect(top).to be_within(0.0001).of(5.2631) + expect(interval).to be_instance_of(NormalDistribution::ConfidenceInterval) + expect(interval.lower_bound).to be_within(0.0001).of(0.7368) + expect(interval.upper_bound).to be_within(0.0001).of(5.2631) end end @@ -38,8 +39,9 @@ end it 'raises error when percentage is a string' do - expect { subject.confidence_interval('invalid_type') }.to raise_error TypeError, - 'no implicit conversion to float from string' + expect { + subject.confidence_interval('invalid_type') + }.to raise_error TypeError, 'no implicit conversion to float from string' end end @@ -69,7 +71,7 @@ end end - describe 'performace' do + describe 'performance' do specify 'constructor performs linearly depending on data' do sizes = bench_range(8, 100_000) data_arrays = sizes.map { |n| Array.new(n) { rand(n) } } @@ -79,8 +81,8 @@ }.to perform_linear.in_range(sizes) end - specify 'confidence_interval calculation performs constantly' do - min, max = 8, 100_000 + specify 'confidence interval calculation performs constantly' do + min, max = 8, 1_000_000 sizes = bench_range(min, max) models = sizes.map do |n| data = Array.new(n) { rand(n) }