Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Autocorrelation #160

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
2 changes: 2 additions & 0 deletions metalearn/metafeatures/metafeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from metalearn.metafeatures.decision_tree_metafeatures import resources_info as dt_resources
from metalearn.metafeatures.general_resource_computers import resources_info as general_resources
from metalearn.metafeatures.text_metafeatures import resources_info as text_resources
from metalearn.metafeatures.simple_metafeatures import resources_info as simple_resources

from metalearn.metafeatures.decision_tree_metafeatures import metafeatures_info as dt_metafeatures
from metalearn.metafeatures.information_theoretic_metafeatures import metafeatures_info as info_theoretic_metafeatures
Expand All @@ -38,6 +39,7 @@ class Metafeatures(object):
_resources_info.update(dt_resources)
_resources_info.update(general_resources)
_resources_info.update(text_resources)
_resources_info.update(simple_resources)

# noop resource computers for the user-provided resources
# `_get_arguments` and `_resource_is_target_dependent` assumes ResourceComputer's
Expand Down
105 changes: 87 additions & 18 deletions metalearn/metafeatures/simple_metafeatures.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
from collections import Counter

import numpy as np
from pandas import DataFrame

from metalearn.metafeatures.common_operations import *
from metalearn.metafeatures.base import build_resources_info, MetafeatureComputer
from metalearn.metafeatures.base import build_resources_info, MetafeatureComputer, ResourceComputer
from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup


def get_dataset_stats(X, column_types):
def get_dataset_stats(X, column_types, num_binary_numeric, num_binary_categorical):
number_of_instances = X.shape[0]
number_of_features = X.shape[1]
numeric_features = len(get_numeric_features(X, column_types))
categorical_features = number_of_features - numeric_features
ratio_of_binary_numeric_features = num_binary_numeric / number_of_features
ratio_of_binary_categorical_features = num_binary_categorical / number_of_features
ratio_of_numeric_features = numeric_features / number_of_features
ratio_of_categorical_features = categorical_features / number_of_features
return (number_of_instances, number_of_features, numeric_features, categorical_features, ratio_of_numeric_features, ratio_of_categorical_features)
return (
number_of_instances, number_of_features, numeric_features, categorical_features,
ratio_of_numeric_features, ratio_of_categorical_features, ratio_of_binary_numeric_features,
ratio_of_binary_categorical_features
)

get_dataset_stats = MetafeatureComputer(
get_dataset_stats,
Expand All @@ -23,11 +31,17 @@ def get_dataset_stats(X, column_types):
"NumberOfNumericFeatures",
"NumberOfCategoricalFeatures",
"RatioOfNumericFeatures",
"RatioOfCategoricalFeatures"
"RatioOfCategoricalFeatures",
"RatioOfBinaryNumericFeatures",
"RatioOfBinaryCategoricalFeatures"
],
ProblemType.ANY,
[MetafeatureGroup.SIMPLE],
{ "X": "X_raw" }
{
"X": "X_raw",
"num_binary_numeric": "NumberOfBinaryNumericFeatures",
"num_binary_categorical": "NumberOfBinaryCategoricalFeatures"
}
)


Expand Down Expand Up @@ -88,8 +102,8 @@ def get_class_stats(Y):
return (number_of_classes, *profile_distribution(probs), minority_class_size, majority_class_size)

get_class_stats = MetafeatureComputer(
computer=get_class_stats,
returns=[
get_class_stats,
[
"NumberOfClasses",
"MeanClassProbability",
"StdevClassProbability",
Expand All @@ -103,17 +117,55 @@ def get_class_stats(Y):
"MinorityClassSize",
"MajorityClassSize"
],
problem_type=ProblemType.CLASSIFICATION,
groups=[MetafeatureGroup.SIMPLE]
ProblemType.CLASSIFICATION,
[MetafeatureGroup.SIMPLE]
)


def get_categorical_cardinalities_at_values(CategoricalCardinalities):
    """Count how many cardinality entries are exactly 2, 3 and 4.

    Args:
        CategoricalCardinalities: iterable of hashable cardinality values,
            presumably one unique-value count per categorical feature
            (verify against the `CategoricalCardinalities` resource).

    Returns:
        A 3-tuple ``(count_at_2, count_at_3, count_at_4)``. A count is 0
        when no entry has that cardinality, including for empty input.
    """
    cardinality_counts = Counter(CategoricalCardinalities)
    # Counter returns 0 for keys it has never seen (without inserting them),
    # so plain indexing is enough -- no explicit default is needed.
    return cardinality_counts[2], cardinality_counts[3], cardinality_counts[4]

# Wrap the plain function in a MetafeatureComputer and rebind the name to it.
# The three returned values are published, in order, under the metafeature
# names listed below, for any problem type, in the SIMPLE group.
get_categorical_cardinalities_at_values = MetafeatureComputer(
    get_categorical_cardinalities_at_values,
    [
        "NumberOfBinaryCategoricalFeatures",
        "CategoricalCardinalityAtThree",
        "CategoricalCardinalityAtFour"
    ],
    ProblemType.ANY,
    [MetafeatureGroup.SIMPLE]
)


def get_numeric_cardinalities_at_values(NumericCardinalities):
    """Count how many cardinality entries are exactly 2, 3 and 4.

    Args:
        NumericCardinalities: iterable of hashable cardinality values,
            presumably one unique-value count per numeric feature
            (verify against the `NumericCardinalities` resource).

    Returns:
        A 3-tuple ``(count_at_2, count_at_3, count_at_4)``. A count is 0
        when no entry has that cardinality, including for empty input.
    """
    cardinality_counts = Counter(NumericCardinalities)
    # Counter returns 0 for keys it has never seen (without inserting them),
    # so plain indexing is enough -- no explicit default is needed.
    return cardinality_counts[2], cardinality_counts[3], cardinality_counts[4]

# Wrap the plain function in a MetafeatureComputer and rebind the name to it.
# The three returned values are published, in order, under the metafeature
# names listed below, for any problem type, in the SIMPLE group.
get_numeric_cardinalities_at_values = MetafeatureComputer(
    get_numeric_cardinalities_at_values,
    [
        "NumberOfBinaryNumericFeatures",
        "NumericCardinalityAtThree",
        "NumericCardinalityAtFour"
    ],
    ProblemType.ANY,
    [MetafeatureGroup.SIMPLE]
)


def get_categorical_cardinalities(X, column_types):
cardinalities = [X[feature].unique().shape[0] for feature in get_categorical_features(X, column_types)]
return profile_distribution(cardinalities)
cat_cards = [X[feature].unique().shape[0] for feature in get_categorical_features(X, column_types)]
return (cat_cards,)

get_categorical_cardinalities = MetafeatureComputer(
get_categorical_cardinalities = ResourceComputer(
get_categorical_cardinalities,
["CategoricalCardinalities"]
)


profile_categorical_cardinalities = MetafeatureComputer(
profile_distribution,
[
"MeanCardinalityOfCategoricalFeatures",
"StdevCardinalityOfCategoricalFeatures",
Expand All @@ -127,15 +179,21 @@ def get_categorical_cardinalities(X, column_types):
],
ProblemType.ANY,
[MetafeatureGroup.SIMPLE],
{ "data": "CategoricalCardinalities" }
)


def get_numeric_cardinalities(X, column_types):
cardinalities = [X[feature].unique().shape[0] for feature in get_numeric_features(X, column_types)]
return profile_distribution(cardinalities)
num_cards = [X[feature].unique().shape[0] for feature in get_numeric_features(X, column_types)]
return (num_cards,)

get_numeric_cardinalities = MetafeatureComputer(
get_numeric_cardinalities = ResourceComputer(
get_numeric_cardinalities,
["NumericCardinalities"]
)

profile_numeric_cardinalities = MetafeatureComputer(
profile_distribution,
[
"MeanCardinalityOfNumericFeatures",
"StdevCardinalityOfNumericFeatures",
Expand All @@ -148,9 +206,18 @@ def get_numeric_cardinalities(X, column_types):
"MaxCardinalityOfNumericFeatures"
],
ProblemType.ANY,
[MetafeatureGroup.SIMPLE]
[MetafeatureGroup.SIMPLE],
{ "data": "NumericCardinalities" }
)

"""
A list of all ResourceComputer
instances in this module.
"""
# Collected here so that metafeatures.py can merge these resource computers
# into its own registry via `_resources_info.update(simple_resources)`.
resources_info = build_resources_info(
    get_categorical_cardinalities,
    get_numeric_cardinalities
)

"""
A list of all MetafeatureComputer
Expand All @@ -159,8 +226,10 @@ def get_numeric_cardinalities(X, column_types):
metafeatures_info = build_resources_info(
get_dataset_stats,
get_class_stats,
get_categorical_cardinalities_at_values,
get_numeric_cardinalities_at_values,
get_dimensionality,
get_missing_values,
get_categorical_cardinalities,
get_numeric_cardinalities
profile_categorical_cardinalities,
profile_numeric_cardinalities
)
23 changes: 22 additions & 1 deletion metalearn/metafeatures/statistical_metafeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import metalearn.metafeatures.constants as consts



def get_numeric_means(numeric_features_array):
    """Profile the distribution of the per-feature means.

    Args:
        numeric_features_array: iterable of feature objects, each exposing a
            ``mean()`` method (presumably one pandas Series or array per
            numeric feature -- verify against the resource that supplies it).

    Returns:
        Whatever ``profile_distribution`` returns for the list of means.
    """
    feature_means = [feature.mean() for feature in numeric_features_array]
    return profile_distribution(feature_means)
Expand Down Expand Up @@ -213,6 +214,25 @@ def preprocess(series):

return correlations

def autocorrelation(Y, column_types):
    """Compute an autocorrelation-style measure of the target column.

    Args:
        Y: the target column. The code relies on ``size``, ``name``,
            ``values`` and ``autocorr()``, i.e. a pandas Series-like object.
        column_types: mapping from column name to its type constant; it is
            looked up with ``Y.name`` only when ``Y`` has two or more values.

    Returns:
        A 1-tuple ``(ac,)`` where ``ac`` is:
          * ``None`` when ``Y`` is empty or its column type is neither
            ``consts.NUMERIC`` nor ``consts.CATEGORICAL``;
          * ``1.0`` when ``Y`` has exactly one value;
          * ``Y.autocorr()`` for a numeric target;
          * the fraction of consecutive value pairs that are equal for a
            categorical target.

    Raises:
        KeyError: if ``Y`` has two or more values and ``Y.name`` is missing
            from ``column_types``.
    """
    if Y.size == 0:
        return (None,)
    if Y.size == 1:
        return (1.0,)

    target_type = column_types[Y.name]
    if target_type == consts.NUMERIC:
        return (Y.autocorr(),)
    if target_type == consts.CATEGORICAL:
        # Slice the underlying array rather than the Series so the shift is
        # purely positional, whatever index the Series carries.
        values = Y.values
        return (np.equal(values[:-1], values[1:]).mean(),)
    return (None,)

# Wrap the plain function in a MetafeatureComputer and rebind the name to it.
# Its single returned value is published as the "Autocorrelation"
# metafeature, for any problem type, in the STATISTICAL group.
autocorrelation = MetafeatureComputer(
    autocorrelation,
    ["Autocorrelation"],
    ProblemType.ANY,
    [MetafeatureGroup.STATISTICAL]
)

"""
A list of all MetafeatureComputer
Expand All @@ -223,5 +243,6 @@ def preprocess(series):
get_numeric_stdev,
get_numeric_skewness,
get_numeric_kurtosis,
get_pca
get_pca,
autocorrelation
)
29 changes: 28 additions & 1 deletion tests/data/dataset_metafeatures/38_sick_train_data_mf.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
{
"Autocorrelation": {
"value": 0.8865022540440202
},
"CategoricalCardinalityAtFour": {
"value": 0
},
"CategoricalCardinalityAtThree": {
"value": 1
},
"CategoricalNoiseToSignalRatio": {
"value": 65.12375165878576
},
Expand Down Expand Up @@ -290,6 +299,12 @@
"NaiveBayesKappa": {
"value": 0.04421998456446913
},
"NumberOfBinaryCategoricalFeatures": {
"value": 19
},
"NumberOfBinaryNumericFeatures": {
"value": 0
},
"NumberOfCategoricalFeatures": {
"value": 22
},
Expand Down Expand Up @@ -323,6 +338,12 @@
"NumberOfTokensContainingNumericChar": {
"value": 0
},
"NumericCardinalityAtFour": {
"value": 0
},
"NumericCardinalityAtThree": {
"value": 0
},
"NumericNoiseToSignalRatio": {
"value": 43.499206497970064
},
Expand Down Expand Up @@ -545,6 +566,12 @@
"RandomTreeDepth3Kappa": {
"value": 0.16934246039579826
},
"RatioOfBinaryCategoricalFeatures": {
"value": 0.6551724137931034
},
"RatioOfBinaryNumericFeatures": {
"value": 0.0
},
"RatioOfCategoricalFeatures": {
"value": 0.7586206896551724
},
Expand Down Expand Up @@ -692,4 +719,4 @@
"kNN1NKappa": {
"value": 0.18924686295501464
}
}
}
29 changes: 28 additions & 1 deletion tests/data/dataset_metafeatures/small_test_dataset_mf.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
{
"Autocorrelation": {
"value": 0.6666666666666666
},
"CategoricalCardinalityAtFour": {
"value": 0
},
"CategoricalCardinalityAtThree": {
"value": 0
},
"CategoricalNoiseToSignalRatio": {
"value": 1.0834895643494507
},
Expand Down Expand Up @@ -290,6 +299,12 @@
"NaiveBayesKappa": {
"value": 0.1875
},
"NumberOfBinaryCategoricalFeatures": {
"value": 1
},
"NumberOfBinaryNumericFeatures": {
"value": 0
},
"NumberOfCategoricalFeatures": {
"value": 2
},
Expand Down Expand Up @@ -323,6 +338,12 @@
"NumberOfTokensContainingNumericChar": {
"value": 0
},
"NumericCardinalityAtFour": {
"value": 0
},
"NumericCardinalityAtThree": {
"value": 0
},
"NumericNoiseToSignalRatio": {
"value": 0.8185194089132799
},
Expand Down Expand Up @@ -545,6 +566,12 @@
"RandomTreeDepth3Kappa": {
"value": 0.11764705882352944
},
"RatioOfBinaryCategoricalFeatures": {
"value": 0.2
},
"RatioOfBinaryNumericFeatures": {
"value": 0.0
},
"RatioOfCategoricalFeatures": {
"value": 0.4
},
Expand Down Expand Up @@ -692,4 +719,4 @@
"kNN1NKappa": {
"value": 0.20833333333333337
}
}
}
Loading