updating docstrings to include information about how keys are chosen, indexes created, and updated README
allisonportis committed Aug 9, 2019
1 parent 1184162 commit 0c55fef
Showing 7 changed files with 65 additions and 379 deletions.
9 changes: 6 additions & 3 deletions README.md
@@ -60,9 +60,12 @@ Returns:
```shell
normalize_dataframe(df, dependencies)
```
Normalizes dataframe based on the dependencies given.
Normalizes dataframe based on the dependencies given. Keys for the newly created DataFrames can only be columns that are strings, ints, or categories. Keys are chosen according to the following priority:
1) shortest length
2) has "id" in some form in the name of an attribute
3) has the attribute furthest to the left in the table

Returns:
Returns:

`new_dfs` (list[pd.DataFrame]) : list of new dataframes
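
For illustration, a minimal usage sketch of the key-selection behaviour described above; the dataframe, its column names, and the expected split are hypothetical, not examples shipped with the package:

```python
import pandas as pd
import autonormalize as an

# Hypothetical denormalized table: customer details repeat on every order.
df = pd.DataFrame({
    "order_id": [1, 2, 3, 4],
    "customer_id": [10, 10, 20, 20],
    "customer_name": ["Ann", "Ann", "Bo", "Bo"],
    "total": [5.0, 7.5, 3.0, 9.0],
})

deps = an.find_dependencies(df, accuracy=1, index="order_id")
new_dfs = an.normalize_dataframe(df, deps)

# Illustrative expectation: an orders table keyed on order_id, plus a customer
# table keyed on customer_id (preferred over customer_name because both are
# single-attribute candidate keys and "id" appears in its name).
for new_df in new_dfs:
    print(new_df, "\n")
```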

@@ -71,7 +74,7 @@ Returns:
```shell
make_entityset(df, dependencies, name=None, time_index=None):
```
Creates a normalized EntitySet from dataframe based on the dependencies given.
Creates a normalized EntitySet from dataframe based on the dependencies given. Keys are chosen in the same fashion as for `normalize_dataframe`, and a new index will be created if any key has more than a single attribute.

Returns:

Binary file modified autonormalize/.DS_Store
Binary file not shown.
28 changes: 13 additions & 15 deletions autonormalize/autonormalize.py
@@ -4,7 +4,7 @@
from .classes import Dependencies


def find_dependencies(df, accuracy=0.98, rep_percent=0.85, index=None):
def find_dependencies(df, accuracy=0.98, index=None):
"""
Finds dependencies within dataframe df with the DFD search algorithm.
Returns the dependencies as a Dependencies object.
@@ -17,19 +17,14 @@ def find_dependencies(df, accuracy=0.98, rep_percent=0.85, index=None):
required in order to conclude a dependency (i.e. with accuracy = 0.98,
0.98 of the rows must hold true the dependency LHS --> RHS)
rep_percent (0 < float <= 1.00; default = 0.85) : the maximum amount of
data that may be unique in order to determine a dependency (i.e. with
rep_percent = 0.85, if less than 15% of rows are repeated for the columns
in LHS + RHS, no dependency will be concluded.)
index (str, optional) : name of column that is intended index of df
Returns:
dependencies (Dependencies) : the dependencies found in the data
within the constraints provided
"""
deps = Dependencies(dfd.dfd(df, accuracy, rep_percent, index))
deps = Dependencies(dfd.dfd(df, accuracy, index))
if index is None:
prim_key = normalize.choose_index(deps.find_candidate_keys(), df)
deps.set_prim_key(prim_key)
@@ -57,7 +52,11 @@ def normalize_dependencies(df, dependencies):

def normalize_dataframe(df, dependencies):
"""
Normalizes a dataframe based on the dependencies given.
Normalizes a dataframe based on the dependencies given. Keys for the newly
created DataFrames can only be columns that are strings, ints, or
categories. Keys are chosen according to the priority:
1) shortest length 2) has "id" in some form in the name of an attribute
3) has the attribute furthest to the left in the table
Arguments:
df (pd.DataFrame) : dataframe to split up
@@ -74,6 +73,10 @@ def normalize_dataframe(df, dependencies):
def make_entityset(df, dependencies, name=None, time_index=None):
"""
Creates a normalized EntitySet from df based on the dependencies given.
Keys for the newly created DataFrames can only be columns that are strings,
ints, or categories. Keys are chosen according to the priority:
1) shortest length 2) has "id" in some form in the name of an attribute
3) has the attribute furthest to the left in the table
Arguments:
df (pd.DataFrame) : dataframe to normalize and make entity set from
@@ -107,7 +110,7 @@ def make_entityset(df, dependencies, name=None, time_index=None):
return ft.EntitySet(name, entities, relationships)


def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, time_index=None):
def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
"""
Creates a normalized entityset from a dataframe.
@@ -119,11 +122,6 @@ def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, t
required in order to conclude a dependency (i.e. with accuracy = 0.98,
0.98 of the rows must hold true the dependency LHS --> RHS)
rep_percent (0 < float <= 1.00; default = 0.85) : the maximum amount of
data that may be unique in order to determine a dependency (i.e. with
rep_percent = 0.85, if less than 15% of rows are repeated for the columns
in LHS + RHS, no dependency will be concluded.)
index (str, optional) : name of column that is intended index of df
name (str, optional) : the name of created EntitySet
@@ -134,7 +132,7 @@ def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, t
entityset (ft.EntitySet) : created entity set
"""
return make_entityset(df, find_dependencies(df, accuracy, rep_percent, index), name, time_index)
return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)


def auto_normalize(df):
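
With `rep_percent` removed from these signatures, `accuracy` (and optionally `index`) is the only tuning left at the call site. A hedged sketch of the updated call pattern on a made-up table:

```python
import pandas as pd
import autonormalize as an

# Made-up table, not one of the repository demos.
df = pd.DataFrame({
    "id": [0, 1, 2, 3],
    "team": ["tigers", "tigers", "bears", "bears"],
    "city": ["Boston", "Boston", "Denver", "Denver"],
})

# rep_percent is no longer accepted; accuracy alone controls how strictly
# LHS --> RHS must hold before a dependency is concluded.
deps = an.find_dependencies(df, accuracy=1, index="id")
es = an.auto_entityset(df, accuracy=1, index="id", name="example")
print(es)
```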
50 changes: 29 additions & 21 deletions autonormalize/demos/AutoNormalize + FeatureTools Demo.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -39,7 +39,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {
"scrolled": true
},
@@ -146,7 +146,7 @@
"2 1973-07-28 A "
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -174,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -254,7 +254,7 @@
"3 1 2014-01-03 18:39:30 2834.44"
]
},
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -278,7 +278,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -351,7 +351,7 @@
"3 1 2014-01-03 17:39:30 True"
]
},
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -364,7 +364,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 5,
"metadata": {
"scrolled": false
},
@@ -419,7 +419,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 6,
"metadata": {
"scrolled": true
},
@@ -428,7 +428,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:13<00:00, 1.41s/it]\n"
"100%|██████████| 10/10 [00:01<00:00, 7.11it/s]\n"
]
},
{
@@ -449,7 +449,7 @@
}
],
"source": [
"es = an.auto_entityset(transaction_df, name=\"transactions\", time_index='transaction_time')\n",
"es = an.auto_entityset(transaction_df, accuracy=1, name=\"transactions\", time_index='transaction_time')\n",
"es.add_last_time_indexes()\n",
"print(es)"
]
@@ -643,15 +643,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"*write*\n",
"\n",
"\n",
"\n",
"In the feature matrix, let’s extract the labels and fill any missing values with zeros. Then, one-hot encode all categorical features by using encode_features().\n",
"\n",
"After preprocessing, we split the features and corresponding labels each into training and testing sets."
"Now we preprocess our features, and split the features and corresponding labels into training and testing sets."
]
},
{
@@ -672,6 +664,13 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, we train a random forest classifer on the training set, and then test the models performance by evaluating predictions on the testing set."
]
},
{
"cell_type": "code",
"execution_count": 10,
@@ -702,7 +701,9 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
@@ -725,6 +726,13 @@
"print(classification_report(y_test, y_hat))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This plot is based on scores obtained by the model to illustrate which features are considered important for predictions."
]
},
{
"cell_type": "code",
"execution_count": 14,
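
The markdown cells added above summarize the modeling steps; the sketch below shows one plausible shape for them. The variables `feature_matrix`, `features`, and `labels` are assumed to come from earlier cells of the notebook and are not reproduced here:

```python
import featuretools as ft
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# One-hot encode categorical features and fill missing values with zeros.
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)
fm_encoded = fm_encoded.fillna(0)

# Split the features and corresponding labels into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    fm_encoded, labels, test_size=0.25, random_state=0)

# Train a random forest classifier, then evaluate predictions on the test set.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
```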
29 changes: 7 additions & 22 deletions autonormalize/dfd.py
@@ -10,7 +10,7 @@
# run script.py to see a couple examples


def dfd(df, accuracy, rep_percent, index=None):
def dfd(df, accuracy, index=None):
"""
Main loop of DFD algorithm. It returns all the dependencies represented
in the data in dataframe df. Refer to section 3.2 of the paper for literature.
@@ -28,11 +28,6 @@ def dfd(df, accuracy, rep_percent, index=None):
to conclude a dependency (i.e. with accuracy = 0.98, 0.98 of the rows
must hold true the dependency LHS --> RHS)
rep_percent (0 < float <= 1.00) : the maximum amount of
data that may be unique in order to determine a dependency (i.e. with
rep_percent = 0.85, if less than 15% of rows are repeated for the columns
in LHS + RHS, no dependency will be concluded.)
Returns:
minimal_dependencies (DfdDependencies) : the minimal dependencies
@@ -49,12 +44,12 @@ def dfd(df, accuracy, rep_percent, index=None):
non_uniq.remove(i)
dependencies.add_unique_lhs(i)
for i in tqdm(non_uniq):
lhss = find_LHSs(i, non_uniq, df, partitions, accuracy, masks, rep_percent)
lhss = find_LHSs(i, non_uniq, df, partitions, accuracy, masks)
dependencies.add_LHSs(i, lhss)
return dependencies


def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
def find_LHSs(rhs, attrs, df, partitions, accuracy, masks):
"""
Finds all LHS sets of attributes that satisfy a dependency relation for the
RHS attribute i. This is such that LHS --> RHS.
@@ -76,11 +71,6 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
masks (Masks) : contains past calculated masks
rep_percent (0 < float <= 1.00) : the maximum amount of data that may be
unique in order to determine a dependency (i.e. with rep_percent = 0.85,
if less than 15% of rows are repeated for the columns in LHS + RHS, no
dependency will be concluded.)
Returns:
lhss (LHSs) : all the LHS that determine rhs
"""
@@ -106,7 +96,7 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
else:
node.infer_type()
if node.category == 0:
if compute_partitions(df, rhs, node.attrs, partitions, accuracy, masks, rep_percent):
if compute_partitions(df, rhs, node.attrs, partitions, accuracy, masks):
if node.is_minimal():
min_deps.add_dep(node.attrs)
node.category = 2
@@ -296,7 +286,7 @@ def generate_next_seeds(max_non_deps, min_deps, lhs_attrs):
return list(seeds)


def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks, rep_percent):
def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks):
"""
Returns true if lhs_set --> rhs for dataframe df.
@@ -318,17 +308,12 @@ def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks, rep_percen
masks (Masks) : contains past calculated masks
rep_percent (0 < float <= 1.00) : the maximum amount of data that may be
unique in order to determine a dependency (i.e. with rep_percent = 0.85,
if less than 15% of rows are repeated for the columns in LHS + RHS, no
dependency will be concluded.)
Returns:
is_dependency (bool) : True if is a dependency, false otherwise
"""
# for approximate dependencies see TANE section 2.3
if accuracy < 1:
return approximate_dependencies(list(lhs_set), rhs, df, accuracy, masks, rep_percent)
return approximate_dependencies(list(lhs_set), rhs, df, accuracy, masks)
part_rhs = partition(lhs_set.union(set([rhs])), df, partitions)
# if part_rhs > df.shape[0] * rep_percent:
# return False
@@ -347,7 +332,7 @@ def partition(attrs, df, partitions):
return shape


def approximate_dependencies(lhs_set, rhs, df, accuracy, masks, rep_percent):
def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
"""
Checks whether the columns represented in lhs_set functionally determine the column rhs
for the dataframe df.
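
For context on the `accuracy` threshold referenced throughout this file, here is a minimal, standalone sketch of what an approximate dependency check means; it is an illustration of the idea in the docstrings, not the library's implementation:

```python
import pandas as pd

def holds_approximately(df, lhs, rhs, accuracy=0.98):
    # LHS --> RHS holds approximately if at least `accuracy` of the rows agree
    # with the most common RHS value within their LHS group.
    agreeing = df.groupby(list(lhs))[rhs].agg(lambda s: s.value_counts().iloc[0]).sum()
    return agreeing / len(df) >= accuracy

df = pd.DataFrame({"team": ["A", "A", "A", "B"],
                   "city": ["NY", "NY", "LA", "SF"]})
print(holds_approximately(df, ["team"], "city", accuracy=0.75))  # True: 3 of 4 rows agree
print(holds_approximately(df, ["team"], "city", accuracy=0.98))  # False
```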
