updating docstrings to include information about how keys are chosen, indexes created, and updated README
allisonportis committed Aug 9, 2019
1 parent 1184162 commit 0c55fef
Showing 7 changed files with 65 additions and 379 deletions.
9 changes: 6 additions & 3 deletions README.md
@@ -60,9 +60,12 @@ Returns:
```shell
normalize_dataframe(df, dependencies)
```
Normalizes dataframe based on the dependencies given.
Normalizes dataframe based on the dependencies given. Keys for the newly created DataFrames can only be columns that are strings, ints, or categories. Keys are chosen according to the following priority:
1) shortest length
2) has "id" in some form in the name of an attribute
3) has the attribute furthest to the left in the table

Returns:
Returns:

`new_dfs` (list[pd.DataFrame]) : list of new dataframes
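
For illustration, a minimal usage sketch of the key-selection behaviour described above; the dataframe, its column names, and the expected split are hypothetical, not examples shipped with the package:

```python
import pandas as pd
import autonormalize as an

# Hypothetical denormalized table: customer details repeat on every order.
df = pd.DataFrame({
    "order_id": [1, 2, 3, 4],
    "customer_id": [10, 10, 20, 20],
    "customer_name": ["Ann", "Ann", "Bo", "Bo"],
    "total": [5.0, 7.5, 3.0, 9.0],
})

deps = an.find_dependencies(df, accuracy=1, index="order_id")
new_dfs = an.normalize_dataframe(df, deps)

# Illustrative expectation: an orders table keyed on order_id, plus a customer
# table keyed on customer_id (preferred over customer_name because both are
# single-attribute candidate keys and "id" appears in its name).
for new_df in new_dfs:
    print(new_df, "\n")
```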

@@ -71,7 +74,7 @@ Returns:
```shell
make_entityset(df, dependencies, name=None, time_index=None):
```
Creates a normalized EntitySet from dataframe based on the dependencies given.
Creates a normalized EntitySet from dataframe based on the dependencies given. Keys are chosen in the same fashion as for `normalize_dataframe`, and a new index will be created if any key has more than a single attribute.

Returns:

Binary file modified autonormalize/.DS_Store
Binary file not shown.
28 changes: 13 additions & 15 deletions autonormalize/autonormalize.py
@@ -4,7 +4,7 @@
from .classes import Dependencies


def find_dependencies(df, accuracy=0.98, rep_percent=0.85, index=None):
def find_dependencies(df, accuracy=0.98, index=None):
"""
Finds dependencies within dataframe df with the DFD search algorithm.
Returns the dependencies as a Dependencies object.
@@ -17,19 +17,14 @@ def find_dependencies(df, accuracy=0.98, rep_percent=0.85, index=None):
required in order to conclude a dependency (i.e. with accuracy = 0.98,
0.98 of the rows must hold true the dependency LHS --> RHS)
rep_percent (0 < float <= 1.00; default = 0.85) : the maximum amount of
data that may be unique in order to determine a dependency (i.e. with
rep_percent = 0.85, if less than 15% of rows are repeated for the columns
in LHS + RHS, no dependency will be concluded.)
index (str, optional) : name of column that is intended index of df
Returns:
dependencies (Dependencies) : the dependencies found in the data
within the constraints provided
"""
deps = Dependencies(dfd.dfd(df, accuracy, rep_percent, index))
deps = Dependencies(dfd.dfd(df, accuracy, index))
if index is None:
prim_key = normalize.choose_index(deps.find_candidate_keys(), df)
deps.set_prim_key(prim_key)
@@ -57,7 +52,11 @@ def normalize_dependencies(df, dependencies):

def normalize_dataframe(df, dependencies):
"""
Normalizes a dataframe based on the dependencies given.
Normalizes a dataframe based on the dependencies given. Keys for the newly
created DataFrames can only be columns that are strings, ints, or
categories. Keys are chosen according to the priority:
1) shortest length 2) has "id" in some form in the name of an attribute
3) has the attribute furthest to the left in the table
Arguments:
df (pd.DataFrame) : dataframe to split up
@@ -74,6 +73,10 @@ def normalize_dataframe(df, dependencies):
def make_entityset(df, dependencies, name=None, time_index=None):
"""
Creates a normalized EntitySet from df based on the dependencies given.
Keys for the newly created DataFrames can only be columns that are strings,
ints, or categories. Keys are chosen according to the priority:
1) shortest length 2) has "id" in some form in the name of an attribute
3) has the attribute furthest to the left in the table
Arguments:
df (pd.DataFrame) : dataframe to normalize and make entity set from
@@ -107,7 +110,7 @@ def make_entityset(df, dependencies, name=None, time_index=None):
return ft.EntitySet(name, entities, relationships)


def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, time_index=None):
def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
"""
Creates a normalized entityset from a dataframe.
@@ -119,11 +122,6 @@ def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, t
required in order to conclude a dependency (i.e. with accuracy = 0.98,
0.98 of the rows must hold true the dependency LHS --> RHS)
rep_percent (0 < float <= 1.00; default = 0.85) : the maximum amount of
data that may be unique in order to determine a dependency (i.e. with
rep_percent = 0.85, if less than 15% of rows are repeated for the columns
in LHS + RHS, no dependency will be concluded.)
index (str, optional) : name of column that is intended index of df
name (str, optional) : the name of created EntitySet
@@ -134,7 +132,7 @@ def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, t
entityset (ft.EntitySet) : created entity set
"""
return make_entityset(df, find_dependencies(df, accuracy, rep_percent, index), name, time_index)
return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)


def auto_normalize(df):
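
With `rep_percent` removed from these signatures, `accuracy` (and optionally `index`) is the only tuning left at the call site. A hedged sketch of the updated call pattern on a made-up table:

```python
import pandas as pd
import autonormalize as an

# Made-up table, not one of the repository demos.
df = pd.DataFrame({
    "id": [0, 1, 2, 3],
    "team": ["tigers", "tigers", "bears", "bears"],
    "city": ["Boston", "Boston", "Denver", "Denver"],
})

# rep_percent is no longer accepted; accuracy alone controls how strictly
# LHS --> RHS must hold before a dependency is concluded.
deps = an.find_dependencies(df, accuracy=1, index="id")
es = an.auto_entityset(df, accuracy=1, index="id", name="example")
print(es)
```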
50 changes: 29 additions & 21 deletions autonormalize/demos/AutoNormalize + FeatureTools Demo.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -39,7 +39,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {
"scrolled": true
},
@@ -146,7 +146,7 @@
"2 1973-07-28 A "
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -174,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -254,7 +254,7 @@
"3 1 2014-01-03 18:39:30 2834.44"
]
},
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -278,7 +278,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -351,7 +351,7 @@
"3 1 2014-01-03 17:39:30 True"
]
},
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -364,7 +364,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 5,
"metadata": {
"scrolled": false
},
@@ -419,7 +419,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 6,
"metadata": {
"scrolled": true
},
@@ -428,7 +428,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:13<00:00, 1.41s/it]\n"
"100%|██████████| 10/10 [00:01<00:00, 7.11it/s]\n"
]
},
{
@@ -449,7 +449,7 @@
}
],
"source": [
"es = an.auto_entityset(transaction_df, name=\"transactions\", time_index='transaction_time')\n",
"es = an.auto_entityset(transaction_df, accuracy=1, name=\"transactions\", time_index='transaction_time')\n",
"es.add_last_time_indexes()\n",
"print(es)"
]
@@ -643,15 +643,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"*write*\n",
"\n",
"\n",
"\n",
"In the feature matrix, let’s extract the labels and fill any missing values with zeros. Then, one-hot encode all categorical features by using encode_features().\n",
"\n",
"After preprocessing, we split the features and corresponding labels each into training and testing sets."
"Now we preprocess our features, and split the features and corresponding labels into training and testing sets."
]
},
{
@@ -672,6 +664,13 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, we train a random forest classifer on the training set, and then test the models performance by evaluating predictions on the testing set."
]
},
{
"cell_type": "code",
"execution_count": 10,
@@ -702,7 +701,9 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
@@ -725,6 +726,13 @@
"print(classification_report(y_test, y_hat))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This plot is based on scores obtained by the model to illustrate which features are considered important for predictions."
]
},
{
"cell_type": "code",
"execution_count": 14,
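
The markdown cells added above summarize the modeling steps; the sketch below shows one plausible shape for them. The variables `feature_matrix`, `features`, and `labels` are assumed to come from earlier cells of the notebook and are not reproduced here:

```python
import featuretools as ft
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# One-hot encode categorical features and fill missing values with zeros.
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)
fm_encoded = fm_encoded.fillna(0)

# Split the features and corresponding labels into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    fm_encoded, labels, test_size=0.25, random_state=0)

# Train a random forest classifier, then evaluate predictions on the test set.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
```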
29 changes: 7 additions & 22 deletions autonormalize/dfd.py
@@ -10,7 +10,7 @@
# run script.py to see a couple examples


def dfd(df, accuracy, rep_percent, index=None):
def dfd(df, accuracy, index=None):
"""
Main loop of DFD algorithm. It returns all the dependencies represented
in the data in dataframe df. Refer to section 3.2 of the paper for literature.
@@ -28,11 +28,6 @@ def dfd(df, accuracy, rep_percent, index=None):
to conclude a dependency (i.e. with accuracy = 0.98, 0.98 of the rows
must hold true the dependency LHS --> RHS)
rep_percent (0 < float <= 1.00) : the maximum amount of
data that may be unique in order to determine a dependency (i.e. with
rep_percent = 0.85, if less than 15% of rows are repeated for the columns
in LHS + RHS, no dependency will be concluded.)
Returns:
minimal_dependencies (DfdDependencies) : the minimal dependencies
@@ -49,12 +44,12 @@ def dfd(df, accuracy, rep_percent, index=None):
non_uniq.remove(i)
dependencies.add_unique_lhs(i)
for i in tqdm(non_uniq):
lhss = find_LHSs(i, non_uniq, df, partitions, accuracy, masks, rep_percent)
lhss = find_LHSs(i, non_uniq, df, partitions, accuracy, masks)
dependencies.add_LHSs(i, lhss)
return dependencies


def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
def find_LHSs(rhs, attrs, df, partitions, accuracy, masks):
"""
Finds all LHS sets of attributes that satisfy a dependency relation for the
RHS attribute i. This is such that LHS --> RHS.
@@ -76,11 +71,6 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
masks (Masks) : contains past calculated masks
rep_percent (0 < float <= 1.00) : the maximum amount of data that may be
unique in order to determine a dependency (i.e. with rep_percent = 0.85,
if less than 15% of rows are repeated for the columns in LHS + RHS, no
dependency will be concluded.)
Returns:
lhss (LHSs) : all the LHS that determine rhs
"""
@@ -106,7 +96,7 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
else:
node.infer_type()
if node.category == 0:
if compute_partitions(df, rhs, node.attrs, partitions, accuracy, masks, rep_percent):
if compute_partitions(df, rhs, node.attrs, partitions, accuracy, masks):
if node.is_minimal():
min_deps.add_dep(node.attrs)
node.category = 2
@@ -296,7 +286,7 @@ def generate_next_seeds(max_non_deps, min_deps, lhs_attrs):
return list(seeds)


def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks, rep_percent):
def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks):
"""
Returns true if lhs_set --> rhs for dataframe df.
@@ -318,17 +308,12 @@ def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks, rep_percen
masks (Masks) : contains past calculated masks
rep_percent (0 < float <= 1.00) : the maximum amount of data that may be
unique in order to determine a dependency (i.e. with rep_percent = 0.85,
if less than 15% of rows are repeated for the columns in LHS + RHS, no
dependency will be concluded.)
Returns:
is_dependency (bool) : True if is a dependency, false otherwise
"""
# for approximate dependencies see TANE section 2.3
if accuracy < 1:
return approximate_dependencies(list(lhs_set), rhs, df, accuracy, masks, rep_percent)
return approximate_dependencies(list(lhs_set), rhs, df, accuracy, masks)
part_rhs = partition(lhs_set.union(set([rhs])), df, partitions)
# if part_rhs > df.shape[0] * rep_percent:
# return False
@@ -347,7 +332,7 @@ def partition(attrs, df, partitions):
return shape


def approximate_dependencies(lhs_set, rhs, df, accuracy, masks, rep_percent):
def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
"""
Checks whether the columns represented in lhs_set functionally determine the column rhs
for the dataframe df.
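
For context on the `accuracy` threshold referenced throughout this file, here is a minimal, standalone sketch of what an approximate dependency check means; it is an illustration of the idea in the docstrings, not the library's implementation:

```python
import pandas as pd

def holds_approximately(df, lhs, rhs, accuracy=0.98):
    # LHS --> RHS holds approximately if at least `accuracy` of the rows agree
    # with the most common RHS value within their LHS group.
    agreeing = df.groupby(list(lhs))[rhs].agg(lambda s: s.value_counts().iloc[0]).sum()
    return agreeing / len(df) >= accuracy

df = pd.DataFrame({"team": ["A", "A", "A", "B"],
                   "city": ["NY", "NY", "LA", "SF"]})
print(holds_approximately(df, ["team"], "city", accuracy=0.75))  # True: 3 of 4 rows agree
print(holds_approximately(df, ["team"], "city", accuracy=0.98))  # False
```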
