Skip to content

Commit

Permalink
Merge pull request #38 from VIDA-NYU/automatic-formatter
Browse files Browse the repository at this point in the history
Automatic formatting using black
  • Loading branch information
aecio authored Jun 3, 2024
2 parents 2c1fedb + 9a42107 commit 84dd3b5
Show file tree
Hide file tree
Showing 21 changed files with 468 additions and 327 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
name: Lint

on: [push, pull_request]

jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable
with:
src: ./bdikit/
12 changes: 10 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
all: test
SRC := ./bdikit/

PHONY: test
all: lint test

PHONY: format test lint

lint:
black --check ${SRC}

test:
python3 -m pytest

format:
black ${SRC}
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,25 @@ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel
## Documentation
See our examples [here](https://github.com/VIDA-NYU/bdi-kit/tree/devel/examples).

## Contributing
We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
The CI runs for every pull request and will fail if the code is not properly formatted.
To make sure formatting is correct, you can follow the steps below.

Make sure you have black installed:
```
pip install black
```

To format the code, you can run the following command before committing your changes:
```
make format
```

Or you can use the black command directly:
```
black ./bdikit/
```

## Folder Structure

Expand Down
4 changes: 2 additions & 2 deletions bdikit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__version__ = '0.2.0.dev0'
__version__ = "0.2.0.dev0"
# To shortcut the import path
from bdikit.api import APIManager
from bdikit.api import APIManager
66 changes: 44 additions & 22 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@
from bdikit.mapping_recommendation.scope_reducing_manager import ScopeReducingManager
from bdikit.mapping_recommendation.value_mapping_manager import ValueMappingManager
from bdikit.mapping_recommendation.column_mapping_manager import ColumnMappingManager
from bdikit.visualization.mappings import plot_reduce_scope, plot_column_mappings, plot_value_mappings
from bdikit.visualization.mappings import (
plot_reduce_scope,
plot_column_mappings,
plot_value_mappings,
)
from bdikit.utils import get_gdc_data
from os.path import join, dirname
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages

GDC_DATA_PATH = join(dirname(__file__), './resource/gdc_table.csv')
GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")


class APIManager():

def __init__(self,):
class APIManager:
def __init__(
self,
):
# TODO: move into database object (in data_ingestion folder)
self.dataset = None
# TODO: move into database object (in data_ingestion folder)
Expand All @@ -23,8 +28,8 @@ def __init__(self,):
self.reduced_scope = None
self.column_manager = None
self.value_manager = None
self.column_mappings = None # TODO move this to a property in column_manager
self.value_mappings = None # TODO move this to a property in value_manager
self.column_mappings = None # TODO move this to a property in column_manager
self.value_mappings = None # TODO move this to a property in value_manager

def load_global_table(self, global_table_path=None):
if global_table_path is None:
Expand All @@ -45,41 +50,58 @@ def reduce_scope(self):
self.reduced_scope = self.scope_manager.reduce()
plot_reduce_scope(self.reduced_scope, self.dataset)

def map_columns(self, algorithm='SimFloodAlgorithm'):
self.column_manager = ColumnMappingManager(self.dataset, self.global_table, algorithm)
def map_columns(self, algorithm="SimFloodAlgorithm"):
self.column_manager = ColumnMappingManager(
self.dataset, self.global_table, algorithm
)
self.column_manager.reduced_scope = self.reduced_scope
self.column_mappings = self.column_manager.map()
plot_column_mappings(self.column_mappings)

return self.column_mappings

def map_values(self, algorithm='EditAlgorithm'):
def map_values(self, algorithm="EditAlgorithm"):
self.global_table_all = get_gdc_data(self.column_mappings.values())
self.value_manager = ValueMappingManager(self.dataset, self.column_mappings, self.global_table_all, algorithm)
self.value_manager = ValueMappingManager(
self.dataset, self.column_mappings, self.global_table_all, algorithm
)
self.value_mappings = self.value_manager.map()
plot_value_mappings(self.value_mappings)

return self.value_mappings

def update_reduced_scope(self, original_column, new_candidate_name, new_candidate_sim=1.0):
def update_reduced_scope(
self, original_column, new_candidate_name, new_candidate_sim=1.0
):
for index in range(len(self.reduced_scope)):
if self.reduced_scope[index]['Candidate column'] == original_column:
self.reduced_scope[index]['Top k columns'].append((new_candidate_name, new_candidate_sim))
print('Reduced scope updated!')
if self.reduced_scope[index]["Candidate column"] == original_column:
self.reduced_scope[index]["Top k columns"].append(
(new_candidate_name, new_candidate_sim)
)
print("Reduced scope updated!")
plot_reduce_scope(self.reduced_scope)
break

def update_column_mappings(self, new_mappings):
for original_column, new_target_column in new_mappings:
self.column_mappings[original_column] = new_target_column

print('Column mapping updated!')
print("Column mapping updated!")
plot_column_mappings(self.column_mappings)

def update_value_mappings(self, original_column, original_value, new_target_value, new_similarity=1.0):
for index in range(len(self.value_mappings[original_column]['matches'])):
if self.value_mappings[original_column]['matches'][index][0] == original_value:
self.value_mappings[original_column]['matches'][index] = (original_value, new_target_value, new_similarity)
print('Value mapping updated!')
def update_value_mappings(
self, original_column, original_value, new_target_value, new_similarity=1.0
):
for index in range(len(self.value_mappings[original_column]["matches"])):
if (
self.value_mappings[original_column]["matches"][index][0]
== original_value
):
self.value_mappings[original_column]["matches"][index] = (
original_value,
new_target_value,
new_similarity,
)
print("Value mapping updated!")
plot_value_mappings(self.value_mappings)
break
34 changes: 18 additions & 16 deletions bdikit/data_ingestion/column.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,43 @@
from enum import Enum


class ColumnType(Enum):
STRING = 'string'
FLOAT = 'float'
INTEGER = 'integer'
# TODO semantic types?
STRING = "string"
FLOAT = "float"
INTEGER = "integer"
# TODO semantic types?


class Column:
def __init__(self, df_name, column_name, column_type=ColumnType.STRING, domain_values=None, null_values_representations=None):
def __init__(
self,
df_name,
column_name,
column_type=ColumnType.STRING,
domain_values=None,
null_values_representations=None,
):
self.df_name = df_name
self.column_name = column_name
self.column_type = column_type

if domain_values is None:
self.domain_values = set()
else:
self.domain_values = set(domain_values)

if null_values_representations is None:
self.null_values_representations = set()
else:
self.null_values_representations = set(null_values_representations)



def __str__(self):
return f"Column(df_name={self.df_name}, column_name={self.column_name}, column_type={self.column_type}, domain_values={self.domain_values}, null_values_representations={self.null_values_representations})"

def __eq__(self, value):
if not isinstance(value, Column):
return False
return self.df_name == value.df_name and self.column_name == value.column_name

def __hash__(self):
return hash((self.df_name, self.column_name))





12 changes: 6 additions & 6 deletions bdikit/data_ingestion/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from .column import Column, ColumnType


class Database:
"""
A class representing a database that stores dataframes.
Expand All @@ -14,7 +15,7 @@ class Database:
load_data(df_name, file_path): Load data from a CSV file into a dataframe and store it in the database.
load_data_from_folder(folder_path): Load data from all CSV files in a folder.
get_dataframe(df_name): Retrieve a dataframe by its name.
get_dataframe_names(): Get the names of all dataframes stored in the database.
get_dataframe_names(): Get the names of all dataframes stored in the database.
describe_database(): Print out the names, shape, columns, and head of all dataframes stored in the database.
"""

Expand All @@ -32,7 +33,8 @@ def load_data(self, df_name, file_path):
"""
if df_name in self.dataframes:
raise ValueError(
f"Dataframe associated with file name '{df_name}' already exists in the database.")
f"Dataframe associated with file name '{df_name}' already exists in the database."
)

df = pd.read_csv(file_path)
self.dataframes[df_name] = df
Expand All @@ -42,7 +44,6 @@ def load_data(self, df_name, file_path):
column = Column(df_name, c, ColumnType.STRING)
self.columns.add(column)


def load_data_from_folder(self, folder_path):
"""
Function to load data from all CSV files in a folder using the Database class.
Expand Down Expand Up @@ -76,7 +77,7 @@ def get_dataframe_names(self):
list: A list of dataframe names.
"""
return list(self.dataframes.keys())

def get_columns(self):
"""
Get the names of all columns stored in the database.
Expand All @@ -99,7 +100,6 @@ def describe_database(self):
# print(f"\t\t- Head: \n{self.dataframes[df_name].head()}")



# def main():
# col1 = Column('df1', 'col1', ColumnType.STRING, ['a', 'b', 'c'], ['n/a', 'na'])
# col2 = Column('df1', 'col2', ColumnType.INTEGER, [1, 2, 3], ['n/a', 'na'])
Expand All @@ -112,4 +112,4 @@ def describe_database(self):
# print(col3 == col4)

# if __name__ == "__main__":
# main()
# main()
2 changes: 1 addition & 1 deletion bdikit/data_ingestion/dataset_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
def load_dataframe(dataset_path):
dataset = pd.read_csv(dataset_path)

return dataset
return dataset
2 changes: 1 addition & 1 deletion bdikit/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_cached_model_or_download(model_name: str):
if len(sys.argv) < 2:
print("Please provide a model_id as a command line argument.")
sys.exit(1)

model_id = sys.argv[1]
model_path = get_cached_model_or_download(model_id)
print(f"Downloaded model: {model_path}")
Loading

0 comments on commit 84dd3b5

Please sign in to comment.