diff --git a/.vscode/settings.json b/.vscode/settings.json index 0b02985..255ab5b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,13 @@ { "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter" + "editor.defaultFormatter": "ms-python.black-formatter", }, + "python.linting.flake8Args": [ + "--max-line-length=150" + ], + "flake8.interpreter": [ + "venv/bin/python3" + ], "python.formatting.provider": "none", "cSpell.words": [ "dataframe", diff --git a/models/classification.py b/models/classification.py index 7bc0f08..bd24c0b 100644 --- a/models/classification.py +++ b/models/classification.py @@ -351,7 +351,7 @@ def prepare(self, col: str) -> DataFrame: self.data = df self.N_CLUSTER = int(np.sqrt(len(df))) - if self.viz == True: + if self.viz: self.generate_count_plot(data=df) self._save_data_frame(df, fileName="603_clean.csv") @@ -386,7 +386,7 @@ def prepare(self, col: str) -> DataFrame: self.testData = dfTest self.N_CLUSTER = int(np.sqrt(len(dfTrain))) self.N_TEST_CLUSTER = int(np.sqrt(len(dfTest))) - if self.viz == True: + if self.viz: self.generate_count_plot(data=dfTrain) self.generate_count_plot(data=dfTest) self._save_data_frame(dfTrain, fileName="603_clean.csv") @@ -468,7 +468,6 @@ def _data_encoder(self, df: DataFrame, col: str = "cluster") -> DataFrame: """ Encode the data for the classification. """ - from sklearn.preprocessing import LabelEncoder df[col] = self.encoder.fit_transform(df[col]) @@ -717,7 +716,6 @@ def _run_pca( Returns: None """ - import numpy as np from sklearn.decomposition import PCA pca = PCA(n_components=2, random_state=42) @@ -938,7 +936,8 @@ def generate_heat_map( def _print_sorted_similarities(self, sim_arr, threshold=0) -> DataFrame: """ - Store the similarities between the documents in a data frame that is sorted by the similarity score in descending order. Removing the diagonal values. + Store the similarities between the documents in a data frame that is sorted by the similarity score in descending order. + Removing the diagonal values. Args: sim_arr (numpy.ndarray): The similarity array. @@ -984,13 +983,13 @@ def run(self) -> None: self.read() self.prepare(col="features") self._create_model() - if self.viz == True: + if self.viz: self._run_word_cloud_per_cluster(df=self.data) if self.testPath is not None: # TODO: Fix test data not having x and y columns # self.generate_scatter_plot(data=self.testData) pass - if self.verbose == True: + if self.verbose: self._log("Successfully ran the classification model")