From aba1b8e7b966b6e9dc341cac54c3ffbb33eead6f Mon Sep 17 00:00:00 2001 From: Tamar Lavee Date: Fri, 26 Jul 2024 13:06:57 -0400 Subject: [PATCH 1/7] test precommit --- .pre-commit-config.yaml | 7 +------ .isort.cfg => _.isort.cfg | 0 pyproject.toml | 9 --------- skll/metrics.py | 5 +++++ 4 files changed, 6 insertions(+), 15 deletions(-) rename .isort.cfg => _.isort.cfg (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f5f5b14..b937421e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,16 +22,11 @@ repos: rev: '1.0.1' hooks: - id: flynt - - repo: https://github.com/psf/black - rev: 24.2.0 - hooks: - - id: black - args: [--line-length=100] - repo: https://github.com/charliermarsh/ruff-pre-commit rev: 'v0.3.1' hooks: - id: ruff - args: [--line-length=100, --select, "D,E,F,I", --ignore, "D212", --per-file-ignores, "tests/test*.py:D,tests/test_input.py:E501,skll/data/featureset.py:E501,skll/learner/__init__.py:E501,skll/learner/voting.py:E501,skll/learner/utils.py:E501"] + # args: [--line-length=100, --select, "D,E,F,I", --ignore, "D212", --per-file-ignores, "tests/test*.py:D,tests/test_input.py:E501,skll/data/featureset.py:E501,skll/learner/__init__.py:E501,skll/learner/voting.py:E501,skll/learner/utils.py:E501"] - repo: https://github.com/pre-commit/mirrors-mypy rev: 'v1.8.0' hooks: diff --git a/.isort.cfg b/_.isort.cfg similarity index 100% rename from .isort.cfg rename to _.isort.cfg diff --git a/pyproject.toml b/pyproject.toml index 663788e8..68d92f9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,18 +63,9 @@ where = ["."] exclude = ["tests", "examples"] namespaces = false -[tool.black] -include = '\.pyi?$' -line-length = 100 -target-version = ['py311'] - [tool.ruff] -extend-exclude = ["setup.py"] lint.select = ["D", "E", "F", "I"] lint.ignore = ["D212"] line-length = 100 target-version = "py311" fix = true - -[mypy] -exclude = '^setup\.py$' diff --git a/skll/metrics.py b/skll/metrics.py index 68393cbc..c053c30b 100644 --- a/skll/metrics.py +++ b/skll/metrics.py @@ -85,6 +85,7 @@ def kappa( If labels cannot be converted to int. ValueError If invalid weight scheme. + """ # Ensure that the lists are both the same length assert len(y_true) == len(y_pred) @@ -190,6 +191,7 @@ def correlation(y_true: np.ndarray, y_pred: np.ndarray, corr_type: str = "pearso ------- float correlation value if well-defined, else 0.0 + """ # get the correlation function to use based on the given type corr_func = pearsonr @@ -226,6 +228,7 @@ def f1_score_least_frequent(y_true: np.ndarray, y_pred: np.ndarray) -> float: ------- float F1 score of the least frequent label. + """ least_frequent = np.bincount(y_true).argmin() return f1_score(y_true, y_pred, average=None)[least_frequent] @@ -253,6 +256,7 @@ def register_custom_metric(custom_metric_path: PathOrStr, custom_metric_name: st with an already existing attribute in ``skll.metrics`` or if the custom metric name conflicts with a scikit-learn or SKLL metric. + """ if not custom_metric_path: raise ValueError( @@ -332,6 +336,7 @@ def use_score_func(func_name: str, y_true: np.ndarray, y_pred: np.ndarray) -> fl ------- float The scored result from the given scorer. 
+ """ try: scorer = get_scorer(func_name) From 63054e1def10d7c989086be5cd7515f9e31e5d5a Mon Sep 17 00:00:00 2001 From: Tamar Lavee Date: Fri, 26 Jul 2024 13:27:49 -0400 Subject: [PATCH 2/7] remove redundant ruff configuration --- .pre-commit-config.yaml | 1 - pyproject.toml | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b937421e..79b10dda 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,6 @@ repos: rev: 'v0.3.1' hooks: - id: ruff - # args: [--line-length=100, --select, "D,E,F,I", --ignore, "D212", --per-file-ignores, "tests/test*.py:D,tests/test_input.py:E501,skll/data/featureset.py:E501,skll/learner/__init__.py:E501,skll/learner/voting.py:E501,skll/learner/utils.py:E501"] - repo: https://github.com/pre-commit/mirrors-mypy rev: 'v1.8.0' hooks: diff --git a/pyproject.toml b/pyproject.toml index 68d92f9b..0fe8dcd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,3 +69,11 @@ lint.ignore = ["D212"] line-length = 100 target-version = "py311" fix = true + +[tool.ruff.lint.per-file-ignores] +"tests/test*.py" = ["D"] +"tests/test_input.py" = ["E501"] +"skll/data/featureset.py" = ["E501"] +"skll/learner/__init__.py" = ["E501"] +"skll/learner/voting.py" = ["E501"] +"skll/learner/utils.py" = ["E501"] From 492b7820cab53bd7f94734d5f2624e74453389f7 Mon Sep 17 00:00:00 2001 From: Tamar Lavee Date: Fri, 26 Jul 2024 13:30:02 -0400 Subject: [PATCH 3/7] remove isort config --- _.isort.cfg | 6 ------ .pep8speaks.yml => _.pep8speaks.yml | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 _.isort.cfg rename .pep8speaks.yml => _.pep8speaks.yml (97%) diff --git a/_.isort.cfg b/_.isort.cfg deleted file mode 100644 index 919df1ae..00000000 --- a/_.isort.cfg +++ /dev/null @@ -1,6 +0,0 @@ -[settings] -multi_line_output = 3 -include_trailing_comma = true -use_parentheses = true -ensure_newline_before_comments = true -line_length = 88 diff --git a/.pep8speaks.yml b/_.pep8speaks.yml similarity index 97% rename from .pep8speaks.yml rename to _.pep8speaks.yml index 9489f045..97477e2c 100644 --- a/.pep8speaks.yml +++ b/_.pep8speaks.yml @@ -1,4 +1,4 @@ -# File : .pep8speaks.yml +# File : _.pep8speaks.yml scanner: diff_only: True # If False, the entire file touched by the Pull Request is scanned for errors. If True, only the diff is scanned. From b1da5e041b49ca0b2787fa337a66ce28aff350be Mon Sep 17 00:00:00 2001 From: Tamar Lavee Date: Fri, 26 Jul 2024 14:27:33 -0400 Subject: [PATCH 4/7] remove pep8speaks.yml --- _.pep8speaks.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 _.pep8speaks.yml diff --git a/_.pep8speaks.yml b/_.pep8speaks.yml deleted file mode 100644 index 97477e2c..00000000 --- a/_.pep8speaks.yml +++ /dev/null @@ -1,23 +0,0 @@ -# File : _.pep8speaks.yml - -scanner: - diff_only: True # If False, the entire file touched by the Pull Request is scanned for errors. If True, only the diff is scanned. - linter: flake8 # Other option is pycodestyle - -flake8: # Valid if scanner.linter is flake8 - max-line-length: 100 - ignore: [W503, W504] # Errors and warnings to ignore - -no_blank_comment: False # If True, no comment is made on PR without any errors. -descending_issues_order: False # If True, PEP 8 issues in message will be displayed in descending order of line numbers in the file - -message: # Customize the comment made by the bot - opened: # Messages when a new PR is submitted - header: "Hello @{name}! Thanks for opening this PR. 
" - # The keyword {name} is converted into the author's username - footer: "Do see the [Hitchhiker's guide to code style](https://goo.gl/hqbW4r)" - # The messages can be written as they would over GitHub - updated: # Messages when new commits are added to the PR - header: "Hello @{name}! Thanks for updating this PR. " - footer: "" # Why to comment the link to the style guide everytime? :) - no_errors: "There are currently no PEP 8 issues detected in this Pull Request. Cheers! :tada: " From 910f33d39d8a4b92340e941377929ff5c8550a0d Mon Sep 17 00:00:00 2001 From: Tamar Lavee Date: Fri, 26 Jul 2024 14:38:54 -0400 Subject: [PATCH 5/7] update contributing README with current precommit --- CONTRIBUTING.md | 12 ++++++------ skll/utils/testing.py | 13 +++++++++++++ skll/utils/wandb.py | 15 ++++++++++++--- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68666bb1..8e58bc87 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,8 +33,8 @@ How to contribute $ pre-commit install [`pre-commit`](https://pre-commit.com/) is used to run pre-commit - hooks, such as [`isort`](https://pycqa.github.io/isort/) and - [`flake8`](https://flake8.pycqa.org/en/latest/). (Check + hooks, such as [`ruff`](https://github.com/astral-sh/ruff) and + [`mypy`](https://github.com/python/mypy). (Check [here](./.pre-commit-config.yaml) to see a full list of pre-commit hooks.) If you attempt to make a commit and it fails, you will be able to see which hooks passed/failed and you will have an @@ -52,13 +52,13 @@ How to contribute $ pre-commit run - To run the `black` hook alone on changed files: + To run the `ruff` hook alone on changed files: - $ pre-commit run black + $ pre-commit run ruff - To run the `black` hook alone on a given file: + To run the `ruff` hook alone on a given file: - $ pre-commit run black + $ pre-commit run ruff --files Finally, the `SKIP` environment variable can be used to indicate to `pre-commit` that certain checks should be skipped. It can be diff --git a/skll/utils/testing.py b/skll/utils/testing.py index f0824688..1eea2bda 100644 --- a/skll/utils/testing.py +++ b/skll/utils/testing.py @@ -53,6 +53,7 @@ def unlink(file_path: PathOrStr): ---------- file_path : :class:`skll.types.PathOrStr` File path to remove. + """ file_path = Path(file_path) if file_path.exists(): @@ -74,6 +75,7 @@ def fill_in_config_paths(config_template_path: PathOrStr) -> Path: ------- Path The path to the filled configuration file. + """ # convert path to Path object config_template_path = Path(config_template_path) @@ -154,6 +156,7 @@ def fill_in_config_paths_for_single_file( ------- Path The path to the filled configuration file. + """ # convert path to Path object if it's a string config_template_path = Path(config_template_path) @@ -231,6 +234,7 @@ def fill_in_config_options( ------- Path The path to the filled configuration file. + """ # convert path to Path object if it's a string config_template_path = Path(config_template_path) @@ -333,6 +337,7 @@ def fill_in_config_paths_for_fancy_output(config_template_path: PathOrStr) -> Pa ------- Path The path to the filled configuration file. 
+ """ # convert template path to a Path object if string config_template_path = Path(config_template_path) @@ -392,6 +397,7 @@ def fill_in_config_options_for_voting_learners( - the custom seed value used for cross-validation, if any - the number of learning curve cross-validation folds (10 or 20) - the list of learning curve training sizes + """ # setup learner-type specific values based on configuration options custom_learner = "" @@ -569,6 +575,7 @@ def create_jsonlines_feature_files(path: PathOrStr): ---------- path : :class:`skll.types.PathOrStr` Full path under which to save the created feature files. + """ # convert to Path object path = Path(path) @@ -629,6 +636,7 @@ def remove_jsonlines_feature_files(path: PathOrStr): ---------- path : :class:`skll.types.PathOrStr` Path to directory in which jsonlines files reside. + """ for i in range(6): unlink(Path(path) / f"f{i}.jsonlines") @@ -842,6 +850,7 @@ def make_regression_data( 3-tuple containing the generated training featureset, the generated test featureset, and a dictionary containing the oracle feature weights + """ # if we are doing feature hashing and we have asked for more # feature bins than number of total features, we need to @@ -944,6 +953,7 @@ def make_sparse_data(use_feature_hashing=False): Tuple Tuple containing the generated training featureset and the generated test featureset. + """ # Create training data X, y = make_classification( @@ -1040,6 +1050,7 @@ def make_digits_data(num_examples=None, test_size=0.2, use_digit_names=False): ValueError If ``num_examples`` is greater than the number of available examples. + """ # load the digits data digits = load_digits(as_frame=True) @@ -1150,6 +1161,7 @@ def make_california_housing_data(num_examples=None, test_size=0.2): ValueError If ``num_examples`` is greater than the number of available examples. + """ # load the housing data housing = fetch_california_housing( @@ -1238,6 +1250,7 @@ def compute_expected_folds_for_cv_testing(featureset, num_folds=10, stratified=T expected_fold_ids : dict A dictionary mapping each ID in the featureset to a fold ID. Fold IDs range from 0 to ``num_folds``-1. + """ # initialize the return dictionary expected_fold_ids = {} diff --git a/skll/utils/wandb.py b/skll/utils/wandb.py index e26b809d..00bbdb32 100644 --- a/skll/utils/wandb.py +++ b/skll/utils/wandb.py @@ -8,13 +8,13 @@ from typing import Any, Dict, Optional, Union import pandas as pd - import wandb -from skll.config import _setup_config_parser -from skll.types import PathOrStr from wandb.sdk.lib import RunDisabled from wandb.sdk.wandb_run import Run +from skll.config import _setup_config_parser +from skll.types import PathOrStr + class WandbLogger: """Interface for Weights and Biases logging.""" @@ -30,6 +30,7 @@ def __init__(self, wandb_credentials: Optional[Dict[str, str]], config_file_path used to initialize the wandb run. If ``None``, logging to W&B will not be performed. config_file_path : str The path to this experiment's config file + """ self.wandb_run: Optional[Union[Run, RunDisabled]] = None if wandb_credentials: @@ -48,6 +49,7 @@ def log_plot(self, plot_file_path: str) -> None: ---------- plot_file_path : str The full path to the plot file. 
+ """ plot_name = Path(plot_file_path).stem if self.wandb_run: @@ -63,6 +65,7 @@ def log_summary_file(self, summary_file_path: PathOrStr) -> None: ---------- summary_file_path : PathOrStr The path to the summary tsv file + """ if self.wandb_run: summary_df = pd.read_csv(summary_file_path, sep="\t") @@ -87,6 +90,7 @@ def log_evaluation_results(self, task_results: Dict[str, Any]) -> None: task_results : Dict[str,Any] The evaluation results of a single job of "evaluate" task or a single fold of a "cross_validate" task. + """ if self.wandb_run: task_prefix = task_results["job_name"] @@ -136,6 +140,7 @@ def log_train_results(self, task_results: Dict[str, Any]) -> None: ---------- task_results : Dict[str, Any] The train task results. + """ if self.wandb_run: task_prefix = task_results["job_name"] @@ -152,6 +157,7 @@ def log_predict_results(self, task_results: Dict[str, Any]) -> None: ---------- task_results : Dict[str, Any] The predict task results. + """ if self.wandb_run: task_prefix = task_results["job_name"] @@ -176,6 +182,7 @@ def log_conf_matrix_chart(self, task_prefix, confusion_matrix, labels) -> None: the confusion matrix values labels : List[str] label names + """ if self.wandb_run: conf_matrix_data = [] @@ -198,6 +205,7 @@ def log_to_summary(self, task_prefix, metric_name, metric_value) -> None: The metric name metric_value : Any The metric value + """ if self.wandb_run: self.wandb_run.summary[f"{task_prefix}/{metric_name}"] = metric_value @@ -217,6 +225,7 @@ def get_config_dict(config_file_path: str) -> Dict[str, Any]: Dictionary containing all SKLL configuration fields. This also includes default values when for fields that are missing in the file. + """ config_parser = _setup_config_parser(config_file_path, validate=False) return { From cf82ae9525712e47e5d7798fab2e915b66e5cd52 Mon Sep 17 00:00:00 2001 From: Tamar Lavee Date: Mon, 29 Jul 2024 10:35:22 -0400 Subject: [PATCH 6/7] run ruff on all files and apply changes. --- skll/config/__init__.py | 5 ++++ skll/config/utils.py | 5 ++++ skll/data/dict_vectorizer.py | 1 + skll/data/featureset.py | 13 +++++++++++ skll/data/readers.py | 20 ++++++++++++++++ skll/data/writers.py | 20 ++++++++++++++++ skll/experiments/__init__.py | 2 ++ skll/experiments/input.py | 1 + skll/experiments/output.py | 8 +++++++ skll/experiments/utils.py | 3 +++ skll/learner/__init__.py | 18 +++++++++++++++ skll/learner/utils.py | 23 +++++++++++++++++++ skll/learner/voting.py | 8 +++++++ .../compute_eval_from_predictions.py | 2 ++ skll/utils/commandline/filter_features.py | 1 + .../utils/commandline/generate_predictions.py | 1 + skll/utils/commandline/join_features.py | 1 + .../utils/commandline/plot_learning_curves.py | 1 + skll/utils/commandline/print_model_weights.py | 1 + skll/utils/commandline/run_experiment.py | 1 + skll/utils/commandline/skll_convert.py | 1 + skll/utils/commandline/summarize_results.py | 1 + skll/utils/logging.py | 3 +++ 23 files changed, 140 insertions(+) diff --git a/skll/config/__init__.py b/skll/config/__init__.py index b13adc2f..a3e3407c 100644 --- a/skll/config/__init__.py +++ b/skll/config/__init__.py @@ -164,6 +164,7 @@ def _find_invalid_options(self) -> Set[str]: ------- invalid_options : Set[str] The set of invalid options specified by the user. 
+ """ # compute a list of all the valid options valid_options = list(self.defaults().keys()) + self._required_options @@ -211,6 +212,7 @@ def _find_ill_specified_options( the default value for the option does not result in running an experiment with unexpected settings, this is not really a major problem. + """ incorrectly_specified_options = [] multiply_specified_options = [] @@ -252,6 +254,7 @@ def validate(self) -> None: KeyError If any options are not defined in the appropriate sections. + """ invalid_options = self._find_invalid_options() if invalid_options: @@ -522,6 +525,7 @@ def parse_config_file( ValueError If various configuration parameters are incorrectly specified, or cause conflicts. + """ # ensure that a path is specified if not config_path: @@ -1114,6 +1118,7 @@ def _setup_config_parser(config_path: PathOrStr, validate=True) -> SKLLConfigPar ------ FileNotFoundError If the configuration file does not exist. + """ # initialize config parser with the given defaults config = SKLLConfigParser() diff --git a/skll/config/utils.py b/skll/config/utils.py index 515659a9..7215a6a1 100644 --- a/skll/config/utils.py +++ b/skll/config/utils.py @@ -31,6 +31,7 @@ def fix_json(json_string: str) -> str: ------- str The normalized JSON string. + """ json_string = json_string.replace("True", "true") json_string = json_string.replace("False", "false") @@ -63,6 +64,7 @@ def load_cv_folds(folds_file: PathOrStr, ids_to_floats=False) -> FoldMapping: ------ ValueError If example IDs cannot be converted to floats and `ids_to_floats` is `True`. + """ with open(folds_file) as f: reader = csv.reader(f) @@ -106,6 +108,7 @@ def locate_file(file_path: PathOrStr, config_dir: PathOrStr) -> str: ------ FileNotFoundError If the file does not exist. + """ if not file_path: return "" @@ -140,6 +143,7 @@ def _munge_featureset_name(name_or_list: Union[Iterable, str]) -> str: ------- res : str name components joined with '+' if input is a list or the name itself. + """ if isinstance(name_or_list, str): return name_or_list @@ -179,6 +183,7 @@ def _parse_and_validate_metrics(metrics: str, option_name: str, logger=None) -> ValueError If "mean_squared_error" is specified as a metric. + """ # create a logger if one was not passed in if not logger: diff --git a/skll/data/dict_vectorizer.py b/skll/data/dict_vectorizer.py index f9255569..d3b4232e 100644 --- a/skll/data/dict_vectorizer.py +++ b/skll/data/dict_vectorizer.py @@ -82,6 +82,7 @@ class DictVectorizer(OldDictVectorizer): FeatureHasher : performs vectorization using only a hash function. sklearn.preprocessing.OneHotEncoder : handles nominal/categorical features encoded as columns of integers. + """ def __eq__(self, other): diff --git a/skll/data/featureset.py b/skll/data/featureset.py index 8c377817..8de94acb 100644 --- a/skll/data/featureset.py +++ b/skll/data/featureset.py @@ -56,6 +56,7 @@ class FeatureSet(object): ----- If ids, labels, and/or features are not None, the number of rows in each array must be equal. + """ def __init__( @@ -125,6 +126,7 @@ def __contains__(self, value): ---------- value The value to check. + """ return value in self.ids @@ -146,6 +148,7 @@ def __eq__(self, other): ----- We consider feature values to be equal if any differences are in the sixth decimal place or higher. + """ return ( self.ids.shape == other.ids.shape @@ -218,6 +221,7 @@ def __add__(self, other: "FeatureSet") -> "FeatureSet": ValueError If there are conflicting labels. 
+ """ # Check that the sets of IDs are equal if set(self.ids) != set(other.ids): @@ -335,6 +339,7 @@ def filter( ValueError If attempting to use features to filter a ``FeatureSet`` that uses a ``FeatureHasher`` vectorizer. + """ # Construct mask that indicates which examples to keep mask = np.ones(len(self), dtype=bool) @@ -430,6 +435,7 @@ def filtered_iter( ValueError If any of the "labels", "features", or "vectorizer" attribute is ``None``. + """ if self.features is not None and not isinstance(self.vectorizer, DictVectorizer): raise ValueError( @@ -477,6 +483,7 @@ def __sub__(self, other: "FeatureSet") -> "FeatureSet": ------- :class:`skll.data.featureset.FeatureSet` A copy of ``self`` with all features in ``other`` removed. + """ new_set = deepcopy(self) if other.vectorizer: @@ -492,6 +499,7 @@ def has_labels(self): ------- has_labels : bool Whether or not this FeatureSet has any finite labels. + """ # make sure that labels is not None or a list of Nones if self.labels is not None and not all(label is None for label in self.labels): @@ -510,6 +518,7 @@ def __str__(self): ------- str: A string representation of ``FeatureSet``. + """ return str(self.__dict__) @@ -521,6 +530,7 @@ def __repr__(self): ------- str: A string representation of ``FeatureSet``. + """ return repr(self.__dict__) @@ -542,6 +552,7 @@ def __getitem__( If `value` is a slice, then return a new ``FeatureSet`` instance containing a subset of the data. If it's an index, return the specific example by row number. + """ # Check if we're slicing if isinstance(value, slice): @@ -597,6 +608,7 @@ def split( ------- Tuple[:class:`skll.data.featureset.FeatureSet`, :class:`skll.data.featureset.FeatureSet`] A tuple containing the two featureset instances. + """ # Note: an alternative way to implement this is to make copies # of the given FeatureSet instance and then use the `filter()` @@ -655,6 +667,7 @@ def from_data_frame( ------- :class:`skll.data.featureset.FeatureSet` A ``FeatureSet`` instance generated from from the given data frame. + """ if labels_column: feature_columns = [column for column in df.columns if column != labels_column] diff --git a/skll/data/readers.py b/skll/data/readers.py index cb307a2e..ac10481b 100644 --- a/skll/data/readers.py +++ b/skll/data/readers.py @@ -118,6 +118,7 @@ class Reader(object): logger : Optional[logging.Logger], default=None A logger instance to use to log messages instead of creating a new one by default. + """ def __init__( @@ -177,6 +178,7 @@ def for_path(cls, path_or_list: Union[PathOrStr, FeatureDictList], **kwargs) -> ------ ValueError If file does not have a valid extension. + """ if not isinstance(path_or_list, (str, Path)): return DictListReader(path_or_list) @@ -211,6 +213,7 @@ def _sub_read(self, file): Raises ------ NotImplementedError + """ raise NotImplementedError @@ -229,6 +232,7 @@ def _print_progress(self, progress_num: Union[int, str], end="\r"): end : str, default='\r' The string to put at the end of the line. "\r" should be used for every update except for the final one. + """ # Print out status if not self.quiet: @@ -270,6 +274,7 @@ def _sub_read_rows(self, file: PathOrStr) -> Tuple[np.ndarray, np.ndarray, Featu ValueError If the example IDs are not unique. + """ # Get labels and IDs ids_list: List[IdType] = [] @@ -365,6 +370,7 @@ def _parse_dataframe( features : :class:`skll.types.FeatureDictList` List of feature dictionaries. 
+ """ if df.empty: raise ValueError("No features found in possibly empty file " f"'{self.path_or_list}'.") @@ -459,6 +465,7 @@ def read(self) -> FeatureSet: ValueError If the example IDs are not unique. + """ self.logger.debug(f"Path: {self.path_or_list}") @@ -550,6 +557,7 @@ class DictListReader(Reader): logger : Optional[logging.Logger], default=None A logger instance to use to log messages instead of creating a new one by default. + """ def read(self) -> FeatureSet: @@ -560,6 +568,7 @@ def read(self) -> FeatureSet: ------- :class:`skll.data.FeatureSet` A ``FeatureSet`` representing the list of dictionaries we read in. + """ # if we are in this method, `self.path_or_list` must be a # list of dictionaries @@ -698,6 +707,7 @@ def _sub_read(self, file: IO[str]) -> FeatGenerator: ValueError If IDs cannot be converted to floats, and ``ids_to_floats`` is ``True``. + """ for example_num, line in enumerate(file): # Remove extraneous whitespace @@ -789,6 +799,7 @@ class LibSVMReader(Reader): logger : Optional[logging.Logger], default=None A logger instance to use to log messages instead of creating a new one by default. + """ line_regex = re.compile( @@ -827,6 +838,7 @@ def _pair_to_tuple(pair: str, feat_map: Dict[str, str]) -> Tuple[str, Union[floa The name of the feature. value : Union[float, int, str] The value of the example. + """ name, value = pair.split(":") if feat_map is not None: @@ -859,6 +871,7 @@ def _sub_read(self, file: IO[str]) -> FeatGenerator: ------ ValueError If line does not look like valid libsvm format. + """ feat_map: Optional[Dict[str, str]] for example_num, line in enumerate(file): @@ -952,6 +965,7 @@ class CSVReader(Reader): kwargs : Optional[Dict[str, Any]] Other arguments to the Reader object. + """ def __init__( @@ -990,6 +1004,7 @@ def _sub_read(self, file: PathOrStr) -> Tuple[np.ndarray, np.ndarray, FeatureDic features : :class:`skll.types.FeatureDictList` The list of feature dictionaries for the feature set. + """ df = pd.read_csv(file, sep=self._sep, engine=self._engine, **self._pandas_kwargs) return self._parse_dataframe( @@ -1035,6 +1050,7 @@ class TSVReader(CSVReader): kwargs : Optional[Dict[str, Any]] Other arguments to the Reader object. + """ def __init__( @@ -1072,6 +1088,7 @@ class ARFFReader(Reader): kwargs : Optional[Dict[str, Any]] Other arguments to the Reader object. + """ def __init__(self, path_or_list: Union[PathOrStr, List[Dict[str, Any]]], **kwargs): @@ -1101,6 +1118,7 @@ def split_with_quotes( escape_char : str, default='\\' The escape character. + """ return next( csv.reader([string], delimiter=delimiter, quotechar=quote_char, escapechar=escape_char) @@ -1125,6 +1143,7 @@ def _sub_read(self, file: IO[str]) -> FeatGenerator: example : :class:`skll.types.FeatureDict` The example features in dictionary format. + """ field_names = [] # Process ARFF header @@ -1235,6 +1254,7 @@ def safe_float( Union[float, int, str] The text value converted to int or float, if possible. Otherwise it's a string. + """ # convert to str to be "Safe"! text = str(text) diff --git a/skll/data/writers.py b/skll/data/writers.py index cfe0faca..1b339bba 100644 --- a/skll/data/writers.py +++ b/skll/data/writers.py @@ -66,6 +66,7 @@ class Writer(object): logger : Optional[logging.Logger], default=None A logger instance to use to log messages instead of creating a new one by default. 
+ """ def __init__( @@ -128,6 +129,7 @@ def for_path(cls, path: PathOrStr, feature_set: FeatureSet, **kwargs) -> "Writer writer : :class:`skll.data.Writer` New instance of the Writer sub-class that is appropriate for the given path. + """ # Get lowercase extension for file extension checking # NOTE: the reason we are doing this complicated gymnastics @@ -174,6 +176,7 @@ def _write_subset( filter_features : Optional[Set[str]], default=None Set of features to include in current feature file. + """ self.logger.debug(f"sub_path: {sub_path}") self.logger.debug(f"feature_set: {self.feat_set.name}") @@ -226,6 +229,7 @@ def _write_header(self, feature_set, output_file, filter_features): filter_features : Ignored Not used. + """ pass @@ -250,6 +254,7 @@ def _write_line(self, id_, label_, feat_dict, output_file): Raises ------ NotImplementedError + """ raise NotImplementedError @@ -271,6 +276,7 @@ def _write_data(self, feature_set, output_file, filter_features): Raises ------ NotImplementedError + """ raise NotImplementedError @@ -297,6 +303,7 @@ def _get_column_names_and_indexes( column_indexes : List[int] A list of the (possibly filtered) column indexes. + """ # if we're not doing filtering, # then just take all the feature names @@ -367,6 +374,7 @@ class CSVWriter(Writer): pandas_kwargs : Optional[Dict[str], Any], default=None Arguments that will be passed directly to the `pandas` I/O reader. + """ def __init__( @@ -418,6 +426,7 @@ def _build_dataframe_with_features( ValueError If ID column is already used as feature. If label column is already used as feature. + """ # if there is no filtering, then just keep all the names (column_names, column_idxs) = self._get_column_names_and_indexes( @@ -476,6 +485,7 @@ def _build_dataframe( ValueError If ID column is already used as feature. If label column is already used as feature. + """ # create the data frame with just the features # from the feature set, at this point @@ -520,6 +530,7 @@ def _write_data( If only writing a subset of the features in the FeatureSet to ``output_file``, these are the features to include in this file. + """ df = self._build_dataframe(feature_set, filter_features=filter_features) df.to_csv(output_file, sep=self._sep, index=self._index, **self._pandas_kwargs) @@ -569,6 +580,7 @@ class TSVWriter(CSVWriter): pandas_kwargs : Optional[Dict[str, Any]], default=None Arguments that will be passed directly to the `pandas` I/O reader. + """ def __init__( @@ -640,6 +652,7 @@ class ARFFWriter(Writer): kwargs : Optional[Dict[str, Any]] The arguments to the ``Writer`` object being instantiated. + """ def __init__( @@ -687,6 +700,7 @@ def _write_header( If only writing a subset of the features in the FeatureSet to ``output_file``, these are the features to include in this file. + """ fieldnames, _ = self._get_column_names_and_indexes(self.feat_set, filter_features) fieldnames.append(self.id_col) @@ -747,6 +761,7 @@ def _write_line( ValueError If ID column name is already used as a feature. + """ # Add class column to feat_dict (unless this is unlabeled data) if self.label_col not in feat_dict: @@ -802,6 +817,7 @@ class NDJWriter(Writer): logger : Optional[logging.Logger], default=None A logger instance to use to log messages instead of creating a new one by default. + """ def __init__( @@ -840,6 +856,7 @@ def _write_line( output_file : IO[str] The file being written to. 
+ """ example_dict: FeatureDict = {} # Don't try to add class column if this is label-less data @@ -900,6 +917,7 @@ class LibSVMWriter(Writer): label_map : Optional[Dict[str, int]], default=None A mapping from label strings to integers. + """ LIBSVM_REPLACE_DICT = { @@ -960,6 +978,7 @@ def _sanitize(name: Union[IdType, LabelType]) -> Union[IdType, LabelType]: ------- Union[:class:`skll.types.IdType`, :class:`skll.types.LabelType`] The sanitized name with special characters replaced. + """ sanitized_name = name if isinstance(sanitized_name, str): @@ -986,6 +1005,7 @@ def _write_line( output_file : IO[str] The file being written to. + """ field_values = ( sorted( diff --git a/skll/experiments/__init__.py b/skll/experiments/__init__.py index 2a52d807..1403fef4 100644 --- a/skll/experiments/__init__.py +++ b/skll/experiments/__init__.py @@ -80,6 +80,7 @@ def _classify_featureset(args: Dict[str, Any]) -> List[Dict[str, Any]]: ------ ValueError If extra unknown arguments are passed to the function. + """ # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify @@ -669,6 +670,7 @@ def run_configuration( If value for ``"ablation"`` is not a positive int or ``None``. OSError If the lenth of the ``FeatureSet`` name > 210. + """ try: # Read configuration diff --git a/skll/experiments/input.py b/skll/experiments/input.py index 06b92ec5..a5ada564 100644 --- a/skll/experiments/input.py +++ b/skll/experiments/input.py @@ -72,6 +72,7 @@ def load_featureset( merged_set : :class:`skll.data.featureset.FeatureSet` A ``FeatureSet`` instance containing the specified labels, IDs, features, and feature vectorizer. + """ # get a logger if one was not provided logger = logger if logger else logging.getLogger(__name__) diff --git a/skll/experiments/output.py b/skll/experiments/output.py index a335d5d1..e5f2e7fe 100644 --- a/skll/experiments/output.py +++ b/skll/experiments/output.py @@ -52,6 +52,7 @@ def _compute_ylimits_for_featureset( ylimits : Dict[str, Tuple[float, float]] A dictionary, with metric names as keys and a tuple of (lower_limit, upper_limit) as values. + """ # set the y-limits of the curves depending on what kind # of values the metric produces @@ -119,6 +120,7 @@ def _generate_learning_curve_score_plots( ------- List[str] A list of paths of the generated plots + """ # convert output dir to a path output_dir = Path(output_dir) @@ -259,6 +261,7 @@ def _generate_learning_curve_time_plots( ------- List[str] A list of paths of the generated plots + """ # convert output dir to a path output_dir = Path(output_dir) @@ -346,6 +349,7 @@ def generate_learning_curve_plots( ------- List[str] A list of paths of the generated plots + """ # convert output_dir to Path object output_dir = Path(output_dir) @@ -462,6 +466,7 @@ def _print_fancy_output( List of result dictionaries. output_file : IO[str], default=sys.stdout The file buffer to print to. + """ if not learner_result_dicts: raise ValueError("Result dictionary list is empty!") @@ -552,6 +557,7 @@ def _write_learning_curve_file(result_json_paths: List[str], output_file: IO[str list of paths to the individual result JSON files. output_file : IO[str] The file buffer to write to. + """ learner_result_dicts = [] @@ -646,6 +652,7 @@ def _write_skll_folds(skll_fold_ids: FoldMapping, skll_fold_ids_file: IO[str]) - Dictionary with ids as keys and test-fold-numbers as values. skll_fold_ids_file : IO[str] An open file handler to write to. 
+ """ f = csv.writer(skll_fold_ids_file) f.writerow(["id", "cv_test_fold"]) @@ -670,6 +677,7 @@ def _write_summary_file(result_json_paths: List[str], output_file: IO[str], abla The file buffer to write to. ablation : int, default=0 The number of features to remove when doing ablation experiment. + """ learner_result_dicts = [] # Map from feature set names to all features in them diff --git a/skll/experiments/utils.py b/skll/experiments/utils.py index f286336f..2682460e 100644 --- a/skll/experiments/utils.py +++ b/skll/experiments/utils.py @@ -64,6 +64,7 @@ def _check_job_results(job_results: List[List[Dict[str, Any]]]) -> None: ---------- job_results : List[List[Dict[str, Any]]] A list of job result dictionaries. + """ logger = get_skll_logger("experiment") logger.info("Checking job results") @@ -101,6 +102,7 @@ def _create_learner_result_dicts( ------- res : List[Dict[str, Any]] The results of the learners, as a list of dictionaries. + """ res = [] @@ -256,6 +258,7 @@ def _get_stat_float(label_result_dict: Dict[str, float], stat: str) -> float: ------- float The value of the stat if it's in the dictionary, and NaN otherwise. + """ if stat in label_result_dict and label_result_dict[stat] is not None: return label_result_dict[stat] diff --git a/skll/learner/__init__.py b/skll/learner/__init__.py index d77cf8b7..579a0b16 100644 --- a/skll/learner/__init__.py +++ b/skll/learner/__init__.py @@ -171,6 +171,7 @@ class Learner(object): Path to module where a custom classifier is defined. logger : Optional[logging.Logger], default=None A logging object. If ``None`` is passed, get logger from ``__name__``. + """ def __init__( @@ -402,6 +403,7 @@ def from_file( ------- :class:`skll.learner.Learner` The ``Learner`` instance loaded from the file. + """ # use the logger that's passed in or if nothing was passed in, # then create a new logger @@ -435,6 +437,7 @@ def load(self, learner_path: PathOrStr) -> None: ---------- learner_path : :class:`skll.types.PathOrStr` The path to a saved learner object file to load. + """ del self.__dict__ self.__dict__ = Learner.from_file(learner_path).__dict__ @@ -460,6 +463,7 @@ def _convert_coef_array_to_feature_names(self, coef: np.ndarray, feature_name_pr ------- Dict[str, Any] A dictionary of labeled weights + """ res = {} vocabulary = {} @@ -509,6 +513,7 @@ def model_params(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ------ ValueError If the instance does not support model parameters. + """ res = {} intercept = {} @@ -623,6 +628,7 @@ def probability(self, value: bool) -> None: ---------- value : bool Whether learner should return probabilities of all labels. + """ # LinearSVC doesn't support predict_proba self._probability = value @@ -653,6 +659,7 @@ def save(self, learner_path: PathOrStr) -> None: ---------- learner_path : :class:`skll.types.PathOrStr` The path to save the ``Learner`` instance to. + """ _save_learner_to_disk(self, learner_path) @@ -671,6 +678,7 @@ def _create_estimator(self): ------ ValueError If there is no default parameter grid for estimator. + """ estimator = None default_param_grid = None @@ -705,6 +713,7 @@ def get_feature_names_out(self) -> np.ndarray: ValueError If ``self.feat_vectorizer`` is either ``None`` or a :class:`sklearn.feature_extraction.FeatureHasher`. + """ if isinstance(self.feat_vectorizer, DictVectorizer): return self.feat_vectorizer.get_feature_names_out()[self.feat_selector.get_support()] @@ -729,6 +738,7 @@ def _check_input_formatting(self, examples: FeatureSet) -> None: If labels are strings. 
TypeError If any features are strings. + """ # Make sure the labels for a regression task are not strings. if self.model_type._estimator_type == "regressor" and examples.labels is not None: @@ -762,6 +772,7 @@ def _check_max_feature_value(self, feat_array: np.ndarray): ---------- feat_array : numpy.ndarray A numpy array with features. + """ max_feat_abs = np.max(np.abs(feat_array.data)) if max_feat_abs > 1000.0: @@ -780,6 +791,7 @@ def _create_label_dict(self, examples: FeatureSet) -> None: ---------- examples : :class:`skll.data.featureset.FeatureSet` The examples to use for training. + """ # we don't need to do this if we have already done it # or for regression models, so simply return. @@ -818,6 +830,7 @@ def _train_setup(self, examples: FeatureSet) -> None: ---------- examples : :class:`skll.data.featureset.FeatureSet` The ``FeatureSet`` instance to use for training. + """ # Check feature values and labels self._check_input_formatting(examples) @@ -898,6 +911,7 @@ def train( If process runs out of memory converting training data to dense. ValueError If FeatureHasher is used with MultinomialNB. + """ # get the estimator type since we need it in multiple places below estimator_type = self.model_type._estimator_type @@ -1232,6 +1246,7 @@ def evaluate( the per-label PRFs, the model parameters, the grid search objective function score, and the additional evaluation metrics, if any. For regressors, the first two elements in the tuple are ``None``. + """ # are we in a regressor or a classifier estimator_type = self.model_type._estimator_type @@ -1355,6 +1370,7 @@ def predict( RuntimeError If there is a mismatch between the learner vectorizer and the test set vectorizer. + """ example_ids = examples.ids @@ -1617,6 +1633,7 @@ def cross_validate( If classification labels are not properly encoded as strings. ValueError If ``grid_search`` is ``True`` but ``grid_objective`` is ``None``. + """ # Seed the random number generator so that randomized # algorithms are replicable @@ -1814,6 +1831,7 @@ def learning_curve( ------ ValueError If the number of examples is less than 500. + """ # check that the number of training examples is more than the minimum # needed for generating a reliable learning curve diff --git a/skll/learner/utils.py b/skll/learner/utils.py index d8574b71..2500f65d 100644 --- a/skll/learner/utils.py +++ b/skll/learner/utils.py @@ -116,6 +116,7 @@ class FilteredLeaveOneGroupOut(LeaveOneGroupOut): A list of example IDs. logger : Optional[logging.Logger], default=None A logger instance. + """ def __init__( @@ -154,6 +155,7 @@ def split( The training set indices for that split. test_index : numpy.ndarray The testing set indices for that split. + """ for train_index, test_index in super(FilteredLeaveOneGroupOut, self).split(X, y, groups): train_len = len(train_index) @@ -181,6 +183,7 @@ class SelectByMinCount(SelectKBest): ---------- min_count : int, default=1 The minimum feature count to select. + """ def __init__(self, min_count: int = 1): @@ -202,6 +205,7 @@ def fit(self, X, y=None): Returns ------- self + """ # initialize a list of counts of times each feature appears col_counts = [0 for _ in range(X.shape[1])] @@ -229,6 +233,7 @@ def _get_support_mask(self): ------- mask : numpy.ndarray The mask with features to keep set to True. 
+ """ mask = np.zeros(self.scores_.shape, dtype=bool) mask[self.scores_ >= self.min_count] = True @@ -253,6 +258,7 @@ def add_unseen_labels( Dict[:class:`skll.types.LabelType`, int] Dictionary mapping merged labels from both the training and test sets to indices. + """ # get the list of labels that were in the training set train_label_list = list(train_label_dict.keys()) @@ -317,6 +323,7 @@ def compute_evaluation_metrics( per-label PRFs, the grid search objective function score, and the additional evaluation metrics, if any. For regressors, the first two elements are ``None``. + """ # set up the logger logger = logger if logger else logging.getLogger(__name__) @@ -485,6 +492,7 @@ def compute_num_folds_from_example_counts( ValueError If ``cv_folds`` is not an integer or if the training set has fewer than 2 examples associated with a label (for classification). + """ # get a logger if not provided logger = logger if logger else logging.getLogger(__name__) @@ -540,6 +548,7 @@ def contiguous_ints_or_floats(numbers: np.ndarray) -> bool: If ``numbers`` does not contain integers or floating point values. ValueError If ``numbers`` is empty. + """ try: # make sure that number is not empty @@ -580,6 +589,7 @@ def get_acceptable_classification_metrics(label_array: np.ndarray) -> Set[str]: acceptable_metrics : Set[str] A set of metric names that are acceptable for the given classification scenario. + """ # this is a classifier so the acceptable objective # functions definitely include those metrics that @@ -668,6 +678,7 @@ def load_custom_learner( ------ ValueError If the custom learner path does not end in '.py'. + """ if not custom_learner_path: raise ValueError( @@ -722,6 +733,7 @@ def get_predictions( NotImplementedError If the scikit-learn model does not implement ``predict_proba()`` to get the class probabilities. + """ # deferred import to avoid circular dependencies from skll.learner.voting import VotingLearner @@ -787,6 +799,7 @@ def rescaled(cls): ------ ValueError If classifier cannot be rescaled (i.e. is not a regressor). + """ # If this class has already been run through the decorator, return it if hasattr(cls, "rescale"): @@ -819,6 +832,7 @@ def fit(self, X: np.ndarray, y=None): # noqa: D417 Returns ------- self + """ # fit a regular regression model orig_fit(self, X, y=y) @@ -857,6 +871,7 @@ def predict(self, X: np.ndarray) -> np.ndarray: ------- numpy.ndarray The prediction results. + """ # get the unconstrained predictions res = orig_predict(self, X) @@ -896,6 +911,7 @@ def _get_param_names(class_x): ------ RuntimeError If `varargs` exist in the scikit-learn estimator. + """ # initialize the empty list of parameter names args = [] @@ -952,6 +968,7 @@ def init(self, constrain: bool = True, rescale: bool = True, **kwargs): # noqa: Whether to rescale prediction values using z-scores. kwargs : Dict[str, Any] Keyword arguments for base class. + """ # pylint: disable=W0201 self.constrain = constrain @@ -1006,6 +1023,7 @@ def setup_cv_fold_iterator( k-fold iterator Optional[List[str]] List of cross-validation groups + """ # explicitly declare the return types kfold: Union[FilteredLeaveOneGroupOut, KFold, StratifiedKFold] @@ -1059,6 +1077,7 @@ def setup_cv_split_iterator( Iterator over the train/test featuresets int The maximum number of training samples available. + """ # seed the random number generator for replicability random_state = np.random.RandomState(123456789) @@ -1122,6 +1141,7 @@ def train_and_score( float The time taken in seconds to fit the ``learner`` on ``train_examples``. 
+ """ # capture the time before we train the model start_time = time.time() @@ -1198,6 +1218,7 @@ def write_predictions( List of class labels, required if ``probability`` is ``True``. append : bool, default=False Should we append the current predictions to the file if it exists? + """ # create a new file starting with the given prefix prediction_file = f"{file_prefix}_predictions.tsv" @@ -1250,6 +1271,7 @@ def _save_learner_to_disk( A ``Learner`` or ``VotingLearner`` instance to save to disk. filepath : :class:`skll.types.PathOrStr` The path to save the learner instance to. + """ # create the directory if it doesn't exist learner_dir = Path(filepath).parent @@ -1288,6 +1310,7 @@ def _load_learner_from_disk( ------ ValueError If the pickled version of the ``Learner`` instance is out of date. + """ skll_version, learner = joblib.load(filepath) diff --git a/skll/learner/voting.py b/skll/learner/voting.py index 407ff2ee..93604b46 100644 --- a/skll/learner/voting.py +++ b/skll/learner/voting.py @@ -108,6 +108,7 @@ class VotingLearner(object): list and the order of the ``learner_names`` list. logger : Optional[logging.Logger], default=None A logging object. If ``None`` is passed, get logger from ``__name__``. + """ def __init__( @@ -236,6 +237,7 @@ def save(self, learner_path: PathOrStr) -> None: ---------- learner_path : :class:`skll.types.PathOrStr` The path to save the ``VotingLearner`` instance to. + """ _save_learner_to_disk(self, learner_path) @@ -257,6 +259,7 @@ def from_file( ------- learner : skll.learner.voting.VotingLearner The ``VotingLearner`` instance loaded from the file. + """ # use the logger that's passed in or if nothing was passed in, # then create a new logger @@ -322,6 +325,7 @@ def train( the number of grid search folds will be used. shuffle : bool, default=False Shuffle examples (e.g., for grid search CV.) + """ if param_grid_list is None: self._param_grids = [] @@ -443,6 +447,7 @@ def predict( name of each underlying learner as the key and the array of its predictions as the value. The second element is ``None`` if ``individual_predictions`` is set to ``False``. + """ example_ids = examples.ids @@ -551,6 +556,7 @@ def evaluate( The confusion matrix, the overall accuracy, the per-label PRFs, the model parameters, the grid search objective function score, and the additional evaluation metrics, if any. + """ # make the prediction on the test data; note that these # are either class indices or class probabilities @@ -720,6 +726,7 @@ def cross_validate( If classification labels are not properly encoded as strings. ValueError If ``grid_search`` is ``True`` but ``grid_objective`` is ``None``. + """ # Seed the random number generator so that randomized algorithms are # replicable. @@ -912,6 +919,7 @@ def learning_curve( ------ ValueError If the number of examples is less than 500. + """ # check that the number of training examples is more than the minimum # needed for generating a reliable learning curve diff --git a/skll/utils/commandline/compute_eval_from_predictions.py b/skll/utils/commandline/compute_eval_from_predictions.py index e22ea099..d1fe02a1 100755 --- a/skll/utils/commandline/compute_eval_from_predictions.py +++ b/skll/utils/commandline/compute_eval_from_predictions.py @@ -118,6 +118,7 @@ def compute_eval_from_predictions( ValueError If the requested prediction method is ``"expected_value"`` but the class names can't be converted to ints. 
+ """ # convert the examples file and predictions file to a Path examples_file = Path(examples_file) @@ -192,6 +193,7 @@ def main(argv: Optional[List[str]] = None) -> None: argv: Optional[List[str]], default=None List of arguments, as if specified on the command-line. If ``None``, then ``sys.argv[1:]`` is used instead. + """ # Get command line arguments parser = argparse.ArgumentParser( diff --git a/skll/utils/commandline/filter_features.py b/skll/utils/commandline/filter_features.py index 3b5a5fb5..78817a27 100755 --- a/skll/utils/commandline/filter_features.py +++ b/skll/utils/commandline/filter_features.py @@ -28,6 +28,7 @@ def main(argv: Optional[List[str]] = None) -> None: argv : Optional[List[str]], default=None List of arguments, as if specified on the command-line. If ``None``, ``sys.argv[1:]`` is used instead. + """ # Get command line arguments parser = argparse.ArgumentParser( diff --git a/skll/utils/commandline/generate_predictions.py b/skll/utils/commandline/generate_predictions.py index 79a283ac..83f51a63 100755 --- a/skll/utils/commandline/generate_predictions.py +++ b/skll/utils/commandline/generate_predictions.py @@ -32,6 +32,7 @@ def main(argv: Optional[List[str]] = None): argv : Optional[List[str]], default=None List of arguments, as if specified on the command-line. If ``None``, ``sys.argv[1:]`` is used instead. + """ # Get command line arguments parser = argparse.ArgumentParser( diff --git a/skll/utils/commandline/join_features.py b/skll/utils/commandline/join_features.py index e3740763..480d9114 100755 --- a/skll/utils/commandline/join_features.py +++ b/skll/utils/commandline/join_features.py @@ -27,6 +27,7 @@ def main(argv: Optional[List[str]] = None) -> None: argv : Optional[List[str]], default=None List of arguments, as if specified on the command-line. If ``None``, ``sys.argv[1:]`` is used instead. + """ # Get command line arguments parser = argparse.ArgumentParser( diff --git a/skll/utils/commandline/plot_learning_curves.py b/skll/utils/commandline/plot_learning_curves.py index 76d57836..40722ce8 100755 --- a/skll/utils/commandline/plot_learning_curves.py +++ b/skll/utils/commandline/plot_learning_curves.py @@ -34,6 +34,7 @@ def main(argv: Optional[List[str]] = None) -> None: argv : Optional[List[str]], default=None List of arguments, as if specified on the command-line. If ``None``, ``sys.argv[1:]`` is used instead. + """ # Get command line arguments parser = argparse.ArgumentParser( diff --git a/skll/utils/commandline/print_model_weights.py b/skll/utils/commandline/print_model_weights.py index f35fe06f..758285f2 100755 --- a/skll/utils/commandline/print_model_weights.py +++ b/skll/utils/commandline/print_model_weights.py @@ -32,6 +32,7 @@ def main(argv: Optional[List[str]] = None) -> None: List of arguments, as if specified on the command-line. If ``None``, ``sys.argv[1:]`` is used instead. Defaults to ``None``. + """ parser = argparse.ArgumentParser( description="Prints out the weights of a" " given model.", diff --git a/skll/utils/commandline/run_experiment.py b/skll/utils/commandline/run_experiment.py index f6ff49ad..49e19532 100755 --- a/skll/utils/commandline/run_experiment.py +++ b/skll/utils/commandline/run_experiment.py @@ -26,6 +26,7 @@ def main(argv: Optional[List[str]] = None) -> None: argv : Optional[List[str]], default=None List of arguments, as if specified on the command-line. If ``None``, ``sys.argv[1:]`` is used instead. 
+ """ # Get command line arguments parser = ArgumentParser( diff --git a/skll/utils/commandline/skll_convert.py b/skll/utils/commandline/skll_convert.py index 61d3f339..4999073e 100755 --- a/skll/utils/commandline/skll_convert.py +++ b/skll/utils/commandline/skll_convert.py @@ -43,6 +43,7 @@ def main(argv: Optional[List[str]] = None) -> None: argv : Optional[List[str]], default=None List of arguments, as if specified on the command-line. If ``None``, ``sys.argv[1:]`` is used instead. + """ # Get command line arguments parser = argparse.ArgumentParser( diff --git a/skll/utils/commandline/summarize_results.py b/skll/utils/commandline/summarize_results.py index b9f8dcc0..c50bc627 100755 --- a/skll/utils/commandline/summarize_results.py +++ b/skll/utils/commandline/summarize_results.py @@ -25,6 +25,7 @@ def main(argv: Optional[List[str]] = None) -> None: argv : Optional[List[str]], default=None List of arguments, as if specified on the command-line. If ``None``, ``sys.argv[1:]`` is used instead. + """ # Get command line arguments parser = argparse.ArgumentParser( diff --git a/skll/utils/logging.py b/skll/utils/logging.py index fb3ee8fc..3a19d076 100644 --- a/skll/utils/logging.py +++ b/skll/utils/logging.py @@ -33,6 +33,7 @@ def filter(self, record): ---------- record : logging.LogRecord The log record to be filtered. + """ # Check if the log record is from matplotlib.category and contains the specific message if ( @@ -86,6 +87,7 @@ def get_skll_logger( ------- logger: logging.Logger A ``Logger`` instance. + """ # first get the logger instance associated with the # given name if one already exists @@ -121,6 +123,7 @@ def close_and_remove_logger_handlers(logger: logging.Logger) -> None: ---------- logger : logging.Logger Logger instance + """ for handler in logger.handlers[:]: handler.close() From 503753a9d936bd4402ffef142c0866f8d229a494 Mon Sep 17 00:00:00 2001 From: Tamar Lavee Date: Mon, 29 Jul 2024 13:42:43 -0400 Subject: [PATCH 7/7] apply more ruff changes, mostly adding docstrings. --- examples/__init__.py | 1 + skll/utils/__init__.py | 1 + skll/utils/commandline/__init__.py | 1 + tests/other/custom_logistic_wrapper.py | 7 +++++-- tests/other/custom_metrics2.py | 2 ++ tests/other/kappa.py | 2 ++ tests/other/majority_class_learner.py | 5 +++++ 7 files changed, 17 insertions(+), 2 deletions(-) diff --git a/examples/__init__.py b/examples/__init__.py index e69de29b..8a708a68 100644 --- a/examples/__init__.py +++ b/examples/__init__.py @@ -0,0 +1 @@ +"""Data generation scripts for the different tutorial examples.""" diff --git a/skll/utils/__init__.py b/skll/utils/__init__.py index e69de29b..a14e0f8e 100644 --- a/skll/utils/__init__.py +++ b/skll/utils/__init__.py @@ -0,0 +1 @@ +"""Code for different utility scripts, functions, and classes used throughout SKLL.""" diff --git a/skll/utils/commandline/__init__.py b/skll/utils/commandline/__init__.py index e69de29b..08b0c243 100644 --- a/skll/utils/commandline/__init__.py +++ b/skll/utils/commandline/__init__.py @@ -0,0 +1 @@ +"""Command line scripts and utilities.""" diff --git a/tests/other/custom_logistic_wrapper.py b/tests/other/custom_logistic_wrapper.py index 00ad0d87..54ea90c3 100644 --- a/tests/other/custom_logistic_wrapper.py +++ b/tests/other/custom_logistic_wrapper.py @@ -1,7 +1,8 @@ # License: BSD 3 clause """ -A simple wrapper around the existing LogisticRegression class, for testing -custom learners functionality. +A simple wrapper around the existing LogisticRegression class. + +Used for testing custom learners functionality. 
:author: Michael Heilman (mheilman@ets.org) """ @@ -10,4 +11,6 @@ class CustomLogisticRegressionWrapper(LogisticRegression): + """A simple wrapper around the existing LogisticRegression class.""" + pass diff --git a/tests/other/custom_metrics2.py b/tests/other/custom_metrics2.py index 3a2420ab..9c36ffc4 100644 --- a/tests/other/custom_metrics2.py +++ b/tests/other/custom_metrics2.py @@ -1,5 +1,7 @@ +"""Additional custom metrics module for testing purposes.""" from sklearn.metrics import fbeta_score def f06_micro(y_true, y_pred): + """Define a custom metric for testing purposes.""" return fbeta_score(y_true, y_pred, beta=0.6, average="micro") diff --git a/tests/other/kappa.py b/tests/other/kappa.py index 0cd2602f..850d241d 100644 --- a/tests/other/kappa.py +++ b/tests/other/kappa.py @@ -1,2 +1,4 @@ +"""metric definition for testing purposes.""" def dummy_metric(y_true, y_pred): + """Return a fixed score.""" return 1.0 diff --git a/tests/other/majority_class_learner.py b/tests/other/majority_class_learner.py index 699a1a60..8c97877a 100644 --- a/tests/other/majority_class_learner.py +++ b/tests/other/majority_class_learner.py @@ -12,10 +12,14 @@ class MajorityClassLearner(BaseEstimator, ClassifierMixin): + """A simple majority class classifier.""" + def __init__(self): + """Initialize class.""" self.majority_class = None def fit(self, X, y): + """Set the majority class based on the given data.""" counts = Counter(y) max_count = -1 for label, count in counts.items(): @@ -25,4 +29,5 @@ def fit(self, X, y): return self def predict(self, X): + """Return the prediction (majority class) for the given data.""" return np.array([self.majority_class for x in range(X.shape[0])])
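
Addendum (editor's sketch, not part of the seven patches above): most of the
mechanical churn in patches 1 and 6 comes from ruff's pydocstyle ("D") rules,
which the new `lint.select = ["D", "E", "F", "I"]` setting enables wholesale.
In particular, the many small hunks that add a blank line before a closing
`"""` are consistent with the autofix for the blank-line-after-last-section
check (likely D413). Below is a minimal sketch of a numpydoc-style docstring
that should pass this configuration; the module and function names are
illustrative and do not exist in SKLL:

    """Illustrate the docstring layout that ruff's pydocstyle rules enforce."""


    def scaled_value(value: float, factor: float = 2.0) -> float:
        """
        Scale ``value`` by ``factor``.

        A hypothetical example, not SKLL code: it demonstrates the
        second-line summary (D212 is ignored in this configuration),
        the numpydoc sections, and the blank line before the closing
        quotes that the D413 autofix inserts throughout these patches.

        Parameters
        ----------
        value : float
            The number to scale.
        factor : float, default=2.0
            The multiplier to apply.

        Returns
        -------
        float
            The scaled value.

        """
        return value * factor

Running `ruff check .` (or `pre-commit run ruff --all-files`) against such a
file should report no "D" errors under the pyproject.toml settings introduced
in patch 2.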